sync code
Browse files
- configuration_aria.py +7 -2
- modeling_aria.py +8 -0
- moe_lm.py +4 -2
configuration_aria.py
CHANGED
|
@@ -66,14 +66,19 @@ class AriaConfig(PretrainedConfig):
|
|
| 66 |
},
|
| 67 |
ignore_index=-100,
|
| 68 |
image_token_index=32000,
|
|
|
|
| 69 |
**kwargs,
|
| 70 |
):
|
| 71 |
super().__init__(**kwargs)
|
| 72 |
self.ignore_index = ignore_index
|
| 73 |
self.image_token_index = image_token_index
|
| 74 |
-
|
| 75 |
attn_implementation = kwargs.pop("attn_implementation", None)
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# Convert the keys and values of projector_patch_to_query_dict to integers
|
| 79 |
# This ensures consistency even if they were provided as strings
|
|
|
|
| 66 |
},
|
| 67 |
ignore_index=-100,
|
| 68 |
image_token_index=32000,
|
| 69 |
+
tie_word_embeddings=False,
|
| 70 |
**kwargs,
|
| 71 |
):
|
| 72 |
super().__init__(**kwargs)
|
| 73 |
self.ignore_index = ignore_index
|
| 74 |
self.image_token_index = image_token_index
|
| 75 |
+
self.tie_word_embeddings = tie_word_embeddings
|
| 76 |
attn_implementation = kwargs.pop("attn_implementation", None)
|
| 77 |
+
|
| 78 |
+
# Set the default attention implementation to flash_attention_2 if not specified
|
| 79 |
+
self._attn_implementation = (
|
| 80 |
+
"flash_attention_2" if attn_implementation is None else attn_implementation
|
| 81 |
+
)
|
| 82 |
|
| 83 |
# Convert the keys and values of projector_patch_to_query_dict to integers
|
| 84 |
# This ensures consistency even if they were provided as strings
|
modeling_aria.py
CHANGED
|
@@ -165,6 +165,14 @@ class AriaForConditionalGeneration(AriaPretrainedModel, GenerationMixin):
|
|
| 165 |
"""Set the input embeddings for the language model."""
|
| 166 |
self.language_model.set_input_embeddings(value)
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
def set_moe_z_loss_coeff(self, value):
|
| 169 |
"""
|
| 170 |
Set the z-loss coefficient for Mixture of Experts (MoE) models.
|
|
|
|
| 165 |
"""Set the input embeddings for the language model."""
|
| 166 |
self.language_model.set_input_embeddings(value)
|
| 167 |
|
| 168 |
+
def get_output_embeddings(self):
|
| 169 |
+
"""Retrieve the output embeddings from the language model."""
|
| 170 |
+
return self.language_model.get_output_embeddings()
|
| 171 |
+
|
| 172 |
+
def set_output_embeddings(self, value):
|
| 173 |
+
"""Set the output embeddings for the language model."""
|
| 174 |
+
self.language_model.set_output_embeddings(value)
|
| 175 |
+
|
| 176 |
def set_moe_z_loss_coeff(self, value):
|
| 177 |
"""
|
| 178 |
Set the z-loss coefficient for Mixture of Experts (MoE) models.
|
moe_lm.py
CHANGED
|
@@ -255,7 +255,8 @@ class TopKRouter(nn.Module):
|
|
| 255 |
- top_indices: Indices of top-k experts for each token.
|
| 256 |
- tokens_per_expert: Number of tokens assigned to each expert.
|
| 257 |
"""
|
| 258 |
-
|
|
|
|
| 259 |
|
| 260 |
top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1)
|
| 261 |
scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32).type_as(logits)
|
|
@@ -267,7 +268,8 @@ class TopKRouter(nn.Module):
|
|
| 267 |
max=self.config.moe_num_experts - 1,
|
| 268 |
)
|
| 269 |
|
| 270 |
-
|
|
|
|
| 271 |
return scores, top_indices, tokens_per_expert
|
| 272 |
|
| 273 |
def forward(
|
|
|
|
| 255 |
- top_indices: Indices of top-k experts for each token.
|
| 256 |
- tokens_per_expert: Number of tokens assigned to each expert.
|
| 257 |
"""
|
| 258 |
+
if self.training:
|
| 259 |
+
logits = self.apply_z_loss(logits)
|
| 260 |
|
| 261 |
top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1)
|
| 262 |
scores = torch.softmax(top_logits, dim=-1, dtype=torch.float32).type_as(logits)
|
|
|
|
| 268 |
max=self.config.moe_num_experts - 1,
|
| 269 |
)
|
| 270 |
|
| 271 |
+
if self.training:
|
| 272 |
+
scores = self.apply_aux_loss(logits, tokens_per_expert, scores)
|
| 273 |
return scores, top_indices, tokens_per_expert
|
| 274 |
|
| 275 |
def forward(
|