Commit 4e13c90
Parent(s): acffa62

refactor: kwargs comprehension

Signed-off-by: jupyterjazz <[email protected]>

- embedding.py +1 -3
- mha.py +3 -6
- mlp.py +1 -3
- modeling_xlm_roberta.py +2 -6
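All four files apply the same rewrite: the three-statement construction of lora_kwargs is collapsed into a single conditional expression, so the 'task' key is only forwarded to the LoRA-aware layers when a task is actually set. A minimal standalone sketch of the pattern, using a hypothetical embed_stub in place of the repo's actual layers:

# Minimal sketch of the kwargs-comprehension pattern used in this commit.
# `embed_stub` is a hypothetical stand-in for a LoRA-aware layer such as
# self.word_embeddings; it only illustrates how **lora_kwargs behaves.
def embed_stub(input_ids, task=None):
    return f"embedding(input_ids, task={task})"

def old_style(input_ids, task=None):
    lora_kwargs = {}
    if task is not None:
        lora_kwargs['task'] = task
    return embed_stub(input_ids, **lora_kwargs)

def new_style(input_ids, task=None):
    # One expression instead of three statements; behaviour is unchanged.
    lora_kwargs = {'task': task} if task is not None else {}
    return embed_stub(input_ids, **lora_kwargs)

# When task is None, no 'task' key is passed and the callee's default applies;
# when task is set, it is forwarded as a keyword argument in both versions.
assert old_style([1, 2, 3]) == new_style([1, 2, 3])
assert old_style([1, 2, 3], task='retrieval') == new_style([1, 2, 3], task='retrieval')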
embedding.py

@@ -47,9 +47,7 @@ class XLMRobertaEmbeddings(nn.Module):
         token_type_ids: (batch, seqlen)
         """
         batch_size, seqlen = input_ids.shape
-        lora_kwargs = {}
-        if task is not None:
-            lora_kwargs['task'] = task
+        lora_kwargs = {'task': task} if task is not None else {}
         embeddings = self.word_embeddings(input_ids, **lora_kwargs)
         if self.max_position_embeddings > 0:
             if position_ids is None:
mha.py

@@ -645,14 +645,11 @@ class MHA(nn.Module):
         batch, seqlen = x.shape[:2]
         if not self.cross_attn and self.num_heads_kv == self.num_heads:
             assert x_kv is None and mixer_subset is None
-            lora_kwargs = {}
-            if task is not None:
-                lora_kwargs['task'] = task
-            lora_kwargs['residual'] = self.return_residual
-
+            lora_kwargs = {'task': task} if task is not None else {}
             if not self.return_residual:
                 qkv = self.Wqkv(x, **lora_kwargs)
             else:
+                lora_kwargs['residual'] = True
                 qkv, x = self.Wqkv(x, **lora_kwargs)

             if self.dwconv:

@@ -739,6 +736,6 @@
                 context = self._update_kvcache_attention(q, kv, inference_params)
             else:
                 context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
-
+
         out = self.out_proj(rearrange(context, "... h d -> ... (h d)"), **lora_kwargs)
         return out if not self.return_residual else (out, x)
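In mha.py the rewrite also changes how the residual flag is passed: the old code always set lora_kwargs['residual'] = self.return_residual, while the new code only adds residual=True in the branch where a residual is expected back, relying on the layer's default otherwise. A small sketch of the two call paths, using a hypothetical wqkv_stub in place of self.Wqkv and assuming the real LoRA-aware linear layer defaults residual to False:

# Hypothetical stand-in for self.Wqkv, assuming the real layer defaults
# residual to False; only the forwarded kwargs matter here.
def wqkv_stub(x, task=None, residual=False):
    return ('qkv', x) if residual else 'qkv'

def old_path(x, task, return_residual):
    lora_kwargs = {}
    if task is not None:
        lora_kwargs['task'] = task
    lora_kwargs['residual'] = return_residual      # always passed explicitly
    return wqkv_stub(x, **lora_kwargs)

def new_path(x, task, return_residual):
    lora_kwargs = {'task': task} if task is not None else {}
    if not return_residual:
        return wqkv_stub(x, **lora_kwargs)         # rely on the default residual=False
    lora_kwargs['residual'] = True                 # only set when a residual is wanted
    return wqkv_stub(x, **lora_kwargs)

# Same effective arguments either way, as long as the callee's default is False.
for task in (None, 'retrieval'):
    for return_residual in (False, True):
        assert old_path('x', task, return_residual) == new_path('x', task, return_residual)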
mlp.py

@@ -48,9 +48,7 @@ class Mlp(nn.Module):
         self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)

     def forward(self, x, task):
-        lora_kwargs = {}
-        if task is not None:
-            lora_kwargs['task'] = task
+        lora_kwargs = {'task': task} if task is not None else {}
         y = self.fc1(x, **lora_kwargs)
         y = self.activation(y)
         y = self.fc2(y, **lora_kwargs)
modeling_xlm_roberta.py

@@ -313,9 +313,7 @@ class XLMRobertaPooler(nn.Module):
     def forward(self, hidden_states, pool=True, task=None):
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
-        lora_kwargs = {}
-        if task is not None:
-            lora_kwargs['task'] = task
+        lora_kwargs = {'task': task} if task is not None else {}

         first_token_tensor = hidden_states[:, 0] if pool else hidden_states
         pooled_output = self.dense(first_token_tensor, **lora_kwargs)

@@ -550,9 +548,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             )
         else:
             range_iter = range(0, len(sentences), batch_size)
-        lora_kwargs = {}
-        if task is not None:
-            lora_kwargs['task'] = task
+        lora_kwargs = {'task': task} if task is not None else {}
         for i in range_iter:
             encoded_input = self.tokenizer(
                 sentences[i : i + batch_size],