Commit 4e13c90
Parent(s): acffa62

refactor: kwargs comprehension

Signed-off-by: jupyterjazz <[email protected]>

- embedding.py +1 -3
- mha.py +3 -6
- mlp.py +1 -3
- modeling_xlm_roberta.py +2 -6
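All four files apply the same rewrite: the three-statement construction of lora_kwargs is collapsed into a single conditional expression, so the 'task' key is only forwarded to the LoRA-aware layers when a task is actually set. A minimal standalone sketch of the pattern, using a hypothetical embed_stub in place of the repo's actual layers:

# Minimal sketch of the kwargs-comprehension pattern used in this commit.
# `embed_stub` is a hypothetical stand-in for a LoRA-aware layer such as
# self.word_embeddings; it only illustrates how **lora_kwargs behaves.
def embed_stub(input_ids, task=None):
    return f"embedding(input_ids, task={task})"

def old_style(input_ids, task=None):
    lora_kwargs = {}
    if task is not None:
        lora_kwargs['task'] = task
    return embed_stub(input_ids, **lora_kwargs)

def new_style(input_ids, task=None):
    # One expression instead of three statements; behaviour is unchanged.
    lora_kwargs = {'task': task} if task is not None else {}
    return embed_stub(input_ids, **lora_kwargs)

# When task is None, no 'task' key is passed and the callee's default applies;
# when task is set, it is forwarded as a keyword argument in both versions.
assert old_style([1, 2, 3]) == new_style([1, 2, 3])
assert old_style([1, 2, 3], task='retrieval') == new_style([1, 2, 3], task='retrieval')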
embedding.py

@@ -47,9 +47,7 @@ class XLMRobertaEmbeddings(nn.Module):
         token_type_ids: (batch, seqlen)
         """
         batch_size, seqlen = input_ids.shape
-        lora_kwargs = {}
-        if task is not None:
-            lora_kwargs['task'] = task
+        lora_kwargs = {'task': task} if task is not None else {}
         embeddings = self.word_embeddings(input_ids, **lora_kwargs)
         if self.max_position_embeddings > 0:
             if position_ids is None:
mha.py

@@ -645,14 +645,11 @@ class MHA(nn.Module):
         batch, seqlen = x.shape[:2]
         if not self.cross_attn and self.num_heads_kv == self.num_heads:
             assert x_kv is None and mixer_subset is None
-            lora_kwargs = {}
-            if task is not None:
-                lora_kwargs['task'] = task
-            lora_kwargs['residual'] = self.return_residual
-
+            lora_kwargs = {'task': task} if task is not None else {}
             if not self.return_residual:
                 qkv = self.Wqkv(x, **lora_kwargs)
             else:
+                lora_kwargs['residual'] = True
                 qkv, x = self.Wqkv(x, **lora_kwargs)

             if self.dwconv:

@@ -739,6 +736,6 @@
                 context = self._update_kvcache_attention(q, kv, inference_params)
             else:
                 context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
-
+
         out = self.out_proj(rearrange(context, "... h d -> ... (h d)"), **lora_kwargs)
         return out if not self.return_residual else (out, x)
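In mha.py the rewrite also changes how the residual flag is passed: the old code always set lora_kwargs['residual'] = self.return_residual, while the new code only adds residual=True in the branch where a residual is expected back, relying on the layer's default otherwise. A small sketch of the two call paths, using a hypothetical wqkv_stub in place of self.Wqkv and assuming the real LoRA-aware linear layer defaults residual to False:

# Hypothetical stand-in for self.Wqkv, assuming the real layer defaults
# residual to False; only the forwarded kwargs matter here.
def wqkv_stub(x, task=None, residual=False):
    return ('qkv', x) if residual else 'qkv'

def old_path(x, task, return_residual):
    lora_kwargs = {}
    if task is not None:
        lora_kwargs['task'] = task
    lora_kwargs['residual'] = return_residual      # always passed explicitly
    return wqkv_stub(x, **lora_kwargs)

def new_path(x, task, return_residual):
    lora_kwargs = {'task': task} if task is not None else {}
    if not return_residual:
        return wqkv_stub(x, **lora_kwargs)         # rely on the default residual=False
    lora_kwargs['residual'] = True                 # only set when a residual is wanted
    return wqkv_stub(x, **lora_kwargs)

# Same effective arguments either way, as long as the callee's default is False.
for task in (None, 'retrieval'):
    for return_residual in (False, True):
        assert old_path('x', task, return_residual) == new_path('x', task, return_residual)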
mlp.py

@@ -48,9 +48,7 @@ class Mlp(nn.Module):
         self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)

     def forward(self, x, task):
-        lora_kwargs = {}
-        if task is not None:
-            lora_kwargs['task'] = task
+        lora_kwargs = {'task': task} if task is not None else {}
         y = self.fc1(x, **lora_kwargs)
         y = self.activation(y)
         y = self.fc2(y, **lora_kwargs)
modeling_xlm_roberta.py

@@ -313,9 +313,7 @@ class XLMRobertaPooler(nn.Module):
     def forward(self, hidden_states, pool=True, task=None):
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
-        lora_kwargs = {}
-        if task is not None:
-            lora_kwargs['task'] = task
+        lora_kwargs = {'task': task} if task is not None else {}

         first_token_tensor = hidden_states[:, 0] if pool else hidden_states
         pooled_output = self.dense(first_token_tensor, **lora_kwargs)

@@ -550,9 +548,7 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel):
             )
         else:
             range_iter = range(0, len(sentences), batch_size)
-        lora_kwargs = {}
-        if task is not None:
-            lora_kwargs['task'] = task
+        lora_kwargs = {'task': task} if task is not None else {}
         for i in range_iter:
             encoded_input = self.tokenizer(
                 sentences[i : i + batch_size],