Upload modeling_llada.py to support attention_mask as an input.
modeling_llada.py  (CHANGED: +23, -5)
@@ -87,6 +87,7 @@ def init_weights(
 ) -> None:
     """
     Initialize weights of a linear or embedding module.
+
     :param config: The model config.
     :param module: The linear or embedding submodule to initialize.
     :param d: The effective input dimensionality of the weights. This could be smaller than the actual dimensions
@@ -648,12 +649,12 @@ class LLaDABlock(nn.Module):
         k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
         v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
 
-        # Modify: MDM set causal to False
+        # Modify: MDM set causal to False.
         return F.scaled_dot_product_attention(
             q,
             k,
             v,
-            attn_mask=None,
+            attn_mask=attn_mask,
             dropout_p=dropout_p,
             is_causal=False,
         )
@@ -711,7 +712,7 @@ class LLaDABlock(nn.Module):
             q,
             k,
             v,
-            attn_mask=None,
+            attn_mask=attention_bias,
             dropout_p=0.0 if not self.training else self.config.attention_dropout,
             is_causal=False,
         )
@@ -1156,7 +1157,20 @@ class LLaDAModel(nn.Module):
             alibi_bias = alibi_attention_bias(seq_len, self.config, device)
         self.__cache["alibi_attention_bias"] = alibi_bias
         return alibi_bias
-
+
+    def get_bidirectional_attention_bias(self, seq_len: int, device: torch.device) -> torch.Tensor:
+        if (bidirectional_bias := self.__cache.get("bidirectional_attention_bias")) is not None and bidirectional_bias.shape[
+            -1
+        ] >= seq_len:
+            if bidirectional_bias.device != device:
+                bidirectional_bias = bidirectional_bias.to(device)
+                self.__cache["bidirectional_attention_bias"] = bidirectional_bias
+            return bidirectional_bias
+        with torch.autocast(device.type, enabled=False):
+            bidirectional_bias = torch.zeros((1, 1, seq_len, seq_len), device=device, dtype=torch.float)
+        self.__cache["bidirectional_attention_bias"] = bidirectional_bias
+        return bidirectional_bias
+
     def forward(
         self,
         input_ids: torch.LongTensor,
@@ -1176,16 +1190,20 @@ class LLaDAModel(nn.Module):
             which input IDs are masked. A `1` value in the mask means that
             the corresponding input ID should *not* be ignored. A `0` means
             that the corresponding input ID is masked.
+
             This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
             library.
         :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
             `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
             to introduce causal or other biases.
+
             If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
             indicates that the i-th element in the sequence is allowed to attend to the j-th
             element in the sequence.
+
             If the tensor is a float tensor, it will just be added to the attention
             scores before the softmax.
+
             The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
         :param past_key_values: Pre-computed keys and values for each attention block.
             Can be used to speed up sequential decoding. The `input_ids` which have
@@ -1252,7 +1270,7 @@ class LLaDAModel(nn.Module):
                     self.__cache, past_length + seq_len, x.device
                 ) + self.get_alibi_attention_bias(past_length + seq_len, x.device)
             elif attention_bias is None:
-                attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
+                attention_bias = self.get_bidirectional_attention_bias(past_length + seq_len, x.device)
             elif attention_bias.dtype in (torch.int8, torch.bool):
                 attention_bias = attention_bias.to(dtype=torch.float)
                 attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min)
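
With this change, an `attention_mask` passed to `forward` is no longer dropped on the SDPA path: the resulting `attention_bias` now reaches `F.scaled_dot_product_attention` through `attn_mask`, so padding positions can be excluded from attention. The following is a minimal usage sketch, not part of the commit; the checkpoint name and the EOS-as-pad fallback are assumptions.

# Hypothetical usage sketch, not from this commit. The repo id and pad-token
# fallback are assumptions; adjust them to the checkpoint that ships this file.
import torch
from transformers import AutoModel, AutoTokenizer

repo_id = "GSAI-ML/LLaDA-8B-Base"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True).eval()

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # common fallback when no pad token is defined

prompts = ["A short prompt.", "A noticeably longer prompt that forces padding in the batch."]
batch = tokenizer(prompts, return_tensors="pt", padding=True)

# attention_mask follows the HuggingFace convention documented in forward():
# 1 = attend to this token, 0 = padding to be ignored.
with torch.no_grad():
    outputs = model(batch["input_ids"], attention_mask=batch["attention_mask"])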
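
For callers that supply `attention_bias` directly, the `forward` branch above converts a bool or int8 bias to an additive float bias by replacing zeros with `torch.finfo(dtype).min`, and when no bias is given the new `get_bidirectional_attention_bias` returns an all-zero bias, i.e. full bidirectional attention rather than the causal default the docstring still describes. A standalone sketch of that arithmetic (not the model's code):

import torch

seq_len = 4

# Bool bias in the documented (1, 1, seq_len, seq_len) layout:
# True at [..., i, j] means position i may attend to position j.
allowed = torch.ones(1, 1, seq_len, seq_len, dtype=torch.bool)
allowed[..., :, -1] = False  # e.g. treat the last position as padding

# The same conversion forward() applies to int8/bool biases:
bias = allowed.to(dtype=torch.float)
bias.masked_fill_(bias == 0.0, torch.finfo(bias.dtype).min)

# The new default bias is all zeros, so adding it leaves the scores unchanged
# (full attention, matching the "MDM set causal to False" comment).
bidirectional = torch.zeros(1, 1, seq_len, seq_len)

scores = torch.randn(1, 1, seq_len, seq_len)
probs = torch.softmax(scores + bias, dim=-1)
print(probs[0, 0, :, -1])  # ~0 everywhere: no attention mass on the masked column

probs_full = torch.softmax(scores + bidirectional, dim=-1)
assert torch.equal(probs_full, torch.softmax(scores, dim=-1))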