zhouzaida committed
Commit · 7718375
1 Parent(s): 9e6c322

add sdpa back

Browse files: modeling_kimi_vl.py (+33 -1)
modeling_kimi_vl.py CHANGED
@@ -145,6 +145,38 @@ def multihead_attention(
     return attn_out
 
 
+def sdpa_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    q_cu_seqlens: Optional[torch.Tensor] = None,
+    k_cu_seqlens: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """SDPA attention.
+
+    Args:
+        q, k, v: tensor of shape (batch_size, seqlen, num_heads, head_dim),
+            or (tot_seqlens, num_heads, head_dim) if packing.
+    """
+    seq_length = q.shape[0]
+    attention_mask = torch.zeros(
+        [1, seq_length, seq_length], device=q.device, dtype=torch.bool
+    )
+    for i in range(1, len(q_cu_seqlens)):
+        attention_mask[
+            ...,
+            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
+            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
+        ] = True
+    q = q.transpose(0, 1)
+    k = k.transpose(0, 1)
+    v = v.transpose(0, 1)
+    attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
+    attn_output = attn_output.transpose(0, 1)
+    attn_output = attn_output.reshape(seq_length, -1)
+    return attn_output
+
+
 def eager_attention(
     q: torch.Tensor,
     k: torch.Tensor,
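Note (not part of the diff): the added kernel runs F.scaled_dot_product_attention over a packed batch, where several variable-length sequences are concatenated along dim 0 and q_cu_seqlens holds their cumulative lengths, so a block-diagonal boolean mask keeps tokens from attending across sequence boundaries. A minimal standalone sketch of that pattern, with made-up sequence lengths and head sizes, follows:

import torch
import torch.nn.functional as F

# Assumed toy shapes: three sequences of lengths 3, 5 and 4 packed along dim 0,
# giving q, k, v of shape (tot_seqlens, num_heads, head_dim).
seqlens = torch.tensor([3, 5, 4])
cu_seqlens = torch.cat([torch.zeros(1, dtype=torch.int32), seqlens.cumsum(0).to(torch.int32)])
num_heads, head_dim = 2, 8
tot = int(seqlens.sum())
q = k = v = torch.randn(tot, num_heads, head_dim)

# Block-diagonal mask: True where attention is allowed, i.e. within one sequence.
mask = torch.zeros(1, tot, tot, dtype=torch.bool)
for i in range(1, len(cu_seqlens)):
    s, e = int(cu_seqlens[i - 1]), int(cu_seqlens[i])
    mask[..., s:e, s:e] = True

# SDPA treats the leading dim as batch, so move heads in front of the sequence dim.
out = F.scaled_dot_product_attention(
    q.transpose(0, 1), k.transpose(0, 1), v.transpose(0, 1), mask, dropout_p=0.0
)
out = out.transpose(0, 1).reshape(tot, -1)  # (tot_seqlens, num_heads * head_dim)
print(out.shape)  # torch.Size([12, 16])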
@@ -178,6 +210,7 @@ def eager_attention
 
 VL_VISION_ATTENTION_FUNCTIONS = {
     "flash_attention_2": multihead_attention,
+    "sdpa": sdpa_attention,
     "eager": eager_attention,
 }
 
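Usage note (an assumption, not shown in this commit): with _supports_sdpa = True on the vision tower and the "sdpa" key registered above, requesting SDPA at load time should route the vision attention through the new kernel. The repo id and dtype below are illustrative guesses, not taken from this file:

import torch
from transformers import AutoModelForCausalLM

# Hypothetical load call; "sdpa" here is the key added to
# VL_VISION_ATTENTION_FUNCTIONS in this commit.
model = AutoModelForCausalLM.from_pretrained(
    "moonshotai/Kimi-VL-A3B-Instruct",  # assumed repo id
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    attn_implementation="sdpa",
)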
@@ -2230,7 +2263,6 @@ class MoonVitPretrainedModel(PreTrainedModel):
     _no_split_modules = ["PackingTransformer"]
     _supports_flash_attn_2 = True
     _supports_sdpa = True
-
     def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
         super().__init__(config, *inputs, **kwargs)
         config = deepcopy(config)