xiazhi committed (verified)
Commit b8d31f6 · 1 Parent(s): aecc019

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,83 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ tags:
+ - diffusion
+ - vision-language
+ - qwen2.5-vl
+ ---
+
+ # DiffusionVL
+
+ DiffusionVL is a vision-language model built on the Qwen2.5-VL architecture. Instead of autoregressive decoding, it generates text with BD3LM (Block Diffusion Language Model) diffusion-based generation.
+
+ ## Usage
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ import torch
+
+ # Load the model with trust_remote_code (the repository ships custom modeling code)
+ model = AutoModelForCausalLM.from_pretrained(
+     "path/to/model",
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True,
+ )
+
+ # Load the processor (includes the tokenizer)
+ processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
+
+ # Image + text generation
+ from PIL import Image
+ import requests
+
+ url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+ messages = [
+     {"role": "user", "content": [
+         {"type": "image"},
+         {"type": "text", "text": "Describe this image."}
+     ]}
+ ]
+ text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
+ inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+
+ # Generate with diffusion
+ output_ids = model.generate(
+     inputs=inputs["input_ids"],
+     images=inputs.get("pixel_values"),
+     image_grid_thws=inputs.get("image_grid_thw"),
+     gen_length=256,
+     steps=8,
+     temperature=0.0,
+     remasking_strategy="low_confidence_static",
+ )
+
+ # Decode the output
+ output_text = processor.decode(output_ids[0], skip_special_tokens=True)
+ print(output_text)
+ ```
+
+ ## Generation Parameters
+
+ The `generate` call shown above accepts the following arguments (see the sampling example after this list):
+
+ - `gen_length`: Number of tokens to generate (default: 256)
+ - `steps`: Number of diffusion steps per block (default: 8)
+ - `temperature`: Sampling temperature, 0 for greedy decoding (default: 0.0)
+ - `top_k`: Top-k sampling parameter (default: 0, disabled)
+ - `top_p`: Top-p (nucleus) sampling parameter (default: 1.0)
+ - `remasking_strategy`: 'low_confidence' or 'sequential' (default: 'low_confidence')
+
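+ For example, a sampled (non-greedy) call could look like the sketch below. It continues the Usage snippet above (`model` and `inputs` are already defined); the argument names mirror the `generate` signature shown there, and the specific values are illustrative only.
+
+ ```python
+ # Illustrative sampled generation; values are arbitrary, not recommendations.
+ output_ids = model.generate(
+     inputs=inputs["input_ids"],
+     images=inputs.get("pixel_values"),
+     image_grid_thws=inputs.get("image_grid_thw"),
+     gen_length=128,
+     steps=8,
+     temperature=0.7,
+     top_k=50,
+     top_p=0.9,
+     remasking_strategy="low_confidence",
+ )
+ ```
+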
+ ## Model Configuration
+
+ Key fields from `config.json` (a snippet for reading them back programmatically follows this list):
+
+ - **Architecture**: DiffusionVL_Qwen2_5_VL_ForConditionalGeneration
+ - **BD3LM Enabled**: True
+ - **Block Size**: 8
+ - **Hidden Size**: 3584
+ - **Num Layers**: 28
+
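+ A minimal sketch for confirming these values from the checkpoint directory (the path is a placeholder):
+
+ ```python
+ from transformers import AutoConfig
+
+ config = AutoConfig.from_pretrained("path/to/model", trust_remote_code=True)
+ print(config.enable_bd3lm)       # True
+ print(config.bd3lm_block_size)   # 8
+ print(config.hidden_size)        # 3584
+ print(config.num_hidden_layers)  # 28
+ ```
+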
+ ## Notes
+
+ - Loading requires `trust_remote_code=True` because the repository ships custom modeling code
+ - Both the model and the processor can be loaded from the same directory
+ - Image preprocessing uses Qwen2VLImageProcessor internally (identical to Qwen2.5-VL)
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
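Note: once the repository is downloaded, these special-token ids can be checked against the tokenizer. A minimal sketch (the path is a placeholder):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
tok = processor.tokenizer
# The ids below should match the entries in added_tokens.json above.
print(tok.convert_tokens_to_ids("<|image_pad|>"))     # 151655
print(tok.convert_tokens_to_ids("<|vision_start|>"))  # 151652
```
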
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
+ {% endif %}<|im_start|>{{ message['role'] }}
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}<image>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}<video>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+ {% endif %}
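Note: for reference, this template renders the README's example messages roughly as shown in the comments below. A minimal sketch, assuming the tokenizer in this repository picks up the template above (the path is a placeholder):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."}
    ]}
]
prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <image>Describe this image.<|im_end|>
# <|im_start|>assistant
```
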
config.json ADDED
@@ -0,0 +1,155 @@
+ {
+   "add_faster_video": false,
+   "add_time_instruction": false,
+   "anneal_start_block_size": 1,
+   "architectures": [
+     "DiffusionVL_Qwen2_5_VL_ForConditionalGeneration"
+   ],
+   "attention_dropout": 0.0,
+   "bd3lm_antithetic_sampling": true,
+   "bd3lm_attn_backend": "sdpa",
+   "bd3lm_block_aligned_eos": true,
+   "bd3lm_block_size": 8,
+   "bd3lm_complementary_mask": false,
+   "bd3lm_cross_attn": true,
+   "bd3lm_ignore_bos": true,
+   "bd3lm_mask_prob": 0.5,
+   "bd3lm_noise_granularity": "block",
+   "bd3lm_noise_type": "loglinear",
+   "bd3lm_parameterization": "subs",
+   "bd3lm_resample": false,
+   "bd3lm_sampling_eps_max": 1.0,
+   "bd3lm_sampling_eps_min": 0.001,
+   "bd3lm_time_conditioning": false,
+   "bd3lm_token_shift_prediction": false,
+   "bd3lm_var_min": true,
+   "bos_token_id": 151643,
+   "enable_bd3lm": true,
+   "enable_block_size_annealing": false,
+   "enable_mtd": false,
+   "enable_noise_level_annealing": false,
+   "eos_token_id": 151645,
+   "faster_token_stride": 10,
+   "force_sample": false,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "image_aspect_ratio": "pad",
+   "image_crop_resolution": null,
+   "image_grid_pinpoints": null,
+   "image_split_resolution": null,
+   "image_token_id": null,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_pixels": 262144,
+   "max_position_embeddings": 128000,
+   "max_window_layers": 28,
+   "min_pixels": 147456,
+   "mm_hidden_size": 1280,
+   "mm_newline_position": "grid",
+   "mm_patch_merge_type": "flat",
+   "mm_projector_lr": null,
+   "mm_projector_type": "qwen_merger",
+   "mm_resampler_type": null,
+   "mm_spatial_pool_mode": "bilinear",
+   "mm_spatial_pool_stride": null,
+   "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+   "mm_use_im_patch_token": false,
+   "mm_use_im_start_end": false,
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -2,
+   "mm_vision_tower_lr": 2e-06,
+   "model_max_length": 8192,
+   "model_type": "diffusionvl_qwen2_5_vl",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "pos_skipping_range": 4096,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": {
+     "mrope_section": [
+       16,
+       24,
+       24
+     ],
+     "rope_type": "default",
+     "type": "default"
+   },
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "tokenizer_model_max_length": 8192,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.55.0",
+   "use_cache": true,
+   "use_mm_proj": true,
+   "use_pos_skipping": false,
+   "use_sliding_window": false,
+   "video_token_id": null,
+   "vision_config": {
+     "depth": 32,
+     "fullatt_block_indexes": [
+       7,
+       15,
+       23,
+       31
+     ],
+     "hidden_act": "silu",
+     "hidden_size": 1280,
+     "in_channels": 3,
+     "in_chans": 3,
+     "initializer_range": 0.02,
+     "intermediate_size": 3420,
+     "model_type": "",
+     "num_heads": 16,
+     "out_hidden_size": 3584,
+     "patch_size": 14,
+     "spatial_merge_size": 2,
+     "spatial_patch_size": 14,
+     "temporal_patch_size": 2,
+     "tokens_per_second": 2,
+     "torch_dtype": "float32",
+     "window_size": 112
+   },
+   "vision_end_token_id": 151653,
+   "vision_start_token_id": 151652,
+   "vision_token_id": 151654,
+   "vision_tower_pretrained": null,
+   "vocab_size": 152064,
+   "mask_token_id": 151671,
+   "auto_map": {
+     "AutoConfig": "configuration_diffusionvl_qwen2_5_vl.DiffusionVL_Qwen2_5_VL_Config",
+     "AutoModelForCausalLM": "modeling_diffusionvl_qwen2_5_vl.DiffusionVL_Qwen2_5_VL_ForConditionalGeneration",
+     "AutoProcessor": "processing_diffusionvl_qwen2_5_vl.DiffusionVL_Qwen2_5_VL_Processor"
+   }
+ }
configuration_diffusionvl_qwen2_5_vl.py ADDED
@@ -0,0 +1,280 @@
+ # coding=utf-8
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
+ # and the GPT-NeoX and OPT implementations. It has been modified to create DiffusionVL.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """DiffusionVL (Qwen2.5-VL based) model configuration."""
+
+ from typing import List, Optional, Union
+
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class DiffusionVL_Qwen2_5_VL_VisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`DiffusionVL_Qwen2_5_VL_VisionModel`].
+     It is used to instantiate the vision encoder according to the specified arguments.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+     Read the documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         depth (`int`, *optional*, defaults to 32):
+             Number of vision transformer layers.
+         hidden_size (`int`, *optional*, defaults to 1280):
+             Dimensionality of the encoder layers and the pooler layer.
+         hidden_act (`str`, *optional*, defaults to `"silu"`):
+             The non-linear activation function in the encoder.
+         intermediate_size (`int`, *optional*, defaults to 3420):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer.
+         num_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer.
+         in_channels (`int`, *optional*, defaults to 3):
+             Number of input channels.
+         patch_size (`int`, *optional*, defaults to 14):
+             The size of each image patch.
+         spatial_merge_size (`int`, *optional*, defaults to 2):
+             The spatial merge size for patch merging.
+         temporal_patch_size (`int`, *optional*, defaults to 2):
+             The temporal patch size for video processing.
+         tokens_per_second (`int`, *optional*, defaults to 4):
+             Number of tokens per second for video processing.
+         window_size (`int`, *optional*, defaults to 112):
+             Window size for windowed attention.
+         out_hidden_size (`int`, *optional*, defaults to 3584):
+             Output hidden size after the vision encoder.
+         fullatt_block_indexes (`List[int]`, *optional*):
+             Indices of blocks that use full attention instead of windowed attention.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing weight matrices.
+
+     Example:
+
+     ```python
+     >>> from configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_VisionConfig
+
+     >>> # Initializing a DiffusionVL vision configuration
+     >>> configuration = DiffusionVL_Qwen2_5_VL_VisionConfig()
+     ```
+     """
+
+     model_type = "diffusionvl_qwen2_5_vl_vision"
+     base_config_key = "vision_config"
+
+     def __init__(
+         self,
+         depth: int = 32,
+         hidden_size: int = 1280,
+         hidden_act: str = "silu",
+         intermediate_size: int = 3420,
+         num_heads: int = 16,
+         in_channels: int = 3,
+         patch_size: int = 14,
+         spatial_merge_size: int = 2,
+         temporal_patch_size: int = 2,
+         tokens_per_second: int = 4,
+         window_size: int = 112,
+         out_hidden_size: int = 3584,
+         fullatt_block_indexes: Optional[List[int]] = None,
+         initializer_range: float = 0.02,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.depth = depth
+         self.hidden_size = hidden_size
+         self.hidden_act = hidden_act
+         self.intermediate_size = intermediate_size
+         self.num_heads = num_heads
+         self.in_channels = in_channels
+         self.patch_size = patch_size
+         self.spatial_merge_size = spatial_merge_size
+         self.temporal_patch_size = temporal_patch_size
+         self.tokens_per_second = tokens_per_second
+         self.window_size = window_size
+         self.out_hidden_size = out_hidden_size
+         self.fullatt_block_indexes = fullatt_block_indexes or [7, 15, 23, 31]
+         self.initializer_range = initializer_range
+
+
+ class DiffusionVL_Qwen2_5_VL_Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`DiffusionVL_Qwen2_5_VL_ForConditionalGeneration`].
+     It is used to instantiate a DiffusionVL model according to the specified arguments.
+
+     DiffusionVL extends Qwen2.5-VL architecture with BD3LM (Block Diffusion Language Model)
+     for diffusion-based text generation instead of autoregressive decoding.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+     Read the documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 152064):
+             Vocabulary size of the DiffusionVL model.
+         hidden_size (`int`, *optional*, defaults to 3584):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 18944):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 28):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 28):
+             Number of attention heads for each attention layer.
+         num_key_value_heads (`int`, *optional*, defaults to 4):
+             Number of key-value heads for Grouped Query Attention (GQA).
+         hidden_act (`str`, *optional*, defaults to `"silu"`):
+             The non-linear activation function in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 128000):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the RMS normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether to use the past key/values attentions.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         vision_config (`DiffusionVL_Qwen2_5_VL_VisionConfig`, *optional*):
+             The configuration for the vision encoder.
+         image_token_id (`int`, *optional*, defaults to 151655):
+             The token index for image placeholder.
+         video_token_id (`int`, *optional*, defaults to 151656):
+             The token index for video placeholder.
+         vision_start_token_id (`int`, *optional*, defaults to 151652):
+             The token index denoting start of vision input.
+         vision_end_token_id (`int`, *optional*, defaults to 151653):
+             The token index denoting end of vision input.
+         enable_bd3lm (`bool`, *optional*, defaults to `True`):
+             Whether to enable BD3LM diffusion-based generation.
+         bd3lm_block_size (`int`, *optional*, defaults to 8):
+             Block size for BD3LM generation.
+         bd3lm_cross_attn (`bool`, *optional*, defaults to `True`):
+             Whether to use cross-attention in BD3LM.
+         mask_token_id (`int`, *optional*, defaults to 151671):
+             The token index for mask token used in diffusion.
+         rope_theta (`float`, *optional*, defaults to 1000000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for RoPE embeddings.
+
+     Example:
+
+     ```python
+     >>> from transformers import AutoModelForCausalLM
+     >>> from configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_Config
+
+     >>> # Initializing a DiffusionVL configuration
+     >>> configuration = DiffusionVL_Qwen2_5_VL_Config()
+
+     >>> # Initializing a model from the configuration
+     >>> model = AutoModelForCausalLM.from_pretrained(
+     ...     "path/to/model", config=configuration, trust_remote_code=True
+     ... )
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```
+     """
+
+     model_type = "diffusionvl_qwen2_5_vl"
+     sub_configs = {"vision_config": DiffusionVL_Qwen2_5_VL_VisionConfig}
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size: int = 152064,
+         hidden_size: int = 3584,
+         intermediate_size: int = 18944,
+         num_hidden_layers: int = 28,
+         num_attention_heads: int = 28,
+         num_key_value_heads: int = 4,
+         hidden_act: str = "silu",
+         max_position_embeddings: int = 128000,
+         initializer_range: float = 0.02,
+         rms_norm_eps: float = 1e-6,
+         use_cache: bool = True,
+         tie_word_embeddings: bool = False,
+         attention_dropout: float = 0.0,
+         # Vision configuration
+         vision_config: Optional[Union[DiffusionVL_Qwen2_5_VL_VisionConfig, dict]] = None,
+         # Multimodal token IDs
+         image_token_id: int = 151655,
+         video_token_id: int = 151656,
+         vision_start_token_id: int = 151652,
+         vision_end_token_id: int = 151653,
+         # BD3LM diffusion parameters
+         enable_bd3lm: bool = True,
+         bd3lm_block_size: int = 8,
+         bd3lm_cross_attn: bool = True,
+         bd3lm_antithetic_sampling: bool = True,
+         bd3lm_sampling_eps_min: float = 1e-3,
+         bd3lm_sampling_eps_max: float = 1.0,
+         mask_token_id: int = 151671,
+         # RoPE parameters
+         rope_theta: float = 1000000.0,
+         rope_scaling: Optional[dict] = None,
+         **kwargs,
+     ):
+         # Text model configuration
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.attention_dropout = attention_dropout
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling or {
+             "mrope_section": [16, 24, 24],
+             "rope_type": "default",
+             "type": "default",
+         }
+
+         # Vision configuration
+         if vision_config is None:
+             self.vision_config = DiffusionVL_Qwen2_5_VL_VisionConfig()
+         elif isinstance(vision_config, dict):
+             self.vision_config = DiffusionVL_Qwen2_5_VL_VisionConfig(**vision_config)
+         elif isinstance(vision_config, DiffusionVL_Qwen2_5_VL_VisionConfig):
+             self.vision_config = vision_config
+         else:
+             self.vision_config = DiffusionVL_Qwen2_5_VL_VisionConfig()
+
+         # Multimodal token IDs
+         self.image_token_id = image_token_id
+         self.video_token_id = video_token_id
+         self.vision_start_token_id = vision_start_token_id
+         self.vision_end_token_id = vision_end_token_id
+
+         # BD3LM diffusion configuration
+         self.enable_bd3lm = enable_bd3lm
+         self.bd3lm_block_size = bd3lm_block_size
+         self.bd3lm_cross_attn = bd3lm_cross_attn
+         self.bd3lm_antithetic_sampling = bd3lm_antithetic_sampling
+         self.bd3lm_sampling_eps_min = bd3lm_sampling_eps_min
+         self.bd3lm_sampling_eps_max = bd3lm_sampling_eps_max
+         self.mask_token_id = mask_token_id
+
+         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+ __all__ = ["DiffusionVL_Qwen2_5_VL_Config", "DiffusionVL_Qwen2_5_VL_VisionConfig"]
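Note: as a quick sanity check, the configuration class above can be instantiated directly with custom BD3LM settings. A minimal sketch, assuming it is run from a local clone of this repository so the module is importable (the overridden values are arbitrary):

```python
from configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_Config

# Override a couple of BD3LM-related fields; everything else keeps the defaults above.
config = DiffusionVL_Qwen2_5_VL_Config(bd3lm_block_size=16, enable_bd3lm=True)
print(config.bd3lm_block_size)               # 16
print(config.vision_config.out_hidden_size)  # 3584 (default vision config)
print(config.rope_scaling)                   # {'mrope_section': [16, 24, 24], ...}
```
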
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e753b920ec44c00e7c1395c2d048e594cbd46663ddcac8f1b6c9aa16cea2a86f
+ size 4877660744
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:896fbe05de22acd29ccdcd719c45c1e99cbdb4c1bd1c00339574b1cf565c2bd6
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ef02d3158f2e613522a6512e4170479555a8d12baf5a22d39c0bdf0b548e5ff
+ size 4995019896
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c8b1fbd664a0cd6dc23bbb30f0c0199635de1c237cffd64856096af8c43898e
+ size 1778992544
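Note: the four shards above are resolved through `model.safetensors.index.json`, shown next. `from_pretrained` handles this automatically; a minimal sketch for inspecting a single weight by hand, assuming the files have been downloaded into the current directory:

```python
import json
from safetensors import safe_open

# Map a weight name to its shard via the index, then open that shard lazily.
with open("model.safetensors.index.json") as fp:
    index = json.load(fp)
shard = index["weight_map"]["model.embed_tokens.weight"]  # -> "model-00001-of-00004.safetensors"
with safe_open(shard, framework="pt", device="cpu") as f:
    print(f.get_tensor("model.embed_tokens.weight").shape)
```
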
model.safetensors.index.json ADDED
@@ -0,0 +1,737 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 848896,
4
+ "total_size": 16584333320
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
308
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
320
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
321
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
332
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
333
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "model.mm_projector.merger.ln_q.weight": "model-00004-of-00004.safetensors",
346
+ "model.mm_projector.merger.mlp.0.bias": "model-00004-of-00004.safetensors",
347
+ "model.mm_projector.merger.mlp.0.weight": "model-00004-of-00004.safetensors",
348
+ "model.mm_projector.merger.mlp.2.bias": "model-00004-of-00004.safetensors",
349
+ "model.mm_projector.merger.mlp.2.weight": "model-00004-of-00004.safetensors",
350
+ "model.norm.weight": "model-00003-of-00004.safetensors",
351
+ "model.vision_tower.vision_tower.blocks.0.attn.proj.bias": "model-00003-of-00004.safetensors",
352
+ "model.vision_tower.vision_tower.blocks.0.attn.proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.vision_tower.vision_tower.blocks.0.attn.qkv.bias": "model-00003-of-00004.safetensors",
354
+ "model.vision_tower.vision_tower.blocks.0.attn.qkv.weight": "model-00003-of-00004.safetensors",
355
+ "model.vision_tower.vision_tower.blocks.0.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
356
+ "model.vision_tower.vision_tower.blocks.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
357
+ "model.vision_tower.vision_tower.blocks.0.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
358
+ "model.vision_tower.vision_tower.blocks.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
359
+ "model.vision_tower.vision_tower.blocks.0.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
360
+ "model.vision_tower.vision_tower.blocks.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
361
+ "model.vision_tower.vision_tower.blocks.0.norm1.weight": "model-00003-of-00004.safetensors",
362
+ "model.vision_tower.vision_tower.blocks.0.norm2.weight": "model-00003-of-00004.safetensors",
363
+ "model.vision_tower.vision_tower.blocks.1.attn.proj.bias": "model-00003-of-00004.safetensors",
364
+ "model.vision_tower.vision_tower.blocks.1.attn.proj.weight": "model-00003-of-00004.safetensors",
365
+ "model.vision_tower.vision_tower.blocks.1.attn.qkv.bias": "model-00003-of-00004.safetensors",
366
+ "model.vision_tower.vision_tower.blocks.1.attn.qkv.weight": "model-00003-of-00004.safetensors",
367
+ "model.vision_tower.vision_tower.blocks.1.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
368
+ "model.vision_tower.vision_tower.blocks.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
369
+ "model.vision_tower.vision_tower.blocks.1.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
370
+ "model.vision_tower.vision_tower.blocks.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
371
+ "model.vision_tower.vision_tower.blocks.1.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
372
+ "model.vision_tower.vision_tower.blocks.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
373
+ "model.vision_tower.vision_tower.blocks.1.norm1.weight": "model-00003-of-00004.safetensors",
374
+ "model.vision_tower.vision_tower.blocks.1.norm2.weight": "model-00003-of-00004.safetensors",
375
+ "model.vision_tower.vision_tower.blocks.10.attn.proj.bias": "model-00003-of-00004.safetensors",
376
+ "model.vision_tower.vision_tower.blocks.10.attn.proj.weight": "model-00003-of-00004.safetensors",
377
+ "model.vision_tower.vision_tower.blocks.10.attn.qkv.bias": "model-00003-of-00004.safetensors",
378
+ "model.vision_tower.vision_tower.blocks.10.attn.qkv.weight": "model-00003-of-00004.safetensors",
379
+ "model.vision_tower.vision_tower.blocks.10.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
380
+ "model.vision_tower.vision_tower.blocks.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
381
+ "model.vision_tower.vision_tower.blocks.10.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
382
+ "model.vision_tower.vision_tower.blocks.10.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
383
+ "model.vision_tower.vision_tower.blocks.10.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
384
+ "model.vision_tower.vision_tower.blocks.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
385
+ "model.vision_tower.vision_tower.blocks.10.norm1.weight": "model-00003-of-00004.safetensors",
386
+ "model.vision_tower.vision_tower.blocks.10.norm2.weight": "model-00003-of-00004.safetensors",
387
+ "model.vision_tower.vision_tower.blocks.11.attn.proj.bias": "model-00003-of-00004.safetensors",
388
+ "model.vision_tower.vision_tower.blocks.11.attn.proj.weight": "model-00003-of-00004.safetensors",
389
+ "model.vision_tower.vision_tower.blocks.11.attn.qkv.bias": "model-00003-of-00004.safetensors",
390
+ "model.vision_tower.vision_tower.blocks.11.attn.qkv.weight": "model-00003-of-00004.safetensors",
391
+ "model.vision_tower.vision_tower.blocks.11.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
392
+ "model.vision_tower.vision_tower.blocks.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
393
+ "model.vision_tower.vision_tower.blocks.11.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
394
+ "model.vision_tower.vision_tower.blocks.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
395
+ "model.vision_tower.vision_tower.blocks.11.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
396
+ "model.vision_tower.vision_tower.blocks.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
397
+ "model.vision_tower.vision_tower.blocks.11.norm1.weight": "model-00003-of-00004.safetensors",
398
+ "model.vision_tower.vision_tower.blocks.11.norm2.weight": "model-00003-of-00004.safetensors",
399
+ "model.vision_tower.vision_tower.blocks.12.attn.proj.bias": "model-00003-of-00004.safetensors",
400
+ "model.vision_tower.vision_tower.blocks.12.attn.proj.weight": "model-00003-of-00004.safetensors",
401
+ "model.vision_tower.vision_tower.blocks.12.attn.qkv.bias": "model-00003-of-00004.safetensors",
402
+ "model.vision_tower.vision_tower.blocks.12.attn.qkv.weight": "model-00003-of-00004.safetensors",
403
+ "model.vision_tower.vision_tower.blocks.12.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
404
+ "model.vision_tower.vision_tower.blocks.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
405
+ "model.vision_tower.vision_tower.blocks.12.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
406
+ "model.vision_tower.vision_tower.blocks.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
407
+ "model.vision_tower.vision_tower.blocks.12.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
408
+ "model.vision_tower.vision_tower.blocks.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
409
+ "model.vision_tower.vision_tower.blocks.12.norm1.weight": "model-00003-of-00004.safetensors",
410
+ "model.vision_tower.vision_tower.blocks.12.norm2.weight": "model-00003-of-00004.safetensors",
411
+ "model.vision_tower.vision_tower.blocks.13.attn.proj.bias": "model-00003-of-00004.safetensors",
412
+ "model.vision_tower.vision_tower.blocks.13.attn.proj.weight": "model-00003-of-00004.safetensors",
413
+ "model.vision_tower.vision_tower.blocks.13.attn.qkv.bias": "model-00003-of-00004.safetensors",
414
+ "model.vision_tower.vision_tower.blocks.13.attn.qkv.weight": "model-00003-of-00004.safetensors",
415
+ "model.vision_tower.vision_tower.blocks.13.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
416
+ "model.vision_tower.vision_tower.blocks.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
417
+ "model.vision_tower.vision_tower.blocks.13.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
418
+ "model.vision_tower.vision_tower.blocks.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
419
+ "model.vision_tower.vision_tower.blocks.13.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
420
+ "model.vision_tower.vision_tower.blocks.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
421
+ "model.vision_tower.vision_tower.blocks.13.norm1.weight": "model-00003-of-00004.safetensors",
422
+ "model.vision_tower.vision_tower.blocks.13.norm2.weight": "model-00003-of-00004.safetensors",
423
+ "model.vision_tower.vision_tower.blocks.14.attn.proj.bias": "model-00003-of-00004.safetensors",
424
+ "model.vision_tower.vision_tower.blocks.14.attn.proj.weight": "model-00003-of-00004.safetensors",
425
+ "model.vision_tower.vision_tower.blocks.14.attn.qkv.bias": "model-00003-of-00004.safetensors",
426
+ "model.vision_tower.vision_tower.blocks.14.attn.qkv.weight": "model-00003-of-00004.safetensors",
427
+ "model.vision_tower.vision_tower.blocks.14.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
428
+ "model.vision_tower.vision_tower.blocks.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
429
+ "model.vision_tower.vision_tower.blocks.14.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
430
+ "model.vision_tower.vision_tower.blocks.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
431
+ "model.vision_tower.vision_tower.blocks.14.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
432
+ "model.vision_tower.vision_tower.blocks.14.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
433
+ "model.vision_tower.vision_tower.blocks.14.norm1.weight": "model-00003-of-00004.safetensors",
434
+ "model.vision_tower.vision_tower.blocks.14.norm2.weight": "model-00003-of-00004.safetensors",
435
+ "model.vision_tower.vision_tower.blocks.15.attn.proj.bias": "model-00003-of-00004.safetensors",
436
+ "model.vision_tower.vision_tower.blocks.15.attn.proj.weight": "model-00003-of-00004.safetensors",
437
+ "model.vision_tower.vision_tower.blocks.15.attn.qkv.bias": "model-00003-of-00004.safetensors",
438
+ "model.vision_tower.vision_tower.blocks.15.attn.qkv.weight": "model-00003-of-00004.safetensors",
439
+ "model.vision_tower.vision_tower.blocks.15.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
440
+ "model.vision_tower.vision_tower.blocks.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
441
+ "model.vision_tower.vision_tower.blocks.15.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
442
+ "model.vision_tower.vision_tower.blocks.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
443
+ "model.vision_tower.vision_tower.blocks.15.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
444
+ "model.vision_tower.vision_tower.blocks.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
445
+ "model.vision_tower.vision_tower.blocks.15.norm1.weight": "model-00003-of-00004.safetensors",
446
+ "model.vision_tower.vision_tower.blocks.15.norm2.weight": "model-00003-of-00004.safetensors",
447
+ "model.vision_tower.vision_tower.blocks.16.attn.proj.bias": "model-00003-of-00004.safetensors",
448
+ "model.vision_tower.vision_tower.blocks.16.attn.proj.weight": "model-00003-of-00004.safetensors",
449
+ "model.vision_tower.vision_tower.blocks.16.attn.qkv.bias": "model-00003-of-00004.safetensors",
450
+ "model.vision_tower.vision_tower.blocks.16.attn.qkv.weight": "model-00003-of-00004.safetensors",
451
+ "model.vision_tower.vision_tower.blocks.16.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
452
+ "model.vision_tower.vision_tower.blocks.16.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
453
+ "model.vision_tower.vision_tower.blocks.16.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
454
+ "model.vision_tower.vision_tower.blocks.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
455
+ "model.vision_tower.vision_tower.blocks.16.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
456
+ "model.vision_tower.vision_tower.blocks.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
457
+ "model.vision_tower.vision_tower.blocks.16.norm1.weight": "model-00003-of-00004.safetensors",
458
+ "model.vision_tower.vision_tower.blocks.16.norm2.weight": "model-00003-of-00004.safetensors",
459
+ "model.vision_tower.vision_tower.blocks.17.attn.proj.bias": "model-00004-of-00004.safetensors",
460
+ "model.vision_tower.vision_tower.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
461
+ "model.vision_tower.vision_tower.blocks.17.attn.qkv.bias": "model-00004-of-00004.safetensors",
462
+ "model.vision_tower.vision_tower.blocks.17.attn.qkv.weight": "model-00004-of-00004.safetensors",
463
+ "model.vision_tower.vision_tower.blocks.17.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
464
+ "model.vision_tower.vision_tower.blocks.17.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
465
+ "model.vision_tower.vision_tower.blocks.17.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
466
+ "model.vision_tower.vision_tower.blocks.17.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
467
+ "model.vision_tower.vision_tower.blocks.17.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
468
+ "model.vision_tower.vision_tower.blocks.17.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
469
+ "model.vision_tower.vision_tower.blocks.17.norm1.weight": "model-00004-of-00004.safetensors",
470
+ "model.vision_tower.vision_tower.blocks.17.norm2.weight": "model-00004-of-00004.safetensors",
471
+ "model.vision_tower.vision_tower.blocks.18.attn.proj.bias": "model-00004-of-00004.safetensors",
472
+ "model.vision_tower.vision_tower.blocks.18.attn.proj.weight": "model-00004-of-00004.safetensors",
473
+ "model.vision_tower.vision_tower.blocks.18.attn.qkv.bias": "model-00004-of-00004.safetensors",
474
+ "model.vision_tower.vision_tower.blocks.18.attn.qkv.weight": "model-00004-of-00004.safetensors",
475
+ "model.vision_tower.vision_tower.blocks.18.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
476
+ "model.vision_tower.vision_tower.blocks.18.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
477
+ "model.vision_tower.vision_tower.blocks.18.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
478
+ "model.vision_tower.vision_tower.blocks.18.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
479
+ "model.vision_tower.vision_tower.blocks.18.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
480
+ "model.vision_tower.vision_tower.blocks.18.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
481
+ "model.vision_tower.vision_tower.blocks.18.norm1.weight": "model-00004-of-00004.safetensors",
482
+ "model.vision_tower.vision_tower.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
483
+ "model.vision_tower.vision_tower.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
484
+ "model.vision_tower.vision_tower.blocks.19.attn.proj.weight": "model-00004-of-00004.safetensors",
485
+ "model.vision_tower.vision_tower.blocks.19.attn.qkv.bias": "model-00004-of-00004.safetensors",
486
+ "model.vision_tower.vision_tower.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
487
+ "model.vision_tower.vision_tower.blocks.19.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
488
+ "model.vision_tower.vision_tower.blocks.19.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
489
+ "model.vision_tower.vision_tower.blocks.19.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
490
+ "model.vision_tower.vision_tower.blocks.19.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
491
+ "model.vision_tower.vision_tower.blocks.19.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
492
+ "model.vision_tower.vision_tower.blocks.19.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
493
+ "model.vision_tower.vision_tower.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
494
+ "model.vision_tower.vision_tower.blocks.19.norm2.weight": "model-00004-of-00004.safetensors",
495
+ "model.vision_tower.vision_tower.blocks.2.attn.proj.bias": "model-00003-of-00004.safetensors",
496
+ "model.vision_tower.vision_tower.blocks.2.attn.proj.weight": "model-00003-of-00004.safetensors",
497
+ "model.vision_tower.vision_tower.blocks.2.attn.qkv.bias": "model-00003-of-00004.safetensors",
498
+ "model.vision_tower.vision_tower.blocks.2.attn.qkv.weight": "model-00003-of-00004.safetensors",
499
+ "model.vision_tower.vision_tower.blocks.2.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
500
+ "model.vision_tower.vision_tower.blocks.2.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
501
+ "model.vision_tower.vision_tower.blocks.2.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
502
+ "model.vision_tower.vision_tower.blocks.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
503
+ "model.vision_tower.vision_tower.blocks.2.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
504
+ "model.vision_tower.vision_tower.blocks.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
505
+ "model.vision_tower.vision_tower.blocks.2.norm1.weight": "model-00003-of-00004.safetensors",
506
+ "model.vision_tower.vision_tower.blocks.2.norm2.weight": "model-00003-of-00004.safetensors",
507
+ "model.vision_tower.vision_tower.blocks.20.attn.proj.bias": "model-00004-of-00004.safetensors",
508
+ "model.vision_tower.vision_tower.blocks.20.attn.proj.weight": "model-00004-of-00004.safetensors",
509
+ "model.vision_tower.vision_tower.blocks.20.attn.qkv.bias": "model-00004-of-00004.safetensors",
510
+ "model.vision_tower.vision_tower.blocks.20.attn.qkv.weight": "model-00004-of-00004.safetensors",
511
+ "model.vision_tower.vision_tower.blocks.20.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
512
+ "model.vision_tower.vision_tower.blocks.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
513
+ "model.vision_tower.vision_tower.blocks.20.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
514
+ "model.vision_tower.vision_tower.blocks.20.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
515
+ "model.vision_tower.vision_tower.blocks.20.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
516
+ "model.vision_tower.vision_tower.blocks.20.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
517
+ "model.vision_tower.vision_tower.blocks.20.norm1.weight": "model-00004-of-00004.safetensors",
518
+ "model.vision_tower.vision_tower.blocks.20.norm2.weight": "model-00004-of-00004.safetensors",
519
+ "model.vision_tower.vision_tower.blocks.21.attn.proj.bias": "model-00004-of-00004.safetensors",
520
+ "model.vision_tower.vision_tower.blocks.21.attn.proj.weight": "model-00004-of-00004.safetensors",
521
+ "model.vision_tower.vision_tower.blocks.21.attn.qkv.bias": "model-00004-of-00004.safetensors",
522
+ "model.vision_tower.vision_tower.blocks.21.attn.qkv.weight": "model-00004-of-00004.safetensors",
523
+ "model.vision_tower.vision_tower.blocks.21.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
524
+ "model.vision_tower.vision_tower.blocks.21.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
525
+ "model.vision_tower.vision_tower.blocks.21.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
526
+ "model.vision_tower.vision_tower.blocks.21.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
527
+ "model.vision_tower.vision_tower.blocks.21.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
528
+ "model.vision_tower.vision_tower.blocks.21.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
529
+ "model.vision_tower.vision_tower.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
530
+ "model.vision_tower.vision_tower.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
531
+ "model.vision_tower.vision_tower.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
532
+ "model.vision_tower.vision_tower.blocks.22.attn.proj.weight": "model-00004-of-00004.safetensors",
533
+ "model.vision_tower.vision_tower.blocks.22.attn.qkv.bias": "model-00004-of-00004.safetensors",
534
+ "model.vision_tower.vision_tower.blocks.22.attn.qkv.weight": "model-00004-of-00004.safetensors",
535
+ "model.vision_tower.vision_tower.blocks.22.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
536
+ "model.vision_tower.vision_tower.blocks.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
537
+ "model.vision_tower.vision_tower.blocks.22.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
538
+ "model.vision_tower.vision_tower.blocks.22.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
539
+ "model.vision_tower.vision_tower.blocks.22.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
540
+ "model.vision_tower.vision_tower.blocks.22.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
541
+ "model.vision_tower.vision_tower.blocks.22.norm1.weight": "model-00004-of-00004.safetensors",
542
+ "model.vision_tower.vision_tower.blocks.22.norm2.weight": "model-00004-of-00004.safetensors",
543
+ "model.vision_tower.vision_tower.blocks.23.attn.proj.bias": "model-00004-of-00004.safetensors",
544
+ "model.vision_tower.vision_tower.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors",
545
+ "model.vision_tower.vision_tower.blocks.23.attn.qkv.bias": "model-00004-of-00004.safetensors",
546
+ "model.vision_tower.vision_tower.blocks.23.attn.qkv.weight": "model-00004-of-00004.safetensors",
547
+ "model.vision_tower.vision_tower.blocks.23.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
548
+ "model.vision_tower.vision_tower.blocks.23.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
549
+ "model.vision_tower.vision_tower.blocks.23.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
550
+ "model.vision_tower.vision_tower.blocks.23.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
551
+ "model.vision_tower.vision_tower.blocks.23.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
552
+ "model.vision_tower.vision_tower.blocks.23.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
553
+ "model.vision_tower.vision_tower.blocks.23.norm1.weight": "model-00004-of-00004.safetensors",
554
+ "model.vision_tower.vision_tower.blocks.23.norm2.weight": "model-00004-of-00004.safetensors",
555
+ "model.vision_tower.vision_tower.blocks.24.attn.proj.bias": "model-00004-of-00004.safetensors",
556
+ "model.vision_tower.vision_tower.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors",
557
+ "model.vision_tower.vision_tower.blocks.24.attn.qkv.bias": "model-00004-of-00004.safetensors",
558
+ "model.vision_tower.vision_tower.blocks.24.attn.qkv.weight": "model-00004-of-00004.safetensors",
559
+ "model.vision_tower.vision_tower.blocks.24.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
560
+ "model.vision_tower.vision_tower.blocks.24.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
561
+ "model.vision_tower.vision_tower.blocks.24.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
562
+ "model.vision_tower.vision_tower.blocks.24.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
563
+ "model.vision_tower.vision_tower.blocks.24.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
564
+ "model.vision_tower.vision_tower.blocks.24.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
565
+ "model.vision_tower.vision_tower.blocks.24.norm1.weight": "model-00004-of-00004.safetensors",
566
+ "model.vision_tower.vision_tower.blocks.24.norm2.weight": "model-00004-of-00004.safetensors",
567
+ "model.vision_tower.vision_tower.blocks.25.attn.proj.bias": "model-00004-of-00004.safetensors",
568
+ "model.vision_tower.vision_tower.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors",
569
+ "model.vision_tower.vision_tower.blocks.25.attn.qkv.bias": "model-00004-of-00004.safetensors",
570
+ "model.vision_tower.vision_tower.blocks.25.attn.qkv.weight": "model-00004-of-00004.safetensors",
571
+ "model.vision_tower.vision_tower.blocks.25.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
572
+ "model.vision_tower.vision_tower.blocks.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
573
+ "model.vision_tower.vision_tower.blocks.25.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
574
+ "model.vision_tower.vision_tower.blocks.25.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
575
+ "model.vision_tower.vision_tower.blocks.25.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
576
+ "model.vision_tower.vision_tower.blocks.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
577
+ "model.vision_tower.vision_tower.blocks.25.norm1.weight": "model-00004-of-00004.safetensors",
578
+ "model.vision_tower.vision_tower.blocks.25.norm2.weight": "model-00004-of-00004.safetensors",
579
+ "model.vision_tower.vision_tower.blocks.26.attn.proj.bias": "model-00004-of-00004.safetensors",
580
+ "model.vision_tower.vision_tower.blocks.26.attn.proj.weight": "model-00004-of-00004.safetensors",
581
+ "model.vision_tower.vision_tower.blocks.26.attn.qkv.bias": "model-00004-of-00004.safetensors",
582
+ "model.vision_tower.vision_tower.blocks.26.attn.qkv.weight": "model-00004-of-00004.safetensors",
583
+ "model.vision_tower.vision_tower.blocks.26.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
584
+ "model.vision_tower.vision_tower.blocks.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
585
+ "model.vision_tower.vision_tower.blocks.26.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
586
+ "model.vision_tower.vision_tower.blocks.26.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
587
+ "model.vision_tower.vision_tower.blocks.26.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
588
+ "model.vision_tower.vision_tower.blocks.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
589
+ "model.vision_tower.vision_tower.blocks.26.norm1.weight": "model-00004-of-00004.safetensors",
590
+ "model.vision_tower.vision_tower.blocks.26.norm2.weight": "model-00004-of-00004.safetensors",
591
+ "model.vision_tower.vision_tower.blocks.27.attn.proj.bias": "model-00004-of-00004.safetensors",
592
+ "model.vision_tower.vision_tower.blocks.27.attn.proj.weight": "model-00004-of-00004.safetensors",
593
+ "model.vision_tower.vision_tower.blocks.27.attn.qkv.bias": "model-00004-of-00004.safetensors",
594
+ "model.vision_tower.vision_tower.blocks.27.attn.qkv.weight": "model-00004-of-00004.safetensors",
595
+ "model.vision_tower.vision_tower.blocks.27.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
596
+ "model.vision_tower.vision_tower.blocks.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
597
+ "model.vision_tower.vision_tower.blocks.27.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
598
+ "model.vision_tower.vision_tower.blocks.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
599
+ "model.vision_tower.vision_tower.blocks.27.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
600
+ "model.vision_tower.vision_tower.blocks.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
601
+ "model.vision_tower.vision_tower.blocks.27.norm1.weight": "model-00004-of-00004.safetensors",
602
+ "model.vision_tower.vision_tower.blocks.27.norm2.weight": "model-00004-of-00004.safetensors",
603
+ "model.vision_tower.vision_tower.blocks.28.attn.proj.bias": "model-00004-of-00004.safetensors",
604
+ "model.vision_tower.vision_tower.blocks.28.attn.proj.weight": "model-00004-of-00004.safetensors",
605
+ "model.vision_tower.vision_tower.blocks.28.attn.qkv.bias": "model-00004-of-00004.safetensors",
606
+ "model.vision_tower.vision_tower.blocks.28.attn.qkv.weight": "model-00004-of-00004.safetensors",
607
+ "model.vision_tower.vision_tower.blocks.28.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
608
+ "model.vision_tower.vision_tower.blocks.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
609
+ "model.vision_tower.vision_tower.blocks.28.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
610
+ "model.vision_tower.vision_tower.blocks.28.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
611
+ "model.vision_tower.vision_tower.blocks.28.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
612
+ "model.vision_tower.vision_tower.blocks.28.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
613
+ "model.vision_tower.vision_tower.blocks.28.norm1.weight": "model-00004-of-00004.safetensors",
614
+ "model.vision_tower.vision_tower.blocks.28.norm2.weight": "model-00004-of-00004.safetensors",
615
+ "model.vision_tower.vision_tower.blocks.29.attn.proj.bias": "model-00004-of-00004.safetensors",
616
+ "model.vision_tower.vision_tower.blocks.29.attn.proj.weight": "model-00004-of-00004.safetensors",
617
+ "model.vision_tower.vision_tower.blocks.29.attn.qkv.bias": "model-00004-of-00004.safetensors",
618
+ "model.vision_tower.vision_tower.blocks.29.attn.qkv.weight": "model-00004-of-00004.safetensors",
619
+ "model.vision_tower.vision_tower.blocks.29.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
620
+ "model.vision_tower.vision_tower.blocks.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
621
+ "model.vision_tower.vision_tower.blocks.29.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
622
+ "model.vision_tower.vision_tower.blocks.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
623
+ "model.vision_tower.vision_tower.blocks.29.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
624
+ "model.vision_tower.vision_tower.blocks.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
625
+ "model.vision_tower.vision_tower.blocks.29.norm1.weight": "model-00004-of-00004.safetensors",
626
+ "model.vision_tower.vision_tower.blocks.29.norm2.weight": "model-00004-of-00004.safetensors",
627
+ "model.vision_tower.vision_tower.blocks.3.attn.proj.bias": "model-00003-of-00004.safetensors",
628
+ "model.vision_tower.vision_tower.blocks.3.attn.proj.weight": "model-00003-of-00004.safetensors",
629
+ "model.vision_tower.vision_tower.blocks.3.attn.qkv.bias": "model-00003-of-00004.safetensors",
630
+ "model.vision_tower.vision_tower.blocks.3.attn.qkv.weight": "model-00003-of-00004.safetensors",
631
+ "model.vision_tower.vision_tower.blocks.3.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
632
+ "model.vision_tower.vision_tower.blocks.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
633
+ "model.vision_tower.vision_tower.blocks.3.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
634
+ "model.vision_tower.vision_tower.blocks.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
635
+ "model.vision_tower.vision_tower.blocks.3.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
636
+ "model.vision_tower.vision_tower.blocks.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
637
+ "model.vision_tower.vision_tower.blocks.3.norm1.weight": "model-00003-of-00004.safetensors",
638
+ "model.vision_tower.vision_tower.blocks.3.norm2.weight": "model-00003-of-00004.safetensors",
639
+ "model.vision_tower.vision_tower.blocks.30.attn.proj.bias": "model-00004-of-00004.safetensors",
640
+ "model.vision_tower.vision_tower.blocks.30.attn.proj.weight": "model-00004-of-00004.safetensors",
641
+ "model.vision_tower.vision_tower.blocks.30.attn.qkv.bias": "model-00004-of-00004.safetensors",
642
+ "model.vision_tower.vision_tower.blocks.30.attn.qkv.weight": "model-00004-of-00004.safetensors",
643
+ "model.vision_tower.vision_tower.blocks.30.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
644
+ "model.vision_tower.vision_tower.blocks.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
645
+ "model.vision_tower.vision_tower.blocks.30.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
646
+ "model.vision_tower.vision_tower.blocks.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
647
+ "model.vision_tower.vision_tower.blocks.30.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
648
+ "model.vision_tower.vision_tower.blocks.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
649
+ "model.vision_tower.vision_tower.blocks.30.norm1.weight": "model-00004-of-00004.safetensors",
650
+ "model.vision_tower.vision_tower.blocks.30.norm2.weight": "model-00004-of-00004.safetensors",
651
+ "model.vision_tower.vision_tower.blocks.31.attn.proj.bias": "model-00004-of-00004.safetensors",
652
+ "model.vision_tower.vision_tower.blocks.31.attn.proj.weight": "model-00004-of-00004.safetensors",
653
+ "model.vision_tower.vision_tower.blocks.31.attn.qkv.bias": "model-00004-of-00004.safetensors",
654
+ "model.vision_tower.vision_tower.blocks.31.attn.qkv.weight": "model-00004-of-00004.safetensors",
655
+ "model.vision_tower.vision_tower.blocks.31.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
656
+ "model.vision_tower.vision_tower.blocks.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
657
+ "model.vision_tower.vision_tower.blocks.31.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
658
+ "model.vision_tower.vision_tower.blocks.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
659
+ "model.vision_tower.vision_tower.blocks.31.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
660
+ "model.vision_tower.vision_tower.blocks.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
661
+ "model.vision_tower.vision_tower.blocks.31.norm1.weight": "model-00004-of-00004.safetensors",
662
+ "model.vision_tower.vision_tower.blocks.31.norm2.weight": "model-00004-of-00004.safetensors",
663
+ "model.vision_tower.vision_tower.blocks.4.attn.proj.bias": "model-00003-of-00004.safetensors",
664
+ "model.vision_tower.vision_tower.blocks.4.attn.proj.weight": "model-00003-of-00004.safetensors",
665
+ "model.vision_tower.vision_tower.blocks.4.attn.qkv.bias": "model-00003-of-00004.safetensors",
666
+ "model.vision_tower.vision_tower.blocks.4.attn.qkv.weight": "model-00003-of-00004.safetensors",
667
+ "model.vision_tower.vision_tower.blocks.4.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
668
+ "model.vision_tower.vision_tower.blocks.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
669
+ "model.vision_tower.vision_tower.blocks.4.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
670
+ "model.vision_tower.vision_tower.blocks.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
671
+ "model.vision_tower.vision_tower.blocks.4.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
672
+ "model.vision_tower.vision_tower.blocks.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
673
+ "model.vision_tower.vision_tower.blocks.4.norm1.weight": "model-00003-of-00004.safetensors",
674
+ "model.vision_tower.vision_tower.blocks.4.norm2.weight": "model-00003-of-00004.safetensors",
675
+ "model.vision_tower.vision_tower.blocks.5.attn.proj.bias": "model-00003-of-00004.safetensors",
676
+ "model.vision_tower.vision_tower.blocks.5.attn.proj.weight": "model-00003-of-00004.safetensors",
677
+ "model.vision_tower.vision_tower.blocks.5.attn.qkv.bias": "model-00003-of-00004.safetensors",
678
+ "model.vision_tower.vision_tower.blocks.5.attn.qkv.weight": "model-00003-of-00004.safetensors",
679
+ "model.vision_tower.vision_tower.blocks.5.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
680
+ "model.vision_tower.vision_tower.blocks.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
681
+ "model.vision_tower.vision_tower.blocks.5.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
682
+ "model.vision_tower.vision_tower.blocks.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
683
+ "model.vision_tower.vision_tower.blocks.5.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
684
+ "model.vision_tower.vision_tower.blocks.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
685
+ "model.vision_tower.vision_tower.blocks.5.norm1.weight": "model-00003-of-00004.safetensors",
686
+ "model.vision_tower.vision_tower.blocks.5.norm2.weight": "model-00003-of-00004.safetensors",
687
+ "model.vision_tower.vision_tower.blocks.6.attn.proj.bias": "model-00003-of-00004.safetensors",
688
+ "model.vision_tower.vision_tower.blocks.6.attn.proj.weight": "model-00003-of-00004.safetensors",
689
+ "model.vision_tower.vision_tower.blocks.6.attn.qkv.bias": "model-00003-of-00004.safetensors",
690
+ "model.vision_tower.vision_tower.blocks.6.attn.qkv.weight": "model-00003-of-00004.safetensors",
691
+ "model.vision_tower.vision_tower.blocks.6.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
692
+ "model.vision_tower.vision_tower.blocks.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
693
+ "model.vision_tower.vision_tower.blocks.6.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
694
+ "model.vision_tower.vision_tower.blocks.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
695
+ "model.vision_tower.vision_tower.blocks.6.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
696
+ "model.vision_tower.vision_tower.blocks.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
697
+ "model.vision_tower.vision_tower.blocks.6.norm1.weight": "model-00003-of-00004.safetensors",
698
+ "model.vision_tower.vision_tower.blocks.6.norm2.weight": "model-00003-of-00004.safetensors",
699
+ "model.vision_tower.vision_tower.blocks.7.attn.proj.bias": "model-00003-of-00004.safetensors",
700
+ "model.vision_tower.vision_tower.blocks.7.attn.proj.weight": "model-00003-of-00004.safetensors",
701
+ "model.vision_tower.vision_tower.blocks.7.attn.qkv.bias": "model-00003-of-00004.safetensors",
702
+ "model.vision_tower.vision_tower.blocks.7.attn.qkv.weight": "model-00003-of-00004.safetensors",
703
+ "model.vision_tower.vision_tower.blocks.7.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
704
+ "model.vision_tower.vision_tower.blocks.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
705
+ "model.vision_tower.vision_tower.blocks.7.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
706
+ "model.vision_tower.vision_tower.blocks.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
707
+ "model.vision_tower.vision_tower.blocks.7.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
708
+ "model.vision_tower.vision_tower.blocks.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
709
+ "model.vision_tower.vision_tower.blocks.7.norm1.weight": "model-00003-of-00004.safetensors",
710
+ "model.vision_tower.vision_tower.blocks.7.norm2.weight": "model-00003-of-00004.safetensors",
711
+ "model.vision_tower.vision_tower.blocks.8.attn.proj.bias": "model-00003-of-00004.safetensors",
712
+ "model.vision_tower.vision_tower.blocks.8.attn.proj.weight": "model-00003-of-00004.safetensors",
713
+ "model.vision_tower.vision_tower.blocks.8.attn.qkv.bias": "model-00003-of-00004.safetensors",
714
+ "model.vision_tower.vision_tower.blocks.8.attn.qkv.weight": "model-00003-of-00004.safetensors",
715
+ "model.vision_tower.vision_tower.blocks.8.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
716
+ "model.vision_tower.vision_tower.blocks.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
717
+ "model.vision_tower.vision_tower.blocks.8.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
718
+ "model.vision_tower.vision_tower.blocks.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
719
+ "model.vision_tower.vision_tower.blocks.8.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
720
+ "model.vision_tower.vision_tower.blocks.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
721
+ "model.vision_tower.vision_tower.blocks.8.norm1.weight": "model-00003-of-00004.safetensors",
722
+ "model.vision_tower.vision_tower.blocks.8.norm2.weight": "model-00003-of-00004.safetensors",
723
+ "model.vision_tower.vision_tower.blocks.9.attn.proj.bias": "model-00003-of-00004.safetensors",
724
+ "model.vision_tower.vision_tower.blocks.9.attn.proj.weight": "model-00003-of-00004.safetensors",
725
+ "model.vision_tower.vision_tower.blocks.9.attn.qkv.bias": "model-00003-of-00004.safetensors",
726
+ "model.vision_tower.vision_tower.blocks.9.attn.qkv.weight": "model-00003-of-00004.safetensors",
727
+ "model.vision_tower.vision_tower.blocks.9.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
728
+ "model.vision_tower.vision_tower.blocks.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
729
+ "model.vision_tower.vision_tower.blocks.9.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
730
+ "model.vision_tower.vision_tower.blocks.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
731
+ "model.vision_tower.vision_tower.blocks.9.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
732
+ "model.vision_tower.vision_tower.blocks.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
733
+ "model.vision_tower.vision_tower.blocks.9.norm1.weight": "model-00003-of-00004.safetensors",
734
+ "model.vision_tower.vision_tower.blocks.9.norm2.weight": "model-00003-of-00004.safetensors",
735
+ "model.vision_tower.vision_tower.patch_embed.proj.weight": "model-00003-of-00004.safetensors"
736
+ }
737
+ }
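The `weight_map` above is the standard sharded-checkpoint index: it maps every parameter name to the shard file that stores it. As a minimal illustration (not part of this commit), the sketch below resolves the shard for one tensor and loads only that tensor with `safetensors`; the directory name `path/to/model` is a placeholder.

```python
# Illustration only -- not part of the uploaded repository.
# Resolve which shard holds a given parameter via the index, then load just that tensor.
import json
import os

from safetensors import safe_open

model_dir = "path/to/model"  # placeholder: wherever the shards and index were downloaded
with open(os.path.join(model_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)

name = "model.vision_tower.vision_tower.patch_embed.proj.weight"
shard = index["weight_map"][name]  # e.g. "model-00003-of-00004.safetensors"
with safe_open(os.path.join(model_dir, shard), framework="pt") as f:
    tensor = f.get_tensor(name)  # reads only this tensor from the shard
print(tensor.shape)
```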
modeling_diffusionvl_qwen2_5_vl.py ADDED
@@ -0,0 +1,1513 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
5
+ # and the GPT-NeoX and OPT implementations. It has been modified to create DiffusionVL.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ """DiffusionVL model implementation."""
20
+
21
+ import math
22
+ from dataclasses import dataclass
23
+ from typing import Callable, Dict, List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+
29
+ from transformers import PreTrainedModel
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache
32
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput
33
+ from transformers.utils import logging
34
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
35
+ from transformers.modeling_layers import GradientCheckpointingLayer
36
+
37
+ from .configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_Config, DiffusionVL_Qwen2_5_VL_VisionConfig
38
+
39
+ IMAGE_TOKEN_INDEX = -200
40
+
41
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
42
+ """
43
+ Rotates half the hidden dims of the input for rotary position embedding.
44
+
45
+ Args:
46
+ x: Input tensor of shape (..., head_dim).
47
+
48
+ Returns:
49
+ Rotated tensor of the same shape.
50
+ """
51
+ x1 = x[..., : x.shape[-1] // 2]
52
+ x2 = x[..., x.shape[-1] // 2 :]
53
+ return torch.cat((-x2, x1), dim=-1)
54
+
55
+
56
+ def apply_rotary_pos_emb_vision(
57
+ q: torch.Tensor,
58
+ k: torch.Tensor,
59
+ cos: torch.Tensor,
60
+ sin: torch.Tensor,
61
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
62
+ """
63
+ Apply rotary position embedding for vision encoder.
64
+
65
+ Args:
66
+ q: Query tensor.
67
+ k: Key tensor.
68
+ cos: Cosine part of rotary embedding.
69
+ sin: Sine part of rotary embedding.
70
+
71
+ Returns:
72
+ Tuple of (rotated_q, rotated_k).
73
+ """
74
+ orig_q_dtype = q.dtype
75
+ orig_k_dtype = k.dtype
76
+ q, k = q.float(), k.float()
77
+ cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
78
+ q_embed = (q * cos) + (rotate_half(q) * sin)
79
+ k_embed = (k * cos) + (rotate_half(k) * sin)
80
+ return q_embed.to(orig_q_dtype), k_embed.to(orig_k_dtype)
81
+
82
+
83
+ def apply_multimodal_rotary_pos_emb(
84
+ q: torch.Tensor,
85
+ k: torch.Tensor,
86
+ cos: torch.Tensor,
87
+ sin: torch.Tensor,
88
+ mrope_section: List[int],
89
+ unsqueeze_dim: int = 1,
90
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
91
+ """
92
+ Apply multimodal rotary position embedding (M-RoPE) for 3D position encoding.
93
+
94
+ Args:
95
+ q: Query tensor of shape (batch, heads, seq_len, head_dim).
96
+ k: Key tensor of shape (batch, heads, seq_len, head_dim).
97
+ cos: Cosine tensor of shape (3, batch, seq_len, head_dim).
98
+ sin: Sine tensor of shape (3, batch, seq_len, head_dim).
99
+ mrope_section: List of 3 ints defining section sizes [temporal, height, width].
100
+ For example, [16, 24, 24] for head_dim=128.
101
+ unsqueeze_dim: Dimension to unsqueeze for broadcasting.
102
+
103
+ Returns:
104
+ Tuple of (rotated_q, rotated_k) with M-RoPE applied.
105
+ """
106
+ # mrope_section is like [16, 24, 24] for head_dim=128
107
+ # Multiply by 2 because head_dim is full (not half like in standard RoPE)
108
+ mrope_section = mrope_section * 2  # list repetition: [16, 24, 24] -> [16, 24, 24, 16, 24, 24]
109
+
110
+ # Split cos/sin along head_dim, then select appropriate dimension (0, 1, 2) for each section
111
+ # cos/sin shape: (3, batch, seq_len, head_dim)
112
+ cos = torch.cat(
113
+ [m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1
114
+ ).unsqueeze(unsqueeze_dim)
115
+ sin = torch.cat(
116
+ [m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1
117
+ ).unsqueeze(unsqueeze_dim)
118
+
119
+ q_embed = (q * cos) + (rotate_half(q) * sin)
120
+ k_embed = (k * cos) + (rotate_half(k) * sin)
121
+ return q_embed, k_embed
122
+
123
+
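As a side note (illustration only, not part of the uploaded file), the sketch below walks through the shapes produced by the M-RoPE interleaving in `apply_multimodal_rotary_pos_emb`: the `mrope_section` list is repeated, the cos table is split along `head_dim`, and each chunk is drawn from the temporal, height, or width position stream in turn. All sizes are illustrative.

```python
# Illustration only -- not part of the uploaded file.
import torch

batch, seq_len, head_dim = 2, 5, 128
mrope_section = [16, 24, 24]                     # temporal, height, width sections
cos = torch.randn(3, batch, seq_len, head_dim)   # one table per position axis (t, h, w)

sections = mrope_section * 2                     # list repetition -> [16, 24, 24, 16, 24, 24]
chunks = cos.split(sections, dim=-1)             # 6 chunks along head_dim
# chunk i is drawn from axis i % 3, so the pattern along head_dim is t, h, w, t, h, w
mixed = torch.cat([m[i % 3] for i, m in enumerate(chunks)], dim=-1)
assert mixed.shape == (batch, seq_len, head_dim)
```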
124
+ class DiffusionVL_Qwen2_5_VL_RMSNorm(nn.Module):
125
+ def __init__(self, hidden_size, eps=1e-6):
126
+ super().__init__()
127
+ self.weight = nn.Parameter(torch.ones(hidden_size))
128
+ self.variance_epsilon = eps
129
+
130
+ def forward(self, hidden_states):
131
+ input_dtype = hidden_states.dtype
132
+ hidden_states = hidden_states.to(torch.float32)
133
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
134
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
135
+ return self.weight * hidden_states.to(input_dtype)
136
+
137
+
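For reference (not part of the uploaded file), the class above computes the usual RMSNorm in float32 before casting back to the input dtype:

$$\mathrm{RMSNorm}(x) = w \odot \frac{x}{\sqrt{\frac{1}{d}\sum_{i=1}^{d} x_i^2 + \varepsilon}}$$

where $d$ is `hidden_size`, $\varepsilon$ is `variance_epsilon`, and $w$ is the learned `weight`.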
138
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
139
+ """
140
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
141
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
142
+ """
143
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
144
+ if n_rep == 1:
145
+ return hidden_states
146
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
147
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
148
+
149
+
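A quick sanity check (illustration only, not part of the uploaded file) of the docstring's claim that `repeat_kv` is equivalent to `torch.repeat_interleave` along the head dimension, as used for grouped-query attention; the head counts below are arbitrary example values.

```python
# Illustration only -- not part of the uploaded file.
import torch

batch, kv_heads, seq_len, head_dim = 1, 4, 6, 128
n_rep = 7  # e.g. 28 query heads shared across 4 KV heads

k = torch.randn(batch, kv_heads, seq_len, head_dim)
# same expand + reshape as repeat_kv above
k_rep = k[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, head_dim)
k_rep = k_rep.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
assert torch.equal(k_rep, k.repeat_interleave(n_rep, dim=1))
```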
150
+ def eager_attention_forward(
151
+ module: nn.Module,
152
+ query: torch.Tensor,
153
+ key: torch.Tensor,
154
+ value: torch.Tensor,
155
+ attention_mask: Optional[torch.Tensor],
156
+ scaling: float,
157
+ dropout: float = 0.0,
158
+ **kwargs,
159
+ ):
160
+ """Eager attention implementation."""
161
+ key_states = repeat_kv(key, module.num_key_value_groups)
162
+ value_states = repeat_kv(value, module.num_key_value_groups)
163
+
164
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
165
+ if attention_mask is not None:
166
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
167
+ attn_weights = attn_weights + causal_mask
168
+
169
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
170
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
171
+ attn_output = torch.matmul(attn_weights, value_states)
172
+ attn_output = attn_output.transpose(1, 2).contiguous()
173
+
174
+ return attn_output, attn_weights
175
+
176
+
177
+ class DiffusionVL_Qwen2_5_VL_VisionMLP(nn.Module):
178
+ def __init__(self, config, bias: bool = False):
179
+ super().__init__()
180
+ self.hidden_size = config.hidden_size
181
+ self.intermediate_size = config.intermediate_size
182
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
183
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
184
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
185
+ self.act_fn = ACT2FN[config.hidden_act]
186
+
187
+ def forward(self, hidden_state):
188
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
189
+
190
+
191
+ class DiffusionVL_Qwen2_5_VL_VisionPatchEmbed(nn.Module):
192
+ def __init__(self, patch_size=14, temporal_patch_size=2, in_channels=3, embed_dim=1152):
193
+ super().__init__()
194
+ self.patch_size = patch_size
195
+ self.temporal_patch_size = temporal_patch_size
196
+ self.in_channels = in_channels
197
+ self.embed_dim = embed_dim
198
+ kernel_size = [temporal_patch_size, patch_size, patch_size]
199
+ self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
200
+
201
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
202
+ target_dtype = self.proj.weight.dtype
203
+ hidden_states = hidden_states.view(
204
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
205
+ )
206
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
207
+ return hidden_states
208
+
209
+
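As a shape walk-through (illustration only, not part of the uploaded file): the patch embedding receives flattened patches of size `in_channels * temporal_patch_size * patch_size * patch_size`, reshapes them back into 3D patches, and a `Conv3d` whose kernel equals its stride turns each patch into a single embedding vector. The sizes below are assumed example values.

```python
# Illustration only -- not part of the uploaded file.
import torch
import torch.nn as nn

patch_size, temporal_patch_size, in_channels, embed_dim = 14, 2, 3, 1280
num_patches = 256  # e.g. a 224x224 image -> (224/14)**2 patches

flat = torch.randn(num_patches, in_channels * temporal_patch_size * patch_size * patch_size)
proj = nn.Conv3d(in_channels, embed_dim,
                 kernel_size=(temporal_patch_size, patch_size, patch_size),
                 stride=(temporal_patch_size, patch_size, patch_size), bias=False)

x = flat.view(-1, in_channels, temporal_patch_size, patch_size, patch_size)
out = proj(x).view(-1, embed_dim)  # one embedding per patch
assert out.shape == (num_patches, embed_dim)
```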
210
+ class DiffusionVL_Qwen2_5_VL_VisionRotaryEmbedding(nn.Module):
211
+ inv_freq: torch.Tensor
212
+
213
+ def __init__(self, dim: int, theta: float = 10000.0):
214
+ super().__init__()
215
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
216
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
217
+
218
+ def forward(self, seqlen: int) -> torch.Tensor:
219
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
220
+ freqs = torch.outer(seq, self.inv_freq)
221
+ return freqs
222
+
223
+
224
+ class DiffusionVL_Qwen2_5_VL_VisionPatchMerger(nn.Module):
225
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2):
226
+ super().__init__()
227
+ self.hidden_size = context_dim * (spatial_merge_size ** 2)
228
+ self.ln_q = DiffusionVL_Qwen2_5_VL_RMSNorm(context_dim, eps=1e-6)
229
+ self.mlp = nn.Sequential(
230
+ nn.Linear(self.hidden_size, self.hidden_size),
231
+ nn.GELU(),
232
+ nn.Linear(self.hidden_size, dim),
233
+ )
234
+
235
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
236
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
237
+ return x
238
+
239
+
240
+ class DiffusionVL_Qwen2_5_VL_VisionAttention(nn.Module):
241
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_VisionConfig) -> None:
242
+ super().__init__()
243
+ self.dim = config.hidden_size
244
+ self.num_heads = config.num_heads
245
+ self.head_dim = self.dim // self.num_heads
246
+ self.num_key_value_groups = 1 # needed for eager attention
247
+ self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True)
248
+ self.proj = nn.Linear(self.dim, self.dim)
249
+ self.scaling = self.head_dim**-0.5
250
+ self.config = config
251
+ self.attention_dropout = 0.0
252
+ self.is_causal = False
253
+
254
+ def forward(
255
+ self,
256
+ hidden_states: torch.Tensor,
257
+ cu_seqlens: torch.Tensor,
258
+ rotary_pos_emb: Optional[torch.Tensor] = None,
259
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
260
+ **kwargs,
261
+ ) -> torch.Tensor:
262
+ seq_length = hidden_states.shape[0]
263
+ query_states, key_states, value_states = (
264
+ self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
265
+ )
266
+ cos, sin = position_embeddings
267
+ query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
268
+
269
+ query_states = query_states.transpose(0, 1).unsqueeze(0)
270
+ key_states = key_states.transpose(0, 1).unsqueeze(0)
271
+ value_states = value_states.transpose(0, 1).unsqueeze(0)
272
+
273
+ attention_interface: Callable = eager_attention_forward
274
+ if getattr(self.config, "_attn_implementation", "eager") != "eager":
275
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
276
+
277
+ if getattr(self.config, "_attn_implementation", "eager") == "flash_attention_2":
278
+ # Flash Attention 2: Use cu_seqlens for variable length attention
279
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
280
+ attn_output, _ = attention_interface(
281
+ self,
282
+ query_states,
283
+ key_states,
284
+ value_states,
285
+ attention_mask=None,
286
+ scaling=self.scaling,
287
+ dropout=0.0 if not self.training else self.attention_dropout,
288
+ cu_seq_lens_q=cu_seqlens,
289
+ cu_seq_lens_k=cu_seqlens,
290
+ max_length_q=max_seqlen,
291
+ max_length_k=max_seqlen,
292
+ is_causal=False,
293
+ **kwargs,
294
+ )
295
+ else:
296
+ # Other implementations: Process each chunk separately
297
+ lengths = cu_seqlens[1:] - cu_seqlens[:-1]
298
+ splits = [
299
+ torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
300
+ ]
301
+
302
+ attn_outputs = [
303
+ attention_interface(
304
+ self,
305
+ q,
306
+ k,
307
+ v,
308
+ attention_mask=None,
309
+ scaling=self.scaling,
310
+ dropout=0.0 if not self.training else self.attention_dropout,
311
+ is_causal=False,
312
+ **kwargs,
313
+ )[0]
314
+ for q, k, v in zip(*splits)
315
+ ]
316
+ attn_output = torch.cat(attn_outputs, dim=1)
317
+
318
+ attn_output = attn_output.reshape(seq_length, -1).contiguous()
319
+ attn_output = self.proj(attn_output)
320
+ return attn_output
321
+
322
+
323
+ class DiffusionVL_Qwen2_5_VL_VisionBlock(GradientCheckpointingLayer):
324
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
325
+ super().__init__()
326
+ self.norm1 = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=1e-6)
327
+ self.norm2 = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=1e-6)
328
+ self.attn = DiffusionVL_Qwen2_5_VL_VisionAttention(config=config)
329
+ self.mlp = DiffusionVL_Qwen2_5_VL_VisionMLP(config, bias=True)
330
+
331
+ def forward(
332
+ self,
333
+ hidden_states: torch.Tensor,
334
+ cu_seqlens: torch.Tensor,
335
+ rotary_pos_emb: Optional[torch.Tensor] = None,
336
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
337
+ **kwargs,
338
+ ) -> torch.Tensor:
339
+ hidden_states = hidden_states + self.attn(
340
+ self.norm1(hidden_states),
341
+ cu_seqlens=cu_seqlens,
342
+ rotary_pos_emb=rotary_pos_emb,
343
+ position_embeddings=position_embeddings,
344
+ **kwargs,
345
+ )
346
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
347
+ return hidden_states
348
+
349
+
350
+ class DiffusionVL_Qwen2_5_VL_VisionPreTrainedModel(PreTrainedModel):
351
+ config_class = DiffusionVL_Qwen2_5_VL_VisionConfig
352
+ base_model_prefix = "model"
353
+ supports_gradient_checkpointing = True
354
+ _no_split_modules = ["DiffusionVL_Qwen2_5_VL_VisionBlock"]
355
+ _supports_flash_attn_2 = True
356
+ _supports_sdpa = True
357
+ _supports_attention_backend = True
358
+
359
+
360
+ class DiffusionVL_Qwen2_5_VL_VisionTransformer(DiffusionVL_Qwen2_5_VL_VisionPreTrainedModel):
361
+ config_class = DiffusionVL_Qwen2_5_VL_VisionConfig
362
+ _no_split_modules = ["DiffusionVL_Qwen2_5_VL_VisionBlock"]
363
+
364
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_VisionConfig, *inputs, **kwargs) -> None:
365
+ super().__init__(config, *inputs, **kwargs)
366
+ self.spatial_merge_size = config.spatial_merge_size
367
+ self.patch_size = config.patch_size
368
+ self.fullatt_block_indexes = config.fullatt_block_indexes
369
+ self.window_size = config.window_size
370
+ self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
371
+
372
+ self.patch_embed = DiffusionVL_Qwen2_5_VL_VisionPatchEmbed(
373
+ patch_size=config.patch_size,
374
+ temporal_patch_size=config.temporal_patch_size,
375
+ in_channels=config.in_channels,
376
+ embed_dim=config.hidden_size,
377
+ )
378
+
379
+ head_dim = config.hidden_size // config.num_heads
380
+ self.rotary_pos_emb = DiffusionVL_Qwen2_5_VL_VisionRotaryEmbedding(head_dim // 2)
381
+
382
+ self.blocks = nn.ModuleList([DiffusionVL_Qwen2_5_VL_VisionBlock(config) for _ in range(config.depth)])
383
+ self.gradient_checkpointing = False
384
+
385
+ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
386
+
387
+ pos_ids = []
388
+ for t, h, w in grid_thw:
389
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
390
+ hpos_ids = hpos_ids.reshape(
391
+ h // self.spatial_merge_size,
392
+ self.spatial_merge_size,
393
+ w // self.spatial_merge_size,
394
+ self.spatial_merge_size,
395
+ )
396
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten()
397
+
398
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
399
+ wpos_ids = wpos_ids.reshape(
400
+ h // self.spatial_merge_size,
401
+ self.spatial_merge_size,
402
+ w // self.spatial_merge_size,
403
+ self.spatial_merge_size,
404
+ )
405
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten()
406
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
407
+ pos_ids = torch.cat(pos_ids, dim=0)
408
+ max_grid_size = grid_thw[:, 1:].max()
409
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
410
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
411
+ return rotary_pos_emb
412
+
413
+ def get_window_index(self, grid_thw: torch.Tensor):
414
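+ # Groups the merged-patch grid into square attention windows of
+ # (window_size // spatial_merge_size // patch_size)^2 merged patches, returning a permutation
+ # index over merged patches plus cumulative window lengths measured in raw patch tokens
+ # (hence the spatial_merge_unit scaling below).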
+
415
+ window_index: list = []
416
+ cu_window_seqlens: list = [0]
417
+ window_index_id = 0
418
+ vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size
419
+
420
+ for grid_t, grid_h, grid_w in grid_thw:
421
+ llm_grid_h = grid_h // self.spatial_merge_size
422
+ llm_grid_w = grid_w // self.spatial_merge_size
423
+ index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
424
+ pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
425
+ pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
426
+ num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
427
+ num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
428
+ index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
429
+ index_padded = index_padded.reshape(
430
+ grid_t,
431
+ num_windows_h,
432
+ vit_merger_window_size,
433
+ num_windows_w,
434
+ vit_merger_window_size,
435
+ )
436
+ index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
437
+ grid_t,
438
+ num_windows_h * num_windows_w,
439
+ vit_merger_window_size,
440
+ vit_merger_window_size,
441
+ )
442
+ seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
443
+ index_padded = index_padded.reshape(-1)
444
+ index_new = index_padded[index_padded != -100]
445
+ window_index.append(index_new + window_index_id)
446
+ cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
447
+ cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
448
+ window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
449
+ window_index = torch.cat(window_index, dim=0)
450
+ return window_index, cu_window_seqlens
451
+
452
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs):
453
+
454
+ hidden_states = self.patch_embed(hidden_states)
455
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
456
+ window_index, cu_window_seqlens = self.get_window_index(grid_thw)
457
+ cu_window_seqlens = torch.tensor(
458
+ cu_window_seqlens,
459
+ device=hidden_states.device,
460
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
461
+ )
462
+ cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
463
+
464
+ seq_len, _ = hidden_states.size()
465
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
466
+ hidden_states = hidden_states[window_index, :, :]
467
+ hidden_states = hidden_states.reshape(seq_len, -1)
468
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
469
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
470
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
471
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
472
+ position_embeddings = (emb.cos(), emb.sin())
473
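+ # hidden_states and the rotary embeddings are now both permuted into window order, so the
+ # window-attention layers below can consume cu_window_seqlens directly.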
+
474
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
475
+ dim=0,
476
+ # Select dtype based on the following factors:
477
+ # - FA2 requires that cu_seqlens_q must have dtype int32
478
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
479
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
480
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
481
+ )
482
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
483
+
484
+ for layer_num, blk in enumerate(self.blocks):
485
+ if layer_num in self.fullatt_block_indexes:
486
+ cu_seqlens_now = cu_seqlens
487
+ else:
488
+ cu_seqlens_now = cu_window_seqlens
489
+
490
+ hidden_states = blk(
491
+ hidden_states,
492
+ cu_seqlens=cu_seqlens_now,
493
+ position_embeddings=position_embeddings,
494
+ **kwargs,
495
+ )
496
+
497
+ # Return hidden_states AND window_index for MMProjector to apply merger and reverse shuffle
498
+ return hidden_states, window_index
499
+
500
+
501
+ class DiffusionVL_Qwen2_5_VL_VisionTower(nn.Module):
502
+
503
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_VisionConfig):
504
+ super().__init__()
505
+ self.vision_tower = DiffusionVL_Qwen2_5_VL_VisionTransformer(config)
506
+ self.spatial_merge_size = config.spatial_merge_size
507
+
508
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor = None):
509
+ """Returns (hidden_states, window_index) tuple for MMProjector."""
510
+ return self.vision_tower(hidden_states, grid_thw)
511
+
512
+
513
+ class DiffusionVL_Qwen2_5_VL_MMProjector(nn.Module):
514
+
515
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_VisionConfig):
516
+ super().__init__()
517
+ self.merger = DiffusionVL_Qwen2_5_VL_VisionPatchMerger(
518
+ dim=config.out_hidden_size,
519
+ context_dim=config.hidden_size,
520
+ spatial_merge_size=config.spatial_merge_size,
521
+ )
522
+
523
+ def forward(self, features_tuple):
524
+ """Forward pass with merger and window index reversal."""
525
+ if isinstance(features_tuple, tuple):
526
+ hidden_states, window_index = features_tuple
527
+ # Apply merger
528
+ projected_features = self.merger(hidden_states)
529
+ # Reverse the window shuffle to restore original spatial order
530
+ reverse_indices = torch.argsort(window_index)
531
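+ # argsort of a permutation yields its inverse, e.g. window_index = [2, 0, 1] -> reverse_indices = [1, 2, 0]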
+ final_features = projected_features[reverse_indices, :]
532
+ return final_features
533
+ else:
534
+ # Fallback for simple tensor input
535
+ return self.merger(features_tuple)
536
+
537
+ class DiffusionVL_Qwen2_5_VL_RotaryEmbedding(nn.Module):
538
+
539
+ def __init__(self, config):
540
+ super().__init__()
541
+ self.config = config
542
+ dim = config.hidden_size // config.num_attention_heads
543
+ inv_freq = 1.0 / (config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
544
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
545
+
546
+ def forward(self, x, position_ids):
547
+ """
548
+ Args:
549
+ x: Input tensor for dtype reference
550
+ position_ids: Position IDs with shape (3, batch_size, seq_length) for M-RoPE
551
+ or (batch_size, seq_length) for standard RoPE (will be converted to 3D)
552
+
553
+ Returns:
554
+ cos, sin: Tensors of shape (3, batch, seq_len, head_dim) for M-RoPE
555
+ """
556
+ # Always convert 2D position_ids to 3D for M-RoPE
557
+ if position_ids.ndim == 2:
558
+ # (batch, seq) -> (3, batch, seq)
559
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
560
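+ # With identical temporal/height/width components, M-RoPE reduces to standard 1D RoPE for
+ # these positions, which is the intended behaviour for text-only inputs.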
+
561
+ # Now position_ids should be 3D: (3, batch_size, seq_length)
562
+ if position_ids.ndim == 3 and position_ids.shape[0] == 3:
563
+ # M-RoPE: position_ids shape is (3, batch_size, seq_length)
564
+ # Expand inv_freq to (3, batch_size, head_dim//2, 1)
565
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(
566
+ 3, position_ids.shape[1], -1, 1
567
+ )
568
+ # position_ids_expanded shape: (3, batch_size, 1, seq_length)
569
+ position_ids_expanded = position_ids[:, :, None, :].float()
570
+
571
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
572
+ with torch.autocast(device_type=device_type, enabled=False):
573
+ # freqs shape: (3, batch_size, seq_length, head_dim//2)
574
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
575
+ # emb shape: (3, batch_size, seq_length, head_dim)
576
+ emb = torch.cat((freqs, freqs), dim=-1)
577
+ cos = emb.cos()
578
+ sin = emb.sin()
579
+
580
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
581
+ else:
582
+ # Standard 1D RoPE (fallback)
583
+ inv_freq_expanded = self.inv_freq[None, :, None].expand(position_ids.shape[0], -1, 1)
584
+ position_ids_expanded = position_ids[:, None, :].float()
585
+ freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
586
+ emb = torch.cat((freqs, freqs), dim=-1)
587
+ cos = emb.cos()
588
+ sin = emb.sin()
589
+ return cos.to(x.dtype), sin.to(x.dtype)
590
+
591
+
592
+ class DiffusionVL_Qwen2_5_VL_MLP(nn.Module):
593
+ def __init__(self, config):
594
+ super().__init__()
595
+ self.hidden_size = config.hidden_size
596
+ self.intermediate_size = config.intermediate_size
597
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
598
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
599
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
600
+ self.act_fn = nn.SiLU()
601
+
602
+ def forward(self, x):
603
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
604
+
605
+
606
+ class DiffusionVL_Qwen2_5_VL_Attention(nn.Module):
607
+ """Non-causal attention for diffusion-based generation with KV-cache support."""
608
+
609
+ def __init__(self, config, layer_idx):
610
+ super().__init__()
611
+ self.config = config
612
+ self.layer_idx = layer_idx
613
+ self.hidden_size = config.hidden_size
614
+ self.num_heads = config.num_attention_heads
615
+ self.head_dim = self.hidden_size // self.num_heads
616
+ self.num_key_value_heads = config.num_key_value_heads
617
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
618
+ self.scaling = self.head_dim ** -0.5
619
+
620
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
621
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
622
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
623
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
624
+
625
+ # Non-causal for diffusion
626
+ self.is_causal = False
627
+
628
+ def forward(
629
+ self,
630
+ hidden_states,
631
+ attention_mask=None,
632
+ position_ids=None,
633
+ past_key_values=None,
634
+ output_attentions=False,
635
+ use_cache=False,
636
+ cache_position=None,
637
+ position_embeddings=None,
638
+ store_kv=False,
639
+ **kwargs,
640
+ ):
641
+ bsz, q_len, _ = hidden_states.size()
642
+
643
+ query_states = self.q_proj(hidden_states)
644
+ key_states = self.k_proj(hidden_states)
645
+ value_states = self.v_proj(hidden_states)
646
+
647
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
648
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
649
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
650
+
651
+ if position_embeddings is not None:
652
+ cos, sin = position_embeddings
653
+ query_states, key_states = apply_multimodal_rotary_pos_emb(
654
+ query_states, key_states, cos, sin,
655
+ self.config.rope_scaling.get("mrope_section", [16, 24, 24])
656
+ )
657
+
658
+ # KV cache handling with store_kv support
659
+ if past_key_values is not None and use_cache:
660
+ cache_kwargs = {"cache_position": cache_position}
661
+ if store_kv:
662
+ # Store current KV to cache (for prefill or final step)
663
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
664
+ else:
665
+ # Use cached KV but don't update (for diffusion steps within a block)
666
+ cached_key = past_key_values.key_cache[self.layer_idx] if self.layer_idx < len(past_key_values.key_cache) else None
667
+ cached_value = past_key_values.value_cache[self.layer_idx] if self.layer_idx < len(past_key_values.value_cache) else None
668
+ if cached_key is not None and cached_value is not None:
669
+ key_states = torch.cat([cached_key, key_states], dim=2)
670
+ value_states = torch.cat([cached_value, value_states], dim=2)
671
+
672
+ # Repeat KV for GQA
673
+ key_states = key_states.repeat_interleave(self.num_key_value_groups, dim=1)
674
+ value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
675
+
676
+ # Handle dict-format attention_mask (for BD3LM compatibility)
677
+ if attention_mask is not None:
678
+ if isinstance(attention_mask, dict):
679
+ # Use full_attention mask for all layers (simplified)
680
+ attn_mask = attention_mask.get("full_attention", None)
681
+ else:
682
+ attn_mask = attention_mask
683
+ else:
684
+ attn_mask = None
685
+
686
+ if attn_mask is not None:
687
+ attn_output = F.scaled_dot_product_attention(
688
+ query_states,
689
+ key_states,
690
+ value_states,
691
+ attn_mask=attn_mask,
692
+ dropout_p=0.0,
693
+ is_causal=False,
694
+ scale=self.scaling,
695
+ )
696
+ else:
697
+ attn_output = F.scaled_dot_product_attention(
698
+ query_states,
699
+ key_states,
700
+ value_states,
701
+ dropout_p=0.0,
702
+ is_causal=False,
703
+ scale=self.scaling,
704
+ )
705
+
706
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, -1)
707
+ attn_output = self.o_proj(attn_output)
708
+
709
+ return attn_output, None
710
+
711
+
712
+ class DiffusionVL_Qwen2_5_VL_DecoderLayer(nn.Module):
713
+ def __init__(self, config, layer_idx):
714
+ super().__init__()
715
+ self.hidden_size = config.hidden_size
716
+ self.self_attn = DiffusionVL_Qwen2_5_VL_Attention(config, layer_idx)
717
+ self.mlp = DiffusionVL_Qwen2_5_VL_MLP(config)
718
+ self.input_layernorm = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
719
+ self.post_attention_layernorm = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
720
+
721
+ def forward(
722
+ self,
723
+ hidden_states,
724
+ attention_mask=None,
725
+ position_ids=None,
726
+ past_key_values=None,
727
+ output_attentions=False,
728
+ use_cache=False,
729
+ cache_position=None,
730
+ position_embeddings=None,
731
+ store_kv=False,
732
+ **kwargs,
733
+ ):
734
+ residual = hidden_states
735
+ hidden_states = self.input_layernorm(hidden_states)
736
+
737
+ hidden_states, attn_weights = self.self_attn(
738
+ hidden_states=hidden_states,
739
+ attention_mask=attention_mask,
740
+ position_ids=position_ids,
741
+ past_key_values=past_key_values,
742
+ output_attentions=output_attentions,
743
+ use_cache=use_cache,
744
+ cache_position=cache_position,
745
+ position_embeddings=position_embeddings,
746
+ store_kv=store_kv,
747
+ **kwargs,
748
+ )
749
+ hidden_states = residual + hidden_states
750
+
751
+ residual = hidden_states
752
+ hidden_states = self.post_attention_layernorm(hidden_states)
753
+ hidden_states = self.mlp(hidden_states)
754
+ hidden_states = residual + hidden_states
755
+
756
+ return hidden_states, attn_weights
757
+
758
+ class DiffusionVL_Qwen2_5_VL_PreTrainedModel(PreTrainedModel):
759
+
760
+ config_class = DiffusionVL_Qwen2_5_VL_Config
761
+ base_model_prefix = "model"
762
+ supports_gradient_checkpointing = True
763
+ _no_split_modules = ["DiffusionVL_Qwen2_5_VL_DecoderLayer", "DiffusionVL_Qwen2_5_VL_VisionBlock"]
764
+
765
+ def _init_weights(self, module: nn.Module) -> None:
766
+ """Initialize the weights."""
767
+ std = self.config.initializer_range
768
+ if isinstance(module, nn.Linear):
769
+ module.weight.data.normal_(mean=0.0, std=std)
770
+ if module.bias is not None:
771
+ module.bias.data.zero_()
772
+ elif isinstance(module, nn.Embedding):
773
+ module.weight.data.normal_(mean=0.0, std=std)
774
+
775
+
776
+ class DiffusionVL_Qwen2_5_VL_Model(DiffusionVL_Qwen2_5_VL_PreTrainedModel):
777
+
778
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_Config):
779
+ super().__init__(config)
780
+ self.config = config
781
+
782
+ # Vision components (matching weight keys)
783
+ self.vision_tower = DiffusionVL_Qwen2_5_VL_VisionTower(config.vision_config)
784
+ self.mm_projector = DiffusionVL_Qwen2_5_VL_MMProjector(config.vision_config)
785
+
786
+ # Text components
787
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
788
+ self.layers = nn.ModuleList([
789
+ DiffusionVL_Qwen2_5_VL_DecoderLayer(config, layer_idx)
790
+ for layer_idx in range(config.num_hidden_layers)
791
+ ])
792
+ self.norm = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
793
+ self.rotary_emb = DiffusionVL_Qwen2_5_VL_RotaryEmbedding(config)
794
+
795
+ # BD3LM block size
796
+ self.bd3lm_block_size = config.bd3lm_block_size
797
+
798
+ self.post_init()
799
+
800
+ def get_input_embeddings(self):
801
+ return self.embed_tokens
802
+
803
+ def set_input_embeddings(self, value):
804
+ self.embed_tokens = value
805
+
806
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
807
+ """
808
+ Encodes images into continuous embeddings through vision tower and mm_projector.
809
+
810
+ Args:
811
+ pixel_values: Image tensor
812
+ image_grid_thw: Grid dimensions (temporal, height, width) for each image
813
+
814
+ Returns:
815
+ Image embeddings ready to be merged with text embeddings
816
+ """
817
+ pixel_values = pixel_values.to(dtype=self.vision_tower.vision_tower.patch_embed.proj.weight.dtype)
818
+ hidden_states = self.vision_tower(pixel_values, image_grid_thw)
819
+ image_embeds = self.mm_projector(hidden_states)
820
+ return image_embeds
821
+
822
+ def forward(
823
+ self,
824
+ input_ids=None,
825
+ attention_mask=None,
826
+ position_ids=None,
827
+ past_key_values=None,
828
+ inputs_embeds=None,
829
+ use_cache=None,
830
+ output_attentions=None,
831
+ output_hidden_states=None,
832
+ return_dict=None,
833
+ cache_position=None,
834
+ store_kv=False,
835
+ pixel_values=None,
836
+ image_grid_thw=None,
837
+ **kwargs,
838
+ ):
839
+ """Forward pass with optional vision input processing."""
840
+ output_attentions = output_attentions or False
841
+ output_hidden_states = output_hidden_states or False
842
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
843
+ return_dict = return_dict if return_dict is not None else True
844
+
845
+ IMAGE_TOKEN_INDEX = -200
846
+
847
+ if inputs_embeds is None:
848
+ inputs_embeds = self.embed_tokens(input_ids)
849
+
850
+ if pixel_values is not None and image_grid_thw is not None:
851
+ # Get image features
852
+ image_features = self.get_image_features(pixel_values, image_grid_thw)
853
+
854
+ # Split features per image
855
+ spatial_merge_size = self.vision_tower.spatial_merge_size
856
+ split_sizes = (image_grid_thw.prod(dim=1) // (spatial_merge_size ** 2)).tolist()
857
+ image_features_list = list(torch.split(image_features, split_sizes))
858
+
859
+ # Replace IMAGE_TOKEN positions with image features
860
+ batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
861
+ new_inputs_embeds_list = []
862
+
863
+ for batch_idx in range(batch_size):
864
+ cur_input_ids = input_ids[batch_idx] if input_ids is not None else None
865
+ cur_embeds = inputs_embeds[batch_idx]
866
+
867
+ if cur_input_ids is None or (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
868
+ new_inputs_embeds_list.append(cur_embeds)
869
+ continue
870
+
871
+ # Find IMAGE_TOKEN positions
872
+ image_positions = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
873
+ image_token_indices = [-1] + image_positions + [len(cur_input_ids)]
874
+
875
+ # Split embeddings and interleave with image features
876
+ cur_new_embeds = []
877
+ cur_image_idx = 0
878
+
879
+ for i in range(len(image_token_indices) - 1):
880
+ start = image_token_indices[i] + 1
881
+ end = image_token_indices[i + 1]
882
+
883
+ # Add text segment
884
+ if start < end:
885
+ cur_new_embeds.append(cur_embeds[start:end])
886
+
887
+ # Add image features (before the next segment, except after last)
888
+ if i < len(image_positions) and cur_image_idx < len(image_features_list):
889
+ cur_new_embeds.append(image_features_list[cur_image_idx].to(cur_embeds.dtype))
890
+ cur_image_idx += 1
891
+
892
+ if cur_new_embeds:
893
+ new_inputs_embeds_list.append(torch.cat(cur_new_embeds, dim=0))
894
+ else:
895
+ new_inputs_embeds_list.append(cur_embeds)
896
+
897
+ # Pad and stack
898
+ max_len = max(x.shape[0] for x in new_inputs_embeds_list)
899
+ hidden_size = new_inputs_embeds_list[0].shape[-1]
900
+ inputs_embeds = torch.zeros(
901
+ batch_size, max_len, hidden_size,
902
+ dtype=new_inputs_embeds_list[0].dtype,
903
+ device=new_inputs_embeds_list[0].device
904
+ )
905
+ for i, embed in enumerate(new_inputs_embeds_list):
906
+ inputs_embeds[i, :embed.shape[0]] = embed
907
+
908
+ batch_size, seq_length = inputs_embeds.shape[:2]
909
+
910
+ if cache_position is None:
911
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
912
+ cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)
913
+
914
+ if position_ids is None:
915
+ # position_ids will be converted to 3D for M-RoPE in rotary_emb
916
+ position_ids = cache_position.unsqueeze(0)
917
+
918
+ # Position embeddings
919
+ position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
920
+
921
+ hidden_states = inputs_embeds
922
+ all_hidden_states = () if output_hidden_states else None
923
+ all_attentions = () if output_attentions else None
924
+
925
+ for layer in self.layers:
926
+ if output_hidden_states:
927
+ all_hidden_states += (hidden_states,)
928
+
929
+ hidden_states, attn_weights = layer(
930
+ hidden_states,
931
+ attention_mask=attention_mask,
932
+ position_ids=position_ids,
933
+ past_key_values=past_key_values,
934
+ output_attentions=output_attentions,
935
+ use_cache=use_cache,
936
+ cache_position=cache_position,
937
+ position_embeddings=position_embeddings,
938
+ store_kv=store_kv,
939
+ )
940
+
941
+ if output_attentions:
942
+ all_attentions += (attn_weights,)
943
+
944
+ hidden_states = self.norm(hidden_states)
945
+
946
+ if output_hidden_states:
947
+ all_hidden_states += (hidden_states,)
948
+
949
+ return BaseModelOutputWithPast(
950
+ last_hidden_state=hidden_states,
951
+ past_key_values=past_key_values,
952
+ hidden_states=all_hidden_states,
953
+ attentions=all_attentions,
954
+ )
955
+
956
+
957
+ class DiffusionVL_Qwen2_5_VL_ForConditionalGeneration(DiffusionVL_Qwen2_5_VL_PreTrainedModel):
958
+ r"""
959
+ DiffusionVL Model with a language modeling head for diffusion-based generation.
960
+
961
+ This model uses block diffusion instead of autoregressive
962
+ generation. The `generate()` method implements the diffusion denoising process.
963
+
964
+ """
965
+
966
+ # Weight tying keys - used when tie_word_embeddings=True
967
+ _tied_weights_keys = ["lm_head.weight"]
968
+
969
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_Config):
970
+ super().__init__(config)
971
+ self.model = DiffusionVL_Qwen2_5_VL_Model(config)
972
+ self.vocab_size = config.vocab_size
973
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
974
+
975
+ # Diffusion parameters
976
+ self.mask_token_id = config.mask_token_id
977
+ self.block_size = config.bd3lm_block_size
978
+
979
+ self.post_init()
980
+
981
+ def get_model(self):
982
+ return self.model
983
+
984
+ def get_input_embeddings(self):
985
+ return self.model.embed_tokens
986
+
987
+ def set_input_embeddings(self, value):
988
+ self.model.embed_tokens = value
989
+
990
+ def tie_weights(self):
991
+ """Tie weights if config.tie_word_embeddings is True (3B model)."""
992
+ if getattr(self.config, "tie_word_embeddings", False):
993
+ # Call parent's tie_weights to tie lm_head with embed_tokens
994
+ super().tie_weights()
995
+ # else: do nothing, keep separate lm_head weights (7B model)
996
+
997
+ def get_output_embeddings(self):
998
+ return self.lm_head
999
+
1000
+ def set_output_embeddings(self, new_embeddings):
1001
+ self.lm_head = new_embeddings
1002
+
1003
+ def forward(
1004
+ self,
1005
+ input_ids=None,
1006
+ attention_mask=None,
1007
+ position_ids=None,
1008
+ past_key_values=None,
1009
+ inputs_embeds=None,
1010
+ labels=None,
1011
+ use_cache=None,
1012
+ output_attentions=None,
1013
+ output_hidden_states=None,
1014
+ return_dict=None,
1015
+ pixel_values=None,
1016
+ image_grid_thw=None,
1017
+ **kwargs,
1018
+ ):
1019
+ return_dict = return_dict if return_dict is not None else True
1020
+
1021
+ # Handle vision inputs if provided
1022
+ if pixel_values is not None and inputs_embeds is None:
1023
+ # Get vision features and merge with text
1024
+ vision_features = self.model.vision_tower(pixel_values, image_grid_thw)
1025
+ inputs_embeds = self._merge_vision_text(input_ids, vision_features)
1026
+ input_ids = None
1027
+
1028
+ outputs = self.model(
1029
+ input_ids=input_ids,
1030
+ attention_mask=attention_mask,
1031
+ position_ids=position_ids,
1032
+ past_key_values=past_key_values,
1033
+ inputs_embeds=inputs_embeds,
1034
+ use_cache=use_cache,
1035
+ output_attentions=output_attentions,
1036
+ output_hidden_states=output_hidden_states,
1037
+ return_dict=True,
1038
+ )
1039
+
1040
+ hidden_states = outputs.last_hidden_state
1041
+ logits = self.lm_head(hidden_states)
1042
+
1043
+ loss = None
1044
+ if labels is not None:
1045
+ shift_logits = logits[..., :-1, :].contiguous()
1046
+ shift_labels = labels[..., 1:].contiguous()
1047
+ loss = F.cross_entropy(
1048
+ shift_logits.view(-1, self.vocab_size),
1049
+ shift_labels.view(-1),
1050
+ ignore_index=-100,
1051
+ )
1052
+
1053
+ return CausalLMOutputWithPast(
1054
+ loss=loss,
1055
+ logits=logits,
1056
+ past_key_values=outputs.past_key_values,
1057
+ hidden_states=outputs.hidden_states,
1058
+ attentions=outputs.attentions,
1059
+ )
1060
+
1061
+ def _merge_vision_text(self, input_ids, vision_features):
1062
+ """Merge vision features with text embeddings."""
1063
+ text_embeds = self.model.embed_tokens(input_ids)
1064
+ # Simple placeholder - full implementation would properly insert vision tokens
1065
+ return text_embeds
1066
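+ # Note: generation does not rely on this stub; generate() uses
+ # prepare_inputs_labels_for_multimodal() below to perform the actual merge.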
+
1067
+ @torch.no_grad()
1068
+ def generate(
1069
+ self,
1070
+ inputs: Optional[torch.Tensor] = None,
1071
+ images: Optional[torch.Tensor] = None,
1072
+ image_sizes: Optional[torch.Tensor] = None,
1073
+ image_grid_thws: Optional[torch.Tensor] = None,
1074
+ modalities: Optional[List] = None,
1075
+ gen_length: int = 256,
1076
+ steps: int = 8,
1077
+ temperature: float = 0.0,
1078
+ **kwargs,
1079
+ ):
1080
+ """
1081
+ Diffusion-based generation using BD3LM algorithm.
1082
+
1083
+ Follows the same logic as DiffusionVLQwenVLForCausalLM.generate():
1084
+ 1. If images provided, call prepare_inputs_labels_for_multimodal
1085
+ 2. Otherwise, just embed the input tokens
1086
+ 3. Call generate_with_bd3lm
1087
+
1088
+ Args:
1089
+ inputs: Input token IDs (prompt) [batch_size, seq_len]
1090
+ images: Image tensor (pixel_values) for vision inputs
1091
+ image_sizes: Image sizes
1092
+ image_grid_thws: Grid dimensions for vision inputs (num_images, 3)
1093
+ modalities: List of modalities (e.g., ["image"])
1094
+ gen_length: Number of tokens to generate
1095
+ steps: Number of diffusion steps per block
1096
+ temperature: Sampling temperature (0 for greedy)
1097
+
1098
+ Returns:
1099
+ Generated token IDs
1100
+ """
1101
+ if modalities is None:
1102
+ modalities = ["image"]
1103
+
1104
+ if images is not None:
1105
+ inputs_embeds = self.prepare_inputs_labels_for_multimodal(
1106
+ input_ids=inputs,
1107
+ images=images,
1108
+ image_grid_thws=image_grid_thws,
1109
+ )
1110
+ else:
1111
+ inputs_embeds = self.get_input_embeddings()(inputs)
1112
+
1113
+ # Call the BD3LM generation
1114
+ return self.generate_with_bd3lm(
1115
+ inputs_embeds=inputs_embeds,
1116
+ gen_length=gen_length,
1117
+ steps=steps,
1118
+ temperature=temperature,
1119
+ **kwargs,
1120
+ )
1121
+
1122
+ def prepare_inputs_labels_for_multimodal(
1123
+ self,
1124
+ input_ids: torch.Tensor,
1125
+ images: torch.Tensor,
1126
+ image_grid_thws: Optional[torch.Tensor] = None,
1127
+ ) -> torch.Tensor:
1128
+ """
1129
+ Prepare inputs_embeds by merging text embeddings with image features.
1130
+
1131
+ Uses LLaVA format: IMAGE_TOKEN_INDEX (-200) as placeholder.
1132
+
1133
+ Args:
1134
+ input_ids: Input token IDs with IMAGE_TOKEN_INDEX (-200) as image placeholders
1135
+ images: Pixel values tensor
1136
+ image_grid_thws: Grid dimensions for each image
1137
+
1138
+ Returns:
1139
+ inputs_embeds: Merged text + image embeddings
1140
+ """
1141
+ IMAGE_TOKEN_INDEX = -200
1142
+
1143
+ device = input_ids.device
1144
+ batch_size = input_ids.shape[0]
1145
+
1146
+ # Convert image_grid_thws to tensor if needed
1147
+ if image_grid_thws is not None:
1148
+ if not isinstance(image_grid_thws, torch.Tensor):
1149
+ image_grid_thw = torch.tensor(image_grid_thws, device=device)
1150
+ else:
1151
+ image_grid_thw = image_grid_thws.to(device)
1152
+ else:
1153
+ raise ValueError("image_grid_thws is required for vision processing")
1154
+
1155
+ # 1. Get image features through vision tower + mm_projector
1156
+ image_features = self.model.get_image_features(images, image_grid_thw)
1157
+
1158
+ # 2. Split features per image based on grid_thw
1159
+ spatial_merge_size = self.model.vision_tower.spatial_merge_size
1160
+ split_sizes = (image_grid_thw.prod(dim=1) // (spatial_merge_size ** 2)).tolist()
1161
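+ # e.g. image_grid_thw = [[1, 28, 28]] with spatial_merge_size=2 -> 1*28*28 // 4 = 196 merged tokens for that image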
+ image_features_list = list(torch.split(image_features, split_sizes))
1162
+
1163
+ # 3. Build new input embeddings (LLaVA format)
1164
+ new_input_embeds_list = []
1165
+
1166
+ for batch_idx in range(batch_size):
1167
+ cur_input_ids = input_ids[batch_idx]
1168
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum().item()
1169
+
1170
+ if num_images == 0:
1171
+ # No image tokens, just embed text
1172
+ cur_input_embeds = self.get_input_embeddings()(cur_input_ids)
1173
+ new_input_embeds_list.append(cur_input_embeds)
1174
+ continue
1175
+
1176
+ # LLaVA format: IMAGE_TOKEN_INDEX (-200) as placeholder
1177
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [len(cur_input_ids)]
1178
+
1179
+ cur_input_ids_noim = []
1180
+ for idx in range(len(image_token_indices) - 1):
1181
+ start = image_token_indices[idx] + 1
1182
+ end = image_token_indices[idx + 1]
1183
+ if start < end:
1184
+ cur_input_ids_noim.append(cur_input_ids[start:end])
1185
+
1186
+ if cur_input_ids_noim:
1187
+ cur_input_embeds_noim = self.get_input_embeddings()(torch.cat(cur_input_ids_noim))
1188
+ split_sizes_text = [x.shape[0] for x in cur_input_ids_noim]
1189
+ cur_input_embeds_noim_split = list(torch.split(cur_input_embeds_noim, split_sizes_text))
1190
+ else:
1191
+ cur_input_embeds_noim_split = []
1192
+
1193
+ cur_new_input_embeds = []
1194
+ cur_image_idx = 0
1195
+
1196
+ for idx in range(num_images + 1):
1197
+ if idx < len(cur_input_embeds_noim_split):
1198
+ cur_new_input_embeds.append(cur_input_embeds_noim_split[idx])
1199
+ if idx < num_images and cur_image_idx < len(image_features_list):
1200
+ cur_image_features = image_features_list[cur_image_idx]
1201
+ target_dtype = cur_input_embeds_noim_split[0].dtype if cur_input_embeds_noim_split else images.dtype
1202
+ cur_new_input_embeds.append(cur_image_features.to(target_dtype))
1203
+ cur_image_idx += 1
1204
+
1205
+ if cur_new_input_embeds:
1206
+ # Ensure all tensors are on the same device before cat (multi-GPU support)
1207
+ target_device = cur_new_input_embeds[0].device
1208
+ cur_new_input_embeds = [t.to(target_device) for t in cur_new_input_embeds]
1209
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
1210
+ else:
1211
+ cur_new_input_embeds = self.get_input_embeddings()(cur_input_ids)
1212
+
1213
+ new_input_embeds_list.append(cur_new_input_embeds)
1214
+
1215
+ # 4. Pad to same length and stack
1216
+ max_len = max(x.shape[0] for x in new_input_embeds_list)
1217
+ hidden_size = new_input_embeds_list[0].shape[-1]
1218
+ dtype = new_input_embeds_list[0].dtype
1219
+
1220
+ inputs_embeds = torch.zeros(batch_size, max_len, hidden_size, dtype=dtype, device=device)
1221
+ for i, embed in enumerate(new_input_embeds_list):
1222
+ inputs_embeds[i, :embed.shape[0]] = embed.to(device)
1223
+
1224
+ return inputs_embeds
1225
+
1226
+ @torch.no_grad()
1227
+ def generate_with_bd3lm(
1228
+ self,
1229
+ inputs_embeds: torch.FloatTensor,
1230
+ gen_length: int = 256,
1231
+ steps: int = 8,
1232
+ temperature: float = 0.0,
1233
+ top_k: int = 0,
1234
+ top_p: float = 1.0,
1235
+ remasking_strategy: str = 'low_confidence_static',
1236
+ use_kv_cache: bool = True,
1237
+ confidence_threshold: float = 0.85,
1238
+ **kwargs,
1239
+ ):
1240
+ """
1241
+ BD3LM generation algorithm with KV-cache support.
1242
+
1243
+ Args:
1244
+ inputs_embeds: Input embeddings (prompt)
1245
+ gen_length: Number of tokens to generate
1246
+ steps: Number of diffusion steps per block
1247
+ temperature: Sampling temperature (0 for greedy)
1248
+ top_k: Top-k sampling parameter
1249
+ top_p: Top-p (nucleus) sampling parameter
1250
+ remasking_strategy: 'low_confidence_static', 'low_confidence_dynamic', or 'sequential'
1251
+ use_kv_cache: Whether to use KV cache (default True)
1252
+ confidence_threshold: Threshold for low_confidence_dynamic strategy
1253
+
1254
+ Returns:
1255
+ Generated token IDs
1256
+ """
1257
+ device = inputs_embeds.device
1258
+ batch_size = inputs_embeds.shape[0]
1259
+ prompt_len = inputs_embeds.shape[1]
1260
+ block_size = self.block_size
1261
+ mask_id = self.mask_token_id
1262
+
1263
+ # Compute total length aligned to block size
1264
+ num_blocks = (prompt_len + gen_length + block_size - 1) // block_size
1265
+ total_length = num_blocks * block_size
1266
+
1267
+ # Initialize with mask tokens
1268
+ x_ids = torch.full((batch_size, total_length), mask_id, dtype=torch.long, device=device)
1269
+ # Get mask embedding and ensure it's on the same device as inputs_embeds
1270
+ embed_layer = self.get_input_embeddings()
1271
+ mask_embed = embed_layer(torch.tensor([mask_id], device=embed_layer.weight.device))
1272
+ mask_embed = mask_embed.to(device) # Move to same device as inputs_embeds
1273
+ x_embeds = mask_embed.repeat(batch_size, total_length, 1)
1274
+ x_embeds[:, :prompt_len] = inputs_embeds.clone()
1275
+
1276
+ # Reconstruct prompt IDs from embeddings
1277
+ prompt_logits = self.lm_head(inputs_embeds)
1278
+ prompt_ids = torch.argmax(prompt_logits, dim=-1)
1279
+ x_ids[:, :prompt_len] = prompt_ids
1280
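+ # These reconstructed prompt ids are only bookkeeping: conditioning uses x_embeds, and the
+ # returned slice starts at prompt_len, so approximation errors here do not affect the output.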
+
1281
+ # Block causal mask
1282
+ block_mask = torch.tril(torch.ones(num_blocks, num_blocks, device=device)).to(inputs_embeds.dtype)
1283
+ block_diffusion_mask_bool = block_mask.repeat_interleave(block_size, dim=0) \
1284
+ .repeat_interleave(block_size, dim=1).unsqueeze(0)
1285
+ block_diffusion_mask = block_diffusion_mask_bool.unsqueeze(1)
1286
+ block_diffusion_mask = torch.where(block_diffusion_mask == 0., torch.full_like(block_diffusion_mask, float('-inf')), 0.)
1287
+
1288
+ position_ids = torch.arange(total_length, device=device).unsqueeze(0).expand(batch_size, -1)
1289
+
1290
+ # KV-cache prefill
1291
+ prefill_blocks = prompt_len // block_size
1292
+ prefill_length = prefill_blocks * block_size
1293
+
1294
+ past_key_values = DynamicCache() if use_kv_cache else None
1295
+ if use_kv_cache and prefill_length > 0:
1296
+ prefill_embeds = x_embeds[:, :prefill_length]
1297
+ prefill_mask = block_diffusion_mask[:, :, :prefill_length, :prefill_length]
1298
+ prefill_pos_ids = position_ids[:, :prefill_length]
1299
+
1300
+ # Dict-format mask for BD3LM compatibility
1301
+ model_mask = {"full_attention": prefill_mask, "sliding_attention": prefill_mask}
1302
+
1303
+ prefill_outputs = self.model(
1304
+ inputs_embeds=prefill_embeds,
1305
+ attention_mask=model_mask,
1306
+ position_ids=prefill_pos_ids,
1307
+ past_key_values=past_key_values,
1308
+ use_cache=True,
1309
+ store_kv=True
1310
+ )
1311
+ prefill_logits = self.lm_head(prefill_outputs.last_hidden_state).float()
1312
+ self.last_prefill_logits = prefill_logits[:, -1:, :].clone()
1313
+ past_key_values = prefill_outputs.past_key_values
1314
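+ # Prompt tokens beyond the last full block are not prefilled; they are re-processed as part
+ # of the first generated block below.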
+
1315
+ # Calculate how many tokens to unmask per step
1316
+ num_transfer_tokens = self._get_num_transfer_tokens(block_size, steps)
1317
+ eos_token_id = kwargs.get('eos_token_id', 151645)
1318
+
1319
+ # Generate block by block
1320
+ for block_idx in range(prefill_blocks, num_blocks):
1321
+ block_start = block_idx * block_size
1322
+ block_end = block_start + block_size
1323
+
1324
+ cur_block_embeds = x_embeds[:, block_start:block_end].clone()
1325
+ cur_block_ids = x_ids[:, block_start:block_end]
1326
+
1327
+ cur_mask = block_diffusion_mask[:, :, block_start:block_end, :block_end]
1328
+ cur_pos_ids = position_ids[:, block_start:block_end]
1329
+
1330
+ # Dict-format mask for BD3LM compatibility
1331
+ model_mask = {"full_attention": cur_mask, "sliding_attention": cur_mask}
1332
+
1333
+ # Run diffusion steps within this block
1334
+ for step in range(steps + 1):
1335
+ # Check mask using embedding comparison (ensure same device for multi-GPU)
1336
+ is_mask = torch.all(torch.abs(cur_block_embeds - mask_embed.to(cur_block_embeds.device)) < 1e-5, dim=-1)
1337
+ if not is_mask.any():
1338
+ # Store KV for fully unmasked block
1339
+ if use_kv_cache:
1340
+ _ = self.model(
1341
+ inputs_embeds=cur_block_embeds,
1342
+ attention_mask=model_mask,
1343
+ position_ids=cur_pos_ids,
1344
+ past_key_values=past_key_values,
1345
+ use_cache=True,
1346
+ store_kv=True
1347
+ )
1348
+ break
1349
+
1350
+ # Forward pass
1351
+ if use_kv_cache:
1352
+ outputs = self.model(
1353
+ inputs_embeds=cur_block_embeds,
1354
+ attention_mask=model_mask,
1355
+ position_ids=cur_pos_ids,
1356
+ past_key_values=past_key_values,
1357
+ use_cache=True,
1358
+ store_kv=False
1359
+ )
1360
+ logits = self.lm_head(outputs.last_hidden_state).float()
1361
+ else:
1362
+ # No KV-cache: recompute full context
1363
+ context_embeds = x_embeds[:, :block_end].clone()
1364
+ context_embeds[:, block_start:block_end] = cur_block_embeds
1365
+ context_mask = block_diffusion_mask[:, :, :block_end, :block_end]
1366
+ context_pos_ids = position_ids[:, :block_end]
1367
+ context_model_mask = {"full_attention": context_mask, "sliding_attention": context_mask}
1368
+
1369
+ outputs = self.model(
1370
+ inputs_embeds=context_embeds,
1371
+ attention_mask=context_model_mask,
1372
+ position_ids=context_pos_ids,
1373
+ past_key_values=None,
1374
+ use_cache=False,
1375
+ store_kv=False
1376
+ )
1377
+ logits = self.lm_head(outputs.last_hidden_state[:, block_start:block_end]).float()
1378
+
1379
+ # Sample tokens
1380
+ x0, x0_p = self._sample_tokens(logits, temperature, top_k, top_p)
1381
+
1382
+ # Select tokens to unmask based on strategy
1383
+ num_to_transfer = num_transfer_tokens[step].item()
1384
+
1385
+ # Ensure all tensors are on the same device for multi-GPU support
1386
+ target_device = x0.device
1387
+ is_mask = is_mask.to(target_device)
1388
+ x0_p = x0_p.to(target_device)
1389
+
1390
+ transfer_mask = torch.zeros_like(x0, dtype=torch.bool)
1391
+
1392
+ if remasking_strategy == 'sequential':
1393
+ for j in range(batch_size):
1394
+ if is_mask[j].any():
1395
+ mask_positions = is_mask[j].nonzero(as_tuple=True)[0]
1396
+ num_to_select = min(num_to_transfer, len(mask_positions))
1397
+ selected_positions = mask_positions[:num_to_select]
1398
+ transfer_mask[j, selected_positions] = True
1399
+
1400
+ elif remasking_strategy == 'low_confidence_static':
1401
+ confidence = torch.where(is_mask, x0_p, torch.tensor(-torch.inf, device=target_device))
1402
+ for j in range(batch_size):
1403
+ num_masks = is_mask[j].sum().item()
1404
+ k = min(num_to_transfer, num_masks)
1405
+ if k > 0 and not torch.all(torch.isinf(confidence[j])):
1406
+ _, idx = torch.topk(confidence[j], k)
1407
+ transfer_mask[j, idx] = True
1408
+
1409
+ elif remasking_strategy == 'low_confidence_dynamic':
1410
+ confidence = torch.where(is_mask, x0_p, torch.tensor(-torch.inf, device=target_device))
1411
+ for j in range(batch_size):
1412
+ high_conf_mask = confidence[j] > confidence_threshold
1413
+ num_high_confidence = high_conf_mask.sum().item()
1414
+ if num_high_confidence >= num_to_transfer:
1415
+ transfer_mask[j] = high_conf_mask
1416
+ else:
1417
+ num_masks = is_mask[j].sum().item()
1418
+ k = min(num_to_transfer, num_masks)
1419
+ if k > 0:
1420
+ _, idx = torch.topk(confidence[j], k)
1421
+ transfer_mask[j, idx] = True
1422
+
1423
+ else:
1424
+ raise ValueError(f"Unknown remasking strategy: {remasking_strategy}")
1425
+
1426
+ # Update tokens - ensure all tensors are on same device
1427
+ cur_block_ids = cur_block_ids.to(x0.device)
1428
+ cur_block_ids = torch.where(transfer_mask, x0, cur_block_ids)
1429
+ # Get embeddings - move x0 to embed layer's device first
1430
+ embed_layer = self.get_input_embeddings()
1431
+ x0_embeds = embed_layer(x0.to(embed_layer.weight.device))
1432
+ cur_block_embeds = cur_block_embeds.to(x0_embeds.device)
1433
+ cur_block_embeds = torch.where(transfer_mask.unsqueeze(-1).to(x0_embeds.device), x0_embeds, cur_block_embeds)
1434
+
1435
+ # Update global state - handle multi-GPU
1436
+ x_embeds[:, block_start:block_end] = cur_block_embeds.to(x_embeds.device)
1437
+ x_ids[:, block_start:block_end] = cur_block_ids.to(x_ids.device)
1438
+
1439
+ # Check for EOS
1440
+ if block_end > prompt_len:
1441
+ gen_start_in_block = max(prompt_len, block_start)
1442
+ gen_ids_check = x_ids[:, gen_start_in_block:block_end]
1443
+ if eos_token_id in gen_ids_check:
1444
+ break
1445
+
1446
+ # Return only generated tokens
1447
+ return x_ids[:, prompt_len:prompt_len + gen_length]
1448
+
1449
+ def _sample_tokens(self, logits, temperature=0.0, top_k=0, top_p=1.0):
1450
+ """Sample tokens with temperature, top-k, and top-p."""
1451
+ batch_size = logits.shape[0]
1452
+ seq_len = logits.shape[1]
1453
+ vocab_size = logits.shape[-1]
1454
+
1455
+ logits_2d = logits.reshape(-1, vocab_size)
1456
+
1457
+ if temperature == 0:
1458
+ # Greedy sampling
1459
+ tokens = torch.argmax(logits_2d, dim=-1, keepdim=True)
1460
+ probs = F.softmax(logits_2d, dim=-1)
1461
+ token_probs = torch.gather(probs, -1, tokens)
1462
+ else:
1463
+ # Apply temperature
1464
+ logits_scaled = logits_2d / temperature
1465
+
1466
+ # Apply top-k
1467
+ if top_k > 0:
1468
+ values, _ = torch.topk(logits_scaled, top_k)
1469
+ min_values = values[:, -1:]
1470
+ logits_scaled = torch.where(logits_scaled < min_values, float('-inf'), logits_scaled)
1471
+
1472
+ # Apply top-p
1473
+ if top_p < 1.0:
1474
+ sorted_logits, sorted_indices = torch.sort(logits_scaled, descending=True)
1475
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
1476
+ sorted_mask = cumulative_probs > top_p
1477
+ sorted_mask[:, 1:] = sorted_mask[:, :-1].clone()
1478
+ sorted_mask[:, 0] = False
1479
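+ # Shifting the mask right by one keeps the token that first crosses top_p, so at least one
+ # candidate always survives.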
+ mask_indices = torch.scatter(
1480
+ torch.zeros_like(logits_scaled, dtype=torch.bool),
1481
+ -1, sorted_indices, sorted_mask
1482
+ )
1483
+ logits_scaled = logits_scaled.masked_fill(mask_indices, float('-inf'))
1484
+
1485
+ probs = F.softmax(logits_scaled, dim=-1)
1486
+ tokens = torch.multinomial(probs, num_samples=1)
1487
+ token_probs = torch.gather(probs, -1, tokens)
1488
+
1489
+ return tokens.view(batch_size, seq_len), token_probs.view(batch_size, seq_len)
1490
+
1491
+ def _get_num_transfer_tokens(self, block_length, steps):
1492
+ """Calculate how many tokens to unmask at each step."""
1493
+ if steps == 0:
1494
+ return torch.zeros(1, dtype=torch.int64)
1495
+ base = block_length // steps
1496
+ remainder = block_length % steps
1497
+ num_transfer = torch.zeros(steps + 1, dtype=torch.int64) + base
1498
+ num_transfer[:remainder] += 1
1499
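+ # Example: block_length=8, steps=8 -> tensor([1, 1, 1, 1, 1, 1, 1, 1, 1]); once every token
+ # in a block is unmasked, the generation loop breaks early (after storing the block's KV
+ # when caching is enabled).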
+ return num_transfer
1500
+
1501
+ from transformers import AutoConfig, AutoModelForCausalLM
1502
+
1503
+ AutoConfig.register("diffusionvl_qwen2_5_vl", DiffusionVL_Qwen2_5_VL_Config)
1504
+ AutoModelForCausalLM.register(DiffusionVL_Qwen2_5_VL_Config, DiffusionVL_Qwen2_5_VL_ForConditionalGeneration)
1505
+
1506
+
1507
+ __all__ = [
1508
+ "DiffusionVL_Qwen2_5_VL_Config",
1509
+ "DiffusionVL_Qwen2_5_VL_VisionConfig",
1510
+ "DiffusionVL_Qwen2_5_VL_PreTrainedModel",
1511
+ "DiffusionVL_Qwen2_5_VL_Model",
1512
+ "DiffusionVL_Qwen2_5_VL_ForConditionalGeneration",
1513
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "min_pixels": 3136,
3
+ "max_pixels": 12845056,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessor",
18
+ "use_fast": false,
19
+ "processor_class": "DiffusionVL_Qwen2_5_VL_Processor",
20
+ "auto_map": {
21
+ "AutoProcessor": "processing_diffusionvl_qwen2_5_vl.DiffusionVL_Qwen2_5_VL_Processor"
22
+ }
23
+ }
processing_diffusionvl_qwen2_5_vl.py ADDED
@@ -0,0 +1,313 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
5
+ # and the GPT-NeoX and OPT implementations. It has been modified to create DiffusionVL.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """
19
+ DiffusionVL Processor - Combines image processor and tokenizer.
20
+ """
21
+
22
+ import re
23
+ from typing import List, Optional, Union
24
+
25
+ import torch
26
+
27
+ from transformers.feature_extraction_utils import BatchFeature
28
+ from transformers.image_utils import ImageInput
29
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
30
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
31
+ from transformers.video_utils import VideoInput
32
+
33
+
34
+ IMAGE_TOKEN_INDEX = -200
35
+ DEFAULT_IMAGE_TOKEN = "<image>"
36
+
37
+
38
+ class DiffusionVL_Qwen2_5_VL_ProcessorKwargs(ProcessingKwargs, total=False):
39
+ """Keyword arguments for DiffusionVL_Qwen2_5_VL_Processor."""
40
+
41
+ _defaults = {
42
+ "text_kwargs": {
43
+ "padding": False,
44
+ },
45
+ }
46
+
47
+
48
+ def tokenizer_image_token(
49
+ prompt: str,
50
+ tokenizer,
51
+ image_token_index: int = IMAGE_TOKEN_INDEX,
52
+ return_tensors: Optional[str] = None,
53
+ ) -> Union[List[int], torch.Tensor]:
54
+ """
55
+ Tokenize text with image placeholders, replacing <image> with IMAGE_TOKEN_INDEX.
56
+
57
+ Args:
58
+ prompt: Input text containing <image> placeholders.
59
+ tokenizer: The tokenizer to use for encoding text.
60
+ image_token_index: The token index to use for image placeholders.
61
+ return_tensors: If "pt", return a PyTorch tensor.
62
+
63
+ Returns:
64
+ List of token IDs or a PyTorch tensor.
65
+ """
66
+ prompt_chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
67
+
68
+ input_ids = []
69
+ offset = 0
70
+
71
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0:
72
+ # First chunk has text
73
+ input_ids = tokenizer(prompt_chunks[0], add_special_tokens=False).input_ids
74
+ offset = 1
75
+
76
+ for chunk_idx in range(offset, len(prompt_chunks)):
77
+ chunk = prompt_chunks[chunk_idx]
78
+ # Add image token
79
+ input_ids.append(image_token_index)
80
+ # Add text after image
81
+ if len(chunk) > 0:
82
+ input_ids.extend(tokenizer(chunk, add_special_tokens=False).input_ids)
83
+
84
+ if return_tensors == "pt":
85
+ return torch.tensor(input_ids, dtype=torch.long)
86
+ return input_ids
87
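+ # Example (with the <image>-prefix handling above): "<image>\nDescribe this image."
+ # -> [IMAGE_TOKEN_INDEX] + tokenizer("\nDescribe this image.", add_special_tokens=False).input_ids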
+
88
+
89
+ class DiffusionVL_Qwen2_5_VL_Processor(ProcessorMixin):
90
+ r"""
91
+ Constructs a DiffusionVL processor which wraps an image processor and a tokenizer into a single processor.
92
+
93
+ [`DiffusionVL_Qwen2_5_VL_Processor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
94
+ See the [`~DiffusionVL_Qwen2_5_VL_Processor.__call__`] and [`~DiffusionVL_Qwen2_5_VL_Processor.decode`] for more information.
95
+
96
+ This processor uses LLaVA-style image token handling:
97
+ - `<image>` in text is replaced with `IMAGE_TOKEN_INDEX` (-200) in input_ids
98
+ - The model's `prepare_inputs_labels_for_multimodal` replaces -200 with actual image features
99
+
100
+ Args:
101
+ image_processor ([`Qwen2VLImageProcessor`], *optional*):
102
+ The image processor is a required input.
103
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
104
+ The tokenizer is a required input.
105
+ chat_template (`str`, *optional*):
106
+ A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
107
+
108
+ Example:
109
+
110
+ ```python
111
+ >>> from transformers import AutoProcessor
112
+ >>> from PIL import Image
113
+
114
+ >>> processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
115
+
116
+ >>> # Prepare text with image placeholder
117
+ >>> messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
118
+ >>> text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
119
+
120
+ >>> # Process image and text
121
+ >>> image = Image.open("image.jpg")
122
+ >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
123
+ ```
124
+ """
125
+
126
+ attributes = ["image_processor", "tokenizer"]
127
+ image_processor_class = "Qwen2VLImageProcessor"
128
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
129
+
130
+ def __init__(
131
+ self,
132
+ image_processor=None,
133
+ tokenizer=None,
134
+ chat_template: Optional[str] = None,
135
+ **kwargs,
136
+ ):
137
+ self.image_token = DEFAULT_IMAGE_TOKEN
138
+ self.image_token_index = IMAGE_TOKEN_INDEX
139
+
140
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
141
+
142
+ def __call__(
143
+ self,
144
+ images: Optional[ImageInput] = None,
145
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
146
+ videos: Optional[VideoInput] = None,
147
+ **kwargs: Unpack[DiffusionVL_Qwen2_5_VL_ProcessorKwargs],
148
+ ) -> BatchFeature:
149
+ """
150
+ Main method to prepare for the model one or several sequences and image(s).
151
+
152
+ This method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`]
153
+ if `text` is not `None` to encode the text. To prepare the vision inputs, this method forwards the `images`
154
+ and `kwargs` arguments to Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `images` is not `None`.
155
+
156
+ The text should contain `<image>` placeholders where images should be inserted.
157
+ These will be replaced with `IMAGE_TOKEN_INDEX` (-200) in the output input_ids.
158
+
159
+ Args:
160
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, *optional*):
161
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array, or PyTorch
162
+ tensor. Both channels-first and channels-last formats are supported.
163
+ text (`str`, `List[str]`, *optional*):
164
+ The sequence or batch of sequences to be encoded. Each sequence should be a string containing
165
+ `<image>` placeholders where images will be inserted.
166
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, *optional*):
167
+ The video or batch of videos to be prepared. Currently not fully supported.
168
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
169
+ If set, will return tensors of a particular framework. Acceptable values are:
170
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
171
+ - `'np'`: Return NumPy `np.ndarray` objects.
172
+
173
+ Returns:
174
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
175
+
176
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
177
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
178
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
179
+ - **image_grid_thw** -- List of image 3D grid dimensions. Returned when `images` is not `None`.
180
+ """
181
+ output_kwargs = self._merge_kwargs(
182
+ DiffusionVL_Qwen2_5_VL_ProcessorKwargs,
183
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
184
+ **kwargs,
185
+ )
186
+
187
+ # Process images
188
+ image_inputs = {}
189
+ if images is not None:
190
+ image_inputs = self.image_processor(
191
+ images=images, **output_kwargs.get("images_kwargs", {})
192
+ )
193
+
194
+ # Handle text input
195
+ if text is None:
196
+ return BatchFeature(data=image_inputs)
197
+
198
+ if not isinstance(text, list):
+        text = [text]
+
+        # Tokenize with LLaVA-style image token handling
+        return_tensors = output_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
+
+        all_input_ids = []
+        for t in text:
+            input_ids = tokenizer_image_token(
+                t, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors=None
+            )
+            all_input_ids.append(input_ids)
+
+        # Pad sequences
+        max_len = max(len(ids) for ids in all_input_ids)
+        padded_input_ids = []
+        attention_masks = []
+
+        pad_token_id = (
+            self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0
+        )
+
+        for ids in all_input_ids:
+            padding_length = max_len - len(ids)
+            padded_ids = ids + [pad_token_id] * padding_length
+            mask = [1] * len(ids) + [0] * padding_length
+            padded_input_ids.append(padded_ids)
+            attention_masks.append(mask)
+
+        text_inputs = {
+            "input_ids": padded_input_ids,
+            "attention_mask": attention_masks,
+        }
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def build_conversation_input_ids(
+        self,
+        messages: List[dict],
+        images: Optional[List] = None,
+        add_generation_prompt: bool = True,
+    ) -> dict:
+        """
+        Build input_ids from conversation messages in LLaVA format.
+
+        This method converts a list of messages into a prompt string with `<image>` placeholders.
+        Uses LLaVA-style chat template format (trained format).
+
+        Args:
+            messages: List of message dicts with 'role' and 'content' keys.
+                Content can be a string or a list of dicts with 'type' key ('text' or 'image').
+            images: Optional list of images (used for validation).
+            add_generation_prompt: Whether to add generation prompt at the end.
+
+        Returns:
+            dict with 'text' key containing the prompt string with `<image>` placeholders.
+        """
+        # Build LLaVA-style prompt directly
+        # Format: <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nPrompt<|im_end|>\n<|im_start|>assistant\n
+
+        text_parts = []
+
+        for message in messages:
+            role = message.get("role", "user")
+            content = message.get("content", "")
+
+            text_parts.append(f"<|im_start|>{role}\n")
+
+            # Handle content - can be string or list of content items
+            if isinstance(content, str):
+                text_parts.append(content)
+            elif isinstance(content, list):
+                for item in content:
+                    if isinstance(item, dict):
+                        if item.get("type") == "image":
+                            text_parts.append(DEFAULT_IMAGE_TOKEN)
+                        elif item.get("type") == "text":
+                            text_parts.append(item.get("text", ""))
+                    else:
+                        text_parts.append(str(item))
+
+            text_parts.append("<|im_end|>\n")
+
+        if add_generation_prompt:
+            text_parts.append("<|im_start|>assistant\n")
+
+        text = "".join(text_parts)
+        return {"text": text}
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        Decode a batch of token IDs to text.
+
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
+        Please refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        Decode token IDs to text.
+
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`].
+        Please refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self) -> List[str]:
+        """Return the list of model input names."""
+        tokenizer_names = self.tokenizer.model_input_names
+        image_processor_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_names + image_processor_names))
+
+
+__all__ = ["DiffusionVL_Qwen2_5_VL_Processor", "tokenizer_image_token"]
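
For reference, here is a minimal usage sketch of `build_conversation_input_ids`, based only on the template logic shown above. It assumes `processor` is an already-instantiated `DiffusionVL_Qwen2_5_VL_Processor`; the message contents are illustrative.

```python
# Hedged sketch: `processor` is assumed to be a loaded DiffusionVL_Qwen2_5_VL_Processor.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

# Returns {"text": ...} with one <image> placeholder per image item.
prompt = processor.build_conversation_input_ids(messages, add_generation_prompt=True)["text"]

# Per the method above, content items are concatenated in order, so `prompt` is:
# "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
# "<|im_start|>user\n<image>Describe this image.<|im_end|>\n"
# "<|im_start|>assistant\n"
```

The resulting string can then be passed to the processor call above, which expands each `<image>` placeholder via `tokenizer_image_token` and right-pads the batch to its longest sequence with the tokenizer's pad token.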
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
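
As a quick sanity check (a sketch, not part of the repository), the tokenizer files above can be loaded with the standard `transformers` API; the asserted values come directly from `tokenizer_config.json` and `special_tokens_map.json`, and the path is a placeholder for the local model directory.

```python
from transformers import AutoTokenizer

# Placeholder path: point this at the directory containing the files above.
tokenizer = AutoTokenizer.from_pretrained("path/to/model")

# Values taken from tokenizer_config.json / special_tokens_map.json
assert tokenizer.eos_token == "<|im_end|>"
assert tokenizer.pad_token == "<|endoftext|>"
assert tokenizer.model_max_length == 8192

# Vision placeholder tokens are registered in added_tokens_decoder
print(tokenizer.convert_tokens_to_ids("<|image_pad|>"))  # 151655
```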
vocab.json ADDED
The diff for this file is too large to render. See raw diff