xiazhi committed (verified)
Commit b8d31f6 · 1 Parent(s): aecc019

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,83 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ tags:
+ - diffusion
+ - vision-language
+ - qwen2.5-vl
+ ---
+
+ # DiffusionVL
+
+ DiffusionVL is a vision-language model built on the Qwen2.5-VL architecture. Instead of autoregressive decoding, it generates text with BD3LM (Block Diffusion Language Model) diffusion-based generation.
+
+ ## Usage
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ import torch
+
+ # Load the model with trust_remote_code (the repository ships custom modeling code)
+ model = AutoModelForCausalLM.from_pretrained(
+     "path/to/model",
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True,
+ )
+
+ # Load the processor (includes the tokenizer)
+ processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
+
+ # Image + text generation
+ from PIL import Image
+ import requests
+
+ url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+ messages = [
+     {"role": "user", "content": [
+         {"type": "image"},
+         {"type": "text", "text": "Describe this image."}
+     ]}
+ ]
+ text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
+ inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
+
+ # Generate with diffusion
+ output_ids = model.generate(
+     inputs=inputs["input_ids"],
+     images=inputs.get("pixel_values"),
+     image_grid_thws=inputs.get("image_grid_thw"),
+     gen_length=256,
+     steps=8,
+     temperature=0.0,
+     remasking_strategy="low_confidence_static",
+ )
+
+ # Decode the output
+ output_text = processor.decode(output_ids[0], skip_special_tokens=True)
+ print(output_text)
+ ```
+
+ ## Generation Parameters
+
+ The `generate` call shown above accepts the following arguments (see the sampling example after this list):
+
+ - `gen_length`: Number of tokens to generate (default: 256)
+ - `steps`: Number of diffusion steps per block (default: 8)
+ - `temperature`: Sampling temperature, 0 for greedy decoding (default: 0.0)
+ - `top_k`: Top-k sampling parameter (default: 0, disabled)
+ - `top_p`: Top-p (nucleus) sampling parameter (default: 1.0)
+ - `remasking_strategy`: 'low_confidence' or 'sequential' (default: 'low_confidence')
+
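+ For example, a sampled (non-greedy) call could look like the sketch below. It continues the Usage snippet above (`model` and `inputs` are already defined); the argument names mirror the `generate` signature shown there, and the specific values are illustrative only.
+
+ ```python
+ # Illustrative sampled generation; values are arbitrary, not recommendations.
+ output_ids = model.generate(
+     inputs=inputs["input_ids"],
+     images=inputs.get("pixel_values"),
+     image_grid_thws=inputs.get("image_grid_thw"),
+     gen_length=128,
+     steps=8,
+     temperature=0.7,
+     top_k=50,
+     top_p=0.9,
+     remasking_strategy="low_confidence",
+ )
+ ```
+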
+ ## Model Configuration
+
+ Key fields from `config.json` (a snippet for reading them back programmatically follows this list):
+
+ - **Architecture**: DiffusionVL_Qwen2_5_VL_ForConditionalGeneration
+ - **BD3LM Enabled**: True
+ - **Block Size**: 8
+ - **Hidden Size**: 3584
+ - **Num Layers**: 28
+
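+ A minimal sketch for confirming these values from the checkpoint directory (the path is a placeholder):
+
+ ```python
+ from transformers import AutoConfig
+
+ config = AutoConfig.from_pretrained("path/to/model", trust_remote_code=True)
+ print(config.enable_bd3lm)       # True
+ print(config.bd3lm_block_size)   # 8
+ print(config.hidden_size)        # 3584
+ print(config.num_hidden_layers)  # 28
+ ```
+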
+ ## Notes
+
+ - Loading requires `trust_remote_code=True` because the repository ships custom modeling code
+ - Both the model and the processor can be loaded from the same directory
+ - Image preprocessing uses Qwen2VLImageProcessor internally (identical to Qwen2.5-VL)
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
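Note: once the repository is downloaded, these special-token ids can be checked against the tokenizer. A minimal sketch (the path is a placeholder):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
tok = processor.tokenizer
# The ids below should match the entries in added_tokens.json above.
print(tok.convert_tokens_to_ids("<|image_pad|>"))     # 151655
print(tok.convert_tokens_to_ids("<|vision_start|>"))  # 151652
```
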
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+ You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
+ {% endif %}<|im_start|>{{ message['role'] }}
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}<image>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}<video>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+ {% endif %}
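Note: for reference, this template renders the README's example messages roughly as shown in the comments below. A minimal sketch, assuming the tokenizer in this repository picks up the template above (the path is a placeholder):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."}
    ]}
]
prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <image>Describe this image.<|im_end|>
# <|im_start|>assistant
```
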
config.json ADDED
@@ -0,0 +1,155 @@
+ {
+   "add_faster_video": false,
+   "add_time_instruction": false,
+   "anneal_start_block_size": 1,
+   "architectures": [
+     "DiffusionVL_Qwen2_5_VL_ForConditionalGeneration"
+   ],
+   "attention_dropout": 0.0,
+   "bd3lm_antithetic_sampling": true,
+   "bd3lm_attn_backend": "sdpa",
+   "bd3lm_block_aligned_eos": true,
+   "bd3lm_block_size": 8,
+   "bd3lm_complementary_mask": false,
+   "bd3lm_cross_attn": true,
+   "bd3lm_ignore_bos": true,
+   "bd3lm_mask_prob": 0.5,
+   "bd3lm_noise_granularity": "block",
+   "bd3lm_noise_type": "loglinear",
+   "bd3lm_parameterization": "subs",
+   "bd3lm_resample": false,
+   "bd3lm_sampling_eps_max": 1.0,
+   "bd3lm_sampling_eps_min": 0.001,
+   "bd3lm_time_conditioning": false,
+   "bd3lm_token_shift_prediction": false,
+   "bd3lm_var_min": true,
+   "bos_token_id": 151643,
+   "enable_bd3lm": true,
+   "enable_block_size_annealing": false,
+   "enable_mtd": false,
+   "enable_noise_level_annealing": false,
+   "eos_token_id": 151645,
+   "faster_token_stride": 10,
+   "force_sample": false,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "image_aspect_ratio": "pad",
+   "image_crop_resolution": null,
+   "image_grid_pinpoints": null,
+   "image_split_resolution": null,
+   "image_token_id": null,
+   "initializer_range": 0.02,
+   "intermediate_size": 18944,
+   "layer_types": [
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention",
+     "full_attention"
+   ],
+   "max_pixels": 262144,
+   "max_position_embeddings": 128000,
+   "max_window_layers": 28,
+   "min_pixels": 147456,
+   "mm_hidden_size": 1280,
+   "mm_newline_position": "grid",
+   "mm_patch_merge_type": "flat",
+   "mm_projector_lr": null,
+   "mm_projector_type": "qwen_merger",
+   "mm_resampler_type": null,
+   "mm_spatial_pool_mode": "bilinear",
+   "mm_spatial_pool_stride": null,
+   "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model",
+   "mm_use_im_patch_token": false,
+   "mm_use_im_start_end": false,
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -2,
+   "mm_vision_tower_lr": 2e-06,
+   "model_max_length": 8192,
+   "model_type": "diffusionvl_qwen2_5_vl",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "pos_skipping_range": 4096,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": {
+     "mrope_section": [
+       16,
+       24,
+       24
+     ],
+     "rope_type": "default",
+     "type": "default"
+   },
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "tokenizer_model_max_length": 8192,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.55.0",
+   "use_cache": true,
+   "use_mm_proj": true,
+   "use_pos_skipping": false,
+   "use_sliding_window": false,
+   "video_token_id": null,
+   "vision_config": {
+     "depth": 32,
+     "fullatt_block_indexes": [
+       7,
+       15,
+       23,
+       31
+     ],
+     "hidden_act": "silu",
+     "hidden_size": 1280,
+     "in_channels": 3,
+     "in_chans": 3,
+     "initializer_range": 0.02,
+     "intermediate_size": 3420,
+     "model_type": "",
+     "num_heads": 16,
+     "out_hidden_size": 3584,
+     "patch_size": 14,
+     "spatial_merge_size": 2,
+     "spatial_patch_size": 14,
+     "temporal_patch_size": 2,
+     "tokens_per_second": 2,
+     "torch_dtype": "float32",
+     "window_size": 112
+   },
+   "vision_end_token_id": 151653,
+   "vision_start_token_id": 151652,
+   "vision_token_id": 151654,
+   "vision_tower_pretrained": null,
+   "vocab_size": 152064,
+   "mask_token_id": 151671,
+   "auto_map": {
+     "AutoConfig": "configuration_diffusionvl_qwen2_5_vl.DiffusionVL_Qwen2_5_VL_Config",
+     "AutoModelForCausalLM": "modeling_diffusionvl_qwen2_5_vl.DiffusionVL_Qwen2_5_VL_ForConditionalGeneration",
+     "AutoProcessor": "processing_diffusionvl_qwen2_5_vl.DiffusionVL_Qwen2_5_VL_Processor"
+   }
+ }
configuration_diffusionvl_qwen2_5_vl.py ADDED
@@ -0,0 +1,280 @@
+ # coding=utf-8
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
+ # and the GPT-NeoX and OPT implementations. It has been modified to create DiffusionVL.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """DiffusionVL (Qwen2.5-VL based) model configuration."""
+
+ from typing import List, Optional, Union
+
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class DiffusionVL_Qwen2_5_VL_VisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`DiffusionVL_Qwen2_5_VL_VisionModel`].
+     It is used to instantiate the vision encoder according to the specified arguments.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+     Read the documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         depth (`int`, *optional*, defaults to 32):
+             Number of vision transformer layers.
+         hidden_size (`int`, *optional*, defaults to 1280):
+             Dimensionality of the encoder layers and the pooler layer.
+         hidden_act (`str`, *optional*, defaults to `"silu"`):
+             The non-linear activation function in the encoder.
+         intermediate_size (`int`, *optional*, defaults to 3420):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer.
+         num_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer.
+         in_channels (`int`, *optional*, defaults to 3):
+             Number of input channels.
+         patch_size (`int`, *optional*, defaults to 14):
+             The size of each image patch.
+         spatial_merge_size (`int`, *optional*, defaults to 2):
+             The spatial merge size for patch merging.
+         temporal_patch_size (`int`, *optional*, defaults to 2):
+             The temporal patch size for video processing.
+         tokens_per_second (`int`, *optional*, defaults to 4):
+             Number of tokens per second for video processing.
+         window_size (`int`, *optional*, defaults to 112):
+             Window size for windowed attention.
+         out_hidden_size (`int`, *optional*, defaults to 3584):
+             Output hidden size after the vision encoder.
+         fullatt_block_indexes (`List[int]`, *optional*):
+             Indices of blocks that use full attention instead of windowed attention.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing weight matrices.
+
+     Example:
+
+     ```python
+     >>> from configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_VisionConfig
+
+     >>> # Initializing a DiffusionVL vision configuration
+     >>> configuration = DiffusionVL_Qwen2_5_VL_VisionConfig()
+     ```
+     """
+
+     model_type = "diffusionvl_qwen2_5_vl_vision"
+     base_config_key = "vision_config"
+
+     def __init__(
+         self,
+         depth: int = 32,
+         hidden_size: int = 1280,
+         hidden_act: str = "silu",
+         intermediate_size: int = 3420,
+         num_heads: int = 16,
+         in_channels: int = 3,
+         patch_size: int = 14,
+         spatial_merge_size: int = 2,
+         temporal_patch_size: int = 2,
+         tokens_per_second: int = 4,
+         window_size: int = 112,
+         out_hidden_size: int = 3584,
+         fullatt_block_indexes: Optional[List[int]] = None,
+         initializer_range: float = 0.02,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.depth = depth
+         self.hidden_size = hidden_size
+         self.hidden_act = hidden_act
+         self.intermediate_size = intermediate_size
+         self.num_heads = num_heads
+         self.in_channels = in_channels
+         self.patch_size = patch_size
+         self.spatial_merge_size = spatial_merge_size
+         self.temporal_patch_size = temporal_patch_size
+         self.tokens_per_second = tokens_per_second
+         self.window_size = window_size
+         self.out_hidden_size = out_hidden_size
+         self.fullatt_block_indexes = fullatt_block_indexes or [7, 15, 23, 31]
+         self.initializer_range = initializer_range
+
+
+ class DiffusionVL_Qwen2_5_VL_Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`DiffusionVL_Qwen2_5_VL_ForConditionalGeneration`].
+     It is used to instantiate a DiffusionVL model according to the specified arguments.
+
+     DiffusionVL extends Qwen2.5-VL architecture with BD3LM (Block Diffusion Language Model)
+     for diffusion-based text generation instead of autoregressive decoding.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+     Read the documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 152064):
+             Vocabulary size of the DiffusionVL model.
+         hidden_size (`int`, *optional*, defaults to 3584):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 18944):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 28):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 28):
+             Number of attention heads for each attention layer.
+         num_key_value_heads (`int`, *optional*, defaults to 4):
+             Number of key-value heads for Grouped Query Attention (GQA).
+         hidden_act (`str`, *optional*, defaults to `"silu"`):
+             The non-linear activation function in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 128000):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the RMS normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether to use the past key/values attentions.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether the model's input and output word embeddings should be tied.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         vision_config (`DiffusionVL_Qwen2_5_VL_VisionConfig`, *optional*):
+             The configuration for the vision encoder.
+         image_token_id (`int`, *optional*, defaults to 151655):
+             The token index for image placeholder.
+         video_token_id (`int`, *optional*, defaults to 151656):
+             The token index for video placeholder.
+         vision_start_token_id (`int`, *optional*, defaults to 151652):
+             The token index denoting start of vision input.
+         vision_end_token_id (`int`, *optional*, defaults to 151653):
+             The token index denoting end of vision input.
+         enable_bd3lm (`bool`, *optional*, defaults to `True`):
+             Whether to enable BD3LM diffusion-based generation.
+         bd3lm_block_size (`int`, *optional*, defaults to 8):
+             Block size for BD3LM generation.
+         bd3lm_cross_attn (`bool`, *optional*, defaults to `True`):
+             Whether to use cross-attention in BD3LM.
+         mask_token_id (`int`, *optional*, defaults to 151671):
+             The token index for mask token used in diffusion.
+         rope_theta (`float`, *optional*, defaults to 1000000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for RoPE embeddings.
+
+     Example:
+
+     ```python
+     >>> from transformers import AutoModelForCausalLM
+     >>> from configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_Config
+
+     >>> # Initializing a DiffusionVL configuration
+     >>> configuration = DiffusionVL_Qwen2_5_VL_Config()
+
+     >>> # Initializing a model from the configuration
+     >>> model = AutoModelForCausalLM.from_pretrained(
+     ...     "path/to/model", config=configuration, trust_remote_code=True
+     ... )
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```
+     """
+
+     model_type = "diffusionvl_qwen2_5_vl"
+     sub_configs = {"vision_config": DiffusionVL_Qwen2_5_VL_VisionConfig}
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size: int = 152064,
+         hidden_size: int = 3584,
+         intermediate_size: int = 18944,
+         num_hidden_layers: int = 28,
+         num_attention_heads: int = 28,
+         num_key_value_heads: int = 4,
+         hidden_act: str = "silu",
+         max_position_embeddings: int = 128000,
+         initializer_range: float = 0.02,
+         rms_norm_eps: float = 1e-6,
+         use_cache: bool = True,
+         tie_word_embeddings: bool = False,
+         attention_dropout: float = 0.0,
+         # Vision configuration
+         vision_config: Optional[Union[DiffusionVL_Qwen2_5_VL_VisionConfig, dict]] = None,
+         # Multimodal token IDs
+         image_token_id: int = 151655,
+         video_token_id: int = 151656,
+         vision_start_token_id: int = 151652,
+         vision_end_token_id: int = 151653,
+         # BD3LM diffusion parameters
+         enable_bd3lm: bool = True,
+         bd3lm_block_size: int = 8,
+         bd3lm_cross_attn: bool = True,
+         bd3lm_antithetic_sampling: bool = True,
+         bd3lm_sampling_eps_min: float = 1e-3,
+         bd3lm_sampling_eps_max: float = 1.0,
+         mask_token_id: int = 151671,
+         # RoPE parameters
+         rope_theta: float = 1000000.0,
+         rope_scaling: Optional[dict] = None,
+         **kwargs,
+     ):
+         # Text model configuration
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.attention_dropout = attention_dropout
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling or {
+             "mrope_section": [16, 24, 24],
+             "rope_type": "default",
+             "type": "default",
+         }
+
+         # Vision configuration
+         if vision_config is None:
+             self.vision_config = DiffusionVL_Qwen2_5_VL_VisionConfig()
+         elif isinstance(vision_config, dict):
+             self.vision_config = DiffusionVL_Qwen2_5_VL_VisionConfig(**vision_config)
+         elif isinstance(vision_config, DiffusionVL_Qwen2_5_VL_VisionConfig):
+             self.vision_config = vision_config
+         else:
+             self.vision_config = DiffusionVL_Qwen2_5_VL_VisionConfig()
+
+         # Multimodal token IDs
+         self.image_token_id = image_token_id
+         self.video_token_id = video_token_id
+         self.vision_start_token_id = vision_start_token_id
+         self.vision_end_token_id = vision_end_token_id
+
+         # BD3LM diffusion configuration
+         self.enable_bd3lm = enable_bd3lm
+         self.bd3lm_block_size = bd3lm_block_size
+         self.bd3lm_cross_attn = bd3lm_cross_attn
+         self.bd3lm_antithetic_sampling = bd3lm_antithetic_sampling
+         self.bd3lm_sampling_eps_min = bd3lm_sampling_eps_min
+         self.bd3lm_sampling_eps_max = bd3lm_sampling_eps_max
+         self.mask_token_id = mask_token_id
+
+         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+ __all__ = ["DiffusionVL_Qwen2_5_VL_Config", "DiffusionVL_Qwen2_5_VL_VisionConfig"]
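Note: as a quick sanity check, the configuration class above can be instantiated directly with custom BD3LM settings. A minimal sketch, assuming it is run from a local clone of this repository so the module is importable (the overridden values are arbitrary):

```python
from configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_Config

# Override a couple of BD3LM-related fields; everything else keeps the defaults above.
config = DiffusionVL_Qwen2_5_VL_Config(bd3lm_block_size=16, enable_bd3lm=True)
print(config.bd3lm_block_size)               # 16
print(config.vision_config.out_hidden_size)  # 3584 (default vision config)
print(config.rope_scaling)                   # {'mrope_section': [16, 24, 24], ...}
```
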
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e753b920ec44c00e7c1395c2d048e594cbd46663ddcac8f1b6c9aa16cea2a86f
+ size 4877660744
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:896fbe05de22acd29ccdcd719c45c1e99cbdb4c1bd1c00339574b1cf565c2bd6
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ef02d3158f2e613522a6512e4170479555a8d12baf5a22d39c0bdf0b548e5ff
+ size 4995019896
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c8b1fbd664a0cd6dc23bbb30f0c0199635de1c237cffd64856096af8c43898e
+ size 1778992544
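Note: the four shards above are resolved through `model.safetensors.index.json`, shown next. `from_pretrained` handles this automatically; a minimal sketch for inspecting a single weight by hand, assuming the files have been downloaded into the current directory:

```python
import json
from safetensors import safe_open

# Map a weight name to its shard via the index, then open that shard lazily.
with open("model.safetensors.index.json") as fp:
    index = json.load(fp)
shard = index["weight_map"]["model.embed_tokens.weight"]  # -> "model-00001-of-00004.safetensors"
with safe_open(shard, framework="pt", device="cpu") as f:
    print(f.get_tensor("model.embed_tokens.weight").shape)
```
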
model.safetensors.index.json ADDED
@@ -0,0 +1,737 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 848896,
4
+ "total_size": 16584333320
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
298
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
299
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
300
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
301
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
303
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
304
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
306
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
308
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
309
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
310
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
311
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
312
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
313
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
315
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
316
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
318
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
320
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
321
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
327
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
328
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
330
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
332
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
333
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "model.mm_projector.merger.ln_q.weight": "model-00004-of-00004.safetensors",
346
+ "model.mm_projector.merger.mlp.0.bias": "model-00004-of-00004.safetensors",
347
+ "model.mm_projector.merger.mlp.0.weight": "model-00004-of-00004.safetensors",
348
+ "model.mm_projector.merger.mlp.2.bias": "model-00004-of-00004.safetensors",
349
+ "model.mm_projector.merger.mlp.2.weight": "model-00004-of-00004.safetensors",
350
+ "model.norm.weight": "model-00003-of-00004.safetensors",
351
+ "model.vision_tower.vision_tower.blocks.0.attn.proj.bias": "model-00003-of-00004.safetensors",
352
+ "model.vision_tower.vision_tower.blocks.0.attn.proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.vision_tower.vision_tower.blocks.0.attn.qkv.bias": "model-00003-of-00004.safetensors",
354
+ "model.vision_tower.vision_tower.blocks.0.attn.qkv.weight": "model-00003-of-00004.safetensors",
355
+ "model.vision_tower.vision_tower.blocks.0.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
356
+ "model.vision_tower.vision_tower.blocks.0.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
357
+ "model.vision_tower.vision_tower.blocks.0.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
358
+ "model.vision_tower.vision_tower.blocks.0.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
359
+ "model.vision_tower.vision_tower.blocks.0.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
360
+ "model.vision_tower.vision_tower.blocks.0.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
361
+ "model.vision_tower.vision_tower.blocks.0.norm1.weight": "model-00003-of-00004.safetensors",
362
+ "model.vision_tower.vision_tower.blocks.0.norm2.weight": "model-00003-of-00004.safetensors",
363
+ "model.vision_tower.vision_tower.blocks.1.attn.proj.bias": "model-00003-of-00004.safetensors",
364
+ "model.vision_tower.vision_tower.blocks.1.attn.proj.weight": "model-00003-of-00004.safetensors",
365
+ "model.vision_tower.vision_tower.blocks.1.attn.qkv.bias": "model-00003-of-00004.safetensors",
366
+ "model.vision_tower.vision_tower.blocks.1.attn.qkv.weight": "model-00003-of-00004.safetensors",
367
+ "model.vision_tower.vision_tower.blocks.1.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
368
+ "model.vision_tower.vision_tower.blocks.1.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
369
+ "model.vision_tower.vision_tower.blocks.1.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
370
+ "model.vision_tower.vision_tower.blocks.1.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
371
+ "model.vision_tower.vision_tower.blocks.1.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
372
+ "model.vision_tower.vision_tower.blocks.1.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
373
+ "model.vision_tower.vision_tower.blocks.1.norm1.weight": "model-00003-of-00004.safetensors",
374
+ "model.vision_tower.vision_tower.blocks.1.norm2.weight": "model-00003-of-00004.safetensors",
375
+ "model.vision_tower.vision_tower.blocks.10.attn.proj.bias": "model-00003-of-00004.safetensors",
376
+ "model.vision_tower.vision_tower.blocks.10.attn.proj.weight": "model-00003-of-00004.safetensors",
377
+ "model.vision_tower.vision_tower.blocks.10.attn.qkv.bias": "model-00003-of-00004.safetensors",
378
+ "model.vision_tower.vision_tower.blocks.10.attn.qkv.weight": "model-00003-of-00004.safetensors",
379
+ "model.vision_tower.vision_tower.blocks.10.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
380
+ "model.vision_tower.vision_tower.blocks.10.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
381
+ "model.vision_tower.vision_tower.blocks.10.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
382
+ "model.vision_tower.vision_tower.blocks.10.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
383
+ "model.vision_tower.vision_tower.blocks.10.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
384
+ "model.vision_tower.vision_tower.blocks.10.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
385
+ "model.vision_tower.vision_tower.blocks.10.norm1.weight": "model-00003-of-00004.safetensors",
386
+ "model.vision_tower.vision_tower.blocks.10.norm2.weight": "model-00003-of-00004.safetensors",
387
+ "model.vision_tower.vision_tower.blocks.11.attn.proj.bias": "model-00003-of-00004.safetensors",
388
+ "model.vision_tower.vision_tower.blocks.11.attn.proj.weight": "model-00003-of-00004.safetensors",
389
+ "model.vision_tower.vision_tower.blocks.11.attn.qkv.bias": "model-00003-of-00004.safetensors",
390
+ "model.vision_tower.vision_tower.blocks.11.attn.qkv.weight": "model-00003-of-00004.safetensors",
391
+ "model.vision_tower.vision_tower.blocks.11.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
392
+ "model.vision_tower.vision_tower.blocks.11.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
393
+ "model.vision_tower.vision_tower.blocks.11.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
394
+ "model.vision_tower.vision_tower.blocks.11.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
395
+ "model.vision_tower.vision_tower.blocks.11.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
396
+ "model.vision_tower.vision_tower.blocks.11.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
397
+ "model.vision_tower.vision_tower.blocks.11.norm1.weight": "model-00003-of-00004.safetensors",
398
+ "model.vision_tower.vision_tower.blocks.11.norm2.weight": "model-00003-of-00004.safetensors",
399
+ "model.vision_tower.vision_tower.blocks.12.attn.proj.bias": "model-00003-of-00004.safetensors",
400
+ "model.vision_tower.vision_tower.blocks.12.attn.proj.weight": "model-00003-of-00004.safetensors",
401
+ "model.vision_tower.vision_tower.blocks.12.attn.qkv.bias": "model-00003-of-00004.safetensors",
402
+ "model.vision_tower.vision_tower.blocks.12.attn.qkv.weight": "model-00003-of-00004.safetensors",
403
+ "model.vision_tower.vision_tower.blocks.12.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
404
+ "model.vision_tower.vision_tower.blocks.12.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
405
+ "model.vision_tower.vision_tower.blocks.12.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
406
+ "model.vision_tower.vision_tower.blocks.12.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
407
+ "model.vision_tower.vision_tower.blocks.12.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
408
+ "model.vision_tower.vision_tower.blocks.12.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
409
+ "model.vision_tower.vision_tower.blocks.12.norm1.weight": "model-00003-of-00004.safetensors",
410
+ "model.vision_tower.vision_tower.blocks.12.norm2.weight": "model-00003-of-00004.safetensors",
411
+ "model.vision_tower.vision_tower.blocks.13.attn.proj.bias": "model-00003-of-00004.safetensors",
412
+ "model.vision_tower.vision_tower.blocks.13.attn.proj.weight": "model-00003-of-00004.safetensors",
413
+ "model.vision_tower.vision_tower.blocks.13.attn.qkv.bias": "model-00003-of-00004.safetensors",
414
+ "model.vision_tower.vision_tower.blocks.13.attn.qkv.weight": "model-00003-of-00004.safetensors",
415
+ "model.vision_tower.vision_tower.blocks.13.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
416
+ "model.vision_tower.vision_tower.blocks.13.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
417
+ "model.vision_tower.vision_tower.blocks.13.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
418
+ "model.vision_tower.vision_tower.blocks.13.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
419
+ "model.vision_tower.vision_tower.blocks.13.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
420
+ "model.vision_tower.vision_tower.blocks.13.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
421
+ "model.vision_tower.vision_tower.blocks.13.norm1.weight": "model-00003-of-00004.safetensors",
422
+ "model.vision_tower.vision_tower.blocks.13.norm2.weight": "model-00003-of-00004.safetensors",
423
+ "model.vision_tower.vision_tower.blocks.14.attn.proj.bias": "model-00003-of-00004.safetensors",
424
+ "model.vision_tower.vision_tower.blocks.14.attn.proj.weight": "model-00003-of-00004.safetensors",
425
+ "model.vision_tower.vision_tower.blocks.14.attn.qkv.bias": "model-00003-of-00004.safetensors",
426
+ "model.vision_tower.vision_tower.blocks.14.attn.qkv.weight": "model-00003-of-00004.safetensors",
427
+ "model.vision_tower.vision_tower.blocks.14.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
428
+ "model.vision_tower.vision_tower.blocks.14.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
429
+ "model.vision_tower.vision_tower.blocks.14.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
430
+ "model.vision_tower.vision_tower.blocks.14.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
431
+ "model.vision_tower.vision_tower.blocks.14.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
432
+ "model.vision_tower.vision_tower.blocks.14.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
433
+ "model.vision_tower.vision_tower.blocks.14.norm1.weight": "model-00003-of-00004.safetensors",
434
+ "model.vision_tower.vision_tower.blocks.14.norm2.weight": "model-00003-of-00004.safetensors",
435
+ "model.vision_tower.vision_tower.blocks.15.attn.proj.bias": "model-00003-of-00004.safetensors",
436
+ "model.vision_tower.vision_tower.blocks.15.attn.proj.weight": "model-00003-of-00004.safetensors",
437
+ "model.vision_tower.vision_tower.blocks.15.attn.qkv.bias": "model-00003-of-00004.safetensors",
438
+ "model.vision_tower.vision_tower.blocks.15.attn.qkv.weight": "model-00003-of-00004.safetensors",
439
+ "model.vision_tower.vision_tower.blocks.15.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
440
+ "model.vision_tower.vision_tower.blocks.15.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
441
+ "model.vision_tower.vision_tower.blocks.15.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
442
+ "model.vision_tower.vision_tower.blocks.15.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
443
+ "model.vision_tower.vision_tower.blocks.15.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
444
+ "model.vision_tower.vision_tower.blocks.15.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
445
+ "model.vision_tower.vision_tower.blocks.15.norm1.weight": "model-00003-of-00004.safetensors",
446
+ "model.vision_tower.vision_tower.blocks.15.norm2.weight": "model-00003-of-00004.safetensors",
447
+ "model.vision_tower.vision_tower.blocks.16.attn.proj.bias": "model-00003-of-00004.safetensors",
448
+ "model.vision_tower.vision_tower.blocks.16.attn.proj.weight": "model-00003-of-00004.safetensors",
449
+ "model.vision_tower.vision_tower.blocks.16.attn.qkv.bias": "model-00003-of-00004.safetensors",
450
+ "model.vision_tower.vision_tower.blocks.16.attn.qkv.weight": "model-00003-of-00004.safetensors",
451
+ "model.vision_tower.vision_tower.blocks.16.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
452
+ "model.vision_tower.vision_tower.blocks.16.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
453
+ "model.vision_tower.vision_tower.blocks.16.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
454
+ "model.vision_tower.vision_tower.blocks.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
455
+ "model.vision_tower.vision_tower.blocks.16.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
456
+ "model.vision_tower.vision_tower.blocks.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
457
+ "model.vision_tower.vision_tower.blocks.16.norm1.weight": "model-00003-of-00004.safetensors",
458
+ "model.vision_tower.vision_tower.blocks.16.norm2.weight": "model-00003-of-00004.safetensors",
459
+ "model.vision_tower.vision_tower.blocks.17.attn.proj.bias": "model-00004-of-00004.safetensors",
460
+ "model.vision_tower.vision_tower.blocks.17.attn.proj.weight": "model-00004-of-00004.safetensors",
461
+ "model.vision_tower.vision_tower.blocks.17.attn.qkv.bias": "model-00004-of-00004.safetensors",
462
+ "model.vision_tower.vision_tower.blocks.17.attn.qkv.weight": "model-00004-of-00004.safetensors",
463
+ "model.vision_tower.vision_tower.blocks.17.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
464
+ "model.vision_tower.vision_tower.blocks.17.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
465
+ "model.vision_tower.vision_tower.blocks.17.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
466
+ "model.vision_tower.vision_tower.blocks.17.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
467
+ "model.vision_tower.vision_tower.blocks.17.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
468
+ "model.vision_tower.vision_tower.blocks.17.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
469
+ "model.vision_tower.vision_tower.blocks.17.norm1.weight": "model-00004-of-00004.safetensors",
470
+ "model.vision_tower.vision_tower.blocks.17.norm2.weight": "model-00004-of-00004.safetensors",
471
+ "model.vision_tower.vision_tower.blocks.18.attn.proj.bias": "model-00004-of-00004.safetensors",
472
+ "model.vision_tower.vision_tower.blocks.18.attn.proj.weight": "model-00004-of-00004.safetensors",
473
+ "model.vision_tower.vision_tower.blocks.18.attn.qkv.bias": "model-00004-of-00004.safetensors",
474
+ "model.vision_tower.vision_tower.blocks.18.attn.qkv.weight": "model-00004-of-00004.safetensors",
475
+ "model.vision_tower.vision_tower.blocks.18.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
476
+ "model.vision_tower.vision_tower.blocks.18.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
477
+ "model.vision_tower.vision_tower.blocks.18.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
478
+ "model.vision_tower.vision_tower.blocks.18.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
479
+ "model.vision_tower.vision_tower.blocks.18.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
480
+ "model.vision_tower.vision_tower.blocks.18.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
481
+ "model.vision_tower.vision_tower.blocks.18.norm1.weight": "model-00004-of-00004.safetensors",
482
+ "model.vision_tower.vision_tower.blocks.18.norm2.weight": "model-00004-of-00004.safetensors",
483
+ "model.vision_tower.vision_tower.blocks.19.attn.proj.bias": "model-00004-of-00004.safetensors",
484
+ "model.vision_tower.vision_tower.blocks.19.attn.proj.weight": "model-00004-of-00004.safetensors",
485
+ "model.vision_tower.vision_tower.blocks.19.attn.qkv.bias": "model-00004-of-00004.safetensors",
486
+ "model.vision_tower.vision_tower.blocks.19.attn.qkv.weight": "model-00004-of-00004.safetensors",
487
+ "model.vision_tower.vision_tower.blocks.19.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
488
+ "model.vision_tower.vision_tower.blocks.19.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
489
+ "model.vision_tower.vision_tower.blocks.19.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
490
+ "model.vision_tower.vision_tower.blocks.19.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
491
+ "model.vision_tower.vision_tower.blocks.19.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
492
+ "model.vision_tower.vision_tower.blocks.19.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
493
+ "model.vision_tower.vision_tower.blocks.19.norm1.weight": "model-00004-of-00004.safetensors",
494
+ "model.vision_tower.vision_tower.blocks.19.norm2.weight": "model-00004-of-00004.safetensors",
495
+ "model.vision_tower.vision_tower.blocks.2.attn.proj.bias": "model-00003-of-00004.safetensors",
496
+ "model.vision_tower.vision_tower.blocks.2.attn.proj.weight": "model-00003-of-00004.safetensors",
497
+ "model.vision_tower.vision_tower.blocks.2.attn.qkv.bias": "model-00003-of-00004.safetensors",
498
+ "model.vision_tower.vision_tower.blocks.2.attn.qkv.weight": "model-00003-of-00004.safetensors",
499
+ "model.vision_tower.vision_tower.blocks.2.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
500
+ "model.vision_tower.vision_tower.blocks.2.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
501
+ "model.vision_tower.vision_tower.blocks.2.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
502
+ "model.vision_tower.vision_tower.blocks.2.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
503
+ "model.vision_tower.vision_tower.blocks.2.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
504
+ "model.vision_tower.vision_tower.blocks.2.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
505
+ "model.vision_tower.vision_tower.blocks.2.norm1.weight": "model-00003-of-00004.safetensors",
506
+ "model.vision_tower.vision_tower.blocks.2.norm2.weight": "model-00003-of-00004.safetensors",
507
+ "model.vision_tower.vision_tower.blocks.20.attn.proj.bias": "model-00004-of-00004.safetensors",
508
+ "model.vision_tower.vision_tower.blocks.20.attn.proj.weight": "model-00004-of-00004.safetensors",
509
+ "model.vision_tower.vision_tower.blocks.20.attn.qkv.bias": "model-00004-of-00004.safetensors",
510
+ "model.vision_tower.vision_tower.blocks.20.attn.qkv.weight": "model-00004-of-00004.safetensors",
511
+ "model.vision_tower.vision_tower.blocks.20.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
512
+ "model.vision_tower.vision_tower.blocks.20.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
513
+ "model.vision_tower.vision_tower.blocks.20.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
514
+ "model.vision_tower.vision_tower.blocks.20.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
515
+ "model.vision_tower.vision_tower.blocks.20.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
516
+ "model.vision_tower.vision_tower.blocks.20.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
517
+ "model.vision_tower.vision_tower.blocks.20.norm1.weight": "model-00004-of-00004.safetensors",
518
+ "model.vision_tower.vision_tower.blocks.20.norm2.weight": "model-00004-of-00004.safetensors",
519
+ "model.vision_tower.vision_tower.blocks.21.attn.proj.bias": "model-00004-of-00004.safetensors",
520
+ "model.vision_tower.vision_tower.blocks.21.attn.proj.weight": "model-00004-of-00004.safetensors",
521
+ "model.vision_tower.vision_tower.blocks.21.attn.qkv.bias": "model-00004-of-00004.safetensors",
522
+ "model.vision_tower.vision_tower.blocks.21.attn.qkv.weight": "model-00004-of-00004.safetensors",
523
+ "model.vision_tower.vision_tower.blocks.21.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
524
+ "model.vision_tower.vision_tower.blocks.21.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
525
+ "model.vision_tower.vision_tower.blocks.21.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
526
+ "model.vision_tower.vision_tower.blocks.21.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
527
+ "model.vision_tower.vision_tower.blocks.21.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
528
+ "model.vision_tower.vision_tower.blocks.21.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
529
+ "model.vision_tower.vision_tower.blocks.21.norm1.weight": "model-00004-of-00004.safetensors",
530
+ "model.vision_tower.vision_tower.blocks.21.norm2.weight": "model-00004-of-00004.safetensors",
531
+ "model.vision_tower.vision_tower.blocks.22.attn.proj.bias": "model-00004-of-00004.safetensors",
532
+ "model.vision_tower.vision_tower.blocks.22.attn.proj.weight": "model-00004-of-00004.safetensors",
533
+ "model.vision_tower.vision_tower.blocks.22.attn.qkv.bias": "model-00004-of-00004.safetensors",
534
+ "model.vision_tower.vision_tower.blocks.22.attn.qkv.weight": "model-00004-of-00004.safetensors",
535
+ "model.vision_tower.vision_tower.blocks.22.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
536
+ "model.vision_tower.vision_tower.blocks.22.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
537
+ "model.vision_tower.vision_tower.blocks.22.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
538
+ "model.vision_tower.vision_tower.blocks.22.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
539
+ "model.vision_tower.vision_tower.blocks.22.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
540
+ "model.vision_tower.vision_tower.blocks.22.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
541
+ "model.vision_tower.vision_tower.blocks.22.norm1.weight": "model-00004-of-00004.safetensors",
542
+ "model.vision_tower.vision_tower.blocks.22.norm2.weight": "model-00004-of-00004.safetensors",
543
+ "model.vision_tower.vision_tower.blocks.23.attn.proj.bias": "model-00004-of-00004.safetensors",
544
+ "model.vision_tower.vision_tower.blocks.23.attn.proj.weight": "model-00004-of-00004.safetensors",
545
+ "model.vision_tower.vision_tower.blocks.23.attn.qkv.bias": "model-00004-of-00004.safetensors",
546
+ "model.vision_tower.vision_tower.blocks.23.attn.qkv.weight": "model-00004-of-00004.safetensors",
547
+ "model.vision_tower.vision_tower.blocks.23.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
548
+ "model.vision_tower.vision_tower.blocks.23.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
549
+ "model.vision_tower.vision_tower.blocks.23.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
550
+ "model.vision_tower.vision_tower.blocks.23.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
551
+ "model.vision_tower.vision_tower.blocks.23.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
552
+ "model.vision_tower.vision_tower.blocks.23.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
553
+ "model.vision_tower.vision_tower.blocks.23.norm1.weight": "model-00004-of-00004.safetensors",
554
+ "model.vision_tower.vision_tower.blocks.23.norm2.weight": "model-00004-of-00004.safetensors",
555
+ "model.vision_tower.vision_tower.blocks.24.attn.proj.bias": "model-00004-of-00004.safetensors",
556
+ "model.vision_tower.vision_tower.blocks.24.attn.proj.weight": "model-00004-of-00004.safetensors",
557
+ "model.vision_tower.vision_tower.blocks.24.attn.qkv.bias": "model-00004-of-00004.safetensors",
558
+ "model.vision_tower.vision_tower.blocks.24.attn.qkv.weight": "model-00004-of-00004.safetensors",
559
+ "model.vision_tower.vision_tower.blocks.24.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
560
+ "model.vision_tower.vision_tower.blocks.24.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
561
+ "model.vision_tower.vision_tower.blocks.24.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
562
+ "model.vision_tower.vision_tower.blocks.24.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
563
+ "model.vision_tower.vision_tower.blocks.24.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
564
+ "model.vision_tower.vision_tower.blocks.24.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
565
+ "model.vision_tower.vision_tower.blocks.24.norm1.weight": "model-00004-of-00004.safetensors",
566
+ "model.vision_tower.vision_tower.blocks.24.norm2.weight": "model-00004-of-00004.safetensors",
567
+ "model.vision_tower.vision_tower.blocks.25.attn.proj.bias": "model-00004-of-00004.safetensors",
568
+ "model.vision_tower.vision_tower.blocks.25.attn.proj.weight": "model-00004-of-00004.safetensors",
569
+ "model.vision_tower.vision_tower.blocks.25.attn.qkv.bias": "model-00004-of-00004.safetensors",
570
+ "model.vision_tower.vision_tower.blocks.25.attn.qkv.weight": "model-00004-of-00004.safetensors",
571
+ "model.vision_tower.vision_tower.blocks.25.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
572
+ "model.vision_tower.vision_tower.blocks.25.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
573
+ "model.vision_tower.vision_tower.blocks.25.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
574
+ "model.vision_tower.vision_tower.blocks.25.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
575
+ "model.vision_tower.vision_tower.blocks.25.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
576
+ "model.vision_tower.vision_tower.blocks.25.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
577
+ "model.vision_tower.vision_tower.blocks.25.norm1.weight": "model-00004-of-00004.safetensors",
578
+ "model.vision_tower.vision_tower.blocks.25.norm2.weight": "model-00004-of-00004.safetensors",
579
+ "model.vision_tower.vision_tower.blocks.26.attn.proj.bias": "model-00004-of-00004.safetensors",
580
+ "model.vision_tower.vision_tower.blocks.26.attn.proj.weight": "model-00004-of-00004.safetensors",
581
+ "model.vision_tower.vision_tower.blocks.26.attn.qkv.bias": "model-00004-of-00004.safetensors",
582
+ "model.vision_tower.vision_tower.blocks.26.attn.qkv.weight": "model-00004-of-00004.safetensors",
583
+ "model.vision_tower.vision_tower.blocks.26.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
584
+ "model.vision_tower.vision_tower.blocks.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
585
+ "model.vision_tower.vision_tower.blocks.26.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
586
+ "model.vision_tower.vision_tower.blocks.26.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
587
+ "model.vision_tower.vision_tower.blocks.26.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
588
+ "model.vision_tower.vision_tower.blocks.26.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
589
+ "model.vision_tower.vision_tower.blocks.26.norm1.weight": "model-00004-of-00004.safetensors",
590
+ "model.vision_tower.vision_tower.blocks.26.norm2.weight": "model-00004-of-00004.safetensors",
591
+ "model.vision_tower.vision_tower.blocks.27.attn.proj.bias": "model-00004-of-00004.safetensors",
592
+ "model.vision_tower.vision_tower.blocks.27.attn.proj.weight": "model-00004-of-00004.safetensors",
593
+ "model.vision_tower.vision_tower.blocks.27.attn.qkv.bias": "model-00004-of-00004.safetensors",
594
+ "model.vision_tower.vision_tower.blocks.27.attn.qkv.weight": "model-00004-of-00004.safetensors",
595
+ "model.vision_tower.vision_tower.blocks.27.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
596
+ "model.vision_tower.vision_tower.blocks.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
597
+ "model.vision_tower.vision_tower.blocks.27.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
598
+ "model.vision_tower.vision_tower.blocks.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
599
+ "model.vision_tower.vision_tower.blocks.27.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
600
+ "model.vision_tower.vision_tower.blocks.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
601
+ "model.vision_tower.vision_tower.blocks.27.norm1.weight": "model-00004-of-00004.safetensors",
602
+ "model.vision_tower.vision_tower.blocks.27.norm2.weight": "model-00004-of-00004.safetensors",
603
+ "model.vision_tower.vision_tower.blocks.28.attn.proj.bias": "model-00004-of-00004.safetensors",
604
+ "model.vision_tower.vision_tower.blocks.28.attn.proj.weight": "model-00004-of-00004.safetensors",
605
+ "model.vision_tower.vision_tower.blocks.28.attn.qkv.bias": "model-00004-of-00004.safetensors",
606
+ "model.vision_tower.vision_tower.blocks.28.attn.qkv.weight": "model-00004-of-00004.safetensors",
607
+ "model.vision_tower.vision_tower.blocks.28.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
608
+ "model.vision_tower.vision_tower.blocks.28.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
609
+ "model.vision_tower.vision_tower.blocks.28.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
610
+ "model.vision_tower.vision_tower.blocks.28.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
611
+ "model.vision_tower.vision_tower.blocks.28.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
612
+ "model.vision_tower.vision_tower.blocks.28.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
613
+ "model.vision_tower.vision_tower.blocks.28.norm1.weight": "model-00004-of-00004.safetensors",
614
+ "model.vision_tower.vision_tower.blocks.28.norm2.weight": "model-00004-of-00004.safetensors",
615
+ "model.vision_tower.vision_tower.blocks.29.attn.proj.bias": "model-00004-of-00004.safetensors",
616
+ "model.vision_tower.vision_tower.blocks.29.attn.proj.weight": "model-00004-of-00004.safetensors",
617
+ "model.vision_tower.vision_tower.blocks.29.attn.qkv.bias": "model-00004-of-00004.safetensors",
618
+ "model.vision_tower.vision_tower.blocks.29.attn.qkv.weight": "model-00004-of-00004.safetensors",
619
+ "model.vision_tower.vision_tower.blocks.29.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
620
+ "model.vision_tower.vision_tower.blocks.29.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
621
+ "model.vision_tower.vision_tower.blocks.29.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
622
+ "model.vision_tower.vision_tower.blocks.29.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
623
+ "model.vision_tower.vision_tower.blocks.29.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
624
+ "model.vision_tower.vision_tower.blocks.29.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
625
+ "model.vision_tower.vision_tower.blocks.29.norm1.weight": "model-00004-of-00004.safetensors",
626
+ "model.vision_tower.vision_tower.blocks.29.norm2.weight": "model-00004-of-00004.safetensors",
627
+ "model.vision_tower.vision_tower.blocks.3.attn.proj.bias": "model-00003-of-00004.safetensors",
628
+ "model.vision_tower.vision_tower.blocks.3.attn.proj.weight": "model-00003-of-00004.safetensors",
629
+ "model.vision_tower.vision_tower.blocks.3.attn.qkv.bias": "model-00003-of-00004.safetensors",
630
+ "model.vision_tower.vision_tower.blocks.3.attn.qkv.weight": "model-00003-of-00004.safetensors",
631
+ "model.vision_tower.vision_tower.blocks.3.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
632
+ "model.vision_tower.vision_tower.blocks.3.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
633
+ "model.vision_tower.vision_tower.blocks.3.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
634
+ "model.vision_tower.vision_tower.blocks.3.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
635
+ "model.vision_tower.vision_tower.blocks.3.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
636
+ "model.vision_tower.vision_tower.blocks.3.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
637
+ "model.vision_tower.vision_tower.blocks.3.norm1.weight": "model-00003-of-00004.safetensors",
638
+ "model.vision_tower.vision_tower.blocks.3.norm2.weight": "model-00003-of-00004.safetensors",
639
+ "model.vision_tower.vision_tower.blocks.30.attn.proj.bias": "model-00004-of-00004.safetensors",
640
+ "model.vision_tower.vision_tower.blocks.30.attn.proj.weight": "model-00004-of-00004.safetensors",
641
+ "model.vision_tower.vision_tower.blocks.30.attn.qkv.bias": "model-00004-of-00004.safetensors",
642
+ "model.vision_tower.vision_tower.blocks.30.attn.qkv.weight": "model-00004-of-00004.safetensors",
643
+ "model.vision_tower.vision_tower.blocks.30.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
644
+ "model.vision_tower.vision_tower.blocks.30.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
645
+ "model.vision_tower.vision_tower.blocks.30.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
646
+ "model.vision_tower.vision_tower.blocks.30.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
647
+ "model.vision_tower.vision_tower.blocks.30.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
648
+ "model.vision_tower.vision_tower.blocks.30.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
649
+ "model.vision_tower.vision_tower.blocks.30.norm1.weight": "model-00004-of-00004.safetensors",
650
+ "model.vision_tower.vision_tower.blocks.30.norm2.weight": "model-00004-of-00004.safetensors",
651
+ "model.vision_tower.vision_tower.blocks.31.attn.proj.bias": "model-00004-of-00004.safetensors",
652
+ "model.vision_tower.vision_tower.blocks.31.attn.proj.weight": "model-00004-of-00004.safetensors",
653
+ "model.vision_tower.vision_tower.blocks.31.attn.qkv.bias": "model-00004-of-00004.safetensors",
654
+ "model.vision_tower.vision_tower.blocks.31.attn.qkv.weight": "model-00004-of-00004.safetensors",
655
+ "model.vision_tower.vision_tower.blocks.31.mlp.down_proj.bias": "model-00004-of-00004.safetensors",
656
+ "model.vision_tower.vision_tower.blocks.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
657
+ "model.vision_tower.vision_tower.blocks.31.mlp.gate_proj.bias": "model-00004-of-00004.safetensors",
658
+ "model.vision_tower.vision_tower.blocks.31.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
659
+ "model.vision_tower.vision_tower.blocks.31.mlp.up_proj.bias": "model-00004-of-00004.safetensors",
660
+ "model.vision_tower.vision_tower.blocks.31.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
661
+ "model.vision_tower.vision_tower.blocks.31.norm1.weight": "model-00004-of-00004.safetensors",
662
+ "model.vision_tower.vision_tower.blocks.31.norm2.weight": "model-00004-of-00004.safetensors",
663
+ "model.vision_tower.vision_tower.blocks.4.attn.proj.bias": "model-00003-of-00004.safetensors",
664
+ "model.vision_tower.vision_tower.blocks.4.attn.proj.weight": "model-00003-of-00004.safetensors",
665
+ "model.vision_tower.vision_tower.blocks.4.attn.qkv.bias": "model-00003-of-00004.safetensors",
666
+ "model.vision_tower.vision_tower.blocks.4.attn.qkv.weight": "model-00003-of-00004.safetensors",
667
+ "model.vision_tower.vision_tower.blocks.4.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
668
+ "model.vision_tower.vision_tower.blocks.4.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
669
+ "model.vision_tower.vision_tower.blocks.4.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
670
+ "model.vision_tower.vision_tower.blocks.4.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
671
+ "model.vision_tower.vision_tower.blocks.4.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
672
+ "model.vision_tower.vision_tower.blocks.4.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
673
+ "model.vision_tower.vision_tower.blocks.4.norm1.weight": "model-00003-of-00004.safetensors",
674
+ "model.vision_tower.vision_tower.blocks.4.norm2.weight": "model-00003-of-00004.safetensors",
675
+ "model.vision_tower.vision_tower.blocks.5.attn.proj.bias": "model-00003-of-00004.safetensors",
676
+ "model.vision_tower.vision_tower.blocks.5.attn.proj.weight": "model-00003-of-00004.safetensors",
677
+ "model.vision_tower.vision_tower.blocks.5.attn.qkv.bias": "model-00003-of-00004.safetensors",
678
+ "model.vision_tower.vision_tower.blocks.5.attn.qkv.weight": "model-00003-of-00004.safetensors",
679
+ "model.vision_tower.vision_tower.blocks.5.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
680
+ "model.vision_tower.vision_tower.blocks.5.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
681
+ "model.vision_tower.vision_tower.blocks.5.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
682
+ "model.vision_tower.vision_tower.blocks.5.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
683
+ "model.vision_tower.vision_tower.blocks.5.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
684
+ "model.vision_tower.vision_tower.blocks.5.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
685
+ "model.vision_tower.vision_tower.blocks.5.norm1.weight": "model-00003-of-00004.safetensors",
686
+ "model.vision_tower.vision_tower.blocks.5.norm2.weight": "model-00003-of-00004.safetensors",
687
+ "model.vision_tower.vision_tower.blocks.6.attn.proj.bias": "model-00003-of-00004.safetensors",
688
+ "model.vision_tower.vision_tower.blocks.6.attn.proj.weight": "model-00003-of-00004.safetensors",
689
+ "model.vision_tower.vision_tower.blocks.6.attn.qkv.bias": "model-00003-of-00004.safetensors",
690
+ "model.vision_tower.vision_tower.blocks.6.attn.qkv.weight": "model-00003-of-00004.safetensors",
691
+ "model.vision_tower.vision_tower.blocks.6.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
692
+ "model.vision_tower.vision_tower.blocks.6.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
693
+ "model.vision_tower.vision_tower.blocks.6.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
694
+ "model.vision_tower.vision_tower.blocks.6.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
695
+ "model.vision_tower.vision_tower.blocks.6.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
696
+ "model.vision_tower.vision_tower.blocks.6.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
697
+ "model.vision_tower.vision_tower.blocks.6.norm1.weight": "model-00003-of-00004.safetensors",
698
+ "model.vision_tower.vision_tower.blocks.6.norm2.weight": "model-00003-of-00004.safetensors",
699
+ "model.vision_tower.vision_tower.blocks.7.attn.proj.bias": "model-00003-of-00004.safetensors",
700
+ "model.vision_tower.vision_tower.blocks.7.attn.proj.weight": "model-00003-of-00004.safetensors",
701
+ "model.vision_tower.vision_tower.blocks.7.attn.qkv.bias": "model-00003-of-00004.safetensors",
702
+ "model.vision_tower.vision_tower.blocks.7.attn.qkv.weight": "model-00003-of-00004.safetensors",
703
+ "model.vision_tower.vision_tower.blocks.7.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
704
+ "model.vision_tower.vision_tower.blocks.7.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
705
+ "model.vision_tower.vision_tower.blocks.7.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
706
+ "model.vision_tower.vision_tower.blocks.7.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
707
+ "model.vision_tower.vision_tower.blocks.7.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
708
+ "model.vision_tower.vision_tower.blocks.7.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
709
+ "model.vision_tower.vision_tower.blocks.7.norm1.weight": "model-00003-of-00004.safetensors",
710
+ "model.vision_tower.vision_tower.blocks.7.norm2.weight": "model-00003-of-00004.safetensors",
711
+ "model.vision_tower.vision_tower.blocks.8.attn.proj.bias": "model-00003-of-00004.safetensors",
712
+ "model.vision_tower.vision_tower.blocks.8.attn.proj.weight": "model-00003-of-00004.safetensors",
713
+ "model.vision_tower.vision_tower.blocks.8.attn.qkv.bias": "model-00003-of-00004.safetensors",
714
+ "model.vision_tower.vision_tower.blocks.8.attn.qkv.weight": "model-00003-of-00004.safetensors",
715
+ "model.vision_tower.vision_tower.blocks.8.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
716
+ "model.vision_tower.vision_tower.blocks.8.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
717
+ "model.vision_tower.vision_tower.blocks.8.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
718
+ "model.vision_tower.vision_tower.blocks.8.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
719
+ "model.vision_tower.vision_tower.blocks.8.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
720
+ "model.vision_tower.vision_tower.blocks.8.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
721
+ "model.vision_tower.vision_tower.blocks.8.norm1.weight": "model-00003-of-00004.safetensors",
722
+ "model.vision_tower.vision_tower.blocks.8.norm2.weight": "model-00003-of-00004.safetensors",
723
+ "model.vision_tower.vision_tower.blocks.9.attn.proj.bias": "model-00003-of-00004.safetensors",
724
+ "model.vision_tower.vision_tower.blocks.9.attn.proj.weight": "model-00003-of-00004.safetensors",
725
+ "model.vision_tower.vision_tower.blocks.9.attn.qkv.bias": "model-00003-of-00004.safetensors",
726
+ "model.vision_tower.vision_tower.blocks.9.attn.qkv.weight": "model-00003-of-00004.safetensors",
727
+ "model.vision_tower.vision_tower.blocks.9.mlp.down_proj.bias": "model-00003-of-00004.safetensors",
728
+ "model.vision_tower.vision_tower.blocks.9.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
729
+ "model.vision_tower.vision_tower.blocks.9.mlp.gate_proj.bias": "model-00003-of-00004.safetensors",
730
+ "model.vision_tower.vision_tower.blocks.9.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
731
+ "model.vision_tower.vision_tower.blocks.9.mlp.up_proj.bias": "model-00003-of-00004.safetensors",
732
+ "model.vision_tower.vision_tower.blocks.9.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
733
+ "model.vision_tower.vision_tower.blocks.9.norm1.weight": "model-00003-of-00004.safetensors",
734
+ "model.vision_tower.vision_tower.blocks.9.norm2.weight": "model-00003-of-00004.safetensors",
735
+ "model.vision_tower.vision_tower.patch_embed.proj.weight": "model-00003-of-00004.safetensors"
736
+ }
737
+ }
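The `weight_map` above is the standard sharded-checkpoint index: it maps every parameter name to the shard file that stores it. As a minimal illustration (not part of this commit), the sketch below resolves the shard for one tensor and loads only that tensor with `safetensors`; the directory name `path/to/model` is a placeholder.

```python
# Illustration only -- not part of the uploaded repository.
# Resolve which shard holds a given parameter via the index, then load just that tensor.
import json
import os

from safetensors import safe_open

model_dir = "path/to/model"  # placeholder: wherever the shards and index were downloaded
with open(os.path.join(model_dir, "model.safetensors.index.json")) as f:
    index = json.load(f)

name = "model.vision_tower.vision_tower.patch_embed.proj.weight"
shard = index["weight_map"][name]  # e.g. "model-00003-of-00004.safetensors"
with safe_open(os.path.join(model_dir, shard), framework="pt") as f:
    tensor = f.get_tensor(name)  # reads only this tensor from the shard
print(tensor.shape)
```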
modeling_diffusionvl_qwen2_5_vl.py ADDED
@@ -0,0 +1,1513 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
5
+ # and the GPT-NeoX and OPT implementations. It has been modified to create DiffusionVL.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+
19
+ """DiffusionVL model implementation."""
20
+
21
+ import math
22
+ from dataclasses import dataclass
23
+ from typing import Callable, Dict, List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.nn as nn
27
+ import torch.nn.functional as F
28
+
29
+ from transformers import PreTrainedModel
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache
32
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput
33
+ from transformers.utils import logging
34
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
35
+ from transformers.modeling_layers import GradientCheckpointingLayer
36
+
37
+ from .configuration_diffusionvl_qwen2_5_vl import DiffusionVL_Qwen2_5_VL_Config, DiffusionVL_Qwen2_5_VL_VisionConfig
38
+
39
+ IMAGE_TOKEN_INDEX = -200
40
+
41
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
42
+ """
43
+ Rotates half the hidden dims of the input for rotary position embedding.
44
+
45
+ Args:
46
+ x: Input tensor of shape (..., head_dim).
47
+
48
+ Returns:
49
+ Rotated tensor of the same shape.
50
+ """
51
+ x1 = x[..., : x.shape[-1] // 2]
52
+ x2 = x[..., x.shape[-1] // 2 :]
53
+ return torch.cat((-x2, x1), dim=-1)
54
+
55
+
56
+ def apply_rotary_pos_emb_vision(
57
+ q: torch.Tensor,
58
+ k: torch.Tensor,
59
+ cos: torch.Tensor,
60
+ sin: torch.Tensor,
61
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
62
+ """
63
+ Apply rotary position embedding for vision encoder.
64
+
65
+ Args:
66
+ q: Query tensor.
67
+ k: Key tensor.
68
+ cos: Cosine part of rotary embedding.
69
+ sin: Sine part of rotary embedding.
70
+
71
+ Returns:
72
+ Tuple of (rotated_q, rotated_k).
73
+ """
74
+ orig_q_dtype = q.dtype
75
+ orig_k_dtype = k.dtype
76
+ q, k = q.float(), k.float()
77
+ cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
78
+ q_embed = (q * cos) + (rotate_half(q) * sin)
79
+ k_embed = (k * cos) + (rotate_half(k) * sin)
80
+ return q_embed.to(orig_q_dtype), k_embed.to(orig_k_dtype)
81
+
82
+
83
+ def apply_multimodal_rotary_pos_emb(
84
+ q: torch.Tensor,
85
+ k: torch.Tensor,
86
+ cos: torch.Tensor,
87
+ sin: torch.Tensor,
88
+ mrope_section: List[int],
89
+ unsqueeze_dim: int = 1,
90
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
91
+ """
92
+ Apply multimodal rotary position embedding (M-RoPE) for 3D position encoding.
93
+
94
+ Args:
95
+ q: Query tensor of shape (batch, heads, seq_len, head_dim).
96
+ k: Key tensor of shape (batch, heads, seq_len, head_dim).
97
+ cos: Cosine tensor of shape (3, batch, seq_len, head_dim).
98
+ sin: Sine tensor of shape (3, batch, seq_len, head_dim).
99
+ mrope_section: List of 3 ints defining section sizes [temporal, height, width].
100
+ For example, [16, 24, 24] for head_dim=128.
101
+ unsqueeze_dim: Dimension to unsqueeze for broadcasting.
102
+
103
+ Returns:
104
+ Tuple of (rotated_q, rotated_k) with M-RoPE applied.
105
+ """
106
+ # mrope_section is like [16, 24, 24] for head_dim=128
107
+ # Multiply by 2 because head_dim is full (not half like in standard RoPE)
108
+ mrope_section = mrope_section * 2  # list repetition: [16, 24, 24] -> [16, 24, 24, 16, 24, 24]
109
+
110
+ # Split cos/sin along head_dim, then select appropriate dimension (0, 1, 2) for each section
111
+ # cos/sin shape: (3, batch, seq_len, head_dim)
112
+ cos = torch.cat(
113
+ [m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1
114
+ ).unsqueeze(unsqueeze_dim)
115
+ sin = torch.cat(
116
+ [m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1
117
+ ).unsqueeze(unsqueeze_dim)
118
+
119
+ q_embed = (q * cos) + (rotate_half(q) * sin)
120
+ k_embed = (k * cos) + (rotate_half(k) * sin)
121
+ return q_embed, k_embed
122
+
123
+
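As a side note (illustration only, not part of the uploaded file), the sketch below walks through the shapes produced by the M-RoPE interleaving in `apply_multimodal_rotary_pos_emb`: the `mrope_section` list is repeated, the cos table is split along `head_dim`, and each chunk is drawn from the temporal, height, or width position stream in turn. All sizes are illustrative.

```python
# Illustration only -- not part of the uploaded file.
import torch

batch, seq_len, head_dim = 2, 5, 128
mrope_section = [16, 24, 24]                     # temporal, height, width sections
cos = torch.randn(3, batch, seq_len, head_dim)   # one table per position axis (t, h, w)

sections = mrope_section * 2                     # list repetition -> [16, 24, 24, 16, 24, 24]
chunks = cos.split(sections, dim=-1)             # 6 chunks along head_dim
# chunk i is drawn from axis i % 3, so the pattern along head_dim is t, h, w, t, h, w
mixed = torch.cat([m[i % 3] for i, m in enumerate(chunks)], dim=-1)
assert mixed.shape == (batch, seq_len, head_dim)
```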
124
+ class DiffusionVL_Qwen2_5_VL_RMSNorm(nn.Module):
125
+ def __init__(self, hidden_size, eps=1e-6):
126
+ super().__init__()
127
+ self.weight = nn.Parameter(torch.ones(hidden_size))
128
+ self.variance_epsilon = eps
129
+
130
+ def forward(self, hidden_states):
131
+ input_dtype = hidden_states.dtype
132
+ hidden_states = hidden_states.to(torch.float32)
133
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
134
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
135
+ return self.weight * hidden_states.to(input_dtype)
136
+
137
+
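For reference (not part of the uploaded file), the class above computes the usual RMSNorm in float32 before casting back to the input dtype:

$$\mathrm{RMSNorm}(x) = w \odot \frac{x}{\sqrt{\frac{1}{d}\sum_{i=1}^{d} x_i^2 + \varepsilon}}$$

where $d$ is `hidden_size`, $\varepsilon$ is `variance_epsilon`, and $w$ is the learned `weight`.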
138
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
139
+ """
140
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
141
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
142
+ """
143
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
144
+ if n_rep == 1:
145
+ return hidden_states
146
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
147
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
148
+
149
+
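A quick sanity check (illustration only, not part of the uploaded file) of the docstring's claim that `repeat_kv` is equivalent to `torch.repeat_interleave` along the head dimension, as used for grouped-query attention; the head counts below are arbitrary example values.

```python
# Illustration only -- not part of the uploaded file.
import torch

batch, kv_heads, seq_len, head_dim = 1, 4, 6, 128
n_rep = 7  # e.g. 28 query heads shared across 4 KV heads

k = torch.randn(batch, kv_heads, seq_len, head_dim)
# same expand + reshape as repeat_kv above
k_rep = k[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, head_dim)
k_rep = k_rep.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
assert torch.equal(k_rep, k.repeat_interleave(n_rep, dim=1))
```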
150
+ def eager_attention_forward(
151
+ module: nn.Module,
152
+ query: torch.Tensor,
153
+ key: torch.Tensor,
154
+ value: torch.Tensor,
155
+ attention_mask: Optional[torch.Tensor],
156
+ scaling: float,
157
+ dropout: float = 0.0,
158
+ **kwargs,
159
+ ):
160
+ """Eager attention implementation."""
161
+ key_states = repeat_kv(key, module.num_key_value_groups)
162
+ value_states = repeat_kv(value, module.num_key_value_groups)
163
+
164
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
165
+ if attention_mask is not None:
166
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
167
+ attn_weights = attn_weights + causal_mask
168
+
169
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
170
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
171
+ attn_output = torch.matmul(attn_weights, value_states)
172
+ attn_output = attn_output.transpose(1, 2).contiguous()
173
+
174
+ return attn_output, attn_weights
175
+
176
+
177
+ class DiffusionVL_Qwen2_5_VL_VisionMLP(nn.Module):
178
+ def __init__(self, config, bias: bool = False):
179
+ super().__init__()
180
+ self.hidden_size = config.hidden_size
181
+ self.intermediate_size = config.intermediate_size
182
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
183
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
184
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
185
+ self.act_fn = ACT2FN[config.hidden_act]
186
+
187
+ def forward(self, hidden_state):
188
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
189
+
190
+
191
+ class DiffusionVL_Qwen2_5_VL_VisionPatchEmbed(nn.Module):
192
+ def __init__(self, patch_size=14, temporal_patch_size=2, in_channels=3, embed_dim=1152):
193
+ super().__init__()
194
+ self.patch_size = patch_size
195
+ self.temporal_patch_size = temporal_patch_size
196
+ self.in_channels = in_channels
197
+ self.embed_dim = embed_dim
198
+ kernel_size = [temporal_patch_size, patch_size, patch_size]
199
+ self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
200
+
201
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
202
+ target_dtype = self.proj.weight.dtype
203
+ hidden_states = hidden_states.view(
204
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
205
+ )
206
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
207
+ return hidden_states
208
+
209
+
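As a shape walk-through (illustration only, not part of the uploaded file): the patch embedding receives flattened patches of size `in_channels * temporal_patch_size * patch_size * patch_size`, reshapes them back into 3D patches, and a `Conv3d` whose kernel equals its stride turns each patch into a single embedding vector. The sizes below are assumed example values.

```python
# Illustration only -- not part of the uploaded file.
import torch
import torch.nn as nn

patch_size, temporal_patch_size, in_channels, embed_dim = 14, 2, 3, 1280
num_patches = 256  # e.g. a 224x224 image -> (224/14)**2 patches

flat = torch.randn(num_patches, in_channels * temporal_patch_size * patch_size * patch_size)
proj = nn.Conv3d(in_channels, embed_dim,
                 kernel_size=(temporal_patch_size, patch_size, patch_size),
                 stride=(temporal_patch_size, patch_size, patch_size), bias=False)

x = flat.view(-1, in_channels, temporal_patch_size, patch_size, patch_size)
out = proj(x).view(-1, embed_dim)  # one embedding per patch
assert out.shape == (num_patches, embed_dim)
```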
210
+ class DiffusionVL_Qwen2_5_VL_VisionRotaryEmbedding(nn.Module):
211
+ inv_freq: torch.Tensor
212
+
213
+ def __init__(self, dim: int, theta: float = 10000.0):
214
+ super().__init__()
215
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
216
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
217
+
218
+ def forward(self, seqlen: int) -> torch.Tensor:
219
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
220
+ freqs = torch.outer(seq, self.inv_freq)
221
+ return freqs
222
+
223
+
224
+ class DiffusionVL_Qwen2_5_VL_VisionPatchMerger(nn.Module):
225
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2):
226
+ super().__init__()
227
+ self.hidden_size = context_dim * (spatial_merge_size ** 2)
228
+ self.ln_q = DiffusionVL_Qwen2_5_VL_RMSNorm(context_dim, eps=1e-6)
229
+ self.mlp = nn.Sequential(
230
+ nn.Linear(self.hidden_size, self.hidden_size),
231
+ nn.GELU(),
232
+ nn.Linear(self.hidden_size, dim),
233
+ )
234
+
235
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
236
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
237
+ return x
238
+
239
+
240
+ class DiffusionVL_Qwen2_5_VL_VisionAttention(nn.Module):
241
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_VisionConfig) -> None:
242
+ super().__init__()
243
+ self.dim = config.hidden_size
244
+ self.num_heads = config.num_heads
245
+ self.head_dim = self.dim // self.num_heads
246
+ self.num_key_value_groups = 1 # needed for eager attention
247
+ self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True)
248
+ self.proj = nn.Linear(self.dim, self.dim)
249
+ self.scaling = self.head_dim**-0.5
250
+ self.config = config
251
+ self.attention_dropout = 0.0
252
+ self.is_causal = False
253
+
254
+ def forward(
255
+ self,
256
+ hidden_states: torch.Tensor,
257
+ cu_seqlens: torch.Tensor,
258
+ rotary_pos_emb: Optional[torch.Tensor] = None,
259
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
260
+ **kwargs,
261
+ ) -> torch.Tensor:
262
+ seq_length = hidden_states.shape[0]
263
+ query_states, key_states, value_states = (
264
+ self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
265
+ )
266
+ cos, sin = position_embeddings
267
+ query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
268
+
269
+ query_states = query_states.transpose(0, 1).unsqueeze(0)
270
+ key_states = key_states.transpose(0, 1).unsqueeze(0)
271
+ value_states = value_states.transpose(0, 1).unsqueeze(0)
272
+
273
+ attention_interface: Callable = eager_attention_forward
274
+ if getattr(self.config, "_attn_implementation", "eager") != "eager":
275
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
276
+
277
+ if getattr(self.config, "_attn_implementation", "eager") == "flash_attention_2":
278
+ # Flash Attention 2: Use cu_seqlens for variable length attention
279
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
280
+ attn_output, _ = attention_interface(
281
+ self,
282
+ query_states,
283
+ key_states,
284
+ value_states,
285
+ attention_mask=None,
286
+ scaling=self.scaling,
287
+ dropout=0.0 if not self.training else self.attention_dropout,
288
+ cu_seq_lens_q=cu_seqlens,
289
+ cu_seq_lens_k=cu_seqlens,
290
+ max_length_q=max_seqlen,
291
+ max_length_k=max_seqlen,
292
+ is_causal=False,
293
+ **kwargs,
294
+ )
295
+ else:
296
+ # Other implementations: Process each chunk separately
297
+ lengths = cu_seqlens[1:] - cu_seqlens[:-1]
298
+ splits = [
299
+ torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
300
+ ]
301
+
302
+ attn_outputs = [
303
+ attention_interface(
304
+ self,
305
+ q,
306
+ k,
307
+ v,
308
+ attention_mask=None,
309
+ scaling=self.scaling,
310
+ dropout=0.0 if not self.training else self.attention_dropout,
311
+ is_causal=False,
312
+ **kwargs,
313
+ )[0]
314
+ for q, k, v in zip(*splits)
315
+ ]
316
+ attn_output = torch.cat(attn_outputs, dim=1)
317
+
318
+ attn_output = attn_output.reshape(seq_length, -1).contiguous()
319
+ attn_output = self.proj(attn_output)
320
+ return attn_output
321
+
322
+
323
+ class DiffusionVL_Qwen2_5_VL_VisionBlock(GradientCheckpointingLayer):
324
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
325
+ super().__init__()
326
+ self.norm1 = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=1e-6)
327
+ self.norm2 = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=1e-6)
328
+ self.attn = DiffusionVL_Qwen2_5_VL_VisionAttention(config=config)
329
+ self.mlp = DiffusionVL_Qwen2_5_VL_VisionMLP(config, bias=True)
330
+
331
+ def forward(
332
+ self,
333
+ hidden_states: torch.Tensor,
334
+ cu_seqlens: torch.Tensor,
335
+ rotary_pos_emb: Optional[torch.Tensor] = None,
336
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
337
+ **kwargs,
338
+ ) -> torch.Tensor:
339
+ hidden_states = hidden_states + self.attn(
340
+ self.norm1(hidden_states),
341
+ cu_seqlens=cu_seqlens,
342
+ rotary_pos_emb=rotary_pos_emb,
343
+ position_embeddings=position_embeddings,
344
+ **kwargs,
345
+ )
346
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
347
+ return hidden_states
348
+
349
+
350
+ class DiffusionVL_Qwen2_5_VL_VisionPreTrainedModel(PreTrainedModel):
351
+ config_class = DiffusionVL_Qwen2_5_VL_VisionConfig
352
+ base_model_prefix = "model"
353
+ supports_gradient_checkpointing = True
354
+ _no_split_modules = ["DiffusionVL_Qwen2_5_VL_VisionBlock"]
355
+ _supports_flash_attn_2 = True
356
+ _supports_sdpa = True
357
+ _supports_attention_backend = True
358
+
359
+
360
+ class DiffusionVL_Qwen2_5_VL_VisionTransformer(DiffusionVL_Qwen2_5_VL_VisionPreTrainedModel):
361
+ config_class = DiffusionVL_Qwen2_5_VL_VisionConfig
362
+ _no_split_modules = ["DiffusionVL_Qwen2_5_VL_VisionBlock"]
363
+
364
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_VisionConfig, *inputs, **kwargs) -> None:
365
+ super().__init__(config, *inputs, **kwargs)
366
+ self.spatial_merge_size = config.spatial_merge_size
367
+ self.patch_size = config.patch_size
368
+ self.fullatt_block_indexes = config.fullatt_block_indexes
369
+ self.window_size = config.window_size
370
+ self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
371
+
372
+ self.patch_embed = DiffusionVL_Qwen2_5_VL_VisionPatchEmbed(
373
+ patch_size=config.patch_size,
374
+ temporal_patch_size=config.temporal_patch_size,
375
+ in_channels=config.in_channels,
376
+ embed_dim=config.hidden_size,
377
+ )
378
+
379
+ head_dim = config.hidden_size // config.num_heads
380
+ self.rotary_pos_emb = DiffusionVL_Qwen2_5_VL_VisionRotaryEmbedding(head_dim // 2)
381
+
382
+ self.blocks = nn.ModuleList([DiffusionVL_Qwen2_5_VL_VisionBlock(config) for _ in range(config.depth)])
383
+ self.gradient_checkpointing = False
384
+
385
+ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
386
+
387
+ pos_ids = []
388
+ for t, h, w in grid_thw:
389
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
390
+ hpos_ids = hpos_ids.reshape(
391
+ h // self.spatial_merge_size,
392
+ self.spatial_merge_size,
393
+ w // self.spatial_merge_size,
394
+ self.spatial_merge_size,
395
+ )
396
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3).flatten()
397
+
398
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
399
+ wpos_ids = wpos_ids.reshape(
400
+ h // self.spatial_merge_size,
401
+ self.spatial_merge_size,
402
+ w // self.spatial_merge_size,
403
+ self.spatial_merge_size,
404
+ )
405
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3).flatten()
406
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
407
+ pos_ids = torch.cat(pos_ids, dim=0)
408
+ max_grid_size = grid_thw[:, 1:].max()
409
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
410
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
411
+ return rotary_pos_emb
412
+
413
+ def get_window_index(self, grid_thw: torch.Tensor):
414
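+ # Groups the merged-patch grid into square attention windows of
+ # (window_size // spatial_merge_size // patch_size)^2 merged patches, returning a permutation
+ # index over merged patches plus cumulative window lengths measured in raw patch tokens
+ # (hence the spatial_merge_unit scaling below).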
+
415
+ window_index: list = []
416
+ cu_window_seqlens: list = [0]
417
+ window_index_id = 0
418
+ vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size
419
+
420
+ for grid_t, grid_h, grid_w in grid_thw:
421
+ llm_grid_h = grid_h // self.spatial_merge_size
422
+ llm_grid_w = grid_w // self.spatial_merge_size
423
+ index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
424
+ pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
425
+ pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
426
+ num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
427
+ num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
428
+ index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
429
+ index_padded = index_padded.reshape(
430
+ grid_t,
431
+ num_windows_h,
432
+ vit_merger_window_size,
433
+ num_windows_w,
434
+ vit_merger_window_size,
435
+ )
436
+ index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
437
+ grid_t,
438
+ num_windows_h * num_windows_w,
439
+ vit_merger_window_size,
440
+ vit_merger_window_size,
441
+ )
442
+ seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
443
+ index_padded = index_padded.reshape(-1)
444
+ index_new = index_padded[index_padded != -100]
445
+ window_index.append(index_new + window_index_id)
446
+ cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
447
+ cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
448
+ window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
449
+ window_index = torch.cat(window_index, dim=0)
450
+ return window_index, cu_window_seqlens
451
+
452
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs):
453
+
454
+ hidden_states = self.patch_embed(hidden_states)
455
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
456
+ window_index, cu_window_seqlens = self.get_window_index(grid_thw)
457
+ cu_window_seqlens = torch.tensor(
458
+ cu_window_seqlens,
459
+ device=hidden_states.device,
460
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
461
+ )
462
+ cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
463
+
464
+ seq_len, _ = hidden_states.size()
465
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
466
+ hidden_states = hidden_states[window_index, :, :]
467
+ hidden_states = hidden_states.reshape(seq_len, -1)
468
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
469
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
470
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
471
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
472
+ position_embeddings = (emb.cos(), emb.sin())
473
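+ # hidden_states and the rotary embeddings are now both permuted into window order, so the
+ # window-attention layers below can consume cu_window_seqlens directly.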
+
474
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
475
+ dim=0,
476
+ # Select dtype based on the following factors:
477
+ # - FA2 requires that cu_seqlens_q must have dtype int32
478
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
479
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
480
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
481
+ )
482
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
483
+
484
+ for layer_num, blk in enumerate(self.blocks):
485
+ if layer_num in self.fullatt_block_indexes:
486
+ cu_seqlens_now = cu_seqlens
487
+ else:
488
+ cu_seqlens_now = cu_window_seqlens
489
+
490
+ hidden_states = blk(
491
+ hidden_states,
492
+ cu_seqlens=cu_seqlens_now,
493
+ position_embeddings=position_embeddings,
494
+ **kwargs,
495
+ )
496
+
497
+ # Return hidden_states AND window_index for MMProjector to apply merger and reverse shuffle
498
+ return hidden_states, window_index
499
+
500
+
501
+ class DiffusionVL_Qwen2_5_VL_VisionTower(nn.Module):
502
+
503
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_VisionConfig):
504
+ super().__init__()
505
+ self.vision_tower = DiffusionVL_Qwen2_5_VL_VisionTransformer(config)
506
+ self.spatial_merge_size = config.spatial_merge_size
507
+
508
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor = None):
509
+ """Returns (hidden_states, window_index) tuple for MMProjector."""
510
+ return self.vision_tower(hidden_states, grid_thw)
511
+
512
+
513
+ class DiffusionVL_Qwen2_5_VL_MMProjector(nn.Module):
514
+
515
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_VisionConfig):
516
+ super().__init__()
517
+ self.merger = DiffusionVL_Qwen2_5_VL_VisionPatchMerger(
518
+ dim=config.out_hidden_size,
519
+ context_dim=config.hidden_size,
520
+ spatial_merge_size=config.spatial_merge_size,
521
+ )
522
+
523
+ def forward(self, features_tuple):
524
+ """Forward pass with merger and window index reversal."""
525
+ if isinstance(features_tuple, tuple):
526
+ hidden_states, window_index = features_tuple
527
+ # Apply merger
528
+ projected_features = self.merger(hidden_states)
529
+ # Reverse the window shuffle to restore original spatial order
530
+ reverse_indices = torch.argsort(window_index)
531
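+ # argsort of a permutation yields its inverse, e.g. window_index = [2, 0, 1] -> reverse_indices = [1, 2, 0]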
+ final_features = projected_features[reverse_indices, :]
532
+ return final_features
533
+ else:
534
+ # Fallback for simple tensor input
535
+ return self.merger(features_tuple)
536
+
537
+ class DiffusionVL_Qwen2_5_VL_RotaryEmbedding(nn.Module):
538
+
539
+ def __init__(self, config):
540
+ super().__init__()
541
+ self.config = config
542
+ dim = config.hidden_size // config.num_attention_heads
543
+ inv_freq = 1.0 / (config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
544
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
545
+
546
+ def forward(self, x, position_ids):
547
+ """
548
+ Args:
549
+ x: Input tensor for dtype reference
550
+ position_ids: Position IDs with shape (3, batch_size, seq_length) for M-RoPE
551
+ or (batch_size, seq_length) for standard RoPE (will be converted to 3D)
552
+
553
+ Returns:
554
+ cos, sin: Tensors of shape (3, batch, seq_len, head_dim) for M-RoPE
555
+ """
556
+ # Always convert 2D position_ids to 3D for M-RoPE
557
+ if position_ids.ndim == 2:
558
+ # (batch, seq) -> (3, batch, seq)
559
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
560
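+ # With identical temporal/height/width components, M-RoPE reduces to standard 1D RoPE for
+ # these positions, which is the intended behaviour for text-only inputs.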
+
561
+ # Now position_ids should be 3D: (3, batch_size, seq_length)
562
+ if position_ids.ndim == 3 and position_ids.shape[0] == 3:
563
+ # M-RoPE: position_ids shape is (3, batch_size, seq_length)
564
+ # Expand inv_freq to (3, batch_size, head_dim//2, 1)
565
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(
566
+ 3, position_ids.shape[1], -1, 1
567
+ )
568
+ # position_ids_expanded shape: (3, batch_size, 1, seq_length)
569
+ position_ids_expanded = position_ids[:, :, None, :].float()
570
+
571
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
572
+ with torch.autocast(device_type=device_type, enabled=False):
573
+ # freqs shape: (3, batch_size, seq_length, head_dim//2)
574
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
575
+ # emb shape: (3, batch_size, seq_length, head_dim)
576
+ emb = torch.cat((freqs, freqs), dim=-1)
577
+ cos = emb.cos()
578
+ sin = emb.sin()
579
+
580
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
581
+ else:
582
+ # Standard 1D RoPE (fallback)
583
+ inv_freq_expanded = self.inv_freq[None, :, None].expand(position_ids.shape[0], -1, 1)
584
+ position_ids_expanded = position_ids[:, None, :].float()
585
+ freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)
586
+ emb = torch.cat((freqs, freqs), dim=-1)
587
+ cos = emb.cos()
588
+ sin = emb.sin()
589
+ return cos.to(x.dtype), sin.to(x.dtype)
590
+
591
+
592
+ class DiffusionVL_Qwen2_5_VL_MLP(nn.Module):
593
+ def __init__(self, config):
594
+ super().__init__()
595
+ self.hidden_size = config.hidden_size
596
+ self.intermediate_size = config.intermediate_size
597
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
598
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
599
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
600
+ self.act_fn = nn.SiLU()
601
+
602
+ def forward(self, x):
603
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
604
+
605
+
606
+ class DiffusionVL_Qwen2_5_VL_Attention(nn.Module):
607
+ """Non-causal attention for diffusion-based generation with KV-cache support."""
608
+
609
+ def __init__(self, config, layer_idx):
610
+ super().__init__()
611
+ self.config = config
612
+ self.layer_idx = layer_idx
613
+ self.hidden_size = config.hidden_size
614
+ self.num_heads = config.num_attention_heads
615
+ self.head_dim = self.hidden_size // self.num_heads
616
+ self.num_key_value_heads = config.num_key_value_heads
617
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
618
+ self.scaling = self.head_dim ** -0.5
619
+
620
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
621
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
622
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
623
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
624
+
625
+ # Non-causal for diffusion
626
+ self.is_causal = False
627
+
628
+ def forward(
629
+ self,
630
+ hidden_states,
631
+ attention_mask=None,
632
+ position_ids=None,
633
+ past_key_values=None,
634
+ output_attentions=False,
635
+ use_cache=False,
636
+ cache_position=None,
637
+ position_embeddings=None,
638
+ store_kv=False,
639
+ **kwargs,
640
+ ):
641
+ bsz, q_len, _ = hidden_states.size()
642
+
643
+ query_states = self.q_proj(hidden_states)
644
+ key_states = self.k_proj(hidden_states)
645
+ value_states = self.v_proj(hidden_states)
646
+
647
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
648
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
649
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
650
+
651
+ if position_embeddings is not None:
652
+ cos, sin = position_embeddings
653
+ query_states, key_states = apply_multimodal_rotary_pos_emb(
654
+ query_states, key_states, cos, sin,
655
+ self.config.rope_scaling.get("mrope_section", [16, 24, 24])
656
+ )
657
+
658
+ # KV cache handling with store_kv support
659
+ if past_key_values is not None and use_cache:
660
+ cache_kwargs = {"cache_position": cache_position}
661
+ if store_kv:
662
+ # Store current KV to cache (for prefill or final step)
663
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
664
+ else:
665
+ # Use cached KV but don't update (for diffusion steps within a block)
666
+ cached_key = past_key_values.key_cache[self.layer_idx] if self.layer_idx < len(past_key_values.key_cache) else None
667
+ cached_value = past_key_values.value_cache[self.layer_idx] if self.layer_idx < len(past_key_values.value_cache) else None
668
+ if cached_key is not None and cached_value is not None:
669
+ key_states = torch.cat([cached_key, key_states], dim=2)
670
+ value_states = torch.cat([cached_value, value_states], dim=2)
671
+
672
+ # Repeat KV for GQA
673
+ key_states = key_states.repeat_interleave(self.num_key_value_groups, dim=1)
674
+ value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
675
+
676
+ # Handle dict-format attention_mask (for BD3LM compatibility)
677
+ if attention_mask is not None:
678
+ if isinstance(attention_mask, dict):
679
+ # Use full_attention mask for all layers (simplified)
680
+ attn_mask = attention_mask.get("full_attention", None)
681
+ else:
682
+ attn_mask = attention_mask
683
+ else:
684
+ attn_mask = None
685
+
686
+ if attn_mask is not None:
687
+ attn_output = F.scaled_dot_product_attention(
688
+ query_states,
689
+ key_states,
690
+ value_states,
691
+ attn_mask=attn_mask,
692
+ dropout_p=0.0,
693
+ is_causal=False,
694
+ scale=self.scaling,
695
+ )
696
+ else:
697
+ attn_output = F.scaled_dot_product_attention(
698
+ query_states,
699
+ key_states,
700
+ value_states,
701
+ dropout_p=0.0,
702
+ is_causal=False,
703
+ scale=self.scaling,
704
+ )
705
+
706
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, -1)
707
+ attn_output = self.o_proj(attn_output)
708
+
709
+ return attn_output, None
710
+
711
+
712
+ class DiffusionVL_Qwen2_5_VL_DecoderLayer(nn.Module):
713
+ def __init__(self, config, layer_idx):
714
+ super().__init__()
715
+ self.hidden_size = config.hidden_size
716
+ self.self_attn = DiffusionVL_Qwen2_5_VL_Attention(config, layer_idx)
717
+ self.mlp = DiffusionVL_Qwen2_5_VL_MLP(config)
718
+ self.input_layernorm = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
719
+ self.post_attention_layernorm = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
720
+
721
+ def forward(
722
+ self,
723
+ hidden_states,
724
+ attention_mask=None,
725
+ position_ids=None,
726
+ past_key_values=None,
727
+ output_attentions=False,
728
+ use_cache=False,
729
+ cache_position=None,
730
+ position_embeddings=None,
731
+ store_kv=False,
732
+ **kwargs,
733
+ ):
734
+ residual = hidden_states
735
+ hidden_states = self.input_layernorm(hidden_states)
736
+
737
+ hidden_states, attn_weights = self.self_attn(
738
+ hidden_states=hidden_states,
739
+ attention_mask=attention_mask,
740
+ position_ids=position_ids,
741
+ past_key_values=past_key_values,
742
+ output_attentions=output_attentions,
743
+ use_cache=use_cache,
744
+ cache_position=cache_position,
745
+ position_embeddings=position_embeddings,
746
+ store_kv=store_kv,
747
+ **kwargs,
748
+ )
749
+ hidden_states = residual + hidden_states
750
+
751
+ residual = hidden_states
752
+ hidden_states = self.post_attention_layernorm(hidden_states)
753
+ hidden_states = self.mlp(hidden_states)
754
+ hidden_states = residual + hidden_states
755
+
756
+ return hidden_states, attn_weights
757
+
758
+ class DiffusionVL_Qwen2_5_VL_PreTrainedModel(PreTrainedModel):
759
+
760
+ config_class = DiffusionVL_Qwen2_5_VL_Config
761
+ base_model_prefix = "model"
762
+ supports_gradient_checkpointing = True
763
+ _no_split_modules = ["DiffusionVL_Qwen2_5_VL_DecoderLayer", "DiffusionVL_Qwen2_5_VL_VisionBlock"]
764
+
765
+ def _init_weights(self, module: nn.Module) -> None:
766
+ """Initialize the weights."""
767
+ std = self.config.initializer_range
768
+ if isinstance(module, nn.Linear):
769
+ module.weight.data.normal_(mean=0.0, std=std)
770
+ if module.bias is not None:
771
+ module.bias.data.zero_()
772
+ elif isinstance(module, nn.Embedding):
773
+ module.weight.data.normal_(mean=0.0, std=std)
774
+
775
+
776
+ class DiffusionVL_Qwen2_5_VL_Model(DiffusionVL_Qwen2_5_VL_PreTrainedModel):
777
+
778
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_Config):
779
+ super().__init__(config)
780
+ self.config = config
781
+
782
+ # Vision components (matching weight keys)
783
+ self.vision_tower = DiffusionVL_Qwen2_5_VL_VisionTower(config.vision_config)
784
+ self.mm_projector = DiffusionVL_Qwen2_5_VL_MMProjector(config.vision_config)
785
+
786
+ # Text components
787
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
788
+ self.layers = nn.ModuleList([
789
+ DiffusionVL_Qwen2_5_VL_DecoderLayer(config, layer_idx)
790
+ for layer_idx in range(config.num_hidden_layers)
791
+ ])
792
+ self.norm = DiffusionVL_Qwen2_5_VL_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
793
+ self.rotary_emb = DiffusionVL_Qwen2_5_VL_RotaryEmbedding(config)
794
+
795
+ # BD3LM block size
796
+ self.bd3lm_block_size = config.bd3lm_block_size
797
+
798
+ self.post_init()
799
+
800
+ def get_input_embeddings(self):
801
+ return self.embed_tokens
802
+
803
+ def set_input_embeddings(self, value):
804
+ self.embed_tokens = value
805
+
806
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
807
+ """
808
+ Encodes images into continuous embeddings through vision tower and mm_projector.
809
+
810
+ Args:
811
+ pixel_values: Image tensor
812
+ image_grid_thw: Grid dimensions (temporal, height, width) for each image
813
+
814
+ Returns:
815
+ Image embeddings ready to be merged with text embeddings
816
+ """
817
+ pixel_values = pixel_values.to(dtype=self.vision_tower.vision_tower.patch_embed.proj.weight.dtype)
818
+ hidden_states = self.vision_tower(pixel_values, image_grid_thw)
819
+ image_embeds = self.mm_projector(hidden_states)
820
+ return image_embeds
821
+
822
+ def forward(
823
+ self,
824
+ input_ids=None,
825
+ attention_mask=None,
826
+ position_ids=None,
827
+ past_key_values=None,
828
+ inputs_embeds=None,
829
+ use_cache=None,
830
+ output_attentions=None,
831
+ output_hidden_states=None,
832
+ return_dict=None,
833
+ cache_position=None,
834
+ store_kv=False,
835
+ pixel_values=None,
836
+ image_grid_thw=None,
837
+ **kwargs,
838
+ ):
839
+ """Forward pass with optional vision input processing."""
840
+ output_attentions = output_attentions or False
841
+ output_hidden_states = output_hidden_states or False
842
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
843
+ return_dict = return_dict if return_dict is not None else True
844
+
845
+ IMAGE_TOKEN_INDEX = -200
846
+
847
+ if inputs_embeds is None:
848
+ inputs_embeds = self.embed_tokens(input_ids)
849
+
850
+ if pixel_values is not None and image_grid_thw is not None:
851
+ # Get image features
852
+ image_features = self.get_image_features(pixel_values, image_grid_thw)
853
+
854
+ # Split features per image
855
+ spatial_merge_size = self.vision_tower.spatial_merge_size
856
+ split_sizes = (image_grid_thw.prod(dim=1) // (spatial_merge_size ** 2)).tolist()
857
+ image_features_list = list(torch.split(image_features, split_sizes))
858
+
859
+ # Replace IMAGE_TOKEN positions with image features
860
+ batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
861
+ new_inputs_embeds_list = []
862
+
863
+ for batch_idx in range(batch_size):
864
+ cur_input_ids = input_ids[batch_idx] if input_ids is not None else None
865
+ cur_embeds = inputs_embeds[batch_idx]
866
+
867
+ if cur_input_ids is None or (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
868
+ new_inputs_embeds_list.append(cur_embeds)
869
+ continue
870
+
871
+ # Find IMAGE_TOKEN positions
872
+ image_positions = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
873
+ image_token_indices = [-1] + image_positions + [len(cur_input_ids)]
874
+
875
+ # Split embeddings and interleave with image features
876
+ cur_new_embeds = []
877
+ cur_image_idx = 0
878
+
879
+ for i in range(len(image_token_indices) - 1):
880
+ start = image_token_indices[i] + 1
881
+ end = image_token_indices[i + 1]
882
+
883
+ # Add text segment
884
+ if start < end:
885
+ cur_new_embeds.append(cur_embeds[start:end])
886
+
887
+ # Add image features (before the next segment, except after last)
888
+ if i < len(image_positions) and cur_image_idx < len(image_features_list):
889
+ cur_new_embeds.append(image_features_list[cur_image_idx].to(cur_embeds.dtype))
890
+ cur_image_idx += 1
891
+
892
+ if cur_new_embeds:
893
+ new_inputs_embeds_list.append(torch.cat(cur_new_embeds, dim=0))
894
+ else:
895
+ new_inputs_embeds_list.append(cur_embeds)
896
+
897
+ # Pad and stack
898
+ max_len = max(x.shape[0] for x in new_inputs_embeds_list)
899
+ hidden_size = new_inputs_embeds_list[0].shape[-1]
900
+ inputs_embeds = torch.zeros(
901
+ batch_size, max_len, hidden_size,
902
+ dtype=new_inputs_embeds_list[0].dtype,
903
+ device=new_inputs_embeds_list[0].device
904
+ )
905
+ for i, embed in enumerate(new_inputs_embeds_list):
906
+ inputs_embeds[i, :embed.shape[0]] = embed
907
+
908
+ batch_size, seq_length = inputs_embeds.shape[:2]
909
+
910
+ if cache_position is None:
911
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
912
+ cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)
913
+
914
+ if position_ids is None:
915
+ # position_ids will be converted to 3D for M-RoPE in rotary_emb
916
+ position_ids = cache_position.unsqueeze(0)
917
+
918
+ # Position embeddings
919
+ position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
920
+
921
+ hidden_states = inputs_embeds
922
+ all_hidden_states = () if output_hidden_states else None
923
+ all_attentions = () if output_attentions else None
924
+
925
+ for layer in self.layers:
926
+ if output_hidden_states:
927
+ all_hidden_states += (hidden_states,)
928
+
929
+ hidden_states, attn_weights = layer(
930
+ hidden_states,
931
+ attention_mask=attention_mask,
932
+ position_ids=position_ids,
933
+ past_key_values=past_key_values,
934
+ output_attentions=output_attentions,
935
+ use_cache=use_cache,
936
+ cache_position=cache_position,
937
+ position_embeddings=position_embeddings,
938
+ store_kv=store_kv,
939
+ )
940
+
941
+ if output_attentions:
942
+ all_attentions += (attn_weights,)
943
+
944
+ hidden_states = self.norm(hidden_states)
945
+
946
+ if output_hidden_states:
947
+ all_hidden_states += (hidden_states,)
948
+
949
+ return BaseModelOutputWithPast(
950
+ last_hidden_state=hidden_states,
951
+ past_key_values=past_key_values,
952
+ hidden_states=all_hidden_states,
953
+ attentions=all_attentions,
954
+ )
955
+
956
+
957
+ class DiffusionVL_Qwen2_5_VL_ForConditionalGeneration(DiffusionVL_Qwen2_5_VL_PreTrainedModel):
958
+ r"""
959
+ DiffusionVL Model with a language modeling head for diffusion-based generation.
960
+
961
+ This model uses block diffusion instead of autoregressive
962
+ generation. The `generate()` method implements the diffusion denoising process.
963
+
964
+ """
965
+
966
+ # Weight tying keys - used when tie_word_embeddings=True
967
+ _tied_weights_keys = ["lm_head.weight"]
968
+
969
+ def __init__(self, config: DiffusionVL_Qwen2_5_VL_Config):
970
+ super().__init__(config)
971
+ self.model = DiffusionVL_Qwen2_5_VL_Model(config)
972
+ self.vocab_size = config.vocab_size
973
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
974
+
975
+ # Diffusion parameters
976
+ self.mask_token_id = config.mask_token_id
977
+ self.block_size = config.bd3lm_block_size
978
+
979
+ self.post_init()
980
+
981
+ def get_model(self):
982
+ return self.model
983
+
984
+ def get_input_embeddings(self):
985
+ return self.model.embed_tokens
986
+
987
+ def set_input_embeddings(self, value):
988
+ self.model.embed_tokens = value
989
+
990
+ def tie_weights(self):
991
+ """Tie weights if config.tie_word_embeddings is True (3B model)."""
992
+ if getattr(self.config, "tie_word_embeddings", False):
993
+ # Call parent's tie_weights to tie lm_head with embed_tokens
994
+ super().tie_weights()
995
+ # else: do nothing, keep separate lm_head weights (7B model)
996
+
997
+ def get_output_embeddings(self):
998
+ return self.lm_head
999
+
1000
+ def set_output_embeddings(self, new_embeddings):
1001
+ self.lm_head = new_embeddings
1002
+
1003
+ def forward(
1004
+ self,
1005
+ input_ids=None,
1006
+ attention_mask=None,
1007
+ position_ids=None,
1008
+ past_key_values=None,
1009
+ inputs_embeds=None,
1010
+ labels=None,
1011
+ use_cache=None,
1012
+ output_attentions=None,
1013
+ output_hidden_states=None,
1014
+ return_dict=None,
1015
+ pixel_values=None,
1016
+ image_grid_thw=None,
1017
+ **kwargs,
1018
+ ):
1019
+ return_dict = return_dict if return_dict is not None else True
1020
+
1021
+ # Handle vision inputs if provided
1022
+ if pixel_values is not None and inputs_embeds is None:
1023
+ # Get vision features and merge with text
1024
+ vision_features = self.model.vision_tower(pixel_values, image_grid_thw)
1025
+ inputs_embeds = self._merge_vision_text(input_ids, vision_features)
1026
+ input_ids = None
1027
+
1028
+ outputs = self.model(
1029
+ input_ids=input_ids,
1030
+ attention_mask=attention_mask,
1031
+ position_ids=position_ids,
1032
+ past_key_values=past_key_values,
1033
+ inputs_embeds=inputs_embeds,
1034
+ use_cache=use_cache,
1035
+ output_attentions=output_attentions,
1036
+ output_hidden_states=output_hidden_states,
1037
+ return_dict=True,
1038
+ )
1039
+
1040
+ hidden_states = outputs.last_hidden_state
1041
+ logits = self.lm_head(hidden_states)
1042
+
1043
+ loss = None
1044
+ if labels is not None:
1045
+ shift_logits = logits[..., :-1, :].contiguous()
1046
+ shift_labels = labels[..., 1:].contiguous()
1047
+ loss = F.cross_entropy(
1048
+ shift_logits.view(-1, self.vocab_size),
1049
+ shift_labels.view(-1),
1050
+ ignore_index=-100,
1051
+ )
1052
+
1053
+ return CausalLMOutputWithPast(
1054
+ loss=loss,
1055
+ logits=logits,
1056
+ past_key_values=outputs.past_key_values,
1057
+ hidden_states=outputs.hidden_states,
1058
+ attentions=outputs.attentions,
1059
+ )
1060
+
1061
+ def _merge_vision_text(self, input_ids, vision_features):
1062
+ """Merge vision features with text embeddings."""
1063
+ text_embeds = self.model.embed_tokens(input_ids)
1064
+ # Simple placeholder - full implementation would properly insert vision tokens
1065
+ return text_embeds
1066
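+ # Note: generation does not rely on this stub; generate() uses
+ # prepare_inputs_labels_for_multimodal() below to perform the actual merge.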
+
1067
+ @torch.no_grad()
1068
+ def generate(
1069
+ self,
1070
+ inputs: Optional[torch.Tensor] = None,
1071
+ images: Optional[torch.Tensor] = None,
1072
+ image_sizes: Optional[torch.Tensor] = None,
1073
+ image_grid_thws: Optional[torch.Tensor] = None,
1074
+ modalities: Optional[List] = None,
1075
+ gen_length: int = 256,
1076
+ steps: int = 8,
1077
+ temperature: float = 0.0,
1078
+ **kwargs,
1079
+ ):
1080
+ """
1081
+ Diffusion-based generation using BD3LM algorithm.
1082
+
1083
+ Follows the same logic as DiffusionVLQwenVLForCausalLM.generate():
1084
+ 1. If images provided, call prepare_inputs_labels_for_multimodal
1085
+ 2. Otherwise, just embed the input tokens
1086
+ 3. Call generate_with_bd3lm
1087
+
1088
+ Args:
1089
+ inputs: Input token IDs (prompt) [batch_size, seq_len]
1090
+ images: Image tensor (pixel_values) for vision inputs
1091
+ image_sizes: Image sizes
1092
+ image_grid_thws: Grid dimensions for vision inputs (num_images, 3)
1093
+ modalities: List of modalities (e.g., ["image"])
1094
+ gen_length: Number of tokens to generate
1095
+ steps: Number of diffusion steps per block
1096
+ temperature: Sampling temperature (0 for greedy)
1097
+
1098
+ Returns:
1099
+ Generated token IDs
1100
+ """
1101
+ if modalities is None:
1102
+ modalities = ["image"]
1103
+
1104
+ if images is not None:
1105
+ inputs_embeds = self.prepare_inputs_labels_for_multimodal(
1106
+ input_ids=inputs,
1107
+ images=images,
1108
+ image_grid_thws=image_grid_thws,
1109
+ )
1110
+ else:
1111
+ inputs_embeds = self.get_input_embeddings()(inputs)
1112
+
1113
+ # Call the BD3LM generation
1114
+ return self.generate_with_bd3lm(
1115
+ inputs_embeds=inputs_embeds,
1116
+ gen_length=gen_length,
1117
+ steps=steps,
1118
+ temperature=temperature,
1119
+ **kwargs,
1120
+ )
1121
+
1122
+ def prepare_inputs_labels_for_multimodal(
1123
+ self,
1124
+ input_ids: torch.Tensor,
1125
+ images: torch.Tensor,
1126
+ image_grid_thws: Optional[torch.Tensor] = None,
1127
+ ) -> torch.Tensor:
1128
+ """
1129
+ Prepare inputs_embeds by merging text embeddings with image features.
1130
+
1131
+ Uses LLaVA format: IMAGE_TOKEN_INDEX (-200) as placeholder.
1132
+
1133
+ Args:
1134
+ input_ids: Input token IDs with IMAGE_TOKEN_INDEX (-200) as image placeholders
1135
+ images: Pixel values tensor
1136
+ image_grid_thws: Grid dimensions for each image
1137
+
1138
+ Returns:
1139
+ inputs_embeds: Merged text + image embeddings
1140
+ """
1141
+ IMAGE_TOKEN_INDEX = -200
1142
+
1143
+ device = input_ids.device
1144
+ batch_size = input_ids.shape[0]
1145
+
1146
+ # Convert image_grid_thws to tensor if needed
1147
+ if image_grid_thws is not None:
1148
+ if not isinstance(image_grid_thws, torch.Tensor):
1149
+ image_grid_thw = torch.tensor(image_grid_thws, device=device)
1150
+ else:
1151
+ image_grid_thw = image_grid_thws.to(device)
1152
+ else:
1153
+ raise ValueError("image_grid_thws is required for vision processing")
1154
+
1155
+ # 1. Get image features through vision tower + mm_projector
1156
+ image_features = self.model.get_image_features(images, image_grid_thw)
1157
+
1158
+ # 2. Split features per image based on grid_thw
1159
+ spatial_merge_size = self.model.vision_tower.spatial_merge_size
1160
+ split_sizes = (image_grid_thw.prod(dim=1) // (spatial_merge_size ** 2)).tolist()
1161
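+ # e.g. image_grid_thw = [[1, 28, 28]] with spatial_merge_size=2 -> 1*28*28 // 4 = 196 merged tokens for that image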
+ image_features_list = list(torch.split(image_features, split_sizes))
1162
+
1163
+ # 3. Build new input embeddings (LLaVA format)
1164
+ new_input_embeds_list = []
1165
+
1166
+ for batch_idx in range(batch_size):
1167
+ cur_input_ids = input_ids[batch_idx]
1168
+ num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum().item()
1169
+
1170
+ if num_images == 0:
1171
+ # No image tokens, just embed text
1172
+ cur_input_embeds = self.get_input_embeddings()(cur_input_ids)
1173
+ new_input_embeds_list.append(cur_input_embeds)
1174
+ continue
1175
+
1176
+ # LLaVA format: IMAGE_TOKEN_INDEX (-200) as placeholder
1177
+ image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [len(cur_input_ids)]
1178
+
1179
+ cur_input_ids_noim = []
1180
+ for idx in range(len(image_token_indices) - 1):
1181
+ start = image_token_indices[idx] + 1
1182
+ end = image_token_indices[idx + 1]
1183
+ if start < end:
1184
+ cur_input_ids_noim.append(cur_input_ids[start:end])
1185
+
1186
+ if cur_input_ids_noim:
1187
+ cur_input_embeds_noim = self.get_input_embeddings()(torch.cat(cur_input_ids_noim))
1188
+ split_sizes_text = [x.shape[0] for x in cur_input_ids_noim]
1189
+ cur_input_embeds_noim_split = list(torch.split(cur_input_embeds_noim, split_sizes_text))
1190
+ else:
1191
+ cur_input_embeds_noim_split = []
1192
+
1193
+ cur_new_input_embeds = []
1194
+ cur_image_idx = 0
1195
+
1196
+ for idx in range(num_images + 1):
1197
+ if idx < len(cur_input_embeds_noim_split):
1198
+ cur_new_input_embeds.append(cur_input_embeds_noim_split[idx])
1199
+ if idx < num_images and cur_image_idx < len(image_features_list):
1200
+ cur_image_features = image_features_list[cur_image_idx]
1201
+ target_dtype = cur_input_embeds_noim_split[0].dtype if cur_input_embeds_noim_split else images.dtype
1202
+ cur_new_input_embeds.append(cur_image_features.to(target_dtype))
1203
+ cur_image_idx += 1
1204
+
1205
+ if cur_new_input_embeds:
1206
+ # Ensure all tensors are on the same device before cat (multi-GPU support)
1207
+ target_device = cur_new_input_embeds[0].device
1208
+ cur_new_input_embeds = [t.to(target_device) for t in cur_new_input_embeds]
1209
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
1210
+ else:
1211
+ cur_new_input_embeds = self.get_input_embeddings()(cur_input_ids)
1212
+
1213
+ new_input_embeds_list.append(cur_new_input_embeds)
1214
+
1215
+ # 4. Pad to same length and stack
1216
+ max_len = max(x.shape[0] for x in new_input_embeds_list)
1217
+ hidden_size = new_input_embeds_list[0].shape[-1]
1218
+ dtype = new_input_embeds_list[0].dtype
1219
+
1220
+ inputs_embeds = torch.zeros(batch_size, max_len, hidden_size, dtype=dtype, device=device)
1221
+ for i, embed in enumerate(new_input_embeds_list):
1222
+ inputs_embeds[i, :embed.shape[0]] = embed.to(device)
1223
+
1224
+ return inputs_embeds
1225
+
1226
+ @torch.no_grad()
1227
+ def generate_with_bd3lm(
1228
+ self,
1229
+ inputs_embeds: torch.FloatTensor,
1230
+ gen_length: int = 256,
1231
+ steps: int = 8,
1232
+ temperature: float = 0.0,
1233
+ top_k: int = 0,
1234
+ top_p: float = 1.0,
1235
+ remasking_strategy: str = 'low_confidence_static',
1236
+ use_kv_cache: bool = True,
1237
+ confidence_threshold: float = 0.85,
1238
+ **kwargs,
1239
+ ):
1240
+ """
1241
+ BD3LM generation algorithm with KV-cache support.
1242
+
1243
+ Args:
1244
+ inputs_embeds: Input embeddings (prompt)
1245
+ gen_length: Number of tokens to generate
1246
+ steps: Number of diffusion steps per block
1247
+ temperature: Sampling temperature (0 for greedy)
1248
+ top_k: Top-k sampling parameter
1249
+ top_p: Top-p (nucleus) sampling parameter
1250
+ remasking_strategy: 'low_confidence_static', 'low_confidence_dynamic', or 'sequential'
1251
+ use_kv_cache: Whether to use KV cache (default True)
1252
+ confidence_threshold: Threshold for low_confidence_dynamic strategy
1253
+
1254
+ Returns:
1255
+ Generated token IDs
1256
+ """
1257
+ device = inputs_embeds.device
1258
+ batch_size = inputs_embeds.shape[0]
1259
+ prompt_len = inputs_embeds.shape[1]
1260
+ block_size = self.block_size
1261
+ mask_id = self.mask_token_id
1262
+
1263
+ # Compute total length aligned to block size
1264
+ num_blocks = (prompt_len + gen_length + block_size - 1) // block_size
1265
+ total_length = num_blocks * block_size
1266
+
1267
+ # Initialize with mask tokens
1268
+ x_ids = torch.full((batch_size, total_length), mask_id, dtype=torch.long, device=device)
1269
+ # Get mask embedding and ensure it's on the same device as inputs_embeds
1270
+ embed_layer = self.get_input_embeddings()
1271
+ mask_embed = embed_layer(torch.tensor([mask_id], device=embed_layer.weight.device))
1272
+ mask_embed = mask_embed.to(device) # Move to same device as inputs_embeds
1273
+ x_embeds = mask_embed.repeat(batch_size, total_length, 1)
1274
+ x_embeds[:, :prompt_len] = inputs_embeds.clone()
1275
+
1276
+ # Reconstruct prompt IDs from embeddings
1277
+ prompt_logits = self.lm_head(inputs_embeds)
1278
+ prompt_ids = torch.argmax(prompt_logits, dim=-1)
1279
+ x_ids[:, :prompt_len] = prompt_ids
1280
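+ # These reconstructed prompt ids are only bookkeeping: conditioning uses x_embeds, and the
+ # returned slice starts at prompt_len, so approximation errors here do not affect the output.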
+
1281
+ # Block causal mask
1282
+ block_mask = torch.tril(torch.ones(num_blocks, num_blocks, device=device)).to(inputs_embeds.dtype)
1283
+ block_diffusion_mask_bool = block_mask.repeat_interleave(block_size, dim=0) \
1284
+ .repeat_interleave(block_size, dim=1).unsqueeze(0)
1285
+ block_diffusion_mask = block_diffusion_mask_bool.unsqueeze(1)
1286
+ block_diffusion_mask = torch.where(block_diffusion_mask == 0., torch.full_like(block_diffusion_mask, float('-inf')), 0.)
1287
+
1288
+ position_ids = torch.arange(total_length, device=device).unsqueeze(0).expand(batch_size, -1)
1289
+
1290
+ # KV-cache prefill
1291
+ prefill_blocks = prompt_len // block_size
1292
+ prefill_length = prefill_blocks * block_size
1293
+
1294
+ past_key_values = DynamicCache() if use_kv_cache else None
1295
+ if use_kv_cache and prefill_length > 0:
1296
+ prefill_embeds = x_embeds[:, :prefill_length]
1297
+ prefill_mask = block_diffusion_mask[:, :, :prefill_length, :prefill_length]
1298
+ prefill_pos_ids = position_ids[:, :prefill_length]
1299
+
1300
+ # Dict-format mask for BD3LM compatibility
1301
+ model_mask = {"full_attention": prefill_mask, "sliding_attention": prefill_mask}
1302
+
1303
+ prefill_outputs = self.model(
1304
+ inputs_embeds=prefill_embeds,
1305
+ attention_mask=model_mask,
1306
+ position_ids=prefill_pos_ids,
1307
+ past_key_values=past_key_values,
1308
+ use_cache=True,
1309
+ store_kv=True
1310
+ )
1311
+ prefill_logits = self.lm_head(prefill_outputs.last_hidden_state).float()
1312
+ self.last_prefill_logits = prefill_logits[:, -1:, :].clone()
1313
+ past_key_values = prefill_outputs.past_key_values
1314
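+ # Prompt tokens beyond the last full block are not prefilled; they are re-processed as part
+ # of the first generated block below.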
+
1315
+ # Calculate how many tokens to unmask per step
1316
+ num_transfer_tokens = self._get_num_transfer_tokens(block_size, steps)
1317
+ eos_token_id = kwargs.get('eos_token_id', 151645)
1318
+
1319
+ # Generate block by block
1320
+ for block_idx in range(prefill_blocks, num_blocks):
1321
+ block_start = block_idx * block_size
1322
+ block_end = block_start + block_size
1323
+
1324
+ cur_block_embeds = x_embeds[:, block_start:block_end].clone()
1325
+ cur_block_ids = x_ids[:, block_start:block_end]
1326
+
1327
+ cur_mask = block_diffusion_mask[:, :, block_start:block_end, :block_end]
1328
+ cur_pos_ids = position_ids[:, block_start:block_end]
1329
+
1330
+ # Dict-format mask for BD3LM compatibility
1331
+ model_mask = {"full_attention": cur_mask, "sliding_attention": cur_mask}
1332
+
1333
+ # Run diffusion steps within this block
1334
+ for step in range(steps + 1):
1335
+ # Check mask using embedding comparison (ensure same device for multi-GPU)
1336
+ is_mask = torch.all(torch.abs(cur_block_embeds - mask_embed.to(cur_block_embeds.device)) < 1e-5, dim=-1)
1337
+ if not is_mask.any():
1338
+ # Store KV for fully unmasked block
1339
+ if use_kv_cache:
1340
+ _ = self.model(
1341
+ inputs_embeds=cur_block_embeds,
1342
+ attention_mask=model_mask,
1343
+ position_ids=cur_pos_ids,
1344
+ past_key_values=past_key_values,
1345
+ use_cache=True,
1346
+ store_kv=True
1347
+ )
1348
+ break
1349
+
1350
+ # Forward pass
1351
+ if use_kv_cache:
1352
+ outputs = self.model(
1353
+ inputs_embeds=cur_block_embeds,
1354
+ attention_mask=model_mask,
1355
+ position_ids=cur_pos_ids,
1356
+ past_key_values=past_key_values,
1357
+ use_cache=True,
1358
+ store_kv=False
1359
+ )
1360
+ logits = self.lm_head(outputs.last_hidden_state).float()
1361
+ else:
1362
+ # No KV-cache: recompute full context
1363
+ context_embeds = x_embeds[:, :block_end].clone()
1364
+ context_embeds[:, block_start:block_end] = cur_block_embeds
1365
+ context_mask = block_diffusion_mask[:, :, :block_end, :block_end]
1366
+ context_pos_ids = position_ids[:, :block_end]
1367
+ context_model_mask = {"full_attention": context_mask, "sliding_attention": context_mask}
1368
+
1369
+ outputs = self.model(
1370
+ inputs_embeds=context_embeds,
1371
+ attention_mask=context_model_mask,
1372
+ position_ids=context_pos_ids,
1373
+ past_key_values=None,
1374
+ use_cache=False,
1375
+ store_kv=False
1376
+ )
1377
+ logits = self.lm_head(outputs.last_hidden_state[:, block_start:block_end]).float()
1378
+
1379
+ # Sample tokens
1380
+ x0, x0_p = self._sample_tokens(logits, temperature, top_k, top_p)
1381
+
1382
+ # Select tokens to unmask based on strategy
1383
+ num_to_transfer = num_transfer_tokens[step].item()
1384
+
1385
+ # Ensure all tensors are on the same device for multi-GPU support
1386
+ target_device = x0.device
1387
+ is_mask = is_mask.to(target_device)
1388
+ x0_p = x0_p.to(target_device)
1389
+
1390
+ transfer_mask = torch.zeros_like(x0, dtype=torch.bool)
1391
+
1392
+ if remasking_strategy == 'sequential':
1393
+ for j in range(batch_size):
1394
+ if is_mask[j].any():
1395
+ mask_positions = is_mask[j].nonzero(as_tuple=True)[0]
1396
+ num_to_select = min(num_to_transfer, len(mask_positions))
1397
+ selected_positions = mask_positions[:num_to_select]
1398
+ transfer_mask[j, selected_positions] = True
1399
+
1400
+ elif remasking_strategy == 'low_confidence_static':
1401
+ confidence = torch.where(is_mask, x0_p, torch.tensor(-torch.inf, device=target_device))
1402
+ for j in range(batch_size):
1403
+ num_masks = is_mask[j].sum().item()
1404
+ k = min(num_to_transfer, num_masks)
1405
+ if k > 0 and not torch.all(torch.isinf(confidence[j])):
1406
+ _, idx = torch.topk(confidence[j], k)
1407
+ transfer_mask[j, idx] = True
1408
+
1409
+ elif remasking_strategy == 'low_confidence_dynamic':
1410
+ confidence = torch.where(is_mask, x0_p, torch.tensor(-torch.inf, device=target_device))
1411
+ for j in range(batch_size):
1412
+ high_conf_mask = confidence[j] > confidence_threshold
1413
+ num_high_confidence = high_conf_mask.sum().item()
1414
+ if num_high_confidence >= num_to_transfer:
1415
+ transfer_mask[j] = high_conf_mask
1416
+ else:
1417
+ num_masks = is_mask[j].sum().item()
1418
+ k = min(num_to_transfer, num_masks)
1419
+ if k > 0:
1420
+ _, idx = torch.topk(confidence[j], k)
1421
+ transfer_mask[j, idx] = True
1422
+
1423
+ else:
1424
+ raise ValueError(f"Unknown remasking strategy: {remasking_strategy}")
1425
+
1426
+ # Update tokens - ensure all tensors are on same device
1427
+ cur_block_ids = cur_block_ids.to(x0.device)
1428
+ cur_block_ids = torch.where(transfer_mask, x0, cur_block_ids)
1429
+ # Get embeddings - move x0 to embed layer's device first
1430
+ embed_layer = self.get_input_embeddings()
1431
+ x0_embeds = embed_layer(x0.to(embed_layer.weight.device))
1432
+ cur_block_embeds = cur_block_embeds.to(x0_embeds.device)
1433
+ cur_block_embeds = torch.where(transfer_mask.unsqueeze(-1).to(x0_embeds.device), x0_embeds, cur_block_embeds)
1434
+
1435
+ # Update global state - handle multi-GPU
1436
+ x_embeds[:, block_start:block_end] = cur_block_embeds.to(x_embeds.device)
1437
+ x_ids[:, block_start:block_end] = cur_block_ids.to(x_ids.device)
1438
+
1439
+ # Check for EOS
1440
+ if block_end > prompt_len:
1441
+ gen_start_in_block = max(prompt_len, block_start)
1442
+ gen_ids_check = x_ids[:, gen_start_in_block:block_end]
1443
+ if eos_token_id in gen_ids_check:
1444
+ break
1445
+
1446
+ # Return only generated tokens
1447
+ return x_ids[:, prompt_len:prompt_len + gen_length]
1448
+
1449
+ def _sample_tokens(self, logits, temperature=0.0, top_k=0, top_p=1.0):
1450
+ """Sample tokens with temperature, top-k, and top-p."""
1451
+ batch_size = logits.shape[0]
1452
+ seq_len = logits.shape[1]
1453
+ vocab_size = logits.shape[-1]
1454
+
1455
+ logits_2d = logits.reshape(-1, vocab_size)
1456
+
1457
+ if temperature == 0:
1458
+ # Greedy sampling
1459
+ tokens = torch.argmax(logits_2d, dim=-1, keepdim=True)
1460
+ probs = F.softmax(logits_2d, dim=-1)
1461
+ token_probs = torch.gather(probs, -1, tokens)
1462
+ else:
1463
+ # Apply temperature
1464
+ logits_scaled = logits_2d / temperature
1465
+
1466
+ # Apply top-k
1467
+ if top_k > 0:
1468
+ values, _ = torch.topk(logits_scaled, top_k)
1469
+ min_values = values[:, -1:]
1470
+ logits_scaled = torch.where(logits_scaled < min_values, float('-inf'), logits_scaled)
1471
+
1472
+ # Apply top-p
1473
+ if top_p < 1.0:
1474
+ sorted_logits, sorted_indices = torch.sort(logits_scaled, descending=True)
1475
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
1476
+ sorted_mask = cumulative_probs > top_p
1477
+ sorted_mask[:, 1:] = sorted_mask[:, :-1].clone()
1478
+ sorted_mask[:, 0] = False
1479
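+ # Shifting the mask right by one keeps the token that first crosses top_p, so at least one
+ # candidate always survives.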
+ mask_indices = torch.scatter(
1480
+ torch.zeros_like(logits_scaled, dtype=torch.bool),
1481
+ -1, sorted_indices, sorted_mask
1482
+ )
1483
+ logits_scaled = logits_scaled.masked_fill(mask_indices, float('-inf'))
1484
+
1485
+ probs = F.softmax(logits_scaled, dim=-1)
1486
+ tokens = torch.multinomial(probs, num_samples=1)
1487
+ token_probs = torch.gather(probs, -1, tokens)
1488
+
1489
+ return tokens.view(batch_size, seq_len), token_probs.view(batch_size, seq_len)
1490
+
1491
+ def _get_num_transfer_tokens(self, block_length, steps):
1492
+ """Calculate how many tokens to unmask at each step."""
1493
+ if steps == 0:
1494
+ return torch.zeros(1, dtype=torch.int64)
1495
+ base = block_length // steps
1496
+ remainder = block_length % steps
1497
+ num_transfer = torch.zeros(steps + 1, dtype=torch.int64) + base
1498
+ num_transfer[:remainder] += 1
1499
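+ # Example: block_length=8, steps=8 -> tensor([1, 1, 1, 1, 1, 1, 1, 1, 1]); once every token
+ # in a block is unmasked, the generation loop breaks early (after storing the block's KV
+ # when caching is enabled).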
+ return num_transfer
1500
+
1501
+ from transformers import AutoConfig, AutoModelForCausalLM
1502
+
1503
+ AutoConfig.register("diffusionvl_qwen2_5_vl", DiffusionVL_Qwen2_5_VL_Config)
1504
+ AutoModelForCausalLM.register(DiffusionVL_Qwen2_5_VL_Config, DiffusionVL_Qwen2_5_VL_ForConditionalGeneration)
1505
+
1506
+
1507
+ __all__ = [
1508
+ "DiffusionVL_Qwen2_5_VL_Config",
1509
+ "DiffusionVL_Qwen2_5_VL_VisionConfig",
1510
+ "DiffusionVL_Qwen2_5_VL_PreTrainedModel",
1511
+ "DiffusionVL_Qwen2_5_VL_Model",
1512
+ "DiffusionVL_Qwen2_5_VL_ForConditionalGeneration",
1513
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "min_pixels": 3136,
3
+ "max_pixels": 12845056,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessor",
18
+ "use_fast": false,
19
+ "processor_class": "DiffusionVL_Qwen2_5_VL_Processor",
20
+ "auto_map": {
21
+ "AutoProcessor": "processing_diffusionvl_qwen2_5_vl.DiffusionVL_Qwen2_5_VL_Processor"
22
+ }
23
+ }
processing_diffusionvl_qwen2_5_vl.py ADDED
@@ -0,0 +1,313 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The HustVL Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on Qwen2.5-VL, which is derived from EleutherAI's GPT-NeoX library
5
+ # and the GPT-NeoX and OPT implementations. It has been modified to create DiffusionVL.
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ """
19
+ DiffusionVL Processor - Combines image processor and tokenizer.
20
+ """
21
+
22
+ import re
23
+ from typing import List, Optional, Union
24
+
25
+ import torch
26
+
27
+ from transformers.feature_extraction_utils import BatchFeature
28
+ from transformers.image_utils import ImageInput
29
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
30
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
31
+ from transformers.video_utils import VideoInput
32
+
33
+
34
+ IMAGE_TOKEN_INDEX = -200
35
+ DEFAULT_IMAGE_TOKEN = "<image>"
36
+
37
+
38
+ class DiffusionVL_Qwen2_5_VL_ProcessorKwargs(ProcessingKwargs, total=False):
39
+ """Keyword arguments for DiffusionVL_Qwen2_5_VL_Processor."""
40
+
41
+ _defaults = {
42
+ "text_kwargs": {
43
+ "padding": False,
44
+ },
45
+ }
46
+
47
+
48
+ def tokenizer_image_token(
49
+ prompt: str,
50
+ tokenizer,
51
+ image_token_index: int = IMAGE_TOKEN_INDEX,
52
+ return_tensors: Optional[str] = None,
53
+ ) -> Union[List[int], torch.Tensor]:
54
+ """
55
+ Tokenize text with image placeholders, replacing <image> with IMAGE_TOKEN_INDEX.
56
+
57
+ Args:
58
+ prompt: Input text containing <image> placeholders.
59
+ tokenizer: The tokenizer to use for encoding text.
60
+ image_token_index: The token index to use for image placeholders.
61
+ return_tensors: If "pt", return a PyTorch tensor.
62
+
63
+ Returns:
64
+ List of token IDs or a PyTorch tensor.
65
+ """
66
+ prompt_chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
67
+
68
+ input_ids = []
69
+ offset = 0
70
+
71
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0:
72
+ # First chunk has text
73
+ input_ids = tokenizer(prompt_chunks[0], add_special_tokens=False).input_ids
74
+ offset = 1
75
+
76
+ for chunk_idx in range(offset, len(prompt_chunks)):
77
+ chunk = prompt_chunks[chunk_idx]
78
+ # Add image token
79
+ input_ids.append(image_token_index)
80
+ # Add text after image
81
+ if len(chunk) > 0:
82
+ input_ids.extend(tokenizer(chunk, add_special_tokens=False).input_ids)
83
+
84
+ if return_tensors == "pt":
85
+ return torch.tensor(input_ids, dtype=torch.long)
86
+ return input_ids
87
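+ # Example (with the <image>-prefix handling above): "<image>\nDescribe this image."
+ # -> [IMAGE_TOKEN_INDEX] + tokenizer("\nDescribe this image.", add_special_tokens=False).input_ids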
+
88
+
89
+ class DiffusionVL_Qwen2_5_VL_Processor(ProcessorMixin):
90
+ r"""
91
+ Constructs a DiffusionVL processor which wraps an image processor and a tokenizer into a single processor.
92
+
93
+ [`DiffusionVL_Qwen2_5_VL_Processor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`].
94
+ See the [`~DiffusionVL_Qwen2_5_VL_Processor.__call__`] and [`~DiffusionVL_Qwen2_5_VL_Processor.decode`] for more information.
95
+
96
+ This processor uses LLaVA-style image token handling:
97
+ - `<image>` in text is replaced with `IMAGE_TOKEN_INDEX` (-200) in input_ids
98
+ - The model's `prepare_inputs_labels_for_multimodal` replaces -200 with actual image features
99
+
100
+ Args:
101
+ image_processor ([`Qwen2VLImageProcessor`], *optional*):
102
+ The image processor is a required input.
103
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
104
+ The tokenizer is a required input.
105
+ chat_template (`str`, *optional*):
106
+ A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
107
+
108
+ Example:
109
+
110
+ ```python
111
+ >>> from transformers import AutoProcessor
112
+ >>> from PIL import Image
113
+
114
+ >>> processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
115
+
116
+ >>> # Prepare text with image placeholder
117
+ >>> messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}]
118
+ >>> text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
119
+
120
+ >>> # Process image and text
121
+ >>> image = Image.open("image.jpg")
122
+ >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
123
+ ```
124
+ """
125
+
126
+ attributes = ["image_processor", "tokenizer"]
127
+ image_processor_class = "Qwen2VLImageProcessor"
128
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
129
+
130
+ def __init__(
131
+ self,
132
+ image_processor=None,
133
+ tokenizer=None,
134
+ chat_template: Optional[str] = None,
135
+ **kwargs,
136
+ ):
137
+ self.image_token = DEFAULT_IMAGE_TOKEN
138
+ self.image_token_index = IMAGE_TOKEN_INDEX
139
+
140
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
141
+
142
+ def __call__(
143
+ self,
144
+ images: Optional[ImageInput] = None,
145
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
146
+ videos: Optional[VideoInput] = None,
147
+ **kwargs: Unpack[DiffusionVL_Qwen2_5_VL_ProcessorKwargs],
148
+ ) -> BatchFeature:
149
+ """
150
+ Main method to prepare for the model one or several sequences and image(s).
151
+
152
+ This method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`]
153
+ if `text` is not `None` to encode the text. To prepare the vision inputs, this method forwards the `images`
154
+ and `kwargs` arguments to Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `images` is not `None`.
155
+
156
+ The text should contain `<image>` placeholders where images should be inserted.
157
+ These will be replaced with `IMAGE_TOKEN_INDEX` (-200) in the output input_ids.
158
+
159
+ Args:
160
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, *optional*):
161
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array, or PyTorch
162
+ tensor. Both channels-first and channels-last formats are supported.
163
+ text (`str`, `List[str]`, *optional*):
164
+ The sequence or batch of sequences to be encoded. Each sequence should be a string containing
165
+ `<image>` placeholders where images will be inserted.
166
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, *optional*):
167
+ The video or batch of videos to be prepared. Currently not fully supported.
168
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
169
+ If set, will return tensors of a particular framework. Acceptable values are:
170
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
171
+ - `'np'`: Return NumPy `np.ndarray` objects.
172
+
173
+ Returns:
174
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
175
+
176
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
177
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
178
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
179
+ - **image_grid_thw** -- List of image 3D grid dimensions. Returned when `images` is not `None`.
180
+ """
181
+ output_kwargs = self._merge_kwargs(
182
+ DiffusionVL_Qwen2_5_VL_ProcessorKwargs,
183
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
184
+ **kwargs,
185
+ )
186
+
187
+ # Process images
188
+ image_inputs = {}
189
+ if images is not None:
190
+ image_inputs = self.image_processor(
191
+ images=images, **output_kwargs.get("images_kwargs", {})
192
+ )
193
+
194
+ # Handle text input
195
+ if text is None:
196
+ return BatchFeature(data=image_inputs)
197
+
198
+ if not isinstance(text, list):
+        text = [text]
+
+        # Tokenize with LLaVA-style image token handling
+        return_tensors = output_kwargs.get("text_kwargs", {}).pop("return_tensors", None)
+
+        all_input_ids = []
+        for t in text:
+            input_ids = tokenizer_image_token(
+                t, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors=None
+            )
+            all_input_ids.append(input_ids)
+
+        # Pad sequences
+        max_len = max(len(ids) for ids in all_input_ids)
+        padded_input_ids = []
+        attention_masks = []
+
+        pad_token_id = (
+            self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0
+        )
+
+        for ids in all_input_ids:
+            padding_length = max_len - len(ids)
+            padded_ids = ids + [pad_token_id] * padding_length
+            mask = [1] * len(ids) + [0] * padding_length
+            padded_input_ids.append(padded_ids)
+            attention_masks.append(mask)
+
+        text_inputs = {
+            "input_ids": padded_input_ids,
+            "attention_mask": attention_masks,
+        }
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def build_conversation_input_ids(
+        self,
+        messages: List[dict],
+        images: Optional[List] = None,
+        add_generation_prompt: bool = True,
+    ) -> dict:
+        """
+        Build input_ids from conversation messages in LLaVA format.
+
+        This method converts a list of messages into a prompt string with `<image>` placeholders.
+        Uses LLaVA-style chat template format (trained format).
+
+        Args:
+            messages: List of message dicts with 'role' and 'content' keys.
+                Content can be a string or a list of dicts with 'type' key ('text' or 'image').
+            images: Optional list of images (used for validation).
+            add_generation_prompt: Whether to add generation prompt at the end.
+
+        Returns:
+            dict with 'text' key containing the prompt string with `<image>` placeholders.
+        """
+        # Build LLaVA-style prompt directly
+        # Format: <|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nPrompt<|im_end|>\n<|im_start|>assistant\n
+
+        text_parts = []
+
+        for message in messages:
+            role = message.get("role", "user")
+            content = message.get("content", "")
+
+            text_parts.append(f"<|im_start|>{role}\n")
+
+            # Handle content - can be string or list of content items
+            if isinstance(content, str):
+                text_parts.append(content)
+            elif isinstance(content, list):
+                for item in content:
+                    if isinstance(item, dict):
+                        if item.get("type") == "image":
+                            text_parts.append(DEFAULT_IMAGE_TOKEN)
+                        elif item.get("type") == "text":
+                            text_parts.append(item.get("text", ""))
+                    else:
+                        text_parts.append(str(item))
+
+            text_parts.append("<|im_end|>\n")
+
+        if add_generation_prompt:
+            text_parts.append("<|im_start|>assistant\n")
+
+        text = "".join(text_parts)
+        return {"text": text}
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        Decode a batch of token IDs to text.
+
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
+        Please refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        Decode token IDs to text.
+
+        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`].
+        Please refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self) -> List[str]:
+        """Return the list of model input names."""
+        tokenizer_names = self.tokenizer.model_input_names
+        image_processor_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_names + image_processor_names))
+
+
+__all__ = ["DiffusionVL_Qwen2_5_VL_Processor", "tokenizer_image_token"]
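
For reference, here is a minimal usage sketch of `build_conversation_input_ids`, based only on the template logic shown above. It assumes `processor` is an already-instantiated `DiffusionVL_Qwen2_5_VL_Processor`; the message contents are illustrative.

```python
# Hedged sketch: `processor` is assumed to be a loaded DiffusionVL_Qwen2_5_VL_Processor.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

# Returns {"text": ...} with one <image> placeholder per image item.
prompt = processor.build_conversation_input_ids(messages, add_generation_prompt=True)["text"]

# Per the method above, content items are concatenated in order, so `prompt` is:
# "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
# "<|im_start|>user\n<image>Describe this image.<|im_end|>\n"
# "<|im_start|>assistant\n"
```

The resulting string can then be passed to the processor call above, which expands each `<image>` placeholder via `tokenizer_image_token` and right-pads the batch to its longest sequence with the tokenizer's pad token.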
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "right",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
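
As a quick sanity check (a sketch, not part of the repository), the tokenizer files above can be loaded with the standard `transformers` API; the asserted values come directly from `tokenizer_config.json` and `special_tokens_map.json`, and the path is a placeholder for the local model directory.

```python
from transformers import AutoTokenizer

# Placeholder path: point this at the directory containing the files above.
tokenizer = AutoTokenizer.from_pretrained("path/to/model")

# Values taken from tokenizer_config.json / special_tokens_map.json
assert tokenizer.eos_token == "<|im_end|>"
assert tokenizer.pad_token == "<|endoftext|>"
assert tokenizer.model_max_length == 8192

# Vision placeholder tokens are registered in added_tokens_decoder
print(tokenizer.convert_tokens_to_ids("<|image_pad|>"))  # 151655
```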
vocab.json ADDED
The diff for this file is too large to render. See raw diff