cpatonn committed on
Commit
2609cba
·
verified ·
0 Parent(s):

Super-squash branch 'main' using huggingface_hub

.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,204 @@
1
+ ---
2
+ language:
3
+ - zh
4
+ - en
5
+ library_name: transformers
6
+ license: mit
7
+ pipeline_tag: image-text-to-text
8
+ base_model: zai-org/GLM-4.6V-Flash
9
+ ---
10
+
11
+ # GLM-4.6V-Flash AWQ - INT8
12
+
13
+ ## Model Details
14
+
15
+ ### Quantization Details
16
+
17
+ - **Quantization Method:** AWQ
18
+ - **Bits:** 8
19
+ - **Group Size:** 32
20
+ - **Calibration Dataset:** [5CD-AI/LLaVA-CoT-o1-Instruct](https://huggingface.co/datasets/5CD-AI/LLaVA-CoT-o1-Instruct)
21
+ - **Quantization Tool:** [llm-compressor](https://github.com/vllm-project/llm-compressor)
22
+
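+ For reference, this repository also ships the exact `recipe.yaml` used with llm-compressor (an `AWQModifier` with 8-bit symmetric weights, group size 32, applied to `Linear` layers while the vision tower and `lm_head` are ignored). The snippet below is a minimal sketch of how such a recipe could be re-applied to the base model; entry points and multimodal dataset handling differ between llm-compressor releases, and the calibration settings shown are illustrative assumptions rather than the exact configuration used for this checkpoint.
+
+ ```python
+ # Sketch only: re-apply the bundled AWQ recipe with llm-compressor.
+ # API entry points and multimodal calibration handling vary across versions.
+ from llmcompressor import oneshot
+ from transformers import AutoProcessor, Glm4vForConditionalGeneration
+
+ MODEL_ID = "zai-org/GLM-4.6V-Flash"
+ model = Glm4vForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto", device_map="auto")
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
+
+ oneshot(
+     model=model,
+     recipe="recipe.yaml",                      # the AWQ W8, group-size-32 recipe from this repo
+     dataset="5CD-AI/LLaVA-CoT-o1-Instruct",    # calibration set listed above; may need preprocessing
+     max_seq_length=2048,                       # illustrative calibration settings
+     num_calibration_samples=256,
+ )
+
+ model.save_pretrained("GLM-4.6V-Flash-AWQ-8bit", save_compressed=True)
+ processor.save_pretrained("GLM-4.6V-Flash-AWQ-8bit")
+ ```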
23
+ ### Memory Usage
24
+
25
+ | **Type** | **GLM-4.6V-Flash** | **GLM-4.6V-Flash-AWQ-8bit** |
26
+ |:---------------:|:----------------:|:----------------:|
27
+ | **Memory Size** | 19.2 GB | 12.0 GB |
28
+ | **KV Cache per Token** | 40.0 kB | 20.0 kB |
29
+ | **KV Cache per Context** | 5.0 GB | 2.5 GB |
30
+
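+ The per-token KV-cache figure follows directly from the shipped config: 40 layers × 2 KV heads × 128-dim heads × 2 (keys and values) × 2 bytes (FP16) ≈ 40 KB per token, which over the full 128K-token context comes to roughly 5 GB. The halved values in the AWQ column correspond to storing the KV cache in 8-bit precision (e.g. FP8 at serving time), since weight-only quantization by itself does not shrink the KV cache.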
31
+ ## Inference
32
+
33
+ ### Prerequisites
34
+
35
+ ```bash
36
+ pip install "vllm>=0.12.0"
37
+ pip install --upgrade git+https://github.com/huggingface/transformers.git
38
+ ```
39
+
40
+ ### Basic Usage
41
+
42
+ ```bash
43
+ vllm serve cyankiwi/GLM-4.6V-Flash-AWQ-8bit
44
+ ```
45
+
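+ The command above starts an OpenAI-compatible server (by default at `http://localhost:8000/v1`). Below is a minimal client sketch using the `openai` Python package; the image URL and prompt are placeholders, and the API key can be any non-empty string unless the server was started with `--api-key`.
+
+ ```python
+ # Minimal sketch: query the vLLM OpenAI-compatible server started above.
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+ response = client.chat.completions.create(
+     model="cyankiwi/GLM-4.6V-Flash-AWQ-8bit",
+     messages=[{
+         "role": "user",
+         "content": [
+             {"type": "image_url",
+              "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"}},
+             {"type": "text", "text": "Describe this image."},
+         ],
+     }],
+     temperature=0.8,
+     top_p=0.6,
+     max_tokens=2048,
+ )
+ print(response.choices[0].message.content)
+ ```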
46
+ ## Additional Information
47
+
48
+ ### Changelog
49
+
50
+ - **v1.0.0** - Initial quantized release
51
+
52
+ ### Authors
53
+
54
+ - **Name:** Ton Cao
55
+ - **Contacts:** [email protected]
56
+
57
+ # GLM-4.6V
58
+
59
+ <div align="center">
60
+ <img src="https://raw.githubusercontent.com/zai-org/GLM-V/refs/heads/main/resources/logo.svg" width="40%"/>
61
+ </div>
62
+
63
+ This model is part of the GLM-V family of models, introduced in the paper [GLM-4.1V-Thinking and GLM-4.5V: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning](https://huggingface.co/papers/2507.01006).
64
+
65
+ - **GLM-4.6V Blog**: [https://z.ai/blog/glm-4.6v](https://z.ai/blog/glm-4.6v)
66
+ - **Paper**: [https://huggingface.co/papers/2507.01006](https://huggingface.co/papers/2507.01006)
67
+ - **GitHub Repository**: [https://github.com/zai-org/GLM-V](https://github.com/zai-org/GLM-V)
68
+ - **Online Demo**: [https://chat.z.ai/](https://chat.z.ai/)
69
+ - **API Access**: [Z.ai Open Platform](https://docs.z.ai/guides/vlm/glm-4.6v)
70
+ - **Desktop Assistant App**: [https://huggingface.co/spaces/zai-org/GLM-4.5V-Demo-App](https://huggingface.co/spaces/zai-org/GLM-4.5V-Demo-App)
71
+
72
+ ## Introduction
73
+
74
+ The GLM-4.6V series includes two versions: GLM-4.6V (106B), a foundation model designed for cloud and high-performance
75
+ cluster scenarios,
76
+ and GLM-4.6V-Flash (9B), a lightweight model optimized for local deployment and low-latency applications.
77
+ GLM-4.6V scales its context window to 128k tokens in training,
78
+ and achieves SoTA performance in visual understanding among models of similar parameter scales.
79
+ Crucially, we integrate native Function Calling capabilities for the first time.
80
+ This effectively bridges the gap between "visual perception" and "executable action",
81
+ providing a unified technical foundation for multimodal agents in real-world business scenarios.
82
+
83
+ ![GLM-4.6V Benchmarks](https://raw.githubusercontent.com/zai-org/GLM-V/refs/heads/main/resources/bench_46v.jpeg)
84
+
85
+ Beyond achieving SoTA performance across major multimodal benchmarks at comparable model scales, GLM-4.6V introduces
86
+ several key features:
87
+
88
+ - **Native Multimodal Function Calling**
89
+ Enables native vision-driven tool use. Images, screenshots, and document pages can be passed directly as tool inputs without text conversion, while visual outputs (charts, search images, rendered pages) are interpreted and integrated into the reasoning chain. This closes the loop from perception to understanding to execution. A minimal prompt-rendering sketch follows this feature list.
90
+
91
+ - **Interleaved Image-Text Content Generation**
92
+ Supports high-quality mixed media creation from complex multimodal inputs. GLM-4.6V takes a multimodal context—spanning documents, user inputs, and tool-retrieved images—and synthesizes coherent, interleaved image-text content tailored to the task. During generation it can actively call search and retrieval tools to gather and curate additional text and visuals, producing rich, visually grounded content.
93
+
94
+
95
+ - **Multimodal Document Understanding**
96
+ GLM-4.6V can process up to 128K tokens of multi-document or long-document input, directly interpreting richly formatted pages as images. It understands text, layout, charts, tables, and figures jointly, enabling accurate comprehension of complex, image-heavy documents without requiring prior conversion to plain text.
97
+
98
+ - **Frontend Replication & Visual Editing**
99
+ Reconstructs pixel-accurate HTML/CSS from UI screenshots and supports natural-language-driven edits. It detects layout, components, and styles visually, generates clean code, and applies iterative visual modifications through simple user instructions.
100
+
101
+
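+ As a concrete illustration of the native function-calling format described above, the sketch below renders a tool-augmented multimodal prompt with the chat template bundled in this repository. The `get_weather` tool is hypothetical, and passing `tools=` through `apply_chat_template` assumes a recent `transformers` release that forwards it to the template.
+
+ ```python
+ # Sketch: render a tool-augmented multimodal prompt with the bundled chat template.
+ from transformers import AutoProcessor
+
+ processor = AutoProcessor.from_pretrained("zai-org/GLM-4.6V-Flash")
+
+ # Hypothetical tool schema, for illustration only.
+ tools = [{
+     "type": "function",
+     "function": {
+         "name": "get_weather",
+         "description": "Look up the current weather for a city.",
+         "parameters": {
+             "type": "object",
+             "properties": {"city": {"type": "string"}},
+             "required": ["city"],
+         },
+     },
+ }]
+
+ messages = [{
+     "role": "user",
+     "content": [
+         {"type": "image", "url": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"},
+         {"type": "text", "text": "Where might this photo have been taken? Check the weather there."},
+     ],
+ }]
+
+ # tokenize=False returns the rendered prompt string, including the <tools> block;
+ # the model is expected to reply in the <tool_call>/<arg_key>/<arg_value> format.
+ prompt = processor.apply_chat_template(messages, tools=tools, tokenize=False, add_generation_prompt=True)
+ print(prompt)
+ ```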
102
+ **This Hugging Face repository hosts the `GLM-4.6V-Flash` model, part of the `GLM-V` series.**
103
+
104
+ ## Usage
105
+
106
+ ### Environment Installation
107
+
108
+ For `SGLang`:
109
+
110
+ ```bash
111
+ pip install "sglang>=0.5.6post1"
112
+ pip install "transformers>=5.0.0rc0"
113
+ ```
114
+
115
+ For `vLLM`:
116
+
117
+ ```bash
118
+ pip install "vllm>=0.12.0"
119
+ pip install "transformers>=5.0.0rc0"
120
+ ```
121
+
122
+ ### Quick Start with Transformers
123
+
124
+ ```python
125
+ from transformers import AutoProcessor, Glm4vForConditionalGeneration
126
+ import torch
127
+
128
+ MODEL_PATH = "zai-org/GLM-4.6V-Flash"
129
+ messages = [
130
+ {
131
+ "role": "user",
132
+ "content": [
133
+ {
134
+ "type": "image",
135
+ "url": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png"
136
+ },
137
+ {
138
+ "type": "text",
139
+ "text": "describe this image"
140
+ }
141
+ ],
142
+ }
143
+ ]
144
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
145
+ model = Glm4vForConditionalGeneration.from_pretrained(
146
+ pretrained_model_name_or_path=MODEL_PATH,
147
+ torch_dtype="auto",
148
+ device_map="auto",
149
+ )
150
+ inputs = processor.apply_chat_template(
151
+ messages,
152
+ tokenize=True,
153
+ add_generation_prompt=True,
154
+ return_dict=True,
155
+ return_tensors="pt"
156
+ ).to(model.device)
157
+ inputs.pop("token_type_ids", None)
158
+ generated_ids = model.generate(**inputs, max_new_tokens=8192)
159
+ output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
160
+ print(output_text)
161
+ ```
162
+
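+ The chat template bundled with this checkpoint also understands an `enable_thinking` flag: when it is `False`, the template appends `/nothink` to the last user turn and pre-fills an empty `<think></think>` block, suppressing the thinking trace. Continuing the snippet above (and assuming your `transformers` version forwards extra keyword arguments to the template):
+
+ ```python
+ # Sketch: disable the thinking trace via the bundled chat template.
+ inputs = processor.apply_chat_template(
+     messages,
+     tokenize=True,
+     add_generation_prompt=True,
+     return_dict=True,
+     return_tensors="pt",
+     enable_thinking=False,   # appends /nothink and pre-fills <think></think>
+ ).to(model.device)
+ ```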
163
+ ## Evaluation Settings
164
+
165
+ We primarily use vLLM as the backend for model inference. For faster and more reliable performance on video tasks, we employ SGLang. To reproduce our leaderboard results, we recommend the following decoding parameters (a vLLM `SamplingParams` sketch follows the list):
166
+
167
+ + top_p: 0.6
168
+ + top_k: 2
169
+ + temperature: 0.8
170
+ + repetition_penalty: 1.1
171
+ + max_generate_tokens: 16K
172
+
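+ When running offline inference with vLLM directly, these settings map onto `SamplingParams` roughly as in the sketch below (the 16K generation cap is written out as 16384 tokens; the prompt is a placeholder).
+
+ ```python
+ # Sketch: the recommended decoding parameters expressed as vLLM SamplingParams.
+ from vllm import LLM, SamplingParams
+
+ sampling = SamplingParams(
+     temperature=0.8,
+     top_p=0.6,
+     top_k=2,
+     repetition_penalty=1.1,
+     max_tokens=16384,   # max_generate_tokens: 16K
+ )
+
+ llm = LLM(model="cyankiwi/GLM-4.6V-Flash-AWQ-8bit")
+ # llm.chat applies the model's chat template before generation.
+ outputs = llm.chat(
+     [{"role": "user", "content": "Summarize what AWQ 8-bit quantization changes about this model."}],
+     sampling_params=sampling,
+ )
+ print(outputs[0].outputs[0].text)
+ ```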
173
+ For more usage details, please refer to our [GitHub repository](https://github.com/zai-org/GLM-V).
174
+
175
+
176
+
177
+ ## Fixed and Remaining Issues
178
+
179
+ Since the open-sourcing of GLM-4.1V, we have received extensive feedback from the community and are well aware that the model still has many shortcomings. In subsequent iterations, we attempted to address several common issues — such as repetitive thinking outputs and formatting errors — which have been mitigated to some extent in this new version.
180
+
181
+ However, the model still has several limitations and issues that we will fix as soon as possible:
182
+
183
+ 1. Pure text QA capabilities still have significant room for improvement. In this development cycle, our primary focus was on visual multimodal scenarios, and we will enhance pure text abilities in upcoming updates.
184
+ 2. The model may still overthink or even repeat itself in certain cases, especially when dealing with complex prompts.
185
+ 3. In some situations, the model may restate the answer at the end of its response.
186
+ 4. There remain certain perception limitations, such as counting accuracy and identifying specific individuals, which still require improvement.
187
+
188
+ Thank you for your patience and understanding. We also welcome feedback and suggestions in the issue section — we will respond and improve as much as we can!
189
+
190
+ ## Citation
191
+
192
+ If you use this model, please cite the following paper:
193
+
194
+ ```bibtex
195
+ @misc{vteam2025glm45vglm41vthinkingversatilemultimodal,
196
+ title={GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning},
197
+ author={V Team and Wenyi Hong and Wenmeng Yu and Xiaotao Gu and Guo Wang and Guobing Gan and Haomiao Tang and Jiale Cheng and Ji Qi and Junhui Ji and Lihang Pan and Shuaiqi Duan and Weihan Wang and Yan Wang and Yean Cheng and Zehai He and Zhe Su and Zhen Yang and Ziyang Pan and Aohan Zeng and Baoxu Wang and Bin Chen and Boyan Shi and Changyu Pang and Chenhui Zhang and Da Yin and Fan Yang and Guoqing Chen and Jiazheng Xu and Jiale Zhu and Jiali Chen and Jing Chen and Jinhao Chen and Jinghao Lin and Jinjiang Wang and Junjie Chen and Leqi Lei and Letian Gong and Leyi Pan and Mingdao Liu and Mingde Xu and Mingzhi Zhang and Qinkai Zheng and Sheng Yang and Shi Zhong and Shiyu Huang and Shuyuan Zhao and Siyan Xue and Shangqin Tu and Shengbiao Meng and Tianshu Zhang and Tianwei Luo and Tianxiang Hao and Tianyu Tong and Wenkai Li and Wei Jia and Xiao Liu and Xiaohan Zhang and Xin Lyu and Xinyue Fan and Xuancheng Huang and Yanling Wang and Yadong Xue and Yanfeng Wang and Yanzi Wang and Yifan An and Yifan Du and Yiming Shi and Yiheng Huang and Yilin Niu and Yuan Wang and Yuanchang Yue and Yuchen Li and Yutao Zhang and Yuting Wang and Yu Wang and Yuxuan Zhang and Zhao Xue and Zhenyu Hou and Zhengxiao Du and Zihan Wang and Peng Zhang and Debing Liu and Bin Xu and Juanzi Li and Minlie Huang and Yuxiao Dong and Jie Tang},
198
+ year={2025},
199
+ eprint={2507.01006},
200
+ archivePrefix={arXiv},
201
+ primaryClass={cs.CV},
202
+ url={https://arxiv.org/abs/2507.01006},
203
+ }
204
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,140 @@
1
+ [gMASK]<sop>
2
+ {%- if tools -%}
3
+ <|system|>
4
+ # Tools
5
+
6
+ You may call one or more functions to assist with the user query.
7
+
8
+ You are provided with function signatures within <tools></tools> XML tags:
9
+ <tools>
10
+ {% for tool in tools %}
11
+ {{ tool | tojson(ensure_ascii=False) }}
12
+ {% endfor %}
13
+ </tools>
14
+
15
+ For each function call, output the function name and arguments within the following XML format:
16
+ <tool_call>{function-name}
17
+ <arg_key>{arg-key-1}</arg_key>
18
+ <arg_value>{arg-value-1}</arg_value>
19
+ <arg_key>{arg-key-2}</arg_key>
20
+ <arg_value>{arg-value-2}</arg_value>
21
+ ...
22
+ </tool_call>{%- endif -%}
23
+ {%- macro visible_text(content) -%}
24
+ {%- if content is string -%}
25
+ {{- content }}
26
+ {%- elif content is iterable and content is not mapping -%}
27
+ {%- for item in content -%}
28
+ {%- if item is mapping and item.type == 'text' -%}
29
+ {{- item.text }}
30
+ {%- elif item is mapping and (item.type == 'image' or 'image' in item) -%}
31
+ <|begin_of_image|><|image|><|end_of_image|>
32
+ {%- elif item is mapping and (item.type == 'video' or 'video' in item) -%}
33
+ <|begin_of_video|><|video|><|end_of_video|>
34
+ {%- elif item is string -%}
35
+ {{- item }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{- content }}
40
+ {%- endif -%}
41
+ {%- endmacro -%}
42
+ {%- set ns = namespace(last_user_index=-1) %}
43
+ {%- for m in messages %}
44
+ {%- if m.role == 'user' %}
45
+ {% set ns.last_user_index = loop.index0 -%}
46
+ {%- endif %}
47
+ {%- endfor %}
48
+ {% for m in messages %}
49
+ {%- if m.role == 'user' -%}<|user|>
50
+ {% if m.content is string %}
51
+ {{ m.content }}
52
+ {%- else %}
53
+ {%- for item in m.content %}
54
+ {% if item.type == 'video' or 'video' in item %}
55
+ <|begin_of_video|><|video|><|end_of_video|>{% elif item.type == 'image' or 'image' in item %}
56
+ <|begin_of_image|><|image|><|end_of_image|>{% elif item.type == 'text' %}
57
+ {{ item.text }}
58
+ {%- endif %}
59
+ {%- endfor %}
60
+ {%- endif %}
61
+ {{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}
62
+ {%- elif m.role == 'assistant' -%}
63
+ <|assistant|>
64
+ {%- set reasoning_content = '' %}
65
+ {%- set content = visible_text(m.content) %}
66
+ {%- if m.reasoning_content is string %}
67
+ {%- set reasoning_content = m.reasoning_content %}
68
+ {%- else %}
69
+ {%- if '</think>' in content %}
70
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
71
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
72
+ {%- endif %}
73
+ {%- endif %}
74
+ {%- if loop.index0 > ns.last_user_index and reasoning_content -%}
75
+ {{ '\n<think>' + reasoning_content.strip() + '</think>'}}
76
+ {%- else -%}
77
+ {{ '\n<think></think>' }}
78
+ {%- endif -%}
79
+ {%- if content.strip() -%}
80
+ {{ '\n' + content.strip() }}
81
+ {%- endif -%}
82
+ {% if m.tool_calls %}
83
+ {% for tc in m.tool_calls %}
84
+ {%- if tc.function %}
85
+ {%- set tc = tc.function %}
86
+ {%- endif %}
87
+ {{ '\n<tool_call>' + tc.name }}
88
+ {% set _args = tc.arguments %}
89
+ {% for k, v in _args.items() %}
90
+ <arg_key>{{ k }}</arg_key>
91
+ <arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>
92
+ {% endfor %}
93
+ </tool_call>{% endfor %}
94
+ {% endif %}
95
+ {%- elif m.role == 'tool' -%}
96
+ {%- if m.content is string -%}
97
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
98
+ {{- '<|observation|>' }}
99
+ {%- endif %}
100
+ {{- '\n<tool_response>\n' }}
101
+ {{- m.content }}
102
+ {{- '\n</tool_response>' }}
103
+ {% elif m.content is iterable and m.content is not mapping %}
104
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
105
+ {{- '<|observation|>' }}
106
+ {%- endif %}
107
+ {{- '\n<tool_response>\n' }}
108
+ {%- for tr in m.content -%}
109
+ {%- if tr is mapping and tr.type is defined -%}
110
+ {%- set t = tr.type | lower -%}
111
+ {%- if t == 'text' and tr.text is defined -%}
112
+ {{ tr.text }}
113
+ {%- elif t in ['image', 'image_url'] -%}
114
+ <|begin_of_image|><|image|><|end_of_image|>
115
+ {%- elif t in ['video', 'video_url'] -%}
116
+ <|begin_of_video|><|video|><|end_of_video|>
117
+ {%- else -%}
118
+ {{ tr | tojson(ensure_ascii=False) }}
119
+ {%- endif -%}
120
+ {%- else -%}
121
+ {{ tr.output if tr.output is defined else tr }}
122
+ {%- endif -%}
123
+ {%- endfor -%}
124
+ {{- '\n</tool_response>' }}
125
+ {%- else -%}
126
+ <|observation|>{% for tr in m.content %}
127
+
128
+ <tool_response>
129
+ {{ tr.output if tr.output is defined else tr }}
130
+ </tool_response>{% endfor -%}
131
+ {% endif -%}
132
+ {%- elif m.role == 'system' -%}
133
+ <|system|>
134
+ {{ visible_text(m.content) }}
135
+ {%- endif -%}
136
+ {%- endfor -%}
137
+ {%- if add_generation_prompt -%}
138
+ <|assistant|>
139
+ {{'<think></think>\n' if (enable_thinking is defined and not enable_thinking) else ''}}
140
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,276 @@
1
+ {
2
+ "architectures": [
3
+ "Glm4vForConditionalGeneration"
4
+ ],
5
+ "dtype": "float16",
6
+ "image_end_token_id": 151340,
7
+ "image_start_token_id": 151339,
8
+ "image_token_id": 151363,
9
+ "model_type": "glm4v",
10
+ "quantization_config": {
11
+ "config_groups": {
12
+ "group_0": {
13
+ "format": "pack-quantized",
14
+ "input_activations": null,
15
+ "output_activations": null,
16
+ "targets": [
17
+ "Linear"
18
+ ],
19
+ "weights": {
20
+ "actorder": null,
21
+ "block_structure": null,
22
+ "dynamic": false,
23
+ "group_size": 32,
24
+ "num_bits": 8,
25
+ "observer": "mse",
26
+ "observer_kwargs": {},
27
+ "strategy": "group",
28
+ "symmetric": true,
29
+ "type": "int"
30
+ }
31
+ }
32
+ },
33
+ "format": "pack-quantized",
34
+ "global_compression_ratio": null,
35
+ "ignore": [
36
+ "model.visual.blocks.0.attn.qkv_proj",
37
+ "model.visual.blocks.0.attn.qkv",
38
+ "model.visual.blocks.0.attn.proj",
39
+ "model.visual.blocks.0.mlp.gate_up_proj",
40
+ "model.visual.blocks.0.mlp.gate_proj",
41
+ "model.visual.blocks.0.mlp.up_proj",
42
+ "model.visual.blocks.0.mlp.down_proj",
43
+ "model.visual.blocks.1.attn.qkv_proj",
44
+ "model.visual.blocks.1.attn.qkv",
45
+ "model.visual.blocks.1.attn.proj",
46
+ "model.visual.blocks.1.mlp.gate_up_proj",
47
+ "model.visual.blocks.1.mlp.gate_proj",
48
+ "model.visual.blocks.1.mlp.up_proj",
49
+ "model.visual.blocks.1.mlp.down_proj",
50
+ "model.visual.blocks.2.attn.qkv_proj",
51
+ "model.visual.blocks.2.attn.qkv",
52
+ "model.visual.blocks.2.attn.proj",
53
+ "model.visual.blocks.2.mlp.gate_up_proj",
54
+ "model.visual.blocks.2.mlp.gate_proj",
55
+ "model.visual.blocks.2.mlp.up_proj",
56
+ "model.visual.blocks.2.mlp.down_proj",
57
+ "model.visual.blocks.3.attn.qkv_proj",
58
+ "model.visual.blocks.3.attn.qkv",
59
+ "model.visual.blocks.3.attn.proj",
60
+ "model.visual.blocks.3.mlp.gate_up_proj",
61
+ "model.visual.blocks.3.mlp.gate_proj",
62
+ "model.visual.blocks.3.mlp.up_proj",
63
+ "model.visual.blocks.3.mlp.down_proj",
64
+ "model.visual.blocks.4.attn.qkv_proj",
65
+ "model.visual.blocks.4.attn.qkv",
66
+ "model.visual.blocks.4.attn.proj",
67
+ "model.visual.blocks.4.mlp.gate_up_proj",
68
+ "model.visual.blocks.4.mlp.gate_proj",
69
+ "model.visual.blocks.4.mlp.up_proj",
70
+ "model.visual.blocks.4.mlp.down_proj",
71
+ "model.visual.blocks.5.attn.qkv_proj",
72
+ "model.visual.blocks.5.attn.qkv",
73
+ "model.visual.blocks.5.attn.proj",
74
+ "model.visual.blocks.5.mlp.gate_up_proj",
75
+ "model.visual.blocks.5.mlp.gate_proj",
76
+ "model.visual.blocks.5.mlp.up_proj",
77
+ "model.visual.blocks.5.mlp.down_proj",
78
+ "model.visual.blocks.6.attn.qkv_proj",
79
+ "model.visual.blocks.6.attn.qkv",
80
+ "model.visual.blocks.6.attn.proj",
81
+ "model.visual.blocks.6.mlp.gate_up_proj",
82
+ "model.visual.blocks.6.mlp.gate_proj",
83
+ "model.visual.blocks.6.mlp.up_proj",
84
+ "model.visual.blocks.6.mlp.down_proj",
85
+ "model.visual.blocks.7.attn.qkv_proj",
86
+ "model.visual.blocks.7.attn.qkv",
87
+ "model.visual.blocks.7.attn.proj",
88
+ "model.visual.blocks.7.mlp.gate_up_proj",
89
+ "model.visual.blocks.7.mlp.gate_proj",
90
+ "model.visual.blocks.7.mlp.up_proj",
91
+ "model.visual.blocks.7.mlp.down_proj",
92
+ "model.visual.blocks.8.attn.qkv_proj",
93
+ "model.visual.blocks.8.attn.qkv",
94
+ "model.visual.blocks.8.attn.proj",
95
+ "model.visual.blocks.8.mlp.gate_up_proj",
96
+ "model.visual.blocks.8.mlp.gate_proj",
97
+ "model.visual.blocks.8.mlp.up_proj",
98
+ "model.visual.blocks.8.mlp.down_proj",
99
+ "model.visual.blocks.9.attn.qkv_proj",
100
+ "model.visual.blocks.9.attn.qkv",
101
+ "model.visual.blocks.9.attn.proj",
102
+ "model.visual.blocks.9.mlp.gate_up_proj",
103
+ "model.visual.blocks.9.mlp.gate_proj",
104
+ "model.visual.blocks.9.mlp.up_proj",
105
+ "model.visual.blocks.9.mlp.down_proj",
106
+ "model.visual.blocks.10.attn.qkv_proj",
107
+ "model.visual.blocks.10.attn.qkv",
108
+ "model.visual.blocks.10.attn.proj",
109
+ "model.visual.blocks.10.mlp.gate_up_proj",
110
+ "model.visual.blocks.10.mlp.gate_proj",
111
+ "model.visual.blocks.10.mlp.up_proj",
112
+ "model.visual.blocks.10.mlp.down_proj",
113
+ "model.visual.blocks.11.attn.qkv_proj",
114
+ "model.visual.blocks.11.attn.qkv",
115
+ "model.visual.blocks.11.attn.proj",
116
+ "model.visual.blocks.11.mlp.gate_up_proj",
117
+ "model.visual.blocks.11.mlp.gate_proj",
118
+ "model.visual.blocks.11.mlp.up_proj",
119
+ "model.visual.blocks.11.mlp.down_proj",
120
+ "model.visual.blocks.12.attn.qkv_proj",
121
+ "model.visual.blocks.12.attn.qkv",
122
+ "model.visual.blocks.12.attn.proj",
123
+ "model.visual.blocks.12.mlp.gate_up_proj",
124
+ "model.visual.blocks.12.mlp.gate_proj",
125
+ "model.visual.blocks.12.mlp.up_proj",
126
+ "model.visual.blocks.12.mlp.down_proj",
127
+ "model.visual.blocks.13.attn.qkv_proj",
128
+ "model.visual.blocks.13.attn.qkv",
129
+ "model.visual.blocks.13.attn.proj",
130
+ "model.visual.blocks.13.mlp.gate_up_proj",
131
+ "model.visual.blocks.13.mlp.gate_proj",
132
+ "model.visual.blocks.13.mlp.up_proj",
133
+ "model.visual.blocks.13.mlp.down_proj",
134
+ "model.visual.blocks.14.attn.qkv_proj",
135
+ "model.visual.blocks.14.attn.qkv",
136
+ "model.visual.blocks.14.attn.proj",
137
+ "model.visual.blocks.14.mlp.gate_up_proj",
138
+ "model.visual.blocks.14.mlp.gate_proj",
139
+ "model.visual.blocks.14.mlp.up_proj",
140
+ "model.visual.blocks.14.mlp.down_proj",
141
+ "model.visual.blocks.15.attn.qkv_proj",
142
+ "model.visual.blocks.15.attn.qkv",
143
+ "model.visual.blocks.15.attn.proj",
144
+ "model.visual.blocks.15.mlp.gate_up_proj",
145
+ "model.visual.blocks.15.mlp.gate_proj",
146
+ "model.visual.blocks.15.mlp.up_proj",
147
+ "model.visual.blocks.15.mlp.down_proj",
148
+ "model.visual.blocks.16.attn.qkv_proj",
149
+ "model.visual.blocks.16.attn.qkv",
150
+ "model.visual.blocks.16.attn.proj",
151
+ "model.visual.blocks.16.mlp.gate_up_proj",
152
+ "model.visual.blocks.16.mlp.gate_proj",
153
+ "model.visual.blocks.16.mlp.up_proj",
154
+ "model.visual.blocks.16.mlp.down_proj",
155
+ "model.visual.blocks.17.attn.qkv_proj",
156
+ "model.visual.blocks.17.attn.qkv",
157
+ "model.visual.blocks.17.attn.proj",
158
+ "model.visual.blocks.17.mlp.gate_up_proj",
159
+ "model.visual.blocks.17.mlp.gate_proj",
160
+ "model.visual.blocks.17.mlp.up_proj",
161
+ "model.visual.blocks.17.mlp.down_proj",
162
+ "model.visual.blocks.18.attn.qkv_proj",
163
+ "model.visual.blocks.18.attn.qkv",
164
+ "model.visual.blocks.18.attn.proj",
165
+ "model.visual.blocks.18.mlp.gate_up_proj",
166
+ "model.visual.blocks.18.mlp.gate_proj",
167
+ "model.visual.blocks.18.mlp.up_proj",
168
+ "model.visual.blocks.18.mlp.down_proj",
169
+ "model.visual.blocks.19.attn.qkv_proj",
170
+ "model.visual.blocks.19.attn.qkv",
171
+ "model.visual.blocks.19.attn.proj",
172
+ "model.visual.blocks.19.mlp.gate_up_proj",
173
+ "model.visual.blocks.19.mlp.gate_proj",
174
+ "model.visual.blocks.19.mlp.up_proj",
175
+ "model.visual.blocks.19.mlp.down_proj",
176
+ "model.visual.blocks.20.attn.qkv_proj",
177
+ "model.visual.blocks.20.attn.qkv",
178
+ "model.visual.blocks.20.attn.proj",
179
+ "model.visual.blocks.20.mlp.gate_up_proj",
180
+ "model.visual.blocks.20.mlp.gate_proj",
181
+ "model.visual.blocks.20.mlp.up_proj",
182
+ "model.visual.blocks.20.mlp.down_proj",
183
+ "model.visual.blocks.21.attn.qkv_proj",
184
+ "model.visual.blocks.21.attn.qkv",
185
+ "model.visual.blocks.21.attn.proj",
186
+ "model.visual.blocks.21.mlp.gate_up_proj",
187
+ "model.visual.blocks.21.mlp.gate_proj",
188
+ "model.visual.blocks.21.mlp.up_proj",
189
+ "model.visual.blocks.21.mlp.down_proj",
190
+ "model.visual.blocks.22.attn.qkv_proj",
191
+ "model.visual.blocks.22.attn.qkv",
192
+ "model.visual.blocks.22.attn.proj",
193
+ "model.visual.blocks.22.mlp.gate_up_proj",
194
+ "model.visual.blocks.22.mlp.gate_proj",
195
+ "model.visual.blocks.22.mlp.up_proj",
196
+ "model.visual.blocks.22.mlp.down_proj",
197
+ "model.visual.blocks.23.attn.qkv_proj",
198
+ "model.visual.blocks.23.attn.qkv",
199
+ "model.visual.blocks.23.attn.proj",
200
+ "model.visual.blocks.23.mlp.gate_up_proj",
201
+ "model.visual.blocks.23.mlp.gate_proj",
202
+ "model.visual.blocks.23.mlp.up_proj",
203
+ "model.visual.blocks.23.mlp.down_proj",
204
+ "model.visual.merger.proj",
205
+ "model.visual.merger.gate_up_proj",
206
+ "model.visual.merger.gate_proj",
207
+ "model.visual.merger.up_proj",
208
+ "model.visual.merger.down_proj",
209
+ "lm_head"
210
+ ],
211
+ "kv_cache_scheme": null,
212
+ "quant_method": "compressed-tensors",
213
+ "quantization_status": "compressed",
214
+ "sparsity_config": {},
215
+ "transform_config": {},
216
+ "version": "0.12.3.a20251203"
217
+ },
218
+ "text_config": {
219
+ "attention_bias": true,
220
+ "attention_dropout": 0.0,
221
+ "dtype": "bfloat16",
222
+ "eos_token_id": [
223
+ 151329,
224
+ 151336,
225
+ 151338
226
+ ],
227
+ "hidden_act": "silu",
228
+ "hidden_size": 4096,
229
+ "image_token_id": null,
230
+ "initializer_range": 0.02,
231
+ "intermediate_size": 13696,
232
+ "max_position_embeddings": 131072,
233
+ "model_type": "glm4v_text",
234
+ "num_attention_heads": 32,
235
+ "num_hidden_layers": 40,
236
+ "num_key_value_heads": 2,
237
+ "pad_token_id": 151329,
238
+ "rms_norm_eps": 1e-05,
239
+ "rope_parameters": {
240
+ "mrope_section": [
241
+ 8,
242
+ 12,
243
+ 12
244
+ ],
245
+ "partial_rotary_factor": 0.5,
246
+ "rope_theta": 500000,
247
+ "rope_type": "default"
248
+ },
249
+ "use_cache": true,
250
+ "vocab_size": 151552
251
+ },
252
+ "tie_word_embeddings": false,
253
+ "transformers_version": "4.57.3",
254
+ "video_end_token_id": 151342,
255
+ "video_start_token_id": 151341,
256
+ "video_token_id": 151364,
257
+ "vision_config": {
258
+ "attention_bias": false,
259
+ "attention_dropout": 0.0,
260
+ "depth": 24,
261
+ "hidden_act": "silu",
262
+ "hidden_dropout_prob": 0.0,
263
+ "hidden_size": 1536,
264
+ "image_size": 336,
265
+ "in_channels": 3,
266
+ "initializer_range": 0.02,
267
+ "intermediate_size": 13696,
268
+ "model_type": "glm4v",
269
+ "num_heads": 12,
270
+ "out_hidden_size": 4096,
271
+ "patch_size": 14,
272
+ "rms_norm_eps": 1e-05,
273
+ "spatial_merge_size": 2,
274
+ "temporal_patch_size": 2
275
+ }
276
+ }
generation_config.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151329,
6
+ 151336,
7
+ 151338,
8
+ 151348
9
+ ],
10
+ "pad_token_id": 151329,
11
+ "temperature": 0.8,
12
+ "top_k": 2,
13
+ "top_p": 0.6,
14
+ "transformers_version": "4.57.3"
15
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ae8d7797802c40d5bcf90a92e0de138034a1f648f0a653d6cf7242a28ad243a
3
+ size 4997257728
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c410e1dbcc49ef10d2677620a41fe11ffb2e25bf34aa1cd3575b36ede05ac68f
3
+ size 4985022776
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b7696d696514be0224b306e1d42f97b7318259644fb92b19e6d787cec2fabe2
3
+ size 2955378312
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "size": {"shortest_edge": 12544, "longest_edge": 9633792},
3
+ "do_rescale": true,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [0.48145466, 0.4578275, 0.40821073],
8
+ "image_std": [0.26862954, 0.26130258, 0.27577711],
9
+ "image_processor_type": "Glm46VImageProcessor",
10
+ "processor_class": "Glm46VProcessor"
11
+ }
recipe.yaml ADDED
@@ -0,0 +1,37 @@
1
+ default_stage:
2
+ default_modifiers:
3
+ AWQModifier:
4
+ config_groups:
5
+ group_0:
6
+ targets: [Linear]
7
+ weights:
8
+ num_bits: 8
9
+ type: int
10
+ symmetric: true
11
+ group_size: 32
12
+ strategy: group
13
+ block_structure: null
14
+ dynamic: false
15
+ actorder: null
16
+ scale_dtype: null
17
+ zp_dtype: null
18
+ observer: mse
19
+ observer_kwargs: {}
20
+ input_activations: null
21
+ output_activations: null
22
+ format: null
23
+ targets: [Linear]
24
+ ignore: [lm_head, 're:.*embed_tokens', 're:.*input_layernorm', 're:.*post_attention_layernorm',
25
+ model.language_model.norm, 're:.*mlp[.]gate$', 're:model[.]visual.*']
26
+ mappings:
27
+ - smooth_layer: re:.*input_layernorm$
28
+ balance_layers: ['re:.*q_proj$', 're:.*k_proj$', 're:.*v_proj$']
29
+ - smooth_layer: re:.*v_proj$
30
+ balance_layers: ['re:.*o_proj$']
31
+ - smooth_layer: re:.*post_attention_layernorm$
32
+ balance_layers: ['re:.*gate_proj$', 're:.*up_proj$']
33
+ - smooth_layer: re:.*up_proj$
34
+ balance_layers: ['re:.*down_proj$']
35
+ offload_device: !!python/object/apply:torch.device [cpu]
36
+ duo_scaling: true
37
+ n_grid: 20
special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "[MASK]",
5
+ "[gMASK]",
6
+ "[sMASK]",
7
+ "<sop>",
8
+ "<eop>",
9
+ "<|system|>",
10
+ "<|user|>",
11
+ "<|assistant|>",
12
+ "<|observation|>",
13
+ "<|begin_of_image|>",
14
+ "<|end_of_image|>",
15
+ "<|begin_of_video|>",
16
+ "<|end_of_video|>",
17
+ "<|begin_of_audio|>",
18
+ "<|end_of_audio|>",
19
+ "<|image|>",
20
+ "<|video|>",
21
+ "<|begin_of_transcription|>",
22
+ "<|end_of_transcription|>",
23
+ "<|code_prefix|>",
24
+ "<|code_middle|>",
25
+ "<|code_suffix|>",
26
+ "/nothink"
27
+ ],
28
+ "eos_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2ff52959093921034528ecd6a59926e5fd543f56f94f2a0034ed4ba458c0a86
3
+ size 19970698
tokenizer_config.json ADDED
@@ -0,0 +1,327 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "151329": {
4
+ "content": "<|endoftext|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "151330": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "151331": {
20
+ "content": "[gMASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "151332": {
28
+ "content": "[sMASK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "151333": {
36
+ "content": "<sop>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "151334": {
44
+ "content": "<eop>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "151335": {
52
+ "content": "<|system|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "151336": {
60
+ "content": "<|user|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "151337": {
68
+ "content": "<|assistant|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "151338": {
76
+ "content": "<|observation|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "151339": {
84
+ "content": "<|begin_of_image|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "151340": {
92
+ "content": "<|end_of_image|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "151341": {
100
+ "content": "<|begin_of_video|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "151342": {
108
+ "content": "<|end_of_video|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "151343": {
116
+ "content": "<|begin_of_audio|>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "151344": {
124
+ "content": "<|end_of_audio|>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "151345": {
132
+ "content": "<|begin_of_transcription|>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "151346": {
140
+ "content": "<|end_of_transcription|>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "151347": {
148
+ "content": "<|code_prefix|>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "151348": {
156
+ "content": "<|code_middle|>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "151349": {
164
+ "content": "<|code_suffix|>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "151350": {
172
+ "content": "<think>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "151351": {
180
+ "content": "</think>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "151352": {
188
+ "content": "<tool_call>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "151353": {
196
+ "content": "</tool_call>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "151354": {
204
+ "content": "<tool_response>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "151355": {
212
+ "content": "</tool_response>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "151356": {
220
+ "content": "<arg_key>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": false
226
+ },
227
+ "151357": {
228
+ "content": "</arg_key>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": false
234
+ },
235
+ "151358": {
236
+ "content": "<arg_value>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": false
242
+ },
243
+ "151359": {
244
+ "content": "</arg_value>",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": false
250
+ },
251
+ "151360": {
252
+ "content": "/nothink",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "151361": {
260
+ "content": "<|begin_of_box|>",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": false
266
+ },
267
+ "151362": {
268
+ "content": "<|end_of_box|>",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": false
274
+ },
275
+ "151363": {
276
+ "content": "<|image|>",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "151364": {
284
+ "content": "<|video|>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ }
291
+ },
292
+ "additional_special_tokens": [
293
+ "<|endoftext|>",
294
+ "[MASK]",
295
+ "[gMASK]",
296
+ "[sMASK]",
297
+ "<sop>",
298
+ "<eop>",
299
+ "<|system|>",
300
+ "<|user|>",
301
+ "<|assistant|>",
302
+ "<|observation|>",
303
+ "<|begin_of_image|>",
304
+ "<|end_of_image|>",
305
+ "<|begin_of_video|>",
306
+ "<|end_of_video|>",
307
+ "<|begin_of_audio|>",
308
+ "<|end_of_audio|>",
309
+ "<|image|>",
310
+ "<|video|>",
311
+ "<|begin_of_transcription|>",
312
+ "<|end_of_transcription|>",
313
+ "<|code_prefix|>",
314
+ "<|code_middle|>",
315
+ "<|code_suffix|>",
316
+ "/nothink"
317
+ ],
318
+ "clean_up_tokenization_spaces": false,
319
+ "do_lower_case": false,
320
+ "eos_token": "<|endoftext|>",
321
+ "extra_special_tokens": {},
322
+ "model_max_length": 128000,
323
+ "pad_token": "<|endoftext|>",
324
+ "padding_side": "left",
325
+ "remove_space": false,
326
+ "tokenizer_class": "PreTrainedTokenizerFast"
327
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "size": {"shortest_edge": 12544, "longest_edge": 100352000},
3
+ "do_rescale": true,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [0.48145466, 0.4578275, 0.40821073],
8
+ "image_std": [0.26862954, 0.26130258, 0.27577711],
9
+ "video_processor_type": "Glm46VVideoProcessor",
10
+ "processor_class": "Glm46VProcessor"
11
+ }