jingwwu committed on
Commit 295118d · 1 Parent(s): 16c6dad

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/comparision.png filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,92 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ pipeline_tag: text-to-image
+ library_name: transformers
+ ---
+
+ ## NextStep-1.1
+
+ [Homepage](https://stepfun.ai/research/en/nextstep-1)
+ | [GitHub](https://github.com/stepfun-ai/NextStep-1)
+ | [Paper](https://arxiv.org/abs/2508.10711)
+
+ We introduce **NextStep-1.1**, a new model that represents a significant leap forward in the NextStep series. This version effectively resolves the visualization failures seen in **NextStep-1** and substantially elevates image quality through extended training and a Flow-based Reinforcement Learning (RL) post-training paradigm.
+
+ <div align='center'>
+ <img src="assets/comparision.png" class="interpolation-image" alt="arch." width="100%" />
+ </div>
+
+ ## What's New in 1.1?
+
+ NextStep-1.1 is not just a fine-tune; it is a re-engineered version focused on stability and high-fidelity output. Key improvements include:
+
+ - RL-Enhanced Visual Fidelity: RL post-training markedly improves image texture and substantially reduces visual artifacts, yielding cleaner, more professional outputs.
+
+ - Technical Stability: Resolves the numerical instability inherent in RL training of autoregressive flow-based models.
+
+ ## Environment Setup
+
+ To avoid potential errors when loading and running the model, we recommend the following setup:
+
+ ```shell
+ conda create -n nextstep python=3.11 -y
+ conda activate nextstep
+
+ pip install uv # optional
+
+ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/stepfun-ai/NextStep-1.1-Pretrain && cd NextStep-1.1-Pretrain
+ uv pip install -r requirements.txt
+
+ hf download stepfun-ai/NextStep-1.1-Pretrain "vae/checkpoint.pt" --local-dir ./
+ ```
+
+ ## Usage
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ from models.gen_pipeline import NextStepPipeline
+
+ HF_HUB = "stepfun-ai/NextStep-1.1-Pretrain"
+
+ # load model and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(HF_HUB, local_files_only=True, trust_remote_code=True)
+ model = AutoModel.from_pretrained(HF_HUB, local_files_only=True, trust_remote_code=True)
+ pipeline = NextStepPipeline(tokenizer=tokenizer, model=model).to(device="cuda", dtype=torch.bfloat16)
+
+ # set prompts
+ positive_prompt = ""
+ negative_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry."
+ example_prompt = "A REALISTIC PHOTOGRAPH OF A WALL WITH \"TOWARD AUTOREGRESSIVE IMAGE GENERATION WITH CONTINUOUS TOKENS AT SCALE\" PROMINENTLY DISPLAYED"
+
+ # generate image from text
+ IMG_SIZE = 512
+ image = pipeline.generate_image(
+     example_prompt,
+     hw=(IMG_SIZE, IMG_SIZE),
+     num_images_per_caption=1,
+     positive_prompt=positive_prompt,
+     negative_prompt=negative_prompt,
+     cfg=7.5,
+     cfg_img=1.0,
+     cfg_schedule="constant",
+     use_norm=False,
+     num_sampling_steps=28,
+     timesteps_shift=1.0,
+     seed=3407,
+ )[0]
+ image.save("./assets/output.jpg")
+ ```
+
+ ## Citation
+
+ If you find NextStep useful for your research and applications, please consider starring this repository and citing:
+
+ ```bibtex
+ @article{nextstepteam2025nextstep1,
+   title={NextStep-1: Toward Autoregressive Image Generation with Continuous Tokens at Scale},
+   author={NextStep Team and Chunrui Han and Guopeng Li and Jingwei Wu and Quan Sun and Yan Cai and Yuang Peng and Zheng Ge and Deyu Zhou and Haomiao Tang and Hongyu Zhou and Kenkun Liu and Ailin Huang and Bin Wang and Changxin Miao and Deshan Sun and En Yu and Fukun Yin and Gang Yu and Hao Nie and Haoran Lv and Hanpeng Hu and Jia Wang and Jian Zhou and Jianjian Sun and Kaijun Tan and Kang An and Kangheng Lin and Liang Zhao and Mei Chen and Peng Xing and Rui Wang and Shiyu Liu and Shutao Xia and Tianhao You and Wei Ji and Xianfang Zeng and Xin Han and Xuelin Zhang and Yana Wei and Yanming Xu and Yimin Jiang and Yingming Wang and Yu Zhou and Yucheng Han and Ziyang Meng and Binxing Jiao and Daxin Jiang and Xiangyu Zhang and Yibo Zhu},
+   journal={arXiv preprint arXiv:2508.10711},
+   year={2025}
+ }
+ ```
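
As a hedged follow-up to the Usage block above, a small sweep over seeds and guidance scales might look like the sketch below. It reuses the `pipeline`, `example_prompt`, `positive_prompt`, and `negative_prompt` defined in that block, and the specific seed and cfg values are illustrative assumptions, not recommended settings.

```python
# Hedged sketch: sweep a few seeds and guidance scales with the generate_image
# call shown in the Usage block above. `pipeline`, `example_prompt`,
# `positive_prompt`, and `negative_prompt` are assumed to be defined as in that
# block; the seed and cfg values below are illustrative only.
import itertools

IMG_SIZE = 512
seeds = [3407, 42, 2024]
cfg_scales = [5.0, 7.5]

for seed, cfg in itertools.product(seeds, cfg_scales):
    image = pipeline.generate_image(
        example_prompt,
        hw=(IMG_SIZE, IMG_SIZE),
        num_images_per_caption=1,
        positive_prompt=positive_prompt,
        negative_prompt=negative_prompt,
        cfg=cfg,
        cfg_img=1.0,
        cfg_schedule="constant",
        use_norm=False,
        num_sampling_steps=28,
        timesteps_shift=1.0,
        seed=seed,
    )[0]
    image.save(f"./assets/output_seed{seed}_cfg{cfg}.jpg")
```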
added_tokens.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|begin_of_image|>": 151667,
+   "<|begin_of_prompt_refinement|>": 151670,
+   "<|begin_of_thinking|>": 151672,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|end_of_image|>": 151668,
+   "<|end_of_prompt_refinement|>": 151671,
+   "<|end_of_thinking|>": 151673,
+   "<|beginoftext|>": 151674,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_area|>": 151666,
+   "<|image_pad|>": 151655,
+   "<|image_placeholder|>": 151669,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652,
+   "[PAD]": 151665
+ }
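
added_tokens.json registers the image-, thinking-, and prompt-refinement-related special tokens (IDs 151643-151674) on top of the base vocabulary. A minimal sanity-check sketch, assuming the tokenizer loads with trust_remote_code as in the README, confirming a few of the IDs listed above:

```python
# Sketch: check that a few of the special tokens added in added_tokens.json
# resolve to the listed IDs through the released tokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("stepfun-ai/NextStep-1.1-Pretrain", trust_remote_code=True)

expected = {
    "<|begin_of_image|>": 151667,
    "<|end_of_image|>": 151668,
    "<|image_placeholder|>": 151669,
    "[PAD]": 151665,
}
for token, token_id in expected.items():
    assert tok.convert_tokens_to_ids(token) == token_id, (token, token_id)
print("special-token IDs match added_tokens.json")
```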
assets/comparision.png ADDED

Git LFS Details

  • SHA256: c03496181fccd0cb84da7554c305cdeaf7f7f4e4af41b73fe97f36b7626504dd
  • Pointer size: 133 Bytes
  • Size of remote file: 16.5 MB
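
Because assets/comparision.png is tracked by Git LFS (see the .gitattributes change above), a clone with GIT_LFS_SKIP_SMUDGE=1 leaves only a pointer file in place of the image. A small sketch of fetching just this asset with huggingface_hub's hf_hub_download; the repo id is taken from the README, everything else is the standard Hub API:

```python
# Sketch: download only the LFS-backed comparison figure (about 16.5 MB per the
# LFS details above) without pulling the multi-gigabyte model shards.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="stepfun-ai/NextStep-1.1-Pretrain",
    filename="assets/comparision.png",  # filename as committed in this repo
)
print(path)
```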
config.json ADDED
@@ -0,0 +1,66 @@
+ {
+   "_attn_implementation_autoset": true,
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "models/config.NextStepConfig",
+     "AutoModel": "models/nextstep_model.NextStep"
+   },
+   "attention_bias": true,
+   "attention_dropout": 0.0,
+   "base_image_grid_size": 64,
+   "boi": 151667,
+   "bos_token_id": 151643,
+   "create_kwargs": {
+     "snr_type": "lognorm"
+   },
+   "eoi": 151668,
+   "eos_token_id": 151643,
+   "genloss_batch_mul": 4,
+   "genloss_depth": 12,
+   "genloss_net_arch": "mlp",
+   "genloss_num_sampling_steps": "100",
+   "genloss_type": "transport",
+   "genloss_width": 1536,
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 5120,
+   "image_decoder_arch": "Trans_E",
+   "image_encoder_name": null,
+   "image_feature_layer": -2,
+   "image_loss_weight": 1.0,
+   "image_placeholder_id": 151669,
+   "image_size": 64,
+   "initializer_range": 0.02,
+   "intermediate_size": 13824,
+   "lm_loss_weight": 0.01,
+   "max_position_embeddings": 131072,
+   "max_window_layers": 48,
+   "mlp_bias": false,
+   "model_type": "nextstep",
+   "noise_strength": 0.0,
+   "num_attention_heads": 40,
+   "num_channels": 16,
+   "num_hidden_layers": 48,
+   "num_key_value_heads": 8,
+   "o_attention_bias": false,
+   "pad_token_id_added": 151665,
+   "patch_size": 2,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": 131072,
+   "tie_word_embeddings": false,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.55.0",
+   "use_2d_rope": false,
+   "use_cache": true,
+   "use_gen_pos_embed": false,
+   "use_mlp_before_lm_head": false,
+   "use_sliding_window": false,
+   "use_token_length_weight": false,
+   "vae_name_or_path": "vae/",
+   "vocab_size": 152064
+ }
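
Since model_type is "nextstep" and auto_map points at classes shipped inside this repository, the configuration only resolves with trust_remote_code=True. Below is a minimal sketch of inspecting a few fields from the config above without downloading the weight shards; it assumes the custom config class exposes these JSON entries as attributes, as PretrainedConfig subclasses normally do.

```python
# Sketch: load the configuration only (no weights) and read fields that appear
# in config.json above. trust_remote_code=True is required because the
# "nextstep" model_type is defined by code in the repository (see auto_map).
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("stepfun-ai/NextStep-1.1-Pretrain", trust_remote_code=True)

print(cfg.model_type)          # "nextstep"
print(cfg.hidden_size)         # 5120
print(cfg.num_hidden_layers)   # 48
print(cfg.num_attention_heads, cfg.num_key_value_heads)  # 40, 8
print(cfg.image_size, cfg.patch_size, cfg.num_channels)  # 64, 2, 16
```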
model.safetensors.index.json ADDED
@@ -0,0 +1,698 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 29907628160
4
+ },
5
+ "weight_map": {
6
+ "embed_tokens.weight": "pytorch-model-00004-of-00004.safetensors",
7
+ "image_head.net.cond_embed.bias": "pytorch-model-00003-of-00004.safetensors",
8
+ "image_head.net.cond_embed.weight": "pytorch-model-00003-of-00004.safetensors",
9
+ "image_head.net.final_layer.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
10
+ "image_head.net.final_layer.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
11
+ "image_head.net.final_layer.linear.bias": "pytorch-model-00003-of-00004.safetensors",
12
+ "image_head.net.final_layer.linear.weight": "pytorch-model-00003-of-00004.safetensors",
13
+ "image_head.net.input_proj.bias": "pytorch-model-00003-of-00004.safetensors",
14
+ "image_head.net.input_proj.weight": "pytorch-model-00003-of-00004.safetensors",
15
+ "image_head.net.res_blocks.0.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
16
+ "image_head.net.res_blocks.0.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
17
+ "image_head.net.res_blocks.0.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
18
+ "image_head.net.res_blocks.0.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
19
+ "image_head.net.res_blocks.0.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
20
+ "image_head.net.res_blocks.0.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
21
+ "image_head.net.res_blocks.0.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
22
+ "image_head.net.res_blocks.0.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
23
+ "image_head.net.res_blocks.1.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
24
+ "image_head.net.res_blocks.1.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
25
+ "image_head.net.res_blocks.1.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
26
+ "image_head.net.res_blocks.1.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
27
+ "image_head.net.res_blocks.1.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
28
+ "image_head.net.res_blocks.1.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
29
+ "image_head.net.res_blocks.1.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
30
+ "image_head.net.res_blocks.1.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
31
+ "image_head.net.res_blocks.10.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
32
+ "image_head.net.res_blocks.10.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
33
+ "image_head.net.res_blocks.10.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
34
+ "image_head.net.res_blocks.10.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
35
+ "image_head.net.res_blocks.10.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
36
+ "image_head.net.res_blocks.10.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
37
+ "image_head.net.res_blocks.10.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
38
+ "image_head.net.res_blocks.10.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
39
+ "image_head.net.res_blocks.11.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
40
+ "image_head.net.res_blocks.11.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
41
+ "image_head.net.res_blocks.11.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
42
+ "image_head.net.res_blocks.11.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
43
+ "image_head.net.res_blocks.11.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
44
+ "image_head.net.res_blocks.11.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
45
+ "image_head.net.res_blocks.11.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
46
+ "image_head.net.res_blocks.11.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
47
+ "image_head.net.res_blocks.2.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
48
+ "image_head.net.res_blocks.2.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
49
+ "image_head.net.res_blocks.2.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
50
+ "image_head.net.res_blocks.2.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
51
+ "image_head.net.res_blocks.2.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
52
+ "image_head.net.res_blocks.2.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
53
+ "image_head.net.res_blocks.2.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
54
+ "image_head.net.res_blocks.2.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
55
+ "image_head.net.res_blocks.3.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
56
+ "image_head.net.res_blocks.3.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
57
+ "image_head.net.res_blocks.3.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
58
+ "image_head.net.res_blocks.3.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
59
+ "image_head.net.res_blocks.3.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
60
+ "image_head.net.res_blocks.3.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
61
+ "image_head.net.res_blocks.3.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
62
+ "image_head.net.res_blocks.3.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
63
+ "image_head.net.res_blocks.4.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
64
+ "image_head.net.res_blocks.4.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
65
+ "image_head.net.res_blocks.4.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
66
+ "image_head.net.res_blocks.4.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
67
+ "image_head.net.res_blocks.4.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
68
+ "image_head.net.res_blocks.4.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
69
+ "image_head.net.res_blocks.4.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
70
+ "image_head.net.res_blocks.4.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
71
+ "image_head.net.res_blocks.5.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
72
+ "image_head.net.res_blocks.5.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
73
+ "image_head.net.res_blocks.5.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
74
+ "image_head.net.res_blocks.5.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
75
+ "image_head.net.res_blocks.5.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
76
+ "image_head.net.res_blocks.5.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
77
+ "image_head.net.res_blocks.5.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
78
+ "image_head.net.res_blocks.5.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
79
+ "image_head.net.res_blocks.6.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
80
+ "image_head.net.res_blocks.6.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
81
+ "image_head.net.res_blocks.6.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
82
+ "image_head.net.res_blocks.6.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
83
+ "image_head.net.res_blocks.6.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
84
+ "image_head.net.res_blocks.6.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
85
+ "image_head.net.res_blocks.6.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
86
+ "image_head.net.res_blocks.6.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
87
+ "image_head.net.res_blocks.7.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
88
+ "image_head.net.res_blocks.7.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
89
+ "image_head.net.res_blocks.7.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
90
+ "image_head.net.res_blocks.7.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
91
+ "image_head.net.res_blocks.7.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
92
+ "image_head.net.res_blocks.7.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
93
+ "image_head.net.res_blocks.7.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
94
+ "image_head.net.res_blocks.7.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
95
+ "image_head.net.res_blocks.8.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
96
+ "image_head.net.res_blocks.8.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
97
+ "image_head.net.res_blocks.8.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
98
+ "image_head.net.res_blocks.8.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
99
+ "image_head.net.res_blocks.8.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
100
+ "image_head.net.res_blocks.8.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
101
+ "image_head.net.res_blocks.8.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
102
+ "image_head.net.res_blocks.8.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
103
+ "image_head.net.res_blocks.9.adaLN_modulation.1.bias": "pytorch-model-00003-of-00004.safetensors",
104
+ "image_head.net.res_blocks.9.adaLN_modulation.1.weight": "pytorch-model-00003-of-00004.safetensors",
105
+ "image_head.net.res_blocks.9.in_ln.bias": "pytorch-model-00003-of-00004.safetensors",
106
+ "image_head.net.res_blocks.9.in_ln.weight": "pytorch-model-00003-of-00004.safetensors",
107
+ "image_head.net.res_blocks.9.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
108
+ "image_head.net.res_blocks.9.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
109
+ "image_head.net.res_blocks.9.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
110
+ "image_head.net.res_blocks.9.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
111
+ "image_head.net.time_embed.mlp.0.bias": "pytorch-model-00003-of-00004.safetensors",
112
+ "image_head.net.time_embed.mlp.0.weight": "pytorch-model-00003-of-00004.safetensors",
113
+ "image_head.net.time_embed.mlp.2.bias": "pytorch-model-00003-of-00004.safetensors",
114
+ "image_head.net.time_embed.mlp.2.weight": "pytorch-model-00003-of-00004.safetensors",
115
+ "image_in_projector.bias": "pytorch-model-00003-of-00004.safetensors",
116
+ "image_in_projector.weight": "pytorch-model-00003-of-00004.safetensors",
117
+ "image_out_projector.bias": "pytorch-model-00003-of-00004.safetensors",
118
+ "image_out_projector.weight": "pytorch-model-00003-of-00004.safetensors",
119
+ "layers.0.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
120
+ "layers.0.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
121
+ "layers.0.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
122
+ "layers.0.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
123
+ "layers.0.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
124
+ "layers.0.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
125
+ "layers.0.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
126
+ "layers.0.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
127
+ "layers.0.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
128
+ "layers.0.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
129
+ "layers.0.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
130
+ "layers.0.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
131
+ "layers.1.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
132
+ "layers.1.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
133
+ "layers.1.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
134
+ "layers.1.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
135
+ "layers.1.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
136
+ "layers.1.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
137
+ "layers.1.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
138
+ "layers.1.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
139
+ "layers.1.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
140
+ "layers.1.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
141
+ "layers.1.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
142
+ "layers.1.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
143
+ "layers.10.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
144
+ "layers.10.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
145
+ "layers.10.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
146
+ "layers.10.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
147
+ "layers.10.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
148
+ "layers.10.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
149
+ "layers.10.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
150
+ "layers.10.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
151
+ "layers.10.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
152
+ "layers.10.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
153
+ "layers.10.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
154
+ "layers.10.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
155
+ "layers.11.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
156
+ "layers.11.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
157
+ "layers.11.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
158
+ "layers.11.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
159
+ "layers.11.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
160
+ "layers.11.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
161
+ "layers.11.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
162
+ "layers.11.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
163
+ "layers.11.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
164
+ "layers.11.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
165
+ "layers.11.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
166
+ "layers.11.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
167
+ "layers.12.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
168
+ "layers.12.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
169
+ "layers.12.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
170
+ "layers.12.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
171
+ "layers.12.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
172
+ "layers.12.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
173
+ "layers.12.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
174
+ "layers.12.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
175
+ "layers.12.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
176
+ "layers.12.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
177
+ "layers.12.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
178
+ "layers.12.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
179
+ "layers.13.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
180
+ "layers.13.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
181
+ "layers.13.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
182
+ "layers.13.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
183
+ "layers.13.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
184
+ "layers.13.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
185
+ "layers.13.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
186
+ "layers.13.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
187
+ "layers.13.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
188
+ "layers.13.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
189
+ "layers.13.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
190
+ "layers.13.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
191
+ "layers.14.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
192
+ "layers.14.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
193
+ "layers.14.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
194
+ "layers.14.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
195
+ "layers.14.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
196
+ "layers.14.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
197
+ "layers.14.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
198
+ "layers.14.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
199
+ "layers.14.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
200
+ "layers.14.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
201
+ "layers.14.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
202
+ "layers.14.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
203
+ "layers.15.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
204
+ "layers.15.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
205
+ "layers.15.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
206
+ "layers.15.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
207
+ "layers.15.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
208
+ "layers.15.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
209
+ "layers.15.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
210
+ "layers.15.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
211
+ "layers.15.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
212
+ "layers.15.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
213
+ "layers.15.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
214
+ "layers.15.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
215
+ "layers.16.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
216
+ "layers.16.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
217
+ "layers.16.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
218
+ "layers.16.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
219
+ "layers.16.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
220
+ "layers.16.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
221
+ "layers.16.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
222
+ "layers.16.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
223
+ "layers.16.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
224
+ "layers.16.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
225
+ "layers.16.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
226
+ "layers.16.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
227
+ "layers.17.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
228
+ "layers.17.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
229
+ "layers.17.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
230
+ "layers.17.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
231
+ "layers.17.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
232
+ "layers.17.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
233
+ "layers.17.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
234
+ "layers.17.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
235
+ "layers.17.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
236
+ "layers.17.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
237
+ "layers.17.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
238
+ "layers.17.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
239
+ "layers.18.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
240
+ "layers.18.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
241
+ "layers.18.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
242
+ "layers.18.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
243
+ "layers.18.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
244
+ "layers.18.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
245
+ "layers.18.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
246
+ "layers.18.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
247
+ "layers.18.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
248
+ "layers.18.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
249
+ "layers.18.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
250
+ "layers.18.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
251
+ "layers.19.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
252
+ "layers.19.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
253
+ "layers.19.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
254
+ "layers.19.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
255
+ "layers.19.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
256
+ "layers.19.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
257
+ "layers.19.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
258
+ "layers.19.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
259
+ "layers.19.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
260
+ "layers.19.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
261
+ "layers.19.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
262
+ "layers.19.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
263
+ "layers.2.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
264
+ "layers.2.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
265
+ "layers.2.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
266
+ "layers.2.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
267
+ "layers.2.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
268
+ "layers.2.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
269
+ "layers.2.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
270
+ "layers.2.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
271
+ "layers.2.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
272
+ "layers.2.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
273
+ "layers.2.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
274
+ "layers.2.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
275
+ "layers.20.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
276
+ "layers.20.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
277
+ "layers.20.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
278
+ "layers.20.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
279
+ "layers.20.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
280
+ "layers.20.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
281
+ "layers.20.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
282
+ "layers.20.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
283
+ "layers.20.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
284
+ "layers.20.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
285
+ "layers.20.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
286
+ "layers.20.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
287
+ "layers.21.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
288
+ "layers.21.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
289
+ "layers.21.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
290
+ "layers.21.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
291
+ "layers.21.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
292
+ "layers.21.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
293
+ "layers.21.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
294
+ "layers.21.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
295
+ "layers.21.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
296
+ "layers.21.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
297
+ "layers.21.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
298
+ "layers.21.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
299
+ "layers.22.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
300
+ "layers.22.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
301
+ "layers.22.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
302
+ "layers.22.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
303
+ "layers.22.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
304
+ "layers.22.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
305
+ "layers.22.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
306
+ "layers.22.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
307
+ "layers.22.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
308
+ "layers.22.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
309
+ "layers.22.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
310
+ "layers.22.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
311
+ "layers.23.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
312
+ "layers.23.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
313
+ "layers.23.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
314
+ "layers.23.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
315
+ "layers.23.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
316
+ "layers.23.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
317
+ "layers.23.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
318
+ "layers.23.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
319
+ "layers.23.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
320
+ "layers.23.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
321
+ "layers.23.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
322
+ "layers.23.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
323
+ "layers.24.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
324
+ "layers.24.mlp.down_proj.weight": "pytorch-model-00001-of-00004.safetensors",
325
+ "layers.24.mlp.gate_proj.weight": "pytorch-model-00001-of-00004.safetensors",
326
+ "layers.24.mlp.up_proj.weight": "pytorch-model-00001-of-00004.safetensors",
327
+ "layers.24.post_attention_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
328
+ "layers.24.self_attn.k_proj.bias": "pytorch-model-00001-of-00004.safetensors",
329
+ "layers.24.self_attn.k_proj.weight": "pytorch-model-00001-of-00004.safetensors",
330
+ "layers.24.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
331
+ "layers.24.self_attn.q_proj.bias": "pytorch-model-00001-of-00004.safetensors",
332
+ "layers.24.self_attn.q_proj.weight": "pytorch-model-00001-of-00004.safetensors",
333
+ "layers.24.self_attn.v_proj.bias": "pytorch-model-00001-of-00004.safetensors",
334
+ "layers.24.self_attn.v_proj.weight": "pytorch-model-00001-of-00004.safetensors",
335
+ "layers.25.input_layernorm.weight": "pytorch-model-00001-of-00004.safetensors",
336
+ "layers.25.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
337
+ "layers.25.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
338
+ "layers.25.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
339
+ "layers.25.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
340
+ "layers.25.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
341
+ "layers.25.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
342
+ "layers.25.self_attn.o_proj.weight": "pytorch-model-00001-of-00004.safetensors",
343
+ "layers.25.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
344
+ "layers.25.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
345
+ "layers.25.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
346
+ "layers.25.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
347
+ "layers.26.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
348
+ "layers.26.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
349
+ "layers.26.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
350
+ "layers.26.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
351
+ "layers.26.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
352
+ "layers.26.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
353
+ "layers.26.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
354
+ "layers.26.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
355
+ "layers.26.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
356
+ "layers.26.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
357
+ "layers.26.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
358
+ "layers.26.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
359
+ "layers.27.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
360
+ "layers.27.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
361
+ "layers.27.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
362
+ "layers.27.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
363
+ "layers.27.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
364
+ "layers.27.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
365
+ "layers.27.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
366
+ "layers.27.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
367
+ "layers.27.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
368
+ "layers.27.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
369
+ "layers.27.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
370
+ "layers.27.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
371
+ "layers.28.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
372
+ "layers.28.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
373
+ "layers.28.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
374
+ "layers.28.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
375
+ "layers.28.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
376
+ "layers.28.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
377
+ "layers.28.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
378
+ "layers.28.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
379
+ "layers.28.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
380
+ "layers.28.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
381
+ "layers.28.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
382
+ "layers.28.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
383
+ "layers.29.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
384
+ "layers.29.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
385
+ "layers.29.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
386
+ "layers.29.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
387
+ "layers.29.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
388
+ "layers.29.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
389
+ "layers.29.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
390
+ "layers.29.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
391
+ "layers.29.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
392
+ "layers.29.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
393
+ "layers.29.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
394
+ "layers.29.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
395
+ "layers.3.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
396
+ "layers.3.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
397
+ "layers.3.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
398
+ "layers.3.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
399
+ "layers.3.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
400
+ "layers.3.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
401
+ "layers.3.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
402
+ "layers.3.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
403
+ "layers.3.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
404
+ "layers.3.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
405
+ "layers.3.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
406
+ "layers.3.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
407
+ "layers.30.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
408
+ "layers.30.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
409
+ "layers.30.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
410
+ "layers.30.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
411
+ "layers.30.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
412
+ "layers.30.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
413
+ "layers.30.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
414
+ "layers.30.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
415
+ "layers.30.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
416
+ "layers.30.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
417
+ "layers.30.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
418
+ "layers.30.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
419
+ "layers.31.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
420
+ "layers.31.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
421
+ "layers.31.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
422
+ "layers.31.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
423
+ "layers.31.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
424
+ "layers.31.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
425
+ "layers.31.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
426
+ "layers.31.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
427
+ "layers.31.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
428
+ "layers.31.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
429
+ "layers.31.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
430
+ "layers.31.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
431
+ "layers.32.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
432
+ "layers.32.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
433
+ "layers.32.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
434
+ "layers.32.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
435
+ "layers.32.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
436
+ "layers.32.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
437
+ "layers.32.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
438
+ "layers.32.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
439
+ "layers.32.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
440
+ "layers.32.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
441
+ "layers.32.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
442
+ "layers.32.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
443
+ "layers.33.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
444
+ "layers.33.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
445
+ "layers.33.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
446
+ "layers.33.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
447
+ "layers.33.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
448
+ "layers.33.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
449
+ "layers.33.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
450
+ "layers.33.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
451
+ "layers.33.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
452
+ "layers.33.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
453
+ "layers.33.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
454
+ "layers.33.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
455
+ "layers.34.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
456
+ "layers.34.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
457
+ "layers.34.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
458
+ "layers.34.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
459
+ "layers.34.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
460
+ "layers.34.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
461
+ "layers.34.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
462
+ "layers.34.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
463
+ "layers.34.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
464
+ "layers.34.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
465
+ "layers.34.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
466
+ "layers.34.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
467
+ "layers.35.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
468
+ "layers.35.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
469
+ "layers.35.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
470
+ "layers.35.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
471
+ "layers.35.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
472
+ "layers.35.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
473
+ "layers.35.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
474
+ "layers.35.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
475
+ "layers.35.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
476
+ "layers.35.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
477
+ "layers.35.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
478
+ "layers.35.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
479
+ "layers.36.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
480
+ "layers.36.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
481
+ "layers.36.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
482
+ "layers.36.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
483
+ "layers.36.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
484
+ "layers.36.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
485
+ "layers.36.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
486
+ "layers.36.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
487
+ "layers.36.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
488
+ "layers.36.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
489
+ "layers.36.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
490
+ "layers.36.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
491
+ "layers.37.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
492
+ "layers.37.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
493
+ "layers.37.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
494
+ "layers.37.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
495
+ "layers.37.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
496
+ "layers.37.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
497
+ "layers.37.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
498
+ "layers.37.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
499
+ "layers.37.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
500
+ "layers.37.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
501
+ "layers.37.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
502
+ "layers.37.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
503
+ "layers.38.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
504
+ "layers.38.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
505
+ "layers.38.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
506
+ "layers.38.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
507
+ "layers.38.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
508
+ "layers.38.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
509
+ "layers.38.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
510
+ "layers.38.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
511
+ "layers.38.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
512
+ "layers.38.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
513
+ "layers.38.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
514
+ "layers.38.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
515
+ "layers.39.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
516
+ "layers.39.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
517
+ "layers.39.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
518
+ "layers.39.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
519
+ "layers.39.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
520
+ "layers.39.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
521
+ "layers.39.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
522
+ "layers.39.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
523
+ "layers.39.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
524
+ "layers.39.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
525
+ "layers.39.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
526
+ "layers.39.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
527
+ "layers.4.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
528
+ "layers.4.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
529
+ "layers.4.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
530
+ "layers.4.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
531
+ "layers.4.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
532
+ "layers.4.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
533
+ "layers.4.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
534
+ "layers.4.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
535
+ "layers.4.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
536
+ "layers.4.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
537
+ "layers.4.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
538
+ "layers.4.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
539
+ "layers.40.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
540
+ "layers.40.mlp.down_proj.weight": "pytorch-model-00002-of-00004.safetensors",
541
+ "layers.40.mlp.gate_proj.weight": "pytorch-model-00002-of-00004.safetensors",
542
+ "layers.40.mlp.up_proj.weight": "pytorch-model-00002-of-00004.safetensors",
543
+ "layers.40.post_attention_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
544
+ "layers.40.self_attn.k_proj.bias": "pytorch-model-00002-of-00004.safetensors",
545
+ "layers.40.self_attn.k_proj.weight": "pytorch-model-00002-of-00004.safetensors",
546
+ "layers.40.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
547
+ "layers.40.self_attn.q_proj.bias": "pytorch-model-00002-of-00004.safetensors",
548
+ "layers.40.self_attn.q_proj.weight": "pytorch-model-00002-of-00004.safetensors",
549
+ "layers.40.self_attn.v_proj.bias": "pytorch-model-00002-of-00004.safetensors",
550
+ "layers.40.self_attn.v_proj.weight": "pytorch-model-00002-of-00004.safetensors",
551
+ "layers.41.input_layernorm.weight": "pytorch-model-00002-of-00004.safetensors",
552
+ "layers.41.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
553
+ "layers.41.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
554
+ "layers.41.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
555
+ "layers.41.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
556
+ "layers.41.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
557
+ "layers.41.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
558
+ "layers.41.self_attn.o_proj.weight": "pytorch-model-00002-of-00004.safetensors",
559
+ "layers.41.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
560
+ "layers.41.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
561
+ "layers.41.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
562
+ "layers.41.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
563
+ "layers.42.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
564
+ "layers.42.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
565
+ "layers.42.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
566
+ "layers.42.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
567
+ "layers.42.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
568
+ "layers.42.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
569
+ "layers.42.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
570
+ "layers.42.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
571
+ "layers.42.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
572
+ "layers.42.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
573
+ "layers.42.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
574
+ "layers.42.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
575
+ "layers.43.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
576
+ "layers.43.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
577
+ "layers.43.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
578
+ "layers.43.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
579
+ "layers.43.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
580
+ "layers.43.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
581
+ "layers.43.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
582
+ "layers.43.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
583
+ "layers.43.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
584
+ "layers.43.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
585
+ "layers.43.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
586
+ "layers.43.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
587
+ "layers.44.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
588
+ "layers.44.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
589
+ "layers.44.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
590
+ "layers.44.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
591
+ "layers.44.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
592
+ "layers.44.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
593
+ "layers.44.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
594
+ "layers.44.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
595
+ "layers.44.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
596
+ "layers.44.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
597
+ "layers.44.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
598
+ "layers.44.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
599
+ "layers.45.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
600
+ "layers.45.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
601
+ "layers.45.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
602
+ "layers.45.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
603
+ "layers.45.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
604
+ "layers.45.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
605
+ "layers.45.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
606
+ "layers.45.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
607
+ "layers.45.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
608
+ "layers.45.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
609
+ "layers.45.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
610
+ "layers.45.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
611
+ "layers.46.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
612
+ "layers.46.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
613
+ "layers.46.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
614
+ "layers.46.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
615
+ "layers.46.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
616
+ "layers.46.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
617
+ "layers.46.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
618
+ "layers.46.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
619
+ "layers.46.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
620
+ "layers.46.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
621
+ "layers.46.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
622
+ "layers.46.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
623
+ "layers.47.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
624
+ "layers.47.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
625
+ "layers.47.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
626
+ "layers.47.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
627
+ "layers.47.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
628
+ "layers.47.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
629
+ "layers.47.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
630
+ "layers.47.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
631
+ "layers.47.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
632
+ "layers.47.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
633
+ "layers.47.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
634
+ "layers.47.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
635
+ "layers.5.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
636
+ "layers.5.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
637
+ "layers.5.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
638
+ "layers.5.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
639
+ "layers.5.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
640
+ "layers.5.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
641
+ "layers.5.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
642
+ "layers.5.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
643
+ "layers.5.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
644
+ "layers.5.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
645
+ "layers.5.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
646
+ "layers.5.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
647
+ "layers.6.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
648
+ "layers.6.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
649
+ "layers.6.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
650
+ "layers.6.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
651
+ "layers.6.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
652
+ "layers.6.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
653
+ "layers.6.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
654
+ "layers.6.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
655
+ "layers.6.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
656
+ "layers.6.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
657
+ "layers.6.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
658
+ "layers.6.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
659
+ "layers.7.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
660
+ "layers.7.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
661
+ "layers.7.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
662
+ "layers.7.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
663
+ "layers.7.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
664
+ "layers.7.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
665
+ "layers.7.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
666
+ "layers.7.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
667
+ "layers.7.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
668
+ "layers.7.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
669
+ "layers.7.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
670
+ "layers.7.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
671
+ "layers.8.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
672
+ "layers.8.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
673
+ "layers.8.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
674
+ "layers.8.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
675
+ "layers.8.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
676
+ "layers.8.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
677
+ "layers.8.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
678
+ "layers.8.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
679
+ "layers.8.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
680
+ "layers.8.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
681
+ "layers.8.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
682
+ "layers.8.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
683
+ "layers.9.input_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
684
+ "layers.9.mlp.down_proj.weight": "pytorch-model-00003-of-00004.safetensors",
685
+ "layers.9.mlp.gate_proj.weight": "pytorch-model-00003-of-00004.safetensors",
686
+ "layers.9.mlp.up_proj.weight": "pytorch-model-00003-of-00004.safetensors",
687
+ "layers.9.post_attention_layernorm.weight": "pytorch-model-00003-of-00004.safetensors",
688
+ "layers.9.self_attn.k_proj.bias": "pytorch-model-00003-of-00004.safetensors",
689
+ "layers.9.self_attn.k_proj.weight": "pytorch-model-00003-of-00004.safetensors",
690
+ "layers.9.self_attn.o_proj.weight": "pytorch-model-00003-of-00004.safetensors",
691
+ "layers.9.self_attn.q_proj.bias": "pytorch-model-00003-of-00004.safetensors",
692
+ "layers.9.self_attn.q_proj.weight": "pytorch-model-00003-of-00004.safetensors",
693
+ "layers.9.self_attn.v_proj.bias": "pytorch-model-00003-of-00004.safetensors",
694
+ "layers.9.self_attn.v_proj.weight": "pytorch-model-00003-of-00004.safetensors",
695
+ "lm_head.weight": "pytorch-model-00003-of-00004.safetensors",
696
+ "norm.weight": "pytorch-model-00003-of-00004.safetensors"
697
+ }
698
+ }
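
The weight map above assigns every parameter name to one of the four safetensors shards, following the standard Hugging Face index layout with a `weight_map` entry. As a rough, hedged sketch (not part of this commit), a single tensor can be pulled out of the sharded checkpoint via that index; the index filename below is an assumption inferred from the shard names, since it is not shown in this diff:

```python
import json

from safetensors import safe_open  # requires the `safetensors` package

# Hypothetical index filename, inferred from the shard names in the map above.
INDEX_FILE = "pytorch_model.bin.index.json"

with open(INDEX_FILE) as f:
    weight_map = json.load(f)["weight_map"]

# Look up which shard stores a given parameter, then read only that tensor.
name = "layers.37.self_attn.q_proj.weight"
shard = weight_map[name]  # e.g. "pytorch-model-00002-of-00004.safetensors"

with safe_open(shard, framework="pt", device="cpu") as shard_file:
    tensor = shard_file.get_tensor(name)

print(name, tuple(tensor.shape))
```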
models/__pycache__/config.cpython-310.pyc ADDED
Binary file (1.47 kB)

models/__pycache__/gen_pipeline.cpython-310.pyc ADDED
Binary file (10.6 kB)

models/__pycache__/heads.cpython-310.pyc ADDED
Binary file (10.2 kB)

models/__pycache__/llama_model.cpython-310.pyc ADDED
Binary file (14 kB)

models/__pycache__/nextstep_model.cpython-310.pyc ADDED
Binary file (15.8 kB)

models/config.py ADDED
@@ -0,0 +1,45 @@
1
+ from transformers.models.llama.configuration_llama import LlamaConfig
2
+
3
+ class NextStepConfig(LlamaConfig):
4
+
5
+ model_type = "nextstep"
6
+
7
+ def __init__(
8
+ self,
9
+ vae_name_or_path: str | None = None,
10
+ latent_size: int = 32,
11
+ latent_patch_size: int = 2,
12
+ latent_channels: int = 16,
13
+ boi: int | None = None,
14
+ eoi: int | None = None,
15
+ image_placeholder_id: int | None = None,
16
+ pad_token_id_added: int | None = None,
17
+ lm_loss_weight: float = 0.01,
18
+ im_loss_weight: float = 1.0,
19
+ fm_head_dim: int = 1536,
20
+ fm_head_layers: int = 12,
21
+ fm_head_batch_mul: int = 4,
22
+ o_attention_bias: bool | None = None,
23
+ **kwargs,
24
+ ):
25
+ super().__init__(**kwargs)
26
+
27
+ self.vae_name_or_path = vae_name_or_path
28
+
29
+ self.latent_size = latent_size
30
+ self.latent_patch_size = latent_patch_size
31
+ self.latent_channels = latent_channels
32
+
33
+ self.boi = boi
34
+ self.eoi = eoi
35
+ self.image_placeholder_id = image_placeholder_id
36
+ self.pad_token_id_added = pad_token_id_added
37
+
38
+ self.lm_loss_weight = lm_loss_weight
39
+ self.im_loss_weight = im_loss_weight
40
+
41
+ self.fm_head_dim = fm_head_dim
42
+ self.fm_head_layers = fm_head_layers
43
+ self.fm_head_batch_mul = fm_head_batch_mul
44
+
45
+ self.o_attention_bias = self.attention_bias if o_attention_bias is None else o_attention_bias
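
`NextStepConfig` above extends `LlamaConfig` with VAE/latent bookkeeping, loss weights, and flow-matching-head hyperparameters. A minimal construction sketch follows; the values are illustrative only, the released `config.json` carries the real ones:

```python
from models.config import NextStepConfig

# Values below are illustrative, not the shipped defaults.
cfg = NextStepConfig(
    vae_name_or_path="vae",   # assumed local path to the VAE checkpoint
    latent_size=32,
    latent_patch_size=2,
    latent_channels=16,
    lm_loss_weight=0.01,      # weight of the language-modeling loss
    im_loss_weight=1.0,       # weight of the image (flow-matching) loss
    fm_head_dim=1536,
    fm_head_layers=12,
)
print(cfg.model_type)  # -> "nextstep"
```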
models/gen_pipeline.py ADDED
@@ -0,0 +1,398 @@
1
+ import re
2
+ import copy
3
+ from typing import Literal
4
+
5
+ from PIL import Image
6
+ from tqdm.auto import tqdm
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torchvision.transforms as transforms
11
+
12
+ from transformers import AutoTokenizer
13
+ from transformers.cache_utils import Cache, StaticCache
14
+
15
+ from models.nextstep_model import NextStep
16
+ from vae.nextstep_ae import AutoencoderKL
17
+ from utils.image_utils import to_pil
18
+ from utils.model_utils import layer_norm
19
+ from utils.compile_utils import compile_manager
20
+ from utils.misc import set_seed
21
+
22
+ DEFAULT_IMAGE_AREA_TOKEN = "<|image_area|>"
23
+
24
+ def hw2str(h: int, w: int) -> str:
25
+ return f"{h}*{w}"
26
+
27
+
28
+ class NextStepPipeline:
29
+ def __init__(
30
+ self,
31
+ model_name_or_path: str | None = None,
32
+ vae_name_or_path: str | None = None,
33
+ tokenizer: AutoTokenizer | None = None,
34
+ model: nn.Module | None = None,
35
+ vae: AutoencoderKL | None = None,
36
+ ):
37
+ if model is not None:
38
+ self.tokenizer = copy.deepcopy(tokenizer)
39
+ self.tokenizer.padding_side = "left"
40
+ self.model = model
41
+ elif model_name_or_path is not None:
42
+ self.tokenizer = AutoTokenizer.from_pretrained(
43
+ model_name_or_path,
44
+ local_files_only=True,
45
+ padding_side="left",
46
+ use_fast=True,
47
+ )
48
+ self.model: NextStep = NextStep.from_pretrained(model_name_or_path, local_files_only=True)
49
+ else:
50
+ raise ValueError("model or model_name_or_path is required")
51
+
52
+ self.tokenizer.add_eos_token = False
53
+ if vae_name_or_path is None:
54
+ vae_name_or_path = getattr(self.model.config, "vae_name_or_path", None)
55
+ if vae is not None:
56
+ self.vae = vae
57
+ elif vae_name_or_path is not None:
58
+ self.vae = AutoencoderKL.from_pretrained(vae_name_or_path)
59
+ else:
60
+ raise ValueError("vae or vae_name_or_path is required")
61
+
62
+ self.model.eval()
63
+ self.vae.eval()
64
+
65
+ vae_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
66
+ self.down_factor = vae_factor * self.model.config.latent_patch_size
67
+ self.shift_factor = getattr(self.vae.config, "shift_factor", 0.0)
68
+ self.scaling_factor = getattr(self.vae.config, "scaling_factor", 1.0)
69
+
70
+ self.boi = self.model.config.boi
71
+ self.eoi = self.model.config.eoi
72
+
73
+ self.image_placeholder_id = self.model.config.image_placeholder_id
74
+ self.pil2tensor = transforms.Compose(
75
+ [
76
+ transforms.ToTensor(),
77
+ transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
78
+ ]
79
+ )
80
+ self.__device = self.model.device
81
+ self.__dtype = self.model.dtype
82
+ self.to(self.device, self.dtype)
83
+
84
+ @property
85
+ def device(self):
86
+ return self.__device
87
+
88
+ @property
89
+ def device_type(self):
90
+ if isinstance(self.__device, str):
91
+ return self.__device
92
+ return self.__device.type
93
+
94
+ @property
95
+ def dtype(self):
96
+ return self.__dtype
97
+
98
+ def to(self, device: str | None = None, dtype: torch.dtype | None = None):
99
+ if device is not None:
100
+ self.__device = device
101
+ if dtype is not None:
102
+ self.__dtype = dtype
103
+ self.model.to(self.__device, dtype=self.__dtype)
104
+ self.vae.to(self.__device, dtype=self.__dtype)
105
+ return self
106
+
107
+ def _image_str(self, hw: tuple[int, int] = (256, 256)):
108
+ latent_hw = (hw[0] // self.down_factor, hw[1] // self.down_factor)
109
+ image_ids = [self.boi] + [self.image_placeholder_id] * (latent_hw[0] * latent_hw[1]) + [self.eoi]
110
+ image_str = DEFAULT_IMAGE_AREA_TOKEN + hw2str(*latent_hw) + self.tokenizer.decode(image_ids)
111
+ return image_str
112
+
113
+ def _check_input(
114
+ self, captions: str | list[str], images: Image.Image | list[Image.Image] | None
115
+ ) -> tuple[list[str], list[Image.Image] | None]:
116
+ if not isinstance(captions, list):
117
+ captions = [captions]
118
+ if images is not None:
119
+ if not isinstance(images, list):
120
+ images = [images]
121
+ # Validate image count matches <image> tokens in captions
122
+ image_token_count = 0
123
+ for caption in captions:
124
+ num_image_token = len(re.findall(r"<image>", caption))
125
+ assert num_image_token == 1, f"Caption `{caption}` has {num_image_token} image tokens, but only 1 is allowed."
126
+ image_token_count += num_image_token
127
+ if image_token_count != len(images):
128
+ raise ValueError(
129
+ f"Number of images ({len(images)}) does not match number of image tokens ({image_token_count}).\n"
130
+ f"Captions: {captions}"
131
+ )
132
+ hws = [(image.size[1], image.size[0]) for image in images]
133
+ # Replace <image> tokens sequentially with corresponding image_str based on hw
134
+ processed_captions = []
135
+ image_idx = 0
136
+ for caption in captions:
137
+ # Process each caption
138
+ processed_caption = caption
139
+ num_image_tokens = processed_caption.count("<image>")
140
+ # Replace each <image> token in order
141
+ for _ in range(num_image_tokens):
142
+ processed_caption = processed_caption.replace("<image>", self._image_str(hws[image_idx]), 1)
143
+ image_idx += 1
144
+ processed_captions.append(processed_caption)
145
+ captions = processed_captions
146
+ return captions, images
147
+
148
+ def _build_captions(
149
+ self,
150
+ captions: str | list[str],
151
+ images: list[Image.Image] | None = None,
152
+ num_images_per_caption: int = 1,
153
+ positive_prompt: str | None = None,
154
+ negative_prompt: str | None = None,
155
+ cfg: float = 1.0,
156
+ cfg_img: float = 1.0,
157
+ ):
158
+ # 1. repeat captions and images
159
+ if not isinstance(captions, list):
160
+ captions = [captions]
161
+
162
+ captions = [caption for caption in captions for _ in range(num_images_per_caption)]
163
+ if images is not None:
164
+ images = [image for image in images for _ in range(num_images_per_caption)]
165
+
166
+ # 2. add positive prompt
167
+ if positive_prompt is not None and positive_prompt != "":
168
+ captions = [f"{caption} {positive_prompt}" for caption in captions]
169
+
170
+ # 3. add negative prompt
171
+ if negative_prompt is None:
172
+ negative_prompt = ""
173
+
174
+ num_samples = len(captions)
175
+ if cfg != 1.0 and cfg_img != 1.0: # use both image and text CFG
176
+ w, h = images[0].size
177
+ captions = (
178
+ captions + [self._image_str((h, w)) + negative_prompt] * num_samples
179
+ )
180
+ images = images + images
181
+ captions = captions + [negative_prompt] * num_samples
182
+ elif cfg != 1.0 and cfg_img == 1.0: # use text CFG
183
+ captions = captions + [negative_prompt] * num_samples
184
+ elif cfg == 1.0 and cfg_img == 1.0:
185
+ pass
186
+
187
+ return captions, images
188
+
189
+ def _add_prefix_ids(self, hw: tuple[int, int], input_ids: torch.Tensor, attention_mask: torch.Tensor):
190
+ prefix_str = DEFAULT_IMAGE_AREA_TOKEN + hw2str(hw[0] // self.down_factor, hw[1] // self.down_factor)
191
+ prefix_output = self.tokenizer(
192
+ prefix_str,
193
+ truncation=False,
194
+ add_special_tokens=True,
195
+ return_tensors="pt"
196
+ )
197
+ prefix_input_ids = prefix_output.input_ids.to(input_ids.device, dtype=input_ids.dtype)
198
+ prefix_attention_mask = prefix_output.attention_mask.to(attention_mask.device, dtype=attention_mask.dtype)
199
+ # remove bos token
200
+ if self.tokenizer.bos_token is not None:
201
+ prefix_input_ids = prefix_input_ids[:, 1:]
202
+ prefix_attention_mask = prefix_attention_mask[:, 1:]
203
+ # add boi token
204
+ prefix_input_ids = torch.cat(
205
+ [
206
+ prefix_input_ids,
207
+ prefix_input_ids.new_tensor([self.model.config.boi]).unsqueeze(0),
208
+ ],
209
+ dim=1,
210
+ )
211
+ prefix_attention_mask = torch.cat(
212
+ [
213
+ prefix_attention_mask,
214
+ prefix_attention_mask.new_ones((prefix_attention_mask.shape[0], 1)),
215
+ ],
216
+ dim=1,
217
+ )
218
+ bsz = input_ids.shape[0]
219
+ input_ids = torch.cat([input_ids, prefix_input_ids.expand(bsz, -1)], dim=1)
220
+ attention_mask = torch.cat([attention_mask, prefix_attention_mask.expand(bsz, -1)], dim=1)
221
+
222
+ return input_ids, attention_mask
223
+
224
+ @torch.no_grad()
225
+ def decoding(
226
+ self,
227
+ c: torch.Tensor,
228
+ attention_mask: torch.Tensor,
229
+ past_key_values: Cache,
230
+ max_new_len: int,
231
+ num_images_per_caption: int,
232
+ use_norm: bool = False,
233
+ cfg: float = 1.0,
234
+ cfg_img: float = 1.0,
235
+ cfg_schedule: Literal["linear", "constant"] = "constant",
236
+ timesteps_shift: float = 1.0,
237
+ num_sampling_steps: int = 20,
238
+ progress: bool = True,
239
+ hw: tuple[int, int] = (256, 256),
240
+ step: int = 0,
241
+ ):
242
+ indices = list(range(max_new_len))
243
+ indices = tqdm(indices, unit="tokens") if progress else indices
244
+ tokens = None
245
+ for step in indices:
246
+ # cfg schedule follow Muse
247
+ if cfg_schedule == "linear":
248
+ tokens_len = 0 if tokens is None else tokens.shape[1]
249
+ cfg_iter = max(cfg / 2, 1 + (cfg - 1) * tokens_len / max_new_len)
250
+ cfg_img_iter = max(cfg_img / 2, 1 + (cfg_img - 1) * tokens_len / max_new_len)
251
+ elif cfg_schedule == "constant":
252
+ cfg_iter = cfg
253
+ cfg_img_iter = cfg_img
254
+ else:
255
+ raise NotImplementedError
256
+
257
+ c = self.model.image_out_projector(c)
258
+ token_sampled = self.model.image_head.sample(
259
+ c=c.squeeze(1),
260
+ cfg=cfg_iter,
261
+ cfg_img=cfg_img_iter,
262
+ timesteps_shift=timesteps_shift,
263
+ num_sampling_steps=num_sampling_steps,
264
+ noise_repeat=num_images_per_caption,
265
+ )
266
+
267
+ if use_norm:
268
+ token_sampled = layer_norm(token_sampled, normalized_shape=token_sampled.size()[1:])
269
+ if tokens is not None:
270
+ tokens = torch.cat([tokens, token_sampled.unsqueeze(1)], dim=1)
271
+ else:
272
+ tokens = token_sampled.unsqueeze(1)
273
+
274
+ cur_inputs_embeds = self.model.image_in_projector(tokens[:, -1:])
275
+ if cfg != 1.0 and cfg_img == 1.0:
276
+ cur_inputs_embeds = torch.cat([cur_inputs_embeds, cur_inputs_embeds], dim=0)
277
+ elif cfg != 1.0 and cfg_img != 1.0:
278
+ cur_inputs_embeds = torch.cat([cur_inputs_embeds, cur_inputs_embeds, cur_inputs_embeds], dim=0)
279
+
280
+ attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
281
+ outputs = self.model.forward_model(
282
+ inputs_embeds=cur_inputs_embeds,
283
+ attention_mask=attention_mask,
284
+ past_key_values=past_key_values,
285
+ use_cache=True,
286
+ )
287
+ past_key_values = outputs.past_key_values
288
+ c = outputs.last_hidden_state[:, -1:]
289
+ if self.model.config.use_gen_pos_embed:
290
+ c = c + self.model.gen_pos_embed_with_ar(hw[0], hw[1])[:, step + 1 : step + 2, :]
291
+
292
+ return tokens
293
+
294
+ @torch.no_grad()
295
+ def generate_image(
296
+ self,
297
+ captions: str | list[str],
298
+ images: list[Image.Image] | None = None,
299
+ num_images_per_caption: int = 1,
300
+ positive_prompt: str | None = None,
301
+ negative_prompt: str | None = None,
302
+ hw: tuple[int, int] = (256, 256),
303
+ use_norm: bool = False,
304
+ cfg: float = 1.0,
305
+ cfg_img: float = 1.0,
306
+ cfg_schedule: Literal["linear", "constant"] = "constant",
307
+ num_sampling_steps: int = 20,
308
+ timesteps_shift: float = 1.0,
309
+ seed: int = 42,
310
+ progress: bool = True,
311
+ ) -> list[Image.Image]:
312
+ # 0. set seed
313
+ if seed is not None:
314
+ set_seed(seed)
315
+
316
+ # 1. check input
317
+ captions, images = self._check_input(captions, images)
318
+
319
+ # 2. build captions
320
+ captions, images = self._build_captions(
321
+ captions, images, num_images_per_caption, positive_prompt, negative_prompt, cfg, cfg_img
322
+ )
323
+
324
+ # 3. encode images
325
+ # `images` must be processed by `process_images` before calling this function
326
+ latents = None
327
+ if images is not None:
328
+ pixel_values = [self.pil2tensor(image) for image in images]
329
+ pixel_values = torch.stack(pixel_values).to(self.device)
330
+ with compile_manager.compile_disabled():
331
+ posterior = self.vae.encode(pixel_values.to(self.vae.dtype)).latent_dist
332
+ latents = (posterior.sample() - self.shift_factor) * self.scaling_factor
333
+ captions = [self.tokenizer.bos_token + caption if self.tokenizer.bos_token is not None else caption for caption in captions]
334
+
335
+ # 4. tokenize caption & add prefix ids
336
+ output = self.tokenizer(
337
+ captions,
338
+ padding="longest",
339
+ truncation=False,
340
+ add_special_tokens=True,
341
+ return_tensors="pt",
342
+ padding_side="left"
343
+ )
344
+ input_ids = output.input_ids.to(self.device)
345
+ attention_mask = output.attention_mask.to(self.device)
346
+ input_ids, attention_mask = self._add_prefix_ids(hw, input_ids, attention_mask)
347
+
348
+ # 5. LLM prefill
349
+ max_new_len = (hw[0] // self.down_factor) * (hw[1] // self.down_factor)
350
+ max_cache_len = input_ids.shape[1] + max_new_len
351
+ past_key_values = StaticCache(
352
+ config=self.model.config,
353
+ max_batch_size=input_ids.shape[0],
354
+ max_cache_len=max_cache_len,
355
+ device=self.device,
356
+ dtype=self.dtype,
357
+ )
358
+ inputs_embeds = self.model.prepare_inputs_embeds(input_ids, latents)
359
+ with compile_manager.compile_disabled():
360
+ outputs = self.model.forward_model(
361
+ inputs_embeds=inputs_embeds,
362
+ attention_mask=attention_mask,
363
+ past_key_values=past_key_values,
364
+ use_cache=True,
365
+ )
366
+ past_key_values = outputs.past_key_values
367
+ c = outputs.last_hidden_state[:, -1:]
368
+ if self.model.config.use_gen_pos_embed:
369
+ c = c + self.model.gen_pos_embed_with_ar(hw[0], hw[1])[:, 0:1, :]
370
+
371
+ # 6. decoding
372
+ tokens = self.decoding(
373
+ c=c,
374
+ attention_mask=attention_mask,
375
+ past_key_values=past_key_values,
376
+ max_new_len=max_new_len,
377
+ num_images_per_caption=num_images_per_caption,
378
+ use_norm=use_norm,
379
+ cfg=cfg,
380
+ cfg_img=cfg_img,
381
+ cfg_schedule=cfg_schedule,
382
+ timesteps_shift=timesteps_shift,
383
+ num_sampling_steps=num_sampling_steps,
384
+ progress=progress,
385
+ hw=hw,
386
+ )
387
+
388
+ # 7. unpatchify
389
+ latents = self.model.unpatchify(tokens)
390
+ latents = (latents / self.scaling_factor) + self.shift_factor
391
+
392
+ # 8. decode latents
393
+ with compile_manager.compile_disabled():
394
+ sampled_images = self.vae.decode(latents.to(self.vae.dtype)).sample
395
+ sampled_images = sampled_images.detach().cpu().to(torch.float32)
396
+ pil_images = [to_pil(img) for img in sampled_images]
397
+
398
+ return pil_images
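
`generate_image` above is the entry point exposed by the pipeline object: it validates the prompts, prefills the LLM, autoregressively samples image tokens with the flow-matching head, and decodes them through the VAE. A hedged usage sketch, assuming `pipeline`, `positive_prompt`, and `negative_prompt` are set up as in the README; the numeric values are placeholders, not tuned defaults:

```python
# Sketch only: cfg/step values are placeholders, not recommended settings.
images = pipeline.generate_image(
    captions="a watercolor painting of a lighthouse at dusk",
    num_images_per_caption=1,
    positive_prompt=positive_prompt,
    negative_prompt=negative_prompt,
    hw=(512, 512),            # must be divisible by the pipeline's down factor
    cfg=7.5,                  # text classifier-free guidance scale
    cfg_schedule="constant",
    num_sampling_steps=28,    # flow-matching steps per image token
    timesteps_shift=1.0,
    seed=42,
)
images[0].save("sample.png")
```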
models/heads.py ADDED
@@ -0,0 +1,283 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.utils.checkpoint import checkpoint
6
+
7
+ from transformers.activations import ACT2FN
8
+
9
+ from models.config import LlamaConfig
10
+ from utils.misc import LargeInt
11
+ from utils.model_utils import expand_t, randn_tensor
12
+ from utils.compile_utils import smart_compile
13
+
14
+
15
+ class LlamaMLP(nn.Module):
16
+ def __init__(self, config: LlamaConfig):
17
+ super().__init__()
18
+ self.config = config
19
+ self.hidden_size = config.hidden_size
20
+ self.intermediate_size = config.intermediate_size
21
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
22
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
23
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
24
+ self.act_fn = ACT2FN[config.hidden_act]
25
+
26
+ def forward(self, x):
27
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
28
+ return down_proj
29
+
30
+
31
+
32
+
33
+ def modulate(x, shift, scale=None):
34
+ if shift is None:
35
+ return x * (1 + scale)
36
+ return x * (1 + scale) + shift
37
+
38
+
39
+ class ResBlock(nn.Module):
40
+ def __init__(self, channels, mlp_ratio=1.0):
41
+ super().__init__()
42
+ self.channels = channels
43
+ self.intermediate_size = int(channels * mlp_ratio)
44
+
45
+ self.in_ln = nn.LayerNorm(self.channels, eps=1e-6)
46
+ self.mlp = nn.Sequential(
47
+ nn.Linear(self.channels, self.intermediate_size),
48
+ nn.SiLU(),
49
+ nn.Linear(self.intermediate_size, self.channels),
50
+ )
51
+
52
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(channels, 3 * channels, bias=True))
53
+
54
+ def forward(self, x, y):
55
+ shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(y).chunk(3, dim=-1)
56
+ h = modulate(self.in_ln(x), shift_mlp, scale_mlp)
57
+ h = self.mlp(h)
58
+ return x + gate_mlp * h
59
+
60
+
61
+ class FinalLayer(nn.Module):
62
+ def __init__(self, model_channels, out_channels):
63
+ super().__init__()
64
+ self.norm_final = nn.LayerNorm(model_channels, elementwise_affine=False, eps=1e-6)
65
+ self.linear = nn.Linear(model_channels, out_channels, bias=True)
66
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(model_channels, 2 * model_channels, bias=True))
67
+
68
+ def forward(self, x, c):
69
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
70
+ x = modulate(self.norm_final(x), shift, scale)
71
+ x = self.linear(x)
72
+ return x
73
+
74
+
75
+ class TimestepEmbedder(nn.Module):
76
+ """
77
+ Embeds scalar timesteps into vector representations.
78
+ """
79
+
80
+ def __init__(self, hidden_size, frequency_embedding_size=256):
81
+ super().__init__()
82
+ self.mlp = nn.Sequential(
83
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
84
+ nn.SiLU(),
85
+ nn.Linear(hidden_size, hidden_size, bias=True),
86
+ )
87
+ self.frequency_embedding_size = frequency_embedding_size
88
+
89
+ @staticmethod
90
+ def timestep_embedding(t: torch.Tensor, dim: int, max_period: float = 10000.0):
91
+ """
92
+ Create sinusoidal timestep embeddings.
93
+ :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional.
94
+ :param dim: the dimension of the output.
95
+ :param max_period: controls the minimum frequency of the embeddings.
96
+ :return: an (N, D) Tensor of positional embeddings.
97
+ """
98
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
99
+ half = dim // 2
100
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
101
+ device=t.device
102
+ )
103
+ args = t[:, None].float() * freqs[None]
104
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
105
+ if dim % 2:
106
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
107
+ return embedding
108
+
109
+ def forward(self, t):
110
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
111
+ t_emb = self.mlp(t_freq.to(self.mlp[0].weight.dtype))
112
+ return t_emb
113
+
114
+
115
+ class SimpleMLPAdaLN(nn.Module):
116
+ def __init__(self, input_dim, cond_dim, dim=1536, layers=12, mlp_ratio=1.0):
117
+ super().__init__()
118
+ self.input_dim = input_dim
119
+ self.cond_dim = cond_dim
120
+ self.dim = dim
121
+ self.layers = layers
122
+ self.mlp_ratio = mlp_ratio
123
+
124
+ self.time_embed = TimestepEmbedder(dim)
125
+ self.cond_embed = nn.Linear(cond_dim, dim)
126
+ self.input_proj = nn.Linear(input_dim, dim)
127
+
128
+ res_blocks = []
129
+ for _ in range(layers):
130
+ res_blocks.append(ResBlock(dim, mlp_ratio))
131
+ self.res_blocks = nn.ModuleList(res_blocks)
132
+
133
+ self.final_layer = FinalLayer(dim, input_dim)
134
+
135
+ self.grad_checkpointing = False
136
+
137
+ self.initialize_weights()
138
+
139
+ def initialize_weights(self):
140
+ def _basic_init(module):
141
+ if isinstance(module, nn.Linear):
142
+ torch.nn.init.xavier_uniform_(module.weight)
143
+ if module.bias is not None:
144
+ nn.init.constant_(module.bias, 0)
145
+
146
+ self.apply(_basic_init)
147
+
148
+ # Initialize timestep embedding MLP
149
+ nn.init.normal_(self.time_embed.mlp[0].weight, std=0.02)
150
+ nn.init.normal_(self.time_embed.mlp[2].weight, std=0.02)
151
+
152
+ # Zero-out adaLN modulation layers
153
+ for block in self.res_blocks:
154
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
155
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
156
+
157
+ # Zero-out output layers
158
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
159
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
160
+ nn.init.constant_(self.final_layer.linear.weight, 0)
161
+ nn.init.constant_(self.final_layer.linear.bias, 0)
162
+
163
+ @smart_compile()
164
+ def forward(self, x, t, c):
165
+ """
166
+ x.shape = (bsz, input_dim)
167
+ t.shape = (bsz,)
168
+ c.shape = (bsz, cond_dim)
169
+ """
170
+
171
+ x = self.input_proj(x)
172
+ t = self.time_embed(t)
173
+ c = self.cond_embed(c)
174
+
175
+ y = t + c
176
+
177
+ for block in self.res_blocks:
178
+ if self.grad_checkpointing and self.training:
179
+ x = checkpoint(block, x, y, use_reentrant=True)
180
+ else:
181
+ x = block(x, y)
182
+
183
+ return self.final_layer(x, y)
184
+
185
+
186
+ class FlowMatchingHead(nn.Module):
187
+
188
+ def __init__(self, input_dim, cond_dim, dim=1536, layers=12, mlp_ratio=1.0):
189
+ super(FlowMatchingHead, self).__init__()
190
+ self.input_dim = input_dim
191
+ self.net = SimpleMLPAdaLN(input_dim=input_dim, cond_dim=cond_dim, dim=dim, layers=layers, mlp_ratio=mlp_ratio)
192
+
193
+ @property
194
+ def dtype(self):
195
+ return self.net.input_proj.weight.dtype
196
+
197
+ @property
198
+ def device(self):
199
+ return self.net.input_proj.weight.device
200
+
201
+ @property
202
+ def trainable_params(self) -> float:
203
+ n_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
204
+ return LargeInt(n_params)
205
+
206
+
207
+ def get_score_from_velocity(self, velocity, x, t):
208
+ """Wrapper function: transfrom velocity prediction model to score
209
+ Args:
210
+ velocity: [bsz, ...] shaped tensor; velocity model output
211
+ x: [bsz, ...] shaped tensor; x_t data point
212
+ t: [bsz,] time tensor
213
+ """
214
+ t = expand_t(t, x)
215
+ alpha_t, d_alpha_t = t, 1
216
+ sigma_t, d_sigma_t = 1 - t, -1
217
+ mean = x
218
+ reverse_alpha_ratio = alpha_t / d_alpha_t
219
+ var = sigma_t**2 - reverse_alpha_ratio * d_sigma_t * sigma_t
220
+ score = (reverse_alpha_ratio * velocity - mean) / var
221
+ return score
222
+
223
+ def get_velocity_from_cfg(self, velocity, cfg, cfg_img, cfg_mult):
224
+ if cfg_mult == 2:
225
+ cond_v, uncond_v = torch.chunk(velocity, 2, dim=0)
226
+ velocity = uncond_v + cfg * (cond_v - uncond_v)
227
+ elif cfg_mult == 3:
228
+ cond_v, uncond_v1, uncond_v2 = torch.chunk(velocity, 3, dim=0)
229
+ velocity = uncond_v2 + cfg_img * (uncond_v1 - uncond_v2) + cfg * (cond_v - uncond_v1)
230
+ return velocity
231
+
232
+ @smart_compile(options={"triton.cudagraphs": True}, fullgraph=True)
233
+ @torch.no_grad()
234
+ def sample(
235
+ self,
236
+ c: torch.Tensor,
237
+ cfg: float = 1.0,
238
+ cfg_img: float = 1.0,
239
+ timesteps_shift: float = 1.0,
240
+ num_sampling_steps: int = 20,
241
+ last_step_size: float = 0.0,
242
+ noise_repeat: int = 1,
243
+ ):
244
+ # """c.shape = (bsz, cond_dim)"""
245
+ cfg_mult = 1
246
+ if cfg > 1.0:
247
+ cfg_mult += 1
248
+ if cfg_img > 1.0:
249
+ cfg_mult += 1
250
+
251
+ noise = randn_tensor((c.shape[0] // cfg_mult, self.input_dim), noise_repeat, self.device)
252
+
253
+ mean_x = noise
254
+ x = noise
255
+ xs = []
256
+
257
+ t0, t1 = 0, 1
258
+ timesteps = torch.linspace(t0, t1, num_sampling_steps + 1, device=c.device)[:-1]
259
+ timesteps = timesteps / (timesteps_shift - (timesteps_shift - 1) * timesteps)
260
+ timesteps = torch.cat([timesteps, torch.ones(1, device=c.device)])
261
+ for ti, tj in zip(timesteps[:-1], timesteps[1:]):
262
+ dt = tj - ti
263
+
264
+ combined = torch.cat([x] * cfg_mult, dim=0)
265
+ velocity = self.net(combined.to(c.dtype), ti.expand(c.shape[0]).to(c), c)
266
+ velocity = velocity.to(torch.float32)
267
+
268
+ velocity = self.get_velocity_from_cfg(velocity, cfg, cfg_img, cfg_mult)
269
+ score = self.get_score_from_velocity(velocity, x, ti.expand(x.shape[0]).to(x))
270
+ drift = velocity + (1 - expand_t(ti.expand(x.shape[0]).to(x), x)) * score
271
+
272
+ w_cur = randn_tensor((c.shape[0] // cfg_mult, self.input_dim), noise_repeat, self.device)
273
+ dw = w_cur * torch.sqrt(dt)
274
+
275
+ mean_x = x + drift * dt
276
+ x = mean_x + torch.sqrt(2 * (1 - expand_t(ti.expand(x.shape[0]).to(x), x))) * dw
277
+ xs.append(x)
278
+
279
+
280
+ if len(xs) != num_sampling_steps:
281
+ raise ValueError(f"Samples ({len(xs)}) does not match the number of steps ({num_sampling_steps})")
282
+
283
+ return xs[-1].to(c.dtype)
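
Two small formulas inside `FlowMatchingHead.sample` above are easy to miss when skimming the diff. With s denoting `timesteps_shift`, the uniform grid over [0, 1) is warped before integration, and with both guidance scales active (`cfg_mult == 3`) the velocities of the conditional branch and the two negative-prompt branches are combined as in `get_velocity_from_cfg`:

```latex
% Timestep warp applied to the uniform grid (s = timesteps_shift; s = 1 leaves t unchanged):
t' = \frac{t}{s - (s - 1)\,t}

% Three-way classifier-free guidance (\lambda_{\mathrm{txt}} = cfg, \lambda_{\mathrm{img}} = cfg_img):
v = v_{\mathrm{uncond},2}
  + \lambda_{\mathrm{img}}\left(v_{\mathrm{uncond},1} - v_{\mathrm{uncond},2}\right)
  + \lambda_{\mathrm{txt}}\left(v_{\mathrm{cond}} - v_{\mathrm{uncond},1}\right)
```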
models/llama_model.py ADDED
@@ -0,0 +1,568 @@
1
+ from typing import Optional, Tuple
2
+ from loguru import logger
3
+ import math
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from transformers.cache_utils import Cache, StaticCache
9
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
10
+ from transformers.utils import is_flash_attn_greater_or_equal_2_10
11
+ from transformers import ROPE_INIT_FUNCTIONS
12
+ from transformers.models.llama.configuration_llama import LlamaConfig
13
+
14
+ from models.heads import LlamaMLP
15
+ from utils.model_utils import apply_rotary_pos_emb, repeat_kv
16
+ from models.config import NextStepConfig
17
+
18
+
19
+ class LlamaRMSNorm(nn.Module):
20
+ """LlamaRMSNorm is equivalent to T5LayerNorm"""
21
+
22
+ def __init__(self, hidden_size, eps=1e-6):
23
+ super().__init__()
24
+ self.weight = nn.Parameter(torch.ones(hidden_size))
25
+ self.variance_epsilon = eps
26
+
27
+ def forward(self, hidden_states):
28
+ input_dtype = hidden_states.dtype
29
+ hidden_states = hidden_states.to(torch.float32)
30
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
31
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
32
+ return self.weight * hidden_states.to(input_dtype)
33
+
34
+ def extra_repr(self):
35
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
36
+
37
+
38
+ class LlamaRotaryEmbedding(nn.Module):
39
+ def __init__(self, device=None, config: Optional[LlamaConfig] = None):
40
+ super().__init__()
41
+ self.rope_type = "default"
42
+ self.config = config
43
+
44
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
45
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
46
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
47
+
48
+ @torch.no_grad()
49
+ def forward(self, x, position_ids):
50
+ # Core RoPE block
51
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
52
+ position_ids_expanded = position_ids[:, None, :].float()
53
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
54
+ device_type = x.device.type
55
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
56
+ with torch.autocast(device_type=device_type, enabled=False):
57
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
58
+ emb = torch.cat((freqs, freqs), dim=-1)
59
+ cos = emb.cos()
60
+ sin = emb.sin()
61
+
62
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
63
+ cos = cos * self.attention_scaling
64
+ sin = sin * self.attention_scaling
65
+
66
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
67
+
68
+
69
+ class LlamaAttention(nn.Module):
70
+ def __init__(self, config: NextStepConfig, layer_idx: Optional[int]):
71
+ super().__init__()
72
+ self.config = config
73
+ self.layer_idx = layer_idx
74
+
75
+ self.attention_dropout = config.attention_dropout
76
+ self.hidden_size = config.hidden_size
77
+ self.num_heads = config.num_attention_heads
78
+ self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
79
+ self.num_key_value_heads = config.num_key_value_heads
80
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
81
+ self.max_position_embeddings = config.max_position_embeddings
82
+ self.rope_theta = config.rope_theta
83
+ self.is_causal = True
84
+
85
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
86
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
87
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
88
+ self.o_proj = nn.Linear(
89
+ self.num_heads * self.head_dim, self.hidden_size, bias=getattr(config, "o_attention_bias", config.attention_bias)
90
+ )
91
+ self._flash_attn_uses_top_left_mask = False
92
+
93
+ def forward_sdpa(
94
+ self,
95
+ hidden_states: torch.Tensor,
96
+ attention_mask: Optional[torch.Tensor] = None,
97
+ position_ids: Optional[torch.LongTensor] = None,
98
+ past_key_value: Optional[Cache] = None,
99
+ output_attentions: bool = False,
100
+ use_cache: bool = False,
101
+ cache_position: Optional[torch.LongTensor] = None,
102
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
103
+ **kwargs,
104
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
105
+ bsz, q_len, _ = hidden_states.size()
106
+
107
+ query_states = self.q_proj(hidden_states)
108
+ key_states = self.k_proj(hidden_states)
109
+ value_states = self.v_proj(hidden_states)
110
+
111
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
112
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
113
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
114
+
115
+ if position_embeddings is None:
116
+ logger.warning_once(
117
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
118
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
119
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
120
+ "removed and `position_embeddings` will be mandatory."
121
+ )
122
+ cos, sin = self.rotary_emb(value_states, position_ids)
123
+ else:
124
+ cos, sin = position_embeddings
125
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
126
+
127
+ if past_key_value is not None:
128
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
129
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
130
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
131
+
132
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
133
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
134
+
135
+ causal_mask = attention_mask
136
+ if attention_mask is not None:
137
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
138
+
139
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
140
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
141
+ if query_states.device.type == "cuda" and causal_mask is not None:
142
+ query_states = query_states.contiguous()
143
+ key_states = key_states.contiguous()
144
+ value_states = value_states.contiguous()
145
+
146
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
147
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
148
+ is_causal = True if causal_mask is None and q_len > 1 else False
149
+
150
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
151
+ query_states,
152
+ key_states,
153
+ value_states,
154
+ attn_mask=causal_mask,
155
+ dropout_p=self.attention_dropout if self.training else 0.0,
156
+ is_causal=is_causal,
157
+ )
158
+
159
+ attn_output = attn_output.transpose(1, 2).contiguous()
160
+ attn_output = attn_output.view(bsz, q_len, -1)
161
+
162
+ attn_output = self.o_proj(attn_output)
163
+
164
+ return attn_output, None, past_key_value
165
+
166
+ def forward_flash(
167
+ self,
168
+ hidden_states: torch.Tensor,
169
+ attention_mask: Optional[torch.LongTensor] = None,
170
+ position_ids: Optional[torch.LongTensor] = None,
171
+ past_key_value: Optional[Cache] = None,
172
+ output_attentions: bool = False,
173
+ use_cache: bool = False,
174
+ cache_position: Optional[torch.LongTensor] = None,
175
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
176
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
177
+ if isinstance(past_key_value, StaticCache):
178
+ raise ValueError(
179
+ "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
180
+ "make sure to use `sdpa` in the mean time, and open an issue at GitHub - huggingface/transformers: 🤗 Transformers: the model-definition framework for state-of-the-a"
181
+ )
182
+
183
+ output_attentions = False
184
+
185
+ bsz, q_len, _ = hidden_states.size()
186
+
187
+ query_states = self.q_proj(hidden_states)
188
+ key_states = self.k_proj(hidden_states)
189
+ value_states = self.v_proj(hidden_states)
190
+
191
+ # Flash attention requires the input to have the shape
192
+ # batch_size x seq_length x head_dim x hidden_dim
193
+ # therefore we just need to keep the original shape
194
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
195
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
196
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
197
+
198
+ if position_embeddings is None:
199
+ logger.warning_once(
200
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
201
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
202
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
203
+ "removed and `position_embeddings` will be mandatory."
204
+ )
205
+ cos, sin = self.rotary_emb(value_states, position_ids)
206
+ else:
207
+ cos, sin = position_embeddings
208
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
209
+
210
+ if past_key_value is not None:
211
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
212
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
213
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
214
+
215
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
216
+ # to be able to avoid many of these transpose/reshape/view.
217
+ query_states = query_states.transpose(1, 2)
218
+ key_states = key_states.transpose(1, 2)
219
+ value_states = value_states.transpose(1, 2)
220
+
221
+ dropout_rate = self.attention_dropout if self.training else 0.0
222
+
223
+ input_dtype = query_states.dtype
224
+ if input_dtype == torch.float32:
225
+ if torch.is_autocast_enabled():
226
+ target_dtype = torch.get_autocast_gpu_dtype()
227
+ # Handle the case where the model is quantized
228
+ elif hasattr(self.config, "_pre_quantization_dtype"):
229
+ target_dtype = self.config._pre_quantization_dtype
230
+ else:
231
+ target_dtype = self.q_proj.weight.dtype
232
+
233
+ logger.warning_once(
234
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
235
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
236
+ f" {target_dtype}."
237
+ )
238
+
239
+ query_states = query_states.to(target_dtype)
240
+ key_states = key_states.to(target_dtype)
241
+ value_states = value_states.to(target_dtype)
242
+
243
+ attn_output = _flash_attention_forward(
244
+ query_states,
245
+ key_states,
246
+ value_states,
247
+ attention_mask,
248
+ q_len,
249
+ position_ids=position_ids,
250
+ dropout=dropout_rate,
251
+ sliding_window=getattr(self, "sliding_window", None),
252
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
253
+ is_causal=self.is_causal,
254
+ )
255
+
256
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
257
+ attn_output = self.o_proj(attn_output)
258
+
259
+ if not output_attentions:
260
+ attn_weights = None
261
+
262
+ return attn_output, attn_weights, past_key_value
263
+
264
+ def forward(
265
+ self,
266
+ hidden_states: torch.Tensor,
267
+ attention_mask: Optional[torch.Tensor] = None,
268
+ position_ids: Optional[torch.LongTensor] = None,
269
+ past_key_value: Optional[Cache] = None,
270
+ output_attentions: bool = False,
271
+ use_cache: bool = False,
272
+ cache_position: Optional[torch.LongTensor] = None,
273
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
274
+ **kwargs,
275
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
276
+ bsz, q_len, _ = hidden_states.size()
277
+
278
+ query_states = self.q_proj(hidden_states)
279
+ key_states = self.k_proj(hidden_states)
280
+ value_states = self.v_proj(hidden_states)
281
+
282
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
283
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
284
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
285
+
286
+ if position_embeddings is None:
287
+ logger.warning_once(
288
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
289
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
290
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
291
+ "removed and `position_embeddings` will be mandatory."
292
+ )
293
+ cos, sin = self.rotary_emb(value_states, position_ids)
294
+ else:
295
+ cos, sin = position_embeddings
296
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
297
+
298
+ if past_key_value is not None:
299
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
300
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
301
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
302
+
303
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
304
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
305
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
306
+
307
+ if attention_mask is not None: # no matter the length, we just slice it
308
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
309
+ attn_weights = attn_weights + causal_mask
310
+
311
+ # upcast attention to fp32
312
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
313
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
314
+ attn_output = torch.matmul(attn_weights, value_states)
315
+
316
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
317
+ raise ValueError(
318
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
319
+ f" {attn_output.size()}"
320
+ )
321
+
322
+ attn_output = attn_output.transpose(1, 2).contiguous()
323
+
324
+ attn_output = attn_output.reshape(bsz, q_len, -1)
325
+
326
+ attn_output = self.o_proj(attn_output)
327
+
328
+ if not output_attentions:
329
+ attn_weights = None
330
+
331
+ return attn_output, attn_weights, past_key_value
332
+
333
+
334
+ class LlamaFlashAttention2(LlamaAttention):
335
+ """
336
+ Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stay
337
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
338
+ flash attention and deal with padding tokens in case the input contains any of them.
339
+ """
340
+
341
+ def __init__(self, *args, **kwargs):
342
+ super().__init__(*args, **kwargs)
343
+
344
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
345
+
346
+ def forward(
347
+ self,
348
+ hidden_states: torch.Tensor,
349
+ attention_mask: Optional[torch.LongTensor] = None,
350
+ past_key_value: Optional[Cache] = None,
351
+ output_attentions: bool = False,
352
+ use_cache: bool = False,
353
+ cache_position: Optional[torch.LongTensor] = None,
354
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
355
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
356
+ if isinstance(past_key_value, StaticCache):
357
+ raise ValueError(
358
+ "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
359
+ "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
360
+ )
361
+
362
+ output_attentions = False
363
+
364
+ bsz, q_len, _ = hidden_states.size()
365
+
366
+ query_states = self.q_proj(hidden_states)
367
+ key_states = self.k_proj(hidden_states)
368
+ value_states = self.v_proj(hidden_states)
369
+
370
+ # Flash attention requires the input to have the shape
371
+ # batch_size x seq_length x num_heads x head_dim
372
+ # therefore we just need to keep the original shape
373
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
374
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
375
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
376
+
377
+ cos, sin = position_embeddings
378
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
379
+
380
+ if past_key_value is not None:
381
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
382
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
383
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
384
+
385
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
386
+ # to be able to avoid many of these transpose/reshape/view.
387
+ query_states = query_states.transpose(1, 2)
388
+ key_states = key_states.transpose(1, 2)
389
+ value_states = value_states.transpose(1, 2)
390
+
391
+ dropout_rate = self.attention_dropout if self.training else 0.0
392
+
393
+ input_dtype = query_states.dtype
394
+ if input_dtype == torch.float32:
395
+ if torch.is_autocast_enabled():
396
+ target_dtype = torch.get_autocast_gpu_dtype()
397
+ # Handle the case where the model is quantized
398
+ elif hasattr(self.config, "_pre_quantization_dtype"):
399
+ target_dtype = self.config._pre_quantization_dtype
400
+ else:
401
+ target_dtype = self.q_proj.weight.dtype
402
+
403
+ logger.warning_once(
404
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
405
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
406
+ f" {target_dtype}."
407
+ )
408
+
409
+ query_states = query_states.to(target_dtype)
410
+ key_states = key_states.to(target_dtype)
411
+ value_states = value_states.to(target_dtype)
412
+
413
+ attn_output = _flash_attention_forward(
414
+ query_states,
415
+ key_states,
416
+ value_states,
417
+ attention_mask,
418
+ q_len,
419
+ position_ids=None,
420
+ dropout=dropout_rate,
421
+ sliding_window=getattr(self, "sliding_window", None),
422
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
423
+ is_causal=self.is_causal,
424
+ )
425
+
426
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
427
+ attn_output = self.o_proj(attn_output)
428
+
429
+ if not output_attentions:
430
+ attn_weights = None
431
+
432
+ return attn_output, attn_weights, past_key_value
433
+
434
+
435
+ class LlamaSdpaAttention(LlamaAttention):
436
+ """
437
+ Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
438
+ `LlamaAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
439
+ SDPA API.
440
+ """
441
+
442
+ # Adapted from LlamaAttention.forward
443
+ def forward(
444
+ self,
445
+ hidden_states: torch.Tensor,
446
+ attention_mask: Optional[torch.Tensor] = None,
447
+ past_key_value: Optional[Cache] = None,
448
+ output_attentions: bool = False,
449
+ use_cache: bool = False,
450
+ cache_position: Optional[torch.LongTensor] = None,
451
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
452
+ **kwargs,
453
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
454
+
455
+ bsz, q_len, _ = hidden_states.size()
456
+
457
+ query_states = self.q_proj(hidden_states)
458
+ key_states = self.k_proj(hidden_states)
459
+ value_states = self.v_proj(hidden_states)
460
+
461
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
462
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
463
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
464
+
465
+ cos, sin = position_embeddings
466
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
467
+
468
+ if past_key_value is not None:
469
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
470
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
471
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
472
+
473
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
474
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
475
+
476
+ causal_mask = attention_mask
477
+ if attention_mask is not None:
478
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
479
+
480
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
481
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
482
+ if query_states.device.type == "cuda" and causal_mask is not None:
483
+ query_states = query_states.contiguous()
484
+ key_states = key_states.contiguous()
485
+ value_states = value_states.contiguous()
486
+
487
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
488
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
489
+ is_causal = True if causal_mask is None and q_len > 1 else False
490
+
491
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
492
+ query_states,
493
+ key_states,
494
+ value_states,
495
+ attn_mask=causal_mask,
496
+ dropout_p=self.attention_dropout if self.training else 0.0,
497
+ is_causal=is_causal,
498
+ )
499
+
500
+ attn_output = attn_output.transpose(1, 2).contiguous()
501
+ attn_output = attn_output.view(bsz, q_len, -1)
502
+
503
+ attn_output = self.o_proj(attn_output)
504
+
505
+ return attn_output, None, past_key_value
506
+
507
+
508
+ LLAMA_ATTENTION_CLASSES = {
509
+ "eager": LlamaAttention,
510
+ "flash_attention_2": LlamaFlashAttention2,
511
+ "sdpa": LlamaSdpaAttention,
512
+ }
513
+
514
+
515
+ class LlamaDecoderLayer(nn.Module):
516
+ def __init__(self, config: LlamaConfig, layer_idx: int):
517
+ super().__init__()
518
+ self.hidden_size = config.hidden_size
519
+
520
+ self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
521
+
522
+ self.mlp = LlamaMLP(config)
523
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
524
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
525
+
526
+ def forward(
527
+ self,
528
+ hidden_states: torch.Tensor,
529
+ attention_mask: Optional[torch.Tensor] = None,
530
+ past_key_value: Optional[Cache] = None,
531
+ output_attentions: Optional[bool] = False,
532
+ use_cache: Optional[bool] = False,
533
+ cache_position: Optional[torch.LongTensor] = None,
534
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
535
+ **kwargs,
536
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
537
+ residual = hidden_states
538
+
539
+ hidden_states = self.input_layernorm(hidden_states)
540
+
541
+ # Self Attention
542
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
543
+ hidden_states=hidden_states,
544
+ attention_mask=attention_mask,
545
+ past_key_value=past_key_value,
546
+ output_attentions=output_attentions,
547
+ use_cache=use_cache,
548
+ cache_position=cache_position,
549
+ position_embeddings=position_embeddings,
550
+ **kwargs,
551
+ )
552
+ hidden_states = residual + hidden_states
553
+
554
+ # Fully Connected
555
+ residual = hidden_states
556
+ hidden_states = self.post_attention_layernorm(hidden_states)
557
+ hidden_states = self.mlp(hidden_states)
558
+ hidden_states = residual + hidden_states
559
+
560
+ outputs = (hidden_states,)
561
+
562
+ if output_attentions:
563
+ outputs += (self_attn_weights,)
564
+
565
+ if use_cache:
566
+ outputs += (present_key_value,)
567
+
568
+ return outputs
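The three attention variants above implement the same math with different kernels. A minimal standalone sketch (random tensors, no model weights involved) showing that the eager path's explicit `softmax(QK^T / sqrt(d))` with an additive causal mask matches the `scaled_dot_product_attention` kernel used by `LlamaSdpaAttention`:

```python
# Hedged sketch: eager attention vs. torch SDPA on random tensors.
# Shapes are illustrative; no checkpoint weights are used.
import math
import torch
import torch.nn.functional as F

bsz, n_heads, q_len, head_dim = 1, 4, 8, 16
q = torch.randn(bsz, n_heads, q_len, head_dim)
k = torch.randn(bsz, n_heads, q_len, head_dim)
v = torch.randn(bsz, n_heads, q_len, head_dim)

# eager path: explicit scores, additive causal mask, fp32 softmax
scores = q @ k.transpose(2, 3) / math.sqrt(head_dim)
causal = torch.triu(torch.full((q_len, q_len), float("-inf")), diagonal=1)
weights = F.softmax(scores + causal, dim=-1, dtype=torch.float32).to(q.dtype)
eager_out = weights @ v

# SDPA path: let the kernel apply the causal mask itself
sdpa_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(torch.allclose(eager_out, sdpa_out, atol=1e-5))  # expected: True
```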
models/nextstep_model.py ADDED
@@ -0,0 +1,553 @@
1
+ import os
2
+ import json
3
+ import inspect
4
+ from loguru import logger
5
+ from dataclasses import dataclass
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import CrossEntropyLoss
10
+
11
+ from safetensors.torch import safe_open
12
+ from transformers.modeling_utils import PreTrainedModel
13
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
14
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
15
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
16
+
17
+ from models.config import NextStepConfig
18
+ from models.llama_model import LlamaDecoderLayer, LlamaRMSNorm, LlamaRotaryEmbedding
19
+ from models.heads import FlowMatchingHead
20
+ from utils.misc import LargeInt
21
+ from utils.compile_utils import smart_compile
22
+ from utils.model_utils import get_2d_sincos_pos_embed
23
+
24
+
25
+ @dataclass
26
+ class NextStepOutputWithPast(CausalLMOutputWithPast):
27
+ lm_loss: torch.FloatTensor | None = None
28
+ im_loss: torch.FloatTensor | None = None
29
+
30
+
31
+ class NextStepPreTrainedModel(PreTrainedModel):
32
+ config_class = NextStepConfig
33
+ supports_gradient_checkpointing = True
34
+ _no_split_modules = ["LlamaDecoderLayer"]
35
+ _skip_keys_device_placement = ["past_key_values"]
36
+ _supports_flash_attn_2 = True
37
+ _supports_sdpa = True
38
+ _supports_cache_class = True
39
+ _supports_quantized_cache = True
40
+ _supports_static_cache = True
41
+
42
+ def _init_weights(self, module):
43
+ std = self.config.initializer_range
44
+ if isinstance(module, nn.Linear):
45
+ module.weight.data.normal_(mean=0.0, std=std)
46
+ if module.bias is not None:
47
+ module.bias.data.zero_()
48
+ elif isinstance(module, nn.Embedding):
49
+ module.weight.data.normal_(mean=0.0, std=std)
50
+ if module.padding_idx is not None:
51
+ module.weight.data[module.padding_idx].zero_()
52
+
53
+ @property
54
+ def trainable_params(self) -> float:
55
+ n_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
56
+ return LargeInt(n_params)
57
+
58
+
59
+ class NextStep(NextStepPreTrainedModel):
60
+
61
+ def __init__(self, config: NextStepConfig):
62
+ super().__init__(config)
63
+ self.padding_idx = config.pad_token_id
64
+ self.vocab_size = config.vocab_size
65
+
66
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
67
+
68
+ self.layers = nn.ModuleList([LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
69
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
70
+ self.rotary_emb = LlamaRotaryEmbedding(config=config)
71
+
72
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
73
+
74
+ self.gradient_checkpointing = False
75
+
76
+ # Initialize weights and apply final processing
77
+ self.post_init()
78
+
79
+ token_dim = self.config.latent_channels * self.config.latent_patch_size**2
80
+
81
+ self.image_in_projector = nn.Linear(token_dim, config.hidden_size)
82
+ self.image_in_projector.weight.data.normal_(mean=0.0, std=config.initializer_range)
83
+ self.image_in_projector.bias.data.zero_()
84
+
85
+ self.image_out_projector = nn.Linear(config.hidden_size, config.hidden_size)
86
+ self.image_out_projector.weight.data.normal_(mean=0.0, std=config.initializer_range)
87
+ self.image_out_projector.bias.data.zero_()
88
+
89
+ self.image_head = FlowMatchingHead(
90
+ input_dim=token_dim,
91
+ cond_dim=config.hidden_size,
92
+ dim=config.fm_head_dim,
93
+ layers=config.fm_head_layers,
94
+ )
95
+
96
+ if config.use_gen_pos_embed:
97
+ self.init_gen_pos_embed()
98
+
99
+ def init_gen_pos_embed(self):
100
+ self.register_buffer(
101
+ "gen_pos_embed",
102
+ torch.from_numpy(
103
+ get_2d_sincos_pos_embed(
104
+ self.config.hidden_size, self.config.base_image_grid_size
105
+ )
106
+ ).float().unsqueeze(0),
107
+ )
108
+
109
+ def gen_pos_embed_with_ar(self, h, w):
110
+ bsz, hw, dim = self.gen_pos_embed.shape
111
+ gen_pos_embed = self.gen_pos_embed.reshape(bsz, int(hw**0.5), int(hw**0.5), dim)
112
+ gen_pos_embed = gen_pos_embed[:, :h, :w, :]
113
+ gen_pos_embed = gen_pos_embed.reshape(bsz, -1, dim)
114
+ return gen_pos_embed
115
+
116
+ @property
117
+ def image_size(self):
118
+ return self.config.image_size
119
+
120
+ @property
121
+ def image_patch_size(self):
122
+ return self.config.patch_size
123
+
124
+ @property
125
+ def image_grid_size(self):
126
+ return round(self.image_size / self.image_patch_size)
127
+
128
+ def get_input_embeddings(self):
129
+ return self.embed_tokens
130
+
131
+ def set_input_embeddings(self, value):
132
+ self.embed_tokens = value
133
+
134
+ def get_output_embeddings(self):
135
+ return self.lm_head
136
+
137
+ def set_output_embeddings(self, new_embeddings):
138
+ self.lm_head = new_embeddings
139
+
140
+ def load_lm_head(self, lm_head_dir: str | None = None):
141
+ index_json_file = os.path.join(lm_head_dir, "model.safetensors.index.json")
142
+ head_weight_name = "lm_head.weight" if not self.config.tie_word_embeddings else "model.embed_tokens.weight"
143
+ if os.path.exists(index_json_file):
144
+ with open(index_json_file, "r") as f:
145
+ index = json.load(f)
146
+ model_name = index["weight_map"][head_weight_name]
147
+ else:
148
+ model_name = "model.safetensors"
149
+ with safe_open(os.path.join(lm_head_dir, model_name), framework="pt") as f:
150
+ loaded_weight = f.get_tensor(head_weight_name)
151
+ loaded_weight = loaded_weight.to(dtype=self.lm_head.weight.dtype, device=self.lm_head.weight.device)
152
+ self.lm_head.weight.data.copy_(loaded_weight)
153
+
154
+ def patchify(self, img: torch.Tensor):
155
+ """
156
+ img: (bsz, C, H, W)
157
+ x: (bsz, H * W / patch_size**2, patch_size**2 * C)
158
+ """
159
+ bsz, c, h, w = img.shape
160
+ p = self.config.latent_patch_size
161
+ h_, w_ = h // p, w // p
162
+
163
+ img = img.reshape(bsz, c, h_, p, w_, p)
164
+ img = torch.einsum("nchpwq->nhwcpq", img)
165
+ x = img.reshape(bsz, h_ * w_, c * p**2)
166
+ return x
167
+
168
+ def unpatchify(self, x: torch.Tensor, h: int = None, w: int = None):
169
+ """
170
+ x: (bsz, H * W / patch_size**2, patch_size**2 * C)
171
+ img: (bsz, C, H, W)
172
+ """
173
+ bsz = x.shape[0]
174
+ p = self.config.latent_patch_size
175
+ c = self.config.latent_channels
176
+ if h is None and w is None:
177
+ h_ = w_ = int(x.shape[1] ** 0.5)
178
+ else:
179
+ h_, w_ = h, w
180
+ assert h_ * w_ == x.shape[1], f"Invalid sequence length {x.shape[1]}."
181
+
182
+ x = x.reshape(bsz, h_, w_, c, p, p)
183
+ x = torch.einsum("nhwcpq->nchpwq", x)
184
+ img = x.reshape(bsz, c, h_ * p, w_ * p)
185
+ return img
186
+
187
+ def prepare_inputs_embeds(self, input_ids: torch.LongTensor | None = None, latents: torch.FloatTensor | None = None):
188
+ if latents is None:
189
+ if not self.training:
190
+ return self.embed_tokens(input_ids)
191
+ else: # dummy forward for the image pass, to keep gradient shapes consistent.
192
+ raise NotImplementedError("Dummy forward for image pass is not implemented.")
193
+ else:
194
+ bs, seq_length = input_ids.shape
195
+ inputs_embeds = torch.zeros(
196
+ (bs, seq_length, self.config.hidden_size),
197
+ device=self.embed_tokens.weight.device,
198
+ dtype=self.embed_tokens.weight.dtype,
199
+ )
200
+ im_indices = input_ids == self.config.image_placeholder_id
201
+ lm_indices = ~im_indices
202
+
203
+ if isinstance(latents, list):
204
+ tokens = torch.cat([self.patchify(latent) for latent in latents], dim=1)
205
+ else:
206
+ tokens = self.patchify(latents)
207
+ # tokens = tokens.reshape(1, -1, tokens.shape[-1])
208
+
209
+ image_embeds = self.image_in_projector(tokens)
210
+ image_embeds = image_embeds.view(-1, self.config.hidden_size)
211
+
212
+ token_embeds = self.embed_tokens(input_ids[lm_indices])
213
+
214
+ inputs_embeds[im_indices] = image_embeds.to(inputs_embeds.dtype)
215
+ inputs_embeds[lm_indices] = token_embeds
216
+
217
+ return inputs_embeds
218
+
219
+ def _update_causal_mask(
220
+ self,
221
+ attention_mask: torch.Tensor,
222
+ input_tensor: torch.Tensor,
223
+ cache_position: torch.Tensor,
224
+ past_key_values: Cache,
225
+ output_attentions: bool,
226
+ ):
227
+ if self.config._attn_implementation == "flash_attention_2":
228
+ if attention_mask is not None and (attention_mask == 0.0).any():
229
+ return attention_mask
230
+ return None
231
+
232
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
233
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
234
+ # to infer the attention mask.
235
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
236
+ using_static_cache = isinstance(past_key_values, StaticCache)
237
+
238
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
239
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
240
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
241
+ attention_mask,
242
+ inputs_embeds=input_tensor,
243
+ past_key_values_length=past_seen_tokens,
244
+ is_training=self.training,
245
+ ):
246
+ return None
247
+
248
+ dtype, device = input_tensor.dtype, input_tensor.device
249
+ sequence_length = input_tensor.shape[1]
250
+ if using_static_cache:
251
+ target_length = past_key_values.get_max_cache_shape()
252
+ else:
253
+ target_length = (
254
+ attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens + sequence_length + 1
255
+ )
256
+
257
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
258
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
259
+ attention_mask,
260
+ sequence_length=sequence_length,
261
+ target_length=target_length,
262
+ dtype=dtype,
263
+ device=device,
264
+ cache_position=cache_position,
265
+ batch_size=input_tensor.shape[0],
266
+ )
267
+
268
+ if (
269
+ self.config._attn_implementation == "sdpa"
270
+ and attention_mask is not None
271
+ and attention_mask.device.type == "cuda"
272
+ and not output_attentions
273
+ ):
274
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
275
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
276
+ # Details: https://github.com/pytorch/pytorch/issues/110213
277
+ min_dtype = torch.finfo(dtype).min
278
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
279
+
280
+ return causal_mask
281
+
282
+ @staticmethod
283
+ def _prepare_4d_causal_attention_mask_with_cache_position(
284
+ attention_mask: torch.Tensor,
285
+ sequence_length: int,
286
+ target_length: int,
287
+ dtype: torch.dtype,
288
+ device: torch.device,
289
+ cache_position: torch.Tensor,
290
+ batch_size: int,
291
+ **kwargs,
292
+ ):
293
+ """
294
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
295
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
296
+
297
+ Args:
298
+ attention_mask (`torch.Tensor`):
299
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
300
+ `(batch_size, 1, query_length, key_value_length)`.
301
+ sequence_length (`int`):
302
+ The sequence length being processed.
303
+ target_length (`int`):
304
+ The target length: when generating with static cache, the mask should be as long as the static cache,
305
+ to account for the 0 padding, the part of the cache that is not filled yet.
306
+ dtype (`torch.dtype`):
307
+ The dtype to use for the 4D attention mask.
308
+ device (`torch.device`):
309
+ The device to place the 4D attention mask on.
310
+ cache_position (`torch.Tensor`):
311
+ Indices depicting the position of the input sequence tokens in the sequence.
312
+ batch_size (`torch.Tensor`):
313
+ Batch size.
314
+ """
315
+ if attention_mask is not None and attention_mask.dim() == 4:
316
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
317
+ causal_mask = attention_mask
318
+ else:
319
+ min_dtype = torch.finfo(dtype).min
320
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
321
+ if sequence_length != 1:
322
+ causal_mask = torch.triu(causal_mask, diagonal=1)
323
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
324
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
325
+ if attention_mask is not None:
326
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
327
+ mask_length = attention_mask.shape[-1]
328
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
329
+ padding_mask = padding_mask == 0
330
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype)
331
+
332
+ return causal_mask
333
+
334
+ @smart_compile()
335
+ def forward_model(
336
+ self,
337
+ inputs_embeds: torch.FloatTensor | None = None,
338
+ attention_mask: torch.Tensor | None = None,
339
+ past_key_values: Cache | list[torch.FloatTensor] | None = None,
340
+ use_cache: bool | None = None,
341
+ output_attentions: bool | None = None,
342
+ output_hidden_states: bool | None = None,
343
+ cache_position: torch.LongTensor | None = None,
344
+ ) -> tuple | BaseModelOutputWithPast:
345
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
346
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
347
+
348
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
349
+ if self.gradient_checkpointing and self.training and use_cache:
350
+ use_cache = False
351
+
352
+ if use_cache and past_key_values is None:
353
+ past_key_values = DynamicCache()
354
+
355
+ if cache_position is None:
356
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
357
+ cache_position = torch.arange(
358
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
359
+ )
360
+ position_ids = cache_position.unsqueeze(0)
361
+
362
+ causal_mask = self._update_causal_mask(
363
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
364
+ )
365
+ hidden_states = inputs_embeds
366
+
367
+ # create position embeddings to be shared across the decoder layers
368
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
369
+
370
+ # decoder layers
371
+ all_hidden_states = () if output_hidden_states else None
372
+ all_self_attns = () if output_attentions else None
373
+
374
+ for decoder_layer in self.layers:
375
+ if output_hidden_states:
376
+ all_hidden_states += (hidden_states,)
377
+
378
+ if self.gradient_checkpointing and self.training:
379
+ layer_outputs = self._gradient_checkpointing_func(
380
+ decoder_layer.__call__,
381
+ hidden_states,
382
+ causal_mask,
383
+ past_key_values,
384
+ output_attentions,
385
+ use_cache,
386
+ cache_position,
387
+ position_embeddings,
388
+ )
389
+ else:
390
+ layer_outputs = decoder_layer(
391
+ hidden_states,
392
+ attention_mask=causal_mask,
393
+ past_key_value=past_key_values,
394
+ output_attentions=output_attentions,
395
+ use_cache=use_cache,
396
+ cache_position=cache_position,
397
+ position_embeddings=position_embeddings,
398
+ )
399
+
400
+ hidden_states = layer_outputs[0]
401
+
402
+ if output_attentions:
403
+ all_self_attns += (layer_outputs[1],)
404
+
405
+ hidden_states = self.norm(hidden_states)
406
+
407
+ # add hidden states from the last decoder layer
408
+ if output_hidden_states:
409
+ all_hidden_states += (hidden_states,)
410
+
411
+ return BaseModelOutputWithPast(
412
+ last_hidden_state=hidden_states,
413
+ past_key_values=past_key_values if use_cache else None,
414
+ hidden_states=all_hidden_states,
415
+ attentions=all_self_attns,
416
+ )
417
+
418
+
419
+
420
+ def prepare_inputs_for_generation(
421
+ self,
422
+ input_ids: torch.LongTensor,
423
+ past_key_values: Cache | None = None,
424
+ attention_mask: torch.LongTensor | None = None,
425
+ inputs_embeds: torch.FloatTensor | None = None,
426
+ cache_position: torch.LongTensor | None = None,
427
+ **kwargs,
428
+ ):
429
+ """
430
+ Prepare the model inputs for generation. It includes operations like computing the 4D attention mask or
431
+ slicing inputs given the existing cache.
432
+
433
+ See the forward pass in the model documentation for expected arguments (different models might have different
434
+ requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
435
+ """
436
+
437
+ # 1. Handle BC:
438
+ model_inputs = {}
439
+ # - some models don't have `Cache` support (which implies they don't expect `cache_position` in `forward`)
440
+ if self._supports_cache_class:
441
+ model_inputs["cache_position"] = cache_position
442
+ # - `cache_position` was not a mandatory input in `prepare_inputs_for_generation` for those models, and this
443
+ # function may be called outside of `generate`. Handle most use cases by creating `cache_position` on the fly
444
+ # (this alternative is not as robust as calling `generate` and letting it create `cache_position`)
445
+ elif cache_position is None:
446
+ past_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
447
+ cache_position = torch.arange(past_length, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
448
+
449
+ # 2. Generic cache-dependent input preparation
450
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
451
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
452
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
453
+ # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case
454
+ if past_key_values is not None:
455
+ model_inputs["past_key_values"] = past_key_values
456
+ if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 or Exception 3
457
+ input_ids = input_ids[:, -cache_position.shape[0] :]
458
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
459
+ input_ids = input_ids[:, cache_position]
460
+
461
+ # 3. Prepare base model inputs
462
+ input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
463
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
464
+ if not self.config.is_encoder_decoder:
465
+ if inputs_embeds is not None and cache_position[0] == 0:
466
+ model_inputs[input_ids_key] = None
467
+ model_inputs["inputs_embeds"] = inputs_embeds
468
+ else:
469
+ # `clone` calls in this function ensure a consistent stride. See #32227
470
+ model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)
471
+ model_inputs["inputs_embeds"] = None
472
+ else:
473
+ model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)
474
+
475
+ # 4. Create missing `position_ids` on the fly
476
+ if (
477
+ attention_mask is not None
478
+ and kwargs.get("position_ids") is None
479
+ and "position_ids" in set(inspect.signature(self.forward).parameters.keys())
480
+ ):
481
+ position_ids = attention_mask.long().cumsum(-1) - 1
482
+ position_ids.masked_fill_(attention_mask == 0, 1)
483
+ kwargs["position_ids"] = position_ids # placed in kwargs for further processing (see below)
484
+
485
+ # 5. Slice model inputs if it's an input that should have the same length as `input_ids`
486
+ for model_input_name in ["position_ids", "token_type_ids"]:
487
+ model_input = kwargs.get(model_input_name)
488
+ if model_input is not None:
489
+ if past_key_values:
490
+ model_input = model_input[:, -input_ids.shape[1] :]
491
+ model_input = model_input.clone(memory_format=torch.contiguous_format)
492
+ model_inputs[model_input_name] = model_input
493
+
494
+ # 6. Create 4D attention mask if we are using a `StaticCache` (important for performant compiled forward pass)
495
+ if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
496
+ if model_inputs["inputs_embeds"] is not None:
497
+ batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
498
+ device = model_inputs["inputs_embeds"].device
499
+ else:
500
+ batch_size, sequence_length = model_inputs[input_ids_key].shape
501
+ device = model_inputs[input_ids_key].device
502
+
503
+ # Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create
504
+ # the 4D causal mask exists, it should be present in the base model (XXXModel class).
505
+ base_model = getattr(self, self.base_model_prefix, None)
506
+ if base_model is None:
507
+ causal_mask_creation_function = getattr(self, "_prepare_4d_causal_attention_mask_with_cache_position", None)
508
+ else:
509
+ causal_mask_creation_function = getattr(
510
+ base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None
511
+ )
512
+ if causal_mask_creation_function is None:
513
+ logger.warning_once(
514
+ f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` method "
515
+ "defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're "
516
+ "writing code, see Llama for an example implementation. If you're a user, please report this "
517
+ "issue on GitHub."
518
+ )
519
+ else:
520
+ attention_mask = causal_mask_creation_function(
521
+ attention_mask,
522
+ sequence_length=sequence_length,
523
+ target_length=past_key_values.get_max_cache_shape(),
524
+ dtype=self.dtype,
525
+ device=device,
526
+ cache_position=cache_position,
527
+ batch_size=batch_size,
528
+ config=self.config,
529
+ past_key_values=past_key_values,
530
+ )
531
+ if attention_mask is not None:
532
+ model_inputs["attention_mask"] = attention_mask
533
+
534
+ # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
535
+ for key, value in kwargs.items():
536
+ if key not in model_inputs:
537
+ model_inputs[key] = value
538
+
539
+ # 8. Remove unexpected `generate` inputs (TODO @joao: fix trainer and examples)
540
+ model_inputs.pop("labels", None)
541
+ return model_inputs
542
+
543
+ @torch.no_grad()
544
+ def generate(self, inputs: torch.LongTensor = None, **kwargs):
545
+ input_ids = kwargs.pop("input_ids")
546
+ latents = kwargs.pop("latents", None)
547
+ inputs_embeds = self.prepare_inputs_embeds(input_ids, latents)
548
+ return super().generate(inputs=inputs, input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs)
549
+
550
+ def gradient_checkpointing_enable(self, **kwargs):
551
+ super().gradient_checkpointing_enable(**kwargs)
552
+
553
+ self.image_head.net.grad_checkpointing = True
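As a sanity check on the `patchify`/`unpatchify` pair defined above, here is a standalone sketch of the same einsum-based reshaping. The values `latent_patch_size=2` and `latent_channels=16` are assumptions for illustration; the real values come from `NextStepConfig`.

```python
# Standalone sketch of the patchify/unpatchify round trip used by NextStep.
import torch

p, c = 2, 16                       # assumed latent_patch_size, latent_channels
latents = torch.randn(1, c, 8, 8)  # (bsz, C, H, W)

def patchify(img, p):
    bsz, c, h, w = img.shape
    h_, w_ = h // p, w // p
    img = img.reshape(bsz, c, h_, p, w_, p)
    img = torch.einsum("nchpwq->nhwcpq", img)
    return img.reshape(bsz, h_ * w_, c * p**2)

def unpatchify(x, p, c):
    bsz = x.shape[0]
    h_ = w_ = int(x.shape[1] ** 0.5)
    x = x.reshape(bsz, h_, w_, c, p, p)
    x = torch.einsum("nhwcpq->nchpwq", x)
    return x.reshape(bsz, c, h_ * p, w_ * p)

tokens = patchify(latents, p)           # (1, 16, 64): 4x4 patches, each 2*2*16 values
recon = unpatchify(tokens, p, c)
print(tokens.shape, torch.equal(recon, latents))  # torch.Size([1, 16, 64]) True
```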
pytorch-model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:909083ea5deb26b37d66d4966869b0c3310c06aa72a88974c293d0fe62a489b9
3
+ size 9962132680
pytorch-model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:630af8946d2326406253ede6e3cb143d935ba19595059ce56af86fd50442e2d3
3
+ size 9909693448
pytorch-model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47a71e20a18992b9008bae6fa523b69824c1e67b5656bbd8fa6b442bf7405c72
3
+ size 8478742432
pytorch-model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b5f5fe250f5cbce219aadb61c9a44903739e510a66184cc960bfce87175bc34
3
+ size 1557135464
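The four `pytorch-model-*.safetensors` entries above are Git LFS pointer stubs (oid + size), not the tensors themselves. Once the shards have actually been fetched (for example with `git lfs pull` or the `hf` CLI), their contents can be listed with `safetensors`; a minimal sketch, assuming the shard sits in the working directory:

```python
# Minimal sketch: listing tensor names in one downloaded weight shard.
# Assumes the LFS pointer has been resolved to the real file locally.
from safetensors import safe_open

with safe_open("pytorch-model-00001-of-00004.safetensors", framework="pt", device="cpu") as f:
    names = list(f.keys())
    print(len(names), names[:3])
```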
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ diffusers==0.34.0
2
+ einops==0.8.1
3
+ gradio==5.42.0
4
+ loguru==0.7.3
5
+ numpy==1.26.4
6
+ omegaconf==2.3.0
7
+ Pillow==11.0.0
8
+ Requests==2.32.4
9
+ safetensors==0.5.3
10
+ tabulate==0.9.0
11
+ torch==2.5.1
12
+ torchvision==0.20.1
13
+ tqdm==4.67.1
14
+ transformers==4.55.0
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|image_area|>",
4
+ "<|begin_of_image|>",
5
+ "<|end_of_image|>",
6
+ "<|image_placeholder|>",
7
+ "<|begin_of_prompt_refinement|>",
8
+ "<|end_of_prompt_refinement|>",
9
+ "<|begin_of_thinking|>",
10
+ "<|end_of_thinking|>",
11
+ "<|beginoftext|>"
12
+ ],
13
+ "eos_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ },
20
+ "pad_token": {
21
+ "content": "[PAD]",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ }
27
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:310b48c809fba04c32e7f7cdac4d0fb1c00140d8914e0b0163307f64e5330a92
3
+ size 11423853
tokenizer_config.json ADDED
@@ -0,0 +1,285 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "[PAD]",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|image_area|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|begin_of_image|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|end_of_image|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|image_placeholder|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|begin_of_prompt_refinement|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<|end_of_prompt_refinement|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<|begin_of_thinking|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<|end_of_thinking|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<|beginoftext|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ }
261
+ },
262
+ "additional_special_tokens": [
263
+ "<|image_area|>",
264
+ "<|begin_of_image|>",
265
+ "<|end_of_image|>",
266
+ "<|image_placeholder|>",
267
+ "<|begin_of_prompt_refinement|>",
268
+ "<|end_of_prompt_refinement|>",
269
+ "<|begin_of_thinking|>",
270
+ "<|end_of_thinking|>",
271
+ "<|beginoftext|>"
272
+ ],
273
+ "bos_token": null,
274
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
275
+ "clean_up_tokenization_spaces": false,
276
+ "eos_token": "<|endoftext|>",
277
+ "errors": "replace",
278
+ "extra_special_tokens": {},
279
+ "model_max_length": 8192,
280
+ "pad_token": "[PAD]",
281
+ "padding_side": "right",
282
+ "split_special_tokens": false,
283
+ "tokenizer_class": "Qwen2Tokenizer",
284
+ "unk_token": null
285
+ }
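The table above pins the extra image/thinking tokens to fixed ids and defines a standard Qwen2-style `chat_template`. A hedged sketch of loading this tokenizer and rendering the template (purely illustrative; the loading kwargs are assumptions, and the printed id follows the mapping shown above):

```python
# Hedged sketch: inspecting the special tokens and chat template configured above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stepfun-ai/NextStep-1.1-Pretrain", trust_remote_code=True)

print(tokenizer.eos_token, tokenizer.pad_token)                  # <|endoftext|> [PAD]
print(tokenizer.convert_tokens_to_ids("<|image_placeholder|>"))  # 151669 per the table above

messages = [{"role": "user", "content": "A watercolor painting of a lighthouse at dawn"}]
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# A watercolor painting of a lighthouse at dawn<|im_end|>
# <|im_start|>assistant
```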
utils/__pycache__/compile_utils.cpython-310.pyc ADDED
Binary file (2.72 kB)
utils/__pycache__/image_utils.cpython-310.pyc ADDED
Binary file (8.46 kB)
utils/__pycache__/misc.cpython-310.pyc ADDED
Binary file (2.03 kB)
utils/__pycache__/model_utils.cpython-310.pyc ADDED
Binary file (4.38 kB)
utils/aspect_ratio.py ADDED
@@ -0,0 +1,107 @@
1
+ import numpy as np
2
+ import PIL.Image
3
+
4
+ ANY_ASPECT_RATIO = (0, 0)
5
+
6
+ HW_ASPECT_RATIOS = [
7
+ (8, 32), # 256
8
+ (9, 28), # 252
9
+ (10, 25), # 250
10
+ (11, 23), # 253
11
+ (12, 21), # 252
12
+ (13, 19), # 247
13
+ (14, 18), # 252
14
+ (15, 17), # 255
15
+ (16, 16), # 256
16
+ (17, 15), # 255
17
+ (18, 14), # 252
18
+ (19, 13), # 247
19
+ (21, 12), # 252
20
+ (23, 11), # 253
21
+ (25, 10), # 250
22
+ (28, 9), # 252
23
+ (32, 8), # 256
24
+ ]
25
+
26
+
27
+ def get_ar_base(ars: list[tuple[int, int]] = HW_ASPECT_RATIOS):
28
+ sqrt_products = [round(np.sqrt(h * w)) for h, w in ars]
29
+ return round(np.mean(sqrt_products))
30
+
31
+
32
+ def ar2str(h: int, w: int) -> str:
33
+ return f"{h}*{w}"
34
+
35
+
36
+ def str2ar(s: str) -> tuple[int, int]:
37
+ return tuple(map(int, s.split("*")))
38
+
39
+ def center_crop_arr_with_buckets(pil_image, ars: list[tuple[int, int]] = HW_ASPECT_RATIOS, crop=True, buckets: list[int] = [256, 512, 768, 1024]):
40
+ """
41
+ Center crop the image to match the closest aspect ratio from the provided list.
42
+
43
+ Args:
44
+ pil_image: PIL Image to be cropped
45
+ image_size: Target size for the smaller dimension
46
+ ars: List of aspect ratios as (height, width) tuples
47
+
48
+ Returns:
49
+ PIL Image cropped to the closest aspect ratio
50
+ """
51
+ # ar_base = get_ar_base(ars)
52
+ # Get current image dimensions
53
+ width, height = pil_image.size
54
+
55
+ buckets = sorted(buckets, reverse=True)
56
+ image_size = buckets[-1]
57
+
58
+ for bucket in buckets:
59
+ if width * height >= bucket * bucket:
60
+ image_size = bucket
61
+ break
62
+
63
+ return center_crop_arr_with_ar(pil_image, image_size, ars, crop)
64
+
65
+ def center_crop_arr_with_ar(pil_image, image_size: int, ars: list[tuple[int, int]] = HW_ASPECT_RATIOS, crop=True):
66
+ """
67
+ Center crop the image to match the closest aspect ratio from the provided list.
68
+
69
+ Args:
70
+ pil_image: PIL Image to be cropped
71
+ image_sizes: Target size for the smaller dimension
72
+ ars: List of aspect ratios as (height, width) tuples
73
+
74
+ Returns:
75
+ PIL Image cropped to the closest aspect ratio
76
+ """
77
+
78
+ ar_base = get_ar_base(ars)
79
+ assert image_size % ar_base == 0, f"image_size must be divisible by {ar_base}"
80
+
81
+ # Get current image dimensions
82
+ width, height = pil_image.size
83
+
84
+ current_ar = height / width
85
+
86
+ # Find the closest aspect ratio
87
+ closest_ar_idx = np.argmin([abs(current_ar - (h / w)) for h, w in ars])
88
+ target_h, target_w = ars[closest_ar_idx]
89
+
90
+ if crop:
91
+ target_h, target_w = round(image_size / ar_base * target_h), round(image_size / ar_base * target_w)
92
+
93
+ # First, resize the image while maintaining aspect ratio to ensure the smaller dimension is at least the target size
94
+ scale = max(target_h / height, target_w / width)
95
+ new_height = round(height * scale)
96
+ new_width = round(width * scale)
97
+ pil_image = pil_image.resize((new_width, new_height), resample=PIL.Image.LANCZOS)
98
+
99
+ arr = np.array(pil_image)
100
+ # Then perform center crop to the target dimensions
101
+ crop_y = (new_height - target_h) // 2
102
+ crop_x = (new_width - target_w) // 2
103
+
104
+ return PIL.Image.fromarray(arr[crop_y : crop_y + target_h, crop_x : crop_x + target_w])
105
+ else:
106
+ scale = image_size // ar_base
107
+ return pil_image.resize((round(target_w * scale), round(target_h * scale)), resample=PIL.Image.LANCZOS)
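A usage sketch for the bucketing helpers above. The 1200x800 input is an arbitrary example, and the import assumes the repo root is on `PYTHONPATH`; the expected outputs follow from the ratio table and the default 256/512/768/1024 buckets.

```python
# Hedged usage sketch for the aspect-ratio bucketing helpers above.
import PIL.Image
from utils.aspect_ratio import HW_ASPECT_RATIOS, center_crop_arr_with_buckets, get_ar_base

print(get_ar_base(HW_ASPECT_RATIOS))     # 16: each (h, w) pair multiplies to roughly 16*16

img = PIL.Image.new("RGB", (1200, 800))  # PIL size is (width, height), so h/w = 2/3
out = center_crop_arr_with_buckets(img)  # area >= 768^2, so the 768 bucket is chosen
print(out.size)                          # (912, 624): the (13, 19) ratio scaled by 768 / 16
```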
utils/compile_utils.py ADDED
@@ -0,0 +1,122 @@
1
+ import contextlib
2
+ import functools
3
+ import os
4
+ from typing import Callable, Dict, Optional
5
+
6
+ import torch
7
+
8
+ from loguru import logger
9
+
10
+ """
11
+ Usage:
12
+
13
+ 1. Control through environment variable (at startup):
14
+ export ENABLE_TORCH_COMPILE=true
15
+ python your_script.py
16
+
17
+ 2. Control through environment variable (disable):
18
+ export ENABLE_TORCH_COMPILE=false # or not set
19
+ python your_script.py
20
+
21
+ 3. Dynamically control in code:
22
+ compile_manager.set_compile_enabled(True) # enable
23
+ compile_manager.set_compile_enabled(False) # disable
24
+
25
+ 4. Select version at runtime:
26
+ # use the version configured
27
+ result = my_function(args)
28
+
29
+ # force use the original version
30
+ result = my_function.original(args)
31
+
32
+ # force use the compiled version
33
+ result = my_function.compiled(args)
34
+ """
35
+
36
+ # Global configuration: control whether to enable compile through environment variables
37
+ # Compilation is disabled by default; set the env var to "true" to enable it
38
+ ENABLE_TORCH_COMPILE = os.getenv("ENABLE_TORCH_COMPILE", "false").lower() == "true"
39
+
40
+
41
+ class CompileManager:
42
+ """Global controller for torch.compile"""
43
+
44
+ def __init__(self):
45
+ self.compile_enabled = ENABLE_TORCH_COMPILE
46
+ self.compiled_functions: Dict[str, Callable] = {}
47
+ self.original_functions: Dict[str, Callable] = {}
48
+
49
+ def set_compile_enabled(self, enabled: bool):
50
+ """Dynamic setting of whether to enable compile"""
51
+ self.compile_enabled = enabled
52
+
53
+ def get_compile_status(self):
54
+ """Get the current compile status"""
55
+ return self.compile_enabled
56
+
57
+ @contextlib.contextmanager
58
+ def compile_disabled(self):
59
+ """Temporarily disable compile within the context"""
60
+ original_status = self.compile_enabled
61
+ try:
62
+ self.compile_enabled = False
63
+ yield
64
+ finally:
65
+ self.compile_enabled = original_status
66
+
67
+
68
+ # global instance
69
+ compile_manager = CompileManager()
70
+
71
+
72
+ def smart_compile(func: Optional[Callable] = None, **compile_kwargs):
73
+ """
74
+ Smart compile decorator
75
+
76
+ Args:
77
+ func: The function to decorate
78
+ **compile_kwargs: Other compile parameters, see https://pytorch.org/docs/stable/generated/torch.compile.html
79
+ """
80
+
81
+ def decorator(fn: Callable) -> Callable:
82
+ # save the original function
83
+ original_func = fn
84
+ # Use qualified name to handle functions with same name in different classes
85
+ # Include module name to handle functions with same name in different files
86
+ func_name = f"{fn.__module__}.{fn.__qualname__}"
87
+ compile_manager.original_functions[func_name] = original_func
88
+
89
+ # if compile is disabled, return the original function
90
+ if not compile_manager.compile_enabled:
91
+ # add attributes to the original function for later access
92
+ original_func.original = original_func
93
+ original_func.compiled = original_func # point to itself
94
+ return original_func
95
+
96
+ # create the compiled function
97
+ try:
98
+ compiled_func = torch.compile(original_func, **compile_kwargs)
99
+ compile_manager.compiled_functions[func_name] = compiled_func
100
+ except Exception as e:
101
+ logger.warning(f"[WARNING] Failed to compile function {func_name}: {e}")
102
+ # if compile fails, revert to the original function
103
+ compiled_func = original_func
104
+
105
+ @functools.wraps(original_func)
106
+ def wrapper(*args, **kwargs):
107
+ # check whether to use the compiled version at runtime
108
+ if compile_manager.compile_enabled:
109
+ return compiled_func(*args, **kwargs)
110
+ else:
111
+ return original_func(*args, **kwargs)
112
+
113
+ # add attributes to the wrapper for later access
114
+ wrapper.original = original_func
115
+ wrapper.compiled = compiled_func
116
+
117
+ return wrapper
118
+
119
+ # support direct use of @smart_compile or @smart_compile(...)
120
+ if func is not None:
121
+ return decorator(func)
122
+ return decorator
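A small sketch of the decorator in action. `scaled_residual` is a made-up example function, and whether compilation actually happens depends on `ENABLE_TORCH_COMPILE` at import time.

```python
# Hedged sketch of smart_compile usage; scaled_residual is an illustrative function, not repo code.
import torch
from utils.compile_utils import compile_manager, smart_compile

@smart_compile(dynamic=True)
def scaled_residual(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    return x + 0.5 * y

x, y = torch.randn(4, 8), torch.randn(4, 8)
out = scaled_residual(x, y)                 # compiled only if ENABLE_TORCH_COMPILE=true was set
out_eager = scaled_residual.original(x, y)  # always the uncompiled version
print(torch.allclose(out, out_eager))       # True

with compile_manager.compile_disabled():
    _ = scaled_residual(x, y)               # forced eager inside this context
```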
utils/image_utils.py ADDED
@@ -0,0 +1,314 @@
1
+ import io
2
+ import os
3
+ from typing import Literal, TypeAlias
4
+
5
+ import numpy as np
6
+ import PIL.Image
7
+ import PIL.ImageOps
8
+ import requests
9
+ import torch
10
+
11
+ """
12
+ - pil: `PIL.Image.Image`, size (w, h), seamless conversion between `uint8`
13
+ - np: `np.ndarray`, shape (h, w, c), default `np.uint8`
14
+ - pt: `torch.Tensor`, shape (c, h, w), default `torch.uint8`
15
+ """
16
+ ImageType: TypeAlias = PIL.Image.Image | np.ndarray | torch.Tensor
17
+ ImageTypeStr: TypeAlias = Literal["pil", "np", "pt"]
18
+ ImageFormat: TypeAlias = Literal["JPEG", "PNG"]
19
+ DataFormat: TypeAlias = Literal["255", "01", "11"]
20
+
21
+
22
+ IMG_SUPPORT_MODE = ["L", "LA", "RGB", "RGBA", "CMYK", "P", "1"]
23
+ IMAGE_EXT_LOWER = ["png", "jpeg", "jpg", "webp"]
24
+ IMAGE_EXT = IMAGE_EXT_LOWER + [_ext.upper() for _ext in IMAGE_EXT_LOWER]
25
+
26
+
27
+ def check_image_type(image: ImageType):
28
+ if not (isinstance(image, PIL.Image.Image) or isinstance(image, np.ndarray) or isinstance(image, torch.Tensor)):
29
+ raise TypeError(f"`image` should be PIL Image, ndarray or Tensor. Got `{type(image)}`.")
30
+
31
+
32
+ def to_rgb(image: PIL.Image.Image) -> PIL.Image.Image:
33
+ # Automatically adjust the orientation of the image to match the direction it was taken.
34
+ image = PIL.ImageOps.exif_transpose(image)
35
+
36
+ if image.mode not in IMG_SUPPORT_MODE:
37
+ raise ValueError(f"Only support mode in `{IMG_SUPPORT_MODE}`, got `{image.mode}`")
38
+
39
+ if image.mode == "LA":
40
+ image = image.convert("RGBA")
41
+
42
+ # add white background for RGBA images, and convert to RGB
43
+ if image.mode == "RGBA":
44
+ background = PIL.Image.new("RGBA", image.size, "white")
45
+ image = PIL.Image.alpha_composite(background, image).convert("RGB")
46
+
47
+ # then convert to RGB
48
+ image = image.convert("RGB")
49
+
50
+ return image
51
+
52
+
53
+ def load_image(
54
+ image: str | os.PathLike | PIL.Image.Image | bytes,
55
+ *,
56
+ output_type: ImageTypeStr = "pil",
57
+ ) -> ImageType:
58
+ """
59
+ Loads `image` to a PIL Image, NumPy array or PyTorch tensor.
60
+
61
+ Args:
62
+ image (str | os.PathLike | PIL.Image.Image | bytes): The path or URL of the image, a PIL Image, or raw image bytes.
64
+ Inputs are EXIF-transposed and converted to RGB (LA/RGBA images are alpha-composited
65
+ over a white background) before the output-type conversion.
65
+ output_type (ImageTypeStr, optional): The type of the output image. Defaults to "pil".
66
+ The current version supports "pil", "np", "pt".
67
+
68
+ Returns:
69
+ ImageType: The loaded image in the given type.
70
+ """
71
+ timeout = 10
72
+ # Load the `image` into a PIL Image.
73
+ if isinstance(image, str) or isinstance(image, os.PathLike):
74
+ if str(image).startswith(("http://", "https://")):  # str() so os.PathLike inputs are handled too
75
+ try:
76
+ image = PIL.Image.open(requests.get(image, stream=True, timeout=timeout).raw)
77
+ except requests.exceptions.Timeout:
78
+ raise ValueError(f"HTTP request timed out after {timeout} seconds")
79
+ elif os.path.isfile(image):
80
+ image = PIL.Image.open(image)
81
+ else:
82
+ raise ValueError(
83
+ f"Incorrect path or URL: URLs must start with `http://` or `https://`, and `{image}` is not a valid local file path."
84
+ )
85
+ elif isinstance(image, PIL.Image.Image):
86
+ image = image
87
+ elif isinstance(image, bytes):
88
+ image = PIL.Image.open(io.BytesIO(image))
89
+ else:
90
+ raise ValueError(f"`image` must be a path or PIL Image, got `{type(image)}`")
91
+
92
+ image = to_rgb(image)
93
+
94
+ if output_type == "pil":
95
+ image = image
96
+ elif output_type == "np":
97
+ image = to_np(image)
98
+ elif output_type == "pt":
99
+ image = to_pt(image)
100
+ else:
101
+ raise ValueError(f"`output_type` must be one of `{ImageTypeStr}`, got `{output_type}`")
102
+
103
+ return image
104
+
105
+
106
+ def to_pil(image: ImageType, image_mode: DataFormat | None = None) -> PIL.Image.Image:
107
+ """
108
+ Convert a NumPy array or a PyTorch tensor to a PIL image.
109
+ """
110
+ check_image_type(image)
111
+
112
+ if isinstance(image, PIL.Image.Image):
113
+ return image
114
+
115
+ elif isinstance(image, np.ndarray):
116
+ image = normalize_np(image, image_mode)
117
+
118
+ elif isinstance(image, torch.Tensor):
119
+ image = normalize_pt(image, image_mode)
120
+
121
+ image = image.cpu().permute(1, 2, 0).numpy()
122
+ assert image.dtype == np.uint8, f"Supposed to convert `torch.uint8` to `np.uint8`, but got `{image.dtype}`"
123
+
124
+ mode_map = {1: "L", 3: "RGB"}
125
+ mode = mode_map[image.shape[-1]]
126
+
127
+ if image.shape[-1] == 1:
128
+ image = image[:, :, 0]
129
+
130
+ return PIL.Image.fromarray(image, mode=mode)
131
+
132
+
133
+ def to_np(image: ImageType, image_mode: DataFormat | None = None) -> np.ndarray:
134
+ """
135
+ Convert a PIL image or a PyTorch tensor to a NumPy array.
136
+ """
137
+ check_image_type(image)
138
+
139
+ if isinstance(image, PIL.Image.Image):
140
+ image = np.array(image, np.uint8, copy=True)
141
+
142
+ if isinstance(image, np.ndarray):
143
+ image = normalize_np(image, image_mode)
144
+
145
+ elif isinstance(image, torch.Tensor):
146
+ image = normalize_pt(image, image_mode)
147
+
148
+ image = image.cpu().permute(1, 2, 0).numpy()
149
+ assert image.dtype == np.uint8, f"Supposed to convert `torch.uint8` to `np.uint8`, but got `{image.dtype}`"
150
+
151
+ return image
152
+
153
+
154
+ def to_pt(image: ImageType, image_mode: DataFormat | None = None) -> torch.Tensor:
155
+ """
156
+ Convert a PIL image or a NumPy array to a PyTorch tensor.
157
+ """
158
+ check_image_type(image)
159
+
160
+ if isinstance(image, torch.Tensor):
161
+ image = normalize_pt(image, image_mode)
162
+ return image
163
+
164
+ # convert PIL Image to NumPy array
165
+ if isinstance(image, PIL.Image.Image):
166
+ image = np.array(image, np.uint8, copy=True)
167
+
168
+ image = normalize_np(image, image_mode)
169
+
170
+ image = torch.from_numpy(image.transpose((2, 0, 1))).contiguous()
171
+ assert image.dtype == torch.uint8, f"Supposed to convert `np.uint8` to `torch.uint8`, but got `{image.dtype}`"
172
+ return image
173
+
174
+
175
+ def normalize_np(image: np.ndarray, image_mode: DataFormat | None = None) -> np.ndarray:
176
+ """
177
+ Normalize a NumPy array to the standard format of shape (h, w, c) and uint8.
178
+ """
179
+ if image.ndim not in {2, 3}:
180
+ raise ValueError(f"`image` should be 2 or 3 dimensions. Got {image.ndim} dimensions.")
181
+
182
+ elif image.ndim == 2:
183
+ # if 2D image, add channel dimension (HWC)
184
+ image = np.expand_dims(image, 2)
185
+
186
+ if image.shape[-1] not in {1, 3}:
187
+ raise ValueError(f"`image` should have 1 (`L`) or 3 (`RGB`) channels. Got {image.shape[-1]} channels.")
188
+
189
+ image = to_dataformat(image, image_mode=image_mode, mode="255")
190
+
191
+ return image
192
+
193
+
194
+ def normalize_pt(image: torch.Tensor, image_mode: DataFormat | None = None) -> torch.Tensor:
195
+ """
196
+ Normalize a PyTorch tensor to the standard format of shape (c, h, w) and uint8.
197
+ """
198
+ if image.ndimension() not in {2, 3}:
199
+ raise ValueError(f"`image` should be 2 or 3 dimensions. Got {image.ndimension()} dimensions.")
200
+
201
+ elif image.ndimension() == 2:
202
+ # if 2D image, add channel dimension (CHW)
203
+ image = image.unsqueeze(0)
204
+
205
+ # check number of channels
206
+ if image.shape[-3] not in {1, 3}:
207
+ raise ValueError(f"`image` should have 1 (`L`) or 3 (`RGB`) channels. Got {image.shape[-3]} channels.")
208
+
209
+ image = to_dataformat(image, image_mode=image_mode, mode="255")
210
+
211
+ return image
212
+
213
+
214
+ def to_dataformat(
215
+ image: ImageType,
216
+ *,
217
+ image_mode: DataFormat | None = None,
218
+ mode: DataFormat = "255",
219
+ ) -> np.ndarray | torch.Tensor:
220
+ check_image_type(image)
221
+
222
+ # convert PIL Image to NumPy array
223
+ if isinstance(image, PIL.Image.Image):
224
+ image = np.array(image, np.uint8, copy=True)
225
+ image_mode = "255"
226
+
227
+ # guess image mode
228
+ if image.dtype == np.uint8 or image.dtype == torch.uint8:
229
+ guess_image_mode = "255"
230
+ elif image.dtype == np.float32 or image.dtype == np.float16 or image.dtype == torch.float32 or image.dtype == torch.float16:
231
+ if image.min() < 0.0:
232
+ guess_image_mode = "11"
233
+ else:
234
+ guess_image_mode = "01"
235
+ else:
236
+ raise ValueError(f"Unsupported dtype `{image.dtype}`")
237
+
238
+ if image_mode is None:
239
+ image_mode = guess_image_mode
240
+ else:
241
+ if guess_image_mode != image_mode:
242
+ print(f"Guess image mode is `{guess_image_mode}`, but image mode is `{image_mode}`")
243
+
244
+ if isinstance(image, np.ndarray):
245
+ if image_mode == "255" and mode != "255":
246
+ image = np.clip(image.astype(np.float32) / 255, 0, 1)  # assign; clipping into the uint8 buffer would fail
247
+ if mode == "11":
248
+ image = np.clip(image * 2 - 1, -1, 1)
249
+
250
+ elif image_mode == "01" and mode != "01":
251
+ if mode == "255":
252
+ np.clip(image, 0, 1, out=image)
253
+ image = (image * 255).round().astype(np.uint8)
254
+ elif mode == "11":
255
+ np.clip((image * 2 - 1), -1, 1, out=image)
256
+
257
+ elif image_mode == "11" and mode != "11":
258
+ np.clip((image / 2 + 0.5), 0, 1, out=image)
259
+ if mode == "255":
260
+ image = (image * 255).round().astype(np.uint8)
261
+
262
+ elif isinstance(image, torch.Tensor):
263
+ if image_mode == "255" and mode != "255":
264
+ image = image.to(dtype=torch.float32).div(255).clamp(0, 1)
265
+ if mode == "11":
266
+ image = (image * 2 - 1).clamp(-1, 1)
267
+
268
+ elif image_mode == "01" and mode != "01":
269
+ if mode == "255":
270
+ image = image.clamp(0, 1)
271
+ image = (image * 255).round().to(dtype=torch.uint8)
272
+ elif mode == "11":
273
+ image = (image * 2 - 1).clamp(-1, 1)
274
+
275
+ elif image_mode == "11" and mode != "11":
276
+ image = (image / 2 + 0.5).clamp(0, 1)
277
+ if mode == "255":
278
+ image = image.mul(255).round().to(dtype=torch.uint8)
279
+
280
+ return image
281
+
282
+
283
+ def resize_image(pil_image, image_size):
284
+ while min(*pil_image.size) >= 2 * image_size:
285
+ pil_image = pil_image.resize(tuple(x // 2 for x in pil_image.size), resample=PIL.Image.BOX)
286
+
287
+ scale = image_size / min(*pil_image.size)
288
+ pil_image = pil_image.resize(tuple(round(x * scale) for x in pil_image.size), resample=PIL.Image.BICUBIC)
289
+ return pil_image
290
+
291
+
292
+ def center_crop_arr(pil_image, image_size, crop=True):
293
+ """
294
+ Center cropping implementation from ADM.
295
+ https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
296
+ """
297
+ if crop:
298
+ pil_image = resize_image(pil_image, image_size)
299
+ arr = np.array(pil_image)
300
+ crop_y = (arr.shape[0] - image_size) // 2
301
+ crop_x = (arr.shape[1] - image_size) // 2
302
+ return PIL.Image.fromarray(arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size])
303
+ else:
304
+ # Pad the image to a square
305
+ width, height = pil_image.size
306
+ if width != height:
307
+ # Create a square canvas whose side is the larger of width/height
308
+ max_dim = max(width, height)
309
+ padded_img = PIL.Image.new(pil_image.mode, (max_dim, max_dim), (0, 0, 0))
310
+ # Paste the original image centered on the square canvas
311
+ padded_img.paste(pil_image, ((max_dim - width) // 2, (max_dim - height) // 2))
312
+ pil_image = padded_img
313
+ pil_image = resize_image(pil_image, image_size)
314
+ return pil_image
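
A quick round-trip through the three supported representations, as a sanity check of the helpers above; the synthetic image and sizes are arbitrary examples:

```python
import numpy as np
import PIL.Image

from utils.image_utils import center_crop_arr, load_image, to_np, to_pil, to_pt

# A synthetic 300x200 RGB image stands in for a real file, URL, or bytes object.
rgb = PIL.Image.fromarray(np.random.randint(0, 256, (200, 300, 3), dtype=np.uint8))

pil_img = load_image(rgb)              # EXIF-transposed, forced to RGB
np_img = to_np(pil_img)                # (h, w, c), np.uint8
pt_img = to_pt(np_img)                 # (c, h, w), torch.uint8
print(np_img.shape, pt_img.shape)      # (200, 300, 3) torch.Size([3, 200, 300])

# Back to PIL, then center-crop to a 256x256 patch.
restored = to_pil(pt_img)
cropped = center_crop_arr(restored, 256)
print(cropped.size)                    # (256, 256)
```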
utils/misc.py ADDED
@@ -0,0 +1,51 @@
1
+ import os
2
+ import numpy as np
3
+ import random
4
+
5
+ import torch
6
+
7
+
8
+ def set_seed(seed: int, rank: int = 0):
9
+ random.seed(seed + rank)
10
+ np.random.seed(seed + rank)
11
+ torch.manual_seed(seed + rank)
12
+ torch.cuda.manual_seed_all(seed + rank)
13
+ torch.backends.cudnn.deterministic = True
14
+ os.environ["PYTHONHASHSEED"] = str(seed + rank)
15
+
16
+ class LargeInt(int):
17
+ def __new__(cls, value):
18
+ if isinstance(value, str):
19
+ units = {"K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}
20
+ last_char = value[-1].upper()
21
+ if last_char in units:
22
+ num = float(value[:-1]) * units[last_char]
23
+ return super(LargeInt, cls).__new__(cls, int(num))
24
+ else:
25
+ return super(LargeInt, cls).__new__(cls, int(value))
26
+ else:
27
+ return super(LargeInt, cls).__new__(cls, value)
28
+
29
+ def __str__(self):
30
+ value = int(self)
31
+ if abs(value) < 1000:
32
+ return f"{value}"
33
+ for unit in ["", "K", "M", "B", "T"]:
34
+ if abs(value) < 1000:
35
+ return f"{value:.1f}{unit}"
36
+ value /= 1000
37
+ return f"{value:.1f}P" # P stands for Peta, or 10^15
38
+
39
+ def __repr__(self):
40
+ return f'"{self.__str__()}"' # Ensure repr also returns the string with quotes
41
+
42
+ def __json__(self):
43
+ return f'"{self.__str__()}"'
44
+
45
+ def __add__(self, other):
46
+ if isinstance(other, int):
47
+ return LargeInt(super().__add__(other))
48
+ return NotImplemented
49
+
50
+ def __radd__(self, other):
51
+ return self.__add__(other) # This ensures commutativity
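
`LargeInt` parses human-readable magnitudes such as "1.4B" into exact integers and prints them back in the same compact form; it is what `trainable_params` and `params_info` in `vae/nextstep_ae.py` (below) use to report parameter counts. A small illustration:

```python
from utils.misc import LargeInt, set_seed

set_seed(42)            # seeds python `random`, numpy, and torch in one call

n = LargeInt("1.4B")
print(int(n))           # 1400000000
print(n)                # 1.4B
print(n + 600_000_000)  # 2.0B -- addition with plain ints returns another LargeInt
```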
utils/model_utils.py ADDED
@@ -0,0 +1,128 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0, pe_interpolation=1.0):
6
+ """
7
+ grid_size: int of the grid height and width
8
+ return:
9
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
10
+ """
11
+ grid_h = np.arange(grid_size, dtype=np.float32) / pe_interpolation
12
+ grid_w = np.arange(grid_size, dtype=np.float32) / pe_interpolation
13
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
14
+ grid = np.stack(grid, axis=0)
15
+
16
+ grid = grid.reshape([2, 1, grid_size, grid_size])
17
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
18
+ if cls_token and extra_tokens > 0:
19
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
20
+ return pos_embed
21
+
22
+
23
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
24
+ assert embed_dim % 2 == 0
25
+
26
+ # use half of dimensions to encode grid_h
27
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
28
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
29
+
30
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
31
+ return emb
32
+
33
+
34
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
35
+ """
36
+ embed_dim: output dimension for each position
37
+ pos: a list of positions to be encoded: size (M,)
38
+ out: (M, D)
39
+ """
40
+ assert embed_dim % 2 == 0
41
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
42
+ omega /= embed_dim / 2.0
43
+ omega = 1.0 / 10000**omega # (D/2,)
44
+
45
+ pos = pos.reshape(-1) # (M,)
46
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
47
+
48
+ emb_sin = np.sin(out) # (M, D/2)
49
+ emb_cos = np.cos(out) # (M, D/2)
50
+
51
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
52
+ return emb
53
+
54
+
55
+ def expand_t(t, x):
56
+ """Function to reshape time t to broadcastable dimension of x
57
+ Args:
58
+ t: [bsz,], time vector
59
+ x: [bsz,...], data point
60
+ """
61
+ dims = [1] * (len(x.size()) - 1)
62
+ t = t.view(t.size(0), *dims)
63
+ return t
64
+
65
+
66
+ def randn_tensor(shape, noise_repeat, device, dtype=torch.float32):
67
+ bsz = shape[0]
68
+ if bsz % noise_repeat != 0:
69
+ raise ValueError(f"Batch size ({bsz}) must be divisible by noise repeat ({noise_repeat})")
70
+ _shape = (noise_repeat,) + shape[1:]
71
+ _tensor = torch.randn(_shape, device=device, dtype=dtype).repeat(bsz // noise_repeat, *([1] * (len(shape) - 1)))  # repeat along the batch dim only, so N-D shapes work
72
+ return _tensor
73
+
74
+
75
+ def rotate_half(x):
76
+ """Rotates half the hidden dims of the input."""
77
+ x1 = x[..., : x.shape[-1] // 2]
78
+ x2 = x[..., x.shape[-1] // 2 :]
79
+ return torch.cat((-x2, x1), dim=-1)
80
+
81
+
82
+ def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
83
+ cos = cos.unsqueeze(unsqueeze_dim)
84
+ sin = sin.unsqueeze(unsqueeze_dim)
85
+ q_embed = (q * cos) + (rotate_half(q) * sin)
86
+ k_embed = (k * cos) + (rotate_half(k) * sin)
87
+ return q_embed, k_embed
88
+
89
+
90
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
91
+ """
92
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
93
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
94
+ """
95
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
96
+ if n_rep == 1:
97
+ return hidden_states
98
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
99
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
100
+
101
+
102
+ def identity(input: torch.Tensor, *args, **kwargs) -> torch.Tensor:
103
+ return input
104
+
105
+
106
+ def rms_norm(
107
+ input: torch.Tensor,
108
+ normalized_shape: torch.Size,
109
+ eps: float = 1e-6,
110
+ ) -> torch.Tensor:
111
+ dtype = input.dtype
112
+ input = input.to(torch.float32)
113
+ variance = input.flatten(-len(normalized_shape)).pow(2).mean(dim=-1)[(...,) + (None,) * len(normalized_shape)]
114
+ input = input * torch.rsqrt(variance + eps)
115
+ return input.to(dtype)
116
+
117
+
118
+ def layer_norm(
119
+ input: torch.Tensor,
120
+ normalized_shape: torch.Size,
121
+ eps: float = 1e-6,
122
+ ) -> torch.Tensor:
123
+ dtype = input.dtype
124
+ input = input.to(torch.float32)
125
+ mean = input.flatten(-len(normalized_shape)).mean(dim=-1)[(...,) + (None,) * len(normalized_shape)]
126
+ variance = (input - mean).flatten(-len(normalized_shape)).pow(2).mean(dim=-1)[(...,) + (None,) * len(normalized_shape)]
127
+ input = (input - mean) * torch.rsqrt(variance + eps)
128
+ return input.to(dtype)
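
These helpers are small, shape-oriented building blocks; the snippet below only confirms the shapes they produce (grid size, head counts, and dimensions are arbitrary examples):

```python
import torch

from utils.model_utils import get_2d_sincos_pos_embed, repeat_kv, rms_norm

# 2D sin-cos positional embedding for a 16x16 token grid with 64 dims per token.
pos = get_2d_sincos_pos_embed(embed_dim=64, grid_size=16)
print(pos.shape)                        # (256, 64)

# Grouped-query attention: expand 2 KV heads to match 8 query heads.
kv = torch.randn(1, 2, 10, 32)          # (batch, kv_heads, seq_len, head_dim)
print(repeat_kv(kv, n_rep=4).shape)     # torch.Size([1, 8, 10, 32])

# Functional RMSNorm over the last dimension, computed in float32.
x = torch.randn(4, 128)
print(rms_norm(x, x.shape[-1:]).shape)  # torch.Size([4, 128])
```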
vae/__pycache__/nextstep_ae.cpython-310.pyc ADDED
Binary file (15.1 kB). View file
 
vae/checkpoint.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99293255229a29297e2851858db3794497d1b0b09b20c308c1062636ea4bcdd9
3
+ size 335365010
vae/config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "resolution": 256,
3
+ "in_channels": 3,
4
+ "ch": 128,
5
+ "out_ch": 3,
6
+ "ch_mult": [1, 2, 4, 4],
7
+ "num_res_blocks": 2,
8
+ "z_channels": 16,
9
+ "shift_factor": 0,
10
+ "scaling_factor": 1,
11
+ "deterministic": true,
12
+ "encoder_norm": true,
13
+ "psz": 1
14
+ }
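
These values are merged over the `AutoEncoderParams` defaults by `AutoencoderKL.from_pretrained` in `vae/nextstep_ae.py` (below), so the released VAE runs with a deterministic posterior, unit scaling, zero shift, and a normalized encoder. Roughly, the merge amounts to the following sketch (it assumes the repository root is on `PYTHONPATH` and uses `dataclasses.fields` in place of the signature inspection done in the actual loader):

```python
import json
from dataclasses import fields

from vae.nextstep_ae import AutoEncoderParams

with open("vae/config.json") as f:
    cfg = json.load(f)

# Keep only keys that AutoEncoderParams actually defines, as from_pretrained does.
known = {f.name for f in fields(AutoEncoderParams)}
params = AutoEncoderParams(**{k: v for k, v in cfg.items() if k in known})
print(params.deterministic, params.scaling_factor, params.shift_factor)  # True 1 0
```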
vae/nextstep_ae.py ADDED
@@ -0,0 +1,494 @@
1
+ import os
2
+ import json
3
+ import inspect
4
+ from dataclasses import dataclass, field, asdict
5
+ from loguru import logger
6
+ from omegaconf import OmegaConf
7
+ from tabulate import tabulate
8
+ from einops import rearrange
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from torch import Tensor
14
+ from torch.utils.checkpoint import checkpoint
15
+
16
+ from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
17
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
18
+
19
+ from utils.misc import LargeInt
20
+ from utils.model_utils import randn_tensor
21
+ from utils.compile_utils import smart_compile
22
+
23
+
24
+ @dataclass
25
+ class AutoEncoderParams:
26
+ resolution: int = 256
27
+ in_channels: int = 3
28
+ ch: int = 128
29
+ out_ch: int = 3
30
+ ch_mult: list[int] = field(default_factory=lambda: [1, 2, 4, 4])
31
+ num_res_blocks: int = 2
32
+ z_channels: int = 16
33
+ scaling_factor: float = 0.3611
34
+ shift_factor: float = 0.1159
35
+ deterministic: bool = False
36
+ encoder_norm: bool = False
37
+ psz: int | None = None
38
+
39
+
40
+ def swish(x: Tensor) -> Tensor:
41
+ return x * torch.sigmoid(x)
42
+
43
+
44
+ class AttnBlock(nn.Module):
45
+ def __init__(self, in_channels: int):
46
+ super().__init__()
47
+ self.in_channels = in_channels
48
+
49
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
50
+
51
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
52
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
53
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
54
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
55
+
56
+ def attention(self, h_: Tensor) -> Tensor:
57
+ h_ = self.norm(h_)
58
+ q = self.q(h_)
59
+ k = self.k(h_)
60
+ v = self.v(h_)
61
+
62
+ b, c, h, w = q.shape
63
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
64
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
65
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
66
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
67
+
68
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
69
+
70
+ def forward(self, x: Tensor) -> Tensor:
71
+ return x + self.proj_out(self.attention(x))
72
+
73
+
74
+ class ResnetBlock(nn.Module):
75
+ def __init__(self, in_channels: int, out_channels: int):
76
+ super().__init__()
77
+ self.in_channels = in_channels
78
+ out_channels = in_channels if out_channels is None else out_channels
79
+ self.out_channels = out_channels
80
+
81
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
82
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
83
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
84
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
85
+ if self.in_channels != self.out_channels:
86
+ self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
87
+
88
+ def forward(self, x):
89
+ h = x
90
+ h = self.norm1(h)
91
+ h = swish(h)
92
+ h = self.conv1(h)
93
+
94
+ h = self.norm2(h)
95
+ h = swish(h)
96
+ h = self.conv2(h)
97
+
98
+ if self.in_channels != self.out_channels:
99
+ x = self.nin_shortcut(x)
100
+
101
+ return x + h
102
+
103
+
104
+ class Downsample(nn.Module):
105
+ def __init__(self, in_channels: int):
106
+ super().__init__()
107
+ # no asymmetric padding in torch conv, must do it ourselves
108
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
109
+
110
+ def forward(self, x: Tensor):
111
+ pad = (0, 1, 0, 1)
112
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
113
+ x = self.conv(x)
114
+ return x
115
+
116
+
117
+ class Upsample(nn.Module):
118
+ def __init__(self, in_channels: int):
119
+ super().__init__()
120
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
121
+
122
+ def forward(self, x: Tensor):
123
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
124
+ x = self.conv(x)
125
+ return x
126
+
127
+
128
+ class Encoder(nn.Module):
129
+ def __init__(
130
+ self,
131
+ resolution: int,
132
+ in_channels: int,
133
+ ch: int,
134
+ ch_mult: list[int],
135
+ num_res_blocks: int,
136
+ z_channels: int,
137
+ ):
138
+ super().__init__()
139
+ self.ch = ch
140
+ self.num_resolutions = len(ch_mult)
141
+ self.num_res_blocks = num_res_blocks
142
+ self.resolution = resolution
143
+ self.in_channels = in_channels
144
+ # downsampling
145
+ self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
146
+
147
+ curr_res = resolution
148
+ in_ch_mult = (1,) + tuple(ch_mult)
149
+ self.in_ch_mult = in_ch_mult
150
+ self.down = nn.ModuleList()
151
+ block_in = self.ch
152
+ for i_level in range(self.num_resolutions):
153
+ block = nn.ModuleList()
154
+ attn = nn.ModuleList()
155
+ block_in = ch * in_ch_mult[i_level]
156
+ block_out = ch * ch_mult[i_level]
157
+ for _ in range(self.num_res_blocks):
158
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
159
+ block_in = block_out
160
+ down = nn.Module()
161
+ down.block = block
162
+ down.attn = attn
163
+ if i_level != self.num_resolutions - 1:
164
+ down.downsample = Downsample(block_in)
165
+ curr_res = curr_res // 2
166
+ self.down.append(down)
167
+
168
+ # middle
169
+ self.mid = nn.Module()
170
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
171
+ self.mid.attn_1 = AttnBlock(block_in)
172
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
173
+
174
+ # end
175
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
176
+ self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
177
+
178
+ self.grad_checkpointing = False
179
+
180
+ @smart_compile()
181
+ def forward(self, x: Tensor) -> Tensor:
182
+ # downsampling
183
+ hs = [self.conv_in(x)]
184
+ for i_level in range(self.num_resolutions):
185
+ for i_block in range(self.num_res_blocks):
186
+ block_fn = self.down[i_level].block[i_block]
187
+ if self.grad_checkpointing:
188
+ h = checkpoint(block_fn, hs[-1])
189
+ else:
190
+ h = block_fn(hs[-1])
191
+ if len(self.down[i_level].attn) > 0:
192
+ attn_fn = self.down[i_level].attn[i_block]
193
+ if self.grad_checkpointing:
194
+ h = checkpoint(attn_fn, h)
195
+ else:
196
+ h = attn_fn(h)
197
+ hs.append(h)
198
+ if i_level != self.num_resolutions - 1:
199
+ hs.append(self.down[i_level].downsample(hs[-1]))
200
+
201
+ # middle
202
+ h = hs[-1]
203
+ h = self.mid.block_1(h)
204
+ h = self.mid.attn_1(h)
205
+ h = self.mid.block_2(h)
206
+ # end
207
+ h = self.norm_out(h)
208
+ h = swish(h)
209
+ h = self.conv_out(h)
210
+ return h
211
+
212
+
213
+ class Decoder(nn.Module):
214
+ def __init__(
215
+ self,
216
+ ch: int,
217
+ out_ch: int,
218
+ ch_mult: list[int],
219
+ num_res_blocks: int,
220
+ in_channels: int,
221
+ resolution: int,
222
+ z_channels: int,
223
+ ):
224
+ super().__init__()
225
+ self.ch = ch
226
+ self.num_resolutions = len(ch_mult)
227
+ self.num_res_blocks = num_res_blocks
228
+ self.resolution = resolution
229
+ self.in_channels = in_channels
230
+ self.ffactor = 2 ** (self.num_resolutions - 1)
231
+
232
+ # compute in_ch_mult, block_in and curr_res at lowest res
233
+ block_in = ch * ch_mult[self.num_resolutions - 1]
234
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
235
+ self.z_shape = (1, z_channels, curr_res, curr_res)
236
+
237
+ # z to block_in
238
+ self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
239
+
240
+ # middle
241
+ self.mid = nn.Module()
242
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
243
+ self.mid.attn_1 = AttnBlock(block_in)
244
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
245
+
246
+ # upsampling
247
+ self.up = nn.ModuleList()
248
+ for i_level in reversed(range(self.num_resolutions)):
249
+ block = nn.ModuleList()
250
+ attn = nn.ModuleList()
251
+ block_out = ch * ch_mult[i_level]
252
+ for _ in range(self.num_res_blocks + 1):
253
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
254
+ block_in = block_out
255
+ up = nn.Module()
256
+ up.block = block
257
+ up.attn = attn
258
+ if i_level != 0:
259
+ up.upsample = Upsample(block_in)
260
+ curr_res = curr_res * 2
261
+ self.up.insert(0, up) # prepend to get consistent order
262
+
263
+ # end
264
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
265
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
266
+
267
+ self.grad_checkpointing = False
268
+
269
+ @smart_compile()
270
+ def forward(self, z: Tensor) -> Tensor:
271
+ # get dtype for proper tracing
272
+ upscale_dtype = next(self.up.parameters()).dtype
273
+
274
+ # z to block_in
275
+ h = self.conv_in(z)
276
+
277
+ # middle
278
+ h = self.mid.block_1(h)
279
+ h = self.mid.attn_1(h)
280
+ h = self.mid.block_2(h)
281
+
282
+ # cast to proper dtype
283
+ h = h.to(upscale_dtype)
284
+ # upsampling
285
+ for i_level in reversed(range(self.num_resolutions)):
286
+ for i_block in range(self.num_res_blocks + 1):
287
+ block_fn = self.up[i_level].block[i_block]
288
+ if self.grad_checkpointing:
289
+ h = checkpoint(block_fn, h)
290
+ else:
291
+ h = block_fn(h)
292
+ if len(self.up[i_level].attn) > 0:
293
+ attn_fn = self.up[i_level].attn[i_block]
294
+ if self.grad_checkpointing:
295
+ h = checkpoint(attn_fn, h)
296
+ else:
297
+ h = attn_fn(h)
298
+ if i_level != 0:
299
+ h = self.up[i_level].upsample(h)
300
+
301
+ # end
302
+ h = self.norm_out(h)
303
+ h = swish(h)
304
+ h = self.conv_out(h)
305
+ return h
306
+
307
+
308
+ def layer_norm_2d(input: torch.Tensor, normalized_shape: torch.Size, eps: float = 1e-6) -> torch.Tensor:
309
+ # input.shape = (bsz, c, h, w)
310
+ _input = input.permute(0, 2, 3, 1)
311
+ _input = F.layer_norm(_input, normalized_shape, None, None, eps)
312
+ _input = _input.permute(0, 3, 1, 2)
313
+ return _input
314
+
315
+
316
+ class AutoencoderKL(nn.Module):
317
+ def __init__(self, params: AutoEncoderParams):
318
+ super().__init__()
319
+ self.config = params
320
+ self.config = OmegaConf.create(asdict(self.config))
321
+ self.config.latent_channels = params.z_channels
322
+ self.config.block_out_channels = params.ch_mult
323
+
324
+ self.params = params
325
+ self.encoder = Encoder(
326
+ resolution=params.resolution,
327
+ in_channels=params.in_channels,
328
+ ch=params.ch,
329
+ ch_mult=params.ch_mult,
330
+ num_res_blocks=params.num_res_blocks,
331
+ z_channels=params.z_channels,
332
+ )
333
+ self.decoder = Decoder(
334
+ resolution=params.resolution,
335
+ in_channels=params.in_channels,
336
+ ch=params.ch,
337
+ out_ch=params.out_ch,
338
+ ch_mult=params.ch_mult,
339
+ num_res_blocks=params.num_res_blocks,
340
+ z_channels=params.z_channels,
341
+ )
342
+
343
+ self.encoder_norm = params.encoder_norm
344
+ self.psz = params.psz
345
+
346
+ self.apply(self._init_weights)
347
+
348
+ def _init_weights(self, module):
349
+ std = 0.02
350
+ if isinstance(module, (nn.Conv2d, nn.Linear)):
351
+ module.weight.data.normal_(mean=0.0, std=std)
352
+ if module.bias is not None:
353
+ module.bias.data.zero_()
354
+ elif isinstance(module, nn.GroupNorm):
355
+ if module.weight is not None:
356
+ module.weight.data.fill_(1.0)
357
+ if module.bias is not None:
358
+ module.bias.data.zero_()
359
+
360
+ def gradient_checkpointing_enable(self):
361
+ self.encoder.grad_checkpointing = True
362
+ self.decoder.grad_checkpointing = True
363
+
364
+ @property
365
+ def dtype(self):
366
+ return self.encoder.conv_in.weight.dtype
367
+
368
+ @property
369
+ def device(self):
370
+ return self.encoder.conv_in.weight.device
371
+
372
+ @property
373
+ def trainable_params(self) -> float:
374
+ n_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
375
+ return LargeInt(n_params)
376
+
377
+ @property
378
+ def params_info(self) -> str:
379
+ encoder_params = str(LargeInt(sum(p.numel() for p in self.encoder.parameters())))
380
+ decoder_params = str(LargeInt(sum(p.numel() for p in self.decoder.parameters())))
381
+ table = [["encoder", encoder_params], ["decoder", decoder_params]]
382
+ return tabulate(table, headers=["Module", "Params"], tablefmt="grid")
383
+
384
+ def get_last_layer(self):
385
+ return self.decoder.conv_out.weight
386
+
387
+ def patchify(self, img: torch.Tensor):
388
+ """
389
+ img: (bsz, C, H, W)
390
+ x: (bsz, patch_size**2 * C, H / patch_size, W / patch_size)
391
+ """
392
+ bsz, c, h, w = img.shape
393
+ p = self.psz
394
+ h_, w_ = h // p, w // p
395
+
396
+ img = img.reshape(bsz, c, h_, p, w_, p)
397
+ img = torch.einsum("nchpwq->ncpqhw", img)
398
+ x = img.reshape(bsz, c * p**2, h_, w_)
399
+ return x
400
+
401
+ def unpatchify(self, x: torch.Tensor):
402
+ """
403
+ x: (bsz, patch_size**2 * C, H / patch_size, W / patch_size)
404
+ img: (bsz, C, H, W)
405
+ """
406
+ bsz = x.shape[0]
407
+ p = self.psz
408
+ c = self.config.latent_channels
409
+ h_, w_ = x.shape[2], x.shape[3]
410
+
411
+ x = x.reshape(bsz, c, p, p, h_, w_)
412
+ x = torch.einsum("ncpqhw->nchpwq", x)
413
+ img = x.reshape(bsz, c, h_ * p, w_ * p)
414
+ return img
415
+
416
+ def encode(self, x: torch.Tensor, return_dict: bool = True):
417
+ moments = self.encoder(x)
418
+
419
+ mean, logvar = torch.chunk(moments, 2, dim=1)
420
+ if self.psz is not None:
421
+ mean = self.patchify(mean)
422
+
423
+ if self.encoder_norm:
424
+ mean = layer_norm_2d(mean, mean.size()[1:2])  # normalize over the channel dim (channels-last inside layer_norm_2d)
425
+
426
+ if self.psz is not None:
427
+ mean = self.unpatchify(mean)
428
+
429
+ moments = torch.cat([mean, logvar], dim=1).contiguous()
430
+
431
+ posterior = DiagonalGaussianDistribution(moments, deterministic=self.params.deterministic)
432
+
433
+ if not return_dict:
434
+ return (posterior,)
435
+
436
+ return AutoencoderKLOutput(latent_dist=posterior)
437
+
438
+ def decode(self, z: torch.Tensor, return_dict: bool = True):
439
+ dec = self.decoder(z)
440
+
441
+ if not return_dict:
442
+ return (dec,)
443
+
444
+ return DecoderOutput(sample=dec)
445
+
446
+ def forward(self, input, sample_posterior=True, noise_strength=0.0):
447
+ posterior = self.encode(input).latent_dist
448
+ z = posterior.sample() if sample_posterior else posterior.mode()
449
+ if noise_strength > 0.0:
450
+ p = torch.distributions.Uniform(0, noise_strength)
451
+ z = z + p.sample((z.shape[0],)).reshape(-1, 1, 1, 1).to(z.device) * randn_tensor(
452
+ z.shape, 1, device=z.device, dtype=z.dtype
453
+ )
454
+ dec = self.decode(z).sample
455
+ return dec, posterior
456
+
457
+ @classmethod
458
+ def from_pretrained(cls, model_path, **kwargs):
459
+ config_path = os.path.join(model_path, "config.json")
460
+ ckpt_path = os.path.join(model_path, "checkpoint.pt")
461
+
462
+ if not os.path.isdir(model_path) or not os.path.isfile(config_path) or not os.path.isfile(ckpt_path):
463
+ raise ValueError(
464
+ f"Invalid model path: {model_path}. The path should contain both config.json and checkpoint.pt files."
465
+ )
466
+
467
+ state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
468
+
469
+ with open(config_path, "r") as f:
470
+ config: dict = json.load(f)
471
+ config.update(kwargs)
472
+ kwargs = config
473
+
474
+ # Filter out kwargs that are not in AutoEncoderParams
475
+ # This ensures we only pass parameters that the model can accept
476
+ valid_kwargs = {}
477
+ param_signature = inspect.signature(AutoEncoderParams.__init__).parameters
478
+ for key, value in kwargs.items():
479
+ if key in param_signature:
480
+ valid_kwargs[key] = value
481
+ else:
482
+ logger.info(f"Ignoring parameter '{key}' as it's not defined in AutoEncoderParams")
483
+
484
+ params = AutoEncoderParams(**valid_kwargs)
485
+ model = cls(params)
486
+ try:
487
+ msg = model.load_state_dict(state_dict, strict=False)
488
+ logger.info(f"Loaded state_dict from {ckpt_path}")
489
+ logger.info(f"Missing keys:\n{msg.missing_keys}")
490
+ logger.info(f"Unexpected keys:\n{msg.unexpected_keys}")
491
+ except Exception as e:
492
+ logger.error(e)
493
+ logger.warning(f"Failed to load state_dict from {ckpt_path}, using random initialization")
494
+ return model
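
Putting the pieces together, here is a minimal sketch that loads the VAE from the `vae/` folder fetched during setup and round-trips a single image through `encode`/`decode`; the image path is a placeholder, and because the config sets `deterministic: true`, sampling the posterior reduces to its mean.

```python
import torch

from utils.image_utils import center_crop_arr, load_image, to_pil, to_pt
from vae.nextstep_ae import AutoencoderKL

device = "cuda" if torch.cuda.is_available() else "cpu"

# "vae/" must contain config.json and checkpoint.pt (downloaded in the setup step).
vae = AutoencoderKL.from_pretrained("vae").to(device).eval()

# Placeholder path; any RGB image works. Center-crop to the native 256 resolution.
img = to_pt(center_crop_arr(load_image("example.jpg"), 256))  # (3, 256, 256), uint8
x = (img.float() / 255 * 2 - 1).unsqueeze(0).to(device)       # (1, 3, 256, 256) in [-1, 1]

with torch.no_grad():
    z = vae.encode(x).latent_dist.sample()                    # (1, 16, 32, 32)
    rec = vae.decode(z).sample                                 # (1, 3, 256, 256)

rec_u8 = ((rec[0].clamp(-1, 1) + 1) / 2 * 255).round().to(torch.uint8)
to_pil(rec_u8.cpu()).save("reconstruction.png")
```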
vocab.json ADDED
The diff for this file is too large to render. See raw diff