huangrh9 committed
Commit 16ac099 · verified · 1 Parent(s): 52c0b51

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer_config.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
aspect_ratio_utils.py ADDED
@@ -0,0 +1,196 @@
1
+ from torchvision import transforms
2
+ import numpy as np
3
+ import math
4
+ import torch
5
+ from PIL import Image, ImageOps
6
+
7
+
8
+ RATIOS = [
9
+ (512, 512),
10
+ (384, 512),
11
+ (512, 384),
12
+ (384, 768),
13
+ (768, 384),
14
+ (384, 576),
15
+ (576, 384),
16
+ (320, 960),
17
+ (960, 320),
18
+ (256, 1024),
19
+ (1024, 256),
20
+ ]
21
+
22
+ RATIO_TYPES = [
23
+ 'ratio_h512_w512',
24
+ 'ratio_h384_w512',
25
+ 'ratio_h512_w384',
26
+ 'ratio_h384_w768',
27
+ 'ratio_h768_w384',
28
+ 'ratio_h384_w576',
29
+ 'ratio_h576_w384',
30
+ 'ratio_h320_w960',
31
+ 'ratio_h960_w320',
32
+ 'ratio_h256_w1024',
33
+ 'ratio_h1024_w256',
34
+ ]
35
+
36
+
37
+ def center_crop_and_resize(img, output_size=(256, 256)):
38
+ target_h, target_w = output_size
39
+ img_w, img_h = img.size
40
+
41
+ scale_w, scale_h = img_w / target_w, img_h / target_h
42
+ if scale_h > scale_w:
43
+ new_w, new_h = target_w, int(target_w / img_w * img_h)
44
+ else:
45
+ new_w, new_h = int(target_h / img_h * img_w), target_h
46
+
47
+ # Resize the image, keeping the aspect ratio
48
+ img = img.resize((new_w, new_h), Image.LANCZOS)
49
+
50
+ # Calculate the center cropping area
51
+ left = (new_w - target_w) // 2
52
+ top = (new_h - target_h) // 2
53
+ right = left + target_w
54
+ bottom = top + target_h
55
+
56
+ # Crop the extra part
57
+ img = img.crop((left, top, right, bottom))
58
+
59
+ return img
60
+
61
+ def resize_with_padding(img, output_size=(256, 256), fill_color=(0, 0, 0)):
62
+ target_height, target_width = output_size
63
+
64
+ # Step 1: Resize with aspect ratio preserved
65
+ original_width, original_height = img.size
66
+ ratio = min(target_width / original_width, target_height / original_height)
67
+ new_size = (int(original_width * ratio), int(original_height * ratio))
68
+ resized_image = img.resize(new_size, Image.LANCZOS)
69
+
70
+ # Step 2: Add padding to reach target size
71
+ delta_w = target_width - new_size[0]
72
+ delta_h = target_height - new_size[1]
73
+ padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
74
+ padded_image = ImageOps.expand(resized_image, padding, fill=fill_color)
75
+
76
+ return padded_image
77
+
78
+
79
+ def unpad_and_resize_back(padded_image, original_width, original_height):
80
+ """
81
+ Revert the padded+resized image back to original size.
82
+
83
+ Args:
84
+ padded_image (PIL.Image): Image after padding.
85
+ original_width (int): Original image width before resize & pad.
86
+ original_height (int): Original image height before resize & pad.
87
+
88
+ Returns:
89
+ PIL.Image: Image resized back to original resolution.
90
+ """
91
+ # Compute the scale factor used during the first resize
92
+ target_width, target_height = padded_image.size
93
+ ratio = min(target_width / original_width, target_height / original_height)
94
+ resized_w = int(original_width * ratio)
95
+ resized_h = int(original_height * ratio)
96
+
97
+ # Compute cropping box on padded image
98
+ left = (target_width - resized_w) // 2
99
+ upper = (target_height - resized_h) // 2
100
+ right = left + resized_w
101
+ lower = upper + resized_h
102
+
103
+ # Crop out the resized region (before padding)
104
+ cropped_image = padded_image.crop((left, upper, right, lower))
105
+
106
+ # Resize back to original resolution
107
+ recovered_image = cropped_image.resize((original_width, original_height), Image.LANCZOS)
108
+ return recovered_image
109
+
110
+
111
+ def calculate_ratio():
112
+ max_area = 512 * 512
113
+ ratios = [(2, 2), (3, 4), (4, 3), (2, 4), (4, 2), (1, 4), (4, 1), (2, 3), (3, 2), (1, 3), (3, 1)]
114
+ ratio_candidates = []
115
+ for ratio in ratios:
116
+ x = math.sqrt(max_area / ratio[0] / ratio[1])
117
+ x = round(x / 64) * 64
118
+ tmp = (x*ratio[0], x*ratio[1])
119
+ # print(ratio, x, tmp)
120
+ ratio_candidates.append(tmp)
121
+
122
+ print("ratio_candicates", ratio_candicates)
123
+ return ratio_candidates
124
+
125
+
126
+ class AspectRatioCrop(object):
127
+ """
128
+ Aspect Ratio Crop transform.
129
+ For a given image, find the corresponding aspect ratio and
130
+ resize / resize + crop to the corresponding base sizes
131
+
132
+ Args:
133
+ base_sizes: list[tuple], the base sizes of final output.
134
+ For example, [(512, 512), (512, 768), (768, 512)]
135
+
136
+ crop_percent_thresh: float. If the fraction of image area that must be cropped to reach the base size exceeds this threshold, the sample is flagged as unmatched.
137
+ """
138
+
139
+ def __init__(self, base_sizes, crop_percent_thresh=0.2):
140
+ self.base_sizes = [(math.floor(h), math.floor(w)) for (h, w) in base_sizes]
141
+ self.aspect_ratios = [x[1] / x[0] for x in self.base_sizes] # w / h
142
+ self.crop_percent_thresh = crop_percent_thresh
143
+
144
+ def _find_size(self, w, h):
145
+ base_size_indexes = list(range(len(self.base_sizes)))
146
+ aspect_ratios = [self.aspect_ratios[i] for i in base_size_indexes]
147
+ aspect_ratio = w / h
148
+ ratio_diff = [abs(ratio - aspect_ratio) for ratio in aspect_ratios]
149
+ min_diff = np.min(ratio_diff)
150
+ match_diff_indexes = [j for j in range(len(ratio_diff)) if ratio_diff[j] == min_diff]
151
+ match_diff_indexes = sorted(match_diff_indexes, key=lambda x: (h-self.base_sizes[base_size_indexes[x]][0])**2
152
+ + (w-self.base_sizes[base_size_indexes[x]][1])**2) # pick the one whose area matches best
153
+ corr_index = base_size_indexes[match_diff_indexes[0]]
154
+ return corr_index
155
+
156
+ def get_pred_target_w_h(self, w, h):
157
+ aspect_ratio = w / h
158
+ aspect_index = self._find_size(w, h)
159
+ pred_h, pred_w = self.base_sizes[aspect_index]
160
+
161
+ solutions = [
162
+ (pred_w, int(pred_w / aspect_ratio)),
163
+ (int(pred_h * aspect_ratio), pred_h),
164
+ ]
165
+ w_tar = None
166
+ h_tar = None
167
+ for solution in solutions:
168
+ w_s, h_s = solution
169
+ if w_s >= pred_w and h_s >= pred_h:
170
+ w_tar = w_s
171
+ h_tar = h_s
172
+
173
+ return pred_w, pred_h, w_tar, h_tar, aspect_index
174
+
175
+ def __call__(self, image, is_inference=False):
176
+ ## step 1: find the closest aspect ratio
177
+ flag_matched = True
178
+ w, h = image.size
179
+ pred_w, pred_h, w_tar, h_tar, aspect_index = self.get_pred_target_w_h(w, h)
180
+
181
+ crop_percent = 1 - pred_w * pred_h / (w_tar * h_tar)
182
+ if self.crop_percent_thresh > 0 and crop_percent > self.crop_percent_thresh:
183
+ flag_matched = False # filter data
184
+
185
+ if not is_inference:
186
+ ## step 2: train: crop and resize
187
+ image = center_crop_and_resize(image, output_size=(pred_h, pred_w))
188
+ else:
189
+ ## step 2: inference: resize and padding
190
+ image = resize_with_padding(image, output_size=(pred_h, pred_w))
191
+
192
+ original_size = [h, w]
193
+ target_size = [pred_h, pred_w]
194
+
195
+ return image, original_size, target_size, flag_matched
196
+
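The transform above is self-contained, so a short usage sketch may help. The snippet below is not part of the commit; it assumes aspect_ratio_utils.py is importable from the current directory and that torch/torchvision are installed, since the module imports them.

```python
# Minimal sketch: exercise AspectRatioCrop from aspect_ratio_utils.py on a synthetic image.
from PIL import Image

from aspect_ratio_utils import RATIOS, AspectRatioCrop, unpad_and_resize_back

# An 800x600 (w x h) dummy image; its 4:3 aspect ratio should map to the (384, 512) base size.
img = Image.new("RGB", (800, 600), color=(127, 127, 127))

arc = AspectRatioCrop(base_sizes=RATIOS, crop_percent_thresh=0.2)

# Training path: center crop + resize to the matched base size.
train_img, original_size, target_size, matched = arc(img, is_inference=False)
print(train_img.size, original_size, target_size, matched)  # (512, 384) [600, 800] [384, 512] True

# Inference path: resize + pad, then undo the padding afterwards.
infer_img, _, _, _ = arc(img, is_inference=True)
restored = unpad_and_resize_back(infer_img, original_width=800, original_height=600)
print(infer_img.size, restored.size)  # (512, 384) (800, 600)
```

The training path crops away at most roughly `crop_percent_thresh` of the area (otherwise the sample is flagged as unmatched), while the inference path pads instead, so `unpad_and_resize_back` can recover the original resolution.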
config.json ADDED
@@ -0,0 +1,191 @@
1
+ {
2
+ "architectures": [
3
+ "ILLUMEForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_illume.ILLUMEConfig",
7
+ "AutoModel": "modeling_illume.ILLUMEForConditionalGeneration",
8
+ "AutoModelForCausalLM": "modeling_illume.ILLUMEForConditionalGeneration"
9
+ },
10
+ "ignore_index": -100,
11
+ "image_out_token_index": 282777,
12
+ "image_token_index": 282776,
13
+ "mm_projector_config": {
14
+ "hidden_size": 3584,
15
+ "mm_hidden_size": [
16
+ 3584,
17
+ 32
18
+ ],
19
+ "projector_cfg1": {
20
+ "mlp_depth": 2,
21
+ "type": "MLPProjector"
22
+ },
23
+ "projector_cfg2": {
24
+ "mlp_depth": 2,
25
+ "type": "MLPProjector"
26
+ },
27
+ "trainable": true,
28
+ "type": "MixedProjector"
29
+ },
30
+ "model_type": "illume",
31
+ "special_tokens_ids": {
32
+ "<end_of_image>": 151666,
33
+ "<end_of_level0>": 151669,
34
+ "<end_of_level1>": 151671,
35
+ "<end_of_line>": 151667,
36
+ "<start_of_image>": 151665,
37
+ "<start_of_level0>": 151668,
38
+ "<start_of_level1>": 151670
39
+ },
40
+ "text_config": {
41
+ "_name_or_path": "./Qwen2.5-7B-Instruct-with-vision-tokenizer-32k-96k-level2",
42
+ "architectures": [
43
+ "Qwen2ForCausalLM"
44
+ ],
45
+ "bos_token_id": 151643,
46
+ "eos_token_id": 151645,
47
+ "hidden_size": 3584,
48
+ "intermediate_size": 18944,
49
+ "model_type": "qwen2",
50
+ "num_attention_heads": 28,
51
+ "num_hidden_layers": 28,
52
+ "num_key_value_heads": 4,
53
+ "rope_theta": 1000000.0,
54
+ "torch_dtype": "bfloat16",
55
+ "vocab_size": 283175
56
+ },
57
+ "tie_word_embeddings": false,
58
+ "torch_dtype": "bfloat16",
59
+ "transformers_version": "4.44.2",
60
+ "vision_config": {
61
+ "_name_or_path": "./dualvitok/",
62
+ "architectures": [
63
+ "DualViTok"
64
+ ],
65
+ "auto_map": {
66
+ "AutoConfig": "configuration_dualvitok.DualViTokConfig",
67
+ "AutoModel": "modeling_dualvitok.DualViTok"
68
+ },
69
+ "model_type": "DualViTok",
70
+ "pixel_decoder": {
71
+ "attn_resolutions": [
72
+ 4
73
+ ],
74
+ "ch": 384,
75
+ "ch_mult": [
76
+ 1,
77
+ 1,
78
+ 2,
79
+ 2,
80
+ 4
81
+ ],
82
+ "codebook_size": 98304,
83
+ "embed_dim": 64,
84
+ "use_dc_up_down_blocks": true,
85
+ "z_channels": 64
86
+ },
87
+ "pixel_encoder": {
88
+ "attn_resolutions": [
89
+ 4
90
+ ],
91
+ "ch": 128,
92
+ "ch_mult": [
93
+ 1,
94
+ 1,
95
+ 2,
96
+ 2,
97
+ 4
98
+ ],
99
+ "codebook_size": 98304,
100
+ "embed_dim": 32,
101
+ "use_dc_up_down_blocks": true,
102
+ "z_channels": 32
103
+ },
104
+ "semantic_decoder": {
105
+ "pretrained_semantic_encoder": "Emova-ollm/qwen2vit600m",
106
+ "target_mlp": "norm"
107
+ },
108
+ "semantic_encoder": {
109
+ "pretrained_semantic_encoder": {
110
+ "_name_or_path": "Emova-ollm/qwen2vit600m",
111
+ "add_cross_attention": false,
112
+ "architectures": [
113
+ "Qwen2VisionTransformerPretrainedModel"
114
+ ],
115
+ "bad_words_ids": null,
116
+ "begin_suppress_tokens": null,
117
+ "bos_token_id": null,
118
+ "chunk_size_feed_forward": 0,
119
+ "cross_attention_hidden_size": null,
120
+ "decoder_start_token_id": null,
121
+ "depth": 32,
122
+ "diversity_penalty": 0.0,
123
+ "do_sample": false,
124
+ "early_stopping": false,
125
+ "embed_dim": 1280,
126
+ "encoder_no_repeat_ngram_size": 0,
127
+ "eos_token_id": null,
128
+ "exponential_decay_length_penalty": null,
129
+ "finetuning_task": null,
130
+ "forced_bos_token_id": null,
131
+ "forced_eos_token_id": null,
132
+ "hidden_act": "quick_gelu",
133
+ "hidden_size": 3584,
134
+ "id2label": {
135
+ "0": "LABEL_0",
136
+ "1": "LABEL_1"
137
+ },
138
+ "in_channels": 3,
139
+ "in_chans": 3,
140
+ "initializer_range": 0.02,
141
+ "is_decoder": false,
142
+ "is_encoder_decoder": false,
143
+ "label2id": {
144
+ "LABEL_0": 0,
145
+ "LABEL_1": 1
146
+ },
147
+ "length_penalty": 1.0,
148
+ "max_length": 20,
149
+ "min_length": 0,
150
+ "mlp_ratio": 4,
151
+ "model_type": "qwen2_vl",
152
+ "no_repeat_ngram_size": 0,
153
+ "num_beam_groups": 1,
154
+ "num_beams": 1,
155
+ "num_heads": 16,
156
+ "num_return_sequences": 1,
157
+ "output_attentions": false,
158
+ "output_hidden_states": false,
159
+ "output_scores": false,
160
+ "pad_token_id": null,
161
+ "patch_size": 14,
162
+ "prefix": null,
163
+ "problem_type": null,
164
+ "pruned_heads": {},
165
+ "remove_invalid_values": false,
166
+ "repetition_penalty": 1.0,
167
+ "return_dict": true,
168
+ "return_dict_in_generate": false,
169
+ "sep_token_id": null,
170
+ "spatial_merge_size": 2,
171
+ "spatial_patch_size": 14,
172
+ "suppress_tokens": null,
173
+ "task_specific_params": null,
174
+ "temperature": 1.0,
175
+ "temporal_patch_size": 2,
176
+ "tf_legacy_loss": false,
177
+ "tie_encoder_decoder": false,
178
+ "tie_word_embeddings": true,
179
+ "tokenizer_class": null,
180
+ "top_k": 50,
181
+ "top_p": 1.0,
182
+ "torch_dtype": "float32",
183
+ "torchscript": false,
184
+ "transformers_version": "4.44.2",
185
+ "typical_p": 1.0,
186
+ "use_bfloat16": false
187
+ }
188
+ },
189
+ "torch_dtype": "float16"
190
+ }
191
+ }
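Since config.json registers the custom classes through `auto_map`, the whole configuration can be loaded with `AutoConfig` and `trust_remote_code=True`. A minimal sketch, where `"huangrh9/<this-repo>"` is a placeholder for the actual repository id:

```python
# Sketch only: load the config shipped in this commit through the auto_map above.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("huangrh9/<this-repo>", trust_remote_code=True)

print(config.model_type)              # "illume"
print(config.text_config.model_type)  # "qwen2"
print(config.vision_config.model_type)  # "DualViTok"
print(config.image_token_index)       # 282776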
configuration_dualvitok.py ADDED
@@ -0,0 +1,154 @@
1
+ """ DualViTok model configuration """
2
+ import os
3
+ from typing import List, Union
4
+
5
+ from transformers.configuration_utils import PretrainedConfig
6
+ from transformers.utils import logging
7
+
8
+ from .configuration_movqgan import MoVQConfig
9
+
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
+ class SemanticEncoderConfig(PretrainedConfig):
15
+ model_type = "DualViTokSemanticEncoder"
16
+
17
+ def __init__(
18
+ self,
19
+ pretrained_semantic_encoder='Emova-ollm/qwen2vit600m',
20
+ z_channels=32,
21
+ num_blocks=4,
22
+ embed_dim=1280,
23
+ out_layer='linear',
24
+ target_mlp='norm',
25
+ **kwargs
26
+ ):
27
+ super().__init__(**kwargs)
28
+ self.pretrained_semantic_encoder = pretrained_semantic_encoder
29
+ self.z_channels = z_channels
30
+ self.num_blocks = num_blocks
31
+ self.out_layer = out_layer
32
+ self.embed_dim = embed_dim
33
+ self.target_mlp = target_mlp
34
+
35
+
36
+ class SemanticDecoderConfig(PretrainedConfig):
37
+ model_type = "DualViTokSemanticDecoder"
38
+
39
+ def __init__(
40
+ self,
41
+ z_channels=32,
42
+ num_blocks=4,
43
+ embed_dim=1280,
44
+ out_layer='linear_norm',
45
+ out_channels=3584,
46
+ **kwargs
47
+ ):
48
+ super().__init__(**kwargs)
49
+ self.z_channels = z_channels
50
+ self.num_blocks = num_blocks
51
+ self.embed_dim = embed_dim
52
+ self.out_layer = out_layer
53
+ self.out_channels = out_channels
54
+
55
+
56
+ class DualViTokConfig(PretrainedConfig):
57
+ r"""
58
+ This is the configuration class to store the configuration of a [`DualViTok`]. It is used to instantiate a DualViTok
59
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
60
+ defaults will yield a configuration similar to that of the VQ model presented in the paper.
61
+
62
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
63
+ documentation from [`PretrainedConfig`] for more information.
64
+
65
+
66
+ Args:
67
+ semantic_encoder (`dict`, *optional*):
68
+ Configuration dictionary for the semantic encoder. If `None`, defaults to `SemanticEncoderConfig()`.
69
+ The provided dictionary will be unpacked to initialize a `SemanticEncoderConfig` instance.
70
+ semantic_decoder (`dict`, *optional*):
71
+ Configuration dictionary for the semantic decoder. If `None`, defaults to `SemanticDecoderConfig()`.
72
+ The provided dictionary will be unpacked to initialize a `SemanticDecoderConfig` instance.
73
+ pixel_encoder (`dict`, *optional*):
74
+ Configuration dictionary for the pixel pathway's VQ-VAE model (e.g., `MoVQConfig`). If `None`, defaults to `MoVQConfig()`.
75
+ The provided dictionary will be unpacked to initialize a `MoVQConfig` instance, which defines both encoder and decoder for pixel-level features.
76
+ semantic_quantizer_type (`str`, *optional*, defaults to `'simvq'`):
77
+ Type of the quantizer for semantic tokens (e.g., `'simvq'`, `'ema_simvq'`).
78
+ pixel_quantizer_type (`str`, *optional*, defaults to `'simvq'`):
79
+ Type of the quantizer for pixel tokens (e.g., `'simvq'`, `'ema_simvq'`).
80
+ semantic_quantizer_codebook_size (`int`, *optional*, defaults to 32768):
81
+ Number of entries in the codebook for the semantic quantizer.
82
+ pixel_quantizer_codebook_size (`int`, *optional*, defaults to 98304):
83
+ Number of entries in the codebook for the pixel quantizer.
84
+ attn_implementation (`str`, *optional*, defaults to `'sdpa'`):
85
+ The attention implementation to use (e.g., `'sdpa'`, `'flash_attention_2'`, `'eager'`).
86
+ Can be `'sdpa'` (scaled dot product attention), `'flash_attention_2'` (if available and installed),
87
+ or `'eager'` (the default PyTorch attention implementation).
88
+
89
+ ```python
90
+ >>> from transformers import DualViTok, DualViTokConfig
91
+
92
+ >>> # Initializing a video VQ model of configuration
93
+ >>> configuration = DualViTokConfig()
94
+
95
+ >>> # Initializing a model from the VQ model style configuration
96
+ >>> model = DualViTok(configuration)
97
+
98
+ >>> # Accessing the model configuration
99
+ >>> configuration = model.config
100
+ ```"""
101
+
102
+ model_type = "DualViTok"
103
+
104
+ def __init__(
105
+ self,
106
+ semantic_encoder=None,
107
+ semantic_decoder=None,
108
+ pixel_encoder=None,
109
+ pixel_decoder=None,
110
+ semantic_quantizer_type='simvq',
111
+ pixel_quantizer_type='simvq',
112
+ semantic_quantizer_codebook_size=32768,
113
+ pixel_quantizer_codebook_size=98304,
114
+ attn_implementation='sdpa',
115
+ **kwargs,
116
+ ):
117
+ super().__init__(**kwargs)
118
+ if semantic_encoder is None:
119
+ self.semantic_encoder = SemanticEncoderConfig()
120
+ else:
121
+ self.semantic_encoder = SemanticEncoderConfig(**semantic_encoder)
122
+
123
+ if semantic_decoder is None:
124
+ self.semantic_decoder = SemanticDecoderConfig()
125
+ else:
126
+ self.semantic_decoder = SemanticDecoderConfig(**semantic_decoder)
127
+
128
+ self.semantic_quantizer_type = semantic_quantizer_type
129
+ self.pixel_quantizer_type = pixel_quantizer_type
130
+ self.semantic_quantizer_codebook_size = semantic_quantizer_codebook_size
131
+ self.pixel_quantizer_codebook_size = pixel_quantizer_codebook_size
132
+
133
+ if pixel_encoder is None:
134
+ self.pixel_encoder = MoVQConfig()
135
+ else:
136
+ self.pixel_encoder = MoVQConfig(**pixel_encoder)
137
+
138
+ self.pixel_decoder = self.pixel_encoder if pixel_decoder is None else MoVQConfig(**pixel_decoder)
139
+
140
+ self.attn_implementation = attn_implementation
141
+
142
+ @classmethod
143
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], attn_implementation='sdpa', **kwargs) -> "PretrainedConfig":
144
+ cls._set_token_in_kwargs(kwargs)
145
+
146
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
147
+
148
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
149
+ logger.warning(
150
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
151
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
152
+ )
153
+
154
+ return cls.from_dict(config_dict, attn_implementation=attn_implementation, **kwargs)
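The constructor promotes nested dicts into sub-configs and lets the pixel decoder fall back to the pixel encoder config. A sketch of that behaviour, assuming the commit's modules are importable as a local package (the hypothetical name `illume_repo` stands for this folder; the relative imports require package context):

```python
# Sketch: how DualViTokConfig promotes nested dicts into sub-configs.
from illume_repo.configuration_dualvitok import DualViTokConfig

cfg = DualViTokConfig(
    pixel_encoder={"ch": 128, "embed_dim": 32, "z_channels": 32, "codebook_size": 98304},
    semantic_encoder={"pretrained_semantic_encoder": "Emova-ollm/qwen2vit600m"},
)

print(type(cfg.pixel_encoder).__name__)     # MoVQConfig
print(type(cfg.semantic_encoder).__name__)  # SemanticEncoderConfig
# pixel_decoder was not given, so it reuses the pixel_encoder config object.
print(cfg.pixel_decoder is cfg.pixel_encoder)  # True
```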
configuration_illume.py ADDED
@@ -0,0 +1,140 @@
1
+
2
+ """ILLUME model configuration"""
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.utils import logging
5
+ from transformers.models.auto import CONFIG_MAPPING
6
+
7
+ # import the first three to make sure the last one recognizes them.
8
+ from .modeling_rope_utils import rope_config_validation
9
+ from .configuration_movqgan import MoVQConfig
10
+ from .configuration_qwen2vit import Qwen2VLVisionConfig
11
+ from .configuration_dualvitok import DualViTokConfig
12
+
13
+ logger = logging.get_logger(__name__)
14
+
15
+
16
+ class ILLUMEConfig(PretrainedConfig):
17
+ r"""
18
+ This is the configuration class to store the configuration of a [`ILLUMEForConditionalGeneration`]. It is used to instantiate an
19
+ ILLUME model according to the specified arguments, defining the model architecture.
20
+
21
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
22
+ documentation from [`PretrainedConfig`] for more information.
23
+
24
+ Args:
25
+ vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `DualViTokConfig`):
26
+ The config object or dictionary of the vision backbone.
27
+ mm_projector_config (`dict`, *optional*, defaults to `None`):
28
+ Configuration for the multimodal projector.
29
+ text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
30
+ The config object or dictionary of the text backbone.
31
+ ignore_index (`int`, *optional*, defaults to -100):
32
+ The ignore index for the loss function.
33
+ image_token_index (`int`, *optional*, defaults to 32000):
34
+ The image token index to encode the image prompt.
35
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
36
+ Whether the model's input and output word embeddings should be tied.
37
+
38
+ Example:
39
+
40
+ ```python
41
+ >>> from transformers import ILLUMEForConditionalGeneration, ILLUMEConfig, CLIPVisionConfig, LlamaConfig
42
+
43
+ >>> # Initializing a CLIP-vision config
44
+ >>> vision_config = CLIPVisionConfig()
45
+
46
+ >>> # Initializing a Llama config
47
+ >>> text_config = LlamaConfig()
48
+
49
+ >>> # Initializing a ILLUME style configuration
50
+ >>> configuration = ILLUMEConfig(vision_config, text_config)
51
+
52
+ >>> # Initializing a model from the style configuration
53
+ >>> model = ILLUMEForConditionalGeneration(configuration)
54
+
55
+ >>> # Accessing the model configuration
56
+ >>> configuration = model.config
57
+ ```"""
58
+
59
+ model_type = "illume"
60
+ is_composition = False
61
+
62
+ def __init__(
63
+ self,
64
+ vision_config=None,
65
+ mm_projector_config=None,
66
+ text_config=None,
67
+ ignore_index=-100,
68
+ image_token_index=32000,
69
+ tie_word_embeddings=False,
70
+ **kwargs,
71
+ ):
72
+ self.ignore_index = ignore_index
73
+ self.image_token_index = image_token_index
74
+
75
+ if isinstance(vision_config, dict):
76
+ vision_config = DualViTokConfig(**vision_config)
77
+ elif vision_config is None:
78
+ vision_config = DualViTokConfig({
79
+ "semantic_encoder": {
80
+ "pretrained_semantic_encoder":
81
+ "Emova-ollm/qwen2vit600m",
82
+ "z_channels": 32,
83
+ "num_blocks": 4,
84
+ "out_layer": "linear",
85
+ "embed_dim": 1280,
86
+ "target_mlp": "norm"
87
+ },
88
+ "semantic_decoder": {
89
+ "z_channels": 32,
90
+ "num_blocks": 4,
91
+ "embed_dim": 1280,
92
+ "out_layer": "linear_norm",
93
+ "out_channels": 3584
94
+ },
95
+ "semantic_quantizer_type": "simvq",
96
+ "pixel_quantizer_type": "simvq",
97
+ "semantic_quantizer_codebook_size": 32768,
98
+ "pixel_quantizer_codebook_size": 98304,
99
+ "attn_implementation": "sdpa",
100
+ "pixel_encoder": {
101
+ "codebook_size": 98304,
102
+ "embed_dim": 32,
103
+ "z_channels": 32,
104
+ "double_z": False,
105
+ "in_channels": 3,
106
+ "out_channels": 3,
107
+ "ch": 128,
108
+ "ch_mult": [ 1, 1, 2, 2, 4 ],
109
+ "num_res_blocks": 2,
110
+ "attn_resolutions": [ 4 ],
111
+ "dropout": 0.0,
112
+ "use_dc_up_down_blocks": True
113
+ },
114
+ "pixel_decoder": {
115
+ "codebook_size": 98304,
116
+ "embed_dim": 64,
117
+ "z_channels": 64,
118
+ "double_z": False,
119
+ "in_channels": 3,
120
+ "out_channels": 3,
121
+ "ch": 384,
122
+ "ch_mult": [ 1, 1, 2, 2, 4 ],
123
+ "num_res_blocks": 2,
124
+ "attn_resolutions": [4],
125
+ "dropout": 0.0,
126
+ "use_dc_up_down_blocks": True
127
+ },
128
+ }
129
+ )
130
+
131
+ self.vision_config = vision_config
132
+ self.mm_projector_config = mm_projector_config
133
+ if isinstance(text_config, dict):
134
+ text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen2"
135
+ text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
136
+ elif text_config is None:
137
+ text_config = CONFIG_MAPPING["qwen2"]()
138
+
139
+ self.text_config = text_config
140
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
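A sketch of how `ILLUMEConfig` resolves its sub-configs when given plain dicts, mirroring what happens when config.json is loaded. It again assumes the hypothetical local package name `illume_repo` and that all sibling modules (including modeling_rope_utils.py) are present:

```python
# Sketch: dict sub-configs are promoted to config objects.
from illume_repo.configuration_illume import ILLUMEConfig

cfg = ILLUMEConfig(
    vision_config={"semantic_quantizer_codebook_size": 32768},
    text_config={"model_type": "qwen2", "hidden_size": 3584, "num_hidden_layers": 28},
    image_token_index=282776,
)

print(type(cfg.vision_config).__name__)  # DualViTokConfig
print(type(cfg.text_config).__name__)    # Qwen2Config
print(cfg.image_token_index)             # 282776
```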
configuration_movqgan.py ADDED
@@ -0,0 +1,92 @@
1
+ """ MoVQ model configuration """
2
+
3
+ from typing import List
4
+
5
+ from transformers.configuration_utils import PretrainedConfig
6
+ from transformers.utils import logging
7
+
8
+
9
+ logger = logging.get_logger(__name__)
10
+
11
+
12
+ class MoVQConfig(PretrainedConfig):
13
+ r"""
14
+ This is the configuration class to store the configuration of a [`MoVQ`]. It is used to instantiate a MoVQ
15
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
16
+ defaults will yield a configuration similar to that of the VQ model presented in the paper.
17
+
18
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
19
+ documentation from [`PretrainedConfig`] for more information.
20
+
21
+
22
+ Args:
23
+ codebook_size (`int`, *optional*, defaults to 32768):
24
+ Codebook size of the VQ model.
25
+ embed_dim (`int`, *optional*, defaults to 4):
26
+ Dimension of the quantized vector in codebook.
27
+ z_channels (`int`, *optional*, defaults to 4):
28
+ Dimension of the output channels of the encoder and the input channels of the decoder.
29
+ double_z (`bool`, *optional*, defaults to False):
30
+ Whether to double the output dim of the encoder.
31
+ in_channels (`int`, *optional*, defaults to 3):
32
+ Input channel of encoder.
33
+ out_channels (`int`, *optional*, defaults to 3):
34
+ Output channel of decoder.
35
+ ch (`int`, *optional*, defaults to 256):
36
+ Basic channel number of the intermediate blocks.
37
+ ch_mult (`List[int]`, *optional*, defaults to `[1, 2, 2, 4]`):
38
+ Channel scaling factor of the intermediate blocks.
39
+ num_res_blocks (`int`, *optional*, defaults to 2):
40
+ Residual block number in each stage.
41
+ attn_resolutions (`List[int]`, *optional*, defaults to `[3]`):
42
+ Stage indices to apply attention.
43
+ dropout (`float`, *optional*, defaults to 0.0):
44
+ Dropout probability.
45
+ use_dc_up_down_blocks (`bool`, *optional*, defaults to `False`):
46
+ Whether to use the DC up-down blocks.
47
+
48
+ ```python
49
+ >>> from transformers import MoVQ, MoVQConfig
50
+
51
+ >>> # Initializing a video VQ model of configuration
52
+ >>> configuration = MoVQConfig()
53
+
54
+ >>> # Initializing a model from the VQ model style configuration
55
+ >>> model = MoVQ(configuration)
56
+
57
+ >>> # Accessing the model configuration
58
+ >>> configuration = model.config
59
+ ```"""
60
+
61
+ model_type = "MoVQ"
62
+
63
+ def __init__(
64
+ self,
65
+ codebook_size: int = 32768,
66
+ embed_dim: int = 4,
67
+ z_channels: int = 4,
68
+ double_z: bool = False,
69
+ in_channels: int = 3,
70
+ out_channels: int = 3,
71
+ ch: int = 256,
72
+ ch_mult: List[int] = [1, 2, 2, 4],
73
+ num_res_blocks: int = 2,
74
+ attn_resolutions: List[int] = [3],
75
+ dropout: float = 0.0,
76
+ use_dc_up_down_blocks=False,
77
+ **kwargs,
78
+ ):
79
+ super().__init__(**kwargs)
80
+
81
+ self.codebook_size = codebook_size
82
+ self.embed_dim = embed_dim
83
+ self.z_channels = z_channels
84
+ self.double_z = double_z
85
+ self.in_channels = in_channels
86
+ self.out_channels = out_channels
87
+ self.ch = ch
88
+ self.ch_mult = ch_mult
89
+ self.num_res_blocks = num_res_blocks
90
+ self.attn_resolutions = attn_resolutions
91
+ self.dropout = dropout
92
+ self.use_dc_up_down_blocks = use_dc_up_down_blocks
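For orientation, a sketch of instantiating `MoVQConfig` with the `pixel_encoder` values from config.json above (hypothetical local package name `illume_repo`); the comment about the downsampling factor is an inference based on how VQGAN-style encoders typically use `ch_mult`, not something stated in this commit:

```python
# Sketch: MoVQConfig with the pixel_encoder values from config.json.
from illume_repo.configuration_movqgan import MoVQConfig

pixel_encoder_cfg = MoVQConfig(
    codebook_size=98304,
    embed_dim=32,
    z_channels=32,
    ch=128,
    ch_mult=[1, 1, 2, 2, 4],  # 5 resolution stages
    num_res_blocks=2,
    attn_resolutions=[4],
    use_dc_up_down_blocks=True,
)

# In VQGAN-style encoders, each ch_mult entry after the first usually adds a 2x downsample,
# so 5 stages would give a 2**4 = 16x spatial reduction -- consistent with the
# spatial_factor=16 used by DualViTokImageProcessor later in this commit.
print(len(pixel_encoder_cfg.ch_mult))   # 5
print(pixel_encoder_cfg.codebook_size)  # 98304
```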
configuration_qwen2vit.py ADDED
@@ -0,0 +1,249 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Qwen2VL model configuration"""
16
+
17
+ import os
18
+ from typing import Union
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import logging
22
+ from .modeling_rope_utils import rope_config_validation
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ class Qwen2VLVisionConfig(PretrainedConfig):
28
+ model_type = "qwen2_vl"
29
+
30
+ def __init__(
31
+ self,
32
+ depth=32,
33
+ embed_dim=1280,
34
+ hidden_size=3584,
35
+ hidden_act="quick_gelu",
36
+ mlp_ratio=4,
37
+ num_heads=16,
38
+ in_channels=3,
39
+ patch_size=14,
40
+ spatial_merge_size=2,
41
+ temporal_patch_size=2,
42
+ attn_implementation='eager',
43
+ init_weights=False,
44
+ **kwargs,
45
+ ):
46
+ super().__init__(**kwargs)
47
+
48
+ self.depth = depth
49
+ self.embed_dim = embed_dim
50
+ self.hidden_size = hidden_size
51
+ self.hidden_act = hidden_act
52
+ self.mlp_ratio = mlp_ratio
53
+ self.num_heads = num_heads
54
+ self.in_channels = in_channels
55
+ self.patch_size = patch_size
56
+ self.spatial_merge_size = spatial_merge_size
57
+ self.temporal_patch_size = temporal_patch_size
58
+ self.attn_implementation = attn_implementation if attn_implementation else 'eager'
59
+
60
+ self.init_weights = init_weights
61
+
62
+ @classmethod
63
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
64
+ cls._set_token_in_kwargs(kwargs)
65
+
66
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
67
+
68
+ # if config_dict.get("model_type") == "qwen2_vl":
69
+ # config_dict = config_dict["vision_config"]
70
+
71
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
72
+ logger.warning(
73
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
74
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
75
+ )
76
+
77
+ return cls.from_dict(config_dict, **kwargs)
78
+
79
+
80
+ class Qwen2VLConfig(PretrainedConfig):
81
+ r"""
82
+ This is the configuration class to store the configuration of a [`Qwen2VLModel`]. It is used to instantiate a
83
+ Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
84
+ with the defaults will yield a similar configuration to that of
85
+ Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
86
+
87
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
88
+ documentation from [`PretrainedConfig`] for more information.
89
+
90
+
91
+ Args:
92
+ vocab_size (`int`, *optional*, defaults to 152064):
93
+ Vocabulary size of the Qwen2VL model. Defines the number of different tokens that can be represented by the
94
+ `inputs_ids` passed when calling [`Qwen2VLModel`]
95
+ hidden_size (`int`, *optional*, defaults to 8192):
96
+ Dimension of the hidden representations.
97
+ intermediate_size (`int`, *optional*, defaults to 29568):
98
+ Dimension of the MLP representations.
99
+ num_hidden_layers (`int`, *optional*, defaults to 80):
100
+ Number of hidden layers in the Transformer encoder.
101
+ num_attention_heads (`int`, *optional*, defaults to 64):
102
+ Number of attention heads for each attention layer in the Transformer encoder.
103
+ num_key_value_heads (`int`, *optional*, defaults to 8):
104
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
105
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
106
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
107
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
108
+ by meanpooling all the original heads within that group. For more details checkout [this
109
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
110
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
111
+ The non-linear activation function (function or string) in the decoder.
112
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
113
+ The maximum sequence length that this model might ever be used with.
114
+ initializer_range (`float`, *optional*, defaults to 0.02):
115
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
116
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
117
+ The epsilon used by the rms normalization layers.
118
+ use_cache (`bool`, *optional*, defaults to `True`):
119
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
120
+ relevant if `config.is_decoder=True`.
121
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
122
+ Whether the model's input and output word embeddings should be tied.
123
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
124
+ The base period of the RoPE embeddings.
125
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
126
+ Whether to use sliding window attention.
127
+ sliding_window (`int`, *optional*, defaults to 4096):
128
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
129
+ max_window_layers (`int`, *optional*, defaults to 80):
130
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
131
+ attention_dropout (`float`, *optional*, defaults to 0.0):
132
+ The dropout ratio for the attention probabilities.
133
+ vision_config (`Dict`, *optional*):
134
+ The config for the visual encoder initialization.
135
+ rope_scaling (`Dict`, *optional*):
136
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
137
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
138
+ accordingly.
139
+ Expected contents:
140
+ `rope_type` (`str`):
141
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
142
+ 'llama3'], with 'default' being the original RoPE implementation.
143
+ `factor` (`float`, *optional*):
144
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
145
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
146
+ original maximum pre-trained length.
147
+ `original_max_position_embeddings` (`int`, *optional*):
148
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
149
+ pretraining.
150
+ `attention_factor` (`float`, *optional*):
151
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
152
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
153
+ `factor` field to infer the suggested value.
154
+ `beta_fast` (`float`, *optional*):
155
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
156
+ ramp function. If unspecified, it defaults to 32.
157
+ `beta_slow` (`float`, *optional*):
158
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
159
+ ramp function. If unspecified, it defaults to 1.
160
+ `short_factor` (`List[float]`, *optional*):
161
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
162
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
163
+ size divided by the number of attention heads divided by 2
164
+ `long_factor` (`List[float]`, *optional*):
165
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
166
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
167
+ size divided by the number of attention heads divided by 2
168
+ `low_freq_factor` (`float`, *optional*):
169
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
170
+ `high_freq_factor` (`float`, *optional*):
171
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
172
+
173
+ ```python
174
+ >>> from transformers import Qwen2VLForConditionalGeneration, Qwen2VLConfig
175
+
176
+ >>> # Initializing a Qwen2VL style configuration
177
+ >>> configuration = Qwen2VLConfig()
178
+
179
+ >>> # Initializing a model from the Qwen2-VL-7B style configuration
180
+ >>> model = Qwen2VLForConditionalGeneration(configuration)
181
+
182
+ >>> # Accessing the model configuration
183
+ >>> configuration = model.config
184
+ ```"""
185
+
186
+ model_type = "qwen2_vl"
187
+ keys_to_ignore_at_inference = ["past_key_values"]
188
+
189
+ def __init__(
190
+ self,
191
+ vocab_size=152064,
192
+ hidden_size=8192,
193
+ intermediate_size=29568,
194
+ num_hidden_layers=80,
195
+ num_attention_heads=64,
196
+ num_key_value_heads=8,
197
+ hidden_act="silu",
198
+ max_position_embeddings=32768,
199
+ initializer_range=0.02,
200
+ rms_norm_eps=1e-05,
201
+ use_cache=True,
202
+ tie_word_embeddings=False,
203
+ rope_theta=1000000.0,
204
+ use_sliding_window=False,
205
+ sliding_window=4096,
206
+ max_window_layers=80,
207
+ attention_dropout=0.0,
208
+ vision_config=None,
209
+ rope_scaling=None,
210
+ **kwargs,
211
+ ):
212
+ if isinstance(vision_config, dict):
213
+ self.vision_config = Qwen2VLVisionConfig(**vision_config)
214
+ elif vision_config is None:
215
+ self.vision_config = Qwen2VLVisionConfig()
216
+
217
+ self.vocab_size = vocab_size
218
+ self.max_position_embeddings = max_position_embeddings
219
+ self.hidden_size = hidden_size
220
+ self.intermediate_size = intermediate_size
221
+ self.num_hidden_layers = num_hidden_layers
222
+ self.num_attention_heads = num_attention_heads
223
+ self.use_sliding_window = use_sliding_window
224
+ self.sliding_window = sliding_window
225
+ self.max_window_layers = max_window_layers
226
+
227
+ # for backward compatibility
228
+ if num_key_value_heads is None:
229
+ num_key_value_heads = num_attention_heads
230
+
231
+ self.num_key_value_heads = num_key_value_heads
232
+ self.hidden_act = hidden_act
233
+ self.initializer_range = initializer_range
234
+ self.rms_norm_eps = rms_norm_eps
235
+ self.use_cache = use_cache
236
+ self.rope_theta = rope_theta
237
+ self.attention_dropout = attention_dropout
238
+ self.rope_scaling = rope_scaling
239
+
240
+ # Validate the correctness of rotary position embeddings parameters
241
+ # BC: if there is a 'type' field, move it to 'rope_type'.
242
+ # and change type from 'mrope' to 'default'
243
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
244
+ if self.rope_scaling["type"] == "mrope":
245
+ self.rope_scaling["type"] = "default"
246
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
247
+ rope_config_validation(self)
248
+
249
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
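A small arithmetic sketch of the vision-tower geometry implied by the `Qwen2VLVisionConfig` defaults above (again using the hypothetical package name `illume_repo`; the 448-pixel crop is just an example input size):

```python
# Sketch: patch and merge geometry from the Qwen2VLVisionConfig defaults.
from illume_repo.configuration_qwen2vit import Qwen2VLVisionConfig

vit_cfg = Qwen2VLVisionConfig()  # depth=32, embed_dim=1280, patch_size=14, spatial_merge_size=2

# A 448x448 crop would be split into (448/14)**2 = 1024 patches; after the 2x2
# spatial merge, that corresponds to 1024 / 4 = 256 merged positions.
patches_per_side = 448 // vit_cfg.patch_size
print(patches_per_side ** 2)                                      # 1024
print(patches_per_side ** 2 // vit_cfg.spatial_merge_size ** 2)   # 256
```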
generation_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "4.44.2"
6
+ }
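generation_config.json only pins the bos/eos token ids (matching `text_config` in config.json above) and the transformers version; everything else falls back to library defaults. A tiny sketch of reading it back, using the same placeholder repo id as earlier:

```python
# Sketch: the generation defaults mirror the bos/eos ids of the Qwen2 text backbone.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("huangrh9/<this-repo>")  # placeholder repo id
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id)  # 151643 151645
```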
image_processing_dualvitok.py ADDED
@@ -0,0 +1,24 @@
1
+ """Image processor class for DualViTok."""
2
+
3
+ from transformers.utils import TensorType, is_vision_available, logging
4
+
5
+ from .image_processing_movqgan import MoVQImageProcessor
6
+
7
+ logger = logging.get_logger(__name__)
8
+
9
+
10
+ class DualViTokImageProcessor(MoVQImageProcessor):
11
+ r"""
12
+ Constructs a DualViTok image processor that dynamically resizes images based on the original images.
13
+ This image processor is based on MoVQImageProcessor with spatial_factor of 16.
14
+ """
15
+
16
+ model_input_names = ["pixel_values"]
17
+
18
+ def __init__(
19
+ self,
20
+ *args,
21
+ spatial_factor: int = 16,
22
+ **kwargs,
23
+ ) -> None:
24
+ super().__init__(*args, spatial_factor=spatial_factor, **kwargs)
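The only change relative to the MoVQ processor is `spatial_factor=16`, which controls the rounding done by `smart_resize` (defined in image_processing_movqgan.py below). A sketch of the effect, assuming the hypothetical local package name `illume_repo`:

```python
# Sketch: what spatial_factor=16 means for DualViTokImageProcessor.
import numpy as np

from illume_repo.image_processing_dualvitok import DualViTokImageProcessor
from illume_repo.image_processing_movqgan import smart_resize

# smart_resize rounds each side to a multiple of `factor` and keeps the pixel
# count inside [min_pixels, max_pixels]; e.g. a 600x800 input with factor=16:
print(smart_resize(600, 800, factor=16, min_pixels=32 * 32, max_pixels=1024 * 1024))  # (608, 800)

processor = DualViTokImageProcessor()  # spatial_factor=16, mean/std = 0.5
dummy = (np.random.rand(600, 800, 3) * 255).astype(np.uint8)
out = processor(images=dummy, return_tensors="np")
print(out["pixel_values"].shape)  # (1, 3, 608, 800) -- both sides divisible by 16
```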
image_processing_movqgan.py ADDED
@@ -0,0 +1,429 @@
1
+ """Image processor class for MoVQ."""
2
+
3
+
4
+ import math
5
+ from typing import Dict, List, Optional, Union
6
+
7
+ import numpy as np
8
+
9
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
10
+ from transformers.image_transforms import (
11
+ convert_to_rgb,
12
+ resize,
13
+ to_channel_dimension_format,
14
+ )
15
+ from transformers.image_utils import (
16
+ IMAGENET_STANDARD_MEAN,
17
+ IMAGENET_STANDARD_STD,
18
+ ChannelDimension,
19
+ ImageInput,
20
+ PILImageResampling,
21
+ get_image_size,
22
+ infer_channel_dimension_format,
23
+ is_scaled_image,
24
+ make_list_of_images,
25
+ to_numpy_array,
26
+ valid_images,
27
+ validate_preprocess_arguments,
28
+ )
29
+ from transformers.utils import TensorType, is_vision_available, logging
30
+
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+
35
+ if is_vision_available():
36
+ from PIL import Image
37
+
38
+
39
+ def smart_resize(
40
+ height: int, width: int, factor: int = 8, min_pixels: int = 512 * 512, max_pixels: int = 1024 * 1024
41
+ ):
42
+ """Rescales the image so that the following conditions are met:
43
+
44
+ 1. Both dimensions (height and width) are divisible by 'factor'.
45
+
46
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
47
+
48
+ 3. The aspect ratio of the image is maintained as closely as possible.
49
+
50
+ """
51
+ # if height < factor or width < factor:
52
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
53
+ # elif max(height, width) / min(height, width) > 5:
54
+ # raise ValueError(
55
+ # f"absolute aspect ratio must be smaller than 5, got {max(height, width) / min(height, width)}"
56
+ # )
57
+
58
+ h_bar = round(height / factor) * factor
59
+ w_bar = round(width / factor) * factor
60
+ if h_bar * w_bar > max_pixels:
61
+ beta = math.sqrt((height * width) / max_pixels)
62
+ h_bar = math.floor(height / beta / factor) * factor
63
+ w_bar = math.floor(width / beta / factor) * factor
64
+ elif h_bar * w_bar < min_pixels:
65
+ beta = math.sqrt(min_pixels / (height * width))
66
+ h_bar = math.ceil(height * beta / factor) * factor
67
+ w_bar = math.ceil(width * beta / factor) * factor
68
+
69
+ return max(h_bar, factor), max(w_bar, factor)
70
+
71
+
72
+ class MoVQImageProcessor(BaseImageProcessor):
73
+ r"""
74
+ Constructs a MoVQ image processor that dynamically resizes images based on the original images.
75
+
76
+ Args:
77
+ do_resize (`bool`, *optional*, defaults to `True`):
78
+ Whether to resize the image's (height, width) dimensions.
79
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
80
+ Resampling filter to use when resizing the image.
81
+ do_rescale (`bool`, *optional*, defaults to `True`):
82
+ Whether to rescale the image by the specified scale `rescale_factor`.
83
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
84
+ Scale factor to use if rescaling the image.
85
+ do_normalize (`bool`, *optional*, defaults to `True`):
86
+ Whether to normalize the image.
87
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
88
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
89
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
90
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
91
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
92
+ Whether to convert the image to RGB.
93
+ min_pixels (`int`, *optional*, defaults to `32 * 32`):
94
+ The min pixels of the image to resize the image.
95
+ max_pixels (`int`, *optional*, defaults to `1024 * 1024`):
96
+ The max pixels of the image to resize the image.
97
+ spatial_factor (`int`, *optional*, defaults to 8):
98
+ The spatial downsampling factor by which the image will be downsampled in the feature extraction phase.
99
+ """
100
+
101
+ model_input_names = ["pixel_values"]
102
+
103
+ def __init__(
104
+ self,
105
+ do_resize: bool = True,
106
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
107
+ do_rescale: bool = True,
108
+ rescale_factor: Union[int, float] = 1 / 255,
109
+ do_normalize: bool = True,
110
+ image_mean: Optional[Union[float, List[float]]] = None,
111
+ image_std: Optional[Union[float, List[float]]] = None,
112
+ do_convert_rgb: bool = True,
113
+ min_pixels: int = 32 * 32,
114
+ max_pixels: int = 1024 * 1024,
115
+ spatial_factor: int = 8,
116
+ **kwargs,
117
+ ) -> None:
118
+ super().__init__(**kwargs)
119
+ self.do_resize = do_resize
120
+ self.resample = resample
121
+ self.do_rescale = do_rescale
122
+ self.rescale_factor = rescale_factor
123
+ self.do_normalize = do_normalize
124
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
125
+ self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
126
+ self.min_pixels = min_pixels
127
+ self.max_pixels = max_pixels
128
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
129
+ self.do_convert_rgb = do_convert_rgb
130
+ self.spatial_factor = spatial_factor
131
+
132
+ def _preprocess(
133
+ self,
134
+ images: ImageInput,
135
+ do_resize: Optional[bool] = None,
136
+ resample: PILImageResampling = None,
137
+ do_rescale: Optional[bool] = None,
138
+ rescale_factor: Optional[float] = None,
139
+ do_normalize: Optional[bool] = None,
140
+ image_mean: Optional[Union[float, List[float]]] = None,
141
+ image_std: Optional[Union[float, List[float]]] = None,
142
+ do_convert_rgb: Optional[bool] = None,
143
+ spatial_factor: Optional[int] = None,
144
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
145
+ output_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
146
+ ):
147
+ """
148
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
149
+
150
+ Args:
151
+ images (`ImageInput`):
152
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
153
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
154
+ Whether to resize the image.
155
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
156
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
157
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
158
+ Whether to rescale the image.
159
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
160
+ Scale factor to use if rescaling the image.
161
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
162
+ Whether to normalize the image.
163
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
164
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
165
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
166
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
167
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
168
+ Whether to convert the image to RGB.
169
+ spatial_factor (`int`, *optional*, defaults to `self.spatial_factor`):
170
+ The spatial downsampling factor by which the image will be downsampled in the feature extraction phase.
171
+ input_data_format (`ChannelDimension` or `str`, *optional*):
172
+ The channel dimension format for the input image. Can be one of:
173
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
174
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
175
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
176
+ output_data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
177
+ The channel dimension format for the output image. Can be one of:
178
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
179
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
180
+ - Unset: Use the channel dimension format of the input image.
181
+ """
182
+ spatial_factor = spatial_factor if spatial_factor is not None else self.spatial_factor
183
+
184
+ images = make_list_of_images(images)
185
+ if do_convert_rgb:
186
+ images = [convert_to_rgb(image) for image in images]
187
+
188
+ # All transformations expect numpy arrays.
189
+ images = [to_numpy_array(image) for image in images]
190
+
191
+ if is_scaled_image(images[0]) and do_rescale:
192
+ logger.warning_once(
193
+ "It looks like you are trying to rescale already rescaled images. If the input"
194
+ "pixel_values.append()images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
195
+ )
196
+
197
+ if input_data_format is None:
198
+ # We assume that all images have the same channel dimension format.
199
+ input_data_format = infer_channel_dimension_format(images[0])
200
+
201
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
202
+ resized_height, resized_width = height, width
203
+ processed_images = []
204
+ for image in images:
205
+ if do_resize:
206
+ resized_height, resized_width = smart_resize(
207
+ height,
208
+ width,
209
+ factor=spatial_factor,
210
+ min_pixels=self.min_pixels,
211
+ max_pixels=self.max_pixels,
212
+ )
213
+ image = resize(
214
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
215
+ )
216
+
217
+ if do_rescale:
218
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
219
+
220
+ if do_normalize:
221
+ image = self.normalize(
222
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
223
+ )
224
+
225
+ image = to_channel_dimension_format(image, output_data_format, input_channel_dim=input_data_format)
226
+ processed_images.append(image)
227
+
228
+ image = np.array(processed_images)
229
+ return image
230
+
231
+ def preprocess(
232
+ self,
233
+ images: ImageInput,
234
+ do_resize: Optional[bool] = None,
235
+ resample: PILImageResampling = None,
236
+ do_rescale: Optional[bool] = None,
237
+ rescale_factor: Optional[float] = None,
238
+ do_normalize: Optional[bool] = None,
239
+ image_mean: Optional[Union[float, List[float]]] = None,
240
+ image_std: Optional[Union[float, List[float]]] = None,
241
+ do_convert_rgb: Optional[bool] = None,
242
+ spatial_factor: Optional[int] = None,
243
+ return_tensors: Optional[Union[str, TensorType]] = None,
244
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
245
+ output_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
246
+ ):
247
+ """
248
+ Args:
249
+ images (`ImageInput`):
250
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
251
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
252
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
253
+ Whether to resize the image.
254
+ resample (`int`, *optional*, defaults to `self.resample`):
255
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
256
+ has an effect if `do_resize` is set to `True`.
257
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
258
+ Whether to rescale the image.
259
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
260
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
261
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
262
+ Whether to normalize the image.
263
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
264
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
265
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
266
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to `True`.
267
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
268
+ Whether to convert the image to RGB.
269
+ spatial_factor (`int`, *optional*, defaults to `self.spatial_factor`):
270
+ The spatial downsampling factor by which the image will be downsampled in the feature extraction phase.
271
+ return_tensors (`str` or `TensorType`, *optional*):
272
+ The type of tensors to return. Can be one of:
273
+ - Unset: Return a list of `np.ndarray`.
274
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
275
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
276
+ input_data_format (`ChannelDimension` or `str`, *optional*):
277
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
278
+ from the input image. Can be one of:
279
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
280
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
281
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
282
+ output_data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
283
+ The channel dimension format for the output image. Can be one of:
284
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
285
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
286
+ - Unset: Use the channel dimension format of the input image.
287
+ """
288
+ do_resize = do_resize if do_resize is not None else self.do_resize
289
+ resample = resample if resample is not None else self.resample
290
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
291
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
292
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
293
+ image_mean = image_mean if image_mean is not None else self.image_mean
294
+ image_std = image_std if image_std is not None else self.image_std
295
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
296
+ spatial_factor = spatial_factor if spatial_factor is not None else self.spatial_factor
297
+
298
+ images = make_list_of_images(images)
299
+ if images is None or not valid_images(images):
300
+ raise ValueError(
301
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
302
+ "torch.Tensor, tf.Tensor or jax.ndarray."
303
+ )
304
+
305
+ validate_preprocess_arguments(
306
+ rescale_factor=rescale_factor,
307
+ do_normalize=do_normalize,
308
+ image_mean=image_mean,
309
+ image_std=image_std,
310
+ do_resize=do_resize,
311
+ size=self.size,
312
+ resample=resample,
313
+ )
314
+
315
+ pixel_values = []
316
+ for image in images:
317
+ norm_image = self._preprocess(
318
+ image,
319
+ do_resize=do_resize,
320
+ resample=resample,
321
+ do_rescale=do_rescale,
322
+ rescale_factor=rescale_factor,
323
+ do_normalize=do_normalize,
324
+ image_mean=image_mean,
325
+ image_std=image_std,
326
+ do_convert_rgb=do_convert_rgb,
327
+ spatial_factor=spatial_factor,
328
+ input_data_format=input_data_format,
329
+ output_data_format=output_data_format,
330
+ )
331
+ pixel_values.extend(norm_image)
332
+
333
+ pixel_values = np.array(pixel_values)
334
+ data = {"pixel_values": pixel_values}
335
+
336
+ return BatchFeature(data=data, tensor_type=return_tensors)
337
+
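# Illustrative usage sketch (annotation, not part of the committed diff). Assumes the
# processor defined in this file is passed in as `image_processor`, e.g. loaded with
# AutoImageProcessor.from_pretrained("<repo-id>", trust_remote_code=True).
from PIL import Image
import numpy as np

def demo_preprocess(image_processor):
    image = Image.fromarray(np.random.randint(0, 256, (384, 512, 3), dtype=np.uint8))
    batch = image_processor.preprocess(image, return_tensors="pt")
    # `pixel_values` is a stacked batch of normalized images, channels-first by default.
    return batch["pixel_values"]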
338
+ def postprocess(
339
+ self,
340
+ images: ImageInput,
341
+ do_rescale: Optional[bool] = None,
342
+ rescale_factor: Optional[float] = None,
343
+ do_normalize: Optional[bool] = None,
344
+ image_mean: Optional[Union[float, List[float]]] = None,
345
+ image_std: Optional[Union[float, List[float]]] = None,
346
+ return_tensors: Optional[Union[str, TensorType]] = "PIL.Image.Image",
347
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
348
+ ):
349
+ """
350
+ Postprocess an image or batch of images tensor. Postprocess is the reverse process of preprocess.
351
+         The parameters should be the same as in `preprocess`.
352
+
353
+ Args:
354
+ images (`ImageInput`):
355
+ Image to postprocess. Expects a single or batch of images with pixel values ranging from -1 to 1.
356
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
357
+ Whether to rescale the image.
358
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
359
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
360
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
361
+ Whether to normalize the image.
362
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
363
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
364
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
365
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to `True`.
366
+             return_tensors (`str` or `TensorType`, *optional*, defaults to `"PIL.Image.Image"`):
367
+ The type of tensors to return. Can be one of:
368
+                 - `"PIL.Image.Image"` (default): Return a list of `PIL.Image.Image`.
+                 - Unset: Return a list of `np.ndarray`.
369
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
370
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
371
+ input_data_format (`ChannelDimension` or `str`, *optional*):
372
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
373
+ from the input image. Can be one of:
374
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
375
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
376
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
377
+ """
378
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
379
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
380
+ rescale_factor = 1 / rescale_factor
381
+
382
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
383
+ image_mean = image_mean if image_mean is not None else self.image_mean
384
+ image_std = image_std if image_std is not None else self.image_std
385
+ image_mean, image_std = self.inverse_meanstd(image_mean, image_std)
386
+
387
+ images = make_list_of_images(images)
388
+ if isinstance(images[0], Image.Image):
389
+ return images if len(images) > 1 else images[0]
390
+
391
+ if input_data_format is None:
392
+ # We assume that all images have the same channel dimension format.
393
+ input_data_format = infer_channel_dimension_format(images[0])
394
+
395
+ pixel_values = []
396
+ for image in images:
397
+ image = to_numpy_array(image)
398
+ if do_normalize:
399
+ image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
400
+
401
+ if do_rescale:
402
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
403
+ image = image.clip(0, 255).astype(np.uint8)
404
+
405
+ if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
406
+ image = to_channel_dimension_format(image, ChannelDimension.LAST, input_channel_dim=input_data_format)
407
+ pixel_values.append(Image.fromarray(image))
408
+ else:
409
+ pixel_values.extend(image)
410
+
411
+ data = {"pixel_values": pixel_values}
412
+ return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None
413
+
414
+ return BatchFeature(data=data, tensor_type=return_tensors)
415
+
416
+ def inverse_meanstd(self, image_mean, image_std):
417
+ image_mean = self.to_tuple(image_mean)
418
+ image_std = self.to_tuple(image_std)
419
+
420
+ rev_image_mean = tuple(-m / s for m, s in zip(image_mean, image_std))
421
+ rev_image_std = tuple(1 / s for s in image_std)
422
+
423
+ return rev_image_mean, rev_image_std
424
+
425
+ def to_tuple(self, value, dim=3):
426
+ if isinstance(value, (int, float)):
427
+ return (value,) * dim
428
+
429
+ return tuple(value)
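# Sketch (annotation, not part of the committed diff): why `inverse_meanstd` undoes
# the forward normalization used in `preprocess`.
#   forward:  x_norm = (x - mean) / std
#   inverse:  normalizing x_norm with mean' = -mean/std and std' = 1/std gives
#             (x_norm - mean') / std' = std * x_norm + mean = x
import numpy as np

x = np.random.rand(3, 8, 8).astype(np.float64)
mean = np.array([0.5, 0.5, 0.5])[:, None, None]
std = np.array([0.5, 0.5, 0.5])[:, None, None]
x_norm = (x - mean) / std
inv_mean, inv_std = -mean / std, 1.0 / std
assert np.allclose((x_norm - inv_mean) / inv_std, x)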
image_utils.py ADDED
@@ -0,0 +1,812 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import base64
17
+ import os
18
+ from io import BytesIO
19
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import requests
23
+ from packaging import version
24
+
25
+
26
+ from transformers.utils import (
27
+ ExplicitEnum,
28
+ is_jax_tensor,
29
+ is_numpy_array,
30
+ is_tf_tensor,
31
+ is_torch_available,
32
+ is_torch_tensor,
33
+ is_torchvision_available,
34
+ is_vision_available,
35
+ logging,
36
+ requires_backends,
37
+ to_numpy,
38
+ )
39
+ from transformers.utils.constants import ( # noqa: F401
40
+ IMAGENET_DEFAULT_MEAN,
41
+ IMAGENET_DEFAULT_STD,
42
+ IMAGENET_STANDARD_MEAN,
43
+ IMAGENET_STANDARD_STD,
44
+ OPENAI_CLIP_MEAN,
45
+ OPENAI_CLIP_STD,
46
+ )
47
+
48
+
49
+ if is_vision_available():
50
+ import PIL.Image
51
+ import PIL.ImageOps
52
+
53
+ if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
54
+ PILImageResampling = PIL.Image.Resampling
55
+ else:
56
+ PILImageResampling = PIL.Image
57
+
58
+ if is_torchvision_available():
59
+ from torchvision.transforms import InterpolationMode
60
+
61
+ pil_torch_interpolation_mapping = {
62
+ PILImageResampling.NEAREST: InterpolationMode.NEAREST,
63
+ PILImageResampling.BOX: InterpolationMode.BOX,
64
+ PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
65
+ PILImageResampling.HAMMING: InterpolationMode.HAMMING,
66
+ PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
67
+ PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
68
+ }
69
+
70
+
71
+ if TYPE_CHECKING:
72
+ if is_torch_available():
73
+ import torch
74
+
75
+
76
+ logger = logging.get_logger(__name__)
77
+
78
+
79
+ ImageInput = Union[
80
+ "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"]
81
+ ] # noqa
82
+
83
+
84
+ VideoInput = Union[
85
+ List["PIL.Image.Image"],
86
+ "np.ndarray",
87
+ "torch.Tensor",
88
+ List["np.ndarray"],
89
+ List["torch.Tensor"],
90
+ List[List["PIL.Image.Image"]],
91
+     List[List["np.ndarray"]],
92
+ List[List["torch.Tensor"]],
93
+ ] # noqa
94
+
95
+
96
+ class ChannelDimension(ExplicitEnum):
97
+ FIRST = "channels_first"
98
+ LAST = "channels_last"
99
+
100
+
101
+ class AnnotationFormat(ExplicitEnum):
102
+ COCO_DETECTION = "coco_detection"
103
+ COCO_PANOPTIC = "coco_panoptic"
104
+
105
+
106
+ class AnnotionFormat(ExplicitEnum):
107
+ COCO_DETECTION = AnnotationFormat.COCO_DETECTION.value
108
+ COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value
109
+
110
+
111
+ AnnotationType = Dict[str, Union[int, str, List[Dict]]]
112
+
113
+
114
+ def is_pil_image(img):
115
+ return is_vision_available() and isinstance(img, PIL.Image.Image)
116
+
117
+
118
+ class ImageType(ExplicitEnum):
119
+ PIL = "pillow"
120
+ TORCH = "torch"
121
+ NUMPY = "numpy"
122
+ TENSORFLOW = "tensorflow"
123
+ JAX = "jax"
124
+
125
+
126
+ def get_image_type(image):
127
+ if is_pil_image(image):
128
+ return ImageType.PIL
129
+ if is_torch_tensor(image):
130
+ return ImageType.TORCH
131
+ if is_numpy_array(image):
132
+ return ImageType.NUMPY
133
+ if is_tf_tensor(image):
134
+ return ImageType.TENSORFLOW
135
+ if is_jax_tensor(image):
136
+ return ImageType.JAX
137
+ raise ValueError(f"Unrecognised image type {type(image)}")
138
+
139
+
140
+ def is_valid_image(img):
141
+ return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
142
+
143
+
144
+ def valid_images(imgs):
145
+     # If we have a list of images, make sure every image is valid
146
+ if isinstance(imgs, (list, tuple)):
147
+ for img in imgs:
148
+ if not valid_images(img):
149
+ return False
150
+     # If not a list or tuple, we have been given a single image or a batched tensor of images
151
+ elif not is_valid_image(imgs):
152
+ return False
153
+ return True
154
+
155
+
156
+ def is_batched(img):
157
+ if isinstance(img, (list, tuple)):
158
+ return is_valid_image(img[0])
159
+ return False
160
+
161
+
162
+ def is_scaled_image(image: np.ndarray) -> bool:
163
+ """
164
+ Checks to see whether the pixel values have already been rescaled to [0, 1].
165
+ """
166
+ if image.dtype == np.uint8:
167
+ return False
168
+
169
+ # It's possible the image has pixel values in [0, 255] but is of floating type
170
+ return np.min(image) >= 0 and np.max(image) <= 1
171
+
172
+
173
+ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
174
+ """
175
+ Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1.
176
+ If the input is a batch of images, it is converted to a list of images.
177
+
178
+ Args:
179
+ images (`ImageInput`):
180
+             Image or images to turn into a list of images.
181
+ expected_ndims (`int`, *optional*, defaults to 3):
182
+ Expected number of dimensions for a single input image. If the input image has a different number of
183
+ dimensions, an error is raised.
184
+ """
185
+ if is_batched(images):
186
+ return images
187
+
188
+ # Either the input is a single image, in which case we create a list of length 1
189
+ if isinstance(images, PIL.Image.Image):
190
+ # PIL images are never batched
191
+ return [images]
192
+
193
+ if is_valid_image(images):
194
+ if images.ndim == expected_ndims + 1:
195
+ # Batch of images
196
+ images = list(images)
197
+ elif images.ndim == expected_ndims:
198
+ # Single image
199
+ images = [images]
200
+ else:
201
+ raise ValueError(
202
+ f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got"
203
+ f" {images.ndim} dimensions."
204
+ )
205
+ return images
206
+ raise ValueError(
207
+ "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or "
208
+ f"jax.ndarray, but got {type(images)}."
209
+ )
210
+
211
+
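# Sketch (annotation, not part of the committed diff): expected behavior of
# `make_list_of_images` for a single image vs. a batched array.
import numpy as np

single = np.zeros((3, 64, 64), dtype=np.uint8)    # one image, 3 dims
batch = np.zeros((4, 3, 64, 64), dtype=np.uint8)  # batch of 4 images, 4 dims
assert len(make_list_of_images(single)) == 1      # wrapped into a singleton list
assert len(make_list_of_images(batch)) == 4       # split along the batch axis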
212
+ def to_numpy_array(img) -> np.ndarray:
213
+ if not is_valid_image(img):
214
+ raise ValueError(f"Invalid image type: {type(img)}")
215
+
216
+ if is_vision_available() and isinstance(img, PIL.Image.Image):
217
+ return np.array(img)
218
+ return to_numpy(img)
219
+
220
+
221
+ def infer_channel_dimension_format(
222
+ image: np.ndarray, num_channels: Optional[Union[int, Tuple[int, ...]]] = None
223
+ ) -> ChannelDimension:
224
+ """
225
+ Infers the channel dimension format of `image`.
226
+
227
+ Args:
228
+ image (`np.ndarray`):
229
+ The image to infer the channel dimension of.
230
+ num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
231
+ The number of channels of the image.
232
+
233
+ Returns:
234
+ The channel dimension of the image.
235
+ """
236
+ num_channels = num_channels if num_channels is not None else (1, 3)
237
+ num_channels = (num_channels,) if isinstance(num_channels, int) else num_channels
238
+
239
+ if image.ndim == 3:
240
+ first_dim, last_dim = 0, 2
241
+ elif image.ndim == 4:
242
+ first_dim, last_dim = 1, 3
243
+ else:
244
+ raise ValueError(f"Unsupported number of image dimensions: {image.ndim}")
245
+
246
+ if image.shape[first_dim] in num_channels and image.shape[last_dim] in num_channels:
247
+ logger.warning(
248
+ f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension."
249
+ )
250
+ return ChannelDimension.FIRST
251
+ elif image.shape[first_dim] in num_channels:
252
+ return ChannelDimension.FIRST
253
+ elif image.shape[last_dim] in num_channels:
254
+ return ChannelDimension.LAST
255
+ raise ValueError("Unable to infer channel dimension format")
256
+
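# Sketch (annotation, not part of the committed diff): channel-format inference
# from array shapes.
import numpy as np

assert infer_channel_dimension_format(np.zeros((3, 32, 48))) == ChannelDimension.FIRST
assert infer_channel_dimension_format(np.zeros((32, 48, 3))) == ChannelDimension.LAST
# Ambiguous shapes such as (3, 48, 3) are resolved to channels-first with a warning.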
257
+
258
+ def get_channel_dimension_axis(
259
+ image: np.ndarray, input_data_format: Optional[Union[ChannelDimension, str]] = None
260
+ ) -> int:
261
+ """
262
+ Returns the channel dimension axis of the image.
263
+
264
+ Args:
265
+ image (`np.ndarray`):
266
+ The image to get the channel dimension axis of.
267
+ input_data_format (`ChannelDimension` or `str`, *optional*):
268
+ The channel dimension format of the image. If `None`, will infer the channel dimension from the image.
269
+
270
+ Returns:
271
+ The channel dimension axis of the image.
272
+ """
273
+ if input_data_format is None:
274
+ input_data_format = infer_channel_dimension_format(image)
275
+ if input_data_format == ChannelDimension.FIRST:
276
+ return image.ndim - 3
277
+ elif input_data_format == ChannelDimension.LAST:
278
+ return image.ndim - 1
279
+ raise ValueError(f"Unsupported data format: {input_data_format}")
280
+
281
+
282
+ def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]:
283
+ """
284
+ Returns the (height, width) dimensions of the image.
285
+
286
+ Args:
287
+ image (`np.ndarray`):
288
+ The image to get the dimensions of.
289
+ channel_dim (`ChannelDimension`, *optional*):
290
+ Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image.
291
+
292
+ Returns:
293
+ A tuple of the image's height and width.
294
+ """
295
+ if channel_dim is None:
296
+ channel_dim = infer_channel_dimension_format(image)
297
+
298
+ if channel_dim == ChannelDimension.FIRST:
299
+ return image.shape[-2], image.shape[-1]
300
+ elif channel_dim == ChannelDimension.LAST:
301
+ return image.shape[-3], image.shape[-2]
302
+ else:
303
+ raise ValueError(f"Unsupported data format: {channel_dim}")
304
+
305
+
306
+ def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool:
307
+ if (
308
+ isinstance(annotation, dict)
309
+ and "image_id" in annotation
310
+ and "annotations" in annotation
311
+ and isinstance(annotation["annotations"], (list, tuple))
312
+ and (
313
+ # an image can have no annotations
314
+ len(annotation["annotations"]) == 0 or isinstance(annotation["annotations"][0], dict)
315
+ )
316
+ ):
317
+ return True
318
+ return False
319
+
320
+
321
+ def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]]) -> bool:
322
+ if (
323
+ isinstance(annotation, dict)
324
+ and "image_id" in annotation
325
+ and "segments_info" in annotation
326
+ and "file_name" in annotation
327
+ and isinstance(annotation["segments_info"], (list, tuple))
328
+ and (
329
+ # an image can have no segments
330
+ len(annotation["segments_info"]) == 0 or isinstance(annotation["segments_info"][0], dict)
331
+ )
332
+ ):
333
+ return True
334
+ return False
335
+
336
+
337
+ def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
338
+ return all(is_valid_annotation_coco_detection(ann) for ann in annotations)
339
+
340
+
341
+ def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
342
+ return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations)
343
+
344
+
345
+ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
346
+ """
347
+ Loads `image` to a PIL Image.
348
+
349
+ Args:
350
+ image (`str` or `PIL.Image.Image`):
351
+ The image to convert to the PIL Image format.
352
+ timeout (`float`, *optional*):
353
+ The timeout value in seconds for the URL request.
354
+
355
+ Returns:
356
+ `PIL.Image.Image`: A PIL Image.
357
+ """
358
+ requires_backends(load_image, ["vision"])
359
+ if isinstance(image, str):
360
+ if image.startswith("http://") or image.startswith("https://"):
361
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
362
+ # like http_huggingface_co.png
363
+ image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content))
364
+ elif os.path.isfile(image):
365
+ image = PIL.Image.open(image)
366
+ else:
367
+ if image.startswith("data:image/"):
368
+ image = image.split(",")[1]
369
+
370
+ # Try to load as base64
371
+ try:
372
+ b64 = base64.decodebytes(image.encode())
373
+ image = PIL.Image.open(BytesIO(b64))
374
+ except Exception as e:
375
+ raise ValueError(
376
+ f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
377
+ )
378
+ elif isinstance(image, PIL.Image.Image):
379
+ image = image
380
+ else:
381
+ raise TypeError(
382
+ "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
383
+ )
384
+ image = PIL.ImageOps.exif_transpose(image)
385
+ image = image.convert("RGB")
386
+ return image
387
+
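# Sketch (annotation, not part of the committed diff): the input forms accepted by
# `load_image`. The URL and path below are placeholders; the base64 branch is exercised.
import base64
from io import BytesIO
from PIL import Image

# load_image("https://example.com/cat.png", timeout=10.0)  # remote URL
# load_image("/path/to/local.png")                         # local file path
buf = BytesIO()
Image.new("RGB", (8, 8)).save(buf, format="PNG")
data_uri = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()
assert load_image(data_uri).mode == "RGB"                  # base64 / data URI string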
388
+
389
+ def validate_preprocess_arguments(
390
+ do_rescale: Optional[bool] = None,
391
+ rescale_factor: Optional[float] = None,
392
+ do_normalize: Optional[bool] = None,
393
+ image_mean: Optional[Union[float, List[float]]] = None,
394
+ image_std: Optional[Union[float, List[float]]] = None,
395
+ do_pad: Optional[bool] = None,
396
+ size_divisibility: Optional[int] = None,
397
+ do_center_crop: Optional[bool] = None,
398
+ crop_size: Optional[Dict[str, int]] = None,
399
+ do_resize: Optional[bool] = None,
400
+ size: Optional[Dict[str, int]] = None,
401
+ resample: Optional["PILImageResampling"] = None,
402
+ ):
403
+ """
404
+ Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method.
405
+ Raises `ValueError` if arguments incompatibility is caught.
406
+ Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`,
407
+ sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow
408
+ existing arguments when possible.
409
+
410
+ """
411
+ if do_rescale and rescale_factor is None:
412
+ raise ValueError("`rescale_factor` must be specified if `do_rescale` is `True`.")
413
+
414
+ if do_pad and size_divisibility is None:
415
+ # Here, size_divisor might be passed as the value of size
416
+ raise ValueError(
417
+ "Depending on the model, `size_divisibility`, `size_divisor`, `pad_size` or `size` must be specified if `do_pad` is `True`."
418
+ )
419
+
420
+ if do_normalize and (image_mean is None or image_std is None):
421
+ raise ValueError("`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.")
422
+
423
+ if do_center_crop and crop_size is None:
424
+ raise ValueError("`crop_size` must be specified if `do_center_crop` is `True`.")
425
+
426
+ if do_resize and (size is None or resample is None):
427
+ raise ValueError("`size` and `resample` must be specified if `do_resize` is `True`.")
428
+
429
+
430
+ # In the future we can add a TF implementation here when we have TF models.
431
+ class ImageFeatureExtractionMixin:
432
+ """
433
+ Mixin that contain utilities for preparing image features.
434
+ """
435
+
436
+ def _ensure_format_supported(self, image):
437
+ if not isinstance(image, (PIL.Image.Image, np.ndarray)) and not is_torch_tensor(image):
438
+ raise ValueError(
439
+ f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.array` and "
440
+ "`torch.Tensor` are."
441
+ )
442
+
443
+ def to_pil_image(self, image, rescale=None):
444
+ """
445
+ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
446
+ needed.
447
+
448
+ Args:
449
+ image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
450
+ The image to convert to the PIL Image format.
451
+ rescale (`bool`, *optional*):
452
+ Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
453
+ default to `True` if the image type is a floating type, `False` otherwise.
454
+ """
455
+ self._ensure_format_supported(image)
456
+
457
+ if is_torch_tensor(image):
458
+ image = image.numpy()
459
+
460
+ if isinstance(image, np.ndarray):
461
+ if rescale is None:
462
+                 # rescale defaults to True if the array is of a floating type.
463
+ rescale = isinstance(image.flat[0], np.floating)
464
+             # If the channel has been moved to the first dim, we put it back at the end.
465
+ if image.ndim == 3 and image.shape[0] in [1, 3]:
466
+ image = image.transpose(1, 2, 0)
467
+ if rescale:
468
+ image = image * 255
469
+ image = image.astype(np.uint8)
470
+ return PIL.Image.fromarray(image)
471
+ return image
472
+
473
+ def convert_rgb(self, image):
474
+ """
475
+ Converts `PIL.Image.Image` to RGB format.
476
+
477
+ Args:
478
+ image (`PIL.Image.Image`):
479
+ The image to convert.
480
+ """
481
+ self._ensure_format_supported(image)
482
+ if not isinstance(image, PIL.Image.Image):
483
+ return image
484
+
485
+ return image.convert("RGB")
486
+
487
+ def rescale(self, image: np.ndarray, scale: Union[float, int]) -> np.ndarray:
488
+ """
489
+ Rescale a numpy image by scale amount
490
+ """
491
+ self._ensure_format_supported(image)
492
+ return image * scale
493
+
494
+ def to_numpy_array(self, image, rescale=None, channel_first=True):
495
+ """
496
+ Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
497
+ dimension.
498
+
499
+ Args:
500
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
501
+ The image to convert to a NumPy array.
502
+ rescale (`bool`, *optional*):
503
+ Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
504
+ default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
505
+ channel_first (`bool`, *optional*, defaults to `True`):
506
+ Whether or not to permute the dimensions of the image to put the channel dimension first.
507
+ """
508
+ self._ensure_format_supported(image)
509
+
510
+ if isinstance(image, PIL.Image.Image):
511
+ image = np.array(image)
512
+
513
+ if is_torch_tensor(image):
514
+ image = image.numpy()
515
+
516
+ rescale = isinstance(image.flat[0], np.integer) if rescale is None else rescale
517
+
518
+ if rescale:
519
+ image = self.rescale(image.astype(np.float32), 1 / 255.0)
520
+
521
+ if channel_first and image.ndim == 3:
522
+ image = image.transpose(2, 0, 1)
523
+
524
+ return image
525
+
526
+ def expand_dims(self, image):
527
+ """
528
+ Expands 2-dimensional `image` to 3 dimensions.
529
+
530
+ Args:
531
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
532
+ The image to expand.
533
+ """
534
+ self._ensure_format_supported(image)
535
+
536
+ # Do nothing if PIL image
537
+ if isinstance(image, PIL.Image.Image):
538
+ return image
539
+
540
+ if is_torch_tensor(image):
541
+ image = image.unsqueeze(0)
542
+ else:
543
+ image = np.expand_dims(image, axis=0)
544
+ return image
545
+
546
+ def normalize(self, image, mean, std, rescale=False):
547
+ """
548
+ Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
549
+ if it's a PIL Image.
550
+
551
+ Args:
552
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
553
+ The image to normalize.
554
+ mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
555
+ The mean (per channel) to use for normalization.
556
+ std (`List[float]` or `np.ndarray` or `torch.Tensor`):
557
+ The standard deviation (per channel) to use for normalization.
558
+ rescale (`bool`, *optional*, defaults to `False`):
559
+ Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
560
+ happen automatically.
561
+ """
562
+ self._ensure_format_supported(image)
563
+
564
+ if isinstance(image, PIL.Image.Image):
565
+ image = self.to_numpy_array(image, rescale=True)
566
+ # If the input image is a PIL image, it automatically gets rescaled. If it's another
567
+ # type it may need rescaling.
568
+ elif rescale:
569
+ if isinstance(image, np.ndarray):
570
+ image = self.rescale(image.astype(np.float32), 1 / 255.0)
571
+ elif is_torch_tensor(image):
572
+ image = self.rescale(image.float(), 1 / 255.0)
573
+
574
+ if isinstance(image, np.ndarray):
575
+ if not isinstance(mean, np.ndarray):
576
+ mean = np.array(mean).astype(image.dtype)
577
+ if not isinstance(std, np.ndarray):
578
+ std = np.array(std).astype(image.dtype)
579
+ elif is_torch_tensor(image):
580
+ import torch
581
+
582
+ if not isinstance(mean, torch.Tensor):
583
+ if isinstance(mean, np.ndarray):
584
+ mean = torch.from_numpy(mean)
585
+ else:
586
+ mean = torch.tensor(mean)
587
+ if not isinstance(std, torch.Tensor):
588
+ if isinstance(std, np.ndarray):
589
+ std = torch.from_numpy(std)
590
+ else:
591
+ std = torch.tensor(std)
592
+
593
+ if image.ndim == 3 and image.shape[0] in [1, 3]:
594
+ return (image - mean[:, None, None]) / std[:, None, None]
595
+ else:
596
+ return (image - mean) / std
597
+
598
+ def resize(self, image, size, resample=None, default_to_square=True, max_size=None):
599
+ """
600
+ Resizes `image`. Enforces conversion of input to PIL.Image.
601
+
602
+ Args:
603
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
604
+ The image to resize.
605
+ size (`int` or `Tuple[int, int]`):
606
+ The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
607
+ matched to this.
608
+
609
+ If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
610
+ `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
611
+                 this number, i.e., if height > width, then image will be rescaled to (size * height / width, size).
612
+ resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
613
+                 The filter to use for resampling.
614
+ default_to_square (`bool`, *optional*, defaults to `True`):
615
+ How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a
616
+ square (`size`,`size`). If set to `False`, will replicate
617
+ [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
618
+ with support for resizing only the smallest edge and providing an optional `max_size`.
619
+ max_size (`int`, *optional*, defaults to `None`):
620
+ The maximum allowed for the longer edge of the resized image: if the longer edge of the image is
621
+ greater than `max_size` after being resized according to `size`, then the image is resized again so
622
+                 that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e. the smaller
623
+ edge may be shorter than `size`. Only used if `default_to_square` is `False`.
624
+
625
+ Returns:
626
+ image: A resized `PIL.Image.Image`.
627
+ """
628
+ resample = resample if resample is not None else PILImageResampling.BILINEAR
629
+
630
+ self._ensure_format_supported(image)
631
+
632
+ if not isinstance(image, PIL.Image.Image):
633
+ image = self.to_pil_image(image)
634
+
635
+ if isinstance(size, list):
636
+ size = tuple(size)
637
+
638
+ if isinstance(size, int) or len(size) == 1:
639
+ if default_to_square:
640
+ size = (size, size) if isinstance(size, int) else (size[0], size[0])
641
+ else:
642
+ width, height = image.size
643
+ # specified size only for the smallest edge
644
+ short, long = (width, height) if width <= height else (height, width)
645
+ requested_new_short = size if isinstance(size, int) else size[0]
646
+
647
+ if short == requested_new_short:
648
+ return image
649
+
650
+ new_short, new_long = requested_new_short, int(requested_new_short * long / short)
651
+
652
+ if max_size is not None:
653
+ if max_size <= requested_new_short:
654
+ raise ValueError(
655
+ f"max_size = {max_size} must be strictly greater than the requested "
656
+ f"size for the smaller edge size = {size}"
657
+ )
658
+ if new_long > max_size:
659
+ new_short, new_long = int(max_size * new_short / new_long), max_size
660
+
661
+ size = (new_short, new_long) if width <= height else (new_long, new_short)
662
+
663
+ return image.resize(size, resample=resample)
664
+
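# Sketch (annotation, not part of the committed diff): smallest-edge resizing with
# `default_to_square=False`. For a 400x600 image and size=200, the short edge maps
# to 200 and the long edge scales proportionally to 600 * 200 / 400 = 300.
from PIL import Image

def demo_resize(feature_extractor):  # any object exposing this mixin's `resize`
    out = feature_extractor.resize(Image.new("RGB", (400, 600)), size=200,
                                   default_to_square=False)
    assert out.size == (200, 300)    # PIL reports (width, height)
    return out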
665
+ def center_crop(self, image, size):
666
+ """
667
+ Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
668
+ size given, it will be padded (so the returned result has the size asked).
669
+
670
+ Args:
671
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
672
+ The image to resize.
673
+ size (`int` or `Tuple[int, int]`):
674
+ The size to which crop the image.
675
+
676
+ Returns:
677
+ new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels,
678
+ height, width).
679
+ """
680
+ self._ensure_format_supported(image)
681
+
682
+ if not isinstance(size, tuple):
683
+ size = (size, size)
684
+
685
+ # PIL Image.size is (width, height) but NumPy array and torch Tensors have (height, width)
686
+ if is_torch_tensor(image) or isinstance(image, np.ndarray):
687
+ if image.ndim == 2:
688
+ image = self.expand_dims(image)
689
+ image_shape = image.shape[1:] if image.shape[0] in [1, 3] else image.shape[:2]
690
+ else:
691
+ image_shape = (image.size[1], image.size[0])
692
+
693
+ top = (image_shape[0] - size[0]) // 2
694
+ bottom = top + size[0] # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
695
+ left = (image_shape[1] - size[1]) // 2
696
+ right = left + size[1] # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
697
+
698
+ # For PIL Images we have a method to crop directly.
699
+ if isinstance(image, PIL.Image.Image):
700
+ return image.crop((left, top, right, bottom))
701
+
702
+ # Check if image is in (n_channels, height, width) or (height, width, n_channels) format
703
+ channel_first = True if image.shape[0] in [1, 3] else False
704
+
705
+ # Transpose (height, width, n_channels) format images
706
+ if not channel_first:
707
+ if isinstance(image, np.ndarray):
708
+ image = image.transpose(2, 0, 1)
709
+ if is_torch_tensor(image):
710
+ image = image.permute(2, 0, 1)
711
+
712
+ # Check if cropped area is within image boundaries
713
+ if top >= 0 and bottom <= image_shape[0] and left >= 0 and right <= image_shape[1]:
714
+ return image[..., top:bottom, left:right]
715
+
716
+ # Otherwise, we may need to pad if the image is too small. Oh joy...
717
+ new_shape = image.shape[:-2] + (max(size[0], image_shape[0]), max(size[1], image_shape[1]))
718
+ if isinstance(image, np.ndarray):
719
+ new_image = np.zeros_like(image, shape=new_shape)
720
+ elif is_torch_tensor(image):
721
+ new_image = image.new_zeros(new_shape)
722
+
723
+ top_pad = (new_shape[-2] - image_shape[0]) // 2
724
+ bottom_pad = top_pad + image_shape[0]
725
+ left_pad = (new_shape[-1] - image_shape[1]) // 2
726
+ right_pad = left_pad + image_shape[1]
727
+ new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image
728
+
729
+ top += top_pad
730
+ bottom += top_pad
731
+ left += left_pad
732
+ right += left_pad
733
+
734
+ new_image = new_image[
735
+ ..., max(0, top) : min(new_image.shape[-2], bottom), max(0, left) : min(new_image.shape[-1], right)
736
+ ]
737
+
738
+ return new_image
739
+
740
+ def flip_channel_order(self, image):
741
+ """
742
+ Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of
743
+ `image` to a NumPy array if it's a PIL Image.
744
+
745
+ Args:
746
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
747
+ The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should
748
+ be first.
749
+ """
750
+ self._ensure_format_supported(image)
751
+
752
+ if isinstance(image, PIL.Image.Image):
753
+ image = self.to_numpy_array(image)
754
+
755
+ return image[::-1, :, :]
756
+
757
+ def rotate(self, image, angle, resample=None, expand=0, center=None, translate=None, fillcolor=None):
758
+ """
759
+ Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees
760
+ counter clockwise around its centre.
761
+
762
+ Args:
763
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
764
+ The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before
765
+ rotating.
766
+
767
+ Returns:
768
+ image: A rotated `PIL.Image.Image`.
769
+ """
770
+ resample = resample if resample is not None else PIL.Image.NEAREST
771
+
772
+ self._ensure_format_supported(image)
773
+
774
+ if not isinstance(image, PIL.Image.Image):
775
+ image = self.to_pil_image(image)
776
+
777
+ return image.rotate(
778
+ angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor
779
+ )
780
+
781
+
782
+ def validate_annotations(
783
+ annotation_format: AnnotationFormat,
784
+ supported_annotation_formats: Tuple[AnnotationFormat, ...],
785
+ annotations: List[Dict],
786
+ ) -> None:
787
+ if annotation_format not in supported_annotation_formats:
788
+         raise ValueError(f"Unsupported annotation format: {annotation_format} must be one of {supported_annotation_formats}")
789
+
790
+ if annotation_format is AnnotationFormat.COCO_DETECTION:
791
+ if not valid_coco_detection_annotations(annotations):
792
+ raise ValueError(
793
+                 "Invalid COCO detection annotations. Annotations must be a dict (single image) or list of dicts "
794
+ "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
795
+ "being a list of annotations in the COCO format."
796
+ )
797
+
798
+ if annotation_format is AnnotationFormat.COCO_PANOPTIC:
799
+ if not valid_coco_panoptic_annotations(annotations):
800
+ raise ValueError(
801
+                 "Invalid COCO panoptic annotations. Annotations must be a dict (single image) or list of dicts "
802
+ "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
803
+ "the latter being a list of annotations in the COCO format."
804
+ )
805
+
806
+
807
+ def validate_kwargs(valid_processor_keys: List[str], captured_kwargs: List[str]):
808
+ unused_keys = set(captured_kwargs).difference(set(valid_processor_keys))
809
+ if unused_keys:
810
+ unused_key_str = ", ".join(unused_keys)
811
+ # TODO raise a warning here instead of simply logging?
812
+ logger.warning(f"Unused or unrecognized kwargs: {unused_key_str}.")
inference_utils.py ADDED
@@ -0,0 +1,412 @@
1
+ import re
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import random
6
+ import numpy as np
7
+ from transformers import LogitsProcessor, LogitsProcessorList, logging
8
+ from typing import Optional, List, Dict, Tuple, Union, Set
9
+
10
+ logger = logging.get_logger(__name__)
11
+
12
+
13
+ def parse_interleaved_text_image(
14
+ full_output_text: str,
15
+ num_levels: int = 2,
16
+ image_placeholder: str = "<image>",
17
+ start_tag: str = "<start_of_image>",
18
+ end_tag: str = "<end_of_image>"
19
+ ) -> Tuple[str, List[List[List[int]]], List[str]]:
20
+ """
21
+ Parses text containing interleaved image token blocks.
22
+
23
+ Identifies blocks enclosed by start_tag and end_tag, extracts image tokens
24
+ (<|image_levelX_Y|>) within them, and replaces the blocks with a placeholder
25
+ in the output text.
26
+
27
+ Args:
28
+ full_output_text: The raw input string containing text and image blocks.
29
+ num_levels: The expected number of levels for image tokens (e.g., 2).
30
+ image_placeholder: The string to replace image blocks with in the output text.
31
+ start_tag: The exact string marking the beginning of an image block.
32
+ end_tag: The exact string marking the end of an image block.
33
+ eos_token: If provided, this token will be removed from the final text.
34
+
35
+ Returns:
36
+ A tuple containing:
37
+ - generated_text (str): The text with image blocks replaced by placeholders.
38
+ - all_image_indices (List[List[List[int]]]): A list where each element
39
+ represents one image. Each image element is a list containing lists
40
+ of token indices for each level.
41
+ Example for 2 images, 2 levels:
42
+ [
43
+ [[level0_indices_img1], [level1_indices_img1]], # Image 1
44
+ [[level0_indices_img2], [level1_indices_img2]] # Image 2
45
+ ]
46
+ """
47
+ all_image_indices: List[List[List[int]]] = []
48
+ processed_text_parts: List[str] = []
49
+ list_image_token_parts: List[str] = []
50
+ last_end: int = 0
51
+
52
+ # Escape start/end tags for regex safety if they contain special characters
53
+ escaped_start_tag = re.escape(start_tag)
54
+ escaped_end_tag = re.escape(end_tag)
55
+
56
+ # Pattern to find image blocks: start_tag ... end_tag (non-greedy)
57
+ image_block_pattern = rf'{escaped_start_tag}(.*?){escaped_end_tag}'
58
+ # Pattern to find individual image tokens within a block
59
+ token_pattern = r'<\|image_level(\d+)_(\d+)\|>'
60
+
61
+ # Find all image blocks (re.DOTALL allows '.' to match newlines)
62
+ for match in re.finditer(image_block_pattern, full_output_text, re.DOTALL):
63
+ # 1. Add text preceding this image block
64
+ processed_text_parts.append(full_output_text[last_end:match.start()])
65
+
66
+ # collect the image token ids.
67
+ list_image_token_parts.append(full_output_text[match.start(): match.end()])
68
+
69
+ # 2. Add the placeholder for the image
70
+ processed_text_parts.append(image_placeholder)
71
+
72
+ # 3. Process the content *within* the current image block
73
+ image_token_content = match.group(1) # Content between tags
74
+ parsed_level_indices = {} # {level: [indices]} for *this* image
75
+
76
+ # Find all image tokens within this block
77
+ for token_match in re.finditer(token_pattern, image_token_content):
78
+ try:
79
+ level = int(token_match.group(1))
80
+ index = int(token_match.group(2))
81
+ if level >= num_levels:
82
+ logger.warning(f"Parsed token level {level} >= num_levels {num_levels}. Ignoring token.")
83
+ continue
84
+ if level not in parsed_level_indices:
85
+ parsed_level_indices[level] = []
86
+ parsed_level_indices[level].append(index)
87
+ except (ValueError, IndexError):
88
+ logger.warning(f"Could not parse token: {token_match.group(0)}")
89
+ continue # Skip malformed tokens
90
+
91
+ # Structure the indices for the current image based on expected levels
92
+ current_image_indices = []
93
+ logger.debug(f"Processing Image Block. Found levels: {parsed_level_indices.keys()}")
94
+ for level in range(num_levels):
95
+ # Get indices for the level, default to empty list if level not found
96
+ indices = parsed_level_indices.get(level, [])
97
+ # Optional: Sort indices if order isn't guaranteed (usually is by finditer)
98
+ # indices.sort()
99
+ current_image_indices.append(indices)
100
+ logger.debug(f" Level {level} indices count: {len(indices)}")
101
+
102
+ all_image_indices.append(current_image_indices)
103
+ logger.info(f"Parsed Image {len(all_image_indices)}: Found indices for {len(current_image_indices)} levels.")
104
+
105
+ # 4. Update position for the next iteration
106
+ last_end = match.end()
107
+
108
+ # Add any remaining text after the last image block
109
+ processed_text_parts.append(full_output_text[last_end:])
110
+
111
+ # Join the text parts to form the final generated text
112
+ generated_text = "".join(processed_text_parts)
113
+
114
+ return generated_text, all_image_indices, list_image_token_parts
115
+
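# Sketch (annotation, not part of the committed diff): parsing a tiny interleaved
# output string with one image block and two levels.
text = (
    "A cat: <start_of_image>"
    "<|image_level0_5|><|image_level0_6|><|image_level1_9|>"
    "<end_of_image> done."
)
gen_text, indices, raw_blocks = parse_interleaved_text_image(text, num_levels=2)
assert gen_text == "A cat: <image> done."
assert indices == [[[5, 6], [9]]]               # one image: level-0 then level-1 ids
assert raw_blocks[0].startswith("<start_of_image>")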
116
+
117
+ def calculate_image_token_num(h, w, downsample_rate_per_level=[28, 16]):
118
+ # Assuming RESOLUTION_MAPPING is accessible or hardcoded if needed
119
+ # For simplicity, let's assume direct calculation based on downsampling
120
+ # Replace with actual RESOLUTION_MAPPING logic if necessary
121
+ # Example: w1, h1 = RESOLUTION_MAPPING.get((w, h), (w, h)) # Get from mapping
122
+ w1, h1 = w, h # Placeholder if mapping not available/needed here
123
+ w1, h1 = w1 // downsample_rate_per_level[0], h1 // downsample_rate_per_level[0]
124
+ semantic_token_num = w1 * h1
125
+
126
+ w2, h2 = w // downsample_rate_per_level[1], h // downsample_rate_per_level[1]
127
+ pixel_token_num = w2 * h2
128
+ logger.info(f"Calculated token nums: semantic={semantic_token_num}, pixel={pixel_token_num} for target ({h},{w})")
129
+ # Estimate max_token_length (adjust based on special tokens in your format)
130
+ max_token_length = (h1 * (w1 + 1) + 2) + (h2 * (w2 + 1) + 2) + 2 + 2 + 1 + 1 + 50 # Add buffer
131
+ return [semantic_token_num, pixel_token_num], max_token_length, h1, w1, h2, w2
132
+
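# Sketch (annotation, not part of the committed diff): token-count arithmetic for a
# 512x512 target with the default per-level downsample rates [28, 16]:
#   level 0: 512 // 28 = 18 -> 18 * 18 = 324 semantic tokens
#   level 1: 512 // 16 = 32 -> 32 * 32 = 1024 pixel tokens
#   max length: (18 * 19 + 2) + (32 * 33 + 2) + 2 + 2 + 1 + 1 + 50 = 1458
token_nums, max_len, h1, w1, h2, w2 = calculate_image_token_num(512, 512)
assert token_nums == [324, 1024]
assert (h1, w1, h2, w2) == (18, 18, 32, 32)
assert max_len == 1458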
133
+
134
+ class InterleavedLogitsProcessor(LogitsProcessor):
135
+ """
136
+ Combines CFG, Dual VQ Image Token Structure Enforcement, and Dynamic Sampling
137
+ for interleaved text and image generation.
138
+
139
+ Includes refined masking during text generation to only allow text,
140
+ a specific resolution tag, and the start_of_image token.
141
+ """
142
+
143
+ def __init__(self,
144
+ # CFG parameters
145
+ guidance_scale=1.0,
146
+ uncond=None,
147
+ attention_mask=None,
148
+ model=None,
149
+ # DualVQ parameters
150
+ level0_range=None, level1_range=None,
151
+ num_level0_rows=None, num_level0_tokens=None,
152
+ num_level1_rows=None, num_level1_tokens=None,
153
+ special_tokens=None,
154
+ *,
155
+ # Dynamic Sampling parameters
156
+ default_temp=1.0, level0_temp=1.0, level1_temp=2.0,
157
+ default_top_k=2048, level0_top_k=2048, level1_top_k=2048 * 3,
158
+ default_top_p=0.8, level0_top_p=0.8, level1_top_p=1.0,
159
+ # General
160
+ images=None,
161
+ ):
162
+
163
+ # --- CFG ---
164
+ self.guidance_scale = guidance_scale
165
+ self.uncond = uncond
166
+ self.attention_mask = attention_mask
167
+ self.images = images
168
+ self.model = model
169
+ self.out = None
170
+
171
+ # --- DualVQ ---
172
+ self.level0_range = level0_range
173
+ self.level1_range = level1_range
174
+ self.num_level0_rows = num_level0_rows
175
+ self.num_level0_tokens = num_level0_tokens
176
+ self.num_level1_rows = num_level1_rows
177
+ self.num_level1_tokens = num_level1_tokens
178
+ self.special_tokens = special_tokens
179
+
180
+ # DualVQ State
181
+ self.generating_image = False
182
+ self.current_level = None
183
+ self.tokens_in_row = 0
184
+ self.rows_in_level = 0
185
+
186
+ # --- Dynamic Sampling ---
187
+ self.start_of_level0_token_id = special_tokens["start_of_level0"]
188
+ self.end_of_level0_token_id = special_tokens["end_of_level0"]
189
+ self.start_of_level1_token_id = special_tokens["start_of_level1"]
190
+ self.end_of_level1_token_id = special_tokens["end_of_level1"]
191
+ self.start_of_image_token_id = special_tokens["start_of_image"]
192
+ self.end_of_image_token_id = special_tokens["end_of_image"]
193
+
194
+ self.default_temp = default_temp
195
+ self.default_top_k = default_top_k
196
+ self.default_top_p = default_top_p
197
+ self.level0_temp = level0_temp
198
+ self.level0_top_k = level0_top_k
199
+ self.level0_top_p = level0_top_p
200
+ self.level1_temp = level1_temp
201
+ self.level1_top_k = level1_top_k
202
+ self.level1_top_p = level1_top_p
203
+
204
+ # Dynamic Sampling State
205
+ self.in_level0_mode = False
206
+ self.in_level1_mode = False
207
+
208
+ # --- Validation ---
209
+ if not self.special_tokens:
210
+ raise ValueError("special_tokens dictionary cannot be empty.")
211
+ # *** Updated required keys ***
212
+ required_keys = ["start_of_image", "end_of_image", "start_of_level0",
213
+ "end_of_level0", "start_of_level1", "end_of_level1",
214
+ "end_of_line", "end_of_text"]
215
+ for key in required_keys:
216
+ if key not in self.special_tokens:
217
+ raise ValueError(f"Missing required key in special_tokens: {key}")
218
+
219
+ def _apply_cfg(self, input_ids, scores):
220
+ """Applies Classifier-Free Guidance."""
221
+ scores = F.log_softmax(scores, dim=-1)
222
+ if self.guidance_scale == 1:
223
+ return scores
224
+
225
+ if self.out is None:
226
+ self.out = self.model(self.uncond,
227
+ attention_mask=self.attention_mask,
228
+ pixel_values=self.images)
229
+ else:
230
+ self.out = self.model(
231
+ input_ids[:, -1:],
232
+ use_cache=True,
233
+ past_key_values=self.out.past_key_values,
234
+ )
235
+ unconditional_logits = F.log_softmax(self.out.logits[:, -1, :], dim=-1)
236
+ out = self.guidance_scale * (scores - unconditional_logits) + unconditional_logits
237
+ return out
238
+
239
+ def _apply_sampling(self, scores, temp, top_k, top_p):
240
+ """ Apply top-k, top-p, and temperature """
241
+ if temp > 0.0:
242
+ scores = scores / temp # Adjust temperature
243
+
244
+ # Top-K filtering
245
+ if top_k > 0:
246
+ top_k_values, _ = torch.topk(scores, min(top_k, scores.size(-1)))
247
+ scores[scores < top_k_values[:, -1].unsqueeze(-1)] = -float("Inf")
248
+
249
+ # Top-P filtering
250
+ if top_p < 1.0:
251
+ sorted_logits, sorted_indices = torch.sort(scores, descending=True)
252
+ cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
253
+
254
+ # Only keep tokens with cumulative probabilities within top_p
255
+ sorted_indices_to_remove = cumulative_probs > top_p
256
+ sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
257
+ sorted_indices_to_remove[:, 0] = False
258
+
259
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
260
+ scores[indices_to_remove] = -float("Inf")
261
+
262
+ return scores
263
+
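# Sketch (annotation, not part of the committed diff): effect of the top-k / top-p
# filter on a toy distribution. top_k=3 drops the smallest logit; top_p=0.8 then
# keeps the smallest prefix of sorted tokens whose cumulative probability reaches
# 0.8 (here the 0.5 and 0.3 tokens).
import torch

def demo_sampling(processor):  # an InterleavedLogitsProcessor instance
    logits = torch.log(torch.tensor([[0.5, 0.3, 0.15, 0.05]]))
    out = processor._apply_sampling(logits, temp=1.0, top_k=3, top_p=0.8)
    return torch.isfinite(out)  # tensor([[ True,  True, False, False]])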
264
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
265
+
266
+ # --- Step 0: Get last token and Vocab Size ---
267
+ last_token = None
268
+ if input_ids.shape[1] > 0:
269
+ last_token = input_ids[0, -1].item() # Assuming batch size 1
270
+
271
+ # --- Step 1: Update State & Apply Constraints ---
272
+ # State updates based on the *last generated* token
273
+ if last_token == self.start_of_image_token_id:
274
+ self.generating_image = True
275
+ self.current_level = None
276
+ self.tokens_in_row = 0
277
+ self.rows_in_level = 0
278
+ self.in_level0_mode = False
279
+ self.in_level1_mode = False
280
+ elif last_token == self.start_of_level0_token_id:
281
+ self.current_level = "level0"
282
+ self.tokens_in_row = 0
283
+ self.rows_in_level = 0
284
+ self.in_level0_mode = True
285
+ self.in_level1_mode = False
286
+ elif last_token == self.end_of_level0_token_id:
287
+ self.current_level = None
288
+ self.in_level0_mode = False
289
+ elif last_token == self.start_of_level1_token_id:
290
+ self.current_level = "level1"
291
+ self.tokens_in_row = 0
292
+ self.rows_in_level = 0
293
+ self.in_level0_mode = False
294
+ self.in_level1_mode = True
295
+ elif last_token == self.end_of_level1_token_id:
296
+ self.current_level = None
297
+ self.in_level1_mode = False
298
+ elif last_token == self.end_of_image_token_id:
299
+ self.generating_image = False
300
+ self.current_level = None
301
+ self.tokens_in_row = 0
302
+ self.rows_in_level = 0
303
+ self.in_level0_mode = False
304
+ self.in_level1_mode = False
305
+ elif last_token == self.special_tokens["end_of_line"] and self.generating_image:
306
+ self.tokens_in_row = 0
307
+ self.rows_in_level += 1
308
+ elif self.generating_image and self.current_level is not None:
309
+ if (self.current_level == "level0" and self.level0_range[0] <= last_token < self.level0_range[1]) or \
310
+ (self.current_level == "level1" and self.level1_range[0] <= last_token < self.level1_range[1]):
311
+ self.tokens_in_row += 1
312
+
313
+ # --- Step 2: Apply CFG ---
314
+ if self.generating_image:
315
+ scores = self._apply_cfg(input_ids, scores)
316
+ else:
317
+ if self.out:
318
+ self.out = None
319
+
320
+ # Apply constraints based on the *current* state (determining the *next* token)
321
+ mask = torch.zeros_like(scores, dtype=torch.bool) # True means ALLOWED
322
+
323
+ if self.generating_image:
324
+ # --- Image Generation Masking ---
325
+ if self.current_level == "level0":
326
+ if self.rows_in_level == self.num_level0_rows:
327
+ mask[:, self.special_tokens["end_of_level0"]] = True
328
+ elif self.tokens_in_row == self.num_level0_tokens:
329
+ mask[:, self.special_tokens["end_of_line"]] = True
330
+ else:
331
+ mask[:, self.level0_range[0]:self.level0_range[1]] = True
332
+ elif self.current_level == "level1":
333
+ if self.rows_in_level == self.num_level1_rows:
334
+ mask[:, self.special_tokens["end_of_level1"]] = True
335
+ elif self.tokens_in_row == self.num_level1_tokens:
336
+ mask[:, self.special_tokens["end_of_line"]] = True
337
+ else:
338
+ mask[:, self.level1_range[0]:self.level1_range[1]] = True
339
+ else: # Between structure tokens
340
+ if last_token == self.start_of_image_token_id:
341
+ mask[:, self.special_tokens["start_of_level0"]] = True
342
+ elif last_token == self.end_of_level0_token_id:
343
+ mask[:, self.special_tokens["start_of_level1"]] = True
344
+ elif last_token == self.end_of_level1_token_id:
345
+ mask[:, self.special_tokens["end_of_image"]] = True
346
+ elif last_token is None and input_ids.shape[1] == 0: # Very first token is image?
347
+ mask[:, self.start_of_image_token_id] = True
348
+ else: # Allow relevant structural tokens if needed
349
+ mask[:, self.special_tokens["start_of_level0"]] = True
350
+ mask[:, self.special_tokens["start_of_level1"]] = True
351
+ mask[:, self.special_tokens["end_of_image"]] = True
352
+
353
+ else:
354
+ # Allow *all* tokens by default...
355
+ mask[:, :] = True
356
+ # ...then specifically *disallow* image content and intermediate structure tokens
357
+ mask[:, self.level0_range[0]:self.level0_range[1]] = False
358
+ mask[:, self.level1_range[0]:self.level1_range[1]] = False
359
+ mask[:, self.special_tokens["start_of_level0"]] = False
360
+ mask[:, self.special_tokens["end_of_level0"]] = False
361
+ mask[:, self.special_tokens["start_of_level1"]] = False
362
+ mask[:, self.special_tokens["end_of_level1"]] = False
363
+ mask[:, self.special_tokens["end_of_line"]] = False # EOL only allowed within image context
364
+
365
+ # Ensure the specific allowed tokens for text phase are indeed allowed
366
+ # (This overrides any potential disallowing above if IDs overlap, e.g., if EOS was in image range)
367
+ mask[:, self.special_tokens["end_of_text"]] = True
368
+ mask[:, self.special_tokens["start_of_image"]] = True
369
+
370
+ # Apply the mask
371
+ scores[~mask] = -float("Inf")
372
+
373
+ # Handle edge case: If all tokens are masked
374
+ if not torch.any(scores > -float("Inf"), dim=-1).all():
375
+ print("WARN: All tokens masked, allowing EOS.")
376
+ # Allow EOS and potentially other safe tokens if needed
377
+ scores[:] = -float("Inf") # Reset all to -inf first
378
+ scores[:, self.special_tokens["end_of_text"]] = 0
379
+
380
+ # --- Step 3: Apply Dynamic Sampling ---
381
+ current_temp, current_top_k, current_top_p = self.default_temp, self.default_top_k, self.default_top_p
382
+ if self.in_level0_mode:
383
+ current_temp, current_top_k, current_top_p = self.level0_temp, self.level0_top_k, self.level0_top_p
384
+ elif self.in_level1_mode:
385
+ current_temp, current_top_k, current_top_p = self.level1_temp, self.level1_top_k, self.level1_top_p
386
+
387
+ scores = self._apply_sampling(scores, current_temp, current_top_k, current_top_p)
388
+
389
+ return scores
390
+
391
+
392
+ def replace_placeholder_with_list(
393
+ tensor_a: torch.Tensor,
394
+ tensor_b: torch.Tensor,
395
+ placeholder_value: Union[int, float]
396
+ ) -> torch.Tensor:
397
+ if tensor_a.dim() != 1:
398
+ raise ValueError("Input tensor_a must be 1-dimensional.")
399
+
400
+ indices = torch.where(tensor_a == placeholder_value)[0]
401
+
402
+ if len(indices) == 0:
403
+ # Placeholder not found, return the original tensor
404
+ print(
405
+ f"Warning: Placeholder value {placeholder_value} not found in the tensor. Returning original tensor.")
406
+ return tensor_a
407
+
408
+ # Get the index of the *first* occurrence
409
+ idx = indices[0].item()
410
+
411
+ result_tensor = torch.cat((tensor_a[:idx], tensor_b.to(tensor_a), tensor_a[idx + 1:]), dim=0)
412
+ return result_tensor
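A minimal usage sketch for `replace_placeholder_with_list` (the ids below are made up for illustration): the first occurrence of the placeholder id in a 1-D prompt is spliced out and replaced by the whole image-token sequence.

```python
import torch

prompt = torch.tensor([101, 7, -200, 8, 102])         # -200 is an illustrative placeholder id
image_tokens = torch.tensor([9001, 9002, 9003])
merged = replace_placeholder_with_list(prompt, image_tokens, placeholder_value=-200)
print(merged)  # tensor([ 101,    7, 9001, 9002, 9003,    8,  102])
```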
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_dualvitok.py ADDED
@@ -0,0 +1,653 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ import math
6
+ from typing import Optional, Tuple, Union, List, Callable
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from torch.nn import Module
12
+
13
+ from einops import rearrange, repeat, pack, unpack
14
+ from einx import get_at
15
+
16
+ from torch.utils.checkpoint import checkpoint
17
+ from transformers import AutoImageProcessor
18
+ from transformers.modeling_utils import PreTrainedModel, get_parameter_device, get_parameter_dtype
19
+
20
+ from .configuration_dualvitok import DualViTokConfig
21
+ from .modeling_movqgan import MoVQModel, MoVQEncoder, MoVQDecoder, Decoder
22
+
23
+ from .configuration_qwen2vit import Qwen2VLVisionConfig
24
+ from .modeling_qwen2vit import Qwen2VisionTransformerPretrainedModel, \
25
+ VisionRotaryEmbedding, Qwen2VLBatchVisionBlock
26
+
27
+ try:
28
+ import xformers.ops as xops
29
+
30
+ is_xformers_available = True
31
+ except Exception as e:
32
+ is_xformers_available = False
33
+
34
+ if torch.__version__ > "2.1.2":
35
+ IS_SDPA_AVAILABLE = True
36
+ else:
37
+ IS_SDPA_AVAILABLE = False
38
+
39
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
40
+ sys.path.append(cur_dir)
41
+
42
+
43
+ # helper functions
44
+
45
+ def exists(v):
46
+ return v is not None
47
+
48
+
49
+ def identity(t):
50
+ return t
51
+
52
+
53
+ def default(v, d):
54
+ return v if exists(v) else d
55
+
56
+
57
+ def pack_one(t, pattern):
58
+ packed, packed_shape = pack([t], pattern)
59
+
60
+ def inverse(out, inv_pattern=None):
61
+ inv_pattern = default(inv_pattern, pattern)
62
+ out, = unpack(out, packed_shape, inv_pattern)
63
+ return out
64
+
65
+ return packed, inverse
66
+
67
+
68
+ # class
69
+
70
+
71
+ class SimVQ(Module):
72
+ def __init__(
73
+ self,
74
+ dim,
75
+ codebook_size,
76
+ codebook_transform: Module | None = None,
77
+ init_fn: Callable = identity,
78
+ channel_first=True,
79
+ input_to_quantize_commit_loss_weight=0.25,
80
+ commitment_weight=1.,
81
+ frozen_codebook_dim=None # frozen codebook dim could have different dimensions than projection
82
+ ):
83
+ super().__init__()
84
+ self.codebook_size = codebook_size
85
+ self.channel_first = channel_first
86
+
87
+ frozen_codebook_dim = default(frozen_codebook_dim, dim)
88
+ codebook = torch.randn(codebook_size, frozen_codebook_dim) * (frozen_codebook_dim ** -0.5)
89
+ codebook = init_fn(codebook)
90
+
91
+ # the codebook is actually implicit from a linear layer from frozen gaussian or uniform
92
+
93
+ if not exists(codebook_transform):
94
+ codebook_transform = nn.Linear(frozen_codebook_dim, dim, bias=False)
95
+
96
+ self.code_transform = codebook_transform
97
+
98
+ self.register_buffer('frozen_codebook', codebook)
99
+
100
+ # commit loss weighting - weighing input to quantize a bit less is crucial for it to work
101
+ self.input_to_quantize_commit_loss_weight = input_to_quantize_commit_loss_weight
102
+
103
+ # total commitment loss weight
104
+ self.commitment_weight = commitment_weight
105
+
106
+ @property
107
+ def codebook(self):
108
+ return self.code_transform(self.frozen_codebook)
109
+
110
+ def indices_to_codes(
111
+ self,
112
+ indices
113
+ ):
114
+ implicit_codebook = self.codebook
115
+
116
+ frozen_codes = get_at('[c] d, b ... -> b ... d', self.frozen_codebook, indices)
117
+ quantized = self.code_transform(frozen_codes)
118
+
119
+ if self.channel_first:
120
+ quantized = rearrange(quantized, 'b ... d -> b d ...')
121
+
122
+ return quantized
123
+
124
+ def forward(
125
+ self,
126
+ x
127
+ ):
128
+ if self.channel_first:
129
+ x = rearrange(x, 'b d ... -> b ... d')
130
+
131
+ x, inverse_pack = pack_one(x, 'b * d')
132
+
133
+ implicit_codebook = self.codebook
134
+
135
+ with torch.no_grad():
136
+ dist = torch.cdist(x, implicit_codebook)
137
+ indices = dist.argmin(dim=-1)
138
+
139
+ # select codes
140
+
141
+ quantized = get_at('[c] d, b n -> b n d', implicit_codebook, indices)
142
+
143
+ # commit loss and straight through, as was done in the paper
144
+
145
+ commit_loss = (
146
+ F.mse_loss(x.detach(), quantized) +
147
+ F.mse_loss(x, quantized.detach()) * self.input_to_quantize_commit_loss_weight
148
+ )
149
+
150
+ quantized = (quantized - x).detach() + x
151
+
152
+ quantized = inverse_pack(quantized)
153
+ indices = inverse_pack(indices, 'b *')
154
+
155
+ if self.channel_first:
156
+ quantized = rearrange(quantized, 'b ... d-> b d ...')
157
+
158
+ return quantized, commit_loss * self.commitment_weight, indices
159
+
160
+
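A minimal sketch of quantizing a small channel-first feature map with `SimVQ`; the shapes are illustrative only.

```python
import torch

vq = SimVQ(dim=32, codebook_size=1024)                 # channel_first=True by default
feats = torch.randn(2, 32, 8, 8)                       # (B, C, H, W)
quantized, commit_loss, indices = vq(feats)
print(quantized.shape, indices.shape)                  # torch.Size([2, 32, 8, 8]) torch.Size([2, 8, 8])
```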
161
+ def init_weights(m):
162
+ if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.LayerNorm):
163
+ if m.weight is not None:
164
+ nn.init.constant_(m.weight, 1)
165
+ if m.bias is not None:
166
+ nn.init.constant_(m.bias, 0)
167
+ elif isinstance(m, nn.Linear):
168
+ nn.init.xavier_uniform_(m.weight)
169
+ if m.bias is not None:
170
+ nn.init.constant_(m.bias, 0)
171
+ elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d) \
172
+ or isinstance(m, nn.Conv3d) or isinstance(m, nn.ConvTranspose3d):
173
+ w = m.weight.data
174
+ nn.init.xavier_uniform_(w)
175
+ if m.bias is not None:
176
+ nn.init.constant_(m.bias, 0)
177
+ elif isinstance(m, nn.Embedding):
178
+ nn.init.normal_(m.weight, mean=0, std=1)
179
+
180
+
181
+ class ScalingLayerForQwen2ViT:
182
+ def __init__(
183
+ self,
184
+ min_pixels: int = 56 * 56,
185
+ max_pixels: int = 28 * 28 * 1280,
186
+ patch_size: int = 14,
187
+ temporal_patch_size: int = 2,
188
+ merge_size: int = 2,
189
+ **kwargs,
190
+ ) -> None:
191
+ super().__init__(**kwargs)
192
+ OPENAI_CLIP_MEAN = torch.as_tensor([0.48145466, 0.4578275, 0.40821073])[None, :, None, None]
193
+ OPENAI_CLIP_STD = torch.as_tensor([0.26862954, 0.26130258, 0.27577711])[None, :, None, None]
194
+
195
+ self.image_mean = OPENAI_CLIP_MEAN
196
+ self.image_std = OPENAI_CLIP_STD
197
+ self.min_pixels = min_pixels
198
+ self.max_pixels = max_pixels
199
+ self.patch_size = patch_size
200
+ self.temporal_patch_size = temporal_patch_size
201
+ self.merge_size = merge_size
202
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
203
+
204
+ def __call__(self, images):
205
+ if images.ndim == 4:
206
+ images = images.unsqueeze(1)
207
+ batch_size, temporal, channel, height, width = images.shape
208
+
209
+ factor = self.patch_size * self.merge_size
210
+
211
+ resized_height, resized_width = height // factor * factor, width // factor * factor
212
+
213
+ images = (images + 1) / 2 # rescale to [0, 1.]
214
+
215
+ images = torch.nn.functional.interpolate(
216
+ images.flatten(0, 1).float(),
217
+ size=(resized_height, resized_width),
218
+ mode='bicubic',
219
+ align_corners=False,
220
+ antialias=True
221
+ ).to(images.dtype)
222
+
223
+ images = images.clamp(0, 1) # rescale to [0, 1.]
224
+ images = ((images - self.image_mean.to(images)) / self.image_std.to(images))
225
+
226
+ images = rearrange(images, '(b t) c h w -> b t c h w', b=batch_size, t=temporal)
227
+ if temporal == 1:
228
+ images = images.repeat_interleave(self.temporal_patch_size, dim=1)
229
+ temporal = self.temporal_patch_size
230
+
231
+ grid_t = temporal // self.temporal_patch_size
232
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
233
+
234
+ images = images.reshape(
235
+ batch_size * grid_t,
236
+ self.temporal_patch_size,
237
+ channel,
238
+ -1
239
+ )
240
+
241
+ images = rearrange(images, 'b p c n -> b n (c p)')
242
+ images = images.reshape(
243
+ batch_size * grid_t,
244
+ grid_h // self.merge_size,
245
+ self.merge_size,
246
+ self.patch_size,
247
+ grid_w // self.merge_size,
248
+ self.merge_size,
249
+ self.patch_size,
250
+ -1
251
+ )
252
+ images = rearrange(images, 'b h k s1 w l s2 n -> (b h w k l) (n s1 s2)')
253
+
254
+ return dict(image=images, image_grid_thw=torch.as_tensor([[grid_t, grid_h, grid_w] for _ in range(batch_size)]))
255
+
256
+
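Worked arithmetic for the resizing above, assuming a single 512×512 frame and the default patch settings:

```python
patch_size, merge_size, temporal_patch_size, channels = 14, 2, 2, 3
factor = patch_size * merge_size                # 28
resized = 512 // factor * factor                # 504: nearest multiple of 28 below 512
grid_h = grid_w = resized // patch_size         # 36
patch_dim = channels * temporal_patch_size * patch_size * patch_size  # 1176 values per flattened patch
print(grid_h, grid_w, patch_dim)                # 36 36 1176
```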
257
+ class SemanticEncoder(nn.Module):
258
+ def __init__(self,
259
+ semantic_encoder,
260
+ z_channels=4,
261
+ num_blocks=2,
262
+ embed_dim=1280,
263
+ proj_layer='linear',
264
+ attn_implementation='xformers',
265
+ target_mlp='identity',
266
+ ):
267
+ super().__init__()
268
+ self.embed_dim = embed_dim
269
+
270
+ if isinstance(semantic_encoder, str):
271
+ self.model = Qwen2VisionTransformerPretrainedModel.from_pretrained(
272
+ semantic_encoder,
273
+ attn_implementation=attn_implementation
274
+ )
275
+ elif isinstance(semantic_encoder, dict):
276
+ config = Qwen2VLVisionConfig(**semantic_encoder, attn_implementation=attn_implementation)
277
+ self.model = Qwen2VisionTransformerPretrainedModel(config)
278
+ else:
279
+ raise ValueError(f"Invalid semantic_encoder: {semantic_encoder}")
280
+ input_channels = self.model.config.hidden_size
281
+
282
+ for p in self.model.parameters():
283
+ p.requires_grad = False
284
+
285
+ self.proj_in = nn.Conv2d(input_channels, embed_dim, 1, 1) if input_channels != embed_dim else nn.Identity()
286
+
287
+ config = Qwen2VLVisionConfig(depth=num_blocks,
288
+ embed_dim=embed_dim, )
289
+ head_dim = config.embed_dim // config.num_heads
290
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
291
+
292
+ self.blocks = nn.ModuleList(
293
+ [Qwen2VLBatchVisionBlock(config, attn_implementation) for _ in range(num_blocks)]
294
+ )
295
+
296
+ if proj_layer == 'norm_linear':
297
+ self.proj_out = nn.Sequential(
298
+ nn.LayerNorm(embed_dim),
299
+ nn.Linear(
300
+ embed_dim,
301
+ z_channels,
302
+ )
303
+ )
304
+ elif proj_layer == 'linear':
305
+ self.proj_out = nn.Sequential(
306
+ nn.Linear(
307
+ embed_dim,
308
+ z_channels,
309
+ )
310
+ )
311
+ elif proj_layer == 'mlp':
312
+ self.proj_out = nn.Sequential(
313
+ nn.Linear(embed_dim, embed_dim),
314
+ nn.Tanh(),
315
+ nn.Linear(embed_dim, z_channels),
316
+ )
317
+ else:
318
+ raise RuntimeError(f"Wrong proj layer. Got {proj_layer}")
319
+
320
+ if target_mlp == 'identity':
321
+ self.target_mlp = nn.Sequential(
322
+ nn.Identity(),
323
+ )
324
+ elif target_mlp == 'norm':
325
+ self.target_mlp = nn.Sequential(
326
+ nn.LayerNorm(input_channels, eps=1e-6, elementwise_affine=False),
327
+ )
328
+ self.init_weight()
329
+
330
+ def init_weight(self):
331
+ self.proj_in.apply(init_weights)
332
+ self.blocks.apply(init_weights)
333
+ self.proj_out.apply(init_weights)
334
+ self.target_mlp.apply(init_weights)
335
+
336
+ def rot_pos_emb(self, grid_thw, max_seq_len):
337
+ pos_ids = torch.zeros((len(grid_thw), max_seq_len, 2), dtype=torch.long)
338
+ for idx, (t, h, w) in enumerate(grid_thw):
339
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
340
+ hpos_ids = hpos_ids.flatten()
341
+
342
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
343
+ wpos_ids = wpos_ids.flatten()
344
+
345
+ current_pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
346
+ pos_ids[idx, :current_pos_ids.shape[0]] = current_pos_ids
347
+ max_grid_size = grid_thw[:, 1:].max()
348
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
349
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(2)
350
+ return rotary_pos_emb
351
+
352
+ def forward(self, x, grid_thw):
353
+ x = self.model(x, grid_thw=grid_thw)
354
+
355
+ x = x_target = self.target_mlp(x)
356
+
357
+ x = F.linear(x,
358
+ self.proj_in.weight.view(self.proj_in.weight.shape[0], -1),
359
+ self.proj_in.bias)
360
+
361
+ new_grid_thw = torch.as_tensor([[t, h // 2, w // 2] for t, h, w in grid_thw])
362
+
363
+ seq_lens = [t_i * h_i * w_i for t_i, h_i, w_i in new_grid_thw]
364
+ max_seq_len = max(seq_lens)
365
+
366
+ x = rearrange(x, '(b h w) c -> b (h w) c', h=new_grid_thw[0, 1], w=new_grid_thw[0, 2])
367
+
368
+ rotary_pos_emb = self.rot_pos_emb(new_grid_thw, max_seq_len)
369
+
370
+ for blk in self.blocks:
371
+ x = blk(x, rotary_pos_emb=rotary_pos_emb)
372
+
373
+ x = self.proj_out(x) # [b, max_length, d]
374
+
375
+ t, h, w = new_grid_thw[0]
376
+ b = len(grid_thw)
377
+ x = rearrange(x, 'b (h w) c ->b c h w', b=b, h=h, w=w)
378
+ x_target = rearrange(x_target, '(b h w) c ->b c h w', b=b, h=h, w=w)
379
+ return x, x_target
380
+
381
+
382
+ class SemanticDecoder(nn.Module):
383
+ def __init__(self,
384
+ z_channels=4,
385
+ embed_dim=1280,
386
+ num_blocks=2,
387
+ output_channels=1280,
388
+ attn_implementation='xformers',
389
+ proj_layer='linear_norm'):
390
+ super().__init__()
391
+ self.proj_in = nn.Linear(z_channels, embed_dim)
392
+
393
+ self.output_channels = output_channels
394
+ config = Qwen2VLVisionConfig(depth=num_blocks, embed_dim=embed_dim)
395
+
396
+ self.blocks = nn.ModuleList(
397
+ [Qwen2VLBatchVisionBlock(config, attn_implementation) for _ in range(num_blocks)]
398
+ )
399
+ head_dim = config.embed_dim // config.num_heads
400
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
401
+
402
+ if proj_layer == 'norm_linear':
403
+ self.proj_out = nn.Sequential(
404
+ nn.LayerNorm(embed_dim),
405
+ nn.Linear(embed_dim, output_channels),
406
+ )
407
+ elif proj_layer == 'linear':
408
+ self.proj_out = nn.Sequential(
409
+ nn.Linear(embed_dim, output_channels)
410
+ )
411
+ elif proj_layer == 'mlp':
412
+ self.proj_out = nn.Sequential(
413
+ nn.Linear(embed_dim, embed_dim),
414
+ nn.Tanh(),
415
+ nn.Linear(embed_dim, output_channels),
416
+ )
417
+ elif proj_layer == 'linear_norm':
418
+ self.proj_out = nn.Sequential(
419
+ nn.Linear(embed_dim, output_channels),
420
+ nn.LayerNorm(output_channels),
421
+ )
422
+
423
+ self.apply(init_weights)
424
+
425
+ @property
426
+ def last_layer(self):
427
+ return self.proj_out[-1].weight
428
+
429
+ def rot_pos_emb(self, grid_thw, max_seq_len):
430
+ pos_ids = torch.zeros((len(grid_thw), max_seq_len, 2), dtype=torch.long)
431
+ for idx, (t, h, w) in enumerate(grid_thw):
432
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
433
+ hpos_ids = hpos_ids.flatten()
434
+
435
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
436
+ wpos_ids = wpos_ids.flatten()
437
+
438
+ current_pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
439
+ pos_ids[idx, :current_pos_ids.shape[0]] = current_pos_ids
440
+ max_grid_size = grid_thw[:, 1:].max()
441
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
442
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(2)
443
+ return rotary_pos_emb
444
+
445
+ def forward(self, z: torch.Tensor):
446
+ x = z
447
+
448
+ b, c, h, w = x.shape
449
+
450
+ x = rearrange(x, 'b c h w -> b (h w) c')
451
+
452
+ grid_thw = torch.as_tensor([[1, h, w] for _ in range(b)])
453
+ seq_lens = [t * h * w for t, h, w in grid_thw]
454
+ max_seq_len = max(seq_lens)
455
+
456
+ x = self.proj_in(x)
457
+
458
+ rotary_pos_emb = self.rot_pos_emb(grid_thw, max_seq_len)
459
+
460
+ for blk in self.blocks:
461
+ x = blk(x, rotary_pos_emb=rotary_pos_emb)
462
+
463
+ x = self.proj_out(x)
464
+ x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
465
+ return x
466
+
467
+
468
+ class DualViTokPretrainModel(PreTrainedModel):
469
+ """
470
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
471
+ models.
472
+ """
473
+
474
+ config_class = DualViTokConfig
475
+ base_model_prefix = "dualvitok"
476
+ main_input_name = "pixel_values"
477
+ _no_split_modules = ["BatchQwen2VLVisionBlock", "MoVQResnetBlock", "MoVQAttnBlock", "MoVQResnetTemporalBlock"]
478
+ _supports_flash_attn_2 = True
479
+ _supports_sdpa = True
480
+ _supports_cache_class = True
481
+ _supports_static_cache = True
482
+
483
+ def _init_weights(self, module):
484
+ if isinstance(module, (nn.Conv2d, nn.Conv3d)):
485
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
486
+ # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
487
+ elif isinstance(module, nn.Linear):
488
+ nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
489
+ if module.bias is not None:
490
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
491
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
492
+ nn.init.uniform_(module.bias, -bound, bound)
493
+ elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm)):
494
+ nn.init.constant_(module.weight, 1)
495
+ nn.init.constant_(module.bias, 0)
496
+
497
+
498
+ class DualViTok(DualViTokPretrainModel):
499
+ def __init__(self, config: DualViTokConfig):
500
+ super().__init__(config)
501
+ self.config = config
502
+
503
+ self._semantic_channel = config.semantic_encoder.z_channels
504
+ self._pixel_channel = config.pixel_encoder.z_channels
505
+
506
+ self.semantic_encoder = SemanticEncoder(
507
+ semantic_encoder=config.semantic_encoder.pretrained_semantic_encoder,
508
+ z_channels=config.semantic_encoder.z_channels,
509
+ num_blocks=config.semantic_encoder.num_blocks,
510
+ embed_dim=config.semantic_encoder.embed_dim,
511
+ proj_layer=config.semantic_encoder.out_layer,
512
+ attn_implementation=config.attn_implementation,
513
+ target_mlp=config.semantic_encoder.target_mlp, )
514
+ self.semantic_decoder = SemanticDecoder(
515
+ z_channels=config.semantic_decoder.z_channels,
516
+ embed_dim=config.semantic_decoder.embed_dim,
517
+ num_blocks=config.semantic_decoder.num_blocks,
518
+ output_channels=config.semantic_decoder.out_channels,
519
+ attn_implementation=config.attn_implementation,
520
+ proj_layer=config.semantic_decoder.out_layer,
521
+ )
522
+
523
+ if config.semantic_quantizer_type.lower() == 'simvq':
524
+ self.semantic_quantizer = SimVQ(
525
+ dim=config.semantic_encoder.z_channels,
526
+ codebook_size=config.semantic_quantizer_codebook_size,
527
+ )
528
+ elif config.semantic_quantizer_type.lower() == 'vq':
529
+ raise NotImplementedError("The 'vq' semantic quantizer is not implemented; use 'simvq'.")
530
+ self.semantic_quantizer = VQ(
531
+ dim=config.semantic_encoder.z_channels,
532
+ codebook_size=config.semantic_quantizer_codebook_size,
533
+ )
534
+
535
+ self.pixel_encoder = MoVQEncoder(config.pixel_encoder)
536
+ self.pixel_quant_conv = nn.Conv2d(config.pixel_encoder.z_channels, config.pixel_encoder.embed_dim, 1)
537
+
538
+ if config.pixel_quantizer_type.lower() == 'simvq':
539
+ self.pixel_quantizer = SimVQ(
540
+ dim=config.pixel_encoder.z_channels,
541
+ codebook_size=config.pixel_quantizer_codebook_size,
542
+ )
543
+ elif config.pixel_quantizer_type.lower() == 'vq':
544
+ raise NotImplementedError("The 'vq' pixel quantizer is not implemented; use 'simvq'.")
545
+ self.pixel_quantizer = VQ(
546
+ dim=config.pixel_encoder.z_channels,
547
+ codebook_size=config.pixel_quantizer_codebook_size,
548
+ )
549
+
550
+ self.pixel_post_quant_conv = nn.Conv2d(config.pixel_decoder.embed_dim,
551
+ config.pixel_decoder.z_channels, 1)
552
+
553
+ self.pixel_decoder = MoVQDecoder(config.pixel_decoder)
554
+
555
+ self.scaling_layer = ScalingLayerForQwen2ViT()
556
+
557
+ @property
558
+ def device(self):
559
+ return get_parameter_device(self)
560
+
561
+ @property
562
+ def dtype(self):
563
+ return get_parameter_dtype(self)
564
+
565
+ @property
566
+ def pixel_channel(self):
567
+ return self._pixel_channel
568
+
569
+ @property
570
+ def semantic_channel(self):
571
+ return self._semantic_channel
572
+
573
+ def encode(self, image: torch.FloatTensor):
574
+ scale_output = self.scaling_layer(image)
575
+ image, image_grid_thw, image_gen = scale_output['image'], scale_output['image_grid_thw'], image
576
+
577
+ h_semantic, target_semantic = self.semantic_encoder(image, image_grid_thw)
578
+ quant_semantic, emb_loss_semantic, info_semantic = self.semantic_quantizer(h_semantic.float())
579
+
580
+ h_pixel = self.pixel_encoder(image_gen)
581
+ h_pixel = self.pixel_quant_conv(h_pixel)
582
+
583
+ quant_pixel, emb_loss_pixel, info_pixel = self.pixel_quantizer(h_pixel.float())
584
+
585
+ return (quant_semantic, emb_loss_semantic, info_semantic, target_semantic), \
586
+ (quant_pixel, emb_loss_pixel, info_pixel)
587
+
588
+ def encode_code(self, *args, **kwargs):
589
+ (_, _, semantic_indices, _), \
590
+ (_, _, pixel_indices) = self.encode(*args, **kwargs)
591
+ return semantic_indices, pixel_indices
592
+
593
+ def indices_to_codes(self, semantic_indices, pixel_indices):
594
+ quant_semantic = self.semantic_quantizer.indices_to_codes(semantic_indices)
595
+ quant_pixel = self.pixel_quantizer.indices_to_codes(pixel_indices)
596
+ return quant_semantic, quant_pixel
597
+
598
+ def encode_semantic(self, image: torch.FloatTensor):
599
+ scale_output = self.scaling_layer(image)
600
+ image, image_grid_thw, image_gen = scale_output['image'], scale_output['image_grid_thw'], image
601
+
602
+ h_semantic, target_semantic = self.semantic_encoder(image, image_grid_thw)
603
+ quant_semantic, emb_loss_semantic, info_semantic = self.semantic_quantizer(h_semantic.float())
604
+ return quant_semantic, emb_loss_semantic, info_semantic, target_semantic
605
+
606
+ def merge_quants(self, quant_semantic: torch.Tensor, quant_pixel: torch.Tensor):
607
+ quant_semantic_resized = F.interpolate(
608
+ quant_semantic, quant_pixel.shape[-2:], mode='bicubic'
609
+ ).to(quant_semantic.dtype)
610
+ quant_semantic = quant_semantic_resized
611
+
612
+ quant = torch.cat([quant_semantic, quant_pixel], dim=1)
613
+
614
+ return quant
615
+
616
+ def decode(self, quant_semantic: torch.Tensor, quant_pixel: torch.Tensor, ):
617
+ quant = self.merge_quants(quant_semantic, quant_pixel)
618
+ quant2 = self.pixel_post_quant_conv(quant)
619
+ x = self.pixel_decoder(quant2, quant)
620
+ return x
621
+
622
+ def decode_code(self, semantic_indices, pixel_indices):
623
+ quant_semantic = self.semantic_quantizer.indices_to_codes(semantic_indices)
624
+ quant_pixel = self.pixel_quantizer.indices_to_codes(pixel_indices)
625
+ return self.decode(quant_semantic, quant_pixel)
626
+
627
+ def decode_semantic(self, x: List[torch.Tensor]):
628
+ return self.semantic_decoder(x)
629
+
630
+ def forward(self, pixel_values: torch.FloatTensor):
631
+ (quant_semantic, diff_semantic, _, target_semantic), \
632
+ (quant_pixel, diff_pixel, _) = self.encode(pixel_values)
633
+ dec = self.decode(quant_semantic, quant_pixel)
634
+ dec_semantic = self.decode_semantic(quant_semantic)
635
+ return (dec_semantic, diff_semantic, target_semantic), (dec, diff_pixel)
636
+
637
+ def build_sdxl_decoder(self, path='ILLUME-MLLM/dualvitok-sdxl-decoder',
638
+ image_processor=None,
639
+ torch_dtype=torch.float16,
640
+ add_watermarker=False,
641
+ device='cuda',
642
+ ):
643
+ from .sdxl_decoder_pipe import StableDiffusionXLDecoderPipeline
644
+
645
+ if image_processor is None:
646
+ image_processor = AutoImageProcessor.from_pretrained('ILLUME-MLLM/dualvitok', trust_remote_code=True)
647
+
648
+ return StableDiffusionXLDecoderPipeline.from_pretrained(path,
649
+ torch_dtype=torch_dtype,
650
+ add_watermarker=add_watermarker,
651
+ vq_model=self,
652
+ vq_image_processor=image_processor).to(device)
653
+
modeling_illume.py ADDED
@@ -0,0 +1,883 @@
1
+ """PyTorch ILLUME model."""
2
+
3
+ import math
4
+ from dataclasses import dataclass
5
+ from functools import partial
6
+ from typing import List, Optional, Tuple, Union
7
+
8
+ import torch
9
+ import torch.utils.checkpoint
10
+ from torch import nn
11
+
12
+ from transformers import PreTrainedModel
13
+ from transformers.activations import ACT2FN
14
+ from transformers.cache_utils import Cache
15
+ from transformers.modeling_outputs import ModelOutput
16
+ from transformers.utils import (
17
+ add_start_docstrings,
18
+ add_start_docstrings_to_model_forward,
19
+ logging,
20
+ replace_return_docstrings,
21
+ )
22
+ from transformers.models.auto import AutoModel, AutoModelForCausalLM
23
+ from transformers import LogitsProcessorList
24
+
25
+ from .configuration_illume import ILLUMEConfig
26
+ from .modeling_qwen2vit import Qwen2VisionTransformerPretrainedModel
27
+ from .modeling_dualvitok import ScalingLayerForQwen2ViT, SemanticEncoder
28
+ from .modeling_movqgan import MoVQEncoder
29
+ from .inference_utils import InterleavedLogitsProcessor, \
30
+ parse_interleaved_text_image, calculate_image_token_num
31
+
32
+
33
+ from einops import rearrange
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+ _CONFIG_FOR_DOC = "ILLUMEConfig"
38
+
39
+ # Define common resolutions
40
+ DEFAULT_RESOLUTIONS = [
41
+ (256, 256), (512, 512), (384, 640), (640, 384), (512, 384),
42
+ (384, 512), (256, 384), (384, 256), (256, 512), (512, 256)
43
+ ]
44
+
45
+ # qwen2.5
46
+ special_tokens_ids = [151665, 151666, 151667, 151668, 151669, 151670, 151671]
47
+ start_token = 151672 + 32
48
+ level0_range = (start_token, start_token + 32768) # Level 0 token ID range
49
+ level1_range = (start_token + 32768, start_token + 32768 * 4) # Level 1 token ID range
50
+
51
+ special_tokens_dict = {
52
+ "start_of_image": 151665,
53
+ "end_of_image": 151666,
54
+ "start_of_level0": 151668,
55
+ "end_of_level0": 151669,
56
+ "start_of_level1": 151670,
57
+ "end_of_level1": 151671,
58
+ "end_of_line": 151667,
59
+ "end_of_text": 151645,
60
+ #
61
+ "level0_range": level0_range,
62
+ "level1_range": level1_range,
63
+ }
64
+
65
+
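For reference, the ranges defined above evaluate to the following concrete ids (pure arithmetic on the constants):

```python
start_token = 151672 + 32           # 151704
level0_range = (151704, 184472)     # start_token .. start_token + 32768
level1_range = (184472, 282776)     # start_token + 32768 .. start_token + 4 * 32768
```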
66
+ @dataclass
67
+ class ILLUMECausalLMOutputWithPast(ModelOutput):
68
+ """
69
+ Base class for ILLUME causal language model (or autoregressive) outputs.
70
+
71
+ Args:
72
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
73
+ Language modeling loss (for next-token prediction).
74
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
75
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
76
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
77
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
78
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
79
+
80
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
81
+ `past_key_values` input) to speed up sequential decoding.
82
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
83
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
84
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
85
+
86
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
87
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
88
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
89
+ sequence_length)`.
90
+
91
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
92
+ heads.
93
+ image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
94
+ Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
95
+ sequence_length, hidden_size)`.
96
+
97
+ image_hidden_states of the model produced by the vision encoder.
98
+ """
99
+
100
+ loss: Optional[torch.FloatTensor] = None
101
+ logits: torch.FloatTensor = None
102
+ past_key_values: Optional[List[torch.FloatTensor]] = None
103
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
104
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
105
+ image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
106
+
107
+
108
+ class MLPProjector(nn.Sequential):
109
+ # CAbstractor
110
+ def __init__(self, mlp_depth, hidden_size, mm_hidden_size):
111
+ super(MLPProjector, self).__init__()
112
+ modules = [nn.Linear(mm_hidden_size, hidden_size)]
113
+ for _ in range(1, mlp_depth):
114
+ modules.append(nn.GELU())
115
+ modules.append(nn.Linear(hidden_size, hidden_size))
116
+ super(MLPProjector, self).__init__(*modules)
117
+
118
+
119
+ class ILLUMEMultiModalProjector(nn.Sequential):
120
+ # CAbstractor
121
+ def __init__(self, config):
122
+ super(ILLUMEMultiModalProjector, self).__init__()
123
+ hidden_size = config.text_config.hidden_size
124
+ mm_hidden_size1, mm_hidden_size2 = config.mm_projector_config['mm_hidden_size']
125
+ self.projector_1 = MLPProjector(mlp_depth=config.mm_projector_config['projector_cfg1']['mlp_depth'],
126
+ mm_hidden_size=mm_hidden_size1, hidden_size=hidden_size)
127
+ self.projector_2 = MLPProjector(mlp_depth=config.mm_projector_config['projector_cfg2']['mlp_depth'],
128
+ mm_hidden_size=mm_hidden_size2, hidden_size=hidden_size)
129
+
130
+ def forward(self, image_features):
131
+ image_feature_1, image_feature_2 = image_features
132
+ image_feature_1 = self.projector_1(image_feature_1)
133
+ image_feature_2 = self.projector_2(image_feature_2)
134
+ image_features = torch.concat([image_feature_1, image_feature_2], dim=1)
135
+ return image_features
136
+
137
+
138
+ class ILLUMEDualVisionTower(nn.Module):
139
+ def __init__(self,
140
+ vision_config,
141
+ attn_implementation='sdpa',
142
+ ):
143
+ super().__init__()
144
+ self._config = vision_config
145
+ self.semantic_encoder = SemanticEncoder(
146
+ semantic_encoder=vision_config.semantic_encoder.pretrained_semantic_encoder,
147
+ z_channels=vision_config.semantic_encoder.z_channels,
148
+ num_blocks=vision_config.semantic_encoder.num_blocks,
149
+ embed_dim=vision_config.semantic_encoder.embed_dim,
150
+ proj_layer=vision_config.semantic_encoder.out_layer,
151
+ attn_implementation=attn_implementation,
152
+ target_mlp=vision_config.semantic_encoder.target_mlp, ).model
153
+ self.pixel_encoder = MoVQEncoder(vision_config.pixel_encoder)
154
+ self.scaling_layer = ScalingLayerForQwen2ViT()
155
+
156
+ def forward(self, images):
157
+ if isinstance(images, list) and all(x is not None and x.shape == images[0].shape for x in images):
158
+ images = torch.concat(images, dim=0)
159
+ images = images.to(device=self.device, dtype=self.dtype)
160
+ else:
161
+ images = [image.to(device=self.device, dtype=self.dtype) for image in images]
162
+
163
+ image_feature_shape_pixels, image_feature_shape_semantics = [], []
164
+ if isinstance(images, list): # anyres setting
165
+ h_pixels = []
166
+ for image in images:
167
+ if image.ndim == 3:
168
+ image = image.unsqueeze(0)
169
+ h_pixel = self.pixel_encoder(image)
170
+ b, c, h, w = h_pixel.shape
171
+ image_feature_shape_pixels.append((h, w))
172
+ h_pixel = rearrange(h_pixel, 'b c h w -> b (h w) c')
173
+ h_pixels.append(h_pixel)
174
+ h_pixels = torch.cat(h_pixels, dim=1)
175
+
176
+ h_semantics = []
177
+ for image in images:
178
+ if image.ndim == 3:
179
+ image = image.unsqueeze(0)
180
+ image = image.unsqueeze(dim=1)
181
+ scale_output = self.scaling_layer(image.clone())
182
+ image_2, image_grid_thw = scale_output['image'], scale_output['image_grid_thw']
183
+ image_feature_shape_semantics.append((int(image_grid_thw[0][1]) // 2, int(image_grid_thw[0][2] // 2)))
184
+ h_semantic = self.semantic_encoder(image_2, image_grid_thw)
185
+ h_semantics.append(h_semantic)
186
+ h_semantics = torch.cat(h_semantics, dim=0)
187
+ h_semantics = h_semantics.unsqueeze(dim=0)
188
+
189
+ image_feature_shapes = [[shape_semantic, shape_pixel] for shape_semantic, shape_pixel in
190
+ zip(image_feature_shape_semantics, image_feature_shape_pixels)]
191
+
192
+ else: # fixed res setting
193
+ assert images.ndim == 4
194
+ h_pixels = self.pixel_encoder(images)
195
+ b, c, h, w = h_pixels.shape
196
+ h_pixels = rearrange(h_pixels, 'b c h w -> (b h w) c')
197
+ h_pixels = h_pixels.unsqueeze(dim=0)
198
+
199
+ images = images.unsqueeze(dim=1)
200
+ scale_output = self.scaling_layer(images.clone())
201
+ images_2, images_grid_thw = scale_output['image'], scale_output['image_grid_thw']
202
+
203
+ h_semantics = self.semantic_encoder(images_2, images_grid_thw)
204
+ h_semantics = h_semantics.unsqueeze(dim=0)
205
+
206
+ shape_semantic = (int(images_grid_thw[0][1]) // 2, int(images_grid_thw[0][2] // 2))
207
+ shape_pixel = (h, w)
208
+ image_feature_shapes = [[shape_semantic, shape_pixel] for i in range(b)]
209
+
210
+ return [h_semantics, h_pixels], image_feature_shapes
211
+
212
+ @property
213
+ def dtype(self):
214
+ return self.semantic_encoder.dtype
215
+
216
+ @property
217
+ def device(self):
218
+ return self.semantic_encoder.device
219
+
220
+ @property
221
+ def config(self):
222
+ return self._config
223
+
224
+ @property
225
+ def hidden_size(self):
226
+ return self.config.hidden_size
227
+
228
+
229
+ class ILLUMEPreTrainedModel(PreTrainedModel):
230
+ config_class = ILLUMEConfig
231
+ base_model_prefix = "model"
232
+ supports_gradient_checkpointing = True
233
+ _no_split_modules = ["ILLUMEVisionAttention"]
234
+ _skip_keys_device_placement = "past_key_values"
235
+ _supports_flash_attn_2 = True
236
+ _supports_cache_class = True
237
+
238
+ def _init_weights(self, module):
239
+ std = (
240
+ self.config.initializer_range
241
+ if hasattr(self.config, "initializer_range")
242
+ else self.config.text_config.initializer_range
243
+ )
244
+
245
+ if hasattr(module, "class_embedding"):
246
+ module.class_embedding.data.normal_(mean=0.0, std=std)
247
+
248
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
249
+ module.weight.data.normal_(mean=0.0, std=std)
250
+ if module.bias is not None:
251
+ module.bias.data.zero_()
252
+ elif isinstance(module, nn.Embedding):
253
+ module.weight.data.normal_(mean=0.0, std=std)
254
+ if module.padding_idx is not None:
255
+ module.weight.data[module.padding_idx].zero_()
256
+
257
+ @property
258
+ def _supports_sdpa(self):
259
+ """
260
+ Retrieve language_model's attribute to check whether the model supports
261
+ SDPA or not.
262
+ """
263
+ return self.language_model._supports_sdpa
264
+
265
+
266
+ class ILLUMEForConditionalGeneration(ILLUMEPreTrainedModel):
267
+ def __init__(self, config: ILLUMEConfig, **kwargs):
268
+ super().__init__(config)
269
+ self.vision_tower = ILLUMEDualVisionTower(config.vision_config)
270
+ self.mm_projector = ILLUMEMultiModalProjector(config)
271
+
272
+ self.vocab_size = config.text_config.vocab_size
273
+ self.language_model = AutoModelForCausalLM.from_config(
274
+ config.text_config, attn_implementation=config._attn_implementation
275
+ )
276
+ self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
277
+ self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides
278
+ self.post_init()
279
+
280
+ @property
281
+ def padding_side(self):
282
+ return self._padding_side
283
+
284
+ @padding_side.setter
285
+ def padding_side(self, padding_side: str):
286
+ if padding_side not in ["left", "right"]:
287
+ raise ValueError(f"{padding_side} is not `left` or `right`.")
288
+ self._padding_side = padding_side
289
+
290
+ def get_input_embeddings(self):
291
+ return self.language_model.get_input_embeddings()
292
+
293
+ def set_input_embeddings(self, value):
294
+ self.language_model.set_input_embeddings(value)
295
+
296
+ def get_output_embeddings(self):
297
+ return self.language_model.get_output_embeddings()
298
+
299
+ def set_output_embeddings(self, new_embeddings):
300
+ self.language_model.set_output_embeddings(new_embeddings)
301
+
302
+ def set_decoder(self, decoder):
303
+ self.language_model.set_decoder(decoder)
304
+
305
+ def get_decoder(self):
306
+ return self.language_model.get_decoder()
307
+
308
+ def tie_weights(self):
309
+ return self.language_model.tie_weights()
310
+
311
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
312
+ model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
313
+ # update vocab size
314
+ self.config.text_config.vocab_size = model_embeds.num_embeddings
315
+ self.vocab_size = model_embeds.num_embeddings
316
+ return model_embeds
317
+
318
+ def _add_eol(self, x_feat, eol_feature):
319
+ h, w, C = x_feat.shape
320
+ eol_feature = eol_feature.unsqueeze(0).unsqueeze(0).expand(h, 1, C)
321
+ x_feat = torch.cat([x_feat, eol_feature], dim=1)
322
+ x_feat = x_feat.view(-1, C)
323
+ return x_feat
324
+
325
+ def _reformat_image_sequence(self, x, special_tokens_features, level):
326
+ # add end_of_line
327
+ x = self._add_eol(x, special_tokens_features[2])
328
+
329
+ # add soi, eoi, sol, eol
330
+ x = torch.cat([
331
+ special_tokens_features[3 + level * 2].unsqueeze(0),
332
+ x,
333
+ special_tokens_features[3 + level * 2 + 1].unsqueeze(0),
334
+ ], dim=0)
335
+ return x
336
+
337
+ def _merge_input_ids_with_image_features(
338
+ self,
339
+ image_features,
340
+ feature_lens,
341
+ inputs_embeds,
342
+ input_ids,
343
+ attention_mask,
344
+ position_ids=None,
345
+ labels=None,
346
+ image_token_index=None,
347
+ ignore_index=-100,
348
+ ):
349
+ image_token_index = image_token_index if image_token_index is not None else self.config.image_token_index
350
+ ignore_index = ignore_index if ignore_index is not None else self.config.ignore_index
351
+
352
+ with torch.no_grad():
353
+ num_images = feature_lens.size(0)
354
+ num_image_features, embed_dim = image_features.shape
355
+ if feature_lens.sum() != num_image_features:
356
+ raise ValueError(f"{feature_lens=} / {feature_lens.sum()} != {image_features.shape=}")
357
+ batch_size = input_ids.shape[0]
358
+ _left_padding = torch.any(attention_mask[:, 0] == 0)
359
+ _right_padding = torch.any(attention_mask[:, -1] == 0)
360
+
361
+ left_padding = True if not self.training else False
362
+ if batch_size > 1 and not self.training:
363
+ if _left_padding and not _right_padding:
364
+ left_padding = True
365
+ elif not _left_padding and _right_padding:
366
+ left_padding = False
367
+ elif not _left_padding and not _right_padding:
368
+ # both side is 1, so cannot tell
369
+ left_padding = self.padding_side == "left"
370
+ else:
371
+ # invalid attention_mask
372
+ raise ValueError(f"both side of attention_mask has zero, invalid. {attention_mask}")
373
+
374
+ # Whether to turn off right padding
375
+ # 1. Create a mask to know where special image tokens are
376
+ special_image_token_mask = input_ids == image_token_index
377
+ # special_image_token_mask: [bsz, seqlen]
378
+ num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
379
+ # num_special_image_tokens: [bsz]
380
+ # Reserve for padding of num_images
381
+ total_num_special_image_tokens = torch.sum(special_image_token_mask)
382
+ if total_num_special_image_tokens != num_images:
383
+ raise ValueError(
384
+ f"Number of image tokens in input_ids ({total_num_special_image_tokens}) different from num_images ({num_images})."
385
+ )
386
+ # Compute the maximum embed dimension
387
+ # max_image_feature_lens is max_feature_lens per batch
388
+ feature_lens = feature_lens.to(input_ids.device)
389
+ feature_lens_batch = feature_lens.split(num_special_image_tokens.tolist(), dim=0)
390
+ feature_lens_batch_sum = torch.tensor([x.sum() for x in feature_lens_batch], device=input_ids.device)
391
+ embed_sequence_lengths = (
392
+ (attention_mask == 1).long().sum(-1) - num_special_image_tokens + feature_lens_batch_sum
393
+ )
394
+ max_embed_dim = embed_sequence_lengths.max()
395
+
396
+ batch_indices, non_image_indices = torch.where((input_ids != image_token_index) & (attention_mask == 1))
397
+ # 2. Compute the positions where text should be written
398
+ # Calculate new positions for text tokens in merged image-text sequence.
399
+ # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images` text tokens.
400
+ # `torch.cumsum` computes how each image token shifts subsequent text token positions.
401
+ # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
402
+ # ! instead of special_image_token_mask * (num_image_patches - 1)
403
+ # special_image_token_mask * (num_feature_len - 1)
404
+ special_image_token_mask = special_image_token_mask.long()
405
+ special_image_token_mask[special_image_token_mask == 1] = feature_lens - 1
406
+ new_token_positions = torch.cumsum((special_image_token_mask + 1), -1) - 1
407
+ if left_padding:
408
+ # shift right token positions so that they are ending at the same number
409
+ # the below here was incorrect? new_token_positions += new_token_positions[:, -1].max() - new_token_positions[:, -1:]
410
+ new_token_positions += max_embed_dim - 1 - new_token_positions[:, -1:]
411
+
412
+ text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
413
+
414
+ # 3. Create the full embedding, already padded to the maximum position
415
+ final_embedding = torch.zeros(
416
+ batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
417
+ )
418
+ final_attention_mask = torch.zeros(
419
+ batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
420
+ )
421
+ final_input_ids = torch.full(
422
+ (batch_size, max_embed_dim), self.pad_token_id, dtype=input_ids.dtype, device=inputs_embeds.device
423
+ )
424
+ # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
425
+ # set the corresponding tensors into their correct target device.
426
+ target_device = inputs_embeds.device
427
+ batch_indices, non_image_indices, text_to_overwrite = (
428
+ batch_indices.to(target_device),
429
+ non_image_indices.to(target_device),
430
+ text_to_overwrite.to(target_device),
431
+ )
432
+ attention_mask = attention_mask.to(target_device)
433
+ input_ids = input_ids.to(target_device)
434
+
435
+ # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
436
+ # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
437
+ final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
438
+ final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
439
+ final_input_ids[batch_indices, text_to_overwrite] = input_ids[batch_indices, non_image_indices]
440
+ final_labels = None
441
+ if labels is not None:
442
+ labels = labels.to(target_device)
443
+ final_labels = torch.full_like(final_attention_mask, ignore_index).to(torch.long)
444
+ final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices]
445
+
446
+ # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
447
+ with torch.no_grad():
448
+ image_to_overwrite = torch.full(
449
+ (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
450
+ )
451
+ image_to_overwrite[batch_indices, text_to_overwrite] = False
452
+ embed_indices = torch.arange(max_embed_dim).unsqueeze(0).to(target_device)
453
+ embed_indices = embed_indices.expand(batch_size, max_embed_dim)
454
+ embed_seq_lens = embed_sequence_lengths[:, None].to(target_device)
455
+
456
+ if left_padding:
457
+ # exclude padding on the left
458
+ max_embed_dim = max_embed_dim.to(target_device)
459
+ val = (max_embed_dim - embed_indices) <= embed_seq_lens
460
+ else:
461
+ # exclude padding on the right
462
+ val = embed_indices < embed_seq_lens
463
+ image_to_overwrite &= val
464
+
465
+ if image_to_overwrite.sum() != num_image_features:
466
+ raise ValueError(
467
+ f"{image_to_overwrite.sum()=} != {num_image_features=} The input provided to the model are wrong. "
468
+ f"The number of image tokens is {torch.sum(special_image_token_mask)} while"
469
+ f" the number of image given to the model is {num_images}. "
470
+ f"This prevents correct indexing and breaks batch generation."
471
+ )
472
+ final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
473
+ final_attention_mask |= image_to_overwrite
474
+ position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
475
+
476
+ return final_embedding, final_attention_mask, position_ids, final_labels, final_input_ids
477
+
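A toy illustration of the position bookkeeping in `_merge_input_ids_with_image_features`: one image token (id 32000 here, purely illustrative) that expands into 4 feature slots shifts every later text token by 3 positions.

```python
import torch

input_ids = torch.tensor([[11, 32000, 12, 13]])
special = (input_ids == 32000).long()
special[special == 1] = 4 - 1                       # feature_lens - 1
new_token_positions = torch.cumsum(special + 1, -1) - 1
print(new_token_positions)                          # tensor([[0, 4, 5, 6]])
```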
478
+ def forward(
479
+ self,
480
+ input_ids: torch.LongTensor = None,
481
+ pixel_values: torch.FloatTensor = None,
482
+ attention_mask: Optional[torch.Tensor] = None,
483
+ position_ids: Optional[torch.LongTensor] = None,
484
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
485
+ inputs_embeds: Optional[torch.FloatTensor] = None,
486
+ labels: Optional[torch.LongTensor] = None,
487
+ use_cache: Optional[bool] = None,
488
+ output_attentions: Optional[bool] = None,
489
+ output_hidden_states: Optional[bool] = None,
490
+ return_dict: Optional[bool] = None,
491
+ ) -> Union[Tuple, ILLUMECausalLMOutputWithPast]:
492
+ r"""
493
+ Args:
494
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
495
+ Indices of input sequence tokens in the vocabulary.
496
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
497
+ [`PreTrainedTokenizer.__call__`] for details.
498
+
499
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
500
+ Pixel values of the input image(s) to condition the generation on.
501
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
502
+ Mask to avoid performing attention on padding token indices.
503
+ Mask values selected in `[0, 1]`:
504
+ - 1 for tokens that are not masked,
505
+ - 0 for tokens that are masked.
506
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
507
+ Indices of positions of each input sequence tokens in the position embeddings.
508
+ Selected in the range `[0, config.max_position_embeddings - 1]`.
509
+
510
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
511
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
512
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
513
+ use_cache (`bool`, *optional*):
514
+ If `True`, past_key_values are returned and can be used to speed up decoding.
515
+ output_attentions (`bool`, *optional*):
516
+ Whether to return the attentions tensors of all attention layers.
517
+ output_hidden_states (`bool`, *optional*):
518
+ Whether to return the hidden states of all layers.
519
+ return_dict (`bool`, *optional*):
520
+ Whether to return a dictionary of outputs instead of a plain tuple.
521
+
522
+ Returns:
523
+ An instance of [`ILLUMECausalLMOutputWithPast`] if `return_dict=True`. Otherwise, it returns a tuple of tensors.
524
+
525
+ Example:
526
+
527
+ ```python
528
+ >>> import torch
529
+ >>> from PIL import Image
530
+ >>> import requests
531
+ >>> from transformers import AutoProcessor, ILLUMEForConditionalGeneration
532
+
533
+ >>> # Load model and processor
534
+ >>> # Specify `torch_dtype` for mixed-precision inference, e.g., torch.bfloat16 or torch.float16
535
+ >>> # Specify `attn_implementation="flash_attention_2"` if Flash Attention 2 is installed and supported, or "sdpa" for PyTorch SDPA.
536
+ >>> # `low_cpu_mem_usage=True` can help reduce CPU memory for large models.
537
+ >>> model = ILLUMEForConditionalGeneration.from_pretrained(
538
+ ... "illume-unified-mllm/illume_plus-qwen2_5-3b-hf",
539
+ ... torch_dtype=torch.bfloat16, # Optional: Or torch.float16. Adjust based on your hardware.
540
+ ... low_cpu_mem_usage=True, # Optional: Reduces CPU RAM during model loading.
541
+ ... attn_implementation="sdpa", # Optional: Use "flash_attention_2" if available for better performance.
542
+ ... trust_remote_code=True
543
+ ... ).eval()
544
+ >>> # To use GPU: model = model.to("cuda") # Ensure the model is on the correct device
545
+
546
+ >>> processor = AutoProcessor.from_pretrained(
547
+ ... "illume-unified-mllm/illume_plus-qwen2_5-3b-hf",
548
+ ... trust_remote_code=True
549
+ ... )
550
+
551
+ >>> # Prepare inputs: a text prompt and an image
552
+ >>> # The processor formats the input for the model, including applying the chat template.
553
+ >>> messages = [
554
+ ... {"role": "user", "content": [
555
+ ... {"type": "image"},
556
+ ... {"type": "text", "text": "What is shown in this image?"}
557
+ ... ]}
558
+ ... ]
559
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" # An example image URL
560
+ >>> image = Image.open(requests.get(url, stream=True).raw)
561
+
562
+ >>> inputs = processor(text=messages, images=[image], return_tensors="pt")
563
+ >>> # To use GPU: inputs = {k: v.to("cuda") for k, v in inputs.items()} # Move inputs to the same device as the model
564
+
565
+ >>> # Generate text based on the input
566
+ >>> gen_kwargs = {"max_new_tokens": 100, "do_sample": False} # Generation parameters
567
+ >>> with torch.no_grad(): # Disable gradient calculations for inference
568
+ ... outputs = model.generate(**inputs, **gen_kwargs)
569
+
570
+ >>> # Decode the generated tokens, removing the prompt
571
+ >>> input_token_len = inputs["input_ids"].shape[1]
572
+ >>> generated_ids = outputs[:, input_token_len:]
573
+ >>> response = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
574
+
575
+ >>> print(response)
576
+ ```"""
577
+
578
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
579
+ output_hidden_states = (
580
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
581
+ )
582
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
583
+
584
+ if inputs_embeds is None:
585
+ # 1. Extract the input embeddings
586
+ # In case image_token_index is not in the embeddings (extra token but embedding don't have it)
587
+ for_inputs_embeds_ids = input_ids.clone()
588
+ for_inputs_embeds_ids[(input_ids == self.config.image_token_index)] = 0
589
+ inputs_embeds = self.get_input_embeddings()(for_inputs_embeds_ids)
590
+
591
+ # 2. Merge text and images
592
+ if pixel_values is not None and input_ids.shape[1] != 1 and len(pixel_values) > 0:
593
+ image_features, image_feature_shapes = self.vision_tower(pixel_values)
594
+ image_features = self.mm_projector(image_features)
595
+
596
+ # reformat image sequence
597
+ # special_tokens_ids: <start_of_image>, <end_of_image>, <end_of_line>, <start_of_level0>, <end_of_level0>, <start_of_level1>, <end_of_level1>
598
+ special_tokens_ids = torch.Tensor([self.config.special_tokens_ids[key] for key in
599
+ ["<start_of_image>", "<end_of_image>", "<end_of_line>",
600
+ "<start_of_level0>",
601
+ "<end_of_level0>", "<start_of_level1>",
602
+ "<end_of_level1>"]]).long().to(image_features.device)
603
+
604
+ semantic_sizes = [h * w for (h, w), _ in image_feature_shapes]
605
+ pixel_sizes = [h * w for _, (h, w) in image_feature_shapes]
606
+
607
+ # Split the image features into semantic features and reshape them to (h, w, -1)
608
+ semantic_features = torch.split(image_features[:, :sum(semantic_sizes), :], semantic_sizes, dim=1)
609
+ h_semantics = [feat.view(h, w, -1) for feat, ((h, w), _) in
610
+ zip(semantic_features, image_feature_shapes)]
611
+
612
+ # Split the image features into pixel features and reshape them to (h, w, -1)
613
+ det_features = torch.split(
614
+ image_features[:, sum(semantic_sizes): sum(semantic_sizes) + sum(pixel_sizes), :], pixel_sizes,
615
+ dim=1)
616
+ h_pixels = [feat.view(h, w, -1) for feat, (_, (h, w)) in zip(det_features, image_feature_shapes)]
617
+
618
+ special_tokens_features = self.language_model.model.embed_tokens(special_tokens_ids)
619
+
620
+ image_features = []
621
+ feature_lens = []
622
+ for h_semantic, h_pixel in zip(h_semantics, h_pixels):
623
+ h_semantic = self._reformat_image_sequence(h_semantic, special_tokens_features.clone(), level=0)
624
+ h_pixel = self._reformat_image_sequence(h_pixel, special_tokens_features.clone(), level=1)
625
+
626
+ image_feature = torch.cat([special_tokens_features[0].unsqueeze(0), h_semantic, h_pixel,
627
+ special_tokens_features[1].unsqueeze(0)], dim=0)
628
+ image_features.append(image_feature)
629
+ feature_lens.append(image_feature.shape[0])
630
+
631
+ feature_lens = torch.as_tensor(feature_lens)
632
+ image_features = torch.cat(image_features, dim=0)
633
+
634
+ inputs_embeds = inputs_embeds.to(self.dtype)
635
+ inputs_embeds, attention_mask, position_ids, labels, _ = self._merge_input_ids_with_image_features(
636
+ image_features,
637
+ feature_lens,
638
+ inputs_embeds,
639
+ input_ids,
640
+ attention_mask,
641
+ position_ids,
642
+ labels=labels,
643
+ )
644
+
645
+ # pixel_values is not None but is empty ---> text only cases
646
+ elif pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) == 0:
647
+ # there are no images
648
+ pass
649
+
650
+ # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
651
+ # generation with cache
652
+ elif past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
653
+ # Retrieve the first layer to inspect the logits and mask out the hidden states
654
+ # that are set to 0
655
+ first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
656
+
657
+ # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
658
+ batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
659
+
660
+ # Get the target length
661
+ target_length = input_ids.shape[1]
662
+ past_length = first_layer_past_key_value.shape[-1]
663
+
664
+ extended_attention_mask = torch.ones(
665
+ (attention_mask.shape[0], past_length),
666
+ dtype=attention_mask.dtype,
667
+ device=attention_mask.device,
668
+ )
669
+
670
+ # Filter out only the tokens that can be un-attended, this can happen
671
+ # if one uses ILLUME + Fused modules where the cache on the
672
+ # first iteration is already big enough, or if one passes custom cache
673
+ valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
674
+ new_batch_index = batch_index[valid_indices]
675
+ new_non_attended_tokens = non_attended_tokens[valid_indices]
676
+
677
+ # Zero-out the places where we don't need to attend
678
+ extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
679
+
680
+ attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
681
+
682
+ position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
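+ # position of the single new token = number of attended tokens - 1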
683
+
684
+ outputs = self.language_model(
685
+ attention_mask=attention_mask.to(inputs_embeds.device) if attention_mask is not None else attention_mask,
686
+ position_ids=position_ids,
687
+ past_key_values=past_key_values,
688
+ inputs_embeds=inputs_embeds,
689
+ use_cache=use_cache,
690
+ output_attentions=output_attentions,
691
+ output_hidden_states=output_hidden_states,
692
+ return_dict=return_dict,
693
+ )
694
+
695
+ logits = outputs[0]
696
+
697
+ loss = None
698
+ if labels is not None:
699
+ # Shift so that tokens < n predict n
700
+ if attention_mask is not None:
701
+ shift_attention_mask = attention_mask[..., 1:]
702
+ shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
703
+ shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
704
+ else:
705
+ shift_logits = logits[..., :-1, :].contiguous()
706
+ shift_labels = labels[..., 1:].contiguous()
707
+ # Flatten the tokens
708
+ loss_fct = nn.CrossEntropyLoss()
709
+ loss = loss_fct(
710
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
711
+ )
712
+
713
+ if not return_dict:
714
+ output = (logits,) + outputs[1:]
715
+ return (loss,) + output if loss is not None else output
716
+
717
+ return ILLUMECausalLMOutputWithPast(
718
+ loss=loss,
719
+ logits=logits,
720
+ past_key_values=outputs.past_key_values,
721
+ hidden_states=outputs.hidden_states,
722
+ attentions=outputs.attentions,
723
+ )
724
+
725
+ def prepare_inputs_for_generation(
726
+ self,
727
+ input_ids,
728
+ past_key_values=None,
729
+ inputs_embeds=None,
730
+ pixel_values=None,
731
+ attention_mask=None,
732
+ **kwargs,
733
+ ):
734
+ if past_key_values is not None:
735
+ if isinstance(past_key_values, Cache):
736
+ cache_length = past_key_values.get_seq_length()
737
+ past_length = past_key_values.seen_tokens
738
+ else:
739
+ cache_length = past_length = past_key_values[0][0].shape[2]
740
+
741
+ # Keep only the unprocessed tokens:
742
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
743
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
744
+ # input)
745
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
746
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
747
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
748
+ # input_ids based on the past_length.
749
+ elif past_length < input_ids.shape[1]:
750
+ input_ids = input_ids[:, past_length:]
751
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
752
+ elif self.config.image_token_index in input_ids:
753
+ input_ids = input_ids[:, input_ids.shape[1] - 1:]
754
+ # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
755
+ # older attention values, as their corresponding values are not part of the input.
756
+ if cache_length < past_length and attention_mask is not None:
757
+ attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]):]
758
+
759
+ position_ids = kwargs.get("position_ids", None)
760
+ if attention_mask is not None and position_ids is None:
761
+ # create position_ids on the fly for batch generation
762
+ position_ids = attention_mask.long().cumsum(-1) - 1
763
+ position_ids.masked_fill_(attention_mask == 0, 1)
764
+ if past_key_values:
765
+ position_ids = position_ids[:, -input_ids.shape[1]:]
766
+
767
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
768
+ if inputs_embeds is not None and past_key_values is None:
769
+ model_inputs = {"inputs_embeds": inputs_embeds}
770
+ else:
771
+ model_inputs = {"input_ids": input_ids}
772
+
773
+ model_inputs.update(
774
+ {
775
+ "position_ids": position_ids,
776
+ "past_key_values": past_key_values,
777
+ "use_cache": kwargs.get("use_cache"),
778
+ "attention_mask": attention_mask,
779
+ "pixel_values": pixel_values,
780
+ }
781
+ )
782
+ return model_inputs
783
+
784
+ def _reorder_cache(self, *args, **kwargs):
785
+ return self.language_model._reorder_cache(*args, **kwargs)
786
+
787
+ def prepare_logit_processor(self,
788
+ guidance_scale=2.0,
789
+ negative_prompt_ids=None,
790
+ negative_prompt_attention_mask=None,
791
+ resolution=None,
792
+ temperature=1.0, image_semantic_temperature=1.0, image_pixel_temperature=None,
793
+ top_k=128, image_semantic_top_k=2048, image_pixel_top_k=None,
794
+ top_p=1.0, image_semantic_top_p=1.0, image_pixel_top_p=None,
795
+ images=None,
796
+ ):
797
+ if resolution is not None:
798
+ token_nums, _, h1, w1, h2, w2 = calculate_image_token_num(*resolution)
799
+ else:
800
+ h1, w1, h2, w2 = 0, 0, 0, 0
801
+
802
+ if image_pixel_temperature is None:
803
+ image_pixel_temperature = image_semantic_temperature
804
+ if image_pixel_top_k is None:
805
+ image_pixel_top_k = image_semantic_top_k * 3
806
+ if image_pixel_top_p is None:
807
+ image_pixel_top_p = image_semantic_top_p
808
+
809
+ return InterleavedLogitsProcessor(
810
+ guidance_scale=guidance_scale,
811
+ uncond=negative_prompt_ids,
812
+ attention_mask=negative_prompt_attention_mask,
813
+ model=self,
814
+ # DualVQ parameters
815
+ level0_range=level0_range,
816
+ level1_range=level1_range,
817
+ num_level0_rows=h1, num_level0_tokens=w1,
818
+ num_level1_rows=h2, num_level1_tokens=w2,
819
+ special_tokens=special_tokens_dict,
820
+ # Dynamic Sampling parameters
821
+ default_temp=temperature, level0_temp=image_semantic_temperature, level1_temp=image_pixel_temperature,
822
+ default_top_k=top_k, level0_top_k=image_semantic_top_k, level1_top_k=image_pixel_top_k,
823
+ default_top_p=top_p, level0_top_p=image_semantic_top_p, level1_top_p=image_pixel_top_p,
824
+ images=images
825
+ )
826
+
827
+ def generate(
828
+ self,
829
+ *args,
830
+ temperature: float = 1.0, top_k: int = 128, top_p: float = 1.0,
831
+ pixel_values: Optional[torch.Tensor] = None,
832
+ # image generation or image editing hyperparameters.
833
+ guidance_scale=1.0, target_image_resolution=None,
834
+ image_semantic_temperature: float = 1.0,
835
+ image_semantic_top_k: int = 2048,
836
+ image_semantic_top_p: float = 1.0,
837
+ image_pixel_temperature: float = 1.0,
838
+ image_pixel_top_k: int = 2048 * 3,
839
+ image_pixel_top_p: float = 1.0,
840
+ negative_image_prompt_ids: Optional[torch.Tensor] = None,
841
+ negative_image_prompt_attention_mask: Optional[torch.Tensor] = None,
842
+ disable_logit_processor=False,
843
+ logits_processor=None,
844
+ **kwargs,
845
+ ):
846
+ if target_image_resolution is not None:
847
+ # check if target_image_resolution is valid.
848
+ if not isinstance(target_image_resolution, tuple) or len(target_image_resolution) != 2:
849
+ raise ValueError("target_image_resolution must be a tuple of two integers.")
850
+ if not all(isinstance(dim, int) and dim > 0 for dim in target_image_resolution):
851
+ raise ValueError("target_image_resolution must contain positive integers.")
852
+
853
+ if target_image_resolution not in DEFAULT_RESOLUTIONS:
854
+ raise ValueError(
855
+ "target_image_resolution must be in one of the following ratios: " + str(DEFAULT_RESOLUTIONS))
856
+
857
+ if logits_processor is None:
858
+ logits_processor = LogitsProcessorList([])
859
+
860
+ if not disable_logit_processor:
861
+ illume_logit_processor = self.prepare_logit_processor(
862
+ negative_prompt_ids=negative_image_prompt_ids,
863
+ negative_prompt_attention_mask=negative_image_prompt_attention_mask,
864
+ temperature=temperature,
865
+ top_k=top_k,
866
+ top_p=top_p,
867
+ guidance_scale=guidance_scale,
868
+ resolution=target_image_resolution,
869
+ image_semantic_temperature=image_semantic_temperature,
870
+ image_pixel_temperature=image_pixel_temperature,
871
+ image_semantic_top_k=image_semantic_top_k,
872
+ image_pixel_top_k=image_pixel_top_k,
873
+ image_semantic_top_p=image_semantic_top_p,
874
+ image_pixel_top_p=image_pixel_top_p,
875
+ images=pixel_values,
876
+ )
877
+ logits_processor.append(illume_logit_processor)
878
+
879
+ return super(ILLUMEForConditionalGeneration, self).generate(
880
+ *args,
881
+ pixel_values=pixel_values,
882
+ logits_processor=logits_processor,
883
+ **kwargs)
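
For orientation, a minimal sketch of how this `generate` entry point could be called for text-to-image decoding. The checkpoint path, the AutoModel/AutoTokenizer setup and the `max_new_tokens` value are illustrative assumptions and not part of this diff; only the image-generation keyword arguments mirror the signature above.

    # hypothetical usage sketch -- model id and tokenizer setup are placeholders
    import torch
    from transformers import AutoModel, AutoTokenizer

    model = AutoModel.from_pretrained("path/to/illume-checkpoint", trust_remote_code=True,
                                      torch_dtype=torch.bfloat16).eval()
    tokenizer = AutoTokenizer.from_pretrained("path/to/illume-checkpoint", trust_remote_code=True)

    inputs = tokenizer("Generate an image of a red bicycle.", return_tensors="pt")
    output_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        guidance_scale=2.0,                      # classifier-free guidance; a negative prompt can go via negative_image_prompt_ids
        target_image_resolution=(512, 512),      # assumed to be one of DEFAULT_RESOLUTIONS
        image_semantic_temperature=1.0, image_semantic_top_k=2048, image_semantic_top_p=1.0,
        image_pixel_temperature=1.0, image_pixel_top_k=2048 * 3, image_pixel_top_p=1.0,
    )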
modeling_movqgan.py ADDED
@@ -0,0 +1,828 @@
1
+ """ MoVQ model """
2
+
3
+ import math
4
+ from typing import Optional, Tuple, Union
5
+
6
+ import torch
7
+ from einops import rearrange, repeat
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ from torch.utils.checkpoint import checkpoint
11
+ from transformers.modeling_utils import PreTrainedModel
12
+
13
+ from .configuration_movqgan import MoVQConfig
14
+
15
+ try:
16
+ import xformers.ops as xops
17
+
18
+ is_xformers_available = True
19
+ except Exception as e:
20
+ is_xformers_available = False
21
+
22
+ if torch.__version__ > "2.1.2":
23
+ IS_SDPA_AVAILABLE = True
24
+ else:
25
+ IS_SDPA_AVAILABLE = False
26
+
27
+
28
+ class MoVQActivation(nn.Module):
29
+
30
+ def __init__(self):
31
+ super().__init__()
32
+
33
+ def __call__(self, x: torch.Tensor):
34
+ return x * torch.sigmoid(x)
35
+
36
+
37
+ class MoVQUpsample(nn.Module):
38
+
39
+ def __init__(self, in_channels: int):
40
+ super().__init__()
41
+ self.conv = nn.Conv2d(
42
+ in_channels,
43
+ in_channels,
44
+ kernel_size=3,
45
+ stride=1,
46
+ padding=1,
47
+ )
48
+
49
+ def forward(self, x: torch.Tensor):
50
+ x = F.interpolate(x.float(), scale_factor=2.0, mode="nearest").to(x.dtype)
51
+ x = self.conv(x)
52
+ return x
53
+
54
+
55
+ class DCDownBlock2d(nn.Module):
56
+ def __init__(self, in_channels: int, out_channels: int = None, downsample: bool = True,
57
+ shortcut: bool = True) -> None:
58
+ super().__init__()
59
+ out_channels = out_channels if out_channels else in_channels
60
+
61
+ self.downsample = downsample
62
+ self.factor = 2
63
+ self.stride = 1 if downsample else 2
64
+ self.group_size = in_channels * self.factor ** 2 // out_channels
65
+ self.shortcut = shortcut
66
+
67
+ out_ratio = self.factor ** 2
68
+ if downsample:
69
+ assert out_channels % out_ratio == 0
70
+ out_channels = out_channels // out_ratio
71
+
72
+ self.conv = nn.Conv2d(
73
+ in_channels,
74
+ out_channels,
75
+ kernel_size=3,
76
+ stride=self.stride,
77
+ padding=1,
78
+ )
79
+
80
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
81
+ x = self.conv(hidden_states)
82
+ if self.downsample:
83
+ x = F.pixel_unshuffle(x, self.factor)
84
+
85
+ if self.shortcut:
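+ # shortcut path: pixel-unshuffle the input and average channel groups so its shape matches the conv branch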
86
+ y = F.pixel_unshuffle(hidden_states, self.factor)
87
+ y = y.unflatten(1, (-1, self.group_size))
88
+ y = y.mean(dim=2)
89
+ hidden_states = x + y
90
+ else:
91
+ hidden_states = x
92
+
93
+ return hidden_states # x + y
94
+
95
+
96
+ class DCUpBlock2d(nn.Module):
97
+ def __init__(
98
+ self,
99
+ in_channels: int,
100
+ out_channels: int = None,
101
+ interpolate: bool = False,
102
+ shortcut: bool = True,
103
+ interpolation_mode: str = "nearest",
104
+ ) -> None:
105
+ super().__init__()
106
+ out_channels = out_channels if out_channels else in_channels
107
+
108
+ self.interpolate = interpolate
109
+ self.interpolation_mode = interpolation_mode
110
+ self.shortcut = shortcut
111
+ self.factor = 2
112
+ self.repeats = out_channels * self.factor ** 2 // in_channels
113
+
114
+ out_ratio = self.factor ** 2
115
+
116
+ if not interpolate:
117
+ out_channels = out_channels * out_ratio
118
+
119
+ self.conv = nn.Conv2d(in_channels, out_channels, 3, 1, 1)
120
+
121
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
122
+ if self.interpolate:
123
+ x = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode)
124
+ x = self.conv(x)
125
+ else:
126
+ x = self.conv(hidden_states)
127
+ x = F.pixel_shuffle(x, self.factor)
128
+
129
+ if self.shortcut:
130
+ y = hidden_states.repeat_interleave(self.repeats, dim=1)
131
+ y = F.pixel_shuffle(y, self.factor)
132
+ hidden_states = x + y
133
+ else:
134
+ hidden_states = x
135
+
136
+ return hidden_states
137
+
138
+
139
+ class MoVQDownsample(nn.Module):
140
+
141
+ def __init__(self, in_channels: int):
142
+ super().__init__()
143
+ self.conv = nn.Conv2d(
144
+ in_channels,
145
+ in_channels,
146
+ kernel_size=3,
147
+ stride=2,
148
+ padding=0,
149
+ )
150
+
151
+ def forward(self, x: torch.Tensor):
152
+ pad = (0, 1, 0, 1)
153
+ x = F.pad(x, pad, mode="constant", value=0)
154
+ x = self.conv(x)
155
+ return x
156
+
157
+
158
+ class MoVQSpatialNorm(nn.Module):
159
+
160
+ def __init__(
161
+ self,
162
+ f_channels: int,
163
+ zq_channels: int,
164
+ norm_layer: nn.Module = nn.GroupNorm,
165
+ add_conv: bool = False,
166
+ num_groups: int = 32,
167
+ eps: float = 1e-6,
168
+ affine: bool = True,
169
+ ):
170
+ super().__init__()
171
+ self.norm_layer = norm_layer(
172
+ num_channels=f_channels,
173
+ num_groups=num_groups,
174
+ eps=eps,
175
+ affine=affine,
176
+ )
177
+
178
+ self.add_conv = add_conv
179
+ if self.add_conv:
180
+ self.conv = nn.Conv2d(
181
+ zq_channels,
182
+ zq_channels,
183
+ kernel_size=3,
184
+ stride=1,
185
+ padding=1,
186
+ )
187
+
188
+ self.conv_y = nn.Conv2d(
189
+ zq_channels,
190
+ f_channels,
191
+ kernel_size=1,
192
+ stride=1,
193
+ padding=0,
194
+ )
195
+ self.conv_b = nn.Conv2d(
196
+ zq_channels,
197
+ f_channels,
198
+ kernel_size=1,
199
+ stride=1,
200
+ padding=0,
201
+ )
202
+
203
+ def forward(self, x: torch.Tensor, zq: torch.Tensor):
204
+ zq = F.interpolate(zq.float(), size=x.shape[-2:], mode="nearest").to(zq.dtype)
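+ # upsample zq to x's spatial size, then use it to scale and shift the group-normalized features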
205
+
206
+ if self.add_conv:
207
+ zq = self.conv(zq)
208
+
209
+ x = self.norm_layer(x)
210
+ x = x * self.conv_y(zq) + self.conv_b(zq)
211
+ return x
212
+
213
+
214
+ class MoVQResnetBlock(nn.Module):
215
+
216
+ def __init__(
217
+ self,
218
+ in_channels: int,
219
+ out_channels: Optional[int] = None,
220
+ conv_shortcut: bool = False,
221
+ dropout: float = 0.0,
222
+ zq_ch: Optional[int] = None,
223
+ add_conv: bool = False,
224
+ ):
225
+ super().__init__()
226
+ self.in_channels = in_channels
227
+ out_channels = in_channels if out_channels is None else out_channels
228
+ self.out_channels = out_channels
229
+ self.use_conv_shortcut = conv_shortcut
230
+ self.zq_ch = zq_ch
231
+
232
+ if zq_ch is None:
233
+ norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
234
+ self.norm1 = nn.GroupNorm(num_channels=in_channels, **norm_kwargs)
235
+ self.norm2 = nn.GroupNorm(num_channels=out_channels, **norm_kwargs)
236
+ else:
237
+ self.norm1 = MoVQSpatialNorm(in_channels, zq_ch, add_conv=add_conv)
238
+ self.norm2 = MoVQSpatialNorm(out_channels, zq_ch, add_conv=add_conv)
239
+
240
+ self.conv1 = nn.Conv2d(
241
+ in_channels,
242
+ out_channels,
243
+ kernel_size=3,
244
+ stride=1,
245
+ padding=1,
246
+ )
247
+
248
+ self.dropout = nn.Dropout(dropout)
249
+ self.conv2 = nn.Conv2d(
250
+ out_channels,
251
+ out_channels,
252
+ kernel_size=3,
253
+ stride=1,
254
+ padding=1,
255
+ )
256
+
257
+ self.act = MoVQActivation()
258
+
259
+ if self.in_channels != self.out_channels:
260
+ if self.use_conv_shortcut:
261
+ self.conv_shortcut = nn.Conv2d(
262
+ in_channels,
263
+ out_channels,
264
+ kernel_size=3,
265
+ stride=1,
266
+ padding=1,
267
+ )
268
+ else:
269
+ self.nin_shortcut = nn.Conv2d(
270
+ in_channels,
271
+ out_channels,
272
+ kernel_size=1,
273
+ stride=1,
274
+ padding=0,
275
+ )
276
+
277
+ def forward(self, x: torch.Tensor, zq: Optional[torch.Tensor] = None):
278
+ norm_args = tuple() if self.zq_ch is None else (zq,)
279
+
280
+ h = self.norm1(x, *norm_args)
281
+ h = self.act(h)
282
+ h = self.conv1(h)
283
+
284
+ h = self.norm2(h, *norm_args)
285
+ h = self.act(h)
286
+ h = self.dropout(h)
287
+ h = self.conv2(h)
288
+
289
+ if self.in_channels != self.out_channels:
290
+ if self.use_conv_shortcut:
291
+ x = self.conv_shortcut(x)
292
+ else:
293
+ x = self.nin_shortcut(x)
294
+
295
+ return x + h
296
+
297
+
298
+ class MoVQAttnBlock(nn.Module):
299
+
300
+ def __init__(
301
+ self,
302
+ in_channels: int,
303
+ zq_ch: Optional[int] = None,
304
+ add_conv: bool = False,
305
+ num_heads=1,
306
+ ):
307
+ super().__init__()
308
+ self.in_channels = in_channels
309
+ self.zq_ch = zq_ch
310
+ self.num_heads = num_heads
311
+
312
+ if zq_ch is None:
313
+ norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
314
+ self.norm = nn.GroupNorm(num_channels=in_channels, **norm_kwargs)
315
+ else:
316
+ self.norm = MoVQSpatialNorm(in_channels, zq_ch, add_conv=add_conv)
317
+
318
+ self.q = nn.Conv2d(
319
+ in_channels,
320
+ in_channels,
321
+ kernel_size=1,
322
+ stride=1,
323
+ padding=0,
324
+ )
325
+ self.k = nn.Conv2d(
326
+ in_channels,
327
+ in_channels,
328
+ kernel_size=1,
329
+ stride=1,
330
+ padding=0,
331
+ )
332
+ self.v = nn.Conv2d(
333
+ in_channels,
334
+ in_channels,
335
+ kernel_size=1,
336
+ stride=1,
337
+ padding=0,
338
+ )
339
+ self.proj_out = nn.Conv2d(
340
+ in_channels,
341
+ in_channels,
342
+ kernel_size=1,
343
+ stride=1,
344
+ padding=0,
345
+ )
346
+
347
+ def forward(self, x: torch.Tensor, zq: Optional[torch.Tensor] = None):
348
+ # x: [b, c1, h1, w1]
349
+ # zq: [b, c2, h2, w2]
350
+ # attention_mask: [b, 1, h3, w3]
351
+ norm_args = tuple() if self.zq_ch is None else (zq,)
352
+
353
+ # if context is not None:
354
+ # context = F.interpolate(context.float(), size=x.shape[-2:], mode="nearest").to(context.dtype)
355
+ # x = x + self.conv_context(context)
356
+
357
+ nx = self.norm(x, *norm_args)
358
+ q = self.q(nx)
359
+ k = self.k(nx)
360
+ v = self.v(nx)
361
+
362
+ b, c, h, w = q.shape
363
+ if is_xformers_available:
364
+ # If xformers is available, create attn_bias for xops.memory_efficient_attention.
365
+ attn_bias = None
366
+
367
+ v = xops.memory_efficient_attention(
368
+ rearrange(q, 'b (n c) h w -> b (h w) n c', n=self.num_heads).contiguous(),
369
+ rearrange(k, 'b (n c) h w -> b (h w) n c', n=self.num_heads).contiguous(),
370
+ rearrange(v, 'b (n c) h w -> b (h w) n c', n=self.num_heads).contiguous(),
371
+ scale=1.0 / math.sqrt(c // self.num_heads),
372
+ attn_bias=attn_bias,
373
+ )
374
+ v = rearrange(v, 'b (h w) n c -> b (n c) h w', h=h, w=w).contiguous()
375
+ elif IS_SDPA_AVAILABLE:
376
+ # compute attention
377
+ q = rearrange(q, 'b (n c) h w -> b n (h w) c', n=self.num_heads).contiguous()
378
+ k = rearrange(k, 'b (n c) h w -> b n (h w) c', n=self.num_heads).contiguous()
379
+ v = rearrange(v, 'b (n c) h w -> b n (h w) c', n=self.num_heads).contiguous()
380
+
381
+ attn_bias = None
382
+
383
+ v = F.scaled_dot_product_attention(q, k, v, attn_bias, dropout_p=0.0)
384
+ v = v.transpose(1, 2)
385
+ v = rearrange(v, 'b (h w) n c -> b (n c) h w', h=h, w=w)
386
+ else:
387
+ # compute attention
388
+ q = rearrange(q, 'b (n c) h w -> b n c (h w)', n=self.num_heads).contiguous()
389
+ k = rearrange(k, 'b (n c) h w -> b n c (h w)', n=self.num_heads).contiguous()
390
+ v = rearrange(v, 'b (n c) h w -> b n c (h w)', n=self.num_heads).contiguous()
391
+
392
+ # score = torch.bmm(q.permute(0, 2, 1), k)
393
+ score = torch.einsum('b n c k, b n c l -> b n k l', q, k)
394
+ score = score / math.sqrt(c // self.num_heads)
395
+
396
+ score = F.softmax(score, dim=-1)  # normalize over key positions (last axis of the einsum output)
397
+
398
+ # attend to values
399
+ # v = v.reshape(b, c, h * w)
400
+ # v = torch.bmm(v, score.permute(0, 2, 1))
401
+ v = torch.einsum('b n c l, b n k l -> b n c k', v, score)
402
+ v = v.reshape(b, c, h, w)
403
+
404
+ v = self.proj_out(v)
405
+
406
+ return x + v
407
+
408
+
409
+ class MoVQVectorQuantizer(nn.Module):
410
+
411
+ def __init__(self, config: MoVQConfig):
412
+ super().__init__()
413
+ self.embedding = nn.Embedding(config.codebook_size, config.embed_dim)
414
+ self.embedding.weight.data.uniform_(-1.0 / config.codebook_size, 1.0 / config.codebook_size)
415
+
416
+ def forward(self, x: torch.Tensor):
417
+ # b t c h w -> b t h w c
418
+ b, t, c, h, w = x.shape
419
+ x = x.permute(0, 1, 3, 4, 2).contiguous()
420
+ x_flattened = x.view(-1, c)
421
+
422
+ codebook = self.embedding.weight
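+ # squared L2 distance from every feature vector to every codebook entry (expansion of ||x - e||^2)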
423
+
424
+ d = torch.sum(x_flattened ** 2, dim=1, keepdim=True) + \
425
+ torch.sum(codebook ** 2, dim=1) - 2 * \
426
+ torch.einsum('bd,dn->bn', x_flattened, codebook.permute(1, 0))
427
+
428
+ indices = torch.argmin(d, dim=1)
429
+ indices = indices.view(b, t, h, w)
430
+ return indices
431
+
432
+
433
+ class MoVQPretrainedModel(PreTrainedModel):
434
+ """
435
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
436
+ models.
437
+ """
438
+
439
+ config_class = MoVQConfig
440
+ base_model_prefix = "movq"
441
+ main_input_name = "pixel_values"
442
+ _no_split_modules = ["MoVQResnetBlock", "MoVQAttnBlock"]
443
+
444
+ def _init_weights(self, module):
445
+ if isinstance(module, (nn.Conv2d, nn.Conv3d)):
446
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
447
+ # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
448
+ elif isinstance(module, nn.Linear):
449
+ nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
450
+ if module.bias is not None:
451
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
452
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
453
+ nn.init.uniform_(module.bias, -bound, bound)
454
+ elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm)):
455
+ nn.init.constant_(module.weight, 1)
456
+ nn.init.constant_(module.bias, 0)
457
+
458
+
459
+ class MoVQEncoder(nn.Module):
460
+ def __init__(self, config: MoVQConfig):
461
+ super().__init__()
462
+ self.config = config
463
+ self.ch = config.ch
464
+ self.num_resolutions = len(config.ch_mult)
465
+ self.num_res_blocks = config.num_res_blocks
466
+ self.in_channels = config.in_channels
467
+
468
+ # downsampling
469
+ self.conv_in = nn.Conv2d(
470
+ self.in_channels,
471
+ self.ch,
472
+ kernel_size=3,
473
+ stride=1,
474
+ padding=1
475
+ )
476
+
477
+ in_ch_mult = (1,) + tuple(config.ch_mult)
478
+ self.down = nn.ModuleList()
479
+ for i_level in range(self.num_resolutions):
480
+ block = nn.ModuleList()
481
+ attn = nn.ModuleList()
482
+ block_in = config.ch * in_ch_mult[i_level]
483
+ block_out = config.ch * config.ch_mult[i_level]
484
+ for i_block in range(self.num_res_blocks):
485
+ block.append(
486
+ MoVQResnetBlock(
487
+ in_channels=block_in,
488
+ out_channels=block_out,
489
+ dropout=config.dropout,
490
+ )
491
+ )
492
+ block_in = block_out
493
+ if i_level in config.attn_resolutions:
494
+ attn.append(MoVQAttnBlock(block_in))
495
+
496
+ down = nn.Module()
497
+ down.block = block
498
+ down.attn = attn
499
+ if i_level != self.num_resolutions - 1:
500
+ if config.use_dc_up_down_blocks:
501
+ down.downsample = DCDownBlock2d(block_in)
502
+ else:
503
+ down.downsample = MoVQDownsample(block_in)
504
+
505
+ self.down.append(down)
506
+
507
+ # middle
508
+ self.mid = nn.Module()
509
+ self.mid.block_1 = MoVQResnetBlock(
510
+ in_channels=block_in,
511
+ out_channels=block_in,
512
+ dropout=config.dropout,
513
+ )
514
+ self.mid.attn_1 = MoVQAttnBlock(block_in)
515
+ self.mid.block_2 = MoVQResnetBlock(
516
+ in_channels=block_in,
517
+ out_channels=block_in,
518
+ dropout=config.dropout,
519
+ )
520
+
521
+ # end
522
+
523
+ self.norm_out = nn.GroupNorm(num_channels=block_in, num_groups=32, eps=1e-6, affine=True)
524
+
525
+ self.act = MoVQActivation()
526
+
527
+ out_z_channels = 2 * config.z_channels if config.double_z else config.z_channels
528
+ self.conv_out = nn.Conv2d(
529
+ block_in,
530
+ out_z_channels,
531
+ kernel_size=3,
532
+ stride=1,
533
+ padding=1,
534
+ )
535
+
536
+ self.out_shortcut_average_group_size = block_in // out_z_channels
537
+
538
+ def forward(self, x: torch.Tensor):
539
+
540
+ # downsampling
541
+ h = self.conv_in(x)
542
+ for i_level in range(self.num_resolutions):
543
+ for i_block in range(self.num_res_blocks):
544
+ h = self.down[i_level].block[i_block](h)
545
+ if len(self.down[i_level].attn) > 0:
546
+ h = self.down[i_level].attn[i_block](h)
547
+
548
+ if i_level != self.num_resolutions - 1:
549
+ h = self.down[i_level].downsample(h)
550
+
551
+ h = self.mid.block_1(h)
552
+ h = self.mid.attn_1(h)
553
+ h = self.mid.block_2(h)
554
+
555
+ # end
556
+ h = self.norm_out(h)
557
+ h = self.act(h)
558
+
559
+ if self.config.use_dc_up_down_blocks:
560
+ x = h.unflatten(1, (-1, self.out_shortcut_average_group_size))
561
+ x = x.mean(dim=2)
562
+ h = self.conv_out(h) + x
563
+ else:
564
+ h = self.conv_out(h)
565
+ return h
566
+
567
+
568
+ class MoVQDecoder(nn.Module):
569
+ def __init__(self, config: MoVQConfig):
570
+ super().__init__()
571
+ self.config = config
572
+ self.ch = config.ch
573
+ self.num_resolutions = len(config.ch_mult)
574
+ self.num_res_blocks = config.num_res_blocks
575
+
576
+ in_ch_mult = (1,) + tuple(config.ch_mult)
577
+ zq_ch = config.embed_dim
578
+
579
+ block_in = config.ch * config.ch_mult[-1]
580
+
581
+ self.in_shortcut_repeats = block_in // config.embed_dim
582
+
583
+ self.conv_in = nn.Conv2d(
584
+ config.z_channels,
585
+ block_in,
586
+ kernel_size=3,
587
+ stride=1,
588
+ padding=1,
589
+ )
590
+
591
+ # middle
592
+ self.mid = nn.Module()
593
+ self.mid.block_1 = MoVQResnetBlock(
594
+ in_channels=block_in,
595
+ out_channels=block_in,
596
+ dropout=config.dropout,
597
+ zq_ch=zq_ch,
598
+ )
599
+ self.mid.attn_1 = MoVQAttnBlock(block_in, zq_ch)
600
+ self.mid.block_2 = MoVQResnetBlock(
601
+ in_channels=block_in,
602
+ out_channels=block_in,
603
+ dropout=config.dropout,
604
+ zq_ch=zq_ch,
605
+ )
606
+
607
+ # upsampling
608
+ self.up = nn.ModuleList()
609
+ for i_level in reversed(range(self.num_resolutions)):
610
+ block = nn.ModuleList()
611
+ attn = nn.ModuleList()
612
+ block_out = config.ch * config.ch_mult[i_level]
613
+ for i_block in range(self.num_res_blocks + 1):
614
+ block.append(
615
+ MoVQResnetBlock(
616
+ in_channels=block_in,
617
+ out_channels=block_out,
618
+ dropout=config.dropout,
619
+ zq_ch=zq_ch,
620
+ )
621
+ )
622
+ block_in = block_out
623
+ if i_level in config.attn_resolutions:
624
+ attn.append(MoVQAttnBlock(block_in, zq_ch))
625
+
626
+ up = nn.Module()
627
+ up.block = block
628
+ up.attn = attn
629
+ if i_level != 0:
630
+ if config.use_dc_up_down_blocks:
631
+ up.upsample = DCUpBlock2d(block_in)
632
+ else:
633
+ up.upsample = MoVQUpsample(block_in)
634
+
635
+ self.up.insert(0, up)
636
+
637
+ self.act = MoVQActivation()
638
+
639
+ self.norm_out = MoVQSpatialNorm(block_in, zq_ch)
640
+ self.conv_out = nn.Conv2d(
641
+ block_in,
642
+ config.out_channels,
643
+ kernel_size=3,
644
+ stride=1,
645
+ padding=1,
646
+ )
647
+
648
+ @property
649
+ def last_layer(self):
650
+ return self.conv_out.weight
651
+
652
+ def forward(self, z: torch.Tensor, zq: torch.Tensor):
653
+ h = z
654
+
655
+ if self.config.use_dc_up_down_blocks:
656
+ h = h.repeat_interleave(self.in_shortcut_repeats, dim=1)
657
+ h = self.conv_in(z) + h
658
+ else:
659
+ h = self.conv_in(h)
660
+
661
+ # middle
662
+ h = self.mid.block_1(h, zq)
663
+ h = self.mid.attn_1(h, zq)
664
+ h = self.mid.block_2(h, zq)
665
+
666
+ # upsampling
667
+ for i_level in reversed(range(self.num_resolutions)):
668
+ for i_block in range(self.num_res_blocks + 1):
669
+ h = self.up[i_level].block[i_block](h, zq)
670
+ if len(self.up[i_level].attn) > 0:
671
+ h = self.up[i_level].attn[i_block](h, zq)
672
+
673
+ if i_level != 0:
674
+ h = self.up[i_level].upsample(h)
675
+
676
+ h = self.norm_out(h, zq)
677
+ h = self.act(h)
678
+ h = self.conv_out(h)
679
+
680
+ return h
681
+
682
+
683
+ class Decoder(nn.Module):
684
+ def __init__(self, config: MoVQConfig):
685
+ super().__init__()
686
+ self.config = config
687
+ self.ch = config.ch
688
+ self.num_resolutions = len(config.ch_mult)
689
+ self.num_res_blocks = config.num_res_blocks
690
+
691
+ in_ch_mult = (1,) + tuple(config.ch_mult)
692
+
693
+ block_in = config.ch * config.ch_mult[-1]
694
+
695
+ self.conv_in = nn.Conv2d(
696
+ config.z_channels,
697
+ block_in,
698
+ kernel_size=3,
699
+ stride=1,
700
+ padding=1,
701
+ )
702
+
703
+ # middle
704
+ self.mid = nn.Module()
705
+ self.mid.block_1 = MoVQResnetBlock(
706
+ in_channels=block_in,
707
+ out_channels=block_in,
708
+ dropout=config.dropout,
709
+ )
710
+ self.mid.attn_1 = MoVQAttnBlock(block_in)
711
+ self.mid.block_2 = MoVQResnetBlock(
712
+ in_channels=block_in,
713
+ out_channels=block_in,
714
+ dropout=config.dropout,
715
+ )
716
+
717
+ # upsampling
718
+ self.up = nn.ModuleList()
719
+ for i_level in reversed(range(self.num_resolutions)):
720
+ block = nn.ModuleList()
721
+ attn = nn.ModuleList()
722
+ block_out = config.ch * config.ch_mult[i_level]
723
+ for i_block in range(self.num_res_blocks + 1):
724
+ block.append(
725
+ MoVQResnetBlock(
726
+ in_channels=block_in,
727
+ out_channels=block_out,
728
+ dropout=config.dropout,
729
+ )
730
+ )
731
+ block_in = block_out
732
+ if i_level in config.attn_resolutions:
733
+ attn.append(MoVQAttnBlock(block_in))
734
+
735
+ up = nn.Module()
736
+ up.block = block
737
+ up.attn = attn
738
+ if i_level != 0:
739
+ up.upsample = MoVQUpsample(block_in)
740
+
741
+ self.up.insert(0, up)
742
+
743
+ self.act = MoVQActivation()
744
+
745
+ norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
746
+ self.norm_out = nn.GroupNorm(num_channels=block_in, **norm_kwargs)
747
+ self.conv_out = nn.Conv2d(
748
+ block_in,
749
+ config.out_channels,
750
+ kernel_size=3,
751
+ stride=1,
752
+ padding=1,
753
+ )
754
+
755
+ @property
756
+ def last_layer(self):
757
+ return self.conv_out.weight
758
+
759
+ def forward(self, z: torch.Tensor, zq: torch.Tensor):
760
+ h = z
761
+ h = self.conv_in(h)
762
+
763
+ # middle
764
+ h = self.mid.block_1(h)
765
+ h = self.mid.attn_1(h)
766
+ h = self.mid.block_2(h)
767
+
768
+ # upsampling
769
+ for i_level in reversed(range(self.num_resolutions)):
770
+ for i_block in range(self.num_res_blocks + 1):
771
+ h = self.up[i_level].block[i_block](h)
772
+ if len(self.up[i_level].attn) > 0:
773
+ h = self.up[i_level].attn[i_block](h)
774
+
775
+ if i_level != 0:
776
+ h = self.up[i_level].upsample(h)
777
+
778
+ h = self.norm_out(h)
779
+ h = self.act(h)
780
+ h = self.conv_out(h)
781
+
782
+ return h
783
+
784
+
785
+ class MoVQModel(MoVQPretrainedModel):
786
+
787
+ def __init__(self, config):
788
+ super().__init__(config)
789
+ self.config = config
790
+
791
+ self.encoder = MoVQEncoder(config)
792
+ self.decoder = MoVQDecoder(config)
793
+ self.quantize = MoVQVectorQuantizer(config)
794
+
795
+ self.quant_conv = nn.Conv2d(config.z_channels, config.embed_dim, 1)
796
+ self.post_quant_conv = nn.Conv2d(config.embed_dim, config.z_channels, 1)
797
+
798
+ self.spatial_scale_factor = 2 ** (len(config.ch_mult) - 1)
799
+
800
+ self.post_init()
801
+
802
+ def encode(self, x: torch.Tensor):
803
+ h = self.encoder(x)
804
+ h = self.quant_conv(h)
805
+ codes = self.quantize(h)
806
+ return codes
807
+
808
+ def decode(self, x: torch.Tensor):
809
+ quant = self.quantize.embedding(x)  # x holds code indices of shape (b, h, w); embedding keeps that grid, giving (b, h, w, c)
810
+ b, h, w, c = quant.shape
811
+ quant = quant.view(b, h, w, c).permute(0, 3, 1, 2).contiguous()
812
+ quant2 = self.post_quant_conv(quant)
813
+ image = self.decoder(quant2, quant)
814
+ image = image.reshape(
815
+ b,
816
+ self.config.out_channels,
817
+ h * self.spatial_scale_factor,
818
+ w * self.spatial_scale_factor,
819
+ )
820
+ return image
821
+
822
+ @property
823
+ def device(self):
824
+ return next(self.parameters()).device
825
+
826
+ @property
827
+ def dtype(self):
828
+ return next(self.parameters()).dtype
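
As a rough reference, a sketch of decoding discrete codes back to pixels with this module. The checkpoint path and the 32x32 code grid are illustrative assumptions (the grid size depends on the input resolution and the configured ch_mult), so treat this as a sketch rather than a verified snippet.

    # hypothetical usage sketch -- checkpoint path and grid size are placeholders
    import torch

    movq = MoVQModel.from_pretrained("path/to/movqgan").eval()

    # a (batch, height, width) grid of codebook indices, e.g. produced by `encode`
    codes = torch.randint(0, movq.config.codebook_size, (1, 32, 32))
    with torch.no_grad():
        recon = movq.decode(codes)   # (batch, out_channels, 32 * scale, 32 * scale) image tensor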
modeling_qwen2vit.py ADDED
@@ -0,0 +1,841 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen2-VL model."""
21
+
22
+ import math
23
+ from dataclasses import dataclass
24
+ from typing import Any, Dict, List, Optional, Tuple, Union
25
+
26
+ from torch import Tensor
27
+
28
+ import torch
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+ import torch.utils.checkpoint
32
+
33
+ from transformers.activations import ACT2FN
34
+ from transformers.cache_utils import Cache, StaticCache
35
+ from transformers.modeling_attn_mask_utils import (
36
+ AttentionMaskConverter,
37
+ )
38
+ from transformers.modeling_outputs import (
39
+ BaseModelOutputWithPast,
40
+ ModelOutput,
41
+ )
42
+ from transformers.modeling_utils import PreTrainedModel
43
+ from transformers.utils import (
44
+ add_start_docstrings,
45
+ add_start_docstrings_to_model_forward,
46
+ is_torch_npu_available,
47
+ is_flash_attn_2_available,
48
+ is_flash_attn_greater_or_equal_2_10,
49
+ logging,
50
+ replace_return_docstrings,
51
+ )
52
+ from .configuration_qwen2vit import Qwen2VLConfig, Qwen2VLVisionConfig
53
+ from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
54
+
55
+ from einops import rearrange
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CONFIG_FOR_DOC = "Qwen2VLConfig"
60
+
61
+ try:
62
+ import xformers.ops as xops
63
+
64
+ is_xformers_available = True
65
+ except Exception as e:
66
+ is_xformers_available = False
67
+
68
+ if is_flash_attn_2_available():
69
+ from flash_attn import flash_attn_varlen_func
70
+
71
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
72
+ else:
73
+ flash_attn_varlen_func = None
74
+
75
+
76
+ def init_weights(m):
77
+ if isinstance(m, nn.Linear):
78
+ # we use xavier_uniform following official JAX ViT:
79
+ torch.nn.init.xavier_uniform_(m.weight)
80
+ if m.bias is not None:
81
+ nn.init.constant_(m.bias, 0)
82
+ elif isinstance(m, nn.LayerNorm):
83
+ nn.init.constant_(m.bias, 0)
84
+ nn.init.constant_(m.weight, 1.0)
85
+ elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
86
+ w = m.weight.data
87
+ torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
88
+
89
+
90
+ @dataclass
91
+ class Qwen2VLCausalLMOutputWithPast(ModelOutput):
92
+ """
93
+ Base class for Qwen2VL causal language model (or autoregressive) outputs.
94
+
95
+ Args:
96
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
97
+ Language modeling loss (for next-token prediction).
98
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
99
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
100
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
101
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
102
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
103
+
104
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
105
+ `past_key_values` input) to speed up sequential decoding.
106
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
107
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
108
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
109
+
110
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
111
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
112
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
113
+ sequence_length)`.
114
+
115
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
116
+ heads.
117
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
118
+ The rope index difference between sequence length and multimodal rope.
119
+ """
120
+
121
+ loss: Optional[torch.FloatTensor] = None
122
+ logits: torch.FloatTensor = None
123
+ past_key_values: Optional[List[torch.FloatTensor]] = None
124
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
125
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
126
+ rope_deltas: Optional[torch.LongTensor] = None
127
+
128
+
129
+ class Qwen2VLRotaryEmbedding(nn.Module):
130
+ def __init__(
131
+ self,
132
+ dim=None,
133
+ max_position_embeddings=2048,
134
+ base=10000,
135
+ device=None,
136
+ scaling_factor=1.0,
137
+ rope_type="default",
138
+ config: Optional[Qwen2VLConfig] = None,
139
+ ):
140
+ super().__init__()
141
+ # TODO (joao): remove the `if` below, only used for BC
142
+ self.rope_kwargs = {}
143
+ if config is None:
144
+ logger.warning_once(
145
+ "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the "
146
+ "`config` argument. All other arguments will be removed in v4.46"
147
+ )
148
+ self.rope_kwargs = {
149
+ "rope_type": rope_type,
150
+ "factor": scaling_factor,
151
+ "dim": dim,
152
+ "base": base,
153
+ "max_position_embeddings": max_position_embeddings,
154
+ }
155
+ self.rope_type = rope_type
156
+ self.max_seq_len_cached = max_position_embeddings
157
+ self.original_max_seq_len = max_position_embeddings
158
+ else:
159
+ # BC: "rope_type" was originally "type"
160
+ if config.rope_scaling is not None:
161
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
162
+ else:
163
+ self.rope_type = "default"
164
+ self.max_seq_len_cached = config.max_position_embeddings
165
+ self.original_max_seq_len = config.max_position_embeddings
166
+
167
+ self.config = config
168
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
169
+
170
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
171
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
172
+ self.original_inv_freq = self.inv_freq
173
+
174
+ def _dynamic_frequency_update(self, position_ids, device):
175
+ """
176
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
177
+ 1 - growing beyond the cached sequence length (allow scaling)
178
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
179
+ """
180
+ seq_len = torch.max(position_ids) + 1
181
+ if seq_len > self.max_seq_len_cached: # growth
182
+ inv_freq, self.attention_scaling = self.rope_init_fn(
183
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
184
+ )
185
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
186
+ self.max_seq_len_cached = seq_len
187
+
188
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
189
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
190
+ self.max_seq_len_cached = self.original_max_seq_len
191
+
192
+ @torch.no_grad()
193
+ def forward(self, x, position_ids):
194
+ if "dynamic" in self.rope_type:
195
+ self._dynamic_frequency_update(position_ids, device=x.device)
196
+
197
+ # Core RoPE block. In contrast to other models, Qwen2_VL has different position ids for thw grids
198
+ # So we expand the inv_freq to shape (3, ...)
199
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
200
+ position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
201
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
202
+ device_type = x.device.type
203
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
204
+ with torch.autocast(device_type=device_type, enabled=False):
205
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
206
+ emb = torch.cat((freqs, freqs), dim=-1)
207
+ cos = emb.cos()
208
+ sin = emb.sin()
209
+
210
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
211
+ cos = cos * self.attention_scaling
212
+ sin = sin * self.attention_scaling
213
+
214
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
215
+
216
+
217
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
218
+ def rotate_half(x):
219
+ """Rotates half the hidden dims of the input."""
220
+ x1 = x[..., : x.shape[-1] // 2]
221
+ x2 = x[..., x.shape[-1] // 2:]
222
+ return torch.cat((-x2, x1), dim=-1)
223
+
224
+
225
+ def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
226
+ orig_dtype = tensor.dtype
227
+ tensor = tensor.float()
228
+ cos = freqs.cos()
229
+ sin = freqs.sin()
230
+ cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
231
+ sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
232
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
233
+ output = output.to(orig_dtype)
234
+ return output
235
+
236
+
237
+ def apply_rotary_pos_emb_vision_batch(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
238
+ orig_dtype = tensor.dtype
239
+ tensor = tensor.float()
240
+ cos = freqs.cos()
241
+ sin = freqs.sin()
242
+ cos = cos.repeat(1, 1, 1, 2).float()
243
+ sin = sin.repeat(1, 1, 1, 2).float()
244
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
245
+ output = output.to(orig_dtype)
246
+ return output
247
+
248
+
249
+ class VisionRotaryEmbedding(nn.Module):
250
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
251
+ super().__init__()
252
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
253
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
254
+
255
+ def forward(self, seqlen: int, scale_factor: float = 1.0) -> torch.Tensor:
256
+ # Use scale_factor to dynamically adjust inv_freq
257
+ scaled_inv_freq = self.inv_freq * scale_factor
258
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
259
+ freqs = torch.outer(seq, scaled_inv_freq)
260
+ return freqs
261
+
262
+
263
+ class PatchEmbed(nn.Module):
264
+ def __init__(
265
+ self,
266
+ patch_size: int = 14,
267
+ temporal_patch_size: int = 2,
268
+ in_channels: int = 3,
269
+ embed_dim: int = 1152,
270
+ ) -> None:
271
+ super().__init__()
272
+ self.patch_size = patch_size
273
+ self.temporal_patch_size = temporal_patch_size
274
+ self.in_channels = in_channels
275
+ self.embed_dim = embed_dim
276
+
277
+ kernel_size = [temporal_patch_size, patch_size, patch_size]
278
+ self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
279
+
280
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
281
+ target_dtype = self.proj.weight.dtype
282
+ if is_torch_npu_available():
283
+ # if True:
284
+ hidden_states = F.linear(hidden_states, self.proj.weight.view(self.embed_dim, -1))
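+ # on NPU the patchify conv (kernel == stride, no bias) is applied as an equivalent linear layer over flattened patches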
285
+ else:
286
+ hidden_states = hidden_states.view(
287
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
288
+ )
289
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
290
+ return hidden_states
291
+
292
+
293
+ class PatchMerger(nn.Module):
294
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
295
+ super().__init__()
296
+ self.hidden_size = context_dim * (spatial_merge_size ** 2)
297
+ self.ln_q = nn.LayerNorm(context_dim, eps=1e-6)
298
+ self.mlp = nn.Sequential(
299
+ nn.Linear(self.hidden_size, self.hidden_size),
300
+ nn.GELU(),
301
+ nn.Linear(self.hidden_size, dim),
302
+ )
303
+
304
+ def forward(self, x: torch.Tensor, grid_thw) -> torch.Tensor:
305
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
306
+ return x
307
+
308
+
309
+ class VisionMlp(nn.Module):
310
+ def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
311
+ super().__init__()
312
+ self.fc1 = nn.Linear(dim, hidden_dim)
313
+ self.act = ACT2FN[hidden_act]
314
+ self.fc2 = nn.Linear(hidden_dim, dim)
315
+
316
+ def forward(self, x) -> torch.Tensor:
317
+ return self.fc2(self.act(self.fc1(x)))
318
+
319
+
320
+ class VisionAttention(nn.Module):
321
+ def __init__(self, dim: int, num_heads: int = 16, ) -> None:
322
+ super().__init__()
323
+ self.num_heads = num_heads
324
+ self.head_dim = dim // num_heads
325
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
326
+ self.proj = nn.Linear(dim, dim)
327
+
328
+ def forward(
329
+ self,
330
+ hidden_states: torch.Tensor,
331
+ cu_seqlens: torch.Tensor,
332
+ rotary_pos_emb: torch.Tensor = None
333
+ ) -> torch.Tensor:
334
+ seq_length = hidden_states.shape[0]
335
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
336
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
337
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
338
+
339
+ attention_mask = torch.full(
340
+ [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
341
+ )
342
+ for i in range(1, len(cu_seqlens)):
343
+ attention_mask[..., cu_seqlens[i - 1]: cu_seqlens[i], cu_seqlens[i - 1]: cu_seqlens[i]] = 0
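+ # block-diagonal mask from cu_seqlens: tokens attend only within their own image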
344
+
345
+ q = q.transpose(0, 1)
346
+ k = k.transpose(0, 1)
347
+ v = v.transpose(0, 1)
348
+ attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
349
+ attn_weights = attn_weights + attention_mask
350
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
351
+ attn_output = torch.matmul(attn_weights, v)
352
+ attn_output = attn_output.transpose(0, 1)
353
+ attn_output = attn_output.reshape(seq_length, -1)
354
+ attn_output = self.proj(attn_output)
355
+ return attn_output
356
+
357
+
358
+ class BatchVisionAttention(nn.Module):
359
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
360
+ super().__init__()
361
+ self.num_heads = num_heads
362
+ self.head_dim = dim // num_heads
363
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
364
+ self.proj = nn.Linear(dim, dim)
365
+
366
+ def forward(
367
+ self,
368
+ hidden_states: torch.Tensor, # [batch_size, seq_len, dim]
369
+ attention_mask: torch.Tensor, # [batch_size, 1, 1, seq_len]
370
+ rotary_pos_emb: torch.Tensor = None # [batch_size, seq_len, head_dim//2]
371
+ ) -> torch.Tensor:
372
+ batch_size, seq_len, _ = hidden_states.shape
373
+
374
+ q, k, v = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim).permute(2, 0,
375
+ 3, 1,
376
+ 4).unbind(
377
+ 0)
378
+ # [batch_size, num_heads, seq_len, head_dim]
379
+
380
+ if rotary_pos_emb is not None:
381
+ rotary_pos_emb = rotary_pos_emb.unsqueeze(1) # [batch_size, 1, seq_len, head_dim//2]
382
+ q = apply_rotary_pos_emb_vision_batch(q, rotary_pos_emb)
383
+ k = apply_rotary_pos_emb_vision_batch(k, rotary_pos_emb)
384
+
385
+ attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
386
+ if attention_mask is not None:
387
+ attn_weights = attn_weights + attention_mask
388
+
389
+ # Softmax
390
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
391
+
392
+ attn_output = torch.matmul(attn_weights, v) # [batch_size, num_heads, seq_len, head_dim]
393
+ attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, -1)
394
+ return self.proj(attn_output)
395
+
396
+
397
+ class VisionXformerAttention(nn.Module):
398
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
399
+ super().__init__()
400
+ self.num_heads = num_heads
401
+ self.head_dim = dim // num_heads
402
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
403
+ self.proj = nn.Linear(dim, dim)
404
+
405
+ def forward(
406
+ self,
407
+ hidden_states: torch.Tensor,
408
+ cu_seqlens: torch.Tensor,
409
+ rotary_pos_emb: torch.Tensor = None
410
+ ) -> torch.Tensor:
411
+ seq_length = hidden_states.shape[0]
412
+
413
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
414
+
415
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb)
416
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb)
417
+
418
+ seqlens = [cu_seqlens[0]] + [cu_seqlens[i] - cu_seqlens[i - 1] for i in range(1, len(cu_seqlens))]
419
+ attn_bias = xops.fmha.BlockDiagonalMask.from_seqlens(seqlens)
420
+
421
+ attn_output = xops.memory_efficient_attention(
422
+ q, k, v.unsqueeze(0),
423
+ attn_bias=attn_bias,
424
+ scale=1.0 / math.sqrt(self.head_dim)
425
+ )
426
+ attn_output = attn_output.reshape(seq_length, -1)
427
+ attn_output = self.proj(attn_output)
428
+ return attn_output
429
+
430
+
431
+ class BatchVisionXformerAttention(nn.Module):
432
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
433
+ super().__init__()
434
+ self.num_heads = num_heads
435
+ self.head_dim = dim // num_heads
436
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
437
+ self.proj = nn.Linear(dim, dim)
438
+
439
+ def forward(
440
+ self,
441
+ hidden_states: torch.Tensor,
442
+ attention_mask: torch.Tensor, # [batch_size, 1, 1, seq_len]
443
+ rotary_pos_emb: torch.Tensor = None
444
+ ) -> torch.Tensor:
445
+ seq_length = hidden_states.shape[0]
446
+ batch_size, seq_len, _ = hidden_states.shape
447
+
448
+ q, k, v = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim).permute(2, 0,
449
+ 3, 1,
450
+ 4).unbind(
451
+ 0)
452
+ # [batch_size, num_heads, seq_len, head_dim]
453
+
454
+ if rotary_pos_emb is not None:
455
+ rotary_pos_emb = rotary_pos_emb.unsqueeze(1) # [batch_size, 1, seq_len, head_dim//2]
456
+ q = apply_rotary_pos_emb_vision_batch(q, rotary_pos_emb)
457
+ k = apply_rotary_pos_emb_vision_batch(k, rotary_pos_emb)
458
+
459
+ attn_output = xops.memory_efficient_attention(
460
+ q, k, v,
461
+ attn_bias=attention_mask,
462
+ scale=1.0 / math.sqrt(self.head_dim)
463
+ )
464
+ attn_output = attn_output.reshape(batch_size, seq_len, -1)
465
+ return self.proj(attn_output)
466
+
467
+
468
+ class VisionFlashAttention2(nn.Module):
469
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
470
+ super().__init__()
471
+ self.num_heads = num_heads
472
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
473
+ self.proj = nn.Linear(dim, dim)
474
+
475
+ def forward(
476
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
477
+ ) -> torch.Tensor:
478
+ seq_length = hidden_states.shape[0]
479
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
480
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
481
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
482
+
483
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
484
+ attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
485
+ seq_length, -1
486
+ )
487
+ attn_output = self.proj(attn_output)
488
+ return attn_output
489
+
490
+
491
+ class BatchVisionFlashAttention2(nn.Module):
492
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
493
+ super().__init__()
494
+ self.num_heads = num_heads
495
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
496
+ self.proj = nn.Linear(dim, dim)
497
+
498
+ def forward(
499
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
500
+ ) -> torch.Tensor:
501
+ batch_size, seq_len, _ = hidden_states.shape
502
+
503
+ q, k, v = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, -1).permute(2, 0, 3, 1,
504
+ 4).unbind(0)
505
+
506
+ if rotary_pos_emb is not None:
507
+ rotary_pos_emb = rotary_pos_emb.unsqueeze(1) # [batch_size, 1, seq_len, head_dim//2]
508
+ q = apply_rotary_pos_emb_vision_batch(q, rotary_pos_emb)
509
+ k = apply_rotary_pos_emb_vision_batch(k, rotary_pos_emb)
510
+
511
+ q = rearrange(q, 'b h l d -> b l h d')
512
+ k = rearrange(k, 'b h l d -> b l h d')
513
+ v = rearrange(v, 'b h l d -> b l h d')
514
+
515
+ attn_output = _flash_attention_forward(q, k, v).reshape(batch_size, seq_len, -1)
516
+ attn_output = self.proj(attn_output)
517
+ return attn_output
518
+
519
+
520
+ class VisionSdpaAttention(nn.Module):
521
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
522
+ super().__init__()
523
+ self.num_heads = num_heads
524
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
525
+ self.proj = nn.Linear(dim, dim)
526
+
527
+ def forward(
528
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
529
+ ) -> torch.Tensor:
530
+ seq_length = hidden_states.shape[0]
531
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
532
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
533
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
534
+
535
+ attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
536
+ for i in range(1, len(cu_seqlens)):
537
+ attention_mask[..., cu_seqlens[i - 1]: cu_seqlens[i], cu_seqlens[i - 1]: cu_seqlens[i]] = True
538
+ q = q.transpose(0, 1)
539
+ k = k.transpose(0, 1)
540
+ v = v.transpose(0, 1)
541
+ attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
542
+ attn_output = attn_output.transpose(0, 1)
543
+ attn_output = attn_output.reshape(seq_length, -1)
544
+ attn_output = self.proj(attn_output)
545
+ return attn_output
546
+
547
+
548
+ class BatchVisionSdpaAttention(nn.Module):
549
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
550
+ super().__init__()
551
+ self.num_heads = num_heads
552
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
553
+ self.proj = nn.Linear(dim, dim)
554
+
555
+ def forward(
556
+ self,
557
+ hidden_states: torch.Tensor, # [batch_size, seq_len, dim]
558
+ attention_mask: torch.Tensor = None, # [batch_size, 1, 1, seq_len]
559
+ rotary_pos_emb: torch.Tensor = None # [batch_size, seq_len, head_dim//2]
560
+ ) -> torch.Tensor:
561
+ batch_size, seq_len, _ = hidden_states.shape
562
+ q, k, v = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, -1).permute(2, 0, 3, 1,
563
+ 4).unbind(0)
564
+ # [batch_size, num_heads, seq_len, head_dim]
565
+
566
+ if rotary_pos_emb is not None:
567
+ rotary_pos_emb = rotary_pos_emb.unsqueeze(1) # [batch_size, 1, seq_len, head_dim//2]
568
+ q = apply_rotary_pos_emb_vision_batch(q, rotary_pos_emb)
569
+ k = apply_rotary_pos_emb_vision_batch(k, rotary_pos_emb)
570
+
571
+ attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
572
+ attn_output = attn_output.transpose(1, 2)
573
+ attn_output = attn_output.reshape(batch_size, seq_len, -1)
574
+ attn_output = self.proj(attn_output)
575
+ return attn_output
576
+
577
+
578
+ QWEN2_VL_VISION_ATTENTION_CLASSES = {
579
+ "eager": VisionAttention,
580
+ "flash_attention_2": VisionFlashAttention2,
581
+ "sdpa": VisionSdpaAttention,
582
+ "xformers": VisionXformerAttention,
583
+ }
584
+
585
+ QWEN2_VL_VISION_BATCH_ATTENTION_CLASSES = {
586
+ "eager": BatchVisionAttention,
587
+ "flash_attention_2": VisionFlashAttention2,
588
+ "sdpa": BatchVisionSdpaAttention,
589
+ }
590
+
591
+
592
+ class Qwen2VLVisionBlock(nn.Module):
593
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
594
+ super().__init__()
595
+
596
+ self.norm1 = nn.LayerNorm(config.embed_dim, eps=1e-6)
597
+ self.norm2 = nn.LayerNorm(config.embed_dim, eps=1e-6)
598
+ mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
599
+
600
+ self.attn = QWEN2_VL_VISION_ATTENTION_CLASSES[attn_implementation](
601
+ config.embed_dim, num_heads=config.num_heads,
602
+ )
603
+ self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
604
+
605
+ def forward(self, hidden_states, cu_seqlens, rotary_pos_emb, grid_thw) -> torch.Tensor:
606
+ hidden_states = hidden_states + self.attn(
607
+ self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
608
+ )
609
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
610
+ return hidden_states
611
+
612
+
613
+ class Qwen2VLBatchVisionBlock(nn.Module):
614
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
615
+ super().__init__()
616
+ self.norm1 = nn.LayerNorm(config.embed_dim, eps=1e-6)
617
+ self.norm2 = nn.LayerNorm(config.embed_dim, eps=1e-6)
618
+ mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
619
+
620
+ self.attn = QWEN2_VL_VISION_BATCH_ATTENTION_CLASSES[attn_implementation](
621
+ config.embed_dim, num_heads=config.num_heads,
622
+ )
623
+ self.mlp = VisionMlp(config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
624
+
625
+ def forward(
626
+ self,
627
+ hidden_states: torch.Tensor, # [batch_size, seq_len, dim]
628
+ attention_mask: torch.Tensor = None, # [batch_size, 1, 1, seq_len]
629
+ rotary_pos_emb: torch.Tensor = None # [batch_size, seq_len, head_dim//2]
630
+ ) -> torch.Tensor:
631
+ # Attention
632
+ hidden_states = hidden_states + self.attn(
633
+ self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb
634
+ )
635
+ # MLP
636
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
637
+ return hidden_states
638
+
639
+
640
+ # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
641
+ def _prepare_4d_causal_attention_mask_with_cache_position(
642
+ attention_mask: torch.Tensor,
643
+ sequence_length: int,
644
+ target_length: int,
645
+ dtype: torch.dtype,
646
+ device: torch.device,
647
+ min_dtype: float,
648
+ cache_position: torch.Tensor,
649
+ batch_size: int,
650
+ ):
651
+ """
652
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
653
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
654
+
655
+ Args:
656
+ attention_mask (`torch.Tensor`):
657
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
658
+ sequence_length (`int`):
659
+ The sequence length being processed.
660
+ target_length (`int`):
661
+ The target length: when generating with a static cache, the mask should be as long as the static cache to account for the zero padding (the part of the cache that is not filled yet).
662
+ dtype (`torch.dtype`):
663
+ The dtype to use for the 4D attention mask.
664
+ device (`torch.device`):
665
+ The device to place the 4D attention mask on.
666
+ min_dtype (`float`):
667
+ The minimum value representable with the dtype `dtype`.
668
+ cache_position (`torch.Tensor`):
669
+ Indices depicting the position of the input sequence tokens in the sequence.
670
+ batch_size (`int`):
671
+ Batch size.
672
+ """
673
+ if attention_mask is not None and attention_mask.dim() == 4:
674
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
675
+ causal_mask = attention_mask
676
+ else:
677
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
678
+ if sequence_length != 1:
679
+ causal_mask = torch.triu(causal_mask, diagonal=1)
680
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
681
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
682
+ if attention_mask is not None:
683
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
684
+ mask_length = attention_mask.shape[-1]
685
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
686
+ padding_mask = padding_mask == 0
687
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
688
+ padding_mask, min_dtype
689
+ )
690
+
691
+ return causal_mask
692
+
693
+
694
+ # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm
695
+ class Qwen2RMSNorm(nn.Module):
696
+ def __init__(self, hidden_size, eps=1e-6):
697
+ """
698
+ Qwen2RMSNorm is equivalent to T5LayerNorm
699
+ """
700
+ super().__init__()
701
+ self.weight = nn.Parameter(torch.ones(hidden_size))
702
+ self.variance_epsilon = eps
703
+
704
+ def forward(self, hidden_states):
705
+ input_dtype = hidden_states.dtype
706
+ hidden_states = hidden_states.to(torch.float32)
707
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
708
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
709
+ return self.weight * hidden_states.to(input_dtype)
710
+
711
+ def extra_repr(self):
712
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
713
+
714
+
715
+ # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2MLP
716
+ class Qwen2MLP(nn.Module):
717
+ def __init__(self, config):
718
+ super().__init__()
719
+ self.hidden_size = config.hidden_size
720
+ self.intermediate_size = config.intermediate_size
721
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
722
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
723
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
724
+ self.act_fn = ACT2FN[config.hidden_act]
725
+
726
+ def forward(self, hidden_state):
727
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
728
+
729
+
730
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
731
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
732
+ """
733
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
734
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
735
+ """
736
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
737
+ if n_rep == 1:
738
+ return hidden_states
739
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
740
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
741
+
742
+
743
+ class Qwen2VLPreTrainedModel(PreTrainedModel):
744
+ config_class = Qwen2VLConfig
745
+ base_model_prefix = "model"
746
+ supports_gradient_checkpointing = True
747
+ _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
748
+ _skip_keys_device_placement = "past_key_values"
749
+ _supports_flash_attn_2 = True
750
+ _supports_sdpa = True
751
+ _supports_cache_class = True
752
+ _supports_static_cache = True
753
+
754
+ def _init_weights(self, module):
755
+ std = self.config.initializer_range
756
+ if isinstance(module, (nn.Linear, nn.Conv3d)):
757
+ module.weight.data.normal_(mean=0.0, std=std)
758
+ if module.bias is not None:
759
+ module.bias.data.zero_()
760
+ elif isinstance(module, nn.Embedding):
761
+ module.weight.data.normal_(mean=0.0, std=std)
762
+ if module.padding_idx is not None:
763
+ module.weight.data[module.padding_idx].zero_()
764
+
765
+
766
+ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
767
+ config_class = Qwen2VLVisionConfig
768
+ _no_split_modules = ["Qwen2VLVisionBlock"]
769
+
770
+ def __init__(self, config) -> None:
771
+ super().__init__(config)
772
+ self.spatial_merge_size = config.spatial_merge_size
773
+
774
+ self.patch_embed = PatchEmbed(
775
+ patch_size=config.patch_size,
776
+ temporal_patch_size=config.temporal_patch_size,
777
+ in_channels=config.in_channels,
778
+ embed_dim=config.embed_dim,
779
+ )
780
+
781
+ head_dim = config.embed_dim // config.num_heads
782
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
783
+
784
+ self.blocks = nn.ModuleList(
785
+ [Qwen2VLVisionBlock(config, config.attn_implementation) for _ in range(config.depth)]
786
+ )
787
+ self.merger = PatchMerger(
788
+ dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
789
+ )
790
+
791
+ def get_dtype(self) -> torch.dtype:
792
+ return self.blocks[0].mlp.fc2.weight.dtype
793
+
794
+ def get_device(self) -> torch.device:
795
+ return self.blocks[0].mlp.fc2.weight.device
796
+
797
+ def rot_pos_emb(self, grid_thw):
798
+ pos_ids = []
799
+ for t, h, w in grid_thw:
800
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
801
+ hpos_ids = hpos_ids.reshape(
802
+ h // self.spatial_merge_size,
803
+ self.spatial_merge_size,
804
+ w // self.spatial_merge_size,
805
+ self.spatial_merge_size,
806
+ )
807
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
808
+ hpos_ids = hpos_ids.flatten()
809
+
810
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
811
+ wpos_ids = wpos_ids.reshape(
812
+ h // self.spatial_merge_size,
813
+ self.spatial_merge_size,
814
+ w // self.spatial_merge_size,
815
+ self.spatial_merge_size,
816
+ )
817
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
818
+ wpos_ids = wpos_ids.flatten()
819
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
820
+ pos_ids = torch.cat(pos_ids, dim=0)
821
+ max_grid_size = grid_thw[:, 1:].max()
822
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
823
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
824
+ return rotary_pos_emb
825
+
826
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
827
+ hidden_states = self.patch_embed(hidden_states)
828
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
829
+
830
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
831
+ dim=0, dtype=torch.int32
832
+ )
833
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
834
+
835
+ for blk in self.blocks:
836
+ hidden_states = blk(hidden_states,
837
+ cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb,
838
+ grid_thw=grid_thw)
839
+
840
+ hidden_states = self.merger(hidden_states, grid_thw)
841
+ return hidden_states
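
Note on the packed attention paths above: the vision tower flattens the patches of all images into one long sequence, and `cu_seqlens` holds the cumulative patch counts, so each image occupies one contiguous block. The eager/SDPA variants turn this into a block-diagonal boolean mask, while the flash-attention variant passes `cu_seqlens` and `max_seqlen` to `flash_attn_varlen_func` directly. A minimal standalone sketch of the mask construction (illustrative only, not part of the shipped files; the patch counts are made up):

import torch

def block_diagonal_mask(cu_seqlens: torch.Tensor) -> torch.Tensor:
    # True means "may attend"; patches of one image only see patches of the same image.
    seq_length = int(cu_seqlens[-1])
    mask = torch.zeros(seq_length, seq_length, dtype=torch.bool)
    for i in range(1, len(cu_seqlens)):
        start, end = int(cu_seqlens[i - 1]), int(cu_seqlens[i])
        mask[start:end, start:end] = True
    return mask

# Two images with 4 and 6 patches packed into a single sequence of length 10.
cu_seqlens = torch.tensor([0, 4, 10], dtype=torch.int32)
mask = block_diagonal_mask(cu_seqlens)
assert mask[:4, :4].all() and not mask[:4, 4:].any()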
modeling_rope_utils.py ADDED
@@ -0,0 +1,561 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import Optional, Tuple
17
+
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import is_torch_available, logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
+ if is_torch_available():
27
+ import torch
28
+
29
+
30
+ def _compute_default_rope_parameters(
31
+ config: Optional[PretrainedConfig] = None,
32
+ device: Optional["torch.device"] = None,
33
+ seq_len: Optional[int] = None,
34
+ **rope_kwargs,
35
+ ) -> Tuple["torch.Tensor", float]:
36
+ """
37
+ Computes the inverse frequencies according to the original RoPE implementation
38
+ Args:
39
+ config ([`~transformers.PretrainedConfig`]):
40
+ The model configuration.
41
+ device (`torch.device`):
42
+ The device to use for initialization of the inverse frequencies.
43
+ seq_len (`int`, *optional*):
44
+ The current sequence length. Unused for this type of RoPE.
45
+ rope_kwargs (`Dict`, *optional*):
46
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
47
+ Returns:
48
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
49
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
50
+ """
51
+ if config is not None and len(rope_kwargs) > 0:
52
+ raise ValueError(
53
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
54
+ f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
55
+ )
56
+ if len(rope_kwargs) > 0:
57
+ base = rope_kwargs["base"]
58
+ dim = rope_kwargs["dim"]
59
+ elif config is not None:
60
+ base = config.rope_theta
61
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
62
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
63
+ dim = int(head_dim * partial_rotary_factor)
64
+
65
+ attention_factor = 1.0 # Unused in this type of RoPE
66
+
67
+ # Compute the inverse frequencies
68
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
69
+ return inv_freq, attention_factor
70
+
71
+
72
+ def _compute_linear_scaling_rope_parameters(
73
+ config: Optional[PretrainedConfig] = None,
74
+ device: Optional["torch.device"] = None,
75
+ seq_len: Optional[int] = None,
76
+ **rope_kwargs,
77
+ ) -> Tuple["torch.Tensor", float]:
78
+ """
79
+ Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
80
+ Args:
81
+ config ([`~transformers.PretrainedConfig`]):
82
+ The model configuration.
83
+ device (`torch.device`):
84
+ The device to use for initialization of the inverse frequencies.
85
+ seq_len (`int`, *optional*):
86
+ The current sequence length. Unused for this type of RoPE.
87
+ rope_kwargs (`Dict`, *optional*):
88
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
89
+ Returns:
90
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
91
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
92
+ """
93
+ if config is not None and len(rope_kwargs) > 0:
94
+ raise ValueError(
95
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
96
+ f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
97
+ )
98
+ if len(rope_kwargs) > 0:
99
+ factor = rope_kwargs["factor"]
100
+ elif config is not None:
101
+ factor = config.rope_scaling["factor"]
102
+
103
+ # Gets the default RoPE parameters
104
+ inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
105
+
106
+ # Then applies linear scaling to the frequencies.
107
+ # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
108
+ # applying scaling to the inverse frequencies is equivalent.
109
+ inv_freq /= factor
110
+ return inv_freq, attention_factor
111
+
112
+
113
+ def _compute_dynamic_ntk_parameters(
114
+ config: Optional[PretrainedConfig] = None,
115
+ device: Optional["torch.device"] = None,
116
+ seq_len: Optional[int] = None,
117
+ **rope_kwargs,
118
+ ) -> Tuple["torch.Tensor", float]:
119
+ """
120
+ Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
121
+ Args:
122
+ config ([`~transformers.PretrainedConfig`]):
123
+ The model configuration.
124
+ device (`torch.device`):
125
+ The device to use for initialization of the inverse frequencies.
126
+ seq_len (`int`, *optional*):
127
+ The current sequence length, used to update the dynamic RoPE at inference time.
128
+ rope_kwargs (`Dict`, *optional*):
129
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
130
+ Returns:
131
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
132
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
133
+ """
134
+ # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
135
+ if config is not None and len(rope_kwargs) > 0:
136
+ raise ValueError(
137
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
138
+ f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
139
+ )
140
+ if len(rope_kwargs) > 0:
141
+ base = rope_kwargs["base"]
142
+ dim = rope_kwargs["dim"]
143
+ max_position_embeddings = rope_kwargs["max_position_embeddings"]
144
+ factor = rope_kwargs["factor"]
145
+ elif config is not None:
146
+ base = config.rope_theta
147
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
148
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
149
+ dim = int(head_dim * partial_rotary_factor)
150
+ max_position_embeddings = config.max_position_embeddings
151
+ factor = config.rope_scaling["factor"]
152
+
153
+ attention_factor = 1.0 # Unused in this type of RoPE
154
+
155
+ # seq_len: default to max_position_embeddings, e.g. at init time
156
+ seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings
157
+
158
+ # Compute the inverse frequencies
159
+ base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
160
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
161
+ return inv_freq, attention_factor
162
+
163
+
164
+ def _compute_yarn_parameters(
165
+ config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
166
+ ) -> Tuple["torch.Tensor", float]:
167
+ """
168
+ Computes the inverse frequencies with YaRN scaling. Please refer to the
169
+ [original paper](https://arxiv.org/abs/2309.00071)
170
+ Args:
171
+ config ([`~transformers.PretrainedConfig`]):
172
+ The model configuration.
173
+ device (`torch.device`):
174
+ The device to use for initialization of the inverse frequencies.
175
+ seq_len (`int`, *optional*):
176
+ The current sequence length. Unused for this type of RoPE.
177
+ rope_kwargs (`Dict`, *optional*):
178
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
179
+ Returns:
180
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
181
+ post-processing scaling factor applied to the computed cos/sin.
182
+ """
183
+ # No need to keep BC with yarn, unreleased when this new pattern was created.
184
+ if len(rope_kwargs) > 0:
185
+ raise ValueError(
186
+ f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
187
+ )
188
+
189
+ base = config.rope_theta
190
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
191
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
192
+ dim = int(head_dim * partial_rotary_factor)
193
+ max_position_embeddings = config.max_position_embeddings
194
+ factor = config.rope_scaling["factor"]
195
+
196
+ # Sets the attention factor as suggested in the paper
197
+ attention_factor = config.rope_scaling.get("attention_factor")
198
+ if attention_factor is None:
199
+ attention_factor = 0.1 * math.log(factor) + 1.0
200
+
201
+ # Optional config options
202
+ # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
203
+ beta_fast = config.rope_scaling.get("beta_fast") or 32
204
+ beta_slow = config.rope_scaling.get("beta_slow") or 1
205
+
206
+ # Compute the inverse frequencies
207
+ def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
208
+ """Inverse dimension formula to find the dimension based on the number of rotations"""
209
+ return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
210
+
211
+ def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
212
+ """Find dimension range bounds based on rotations"""
213
+ low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
214
+ high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
215
+ return max(low, 0), min(high, dim - 1)
216
+
217
+ def linear_ramp_factor(min, max, dim):
218
+ if min == max:
219
+ max += 0.001 # Prevent singularity
220
+
221
+ linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
222
+ ramp_func = torch.clamp(linear_func, 0, 1)
223
+ return ramp_func
224
+
225
+ # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
226
+ # to expand the possible context length. In other words, interpolation = apply scaling factor.
227
+ pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
228
+ inv_freq_extrapolation = 1.0 / pos_freqs
229
+ inv_freq_interpolation = 1.0 / (factor * pos_freqs)
230
+
231
+ low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
232
+
233
+ # Get n-dimensional rotational scaling corrected for extrapolation
234
+ inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
235
+ inv_freq = (
236
+ inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
237
+ + inv_freq_extrapolation * inv_freq_extrapolation_factor
238
+ )
239
+
240
+ return inv_freq, attention_factor
241
+
242
+
243
+ def _compute_longrope_parameters(
244
+ config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
245
+ ) -> Tuple["torch.Tensor", float]:
246
+ """
247
+ Computes the inverse frequencies with LongRoPE scaling. Please refer to the
248
+ [original implementation](https://github.com/microsoft/LongRoPE)
249
+ Args:
250
+ config ([`~transformers.PretrainedConfig`]):
251
+ The model configuration.
252
+ device (`torch.device`):
253
+ The device to use for initialization of the inverse frequencies.
254
+ seq_len (`int`, *optional*):
255
+ The current sequence length. Unused for this type of RoPE.
256
+ rope_kwargs (`Dict`, *optional*):
257
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
258
+ Returns:
259
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
260
+ post-processing scaling factor applied to the computed cos/sin.
261
+ """
262
+ # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
263
+ # No need to keep BC with longrope, unreleased when this new pattern was created.
264
+ if len(rope_kwargs) > 0:
265
+ raise ValueError(
266
+ "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
267
+ f"{rope_kwargs}"
268
+ )
269
+
270
+ base = config.rope_theta
271
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
272
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
273
+ dim = int(head_dim * partial_rotary_factor)
274
+ long_factor = config.rope_scaling["long_factor"]
275
+ short_factor = config.rope_scaling["short_factor"]
276
+ factor = config.rope_scaling.get("factor")
277
+ attention_factor = config.rope_scaling.get("attention_factor")
278
+
279
+ # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
280
+ # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
281
+ # values to compute the default attention scaling factor, instead of using `factor`.
282
+ if hasattr(config, "original_max_position_embeddings"):
283
+ max_position_embeddings = config.original_max_position_embeddings
284
+ expanded_max_position_embeddings = config.max_position_embeddings
285
+ factor = expanded_max_position_embeddings / max_position_embeddings
286
+ else:
287
+ max_position_embeddings = config.max_position_embeddings
288
+ expanded_max_position_embeddings = max_position_embeddings * factor
289
+
290
+ # Sets the attention factor as suggested in the paper
291
+ if attention_factor is None:
292
+ if factor <= 1.0:
293
+ attention_factor = 1.0
294
+ else:
295
+ attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
296
+
297
+ # Compute the inverse frequencies -- scaled based on the target sequence length
298
+ if expanded_max_position_embeddings > max_position_embeddings:
299
+ ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
300
+ else:
301
+ ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
302
+ inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
303
+ inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
304
+
305
+ return inv_freq, attention_factor
306
+
307
+
308
+ def _compute_llama3_parameters(
309
+ config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
310
+ ) -> Tuple["torch.Tensor", float]:
311
+ """
312
+ Computes the inverse frequencies for llama 3.1.
313
+
314
+ Args:
315
+ config ([`~transformers.PretrainedConfig`]):
316
+ The model configuration.
317
+ device (`torch.device`):
318
+ The device to use for initialization of the inverse frequencies.
319
+ seq_len (`int`, *optional*):
320
+ The current sequence length. Unused for this type of RoPE.
321
+ rope_kwargs (`Dict`, *optional*):
322
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
323
+ Returns:
324
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
325
+ post-processing scaling factor applied to the computed cos/sin.
326
+ """
327
+ # Gets the default RoPE parameters
328
+ inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
329
+
330
+ factor = config.rope_scaling["factor"] # `8` in the original implementation
331
+ low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation
332
+ high_freq_factor = config.rope_scaling["high_freq_factor"] # `4` in the original implementation
333
+ old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation
334
+
335
+ low_freq_wavelen = old_context_len / low_freq_factor
336
+ high_freq_wavelen = old_context_len / high_freq_factor
337
+
338
+ wavelen = 2 * math.pi / inv_freq
339
+ # wavelen < high_freq_wavelen: do nothing
340
+ # wavelen > low_freq_wavelen: divide by factor
341
+ inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
342
+ # otherwise: interpolate between the two, using a smooth factor
343
+ smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
344
+ smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
345
+ is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
346
+ inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
347
+
348
+ return inv_freq_llama, attention_factor
349
+
350
+
351
+ # This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
352
+ # from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
353
+ # parameterizations, as long as the callable has the same signature.
354
+ ROPE_INIT_FUNCTIONS = {
355
+ "default": _compute_default_rope_parameters,
356
+ "linear": _compute_linear_scaling_rope_parameters,
357
+ "dynamic": _compute_dynamic_ntk_parameters,
358
+ "yarn": _compute_yarn_parameters,
359
+ "longrope": _compute_longrope_parameters,
360
+ "llama3": _compute_llama3_parameters,
361
+ }
362
+
363
+
364
+ def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None):
365
+ """Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
366
+ # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present
367
+ if "type" in received_keys:
368
+ received_keys -= {"type"}
369
+ required_keys.add("rope_type")
370
+
371
+ missing_keys = required_keys - received_keys
372
+ if missing_keys:
373
+ raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")
374
+
375
+ if optional_keys is not None:
376
+ unused_keys = received_keys - required_keys - optional_keys
377
+ else:
378
+ unused_keys = received_keys - required_keys
379
+ if unused_keys:
380
+ logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")
381
+
382
+
383
+ def _validate_default_rope_parameters(config: PretrainedConfig):
384
+ rope_scaling = config.rope_scaling
385
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
386
+ required_keys = {"rope_type"}
387
+ received_keys = set(rope_scaling.keys())
388
+ _check_received_keys(rope_type, received_keys, required_keys)
389
+
390
+
391
+ def _validate_linear_scaling_rope_parameters(config: PretrainedConfig):
392
+ rope_scaling = config.rope_scaling
393
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
394
+ required_keys = {"rope_type", "factor"}
395
+ received_keys = set(rope_scaling.keys())
396
+ _check_received_keys(rope_type, received_keys, required_keys)
397
+
398
+ factor = rope_scaling["factor"]
399
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
400
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
401
+
402
+
403
+ def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig):
404
+ rope_scaling = config.rope_scaling
405
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
406
+ required_keys = {"rope_type", "factor"}
407
+ # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
408
+ optional_keys = {"original_max_position_embeddings"}
409
+ received_keys = set(rope_scaling.keys())
410
+ _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
411
+
412
+ factor = rope_scaling["factor"]
413
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
414
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
415
+
416
+
417
+ def _validate_yarn_parameters(config: PretrainedConfig):
418
+ rope_scaling = config.rope_scaling
419
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
420
+ required_keys = {"rope_type", "factor"}
421
+ optional_keys = {"attention_factor", "beta_fast", "beta_slow"}
422
+ received_keys = set(rope_scaling.keys())
423
+ _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
424
+
425
+ factor = rope_scaling["factor"]
426
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
427
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
428
+
429
+ attention_factor = rope_scaling.get("attention_factor")
430
+ if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
431
+ logger.warning(
432
+ f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
433
+ )
434
+ beta_fast = rope_scaling.get("beta_fast")
435
+ if beta_fast is not None and not isinstance(beta_fast, float):
436
+ logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}")
437
+ beta_slow = rope_scaling.get("beta_slow")
438
+ if beta_slow is not None and not isinstance(beta_slow, float):
439
+ logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}")
440
+
441
+ if (beta_fast or 32) < (beta_slow or 1):
442
+ logger.warning(
443
+ f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
444
+ f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
445
+ )
446
+
447
+
448
+ def _validate_longrope_parameters(config: PretrainedConfig):
449
+ rope_scaling = config.rope_scaling
450
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
451
+ required_keys = {"rope_type", "short_factor", "long_factor"}
452
+ # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
453
+ optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
454
+ received_keys = set(rope_scaling.keys())
455
+ _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
456
+
457
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
458
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
459
+ dim = int(head_dim * partial_rotary_factor)
460
+
461
+ short_factor = rope_scaling.get("short_factor")
462
+ if not (isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor)):
463
+ logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}")
464
+ if not len(short_factor) == dim // 2:
465
+ logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}")
466
+
467
+ long_factor = rope_scaling.get("long_factor")
468
+ if not (isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor)):
469
+ logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}")
470
+ if not len(long_factor) == dim // 2:
471
+ logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}")
472
+
473
+ # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
474
+ # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
475
+ # unique to longrope (= undesirable)
476
+ if hasattr(config, "original_max_position_embeddings"):
477
+ logger.warning_once(
478
+ "This model has set a `original_max_position_embeddings` field, to be used together with "
479
+ "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
480
+ "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
481
+ "as it is compatible with most model architectures."
482
+ )
483
+ else:
484
+ factor = rope_scaling.get("factor")
485
+ if factor is None:
486
+ logger.warning("Missing required keys in `rope_scaling`: 'factor'")
487
+ elif not isinstance(factor, float) or factor < 1.0:
488
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
489
+
490
+ attention_factor = rope_scaling.get("attention_factor")
491
+ if attention_factor is not None:
492
+ if not isinstance(attention_factor, float) or attention_factor < 0.0:
493
+ logger.warning(
494
+ f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
495
+ )
496
+
497
+
498
+ def _validate_llama3_parameters(config: PretrainedConfig):
499
+ rope_scaling = config.rope_scaling
500
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
501
+ required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
502
+ received_keys = set(rope_scaling.keys())
503
+ _check_received_keys(rope_type, received_keys, required_keys)
504
+
505
+ factor = rope_scaling["factor"]
506
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
507
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
508
+
509
+ low_freq_factor = rope_scaling["low_freq_factor"]
510
+ high_freq_factor = rope_scaling["high_freq_factor"]
511
+ if low_freq_factor is None or not isinstance(low_freq_factor, float):
512
+ logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
513
+ if high_freq_factor is None or not isinstance(high_freq_factor, float):
514
+ logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
515
+ if high_freq_factor <= low_freq_factor:
516
+ logger.warning(
517
+ "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
518
+ f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
519
+ )
520
+
521
+ original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
522
+ if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
523
+ logger.warning(
524
+ "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
525
+ f"{original_max_position_embeddings}"
526
+ )
527
+ if original_max_position_embeddings >= config.max_position_embeddings:
528
+ logger.warning(
529
+ "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
530
+ f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
531
+ )
532
+
533
+
534
+ # Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
535
+ ROPE_VALIDATION_FUNCTIONS = {
536
+ "default": _validate_default_rope_parameters,
537
+ "linear": _validate_linear_scaling_rope_parameters,
538
+ "dynamic": _validate_dynamic_scaling_rope_parameters,
539
+ "yarn": _validate_yarn_parameters,
540
+ "longrope": _validate_longrope_parameters,
541
+ "llama3": _validate_llama3_parameters,
542
+ }
543
+
544
+
545
+ def rope_config_validation(config: PretrainedConfig):
546
+ """
547
+ Validate the RoPE config arguments, given a `PretrainedConfig` object
548
+ """
549
+ rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PretrainedConfig`
550
+ if rope_scaling is None:
551
+ return
552
+
553
+ # BC: "rope_type" was originally "type"
554
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
555
+ validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
556
+ if validation_fn is not None:
557
+ validation_fn(config)
558
+ else:
559
+ logger.warning(
560
+ f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
561
+ )
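
All of the RoPE helpers in modeling_rope_utils.py ultimately return a set of inverse frequencies plus an attention scaling factor. A small standalone sketch of the default and linear-scaling variants (the values of `base`, `dim` and `factor` below are examples, not this model's configuration):

import torch

base, dim, factor = 10000.0, 128, 2.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
inv_freq_linear = inv_freq / factor  # linear scaling simply divides the frequencies

# Dividing inv_freq by `factor` is equivalent to stretching the position ids by `factor`,
# because the rotary angles are position_id * inv_freq.
positions = torch.arange(8).float()
assert torch.allclose(torch.outer(positions, inv_freq_linear),
                      torch.outer(positions / factor, inv_freq))

This is the same equivalence noted in the comment inside `_compute_linear_scaling_rope_parameters`.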
processing_illume.py ADDED
@@ -0,0 +1,329 @@
1
+ """
2
+ Processor class for ILLUME_plus with dualvitok and dualvitok-sdxl-decoder.
3
+ """
4
+
5
+ import json
6
+ from typing import List, Union
7
+
8
+ from transformers import AutoProcessor, AutoImageProcessor
9
+
10
+ try:
11
+ from typing import Unpack
12
+ except ImportError:
13
+ from typing_extensions import Unpack
14
+
15
+ from transformers.feature_extraction_utils import BatchFeature
16
+ from .image_utils import ImageInput
17
+ from transformers.processing_utils import (
18
+ ProcessingKwargs,
19
+ ProcessorMixin,
20
+ )
21
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
22
+ from transformers.utils import logging
23
+
24
+ from PIL import Image
25
+ import re # Added for parsing image tokens
26
+ from typing import List, Tuple
27
+ import torch
28
+
29
+ from .configuration_illume import ILLUMEConfig
30
+ from .image_processing_movqgan import MoVQImageProcessor
31
+ from .image_processing_dualvitok import DualViTokImageProcessor
32
+ from .aspect_ratio_utils import AspectRatioCrop, RATIOS, unpad_and_resize_back
33
+ from .inference_utils import parse_interleaved_text_image, calculate_image_token_num
34
+ from .sdxl_decoder_pipe import StableDiffusionXLDecoderPipeline
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ class ILLUMEProcessorKwargs(ProcessingKwargs, total=False):
40
+ _defaults = {
41
+ "text_kwargs": {
42
+ "padding": False,
43
+ },
44
+ }
45
+
46
+
47
+ class ILLUMEProcessor(ProcessorMixin):
48
+ r"""
49
+ Constructs an ILLUME processor which wraps an ILLUME image processor and a Qwen2 tokenizer into a single processor.
50
+ [`ILLUMEProcessor`] offers all the functionalities of [`ILLUMEImageProcessor`] and [`Qwen2TokenizerFast`]. See the
51
+ [`~ILLUMEProcessor.__call__`] and [`~ILLUMEProcessor.decode`] for more information.
52
+ Args:
53
+ image_processor ([`ILLUMEImageProcessor`], *optional*):
54
+ The image processor is a required input.
55
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
56
+ The tokenizer is a required input.
57
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
58
+ in a chat into a tokenizable string.
59
+ """
60
+
61
+ attributes = ["image_processor", "tokenizer"]
62
+ valid_kwargs = ["chat_template"]
63
+ image_processor_class = "AutoImageProcessor"
64
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
65
+ _re_placeholder = re.compile(r"<image_out>|<image>")
66
+
67
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None,
68
+ crop_percent_thresh=0.2, anyres_indicator_base=64, **kwargs):
69
+ super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)
70
+ self.vision_tokenizer = None
71
+ self.diffusion_vision_detokenizer = None
72
+ self.crop_percent_thresh = crop_percent_thresh
73
+ self.anyres_indicator_base = anyres_indicator_base
74
+
75
+ def set_vision_tokenizer(self, tokenizer):
76
+ if self.vision_tokenizer and tokenizer:
77
+ logger.info('Vision tokenizer is already set; keeping the existing one.')
78
+ return
79
+ self.vision_tokenizer = tokenizer
80
+ logger.info('Setting vision tokenizer!')
81
+
82
+ def load_diffusion_vision_detokenizer(self, diffusion_decoder,
83
+ torch_dtype=torch.float16,
84
+ add_watermarker=False,
85
+ device='cuda',
86
+ ):
87
+ if self.diffusion_vision_detokenizer:
88
+ logger.info('Diffusion vision detokenizer is already set; keeping the existing one.')
89
+ return
90
+
91
+ if self.vision_tokenizer is None:
92
+ raise ValueError("Vision tokenizer is not set. Please set the vision tokenizer by using `processor.set_vision_tokenizer`")
93
+
94
+ self.diffusion_vision_detokenizer = StableDiffusionXLDecoderPipeline.from_pretrained(diffusion_decoder,
95
+ torch_dtype=torch_dtype,
96
+ add_watermarker=add_watermarker,
97
+ vq_model=self.vision_tokenizer,
98
+ vq_image_processor=self.image_processor).to(device)
99
+ logger.info('Setting diffusion vision detokenizer!')
100
+
101
+ def get_ratio_tag_from_ratio(self, ratio):
102
+ h, w = ratio
103
+ h_indicator, w_indicator = h // self.anyres_indicator_base, w // self.anyres_indicator_base
104
+ ratio_tag = f"<height_{h_indicator}><width_{w_indicator}>"
105
+ return ratio_tag
106
+
107
+ @torch.no_grad()
108
+ def _encode_with_dualvitok(self, img):
109
+ # img is a PIL image or np.ndarray
110
+ px = self.image_processor(img, return_tensors="pt")["pixel_values"].to(self.vision_tokenizer.device)
111
+ (_, _, idx_sem, _), (_, _, idx_pix) = self.vision_tokenizer.encode(px)
112
+ return idx_sem[0].cpu().tolist(), idx_pix[0].cpu().tolist()
113
+
114
+ def transform_image_nearest_resolution_ratio(self, image, ratios=RATIOS):
115
+ arc = AspectRatioCrop(ratios, crop_percent_thresh=self.crop_percent_thresh)
116
+ image, original_size, target_size, flag_matched = arc(image, is_inference=True)
117
+ return image
118
+
119
+ def convert_image_to_token_string(self, image, ratios=RATIOS):
120
+ arc = AspectRatioCrop(ratios, crop_percent_thresh=self.crop_percent_thresh)
121
+ image, original_size, target_size, flag_matched = arc(image, is_inference=True)
122
+ ratio_tag = self.get_ratio_tag_from_ratio(target_size)
123
+
124
+ image_embed_inds = self._encode_with_dualvitok(image)
125
+ return ratio_tag + self.encode_image_token_into_code(image_embed_inds)
126
+
127
+ def unpad_and_resize_back(self, padded_image, original_width, original_height):
128
+ return unpad_and_resize_back(padded_image, original_width, original_height)
129
+
130
+ def encode_image_token_into_code(self, image_embed_inds,
131
+ add_token_name="<|image_level{}_{}|>",
132
+ selected_vision_tokenizer_levels=None):
133
+ '''
134
+ Args:
135
+ image_embed_inds: 3D list, vision token ids for each tokenizer level
136
+ add_token_name: tag name for vision tokens
137
+ Returns:
138
+ image_token_return: str
139
+ '''
140
+
141
+ if selected_vision_tokenizer_levels is not None:
142
+ image_embed_inds_new = []
143
+ for level in selected_vision_tokenizer_levels:
144
+ image_embed_inds_new.append(image_embed_inds[level])
145
+ image_embed_inds = image_embed_inds_new
146
+
147
+ image_token_name_list = []
148
+ for level, image_embed_ind in enumerate(image_embed_inds):
149
+ image_token_name = []
150
+ for row in image_embed_ind:
151
+ image_token_name.append([add_token_name.format(level, ind) for ind in row])
152
+
153
+ image_token_name_list.append("<start_of_level{}>".format(level))
154
+ for row in image_token_name:
155
+ row.append("<end_of_line>")
156
+
157
+ for row in image_token_name:
158
+ image_token_name_list.extend(row)
159
+
160
+ image_token_name_list.append("<end_of_level{}>".format(level))
161
+
162
+ image_token_return = "".join(image_token_name_list)
163
+ image_token_return = "<start_of_image>" + image_token_return + "<end_of_image>"
164
+ return image_token_return
165
+
166
+ @torch.no_grad()
167
+ def decode_images(self, image_inds_list, target_resolution=(512, 512), return_type='pil',
168
+ use_diffusion=False, diffusion_cfg_scale=2.0, diffusion_num_inference_steps=20, **kwargs):
169
+
170
+ token_nums, _, h1, w1, h2, w2 = calculate_image_token_num(*target_resolution)
171
+
172
+ decoded_images = []
173
+ for image_inds in image_inds_list:
174
+ semantic_code = torch.as_tensor([image_inds[0]])
175
+ texture_code = torch.as_tensor([image_inds[1]])
176
+ if use_diffusion:
177
+ if self.diffusion_vision_detokenizer is None:
178
+ raise RuntimeError(
179
+ "diffusion_vision_detokenizer is not set. Please set the diffusion decoder by using `pipe.load_diffusion_vision_detokenizer`")
180
+
181
+ semantic_code = semantic_code.view(semantic_code.shape[0], h1, w1)
182
+ texture_code = texture_code.view(texture_code.shape[0], h2, w2)
183
+
184
+ diffusion_outputs = self.diffusion_vision_detokenizer(
185
+ vq_indices=(semantic_code, texture_code),
186
+ height=target_resolution[0] * 2,
187
+ width=target_resolution[1] * 2,
188
+ guidance_scale=diffusion_cfg_scale,
189
+ num_inference_steps=diffusion_num_inference_steps,
190
+ output_type=return_type,
191
+ **kwargs
192
+ )
193
+ samples = diffusion_outputs.images
194
+ image = samples[0]
195
+ else:
196
+ if self.vision_tokenizer is None:
197
+ raise RuntimeError(
198
+ "vision_detokenizer is not set. Please set the vision decoder by using `pipe.set_vision_detokenizer`")
199
+
200
+ semantic_code = semantic_code.view(semantic_code.shape[0], h1, w1)
201
+ texture_code = texture_code.view(texture_code.shape[0], h2, w2)
202
+
203
+ samples = self.vision_tokenizer.decode_code(semantic_code, texture_code)
204
+
205
+ if return_type == 'pil':
206
+ sample = \
207
+ torch.clamp(127.5 * samples + 128.0, 0, 255).permute(0, 2, 3, 1).cpu().to(torch.uint8).numpy()[0]
208
+ image = Image.fromarray(sample)
209
+ else: # return numpy range -1 to 1.
210
+ image = samples.permute(0, 2, 3, 1).cpu().numpy()[0]
211
+ decoded_images.append(image)
212
+
213
+ return decoded_images
214
+
215
+ def parse_text_image(self, text, image_placeholder='<image_out>'):
216
+ generated_text, image_embed_inds_list, list_image_token_parts = parse_interleaved_text_image(text, num_levels=2,
217
+ image_placeholder=image_placeholder)
218
+ return generated_text, image_embed_inds_list, list_image_token_parts
219
+
220
+ def _encode_out_placeholder(self, img):
221
+ """
222
+ Encode one image with DualViTok and return a string
223
+ that can replace the <image_out> marker in the text.
224
+ """
225
+ return self.convert_image_to_token_string(img)
226
+
227
+ def __call__(
228
+ self,
229
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
230
+ images: ImageInput = None,
231
+ **kwargs: Unpack[ILLUMEProcessorKwargs],
232
+ ) -> BatchFeature:
233
+ """
234
+ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
235
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
236
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
237
+ DualViTokImageProcessor's [`~DualViTokImageProcessor.__call__`] if `vision_infos` is not `None`.
238
+
239
+ Args:
240
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
241
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
242
+ tensor. Both channels-first and channels-last formats are supported.
243
+ text (`str`, `List[str]`, `List[List[str]]`):
244
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
245
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
246
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
247
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
248
+ If set, will return tensors of a particular framework. Acceptable values are:
249
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
250
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
251
+ - `'np'`: Return NumPy `np.ndarray` objects.
252
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
253
+
254
+ Returns:
255
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
256
+
257
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
258
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
259
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
260
+ `None`).
261
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
262
+ """
+ output_kwargs = self._merge_kwargs(
+ ILLUMEProcessorKwargs,
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+ **kwargs,
+ )
+
+ if not isinstance(text, list):
+ text = [text]
+
+ if isinstance(images, str):
+ images = [images]
+ elif images and isinstance(images[0], list):
+ # flatten List[List[PIL.Image.Image]]
+ images = [item for sublist in images for item in sublist]
+
+ _ = output_kwargs["text_kwargs"].pop("padding_side", None)
+ try:
+ text = self.apply_chat_template(text, add_generation_prompt=True, padding=True)
+ except Exception as e:
+ logger.info('apply_chat_template failed; assuming the input texts already contain chat templates.')
+
+ if images is None:
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+ return BatchFeature(data={**text_inputs})
+ else:
+ imgs_in, new_text, used = [], [], 0
+
+ if not isinstance(text, list):
+ text = [text]
+
+ for s in text: # walk each prompt
+ out, i = [], 0
+ for m in self._re_placeholder.finditer(s): # every placeholder
+ out.append(s[i:m.start()])
+ if used >= len(images):
+ raise ValueError("not enough images for placeholders")
+ img = images[used]
+ used += 1
+ if m.group() == "<image_out>":
+ out.append(self.convert_image_to_token_string(img)) # replace <image_out> with its discrete token string
+ else: # <image>
+ out.append("<image>")
+ imgs_in.append(img) # keep for pixel features
+ i = m.end()
+ out.append(s[i:])
+ new_text.append("".join(out))
+
+ if used != len(images):
+ raise ValueError(f"too many images for placeholders: used {used} of {len(images)}. Text: {text}")
+
+ text_inputs = self.tokenizer(new_text, **output_kwargs["text_kwargs"])
+ image_inputs = self.image_processor.preprocess(imgs_in, **output_kwargs["images_kwargs"]) if imgs_in else {}
+
+ return BatchFeature(data={**text_inputs, **image_inputs})
+
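A minimal usage sketch of the call path above (not shipped in this repo; the checkpoint path and the AutoProcessor/trust_remote_code loading are assumptions): each `<image>` or `<image_out>` placeholder consumes one entry of `images` in order, with `<image>` routed to the image processor and `<image_out>` rewritten in place as discrete vision-tokenizer tokens.

from PIL import Image
from transformers import AutoProcessor

# Hypothetical load; assumes the custom processor class is registered via trust_remote_code.
processor = AutoProcessor.from_pretrained("path/to/this/repo", trust_remote_code=True)

prompt = "Describe <image>, then redraw it in watercolor style as <image_out>."
img = Image.open("example.jpg")

# Two placeholders -> two images (here the same one twice).
batch = processor(text=prompt, images=[img, img], return_tensors="pt")
print(batch.keys())  # expected: input_ids, attention_mask, pixel_values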
+ def batch_decode(self, sequences, *args, **kwargs):
+ return [self.decode(seq, *args, **kwargs)
+ for seq in sequences]
+
+ def decode(self, *args, **kwargs):
+ return self.tokenizer.decode(*args, **kwargs)
+
+ @property
+ def model_input_names(self):
+ tokenizer_input_names = self.tokenizer.model_input_names
+ image_processor_input_names = self.image_processor.model_input_names
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
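A sketch of the post-processing side (an assumed workflow, not code from this file): after generation, the ids are decoded back to text and `parse_text_image` splits the interleaved output into plain text and per-image token indices. `generate_ids` and the downstream image-reconstruction step are placeholders here.

# generate_ids: output of model.generate(...), placeholder name.
text = processor.decode(generate_ids[0], skip_special_tokens=False)
clean_text, image_embed_inds_list, image_token_parts = processor.parse_text_image(
    text, image_placeholder='<image_out>')
# Each entry of image_embed_inds_list would then be fed to the vision tokenizer's
# decoder (see the image-decoding helper earlier in this file) to reconstruct images.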
pytorch_model-00001-of-00004.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d7c85c4110b8939d12d79ac65e4d0985cec132c92ef481e36b3923e5e090ba21
+ size 4970246339
pytorch_model-00002-of-00004.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:813720e5102d637c585854db25e80984d03fb73490e72795247d917933d8f821
+ size 4932780328
pytorch_model-00003-of-00004.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ab7b0e070fc80137d7d9e43951d89a4ea82c961fb0a3dffeba4e73be033d07e1
+ size 4991527403
pytorch_model-00004-of-00004.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89e7232078f06cb9df7470b148d01e6de8826c936d49ed56401c7e91cddea297
+ size 3699762846
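The four shards above are Git LFS pointer files (only the object hash and byte size live in the repo), and pytorch_model.bin.index.json below maps every parameter name to the shard that stores it. A minimal sketch of how such an index can be used to pull a single tensor without loading every shard; the loading pattern is a generic assumption, not code shipped in this repo.

import json
import torch

# Read the shard index uploaded below.
with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

name = "language_model.lm_head.weight"
shard_file = index["weight_map"][name]              # e.g. "pytorch_model-00004-of-00004.bin"
state = torch.load(shard_file, map_location="cpu")  # load only the shard that holds it
tensor = state[name]
print(tensor.shape)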
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,889 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 18594005440
4
+ },
5
+ "weight_map": {
6
+ "language_model.lm_head.weight": "pytorch_model-00004-of-00004.bin",
7
+ "language_model.model.embed_tokens.weight": "pytorch_model-00001-of-00004.bin",
8
+ "language_model.model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
9
+ "language_model.model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
10
+ "language_model.model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00004.bin",
11
+ "language_model.model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00004.bin",
12
+ "language_model.model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
13
+ "language_model.model.layers.0.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
14
+ "language_model.model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
15
+ "language_model.model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
16
+ "language_model.model.layers.0.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
17
+ "language_model.model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
18
+ "language_model.model.layers.0.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
19
+ "language_model.model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
20
+ "language_model.model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
21
+ "language_model.model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
22
+ "language_model.model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00004.bin",
23
+ "language_model.model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00004.bin",
24
+ "language_model.model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
25
+ "language_model.model.layers.1.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
26
+ "language_model.model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
27
+ "language_model.model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
28
+ "language_model.model.layers.1.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
29
+ "language_model.model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
30
+ "language_model.model.layers.1.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
31
+ "language_model.model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
32
+ "language_model.model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
33
+ "language_model.model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
34
+ "language_model.model.layers.10.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
35
+ "language_model.model.layers.10.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
36
+ "language_model.model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
37
+ "language_model.model.layers.10.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
38
+ "language_model.model.layers.10.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
39
+ "language_model.model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
40
+ "language_model.model.layers.10.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
41
+ "language_model.model.layers.10.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
42
+ "language_model.model.layers.10.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
43
+ "language_model.model.layers.10.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
44
+ "language_model.model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
45
+ "language_model.model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
46
+ "language_model.model.layers.11.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
47
+ "language_model.model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
48
+ "language_model.model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
49
+ "language_model.model.layers.11.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
50
+ "language_model.model.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
51
+ "language_model.model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
52
+ "language_model.model.layers.11.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
53
+ "language_model.model.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
54
+ "language_model.model.layers.11.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
55
+ "language_model.model.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
56
+ "language_model.model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
57
+ "language_model.model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
58
+ "language_model.model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
59
+ "language_model.model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
60
+ "language_model.model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
61
+ "language_model.model.layers.12.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
62
+ "language_model.model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
63
+ "language_model.model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
64
+ "language_model.model.layers.12.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
65
+ "language_model.model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
66
+ "language_model.model.layers.12.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
67
+ "language_model.model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
68
+ "language_model.model.layers.13.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
69
+ "language_model.model.layers.13.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
70
+ "language_model.model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
71
+ "language_model.model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
72
+ "language_model.model.layers.13.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
73
+ "language_model.model.layers.13.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
74
+ "language_model.model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
75
+ "language_model.model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
76
+ "language_model.model.layers.13.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
77
+ "language_model.model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
78
+ "language_model.model.layers.13.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
79
+ "language_model.model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
80
+ "language_model.model.layers.14.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
81
+ "language_model.model.layers.14.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
82
+ "language_model.model.layers.14.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
83
+ "language_model.model.layers.14.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
84
+ "language_model.model.layers.14.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
85
+ "language_model.model.layers.14.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
86
+ "language_model.model.layers.14.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
87
+ "language_model.model.layers.14.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
88
+ "language_model.model.layers.14.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
89
+ "language_model.model.layers.14.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
90
+ "language_model.model.layers.14.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
91
+ "language_model.model.layers.14.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
92
+ "language_model.model.layers.15.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
93
+ "language_model.model.layers.15.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
94
+ "language_model.model.layers.15.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
95
+ "language_model.model.layers.15.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
96
+ "language_model.model.layers.15.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
97
+ "language_model.model.layers.15.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
98
+ "language_model.model.layers.15.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
99
+ "language_model.model.layers.15.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
100
+ "language_model.model.layers.15.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
101
+ "language_model.model.layers.15.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
102
+ "language_model.model.layers.15.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
103
+ "language_model.model.layers.15.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
104
+ "language_model.model.layers.16.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
105
+ "language_model.model.layers.16.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
106
+ "language_model.model.layers.16.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
107
+ "language_model.model.layers.16.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
108
+ "language_model.model.layers.16.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
109
+ "language_model.model.layers.16.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
110
+ "language_model.model.layers.16.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
111
+ "language_model.model.layers.16.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
112
+ "language_model.model.layers.16.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
113
+ "language_model.model.layers.16.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
114
+ "language_model.model.layers.16.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
115
+ "language_model.model.layers.16.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
116
+ "language_model.model.layers.17.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
117
+ "language_model.model.layers.17.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
118
+ "language_model.model.layers.17.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
119
+ "language_model.model.layers.17.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
120
+ "language_model.model.layers.17.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
121
+ "language_model.model.layers.17.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
122
+ "language_model.model.layers.17.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
123
+ "language_model.model.layers.17.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
124
+ "language_model.model.layers.17.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
125
+ "language_model.model.layers.17.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
126
+ "language_model.model.layers.17.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
127
+ "language_model.model.layers.17.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
128
+ "language_model.model.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
129
+ "language_model.model.layers.18.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
130
+ "language_model.model.layers.18.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
131
+ "language_model.model.layers.18.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
132
+ "language_model.model.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
133
+ "language_model.model.layers.18.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
134
+ "language_model.model.layers.18.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
135
+ "language_model.model.layers.18.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
136
+ "language_model.model.layers.18.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
137
+ "language_model.model.layers.18.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
138
+ "language_model.model.layers.18.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
139
+ "language_model.model.layers.18.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
140
+ "language_model.model.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
141
+ "language_model.model.layers.19.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
142
+ "language_model.model.layers.19.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
143
+ "language_model.model.layers.19.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
144
+ "language_model.model.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
145
+ "language_model.model.layers.19.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
146
+ "language_model.model.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
147
+ "language_model.model.layers.19.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
148
+ "language_model.model.layers.19.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
149
+ "language_model.model.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
150
+ "language_model.model.layers.19.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
151
+ "language_model.model.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
152
+ "language_model.model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
153
+ "language_model.model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
154
+ "language_model.model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00004.bin",
155
+ "language_model.model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00004.bin",
156
+ "language_model.model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
157
+ "language_model.model.layers.2.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
158
+ "language_model.model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
159
+ "language_model.model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
160
+ "language_model.model.layers.2.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
161
+ "language_model.model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
162
+ "language_model.model.layers.2.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
163
+ "language_model.model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
164
+ "language_model.model.layers.20.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
165
+ "language_model.model.layers.20.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
166
+ "language_model.model.layers.20.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
167
+ "language_model.model.layers.20.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
168
+ "language_model.model.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
169
+ "language_model.model.layers.20.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
170
+ "language_model.model.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
171
+ "language_model.model.layers.20.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
172
+ "language_model.model.layers.20.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
173
+ "language_model.model.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
174
+ "language_model.model.layers.20.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
175
+ "language_model.model.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
176
+ "language_model.model.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
177
+ "language_model.model.layers.21.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
178
+ "language_model.model.layers.21.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
179
+ "language_model.model.layers.21.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
180
+ "language_model.model.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
181
+ "language_model.model.layers.21.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
182
+ "language_model.model.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
183
+ "language_model.model.layers.21.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
184
+ "language_model.model.layers.21.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
185
+ "language_model.model.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
186
+ "language_model.model.layers.21.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
187
+ "language_model.model.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
188
+ "language_model.model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
189
+ "language_model.model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
190
+ "language_model.model.layers.22.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
191
+ "language_model.model.layers.22.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
192
+ "language_model.model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
193
+ "language_model.model.layers.22.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
194
+ "language_model.model.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
195
+ "language_model.model.layers.22.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
196
+ "language_model.model.layers.22.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
197
+ "language_model.model.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
198
+ "language_model.model.layers.22.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
199
+ "language_model.model.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
200
+ "language_model.model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
201
+ "language_model.model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
202
+ "language_model.model.layers.23.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
203
+ "language_model.model.layers.23.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
204
+ "language_model.model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
205
+ "language_model.model.layers.23.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
206
+ "language_model.model.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
207
+ "language_model.model.layers.23.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
208
+ "language_model.model.layers.23.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
209
+ "language_model.model.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
210
+ "language_model.model.layers.23.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
211
+ "language_model.model.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
212
+ "language_model.model.layers.24.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
213
+ "language_model.model.layers.24.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
214
+ "language_model.model.layers.24.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
215
+ "language_model.model.layers.24.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
216
+ "language_model.model.layers.24.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
217
+ "language_model.model.layers.24.self_attn.k_proj.bias": "pytorch_model-00003-of-00004.bin",
218
+ "language_model.model.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
219
+ "language_model.model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
220
+ "language_model.model.layers.24.self_attn.q_proj.bias": "pytorch_model-00003-of-00004.bin",
221
+ "language_model.model.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
222
+ "language_model.model.layers.24.self_attn.v_proj.bias": "pytorch_model-00003-of-00004.bin",
223
+ "language_model.model.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
224
+ "language_model.model.layers.25.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
225
+ "language_model.model.layers.25.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
226
+ "language_model.model.layers.25.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
227
+ "language_model.model.layers.25.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
228
+ "language_model.model.layers.25.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
229
+ "language_model.model.layers.25.self_attn.k_proj.bias": "pytorch_model-00004-of-00004.bin",
230
+ "language_model.model.layers.25.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
231
+ "language_model.model.layers.25.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
232
+ "language_model.model.layers.25.self_attn.q_proj.bias": "pytorch_model-00004-of-00004.bin",
233
+ "language_model.model.layers.25.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
234
+ "language_model.model.layers.25.self_attn.v_proj.bias": "pytorch_model-00004-of-00004.bin",
235
+ "language_model.model.layers.25.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
236
+ "language_model.model.layers.26.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
237
+ "language_model.model.layers.26.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
238
+ "language_model.model.layers.26.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
239
+ "language_model.model.layers.26.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
240
+ "language_model.model.layers.26.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
241
+ "language_model.model.layers.26.self_attn.k_proj.bias": "pytorch_model-00004-of-00004.bin",
242
+ "language_model.model.layers.26.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
243
+ "language_model.model.layers.26.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
244
+ "language_model.model.layers.26.self_attn.q_proj.bias": "pytorch_model-00004-of-00004.bin",
245
+ "language_model.model.layers.26.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
246
+ "language_model.model.layers.26.self_attn.v_proj.bias": "pytorch_model-00004-of-00004.bin",
247
+ "language_model.model.layers.26.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
248
+ "language_model.model.layers.27.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
249
+ "language_model.model.layers.27.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
250
+ "language_model.model.layers.27.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
251
+ "language_model.model.layers.27.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
252
+ "language_model.model.layers.27.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
253
+ "language_model.model.layers.27.self_attn.k_proj.bias": "pytorch_model-00004-of-00004.bin",
254
+ "language_model.model.layers.27.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
255
+ "language_model.model.layers.27.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
256
+ "language_model.model.layers.27.self_attn.q_proj.bias": "pytorch_model-00004-of-00004.bin",
257
+ "language_model.model.layers.27.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
258
+ "language_model.model.layers.27.self_attn.v_proj.bias": "pytorch_model-00004-of-00004.bin",
259
+ "language_model.model.layers.27.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
260
+ "language_model.model.layers.3.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
261
+ "language_model.model.layers.3.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
262
+ "language_model.model.layers.3.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
263
+ "language_model.model.layers.3.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
264
+ "language_model.model.layers.3.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
265
+ "language_model.model.layers.3.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
266
+ "language_model.model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
267
+ "language_model.model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
268
+ "language_model.model.layers.3.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
269
+ "language_model.model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
270
+ "language_model.model.layers.3.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
271
+ "language_model.model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
272
+ "language_model.model.layers.4.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
273
+ "language_model.model.layers.4.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
274
+ "language_model.model.layers.4.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
275
+ "language_model.model.layers.4.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
276
+ "language_model.model.layers.4.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
277
+ "language_model.model.layers.4.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
278
+ "language_model.model.layers.4.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
279
+ "language_model.model.layers.4.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
280
+ "language_model.model.layers.4.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
281
+ "language_model.model.layers.4.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
282
+ "language_model.model.layers.4.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
283
+ "language_model.model.layers.4.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
284
+ "language_model.model.layers.5.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
285
+ "language_model.model.layers.5.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
286
+ "language_model.model.layers.5.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
287
+ "language_model.model.layers.5.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
288
+ "language_model.model.layers.5.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
289
+ "language_model.model.layers.5.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
290
+ "language_model.model.layers.5.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
291
+ "language_model.model.layers.5.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
292
+ "language_model.model.layers.5.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
293
+ "language_model.model.layers.5.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
294
+ "language_model.model.layers.5.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
295
+ "language_model.model.layers.5.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
296
+ "language_model.model.layers.6.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
297
+ "language_model.model.layers.6.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
298
+ "language_model.model.layers.6.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
299
+ "language_model.model.layers.6.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
300
+ "language_model.model.layers.6.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
301
+ "language_model.model.layers.6.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
302
+ "language_model.model.layers.6.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
303
+ "language_model.model.layers.6.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
304
+ "language_model.model.layers.6.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
305
+ "language_model.model.layers.6.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
306
+ "language_model.model.layers.6.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
307
+ "language_model.model.layers.6.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
308
+ "language_model.model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
309
+ "language_model.model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
310
+ "language_model.model.layers.7.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
311
+ "language_model.model.layers.7.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
312
+ "language_model.model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
313
+ "language_model.model.layers.7.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
314
+ "language_model.model.layers.7.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
315
+ "language_model.model.layers.7.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
316
+ "language_model.model.layers.7.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
317
+ "language_model.model.layers.7.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
318
+ "language_model.model.layers.7.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
319
+ "language_model.model.layers.7.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
320
+ "language_model.model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
321
+ "language_model.model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
322
+ "language_model.model.layers.8.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
323
+ "language_model.model.layers.8.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
324
+ "language_model.model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
325
+ "language_model.model.layers.8.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
326
+ "language_model.model.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
327
+ "language_model.model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
328
+ "language_model.model.layers.8.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
329
+ "language_model.model.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
330
+ "language_model.model.layers.8.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
331
+ "language_model.model.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
332
+ "language_model.model.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
333
+ "language_model.model.layers.9.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
334
+ "language_model.model.layers.9.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
335
+ "language_model.model.layers.9.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
336
+ "language_model.model.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
337
+ "language_model.model.layers.9.self_attn.k_proj.bias": "pytorch_model-00002-of-00004.bin",
338
+ "language_model.model.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
339
+ "language_model.model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
340
+ "language_model.model.layers.9.self_attn.q_proj.bias": "pytorch_model-00002-of-00004.bin",
341
+ "language_model.model.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
342
+ "language_model.model.layers.9.self_attn.v_proj.bias": "pytorch_model-00002-of-00004.bin",
343
+ "language_model.model.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
344
+ "language_model.model.norm.weight": "pytorch_model-00004-of-00004.bin",
345
+ "mm_projector.projector_1.0.bias": "pytorch_model-00001-of-00004.bin",
346
+ "mm_projector.projector_1.0.weight": "pytorch_model-00001-of-00004.bin",
347
+ "mm_projector.projector_1.2.bias": "pytorch_model-00001-of-00004.bin",
348
+ "mm_projector.projector_1.2.weight": "pytorch_model-00001-of-00004.bin",
349
+ "mm_projector.projector_2.0.bias": "pytorch_model-00001-of-00004.bin",
350
+ "mm_projector.projector_2.0.weight": "pytorch_model-00001-of-00004.bin",
351
+ "mm_projector.projector_2.2.bias": "pytorch_model-00001-of-00004.bin",
352
+ "mm_projector.projector_2.2.weight": "pytorch_model-00001-of-00004.bin",
353
+ "vision_tower.pixel_encoder.conv_in.bias": "pytorch_model-00001-of-00004.bin",
354
+ "vision_tower.pixel_encoder.conv_in.weight": "pytorch_model-00001-of-00004.bin",
355
+ "vision_tower.pixel_encoder.conv_out.bias": "pytorch_model-00001-of-00004.bin",
356
+ "vision_tower.pixel_encoder.conv_out.weight": "pytorch_model-00001-of-00004.bin",
357
+ "vision_tower.pixel_encoder.down.0.block.0.conv1.bias": "pytorch_model-00001-of-00004.bin",
358
+ "vision_tower.pixel_encoder.down.0.block.0.conv1.weight": "pytorch_model-00001-of-00004.bin",
359
+ "vision_tower.pixel_encoder.down.0.block.0.conv2.bias": "pytorch_model-00001-of-00004.bin",
360
+ "vision_tower.pixel_encoder.down.0.block.0.conv2.weight": "pytorch_model-00001-of-00004.bin",
361
+ "vision_tower.pixel_encoder.down.0.block.0.norm1.bias": "pytorch_model-00001-of-00004.bin",
362
+ "vision_tower.pixel_encoder.down.0.block.0.norm1.weight": "pytorch_model-00001-of-00004.bin",
363
+ "vision_tower.pixel_encoder.down.0.block.0.norm2.bias": "pytorch_model-00001-of-00004.bin",
364
+ "vision_tower.pixel_encoder.down.0.block.0.norm2.weight": "pytorch_model-00001-of-00004.bin",
365
+ "vision_tower.pixel_encoder.down.0.block.1.conv1.bias": "pytorch_model-00001-of-00004.bin",
366
+ "vision_tower.pixel_encoder.down.0.block.1.conv1.weight": "pytorch_model-00001-of-00004.bin",
367
+ "vision_tower.pixel_encoder.down.0.block.1.conv2.bias": "pytorch_model-00001-of-00004.bin",
368
+ "vision_tower.pixel_encoder.down.0.block.1.conv2.weight": "pytorch_model-00001-of-00004.bin",
369
+ "vision_tower.pixel_encoder.down.0.block.1.norm1.bias": "pytorch_model-00001-of-00004.bin",
370
+ "vision_tower.pixel_encoder.down.0.block.1.norm1.weight": "pytorch_model-00001-of-00004.bin",
371
+ "vision_tower.pixel_encoder.down.0.block.1.norm2.bias": "pytorch_model-00001-of-00004.bin",
372
+ "vision_tower.pixel_encoder.down.0.block.1.norm2.weight": "pytorch_model-00001-of-00004.bin",
373
+ "vision_tower.pixel_encoder.down.0.downsample.conv.bias": "pytorch_model-00001-of-00004.bin",
374
+ "vision_tower.pixel_encoder.down.0.downsample.conv.weight": "pytorch_model-00001-of-00004.bin",
375
+ "vision_tower.pixel_encoder.down.1.block.0.conv1.bias": "pytorch_model-00001-of-00004.bin",
376
+ "vision_tower.pixel_encoder.down.1.block.0.conv1.weight": "pytorch_model-00001-of-00004.bin",
377
+ "vision_tower.pixel_encoder.down.1.block.0.conv2.bias": "pytorch_model-00001-of-00004.bin",
378
+ "vision_tower.pixel_encoder.down.1.block.0.conv2.weight": "pytorch_model-00001-of-00004.bin",
379
+ "vision_tower.pixel_encoder.down.1.block.0.norm1.bias": "pytorch_model-00001-of-00004.bin",
380
+ "vision_tower.pixel_encoder.down.1.block.0.norm1.weight": "pytorch_model-00001-of-00004.bin",
381
+ "vision_tower.pixel_encoder.down.1.block.0.norm2.bias": "pytorch_model-00001-of-00004.bin",
382
+ "vision_tower.pixel_encoder.down.1.block.0.norm2.weight": "pytorch_model-00001-of-00004.bin",
383
+ "vision_tower.pixel_encoder.down.1.block.1.conv1.bias": "pytorch_model-00001-of-00004.bin",
384
+ "vision_tower.pixel_encoder.down.1.block.1.conv1.weight": "pytorch_model-00001-of-00004.bin",
385
+ "vision_tower.pixel_encoder.down.1.block.1.conv2.bias": "pytorch_model-00001-of-00004.bin",
386
+ "vision_tower.pixel_encoder.down.1.block.1.conv2.weight": "pytorch_model-00001-of-00004.bin",
387
+ "vision_tower.pixel_encoder.down.1.block.1.norm1.bias": "pytorch_model-00001-of-00004.bin",
388
+ "vision_tower.pixel_encoder.down.1.block.1.norm1.weight": "pytorch_model-00001-of-00004.bin",
389
+ "vision_tower.pixel_encoder.down.1.block.1.norm2.bias": "pytorch_model-00001-of-00004.bin",
390
+ "vision_tower.pixel_encoder.down.1.block.1.norm2.weight": "pytorch_model-00001-of-00004.bin",
391
+ "vision_tower.pixel_encoder.down.1.downsample.conv.bias": "pytorch_model-00001-of-00004.bin",
392
+ "vision_tower.pixel_encoder.down.1.downsample.conv.weight": "pytorch_model-00001-of-00004.bin",
393
+ "vision_tower.pixel_encoder.down.2.block.0.conv1.bias": "pytorch_model-00001-of-00004.bin",
394
+ "vision_tower.pixel_encoder.down.2.block.0.conv1.weight": "pytorch_model-00001-of-00004.bin",
395
+ "vision_tower.pixel_encoder.down.2.block.0.conv2.bias": "pytorch_model-00001-of-00004.bin",
396
+ "vision_tower.pixel_encoder.down.2.block.0.conv2.weight": "pytorch_model-00001-of-00004.bin",
397
+ "vision_tower.pixel_encoder.down.2.block.0.nin_shortcut.bias": "pytorch_model-00001-of-00004.bin",
398
+ "vision_tower.pixel_encoder.down.2.block.0.nin_shortcut.weight": "pytorch_model-00001-of-00004.bin",
399
+ "vision_tower.pixel_encoder.down.2.block.0.norm1.bias": "pytorch_model-00001-of-00004.bin",
400
+ "vision_tower.pixel_encoder.down.2.block.0.norm1.weight": "pytorch_model-00001-of-00004.bin",
401
+ "vision_tower.pixel_encoder.down.2.block.0.norm2.bias": "pytorch_model-00001-of-00004.bin",
402
+ "vision_tower.pixel_encoder.down.2.block.0.norm2.weight": "pytorch_model-00001-of-00004.bin",
403
+ "vision_tower.pixel_encoder.down.2.block.1.conv1.bias": "pytorch_model-00001-of-00004.bin",
404
+ "vision_tower.pixel_encoder.down.2.block.1.conv1.weight": "pytorch_model-00001-of-00004.bin",
405
+ "vision_tower.pixel_encoder.down.2.block.1.conv2.bias": "pytorch_model-00001-of-00004.bin",
406
+ "vision_tower.pixel_encoder.down.2.block.1.conv2.weight": "pytorch_model-00001-of-00004.bin",
407
+ "vision_tower.pixel_encoder.down.2.block.1.norm1.bias": "pytorch_model-00001-of-00004.bin",
408
+ "vision_tower.pixel_encoder.down.2.block.1.norm1.weight": "pytorch_model-00001-of-00004.bin",
409
+ "vision_tower.pixel_encoder.down.2.block.1.norm2.bias": "pytorch_model-00001-of-00004.bin",
410
+ "vision_tower.pixel_encoder.down.2.block.1.norm2.weight": "pytorch_model-00001-of-00004.bin",
411
+ "vision_tower.pixel_encoder.down.2.downsample.conv.bias": "pytorch_model-00001-of-00004.bin",
412
+ "vision_tower.pixel_encoder.down.2.downsample.conv.weight": "pytorch_model-00001-of-00004.bin",
413
+ "vision_tower.pixel_encoder.down.3.block.0.conv1.bias": "pytorch_model-00001-of-00004.bin",
414
+ "vision_tower.pixel_encoder.down.3.block.0.conv1.weight": "pytorch_model-00001-of-00004.bin",
415
+ "vision_tower.pixel_encoder.down.3.block.0.conv2.bias": "pytorch_model-00001-of-00004.bin",
416
+ "vision_tower.pixel_encoder.down.3.block.0.conv2.weight": "pytorch_model-00001-of-00004.bin",
417
+ "vision_tower.pixel_encoder.down.3.block.0.norm1.bias": "pytorch_model-00001-of-00004.bin",
418
+ "vision_tower.pixel_encoder.down.3.block.0.norm1.weight": "pytorch_model-00001-of-00004.bin",
419
+ "vision_tower.pixel_encoder.down.3.block.0.norm2.bias": "pytorch_model-00001-of-00004.bin",
420
+ "vision_tower.pixel_encoder.down.3.block.0.norm2.weight": "pytorch_model-00001-of-00004.bin",
421
+ "vision_tower.pixel_encoder.down.3.block.1.conv1.bias": "pytorch_model-00001-of-00004.bin",
422
+ "vision_tower.pixel_encoder.down.3.block.1.conv1.weight": "pytorch_model-00001-of-00004.bin",
423
+ "vision_tower.pixel_encoder.down.3.block.1.conv2.bias": "pytorch_model-00001-of-00004.bin",
424
+ "vision_tower.pixel_encoder.down.3.block.1.conv2.weight": "pytorch_model-00001-of-00004.bin",
425
+ "vision_tower.pixel_encoder.down.3.block.1.norm1.bias": "pytorch_model-00001-of-00004.bin",
426
+ "vision_tower.pixel_encoder.down.3.block.1.norm1.weight": "pytorch_model-00001-of-00004.bin",
427
+ "vision_tower.pixel_encoder.down.3.block.1.norm2.bias": "pytorch_model-00001-of-00004.bin",
428
+ "vision_tower.pixel_encoder.down.3.block.1.norm2.weight": "pytorch_model-00001-of-00004.bin",
429
+ "vision_tower.pixel_encoder.down.3.downsample.conv.bias": "pytorch_model-00001-of-00004.bin",
430
+ "vision_tower.pixel_encoder.down.3.downsample.conv.weight": "pytorch_model-00001-of-00004.bin",
431
+ "vision_tower.pixel_encoder.down.4.attn.0.k.bias": "pytorch_model-00001-of-00004.bin",
432
+ "vision_tower.pixel_encoder.down.4.attn.0.k.weight": "pytorch_model-00001-of-00004.bin",
433
+ "vision_tower.pixel_encoder.down.4.attn.0.norm.bias": "pytorch_model-00001-of-00004.bin",
434
+ "vision_tower.pixel_encoder.down.4.attn.0.norm.weight": "pytorch_model-00001-of-00004.bin",
435
+ "vision_tower.pixel_encoder.down.4.attn.0.proj_out.bias": "pytorch_model-00001-of-00004.bin",
436
+ "vision_tower.pixel_encoder.down.4.attn.0.proj_out.weight": "pytorch_model-00001-of-00004.bin",
437
+ "vision_tower.pixel_encoder.down.4.attn.0.q.bias": "pytorch_model-00001-of-00004.bin",
438
+ "vision_tower.pixel_encoder.down.4.attn.0.q.weight": "pytorch_model-00001-of-00004.bin",
439
+ "vision_tower.pixel_encoder.down.4.attn.0.v.bias": "pytorch_model-00001-of-00004.bin",
440
+ "vision_tower.pixel_encoder.down.4.attn.0.v.weight": "pytorch_model-00001-of-00004.bin",
441
+ "vision_tower.pixel_encoder.down.4.attn.1.k.bias": "pytorch_model-00001-of-00004.bin",
442
+ "vision_tower.pixel_encoder.down.4.attn.1.k.weight": "pytorch_model-00001-of-00004.bin",
443
+ "vision_tower.pixel_encoder.down.4.attn.1.norm.bias": "pytorch_model-00001-of-00004.bin",
444
+ "vision_tower.pixel_encoder.down.4.attn.1.norm.weight": "pytorch_model-00001-of-00004.bin",
445
+ "vision_tower.pixel_encoder.down.4.attn.1.proj_out.bias": "pytorch_model-00001-of-00004.bin",
446
+ "vision_tower.pixel_encoder.down.4.attn.1.proj_out.weight": "pytorch_model-00001-of-00004.bin",
447
+ "vision_tower.pixel_encoder.down.4.attn.1.q.bias": "pytorch_model-00001-of-00004.bin",
448
+ "vision_tower.pixel_encoder.down.4.attn.1.q.weight": "pytorch_model-00001-of-00004.bin",
449
+ "vision_tower.pixel_encoder.down.4.attn.1.v.bias": "pytorch_model-00001-of-00004.bin",
450
+ "vision_tower.pixel_encoder.down.4.attn.1.v.weight": "pytorch_model-00001-of-00004.bin",
451
+ "vision_tower.pixel_encoder.down.4.block.0.conv1.bias": "pytorch_model-00001-of-00004.bin",
452
+ "vision_tower.pixel_encoder.down.4.block.0.conv1.weight": "pytorch_model-00001-of-00004.bin",
453
+ "vision_tower.pixel_encoder.down.4.block.0.conv2.bias": "pytorch_model-00001-of-00004.bin",
454
+ "vision_tower.pixel_encoder.down.4.block.0.conv2.weight": "pytorch_model-00001-of-00004.bin",
455
+ "vision_tower.pixel_encoder.down.4.block.0.nin_shortcut.bias": "pytorch_model-00001-of-00004.bin",
456
+ "vision_tower.pixel_encoder.down.4.block.0.nin_shortcut.weight": "pytorch_model-00001-of-00004.bin",
457
+ "vision_tower.pixel_encoder.down.4.block.0.norm1.bias": "pytorch_model-00001-of-00004.bin",
458
+ "vision_tower.pixel_encoder.down.4.block.0.norm1.weight": "pytorch_model-00001-of-00004.bin",
459
+ "vision_tower.pixel_encoder.down.4.block.0.norm2.bias": "pytorch_model-00001-of-00004.bin",
460
+ "vision_tower.pixel_encoder.down.4.block.0.norm2.weight": "pytorch_model-00001-of-00004.bin",
461
+ "vision_tower.pixel_encoder.down.4.block.1.conv1.bias": "pytorch_model-00001-of-00004.bin",
462
+ "vision_tower.pixel_encoder.down.4.block.1.conv1.weight": "pytorch_model-00001-of-00004.bin",
463
+ "vision_tower.pixel_encoder.down.4.block.1.conv2.bias": "pytorch_model-00001-of-00004.bin",
464
+ "vision_tower.pixel_encoder.down.4.block.1.conv2.weight": "pytorch_model-00001-of-00004.bin",
465
+ "vision_tower.pixel_encoder.down.4.block.1.norm1.bias": "pytorch_model-00001-of-00004.bin",
466
+ "vision_tower.pixel_encoder.down.4.block.1.norm1.weight": "pytorch_model-00001-of-00004.bin",
467
+ "vision_tower.pixel_encoder.down.4.block.1.norm2.bias": "pytorch_model-00001-of-00004.bin",
468
+ "vision_tower.pixel_encoder.down.4.block.1.norm2.weight": "pytorch_model-00001-of-00004.bin",
469
+ "vision_tower.pixel_encoder.mid.attn_1.k.bias": "pytorch_model-00001-of-00004.bin",
470
+ "vision_tower.pixel_encoder.mid.attn_1.k.weight": "pytorch_model-00001-of-00004.bin",
471
+ "vision_tower.pixel_encoder.mid.attn_1.norm.bias": "pytorch_model-00001-of-00004.bin",
472
+ "vision_tower.pixel_encoder.mid.attn_1.norm.weight": "pytorch_model-00001-of-00004.bin",
473
+ "vision_tower.pixel_encoder.mid.attn_1.proj_out.bias": "pytorch_model-00001-of-00004.bin",
474
+ "vision_tower.pixel_encoder.mid.attn_1.proj_out.weight": "pytorch_model-00001-of-00004.bin",
475
+ "vision_tower.pixel_encoder.mid.attn_1.q.bias": "pytorch_model-00001-of-00004.bin",
476
+ "vision_tower.pixel_encoder.mid.attn_1.q.weight": "pytorch_model-00001-of-00004.bin",
477
+ "vision_tower.pixel_encoder.mid.attn_1.v.bias": "pytorch_model-00001-of-00004.bin",
478
+ "vision_tower.pixel_encoder.mid.attn_1.v.weight": "pytorch_model-00001-of-00004.bin",
479
+ "vision_tower.pixel_encoder.mid.block_1.conv1.bias": "pytorch_model-00001-of-00004.bin",
480
+ "vision_tower.pixel_encoder.mid.block_1.conv1.weight": "pytorch_model-00001-of-00004.bin",
481
+ "vision_tower.pixel_encoder.mid.block_1.conv2.bias": "pytorch_model-00001-of-00004.bin",
482
+ "vision_tower.pixel_encoder.mid.block_1.conv2.weight": "pytorch_model-00001-of-00004.bin",
483
+ "vision_tower.pixel_encoder.mid.block_1.norm1.bias": "pytorch_model-00001-of-00004.bin",
484
+ "vision_tower.pixel_encoder.mid.block_1.norm1.weight": "pytorch_model-00001-of-00004.bin",
485
+ "vision_tower.pixel_encoder.mid.block_1.norm2.bias": "pytorch_model-00001-of-00004.bin",
486
+ "vision_tower.pixel_encoder.mid.block_1.norm2.weight": "pytorch_model-00001-of-00004.bin",
487
+ "vision_tower.pixel_encoder.mid.block_2.conv1.bias": "pytorch_model-00001-of-00004.bin",
488
+ "vision_tower.pixel_encoder.mid.block_2.conv1.weight": "pytorch_model-00001-of-00004.bin",
489
+ "vision_tower.pixel_encoder.mid.block_2.conv2.bias": "pytorch_model-00001-of-00004.bin",
490
+ "vision_tower.pixel_encoder.mid.block_2.conv2.weight": "pytorch_model-00001-of-00004.bin",
491
+ "vision_tower.pixel_encoder.mid.block_2.norm1.bias": "pytorch_model-00001-of-00004.bin",
492
+ "vision_tower.pixel_encoder.mid.block_2.norm1.weight": "pytorch_model-00001-of-00004.bin",
493
+ "vision_tower.pixel_encoder.mid.block_2.norm2.bias": "pytorch_model-00001-of-00004.bin",
494
+ "vision_tower.pixel_encoder.mid.block_2.norm2.weight": "pytorch_model-00001-of-00004.bin",
495
+ "vision_tower.pixel_encoder.norm_out.bias": "pytorch_model-00001-of-00004.bin",
496
+ "vision_tower.pixel_encoder.norm_out.weight": "pytorch_model-00001-of-00004.bin",
497
+ "vision_tower.semantic_encoder.blocks.0.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
498
+ "vision_tower.semantic_encoder.blocks.0.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
499
+ "vision_tower.semantic_encoder.blocks.0.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
500
+ "vision_tower.semantic_encoder.blocks.0.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
501
+ "vision_tower.semantic_encoder.blocks.0.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
502
+ "vision_tower.semantic_encoder.blocks.0.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
503
+ "vision_tower.semantic_encoder.blocks.0.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
504
+ "vision_tower.semantic_encoder.blocks.0.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
505
+ "vision_tower.semantic_encoder.blocks.0.norm1.bias": "pytorch_model-00001-of-00004.bin",
506
+ "vision_tower.semantic_encoder.blocks.0.norm1.weight": "pytorch_model-00001-of-00004.bin",
507
+ "vision_tower.semantic_encoder.blocks.0.norm2.bias": "pytorch_model-00001-of-00004.bin",
508
+ "vision_tower.semantic_encoder.blocks.0.norm2.weight": "pytorch_model-00001-of-00004.bin",
509
+ "vision_tower.semantic_encoder.blocks.1.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
510
+ "vision_tower.semantic_encoder.blocks.1.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
511
+ "vision_tower.semantic_encoder.blocks.1.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
512
+ "vision_tower.semantic_encoder.blocks.1.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
513
+ "vision_tower.semantic_encoder.blocks.1.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
514
+ "vision_tower.semantic_encoder.blocks.1.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
515
+ "vision_tower.semantic_encoder.blocks.1.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
516
+ "vision_tower.semantic_encoder.blocks.1.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
517
+ "vision_tower.semantic_encoder.blocks.1.norm1.bias": "pytorch_model-00001-of-00004.bin",
518
+ "vision_tower.semantic_encoder.blocks.1.norm1.weight": "pytorch_model-00001-of-00004.bin",
519
+ "vision_tower.semantic_encoder.blocks.1.norm2.bias": "pytorch_model-00001-of-00004.bin",
520
+ "vision_tower.semantic_encoder.blocks.1.norm2.weight": "pytorch_model-00001-of-00004.bin",
521
+ "vision_tower.semantic_encoder.blocks.10.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
522
+ "vision_tower.semantic_encoder.blocks.10.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
523
+ "vision_tower.semantic_encoder.blocks.10.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
524
+ "vision_tower.semantic_encoder.blocks.10.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
525
+ "vision_tower.semantic_encoder.blocks.10.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
526
+ "vision_tower.semantic_encoder.blocks.10.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
527
+ "vision_tower.semantic_encoder.blocks.10.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
528
+ "vision_tower.semantic_encoder.blocks.10.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
529
+ "vision_tower.semantic_encoder.blocks.10.norm1.bias": "pytorch_model-00001-of-00004.bin",
530
+ "vision_tower.semantic_encoder.blocks.10.norm1.weight": "pytorch_model-00001-of-00004.bin",
531
+ "vision_tower.semantic_encoder.blocks.10.norm2.bias": "pytorch_model-00001-of-00004.bin",
532
+ "vision_tower.semantic_encoder.blocks.10.norm2.weight": "pytorch_model-00001-of-00004.bin",
533
+ "vision_tower.semantic_encoder.blocks.11.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
534
+ "vision_tower.semantic_encoder.blocks.11.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
535
+ "vision_tower.semantic_encoder.blocks.11.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
536
+ "vision_tower.semantic_encoder.blocks.11.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
537
+ "vision_tower.semantic_encoder.blocks.11.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
538
+ "vision_tower.semantic_encoder.blocks.11.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
539
+ "vision_tower.semantic_encoder.blocks.11.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
540
+ "vision_tower.semantic_encoder.blocks.11.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
541
+ "vision_tower.semantic_encoder.blocks.11.norm1.bias": "pytorch_model-00001-of-00004.bin",
542
+ "vision_tower.semantic_encoder.blocks.11.norm1.weight": "pytorch_model-00001-of-00004.bin",
543
+ "vision_tower.semantic_encoder.blocks.11.norm2.bias": "pytorch_model-00001-of-00004.bin",
544
+ "vision_tower.semantic_encoder.blocks.11.norm2.weight": "pytorch_model-00001-of-00004.bin",
545
+ "vision_tower.semantic_encoder.blocks.12.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
546
+ "vision_tower.semantic_encoder.blocks.12.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
547
+ "vision_tower.semantic_encoder.blocks.12.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
548
+ "vision_tower.semantic_encoder.blocks.12.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
549
+ "vision_tower.semantic_encoder.blocks.12.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
550
+ "vision_tower.semantic_encoder.blocks.12.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
551
+ "vision_tower.semantic_encoder.blocks.12.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
552
+ "vision_tower.semantic_encoder.blocks.12.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
553
+ "vision_tower.semantic_encoder.blocks.12.norm1.bias": "pytorch_model-00001-of-00004.bin",
554
+ "vision_tower.semantic_encoder.blocks.12.norm1.weight": "pytorch_model-00001-of-00004.bin",
555
+ "vision_tower.semantic_encoder.blocks.12.norm2.bias": "pytorch_model-00001-of-00004.bin",
556
+ "vision_tower.semantic_encoder.blocks.12.norm2.weight": "pytorch_model-00001-of-00004.bin",
557
+ "vision_tower.semantic_encoder.blocks.13.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
558
+ "vision_tower.semantic_encoder.blocks.13.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
559
+ "vision_tower.semantic_encoder.blocks.13.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
560
+ "vision_tower.semantic_encoder.blocks.13.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
561
+ "vision_tower.semantic_encoder.blocks.13.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
562
+ "vision_tower.semantic_encoder.blocks.13.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
563
+ "vision_tower.semantic_encoder.blocks.13.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
564
+ "vision_tower.semantic_encoder.blocks.13.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
565
+ "vision_tower.semantic_encoder.blocks.13.norm1.bias": "pytorch_model-00001-of-00004.bin",
566
+ "vision_tower.semantic_encoder.blocks.13.norm1.weight": "pytorch_model-00001-of-00004.bin",
567
+ "vision_tower.semantic_encoder.blocks.13.norm2.bias": "pytorch_model-00001-of-00004.bin",
568
+ "vision_tower.semantic_encoder.blocks.13.norm2.weight": "pytorch_model-00001-of-00004.bin",
569
+ "vision_tower.semantic_encoder.blocks.14.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
570
+ "vision_tower.semantic_encoder.blocks.14.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
571
+ "vision_tower.semantic_encoder.blocks.14.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
572
+ "vision_tower.semantic_encoder.blocks.14.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
573
+ "vision_tower.semantic_encoder.blocks.14.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
574
+ "vision_tower.semantic_encoder.blocks.14.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
575
+ "vision_tower.semantic_encoder.blocks.14.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
576
+ "vision_tower.semantic_encoder.blocks.14.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
577
+ "vision_tower.semantic_encoder.blocks.14.norm1.bias": "pytorch_model-00001-of-00004.bin",
578
+ "vision_tower.semantic_encoder.blocks.14.norm1.weight": "pytorch_model-00001-of-00004.bin",
579
+ "vision_tower.semantic_encoder.blocks.14.norm2.bias": "pytorch_model-00001-of-00004.bin",
580
+ "vision_tower.semantic_encoder.blocks.14.norm2.weight": "pytorch_model-00001-of-00004.bin",
581
+ "vision_tower.semantic_encoder.blocks.15.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
582
+ "vision_tower.semantic_encoder.blocks.15.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
583
+ "vision_tower.semantic_encoder.blocks.15.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
584
+ "vision_tower.semantic_encoder.blocks.15.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
585
+ "vision_tower.semantic_encoder.blocks.15.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
586
+ "vision_tower.semantic_encoder.blocks.15.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
587
+ "vision_tower.semantic_encoder.blocks.15.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
588
+ "vision_tower.semantic_encoder.blocks.15.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
589
+ "vision_tower.semantic_encoder.blocks.15.norm1.bias": "pytorch_model-00001-of-00004.bin",
590
+ "vision_tower.semantic_encoder.blocks.15.norm1.weight": "pytorch_model-00001-of-00004.bin",
591
+ "vision_tower.semantic_encoder.blocks.15.norm2.bias": "pytorch_model-00001-of-00004.bin",
592
+ "vision_tower.semantic_encoder.blocks.15.norm2.weight": "pytorch_model-00001-of-00004.bin",
593
+ "vision_tower.semantic_encoder.blocks.16.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
594
+ "vision_tower.semantic_encoder.blocks.16.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
595
+ "vision_tower.semantic_encoder.blocks.16.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
596
+ "vision_tower.semantic_encoder.blocks.16.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
597
+ "vision_tower.semantic_encoder.blocks.16.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
598
+ "vision_tower.semantic_encoder.blocks.16.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
599
+ "vision_tower.semantic_encoder.blocks.16.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
600
+ "vision_tower.semantic_encoder.blocks.16.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
601
+ "vision_tower.semantic_encoder.blocks.16.norm1.bias": "pytorch_model-00001-of-00004.bin",
602
+ "vision_tower.semantic_encoder.blocks.16.norm1.weight": "pytorch_model-00001-of-00004.bin",
603
+ "vision_tower.semantic_encoder.blocks.16.norm2.bias": "pytorch_model-00001-of-00004.bin",
604
+ "vision_tower.semantic_encoder.blocks.16.norm2.weight": "pytorch_model-00001-of-00004.bin",
605
+ "vision_tower.semantic_encoder.blocks.17.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
606
+ "vision_tower.semantic_encoder.blocks.17.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
607
+ "vision_tower.semantic_encoder.blocks.17.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
608
+ "vision_tower.semantic_encoder.blocks.17.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
609
+ "vision_tower.semantic_encoder.blocks.17.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
610
+ "vision_tower.semantic_encoder.blocks.17.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
611
+ "vision_tower.semantic_encoder.blocks.17.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
612
+ "vision_tower.semantic_encoder.blocks.17.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
613
+ "vision_tower.semantic_encoder.blocks.17.norm1.bias": "pytorch_model-00001-of-00004.bin",
614
+ "vision_tower.semantic_encoder.blocks.17.norm1.weight": "pytorch_model-00001-of-00004.bin",
615
+ "vision_tower.semantic_encoder.blocks.17.norm2.bias": "pytorch_model-00001-of-00004.bin",
616
+ "vision_tower.semantic_encoder.blocks.17.norm2.weight": "pytorch_model-00001-of-00004.bin",
617
+ "vision_tower.semantic_encoder.blocks.18.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
618
+ "vision_tower.semantic_encoder.blocks.18.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
619
+ "vision_tower.semantic_encoder.blocks.18.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
620
+ "vision_tower.semantic_encoder.blocks.18.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
621
+ "vision_tower.semantic_encoder.blocks.18.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
622
+ "vision_tower.semantic_encoder.blocks.18.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
623
+ "vision_tower.semantic_encoder.blocks.18.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
624
+ "vision_tower.semantic_encoder.blocks.18.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
625
+ "vision_tower.semantic_encoder.blocks.18.norm1.bias": "pytorch_model-00001-of-00004.bin",
626
+ "vision_tower.semantic_encoder.blocks.18.norm1.weight": "pytorch_model-00001-of-00004.bin",
627
+ "vision_tower.semantic_encoder.blocks.18.norm2.bias": "pytorch_model-00001-of-00004.bin",
628
+ "vision_tower.semantic_encoder.blocks.18.norm2.weight": "pytorch_model-00001-of-00004.bin",
629
+ "vision_tower.semantic_encoder.blocks.19.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
630
+ "vision_tower.semantic_encoder.blocks.19.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
631
+ "vision_tower.semantic_encoder.blocks.19.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
632
+ "vision_tower.semantic_encoder.blocks.19.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
633
+ "vision_tower.semantic_encoder.blocks.19.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
634
+ "vision_tower.semantic_encoder.blocks.19.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
635
+ "vision_tower.semantic_encoder.blocks.19.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
636
+ "vision_tower.semantic_encoder.blocks.19.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
637
+ "vision_tower.semantic_encoder.blocks.19.norm1.bias": "pytorch_model-00001-of-00004.bin",
638
+ "vision_tower.semantic_encoder.blocks.19.norm1.weight": "pytorch_model-00001-of-00004.bin",
639
+ "vision_tower.semantic_encoder.blocks.19.norm2.bias": "pytorch_model-00001-of-00004.bin",
640
+ "vision_tower.semantic_encoder.blocks.19.norm2.weight": "pytorch_model-00001-of-00004.bin",
641
+ "vision_tower.semantic_encoder.blocks.2.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
642
+ "vision_tower.semantic_encoder.blocks.2.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
643
+ "vision_tower.semantic_encoder.blocks.2.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
644
+ "vision_tower.semantic_encoder.blocks.2.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
645
+ "vision_tower.semantic_encoder.blocks.2.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
646
+ "vision_tower.semantic_encoder.blocks.2.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
647
+ "vision_tower.semantic_encoder.blocks.2.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
648
+ "vision_tower.semantic_encoder.blocks.2.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
649
+ "vision_tower.semantic_encoder.blocks.2.norm1.bias": "pytorch_model-00001-of-00004.bin",
650
+ "vision_tower.semantic_encoder.blocks.2.norm1.weight": "pytorch_model-00001-of-00004.bin",
651
+ "vision_tower.semantic_encoder.blocks.2.norm2.bias": "pytorch_model-00001-of-00004.bin",
652
+ "vision_tower.semantic_encoder.blocks.2.norm2.weight": "pytorch_model-00001-of-00004.bin",
653
+ "vision_tower.semantic_encoder.blocks.20.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
654
+ "vision_tower.semantic_encoder.blocks.20.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
655
+ "vision_tower.semantic_encoder.blocks.20.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
656
+ "vision_tower.semantic_encoder.blocks.20.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
657
+ "vision_tower.semantic_encoder.blocks.20.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
658
+ "vision_tower.semantic_encoder.blocks.20.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
659
+ "vision_tower.semantic_encoder.blocks.20.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
660
+ "vision_tower.semantic_encoder.blocks.20.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
661
+ "vision_tower.semantic_encoder.blocks.20.norm1.bias": "pytorch_model-00001-of-00004.bin",
662
+ "vision_tower.semantic_encoder.blocks.20.norm1.weight": "pytorch_model-00001-of-00004.bin",
663
+ "vision_tower.semantic_encoder.blocks.20.norm2.bias": "pytorch_model-00001-of-00004.bin",
664
+ "vision_tower.semantic_encoder.blocks.20.norm2.weight": "pytorch_model-00001-of-00004.bin",
665
+ "vision_tower.semantic_encoder.blocks.21.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
666
+ "vision_tower.semantic_encoder.blocks.21.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
667
+ "vision_tower.semantic_encoder.blocks.21.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
668
+ "vision_tower.semantic_encoder.blocks.21.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
669
+ "vision_tower.semantic_encoder.blocks.21.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
670
+ "vision_tower.semantic_encoder.blocks.21.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
671
+ "vision_tower.semantic_encoder.blocks.21.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
672
+ "vision_tower.semantic_encoder.blocks.21.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
673
+ "vision_tower.semantic_encoder.blocks.21.norm1.bias": "pytorch_model-00001-of-00004.bin",
674
+ "vision_tower.semantic_encoder.blocks.21.norm1.weight": "pytorch_model-00001-of-00004.bin",
675
+ "vision_tower.semantic_encoder.blocks.21.norm2.bias": "pytorch_model-00001-of-00004.bin",
676
+ "vision_tower.semantic_encoder.blocks.21.norm2.weight": "pytorch_model-00001-of-00004.bin",
677
+ "vision_tower.semantic_encoder.blocks.22.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
678
+ "vision_tower.semantic_encoder.blocks.22.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
679
+ "vision_tower.semantic_encoder.blocks.22.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
680
+ "vision_tower.semantic_encoder.blocks.22.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
681
+ "vision_tower.semantic_encoder.blocks.22.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
682
+ "vision_tower.semantic_encoder.blocks.22.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
683
+ "vision_tower.semantic_encoder.blocks.22.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
684
+ "vision_tower.semantic_encoder.blocks.22.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
685
+ "vision_tower.semantic_encoder.blocks.22.norm1.bias": "pytorch_model-00001-of-00004.bin",
686
+ "vision_tower.semantic_encoder.blocks.22.norm1.weight": "pytorch_model-00001-of-00004.bin",
687
+ "vision_tower.semantic_encoder.blocks.22.norm2.bias": "pytorch_model-00001-of-00004.bin",
688
+ "vision_tower.semantic_encoder.blocks.22.norm2.weight": "pytorch_model-00001-of-00004.bin",
689
+ "vision_tower.semantic_encoder.blocks.23.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
690
+ "vision_tower.semantic_encoder.blocks.23.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
691
+ "vision_tower.semantic_encoder.blocks.23.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
692
+ "vision_tower.semantic_encoder.blocks.23.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
693
+ "vision_tower.semantic_encoder.blocks.23.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
694
+ "vision_tower.semantic_encoder.blocks.23.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
695
+ "vision_tower.semantic_encoder.blocks.23.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
696
+ "vision_tower.semantic_encoder.blocks.23.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
697
+ "vision_tower.semantic_encoder.blocks.23.norm1.bias": "pytorch_model-00001-of-00004.bin",
698
+ "vision_tower.semantic_encoder.blocks.23.norm1.weight": "pytorch_model-00001-of-00004.bin",
699
+ "vision_tower.semantic_encoder.blocks.23.norm2.bias": "pytorch_model-00001-of-00004.bin",
700
+ "vision_tower.semantic_encoder.blocks.23.norm2.weight": "pytorch_model-00001-of-00004.bin",
701
+ "vision_tower.semantic_encoder.blocks.24.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
702
+ "vision_tower.semantic_encoder.blocks.24.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
703
+ "vision_tower.semantic_encoder.blocks.24.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
704
+ "vision_tower.semantic_encoder.blocks.24.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
705
+ "vision_tower.semantic_encoder.blocks.24.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
706
+ "vision_tower.semantic_encoder.blocks.24.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
707
+ "vision_tower.semantic_encoder.blocks.24.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
708
+ "vision_tower.semantic_encoder.blocks.24.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
709
+ "vision_tower.semantic_encoder.blocks.24.norm1.bias": "pytorch_model-00001-of-00004.bin",
710
+ "vision_tower.semantic_encoder.blocks.24.norm1.weight": "pytorch_model-00001-of-00004.bin",
711
+ "vision_tower.semantic_encoder.blocks.24.norm2.bias": "pytorch_model-00001-of-00004.bin",
712
+ "vision_tower.semantic_encoder.blocks.24.norm2.weight": "pytorch_model-00001-of-00004.bin",
713
+ "vision_tower.semantic_encoder.blocks.25.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
714
+ "vision_tower.semantic_encoder.blocks.25.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
715
+ "vision_tower.semantic_encoder.blocks.25.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
716
+ "vision_tower.semantic_encoder.blocks.25.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
717
+ "vision_tower.semantic_encoder.blocks.25.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
718
+ "vision_tower.semantic_encoder.blocks.25.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
719
+ "vision_tower.semantic_encoder.blocks.25.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
720
+ "vision_tower.semantic_encoder.blocks.25.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
721
+ "vision_tower.semantic_encoder.blocks.25.norm1.bias": "pytorch_model-00001-of-00004.bin",
722
+ "vision_tower.semantic_encoder.blocks.25.norm1.weight": "pytorch_model-00001-of-00004.bin",
723
+ "vision_tower.semantic_encoder.blocks.25.norm2.bias": "pytorch_model-00001-of-00004.bin",
724
+ "vision_tower.semantic_encoder.blocks.25.norm2.weight": "pytorch_model-00001-of-00004.bin",
725
+ "vision_tower.semantic_encoder.blocks.26.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
726
+ "vision_tower.semantic_encoder.blocks.26.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
727
+ "vision_tower.semantic_encoder.blocks.26.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
728
+ "vision_tower.semantic_encoder.blocks.26.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
729
+ "vision_tower.semantic_encoder.blocks.26.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
730
+ "vision_tower.semantic_encoder.blocks.26.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
731
+ "vision_tower.semantic_encoder.blocks.26.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
732
+ "vision_tower.semantic_encoder.blocks.26.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
733
+ "vision_tower.semantic_encoder.blocks.26.norm1.bias": "pytorch_model-00001-of-00004.bin",
734
+ "vision_tower.semantic_encoder.blocks.26.norm1.weight": "pytorch_model-00001-of-00004.bin",
735
+ "vision_tower.semantic_encoder.blocks.26.norm2.bias": "pytorch_model-00001-of-00004.bin",
736
+ "vision_tower.semantic_encoder.blocks.26.norm2.weight": "pytorch_model-00001-of-00004.bin",
737
+ "vision_tower.semantic_encoder.blocks.27.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
738
+ "vision_tower.semantic_encoder.blocks.27.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
739
+ "vision_tower.semantic_encoder.blocks.27.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
740
+ "vision_tower.semantic_encoder.blocks.27.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
741
+ "vision_tower.semantic_encoder.blocks.27.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
742
+ "vision_tower.semantic_encoder.blocks.27.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
743
+ "vision_tower.semantic_encoder.blocks.27.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
744
+ "vision_tower.semantic_encoder.blocks.27.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
745
+ "vision_tower.semantic_encoder.blocks.27.norm1.bias": "pytorch_model-00001-of-00004.bin",
746
+ "vision_tower.semantic_encoder.blocks.27.norm1.weight": "pytorch_model-00001-of-00004.bin",
747
+ "vision_tower.semantic_encoder.blocks.27.norm2.bias": "pytorch_model-00001-of-00004.bin",
748
+ "vision_tower.semantic_encoder.blocks.27.norm2.weight": "pytorch_model-00001-of-00004.bin",
749
+ "vision_tower.semantic_encoder.blocks.28.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
750
+ "vision_tower.semantic_encoder.blocks.28.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
751
+ "vision_tower.semantic_encoder.blocks.28.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
752
+ "vision_tower.semantic_encoder.blocks.28.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
753
+ "vision_tower.semantic_encoder.blocks.28.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
754
+ "vision_tower.semantic_encoder.blocks.28.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
755
+ "vision_tower.semantic_encoder.blocks.28.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
756
+ "vision_tower.semantic_encoder.blocks.28.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
757
+ "vision_tower.semantic_encoder.blocks.28.norm1.bias": "pytorch_model-00001-of-00004.bin",
758
+ "vision_tower.semantic_encoder.blocks.28.norm1.weight": "pytorch_model-00001-of-00004.bin",
759
+ "vision_tower.semantic_encoder.blocks.28.norm2.bias": "pytorch_model-00001-of-00004.bin",
760
+ "vision_tower.semantic_encoder.blocks.28.norm2.weight": "pytorch_model-00001-of-00004.bin",
761
+ "vision_tower.semantic_encoder.blocks.29.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
762
+ "vision_tower.semantic_encoder.blocks.29.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
763
+ "vision_tower.semantic_encoder.blocks.29.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
764
+ "vision_tower.semantic_encoder.blocks.29.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
765
+ "vision_tower.semantic_encoder.blocks.29.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
766
+ "vision_tower.semantic_encoder.blocks.29.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
767
+ "vision_tower.semantic_encoder.blocks.29.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
768
+ "vision_tower.semantic_encoder.blocks.29.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
769
+ "vision_tower.semantic_encoder.blocks.29.norm1.bias": "pytorch_model-00001-of-00004.bin",
770
+ "vision_tower.semantic_encoder.blocks.29.norm1.weight": "pytorch_model-00001-of-00004.bin",
771
+ "vision_tower.semantic_encoder.blocks.29.norm2.bias": "pytorch_model-00001-of-00004.bin",
772
+ "vision_tower.semantic_encoder.blocks.29.norm2.weight": "pytorch_model-00001-of-00004.bin",
773
+ "vision_tower.semantic_encoder.blocks.3.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
774
+ "vision_tower.semantic_encoder.blocks.3.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
775
+ "vision_tower.semantic_encoder.blocks.3.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
776
+ "vision_tower.semantic_encoder.blocks.3.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
777
+ "vision_tower.semantic_encoder.blocks.3.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
778
+ "vision_tower.semantic_encoder.blocks.3.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
779
+ "vision_tower.semantic_encoder.blocks.3.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
780
+ "vision_tower.semantic_encoder.blocks.3.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
781
+ "vision_tower.semantic_encoder.blocks.3.norm1.bias": "pytorch_model-00001-of-00004.bin",
782
+ "vision_tower.semantic_encoder.blocks.3.norm1.weight": "pytorch_model-00001-of-00004.bin",
783
+ "vision_tower.semantic_encoder.blocks.3.norm2.bias": "pytorch_model-00001-of-00004.bin",
784
+ "vision_tower.semantic_encoder.blocks.3.norm2.weight": "pytorch_model-00001-of-00004.bin",
785
+ "vision_tower.semantic_encoder.blocks.30.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
786
+ "vision_tower.semantic_encoder.blocks.30.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
787
+ "vision_tower.semantic_encoder.blocks.30.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
788
+ "vision_tower.semantic_encoder.blocks.30.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
789
+ "vision_tower.semantic_encoder.blocks.30.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
790
+ "vision_tower.semantic_encoder.blocks.30.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
791
+ "vision_tower.semantic_encoder.blocks.30.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
792
+ "vision_tower.semantic_encoder.blocks.30.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
793
+ "vision_tower.semantic_encoder.blocks.30.norm1.bias": "pytorch_model-00001-of-00004.bin",
794
+ "vision_tower.semantic_encoder.blocks.30.norm1.weight": "pytorch_model-00001-of-00004.bin",
795
+ "vision_tower.semantic_encoder.blocks.30.norm2.bias": "pytorch_model-00001-of-00004.bin",
796
+ "vision_tower.semantic_encoder.blocks.30.norm2.weight": "pytorch_model-00001-of-00004.bin",
797
+ "vision_tower.semantic_encoder.blocks.31.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
798
+ "vision_tower.semantic_encoder.blocks.31.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
799
+ "vision_tower.semantic_encoder.blocks.31.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
800
+ "vision_tower.semantic_encoder.blocks.31.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
801
+ "vision_tower.semantic_encoder.blocks.31.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
802
+ "vision_tower.semantic_encoder.blocks.31.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
803
+ "vision_tower.semantic_encoder.blocks.31.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
804
+ "vision_tower.semantic_encoder.blocks.31.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
805
+ "vision_tower.semantic_encoder.blocks.31.norm1.bias": "pytorch_model-00001-of-00004.bin",
806
+ "vision_tower.semantic_encoder.blocks.31.norm1.weight": "pytorch_model-00001-of-00004.bin",
807
+ "vision_tower.semantic_encoder.blocks.31.norm2.bias": "pytorch_model-00001-of-00004.bin",
808
+ "vision_tower.semantic_encoder.blocks.31.norm2.weight": "pytorch_model-00001-of-00004.bin",
809
+ "vision_tower.semantic_encoder.blocks.4.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
810
+ "vision_tower.semantic_encoder.blocks.4.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
811
+ "vision_tower.semantic_encoder.blocks.4.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
812
+ "vision_tower.semantic_encoder.blocks.4.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
813
+ "vision_tower.semantic_encoder.blocks.4.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
814
+ "vision_tower.semantic_encoder.blocks.4.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
815
+ "vision_tower.semantic_encoder.blocks.4.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
816
+ "vision_tower.semantic_encoder.blocks.4.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
817
+ "vision_tower.semantic_encoder.blocks.4.norm1.bias": "pytorch_model-00001-of-00004.bin",
818
+ "vision_tower.semantic_encoder.blocks.4.norm1.weight": "pytorch_model-00001-of-00004.bin",
819
+ "vision_tower.semantic_encoder.blocks.4.norm2.bias": "pytorch_model-00001-of-00004.bin",
820
+ "vision_tower.semantic_encoder.blocks.4.norm2.weight": "pytorch_model-00001-of-00004.bin",
821
+ "vision_tower.semantic_encoder.blocks.5.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
822
+ "vision_tower.semantic_encoder.blocks.5.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
823
+ "vision_tower.semantic_encoder.blocks.5.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
824
+ "vision_tower.semantic_encoder.blocks.5.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
825
+ "vision_tower.semantic_encoder.blocks.5.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
826
+ "vision_tower.semantic_encoder.blocks.5.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
827
+ "vision_tower.semantic_encoder.blocks.5.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
828
+ "vision_tower.semantic_encoder.blocks.5.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
829
+ "vision_tower.semantic_encoder.blocks.5.norm1.bias": "pytorch_model-00001-of-00004.bin",
830
+ "vision_tower.semantic_encoder.blocks.5.norm1.weight": "pytorch_model-00001-of-00004.bin",
831
+ "vision_tower.semantic_encoder.blocks.5.norm2.bias": "pytorch_model-00001-of-00004.bin",
832
+ "vision_tower.semantic_encoder.blocks.5.norm2.weight": "pytorch_model-00001-of-00004.bin",
833
+ "vision_tower.semantic_encoder.blocks.6.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
834
+ "vision_tower.semantic_encoder.blocks.6.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
835
+ "vision_tower.semantic_encoder.blocks.6.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
836
+ "vision_tower.semantic_encoder.blocks.6.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
837
+ "vision_tower.semantic_encoder.blocks.6.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
838
+ "vision_tower.semantic_encoder.blocks.6.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
839
+ "vision_tower.semantic_encoder.blocks.6.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
840
+ "vision_tower.semantic_encoder.blocks.6.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
841
+ "vision_tower.semantic_encoder.blocks.6.norm1.bias": "pytorch_model-00001-of-00004.bin",
842
+ "vision_tower.semantic_encoder.blocks.6.norm1.weight": "pytorch_model-00001-of-00004.bin",
843
+ "vision_tower.semantic_encoder.blocks.6.norm2.bias": "pytorch_model-00001-of-00004.bin",
844
+ "vision_tower.semantic_encoder.blocks.6.norm2.weight": "pytorch_model-00001-of-00004.bin",
845
+ "vision_tower.semantic_encoder.blocks.7.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
846
+ "vision_tower.semantic_encoder.blocks.7.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
847
+ "vision_tower.semantic_encoder.blocks.7.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
848
+ "vision_tower.semantic_encoder.blocks.7.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
849
+ "vision_tower.semantic_encoder.blocks.7.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
850
+ "vision_tower.semantic_encoder.blocks.7.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
851
+ "vision_tower.semantic_encoder.blocks.7.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
852
+ "vision_tower.semantic_encoder.blocks.7.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
853
+ "vision_tower.semantic_encoder.blocks.7.norm1.bias": "pytorch_model-00001-of-00004.bin",
854
+ "vision_tower.semantic_encoder.blocks.7.norm1.weight": "pytorch_model-00001-of-00004.bin",
855
+ "vision_tower.semantic_encoder.blocks.7.norm2.bias": "pytorch_model-00001-of-00004.bin",
856
+ "vision_tower.semantic_encoder.blocks.7.norm2.weight": "pytorch_model-00001-of-00004.bin",
857
+ "vision_tower.semantic_encoder.blocks.8.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
858
+ "vision_tower.semantic_encoder.blocks.8.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
859
+ "vision_tower.semantic_encoder.blocks.8.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
860
+ "vision_tower.semantic_encoder.blocks.8.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
861
+ "vision_tower.semantic_encoder.blocks.8.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
862
+ "vision_tower.semantic_encoder.blocks.8.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
863
+ "vision_tower.semantic_encoder.blocks.8.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
864
+ "vision_tower.semantic_encoder.blocks.8.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
865
+ "vision_tower.semantic_encoder.blocks.8.norm1.bias": "pytorch_model-00001-of-00004.bin",
866
+ "vision_tower.semantic_encoder.blocks.8.norm1.weight": "pytorch_model-00001-of-00004.bin",
867
+ "vision_tower.semantic_encoder.blocks.8.norm2.bias": "pytorch_model-00001-of-00004.bin",
868
+ "vision_tower.semantic_encoder.blocks.8.norm2.weight": "pytorch_model-00001-of-00004.bin",
869
+ "vision_tower.semantic_encoder.blocks.9.attn.proj.bias": "pytorch_model-00001-of-00004.bin",
870
+ "vision_tower.semantic_encoder.blocks.9.attn.proj.weight": "pytorch_model-00001-of-00004.bin",
871
+ "vision_tower.semantic_encoder.blocks.9.attn.qkv.bias": "pytorch_model-00001-of-00004.bin",
872
+ "vision_tower.semantic_encoder.blocks.9.attn.qkv.weight": "pytorch_model-00001-of-00004.bin",
873
+ "vision_tower.semantic_encoder.blocks.9.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
874
+ "vision_tower.semantic_encoder.blocks.9.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
875
+ "vision_tower.semantic_encoder.blocks.9.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
876
+ "vision_tower.semantic_encoder.blocks.9.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
877
+ "vision_tower.semantic_encoder.blocks.9.norm1.bias": "pytorch_model-00001-of-00004.bin",
878
+ "vision_tower.semantic_encoder.blocks.9.norm1.weight": "pytorch_model-00001-of-00004.bin",
879
+ "vision_tower.semantic_encoder.blocks.9.norm2.bias": "pytorch_model-00001-of-00004.bin",
880
+ "vision_tower.semantic_encoder.blocks.9.norm2.weight": "pytorch_model-00001-of-00004.bin",
881
+ "vision_tower.semantic_encoder.merger.ln_q.bias": "pytorch_model-00001-of-00004.bin",
882
+ "vision_tower.semantic_encoder.merger.ln_q.weight": "pytorch_model-00001-of-00004.bin",
883
+ "vision_tower.semantic_encoder.merger.mlp.0.bias": "pytorch_model-00001-of-00004.bin",
884
+ "vision_tower.semantic_encoder.merger.mlp.0.weight": "pytorch_model-00001-of-00004.bin",
885
+ "vision_tower.semantic_encoder.merger.mlp.2.bias": "pytorch_model-00001-of-00004.bin",
886
+ "vision_tower.semantic_encoder.merger.mlp.2.weight": "pytorch_model-00001-of-00004.bin",
887
+ "vision_tower.semantic_encoder.patch_embed.proj.weight": "pytorch_model-00001-of-00004.bin"
888
+ }
889
+ }
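For reference (illustrative, not part of the committed files): the index above follows the standard sharded-checkpoint layout, in which a top-level `weight_map` maps every parameter name to the shard file that stores it. A minimal sketch of loading a single tensor through such an index, assuming that layout and that the shards sit next to `pytorch_model.bin.index.json`, could look like this:

import json
import os

import torch

def load_param(checkpoint_dir, param_name):
    # Look up which shard holds the parameter, then load only that shard.
    with open(os.path.join(checkpoint_dir, "pytorch_model.bin.index.json")) as f:
        index = json.load(f)
    shard_file = index["weight_map"][param_name]  # e.g. "pytorch_model-00001-of-00004.bin"
    shard = torch.load(os.path.join(checkpoint_dir, shard_file), map_location="cpu")
    return shard[param_name]

# Hypothetical usage:
# w = load_param(".", "vision_tower.semantic_encoder.patch_embed.proj.weight")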
sdxl_decoder_pipe.py ADDED
@@ -0,0 +1,901 @@
1
+ # Modify from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
2
+ import inspect
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from einops import repeat, rearrange
12
+
13
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
14
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
15
+
16
+ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
17
+ from diffusers.schedulers import KarrasDiffusionSchedulers
18
+
19
+ from diffusers.utils.torch_utils import randn_tensor
20
+ import PIL.Image
21
+
22
+ from diffusers.models.attention_processor import (
23
+ AttnProcessor2_0,
24
+ FusedAttnProcessor2_0,
25
+ XFormersAttnProcessor,
26
+ )
27
+
28
+ from diffusers.utils import (
29
+ USE_PEFT_BACKEND,
30
+ deprecate,
31
+ is_invisible_watermark_available,
32
+ is_torch_xla_available,
33
+ logging,
34
+ replace_example_docstring,
35
+ scale_lora_layers,
36
+ unscale_lora_layers,
37
+ )
38
+
39
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
40
+ from diffusers.loaders import (
41
+ FromSingleFileMixin,
42
+ IPAdapterMixin,
43
+ StableDiffusionXLLoraLoaderMixin,
44
+ TextualInversionLoaderMixin,
45
+ )
46
+
47
+ if is_invisible_watermark_available():
48
+ from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
49
+
50
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
51
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import StableDiffusionXLPipeline, \
52
+ retrieve_timesteps, rescale_noise_cfg
53
+
54
+ from torchvision.transforms import Compose, Resize, CenterCrop, Normalize, InterpolationMode
55
+
56
+ if is_torch_xla_available():
57
+ import torch_xla.core.xla_model as xm
58
+
59
+ XLA_AVAILABLE = True
60
+ else:
61
+ XLA_AVAILABLE = False
62
+
63
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
64
+
65
+
66
+ @dataclass
67
+ class StableDiffusionXLDecoderPipelineOutput(StableDiffusionXLPipelineOutput):
68
+ images: Union[List[PIL.Image.Image], np.ndarray]
69
+ indices_semantic: Optional[torch.Tensor] = None
70
+ indices_pixel: Optional[torch.Tensor] = None
71
+
72
+
73
+ def expand_dims_like(x, y):
74
+ while x.dim() != y.dim():
75
+ x = x.unsqueeze(-1)
76
+ return x
77
+
78
+
79
+ class AbstractEmbModel(nn.Module):
80
+ def __init__(self):
81
+ super().__init__()
82
+ self._is_trainable = None
83
+ self._ucg_rate = None
84
+ self._input_key = None
85
+
86
+ @property
87
+ def is_trainable(self) -> bool:
88
+ return self._is_trainable
89
+
90
+ @property
91
+ def ucg_rate(self) -> Union[float, torch.Tensor]:
92
+ return self._ucg_rate
93
+
94
+ @property
95
+ def input_key(self) -> str:
96
+ return self._input_key
97
+
98
+ @is_trainable.setter
99
+ def is_trainable(self, value: bool):
100
+ self._is_trainable = value
101
+
102
+ @ucg_rate.setter
103
+ def ucg_rate(self, value: Union[float, torch.Tensor]):
104
+ self._ucg_rate = value
105
+
106
+ @input_key.setter
107
+ def input_key(self, value: str):
108
+ self._input_key = value
109
+
110
+ @is_trainable.deleter
111
+ def is_trainable(self):
112
+ del self._is_trainable
113
+
114
+ @ucg_rate.deleter
115
+ def ucg_rate(self):
116
+ del self._ucg_rate
117
+
118
+ @input_key.deleter
119
+ def input_key(self):
120
+ del self._input_key
121
+
122
+
123
+ class DualViTok2ImageEmbedder(AbstractEmbModel):
124
+ def __init__(
125
+ self,
126
+ image_processor=None,
127
+ vq_model=None,
128
+ device="cuda",
129
+ dtype=torch.float32,
130
+ freeze=True,
131
+ image_size=0,
132
+ resize_factor=1,
133
+ not_bicubic=True,
134
+ return_sequence=False,
135
+ grid_feature_scale=1,
136
+ texture_drop_prob=0,
137
+ semantic_drop_prob=0,
138
+ pixel_channel=32,
139
+ semantic_channel=32,
140
+ ):
141
+ super().__init__()
142
+ vq_model.to(device=device, dtype=dtype)
143
+ vq_model.eval()
144
+
145
+ self.processor = image_processor
146
+
147
+ self.model = vq_model
148
+ self.device = device
149
+ if freeze:
150
+ self.freeze()
151
+
152
+ if image_size > 0:
153
+ preprocessor = [
154
+ Resize(image_size) if not_bicubic else Resize(image_size, interpolation=InterpolationMode.BICUBIC)]
155
+ preprocessor += [
156
+ CenterCrop(image_size),
157
+ ]
158
+ self.preprocessor = Compose(preprocessor)
159
+ self.image_size = image_size
160
+ self.resize_factor = resize_factor
161
+ self.not_bicubic = not_bicubic
162
+ self.return_sequence = return_sequence
163
+ self.grid_feature_scale = grid_feature_scale
164
+ self.texture_drop_prob = texture_drop_prob
165
+ self.semantic_drop_prob = semantic_drop_prob
166
+ self.pixel_channel = pixel_channel
167
+ self.semantic_channel = semantic_channel
168
+
169
+ def freeze(self):
170
+ self.model = self.model.eval()
171
+ for param in self.parameters():
172
+ param.requires_grad = False
173
+
174
+ def vq_encode(self, image):
175
+ if image.ndim == 5:
176
+ assert image.size(1) == 1
177
+ image = image.squeeze(1)
178
+ bs, _, h, w = image.shape
179
+
180
+ if self.image_size > 0:
181
+ image = self.preprocessor(image)
182
+ else:
183
+ assert self.resize_factor > 0
184
+ preprocessor = Resize((int(h * self.resize_factor), int(w * self.resize_factor))) if self.not_bicubic else \
185
+ Resize((int(h * self.resize_factor), int(w * self.resize_factor)),
186
+ interpolation=InterpolationMode.BICUBIC)
187
+ image = preprocessor(image)
188
+
189
+ inputs = dict(image=image)
190
+ inputs = self.model.get_input(inputs)
191
+
192
+ (quant_semantic, diff_semantic, indices_semantic, target_semantic), \
193
+ (quant_pixel, diff_pixel, indices_pixel) = self.model.encode(**inputs)
194
+ return indices_semantic, indices_pixel
195
+
196
+ def vq_encode_code(self, image):
197
+ indices_semantic, indices_pixel = self.vq_encode(image)
199
+ return indices_semantic, indices_pixel
200
+
201
+ def vq_decode_code(self, indices_semantic, indices_pixel):
202
+ return self.model.decode_code(indices_semantic, indices_pixel)
203
+
204
+ def forward(self, image, return_indices=False):
205
+ if image.ndim == 5:
206
+ assert image.size(1) == 1
207
+ image = image.squeeze(1)
208
+ bs, _, h, w = image.shape
209
+
210
+ if self.image_size > 0:
211
+ image = self.preprocessor(image)
212
+ else:
213
+ assert self.resize_factor > 0
214
+ preprocessor = Resize((int(h * self.resize_factor), int(w * self.resize_factor))) if self.not_bicubic else \
215
+ Resize((int(h * self.resize_factor), int(w * self.resize_factor)),
216
+ interpolation=InterpolationMode.BICUBIC)
217
+ image = preprocessor(image)
218
+
219
+ inputs = dict(image=image)
220
+ inputs = self.model.get_input(inputs)
221
+
222
+ (quant_semantic, diff_semantic, indices_semantic, target_semantic), \
223
+ (quant_pixel, diff_pixel, indices_pixel) = self.model.encode(**inputs)
224
+
225
+ feature = self.model.merge_quants(quant_semantic, quant_pixel)
226
+
227
+ if self.return_sequence:
228
+ feature = rearrange(feature, 'b c h w -> b h w c')
229
+ _, this_h, this_w, _ = feature.shape
230
+ feature = feature.reshape(bs, this_h * this_w, -1)
231
+ else:
232
+ feature = feature * self.grid_feature_scale
233
+
234
+ if return_indices:
235
+ return feature, indices_semantic, indices_pixel
236
+
237
+ return feature
238
+
239
+ def encode(self, img):
240
+ return self(img)
241
+
242
+ def indices_to_codes(self, semantic_indices, texture_indices):
243
+ quant_semantic, quant_texture = self.model.indices_to_codes(semantic_indices, texture_indices)
244
+ return self.model.merge_quants(quant_semantic, quant_texture)
245
+
246
+
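As an illustration (not part of the committed file), the embedder above is meant to be driven roughly as follows, assuming a loaded DualViTok `vq_model` and its `image_processor`; it returns the fused conditioning grid plus the semantic/pixel token indices:

# Sketch only; `vq_model` and `image_processor` are assumed to be loaded elsewhere.
embedder = DualViTok2ImageEmbedder(image_processor=image_processor, vq_model=vq_model,
                                   device="cuda", dtype=torch.float16)
image = torch.randn(1, 3, 512, 512, device="cuda", dtype=torch.float16)  # stand-in for a [-1, 1] image batch
feature, sem_idx, pix_idx = embedder(image, return_indices=True)  # conditioning grid + token indices
rebuilt = embedder.indices_to_codes(sem_idx, pix_idx)             # the same grid recovered from the indices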
247
+ class StableDiffusionXLDecoderPipeline(
248
+ DiffusionPipeline,
249
+ StableDiffusionMixin,
250
+ FromSingleFileMixin,
251
+ StableDiffusionXLLoraLoaderMixin,
252
+ TextualInversionLoaderMixin,
253
+ ):
254
+ model_cpu_offload_seq = "vq_model_embedder->unet->vae"
255
+ _optional_components = [
256
+ "vq_model_embedder",
257
+ ]
258
+ _callback_tensor_inputs = [
259
+ "latents",
260
+ "prompt_embeds",
261
+ "negative_prompt_embeds",
262
+ "add_text_embeds",
263
+ "add_time_ids",
264
+ "negative_pooled_prompt_embeds",
265
+ "negative_add_time_ids",
266
+ ]
267
+
268
+ def __init__(
269
+ self,
270
+ vae: AutoencoderKL,
271
+ unet: UNet2DConditionModel,
272
+ scheduler: KarrasDiffusionSchedulers,
273
+ force_zeros_for_empty_prompt: bool = True,
274
+ add_watermarker: Optional[bool] = None,
275
+ vq_image_processor=None,
276
+ vq_model=None,
277
+ ):
278
+ super().__init__()
279
+
280
+ self.register_modules(
281
+ vae=vae,
282
+ unet=unet,
283
+ scheduler=scheduler,
284
+ )
285
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
286
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
287
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
288
+
289
+ self.default_sample_size = self.unet.config.sample_size
290
+
291
+ add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
292
+
293
+ if add_watermarker:
294
+ self.watermark = StableDiffusionXLWatermarker()
295
+ else:
296
+ self.watermark = None
297
+
298
+ self.empty_prompt_embeds = torch.zeros([1, 77, 2048]).to(device=unet.device, dtype=unet.dtype)
299
+ self.empty_pooled_prompt_embeds = torch.zeros([1, 1280]).to(device=unet.device, dtype=unet.dtype)
300
+ self.dualvitok_channels = vq_model.pixel_channel + vq_model.semantic_channel
301
+
302
+ self.resolution_group = ['(1024, 1024)', '(768, 1024)', '(1024, 768)', '(512, 2048)', '(2048, 512)',
303
+ '(640, 1920)', '(1920, 640)', '(768, 1536)', '(1536, 768)', '(768, 1152)',
304
+ '(1152, 768)', '(512, 512)']
305
+
306
+ embedder_kwargs = dict(image_size=0,
307
+ resize_factor=1,
308
+ return_sequence=False,
309
+ grid_feature_scale=1)
310
+ if isinstance(vq_model, DualViTok2ImageEmbedder):
311
+ self.vq_model_embedder = vq_model
312
+ else:
313
+ self.vq_model_embedder = DualViTok2ImageEmbedder(vq_image_processor, vq_model, **embedder_kwargs)
314
+
315
+ def vq_encode(self, image):
316
+ return self.vq_model_embedder.encode(image)
317
+
318
+ def vq_encode_code(self, image):
319
+ return self.vq_model_embedder.vq_encode_code(image)
320
+
321
+ def vq_decode_code(self, *args, **kwargs):
322
+ return self.vq_model_embedder.vq_decode_code(*args, **kwargs)
323
+
324
+ def indices_to_codes(self, *args, **kwargs):
325
+ return self.vq_model_embedder.indices_to_codes(*args, **kwargs)
326
+
327
+ def _get_add_time_ids(
328
+ self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None,
329
+ resolution_index=None,
330
+ ):
331
+ add_time_ids = [resolution_index] * 6
332
+
333
+ passed_add_embed_dim = (
334
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
335
+ )
336
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
337
+
338
+ if expected_add_embed_dim != passed_add_embed_dim:
339
+ raise ValueError(
340
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
341
+ )
342
+
343
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
344
+ return add_time_ids
345
+
346
+ def check_inputs(
347
+ self,
348
+ height,
349
+ width,
350
+ callback_steps,
351
+ callback_on_step_end_tensor_inputs=None,
352
+ ):
353
+ if height % 8 != 0 or width % 8 != 0:
354
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
355
+
356
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
357
+ raise ValueError(
358
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
359
+ f" {type(callback_steps)}."
360
+ )
361
+
362
+ if callback_on_step_end_tensor_inputs is not None and not all(
363
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
364
+ ):
365
+ raise ValueError(
366
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
367
+ )
368
+
369
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
370
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
371
+ shape = (
372
+ batch_size,
373
+ num_channels_latents,
374
+ int(height) // self.vae_scale_factor,
375
+ int(width) // self.vae_scale_factor,
376
+ )
377
+ if isinstance(generator, list) and len(generator) != batch_size:
378
+ raise ValueError(
379
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
380
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
381
+ )
382
+
383
+ if latents is None:
384
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
385
+ else:
386
+ latents = latents.to(device)
387
+
388
+ # scale the initial noise by the standard deviation required by the scheduler
389
+ latents = latents * self.scheduler.init_noise_sigma
390
+ return latents
391
+
392
+ def upcast_vae(self):
393
+ dtype = self.vae.dtype
394
+ self.vae.to(dtype=torch.float32)
395
+ use_torch_2_0_or_xformers = isinstance(
396
+ self.vae.decoder.mid_block.attentions[0].processor,
397
+ (
398
+ AttnProcessor2_0,
399
+ XFormersAttnProcessor,
400
+ FusedAttnProcessor2_0,
401
+ ),
402
+ )
403
+ # if xformers or torch_2_0 is used attention block does not need
404
+ # to be in float32 which can save lots of memory
405
+ if use_torch_2_0_or_xformers:
406
+ self.vae.post_quant_conv.to(dtype)
407
+ self.vae.decoder.conv_in.to(dtype)
408
+ self.vae.decoder.mid_block.to(dtype)
409
+
410
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
411
+ def get_guidance_scale_embedding(
412
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
413
+ ) -> torch.Tensor:
414
+ """
415
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
416
+
417
+ Args:
418
+ w (`torch.Tensor`):
419
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
420
+ embedding_dim (`int`, *optional*, defaults to 512):
421
+ Dimension of the embeddings to generate.
422
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
423
+ Data type of the generated embeddings.
424
+
425
+ Returns:
426
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
427
+ """
428
+ assert len(w.shape) == 1
429
+ w = w * 1000.0
430
+
431
+ half_dim = embedding_dim // 2
432
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
433
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
434
+ emb = w.to(dtype)[:, None] * emb[None, :]
435
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
436
+ if embedding_dim % 2 == 1: # zero pad
437
+ emb = torch.nn.functional.pad(emb, (0, 1))
438
+ assert emb.shape == (w.shape[0], embedding_dim)
439
+ return emb
440
+
441
+ @property
442
+ def guidance_scale(self):
443
+ return self._guidance_scale
444
+
445
+ @property
446
+ def guidance_rescale(self):
447
+ return self._guidance_rescale
448
+
449
+ @property
450
+ def clip_skip(self):
451
+ return self._clip_skip
452
+
453
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
454
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
455
+ # corresponds to doing no classifier free guidance.
456
+ @property
457
+ def do_classifier_free_guidance(self):
458
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
459
+
460
+ @property
461
+ def cross_attention_kwargs(self):
462
+ return self._cross_attention_kwargs
463
+
464
+ @property
465
+ def denoising_end(self):
466
+ return self._denoising_end
467
+
468
+ @property
469
+ def num_timesteps(self):
470
+ return self._num_timesteps
471
+
472
+ @property
473
+ def interrupt(self):
474
+ return self._interrupt
475
+
476
+ def prepare_extra_step_kwargs(self, generator, eta):
477
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
478
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
479
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
480
+ # and should be between [0, 1]
481
+
482
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
483
+ extra_step_kwargs = {}
484
+ if accepts_eta:
485
+ extra_step_kwargs["eta"] = eta
486
+
487
+ # check if the scheduler accepts generator
488
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
489
+ if accepts_generator:
490
+ extra_step_kwargs["generator"] = generator
491
+ return extra_step_kwargs
492
+
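As an illustration (not part of the committed file), a typical way to drive the pipeline defined here, assuming `vae`, `unet`, `scheduler`, `vq_image_processor`, and `vq_model` have been loaded elsewhere, would be:

# Sketch only; component loading and device placement are assumed.
pipe = StableDiffusionXLDecoderPipeline(
    vae=vae, unet=unet, scheduler=scheduler,
    vq_image_processor=vq_image_processor, vq_model=vq_model,
)
sem_idx, pix_idx = pipe.vq_encode_code(images)           # tokenize reference images in [-1, 1]
result = pipe(vq_indices=(sem_idx, pix_idx),             # decode the tokens back to pixels
              height=1024, width=1024,
              num_inference_steps=50, guidance_scale=2.0)
result.images[0].save("decoded.png")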
493
+ @torch.no_grad()
494
+ def __call__(
495
+ self,
496
+ vq_indices: Optional[List] = None,
497
+ vq_embeds: Optional[torch.Tensor] = None,
498
+ images: Optional[PipelineImageInput] = None,
499
+ height: Optional[int] = None,
500
+ width: Optional[int] = None,
501
+ num_inference_steps: int = 50,
502
+ timesteps: List[int] = None,
503
+ sigmas: List[float] = None,
504
+ denoising_end: Optional[float] = None,
505
+ guidance_scale: float = 2.0,
506
+ eta: float = 0.0,
507
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
508
+ latents: Optional[torch.Tensor] = None,
509
+ output_type: Optional[str] = "pil",
510
+ return_dict: bool = True,
511
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
512
+ guidance_rescale: float = 0.0,
513
+ original_size: Optional[Tuple[int, int]] = None,
514
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
515
+ target_size: Optional[Tuple[int, int]] = None,
516
+ negative_original_size: Optional[Tuple[int, int]] = None,
517
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
518
+ negative_target_size: Optional[Tuple[int, int]] = None,
519
+ clip_skip: Optional[int] = None,
520
+ callback_on_step_end: Optional[
521
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
522
+ ] = None,
523
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
524
+ **kwargs,
525
+ ):
526
+ r"""
527
+ Function invoked when calling the pipeline for generation.
528
+
529
+ Args:
530
+ vq_indices (`Optional[List]`, *optional*):
531
+ The VQ indices for semantic and pixel tokens. Should be a tuple of (semantic_indices, pixel_indices).
532
+ images (`Optional[PipelineImageInput]`, *optional*):
533
+ Input images in range [-1, 1] as torch.Tensor with shape (batch_size, channels, height, width).
534
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
535
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
536
+ Anything below 512 pixels won't work well for
537
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
538
+ and checkpoints that are not specifically fine-tuned on low resolutions.
539
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
540
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
541
+ Anything below 512 pixels won't work well for
542
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
543
+ and checkpoints that are not specifically fine-tuned on low resolutions.
544
+ num_inference_steps (`int`, *optional*, defaults to 50):
545
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
546
+ expense of slower inference.
547
+ timesteps (`List[int]`, *optional*):
548
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
549
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
550
+ passed will be used. Must be in descending order.
551
+ sigmas (`List[float]`, *optional*):
552
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
553
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
554
+ will be used.
555
+ denoising_end (`float`, *optional*):
556
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
557
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
558
+ still retain a substantial amount of noise as determined by the discrete timesteps selected by the
559
+ scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
560
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
561
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
562
+ guidance_scale (`float`, *optional*, defaults to 2.0):
563
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
564
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
565
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
566
+ 1`. A higher guidance scale encourages samples that adhere more closely to the conditioning,
567
+ usually at the expense of lower image quality.
568
+ eta (`float`, *optional*, defaults to 0.0):
569
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
570
+ [`schedulers.DDIMScheduler`], will be ignored for others.
571
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
572
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
573
+ to make generation deterministic.
574
+ latents (`torch.Tensor`, *optional*):
575
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
576
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
577
+ tensor will be generated by sampling using the supplied random `generator`.
578
+ output_type (`str`, *optional*, defaults to `"pil"`):
579
+ The output format of the generated image. Choose between
580
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
581
+ return_dict (`bool`, *optional*, defaults to `True`):
582
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
583
+ of a plain tuple.
584
+ cross_attention_kwargs (`dict`, *optional*):
585
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
586
+ `self.processor` in
587
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
588
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
589
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
590
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is defined as `φ` in equation 16 of
591
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
592
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
593
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
594
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
595
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
596
+ explained in section 2.2 of
597
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
598
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
599
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
600
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
601
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
602
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
603
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
604
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
605
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
606
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
607
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
608
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
609
+ each denoising step during inference, with the following arguments: `callback_on_step_end(self:
610
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
611
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
612
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
613
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
614
+ will be passed in the `callback_kwargs` argument. You can only include variables listed in the
615
+ `._callback_tensor_inputs` attribute of your pipeline class.
616
+
617
+ Examples:
618
+
619
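+ A minimal sketch; the pipeline class name (`StableDiffusionXLDecoderPipeline`), the checkpoint
+ path, and the chosen resolution are illustrative assumptions, not guarantees of this repository:
+
+ >>> import torch
+ >>> pipe = StableDiffusionXLDecoderPipeline.from_pretrained(
+ ... "path/to/checkpoint", torch_dtype=torch.float16
+ ... ).to("cuda")
+ >>> # semantic_indices / pixel_indices are placeholders for previously obtained VQ token indices
+ >>> out = pipe(vq_indices=[semantic_indices, pixel_indices],
+ ... height=512, width=512, # (512, 512) is assumed to be in the pipeline's resolution group
+ ... num_inference_steps=50)
+ >>> image = out.images[0]
+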
+ Returns:
620
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
621
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
622
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
623
+ """
624
+
625
+ callback = kwargs.pop("callback", None)
626
+ callback_steps = kwargs.pop("callback_steps", None)
627
+
628
+ if callback is not None:
629
+ deprecate(
630
+ "callback",
631
+ "1.0.0",
632
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
633
+ )
634
+ if callback_steps is not None:
635
+ deprecate(
636
+ "callback_steps",
637
+ "1.0.0",
638
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
639
+ )
640
+
641
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
642
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
643
+
644
+ # 0. Default height and width to unet
645
+ height = height or self.default_sample_size * self.vae_scale_factor
646
+ width = width or self.default_sample_size * self.vae_scale_factor
647
+
648
+ original_size = original_size or (height, width)
649
+ target_size = target_size or (height, width)
650
+
651
+ # 1. Check inputs. Raise error if not correct
652
+ self.check_inputs(
653
+ height,
654
+ width,
655
+ callback_steps,
656
+ callback_on_step_end_tensor_inputs,
657
+ )
658
+
659
+ self._guidance_scale = guidance_scale
660
+ self._guidance_rescale = guidance_rescale
661
+ self._clip_skip = clip_skip
662
+ self._cross_attention_kwargs = cross_attention_kwargs
663
+ self._denoising_end = denoising_end
664
+ self._interrupt = False
665
+
666
+ # 2. encode vq_embeds
667
+ assert images is not None or vq_indices is not None or vq_embeds is not None
668
+ # fall back to the leading dimension of vq_embeds when only embeddings are provided
+ batch_size = len(images) if images is not None else (len(vq_indices[0]) if vq_indices is not None else len(vq_embeds))
669
+
670
+ if images is not None:
671
+ vq_embeds, indices_semantic, indices_pixel = self.vq_model_embedder(images, return_indices=True)
672
+ elif vq_indices is not None:
673
+ indices_semantic, indices_pixel = vq_indices[0], vq_indices[1]
674
+ vq_embeds = self.vq_model_embedder.indices_to_codes(vq_indices[0], vq_indices[1])
675
+ elif vq_embeds is not None:
676
+ if isinstance(vq_embeds, list):
677
+ vq_embeds = self.vq_model_embedder.merge_quants(vq_embeds)
678
+ indices_semantic, indices_pixel = None, None
679
+ else:
680
+ raise ValueError("No valid input provided")
681
+
682
+ device = self._execution_device
683
+
684
+ # 3. Encode input prompt
685
+ lora_scale = (
686
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
687
+ )
688
+
689
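+ # this decoder is conditioned on VQ embeddings rather than text, so cached empty-prompt
+ # embeddings are simply broadcast to the batch size in place of real prompt embeddings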
+ prompt_embeds = repeat(self.empty_prompt_embeds, '1 l c -> b l c', b=batch_size)
690
+ pooled_prompt_embeds = repeat(self.empty_pooled_prompt_embeds, '1 c -> b c', b=batch_size)
691
+
692
+ negative_prompt_embeds = prompt_embeds
693
+ negative_pooled_prompt_embeds = pooled_prompt_embeds
694
+
695
+ # 4. Prepare timesteps
696
+ timesteps, num_inference_steps = retrieve_timesteps(
697
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
698
+ )
699
+
700
+ # 5. Prepare latent variables
701
+ # num_channels_latents = self.unet.config.in_channels
702
+ num_channels_latents = 4
703
+ latents = self.prepare_latents(
704
+ batch_size,
705
+ num_channels_latents,
706
+ height,
707
+ width,
708
+ prompt_embeds.dtype,
709
+ device,
710
+ generator,
711
+ latents,
712
+ )
713
+
714
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
715
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
716
+
717
+ # 7. Prepare added time ids & embeddings
718
+ add_text_embeds = pooled_prompt_embeds
719
+ text_encoder_projection_dim = 1280
720
+
721
+ resolution = f'({width}, {height})'
722
+ assert resolution in self.resolution_group, f"Resolution {resolution} is not in the resolution group. Candidates: {self.resolution_group}"
723
+ resolution_index = self.resolution_group.index(resolution)
724
+ # resolution_index = None
725
+
726
+ add_time_ids = self._get_add_time_ids(
727
+ original_size,
728
+ crops_coords_top_left,
729
+ target_size,
730
+ dtype=prompt_embeds.dtype,
731
+ text_encoder_projection_dim=text_encoder_projection_dim,
732
+ resolution_index=resolution_index,
733
+ )
734
+ if negative_original_size is not None and negative_target_size is not None:
735
+ negative_add_time_ids = self._get_add_time_ids(
736
+ negative_original_size,
737
+ negative_crops_coords_top_left,
738
+ negative_target_size,
739
+ dtype=prompt_embeds.dtype,
740
+ text_encoder_projection_dim=text_encoder_projection_dim,
741
+ )
742
+ else:
743
+ negative_add_time_ids = add_time_ids
744
+
745
+ if self.do_classifier_free_guidance:
746
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
747
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
748
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
749
+
750
+ prompt_embeds = prompt_embeds.to(device)
751
+ add_text_embeds = add_text_embeds.to(device)
752
+ add_time_ids = add_time_ids.to(device).repeat(batch_size, 1)
753
+
754
+ # 8. Denoising loop
755
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
756
+
757
+ # 8.1 Apply denoising_end
758
+ if (
759
+ self.denoising_end is not None
760
+ and isinstance(self.denoising_end, float)
761
+ and self.denoising_end > 0
762
+ and self.denoising_end < 1
763
+ ):
764
+ discrete_timestep_cutoff = int(
765
+ round(
766
+ self.scheduler.config.num_train_timesteps
767
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
768
+ )
769
+ )
770
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
771
+ timesteps = timesteps[:num_inference_steps]
772
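+ # worked example: with denoising_end=0.8 and num_train_timesteps=1000 the cutoff is
+ # int(round(1000 - 0.8 * 1000)) = 200, so only timesteps >= 200 (roughly the first 80%
+ # of the schedule) are executed and the returned latents still carry the remaining noise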
+
773
+ # 9. Optionally get Guidance Scale Embedding
774
+ timestep_cond = None
775
+ if self.unet.config.time_cond_proj_dim is not None:
776
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size)
777
+ timestep_cond = self.get_guidance_scale_embedding(
778
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
779
+ ).to(device=device, dtype=latents.dtype)
780
+
781
+ self._num_timesteps = len(timesteps)
782
+ # with self.progress_bar(total=num_inference_steps) as progress_bar:
783
+ for i, t in enumerate(timesteps):
784
+ if self.interrupt:
785
+ continue
786
+
787
+ # expand the latents if we are doing classifier free guidance
788
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
789
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
790
+
791
+ if vq_embeds.size(-1) != latent_model_input.size(-1):
792
+ vq_embeds = torch.nn.functional.interpolate(vq_embeds.to(latent_model_input),
793
+ size=latent_model_input.shape[-2:])
794
+ else:
795
+ vq_embeds = vq_embeds.to(latent_model_input)
796
+ vq_embeds_input = torch.cat([torch.zeros_like(vq_embeds),
797
+ vq_embeds]) if self.do_classifier_free_guidance else vq_embeds
798
+
799
+ # predict the noise residual
800
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
801
+
802
+ latent_model_input = torch.cat([latent_model_input, vq_embeds_input], dim=1)
803
+ noise_pred = self.unet(
804
+ latent_model_input,
805
+ t,
806
+ encoder_hidden_states=prompt_embeds,
807
+ timestep_cond=timestep_cond,
808
+ cross_attention_kwargs=self.cross_attention_kwargs,
809
+ added_cond_kwargs=added_cond_kwargs,
810
+ return_dict=False,
811
+ )[0]
812
+
813
+ # perform guidance
814
+ if self.do_classifier_free_guidance:
815
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
816
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
817
+
818
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
819
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
820
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_cond, guidance_rescale=self.guidance_rescale)
821
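+ # e.g. guidance_scale=2.0 moves twice the distance from noise_pred_uncond toward
+ # noise_pred_cond (one full step beyond the conditional prediction); rescale_noise_cfg
+ # then pulls the result's standard deviation back toward that of noise_pred_cond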
+
822
+ # compute the previous noisy sample x_t -> x_t-1
823
+ latents_dtype = latents.dtype
824
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
825
+ if latents.dtype != latents_dtype:
826
+ if torch.backends.mps.is_available():
827
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
828
+ latents = latents.to(latents_dtype)
829
+
830
+ if callback_on_step_end is not None:
831
+ callback_kwargs = {}
832
+ for k in callback_on_step_end_tensor_inputs:
833
+ callback_kwargs[k] = locals()[k]
834
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
835
+
836
+ latents = callback_outputs.pop("latents", latents)
837
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
838
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
839
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
840
+
841
+ # call the callback, if provided
842
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
843
+ # progress_bar.update()
844
+ if callback is not None and i % callback_steps == 0:
845
+ step_idx = i // getattr(self.scheduler, "order", 1)
846
+ callback(step_idx, t, latents)
847
+
848
+ if XLA_AVAILABLE:
849
+ xm.mark_step()
850
+
851
+ if not output_type == "latent":
852
+ # make sure the VAE is in float32 mode, as it overflows in float16
853
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
854
+
855
+ if needs_upcasting:
856
+ self.upcast_vae()
857
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
858
+ elif latents.dtype != self.vae.dtype:
859
+ if torch.backends.mps.is_available():
860
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
861
+ self.vae = self.vae.to(latents.dtype)
862
+
863
+ # unscale/denormalize the latents
864
+ # denormalize with the mean and std if available and not None
865
+ has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
866
+ has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
867
+ if has_latents_mean and has_latents_std:
868
+ latents_mean = (
869
+ torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
870
+ )
871
+ latents_std = (
872
+ torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
873
+ )
874
+ latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
875
+ else:
876
+ latents = latents / self.vae.config.scaling_factor
877
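+ # the branch above inverts the encoder-side normalization
+ # latents = (x - latents_mean) * scaling_factor / latents_std (or x * scaling_factor)
+ # so the VAE decoder receives latents in its native range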
+
878
+ image = self.vae.decode(latents, return_dict=False)[0]
879
+
880
+ # cast back to fp16 if needed
881
+ if needs_upcasting:
882
+ self.vae.to(dtype=torch.float16)
883
+ else:
884
+ image = latents
885
+
886
+ if not output_type == "latent":
887
+ # apply watermark if available
888
+ if self.watermark is not None:
889
+ image = self.watermark.apply_watermark(image)
890
+
891
+ image = self.image_processor.postprocess(image, output_type=output_type)
892
+
893
+ # Offload all models
894
+ self.maybe_free_model_hooks()
895
+
896
+ if not return_dict:
897
+ return (image,)
898
+
899
+ return StableDiffusionXLDecoderPipelineOutput(images=image,
900
+ indices_semantic=indices_semantic,
901
+ indices_pixel=indices_pixel)
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
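
The special tokens above are registered automatically when the tokenizer is loaded; special_tokens_map.json sets the eos token to <|im_end|> and the pad token to <|endoftext|>. A minimal sketch, assuming the standard `transformers` AutoTokenizer API and a placeholder path for this repository:

from transformers import AutoTokenizer

# load the tokenizer shipped in this folder; the special tokens map supplies the
# additional markers (<|vision_start|>, <|image_pad|>, ...) plus the eos/pad tokens
tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo")
print(tokenizer.eos_token, tokenizer.pad_token)             # <|im_end|> <|endoftext|>
print(tokenizer.convert_tokens_to_ids("<|vision_start|>"))  # integer id of a vision marker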
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efb35e4dde47fc87a13f21a10fc3a0ac50340bc53ad9d88999616403d8498216
3
+ size 33100531
tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db9da21d556f5a362c7d22bfe8bf4d1bd734a303db356521eeb4c96a11881970
3
+ size 25814172
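
tokenizer.json and tokenizer_config.json are stored with Git LFS, so the diffs above show only the pointer files (version, oid, size) rather than the JSON content. A minimal sketch for fetching the real files with huggingface_hub, using a placeholder repo id:

from huggingface_hub import hf_hub_download

# resolve the LFS-backed files to local cache paths instead of the pointer stubs
tokenizer_json_path = hf_hub_download(repo_id="user/repo", filename="tokenizer.json")
tokenizer_config_path = hf_hub_download(repo_id="user/repo", filename="tokenizer_config.json")
print(tokenizer_json_path, tokenizer_config_path)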
vocab.json ADDED
The diff for this file is too large to render. See raw diff