"""ILLUME model configuration""" from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging from transformers.models.auto import CONFIG_MAPPING # import the first three to make sure the last one recognize them. from .modeling_rope_utils import rope_config_validation from .configuration_movqgan import MoVQConfig from .configuration_qwen2vit import Qwen2VLVisionConfig from .configuration_dualvitok import DualViTokConfig logger = logging.get_logger(__name__) class ILLUMEConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ILLUMEForConditionalGeneration`]. It is used to instantiate an ILLUME model according to the specified arguments, defining the model architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `DualViTokConfig`): The config object or dictionary of the vision backbone. mm_projector_config (`dict`, *optional*, defaults to `None`): Configuration for the multimodal projector. text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`): The config object or dictionary of the text backbone. ignore_index (`int`, *optional*, defaults to -100): The ignore index for the loss function. image_token_index (`int`, *optional*, defaults to 32000): The image token index to encode the image prompt. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. Example: ```python >>> from transformers import ILLUMEForConditionalGeneration, ILLUMEConfig, CLIPVisionConfig, LlamaConfig >>> # Initializing a CLIP-vision config >>> vision_config = CLIPVisionConfig() >>> # Initializing a Llama config >>> text_config = LlamaConfig() >>> # Initializing a ILLUME style configuration >>> configuration = ILLUMEConfig(vision_config, text_config) >>> # Initializing a model from the style configuration >>> model = ILLUMEForConditionalGeneration(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" model_type = "illume" is_composition = False def __init__( self, vision_config=None, mm_projector_config=None, text_config=None, ignore_index=-100, image_token_index=32000, tie_word_embeddings=False, **kwargs, ): self.ignore_index = ignore_index self.image_token_index = image_token_index if isinstance(vision_config, dict): vision_config = DualViTokConfig(**vision_config) elif vision_config is None: vision_config = DualViTokConfig({ "semantic_encoder": { "pretrained_semantic_encoder": "Emova-ollm/qwen2vit600m", "z_channels": 32, "num_blocks": 4, "out_layer": "linear", "embed_dim": 1280, "target_mlp": "norm" }, "semantic_decoder": { "z_channels": 32, "num_blocks": 4, "embed_dim": 1280, "out_layer": "linear_norm", "out_channels": 3584 }, "semantic_quantizer_type": "simvq", "pixel_quantizer_type": "simvq", "semantic_quantizer_codebook_size": 32768, "pixel_quantizer_codebook_size": 98304, "attn_implementation": "sdpa", "pixel_encoder": { "codebook_size": 98304, "embed_dim": 32, "z_channels": 32, "double_z": False, "in_channels": 3, "out_channels": 3, "ch": 128, "ch_mult": [ 1, 1, 2, 2, 4 ], "num_res_blocks": 2, "attn_resolutions": [ 4 ], "dropout": 0.0, "use_dc_up_down_blocks": True }, "pixel_decoder": { "codebook_size": 98304, "embed_dim": 64, "z_channels": 64, "double_z": False, "in_channels": 3, "out_channels": 
3, "ch": 384, "ch_mult": [ 1, 1, 2, 2, 4 ], "num_res_blocks": 2, "attn_resolutions": [4], "dropout": 0.0, "use_dc_up_down_blocks": True }, } ) self.vision_config = vision_config self.mm_projector_config = mm_projector_config if isinstance(text_config, dict): text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen2" text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) elif text_config is None: text_config = CONFIG_MAPPING["qwen2"]() self.text_config = text_config super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
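

# Minimal usage sketch: composing the config from plain dicts, as the
# `__init__` above supports. The backbone sizes below are illustrative
# assumptions, not ILLUME's released settings. Run with
# `python -m <package>.configuration_illume` (module path is an assumption),
# since the relative imports above prevent running this file directly.
if __name__ == "__main__":
    config = ILLUMEConfig(
        # A small Qwen2 text backbone; any `Qwen2Config` field can go in this dict.
        text_config={"model_type": "qwen2", "hidden_size": 1024, "num_hidden_layers": 4},
    )
    print(type(config.vision_config).__name__)  # -> DualViTokConfig (the default)
    print(config.text_config.hidden_size)  # -> 1024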