"""ILLUME model configuration""" from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging from transformers.models.auto import CONFIG_MAPPING # import the first three to make sure the last one recognize them. from .modeling_rope_utils import rope_config_validation from .configuration_movqgan import MoVQConfig from .configuration_qwen2vit import Qwen2VLVisionConfig from .configuration_dualvitok import DualViTokConfig logger = logging.get_logger(__name__) class ILLUMEConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`ILLUMEForConditionalGeneration`]. It is used to instantiate an ILLUME model according to the specified arguments, defining the model architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `DualViTokConfig`): The config object or dictionary of the vision backbone. mm_projector_config (`dict`, *optional*, defaults to `None`): Configuration for the multimodal projector. text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`): The config object or dictionary of the text backbone. ignore_index (`int`, *optional*, defaults to -100): The ignore index for the loss function. image_token_index (`int`, *optional*, defaults to 32000): The image token index to encode the image prompt. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. Example: ```python >>> from transformers import ILLUMEForConditionalGeneration, ILLUMEConfig, CLIPVisionConfig, LlamaConfig >>> # Initializing a CLIP-vision config >>> vision_config = CLIPVisionConfig() >>> # Initializing a Llama config >>> text_config = LlamaConfig() >>> # Initializing a ILLUME style configuration >>> configuration = ILLUMEConfig(vision_config, text_config) >>> # Initializing a model from the style configuration >>> model = ILLUMEForConditionalGeneration(configuration) >>> # Accessing the model configuration >>> configuration = model.config ```""" model_type = "illume" is_composition = False def __init__( self, vision_config=None, mm_projector_config=None, text_config=None, ignore_index=-100, image_token_index=32000, tie_word_embeddings=False, **kwargs, ): self.ignore_index = ignore_index self.image_token_index = image_token_index if isinstance(vision_config, dict): vision_config = DualViTokConfig(**vision_config) elif vision_config is None: vision_config = DualViTokConfig({ "semantic_encoder": { "pretrained_semantic_encoder": "Emova-ollm/qwen2vit600m", "z_channels": 32, "num_blocks": 4, "out_layer": "linear", "embed_dim": 1280, "target_mlp": "norm" }, "semantic_decoder": { "z_channels": 32, "num_blocks": 4, "embed_dim": 1280, "out_layer": "linear_norm", "out_channels": 3584 }, "semantic_quantizer_type": "simvq", "pixel_quantizer_type": "simvq", "semantic_quantizer_codebook_size": 32768, "pixel_quantizer_codebook_size": 98304, "attn_implementation": "sdpa", "pixel_encoder": { "codebook_size": 98304, "embed_dim": 32, "z_channels": 32, "double_z": False, "in_channels": 3, "out_channels": 3, "ch": 128, "ch_mult": [ 1, 1, 2, 2, 4 ], "num_res_blocks": 2, "attn_resolutions": [ 4 ], "dropout": 0.0, "use_dc_up_down_blocks": True }, "pixel_decoder": { "codebook_size": 98304, "embed_dim": 64, "z_channels": 64, "double_z": False, "in_channels": 3, "out_channels": 
3, "ch": 384, "ch_mult": [ 1, 1, 2, 2, 4 ], "num_res_blocks": 2, "attn_resolutions": [4], "dropout": 0.0, "use_dc_up_down_blocks": True }, } ) self.vision_config = vision_config self.mm_projector_config = mm_projector_config if isinstance(text_config, dict): text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen2" text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) elif text_config is None: text_config = CONFIG_MAPPING["qwen2"]() self.text_config = text_config super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
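

# Minimal usage sketch: composing the config from plain dicts, as the
# `__init__` above supports. The backbone sizes below are illustrative
# assumptions, not ILLUME's released settings. Run with
# `python -m <package>.configuration_illume` (module path is an assumption),
# since the relative imports above prevent running this file directly.
if __name__ == "__main__":
    config = ILLUMEConfig(
        # A small Qwen2 text backbone; any `Qwen2Config` field can go in this dict.
        text_config={"model_type": "qwen2", "hidden_size": 1024, "num_hidden_layers": 4},
    )
    print(type(config.vision_config).__name__)  # -> DualViTokConfig (the default)
    print(config.text_config.hidden_size)  # -> 1024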