{ "Ubit": 100, "_attn_implementation_autoset": true, "_name_or_path": "runs/train/stage35_BS18/model", "architectures": [ "LlavaTopDownLlamaModel" ], "babit": "E5M2", "bobit": "E5M2", "bwbit": "E5M2", "chat_template": null, "col_blocksize": -1, "col_blocksize_optimizer": 128, "draw_distribution_backward": false, "draw_distribution_forward": false, "drop_path_rate": 0.0, "dynamic_s2": false, "epsilon": 1e-10, "epsilon_optimizer": 1e-15, "fabit": "E4M3", "first_order_bit": null, "first_order_quant_type": null, "fobit": "E4M3", "fps": 0.0, "fwbit": "E4M3", "group_size": -1, "hidden_size": 3584, "high_res_pos_embed": true, "image_aspect_ratio": "resize", "image_encoder": { "_target_": "llava.model.encoders.BasicImageEncoder" }, "interpolate_mode": "linear", "llm_cfg": { "_attn_implementation_autoset": false, "_name_or_path": "runs/train/stage35_BS18/model/llm", "add_cross_attention": false, "architectures": [ "Qwen2ForCausalLM" ], "attention_dropout": 0.0, "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": 151643, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": 151645, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_act": "silu", "hidden_size": 3584, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "initializer_range": 0.02, "intermediate_size": 18944, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "max_position_embeddings": 32768, "max_window_layers": 28, "min_length": 0, "model_max_length": 10240, "model_type": "qwen2", "no_repeat_ngram_size": 0, "num_attention_heads": 28, "num_beam_groups": 1, "num_beams": 1, "num_hidden_layers": 28, "num_key_value_heads": 4, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "rms_norm_eps": 1e-06, "rope_scaling": null, "rope_theta": 1000000.0, "sep_token_id": null, "sliding_window": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": false, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": "bfloat16", "torchscript": false, "typical_p": 1.0, "use_bfloat16": false, "use_cache": true, "use_sliding_window": false, "vocab_size": 151651 }, "look_close_mode": "after_prompt", "max_tiles": 12, "min_blockunit_col": 4, "min_blockunit_row": 4, "min_tiles": 1, "mlp_path": null, "mm_hidden_size": 1152, "mm_low_res_token_num": 729, "mm_projector": "mlp_downsample", "mm_projector_cfg": { "_attn_implementation_autoset": false, "_name_or_path": "runs/train/stage35_BS18/model/mm_projector", "add_cross_attention": false, "architectures": [ "MultimodalProjector" ], "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "min_length": 0, "mm_projector_type": "mlp_downsample", "model_type": "v2l_projector", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "prefix": null, "problem_type": null, "pruned_heads": {}, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "sep_token_id": null, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": "bfloat16", "torchscript": false, "typical_p": 1.0, "use_bfloat16": false }, "mm_projector_lr": null, "mm_scale_num": 4, "mm_use_bos_eos_tokens": false, "mm_use_im_patch_token": false, "mm_use_im_start_end": false, "mm_vision_select_feature": "cls_patch", "mm_vision_select_layer": -2, "model_dtype": "torch.bfloat16", "model_name_or_path": "runs/train/stage3_BS18/model", "model_type": "llava_topdown_llama", "num_look_close": 6, "num_time_tokens": 0, "num_token_look_close": null, "num_video_frames": 8, "pad_block": false, "pad_to_multiple_of": 0, "ps3": true, "ps3_dynamic_aspect_ratio": true, "ps3_grad_checkpointing": false, "qchoice": "none", "quantize_model": false, "refine_attn_blocksize": false, "refine_col_blocksize": 4, "refine_ln_blocksize": false, "refine_ln_blocksize_but_only_backward": false, "refine_ln_blocksize_but_only_forward": false, "refine_ln_pertoken": false, "refine_mlp_blocksize": false, "refine_residual_fp": false, "refine_row_blocksize": 4, "resume_path": "runs/train/stage35_BS18/model", "row_blocksize": -1, "row_blocksize_optimizer": 1, "s2": false, "s2_max_split_size": 336, "s2_resize_output_to_scale_idx": 0, "s2_scales": "336,672,1008", "second_order_bit": null, "second_order_quant_type": null, "soft_ce_std": 1.0, "sound_mm_projector": "mlp", "sound_mm_projector_cfg": null, "sound_tower": "", "sound_tower_cfg": null, "speech_mm_projector": "mlp", "speech_mm_projector_cfg": null, "speech_tower": "", "speech_tower_cfg": null, "symm": true, "time_token_format": "", "time_token_ids": [], "top_down_prompt_head_type": "mlp", "transformers_version": "4.46.0", "tune_language_model": false, "tune_mm_projector": false, "tune_vision_tower": false, "unified_audio_encoder": false, "use_quantize_optimizer": false, "version": "auto", "video_encoder": { "_target_": "llava.model.encoders.BasicVideoEncoder" }, "video_max_tiles": 1, "vision_resolution": -1, "vision_tower": "/home/baifengs/baifengs/projects/open_clip/hf_ckpt/250605_1500", "vision_tower_cfg": { "_attn_implementation_autoset": false, "_name_or_path": "runs/train/stage35_BS18/model/vision_tower", "add_cross_attention": false, "architectures": [ "PS3VisionModel" ], "bad_words_ids": null, "begin_suppress_tokens": null, "bos_token_id": null, "chunk_size_feed_forward": 0, "class_token": null, "cross_attention_hidden_size": null, "decoder_start_token_id": null, "diversity_penalty": 0.0, "do_sample": false, "drop": 0.0, "drop_path": null, "dynamic_img_size": true, "early_stopping": false, "encoder_no_repeat_ngram_size": 0, "eos_token_id": null, "exponential_decay_length_penalty": null, "final_norm": false, "finetuning_task": null, "forced_bos_token_id": null, "forced_eos_token_id": null, "hidden_size": 1152, "highres_selection_feature": true, "highres_selection_module_depth": 3, "highres_selection_module_hidden_dim": 512, "highres_selection_module_kernel_size": 28, "highres_selection_module_out_dim": 512, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "image_size": 3780, "img_size": null, "is_decoder": false, "is_encoder_decoder": false, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "length_penalty": 1.0, "max_length": 20, "max_select_num": 2560, "max_select_num_each_scale": null, "min_length": 0, "min_select_num": 1, "model_name": "vit_so400m_patch14_siglip_378", "model_type": "ps3_vision_model", "no_repeat_ngram_size": 0, "num_beam_groups": 1, "num_beams": 1, "num_return_sequences": 1, "output_attentions": false, "output_hidden_states": false, "output_scores": false, "pad_token_id": null, "patch_drop": null, "patch_size": 14, "pool": "map", "prefix": null, "pretrained": false, "problem_type": null, "pruned_heads": {}, "ps3": true, "ps3_scales": [ 378, 756, 1512, 3780 ], "radio": false, "radio_adapter_mlp_hidden_dim": null, "radio_adapter_mlp_input_dim": null, "radio_adapter_mlp_num_inner": null, "radio_adapter_mlp_output_dim": null, "radio_adapter_mlp_version": null, "remove_invalid_values": false, "repetition_penalty": 1.0, "return_dict": true, "return_dict_in_generate": false, "select_based_on_layer": [ 0, 9, 18, 26 ], "sep_token_id": null, "separate_pos_emb": true, "suppress_tokens": null, "task_specific_params": null, "temperature": 1.0, "tf_legacy_loss": false, "tie_encoder_decoder": false, "tie_word_embeddings": true, "tokenizer_class": null, "top_k": 50, "top_p": 1.0, "torch_dtype": "bfloat16", "torchscript": false, "typical_p": 1.0, "use_bfloat16": false, "vision_tower_name": "vit_so400m_patch14_siglip_378" }, "vision_tower_lr": null, "weight_memory_efficient": true, "xvila_mode": false }