encoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 6
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
  cross_attention_config:
    hidden_size_q: 4096
    hidden_size_kv: 1024
    hidden_size: 4096
    num_attention_heads: 32
    attention_num_kv_heads: 32
    word_window_size: 1
    key_query_norm: false
    key_query_norm_per_head: false
backbone_config:
  vocab_size: 0
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 14336
  max_position_embeddings: 32768
  rope_scaling:
    rope_type: llama3
    factor: 8.0
    original_max_position_embeddings: 8192
    low_freq_factor: 1.0
    high_freq_factor: 4.0
  rope_theta: 500000
  mlp_bias: false
  use_cache: true
  sliding_window: null
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
decoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 4
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
  cross_attn_every_layer: true
  cross_attention_config:
    hidden_size_q: 1024
    hidden_size_kv: 4096
    hidden_size: 1024
    num_attention_heads: 8
    attention_num_kv_heads: 8
    word_window_size: 1
    key_query_norm: false
    key_query_norm_per_head: false
model_type: hierarchical_autoregressive_transformer
transformers_version: 4.46.3
auto_map:
  AutoConfig: config.HATArchitectureConfig
  AutoModelForCausalLM: model.HATForCausalLM
special_token_dict:
  <|begin_of_text|>: 250
  <|start_header_id|>: 251
  <|end_header_id|>: 252
  <|eot_id|>: 192
max_word_size: 100
sliding_window: 768
max_position_embeddings: 262144
torch_dtype: bfloat16
architectures:
- HATDecoderForCausalLM
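Because `auto_map` routes `AutoConfig` and `AutoModelForCausalLM` to the custom `config.HATArchitectureConfig` and `model.HATForCausalLM` classes shipped with the checkpoint, loading requires `trust_remote_code=True`. Below is a minimal sketch of loading a checkpoint that carries this config; the repository path is a placeholder (not taken from the config), and the exact attribute layout of the loaded config object may differ from what the comments assume.

```python
# Sketch: loading a HAT checkpoint whose config matches the YAML above.
# "path/to/hat-checkpoint" is a placeholder; substitute a local directory
# or Hub repo that contains the config together with config.py and model.py.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

model_path = "path/to/hat-checkpoint"  # placeholder

# trust_remote_code=True lets transformers import the custom classes
# referenced in auto_map (config.HATArchitectureConfig, model.HATForCausalLM).
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config.model_type)  # expected: hierarchical_autoregressive_transformer

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # matches torch_dtype: bfloat16 in the config
    trust_remote_code=True,
)
```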