encoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 6
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
  cross_attention_config:
    hidden_size_q: 4096
    hidden_size_kv: 1024
    hidden_size: 4096
    num_attention_heads: 32
    attention_num_kv_heads: 32
    word_window_size: 1
    key_query_norm: false
    key_query_norm_per_head: false
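
The encoder works on raw bytes: vocab_size is 256, and the special-token IDs in special_token_dict further down all fall inside the byte range. Below is a minimal sketch of the corresponding input encoding, assuming UTF-8 bytes are used directly as token IDs; the repository's own tokenizer code is authoritative.

# Sketch only: UTF-8 bytes as token IDs, consistent with vocab_size: 256.
# The special-token IDs are copied from special_token_dict in this file.
SPECIAL_TOKENS = {
    "<|begin_of_text|>": 250,
    "<|start_header_id|>": 251,
    "<|end_header_id|>": 252,
    "<|eot_id|>": 192,
}

def encode_bytes(text: str) -> list[int]:
    """Map text to byte IDs in [0, 255], prefixed with <|begin_of_text|>."""
    return [SPECIAL_TOKENS["<|begin_of_text|>"]] + list(text.encode("utf-8"))

print(encode_bytes("Hi"))  # [250, 72, 105]
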
backbone_config:
  vocab_size: 0
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 14336
  max_position_embeddings: 32768
  rope_scaling:
    rope_type: llama3
    factor: 8.0
    original_max_position_embeddings: 8192
    low_freq_factor: 1.0
    high_freq_factor: 4.0
  rope_theta: 500000
  mlp_bias: false
  use_cache: true
  sliding_window: null
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
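
The backbone uses a Llama-3-8B-sized layout (4096 hidden, 32 layers, 8 KV heads, intermediate_size 14336) and stretches the original 8192-position context via llama3 RoPE scaling. Below is a sketch of that rescaling rule as implemented in Hugging Face Transformers: low-frequency components are divided by factor, high-frequency components are kept, and the band in between is interpolated; head_dim and rope_theta are taken from this backbone_config.

import math
import torch

def llama3_scale_inv_freq(
    inv_freq: torch.Tensor,
    factor: float = 8.0,
    low_freq_factor: float = 1.0,
    high_freq_factor: float = 4.0,
    original_max_position_embeddings: int = 8192,
) -> torch.Tensor:
    """Rescale RoPE inverse frequencies the way rope_type: llama3 does."""
    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
    wavelen = 2 * math.pi / inv_freq

    # Low-frequency (long-wavelength) band: divide by `factor`; high-frequency band: keep.
    scaled = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)

    # Medium band: blend smoothly between the scaled and unscaled frequencies.
    smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
        high_freq_factor - low_freq_factor
    )
    smoothed = (1 - smooth) * inv_freq / factor + smooth * inv_freq
    is_medium = (wavelen <= low_freq_wavelen) & (wavelen >= high_freq_wavelen)
    return torch.where(is_medium, smoothed, scaled)

# Base inverse frequencies for head_dim = 4096 / 32 = 128 and rope_theta = 500000.
head_dim = 128
inv_freq = 1.0 / (500000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
print(llama3_scale_inv_freq(inv_freq).shape)  # torch.Size([64])
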
decoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 4
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
  cross_attn_every_layer: true
  cross_attention_config:
    hidden_size_q: 1024
    hidden_size_kv: 4096
    hidden_size: 1024
    num_attention_heads: 8
    attention_num_kv_heads: 8
    word_window_size: 1
    key_query_norm: false
    key_query_norm_per_head: false
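
The byte-level decoder (1024 hidden, 4 layers) cross-attends into the 4096-wide backbone states at every layer (cross_attn_every_layer: true), with queries at hidden_size_q: 1024 and keys/values at hidden_size_kv: 4096. The sketch below only illustrates what those dimensions imply for the projection shapes; the projection layout and output width are assumptions, and the real module (including word-window masking) lives in the repository's model.py.

import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossAttentionSketch(nn.Module):
    """Illustrative cross-attention with mismatched query/key-value widths,
    sized like decoder_config.cross_attention_config above."""

    def __init__(self, hidden_size_q=1024, hidden_size_kv=4096, hidden_size=1024, num_heads=8):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads  # 128
        self.q_proj = nn.Linear(hidden_size_q, hidden_size, bias=False)
        self.k_proj = nn.Linear(hidden_size_kv, hidden_size, bias=False)
        self.v_proj = nn.Linear(hidden_size_kv, hidden_size, bias=False)
        self.o_proj = nn.Linear(hidden_size, hidden_size_q, bias=False)  # assumed output width

    def forward(self, byte_states, word_states):
        # byte_states: (batch, bytes, 1024) from the decoder stream
        # word_states: (batch, words, 4096) from the backbone
        b, n_q, _ = byte_states.shape
        n_kv = word_states.shape[1]
        q = self.q_proj(byte_states).view(b, n_q, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(word_states).view(b, n_kv, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(word_states).view(b, n_kv, self.num_heads, self.head_dim).transpose(1, 2)
        out = F.scaled_dot_product_attention(q, k, v)
        return self.o_proj(out.transpose(1, 2).reshape(b, n_q, -1))

x = CrossAttentionSketch()(torch.randn(1, 12, 1024), torch.randn(1, 3, 4096))
print(x.shape)  # torch.Size([1, 12, 1024])
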
model_type: hierarchical_autoregressive_transformer
transformers_version: 4.46.3
auto_map:
  AutoConfig: config.HATArchitectureConfig
  AutoModelForCausalLM: model.HATForCausalLM
special_token_dict:
  <|begin_of_text|>: 250
  <|start_header_id|>: 251
  <|end_header_id|>: 252
  <|eot_id|>: 192
max_word_size: 100
sliding_window: 768
max_position_embeddings: 262144
torch_dtype: bfloat16
architectures:
  - HATDecoderForCausalLM
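
Because auto_map points at custom classes shipped with the repository (config.HATArchitectureConfig and model.HATForCausalLM), the checkpoint loads through the standard Auto classes only with trust_remote_code=True. A minimal loading sketch follows; "<repo-id>" is a placeholder for the actual Hugging Face repository name.

import torch
from transformers import AutoModelForCausalLM

# The custom config/model code referenced by auto_map is downloaded from the
# repository and executed locally, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    "<repo-id>",                 # placeholder repository name
    torch_dtype=torch.bfloat16,  # matches torch_dtype above
    trust_remote_code=True,
)
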