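# Model configuration for a Hierarchical Autoregressive Transformer (HAT), a
# tokenizer-free, byte-level architecture: a small byte encoder pools UTF-8
# bytes into word representations, a large word-level backbone models the
# sequence of words, and a small byte decoder spells each next word out byte
# by byte. (Architecture summary inferred from the fields below, not stated
# in the file itself.)

# Byte-level encoder: operates directly on the 256 possible byte values with
# local sliding-window attention over 768 bytes. Its cross_attention_config
# appears to be the pooling step: backbone-width queries (hidden_size_q: 4096)
# attend to encoder-width byte states (hidden_size_kv: 1024) to form one word
# embedding per word.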
encoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 6
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
  cross_attention_config:
    hidden_size_q: 4096
    hidden_size_kv: 1024
    hidden_size: 4096
    num_attention_heads: 32
    attention_num_kv_heads: 32
    word_window_size: 1
    key_query_norm: false
    key_query_norm_per_head: false
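# Word-level backbone: a stack matching Llama-3-8B dimensions (32 layers,
# hidden size 4096, grouped-query attention with 8 KV heads, llama3 RoPE
# scaling, rope_theta 500000). vocab_size is 0 because it consumes and
# produces continuous word embeddings from the encoder rather than discrete
# token ids.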
backbone_config:
  vocab_size: 0
  hidden_size: 4096
  num_hidden_layers: 32
  num_attention_heads: 32
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 14336
  max_position_embeddings: 32768
  rope_scaling:
    rope_type: llama3
    factor: 8.0
    original_max_position_embeddings: 8192
    low_freq_factor: 1.0
    high_freq_factor: 4.0
  rope_theta: 500000
  mlp_bias: false
  use_cache: true
  sliding_window: null
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
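# Byte-level decoder: autoregressively predicts the next byte. With
# cross_attn_every_layer: true, each of its 4 layers cross-attends to the
# backbone's word states; here the queries are decoder-width (hidden_size_q:
# 1024) and the keys/values come from backbone width (hidden_size_kv: 4096),
# the mirror image of the encoder's pooling attention.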
decoder_config:
  vocab_size: 256
  hidden_size: 1024
  num_hidden_layers: 4
  num_attention_heads: 8
  num_key_value_heads: 8
  rms_norm_eps: 1.0e-05
  intermediate_size: 2816
  max_position_embeddings: 262144
  rope_scaling:
    rope_type: default
  rope_theta: 100000
  mlp_bias: false
  use_cache: true
  sliding_window: 768
  transformers_version: null
  key_query_norm: false
  key_query_norm_per_head: false
  is_neox_style: true
  cross_attn_every_layer: true
  cross_attention_config:
    hidden_size_q: 1024
    hidden_size_kv: 4096
    hidden_size: 1024
    num_attention_heads: 8
    attention_num_kv_heads: 8
    word_window_size: 1
    key_query_norm: false
    key_query_norm_per_head: false
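# Top-level metadata. auto_map wires this repository's custom classes into
# the transformers Auto* machinery, so loading requires trust_remote_code,
# e.g. (hypothetical repo id, standard Hugging Face call):
#   AutoModelForCausalLM.from_pretrained("<repo-id>", trust_remote_code=True,
#                                        torch_dtype=torch.bfloat16)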
model_type: hierarchical_autoregressive_transformer
transformers_version: 4.46.3
auto_map:
  AutoConfig: config.HATArchitectureConfig
  AutoModelForCausalLM: model.HATForCausalLM
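# Special tokens are assigned single byte ids; 192 (0xC0) and 250-252
# (0xFA-0xFC) are presumably chosen because these values never occur in valid
# UTF-8, so they cannot collide with bytes from ordinary text.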
special_token_dict:
  <|begin_of_text|>: 250
  <|start_header_id|>: 251
  <|end_header_id|>: 252
  <|eot_id|>: 192
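# Remaining top-level fields (a reading of the values above): max_word_size
# likely caps how many bytes the encoder may pool into a single word, while
# sliding_window and max_position_embeddings mirror the byte-level modules'
# settings.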
max_word_size: 100
sliding_window: 768
max_position_embeddings: 262144
torch_dtype: bfloat16
architectures:
- HATDecoderForCausalLM