apriasmoro committed
Commit d2ce116 · verified · 1 Parent(s): 3c975dc

Training in progress, step 100, checkpoint
last-checkpoint/adapter_config.json CHANGED
@@ -25,13 +25,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "k_proj",
     "v_proj",
-    "gate_proj",
+    "k_proj",
     "up_proj",
+    "o_proj",
+    "q_proj",
     "down_proj",
-    "o_proj"
+    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fffaaf3dd6044593ef888cd121d4d57091cdf96d82288678999f506a36e70fc1
+oid sha256:765031db245f46679d16364da940abe936eb759476eb39586b177763b63d7048
 size 349243752
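The adapter weights themselves are stored via Git LFS, so the diff above only updates the pointer (the sha256 oid; the byte size is unchanged). A minimal sketch for checking that a downloaded adapter_model.safetensors matches the new pointer:

```python
# Verify a local file against the sha256 oid and byte size from its Git LFS pointer.
import hashlib
from pathlib import Path

def matches_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    p = Path(path)
    if p.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

print(matches_lfs_pointer(
    "last-checkpoint/adapter_model.safetensors",
    "765031db245f46679d16364da940abe936eb759476eb39586b177763b63d7048",
    349243752,
))
```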
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:12537611b187c325f732a1c57df8d48a27b9201d8cc511ed058118b5306856a1
+oid sha256:aeecb7ce5c206a6e0864069a17767c8a6414e150ffc8ff984df3aa77cf4ce3b3
 size 177908741
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:93da98f41e985fca73c3baca6a02ba03cfcce6986c7abd33cb201195e5fbab5c
+oid sha256:6d6de30ce9af6a5ba1e64753f4acb301298a2e376005d9767c3d60d31cb45d5b
 size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da0b998422a3dc253ae0972fd9207eebf2190589880dd54501b58c1760fdda21
+oid sha256:ad7bb275e743cbc804d55f372622f5470a926848e615f7806e17e210a72f38ef
 size 1465
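optimizer.pt, scheduler.pt and rng_state.pth carry the optimizer, learning-rate-scheduler and RNG state needed to continue training deterministically from step 100. A hedged sketch of resuming from this checkpoint directory with transformers.Trainer; the model, dataset and argument setup are assumed and not part of this commit.

```python
# Sketch only: Trainer reloads optimizer.pt, scheduler.pt, rng_state.pth and
# trainer_state.json when pointed at the checkpoint directory.
from transformers import Trainer

def resume_from_last_checkpoint(trainer: Trainer, checkpoint_dir: str = "last-checkpoint"):
    # trainer is assumed to be built with the same model/args/dataset as the original run
    return trainer.train(resume_from_checkpoint=checkpoint_dir)
```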
last-checkpoint/trainer_state.json CHANGED
@@ -2,156 +2,191 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.01990049751243781,
+  "epoch": 0.09950248756218906,
   "eval_steps": 500,
-  "global_step": 20,
+  "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0009950248756218905,
-      "grad_norm": 1.1177501678466797,
-      "learning_rate": 0.0,
-      "loss": 1.9834,
-      "step": 1
+      "epoch": 0.003980099502487562,
+      "grad_norm": 1.1235613822937012,
+      "learning_rate": 1.785e-07,
+      "loss": 1.899,
+      "step": 4
     },
     {
-      "epoch": 0.001990049751243781,
-      "grad_norm": 0.9374369382858276,
-      "learning_rate": 7e-07,
-      "loss": 1.7952,
-      "step": 2
+      "epoch": 0.007960199004975124,
+      "grad_norm": 1.061541199684143,
+      "learning_rate": 4.165e-07,
+      "loss": 1.8765,
+      "step": 8
     },
     {
-      "epoch": 0.0029850746268656717,
-      "grad_norm": 1.0108283758163452,
-      "learning_rate": 1.4e-06,
-      "loss": 1.9963,
-      "step": 3
+      "epoch": 0.011940298507462687,
+      "grad_norm": 0.7362794876098633,
+      "learning_rate": 6.544999999999999e-07,
+      "loss": 1.8281,
+      "step": 12
     },
     {
-      "epoch": 0.003980099502487562,
-      "grad_norm": 0.9044333696365356,
-      "learning_rate": 2.1e-06,
-      "loss": 1.7992,
-      "step": 4
+      "epoch": 0.015920398009950248,
+      "grad_norm": 0.8780789971351624,
+      "learning_rate": 8.924999999999999e-07,
+      "loss": 1.8408,
+      "step": 16
     },
     {
-      "epoch": 0.004975124378109453,
-      "grad_norm": 0.8546856045722961,
-      "learning_rate": 2.8e-06,
-      "loss": 2.0496,
-      "step": 5
+      "epoch": 0.01990049751243781,
+      "grad_norm": 0.705963671207428,
+      "learning_rate": 1.1305e-06,
+      "loss": 1.7984,
+      "step": 20
     },
     {
-      "epoch": 0.005970149253731343,
-      "grad_norm": 0.8030872941017151,
-      "learning_rate": 3.5e-06,
-      "loss": 1.9614,
-      "step": 6
+      "epoch": 0.023880597014925373,
+      "grad_norm": 0.6901829838752747,
+      "learning_rate": 1.3685000000000001e-06,
+      "loss": 1.6908,
+      "step": 24
     },
     {
-      "epoch": 0.006965174129353234,
-      "grad_norm": 0.8157246708869934,
-      "learning_rate": 4.2e-06,
-      "loss": 1.832,
-      "step": 7
+      "epoch": 0.027860696517412936,
+      "grad_norm": 0.7429279685020447,
+      "learning_rate": 1.6065e-06,
+      "loss": 1.7849,
+      "step": 28
     },
     {
-      "epoch": 0.007960199004975124,
-      "grad_norm": 0.8207484483718872,
-      "learning_rate": 4.9e-06,
-      "loss": 1.8791,
-      "step": 8
+      "epoch": 0.031840796019900496,
+      "grad_norm": 0.7095203995704651,
+      "learning_rate": 1.8444999999999999e-06,
+      "loss": 1.6476,
+      "step": 32
     },
     {
-      "epoch": 0.008955223880597015,
-      "grad_norm": 0.9034633636474609,
-      "learning_rate": 5.6e-06,
-      "loss": 2.0136,
-      "step": 9
+      "epoch": 0.03582089552238806,
+      "grad_norm": 0.6006342172622681,
+      "learning_rate": 2.0825e-06,
+      "loss": 1.68,
+      "step": 36
     },
     {
-      "epoch": 0.009950248756218905,
-      "grad_norm": 0.6087605953216553,
-      "learning_rate": 6.299999999999999e-06,
-      "loss": 1.593,
-      "step": 10
+      "epoch": 0.03980099502487562,
+      "grad_norm": 0.49123871326446533,
+      "learning_rate": 2.3205e-06,
+      "loss": 1.5384,
+      "step": 40
     },
     {
-      "epoch": 0.010945273631840797,
-      "grad_norm": 0.5611479878425598,
-      "learning_rate": 7e-06,
-      "loss": 1.6113,
-      "step": 11
+      "epoch": 0.04378109452736319,
+      "grad_norm": 0.44953885674476624,
+      "learning_rate": 2.5584999999999997e-06,
+      "loss": 1.5843,
+      "step": 44
     },
     {
-      "epoch": 0.011940298507462687,
-      "grad_norm": 0.46190792322158813,
-      "learning_rate": 7.699999999999999e-06,
-      "loss": 1.5577,
-      "step": 12
+      "epoch": 0.04776119402985075,
+      "grad_norm": 0.45015600323677063,
+      "learning_rate": 2.7965e-06,
+      "loss": 1.5667,
+      "step": 48
     },
     {
-      "epoch": 0.012935323383084577,
-      "grad_norm": 0.4395303726196289,
-      "learning_rate": 8.4e-06,
-      "loss": 1.5148,
-      "step": 13
+      "epoch": 0.051741293532338306,
+      "grad_norm": 0.4326404333114624,
+      "learning_rate": 3.0345e-06,
+      "loss": 1.6001,
+      "step": 52
     },
     {
-      "epoch": 0.013930348258706468,
-      "grad_norm": 0.4437845051288605,
-      "learning_rate": 9.1e-06,
-      "loss": 1.5968,
-      "step": 14
+      "epoch": 0.05572139303482587,
+      "grad_norm": 0.2951863408088684,
+      "learning_rate": 3.2725e-06,
+      "loss": 1.6729,
+      "step": 56
     },
     {
-      "epoch": 0.014925373134328358,
-      "grad_norm": 0.5449181199073792,
-      "learning_rate": 9.8e-06,
-      "loss": 1.7633,
-      "step": 15
+      "epoch": 0.05970149253731343,
+      "grad_norm": 0.23001885414123535,
+      "learning_rate": 3.5104999999999997e-06,
+      "loss": 1.4787,
+      "step": 60
     },
     {
-      "epoch": 0.015920398009950248,
-      "grad_norm": 0.5540159344673157,
-      "learning_rate": 1.05e-05,
-      "loss": 1.5104,
-      "step": 16
+      "epoch": 0.06368159203980099,
+      "grad_norm": 0.29933151602745056,
+      "learning_rate": 3.7484999999999998e-06,
+      "loss": 1.4886,
+      "step": 64
     },
     {
-      "epoch": 0.01691542288557214,
-      "grad_norm": 0.418242484331131,
-      "learning_rate": 1.12e-05,
-      "loss": 1.5135,
-      "step": 17
+      "epoch": 0.06766169154228856,
+      "grad_norm": 0.3464262783527374,
+      "learning_rate": 3.9865e-06,
+      "loss": 1.4839,
+      "step": 68
     },
     {
-      "epoch": 0.01791044776119403,
-      "grad_norm": 0.29859286546707153,
-      "learning_rate": 1.19e-05,
-      "loss": 1.3894,
-      "step": 18
+      "epoch": 0.07164179104477612,
+      "grad_norm": 0.18549823760986328,
+      "learning_rate": 4.2245e-06,
+      "loss": 1.5965,
+      "step": 72
     },
     {
-      "epoch": 0.01890547263681592,
-      "grad_norm": 0.36172112822532654,
-      "learning_rate": 1.2599999999999998e-05,
-      "loss": 1.4343,
-      "step": 19
+      "epoch": 0.07562189054726368,
+      "grad_norm": 0.1957472413778305,
+      "learning_rate": 4.4625e-06,
+      "loss": 1.4596,
+      "step": 76
     },
     {
-      "epoch": 0.01990049751243781,
-      "grad_norm": 0.4372071325778961,
-      "learning_rate": 1.33e-05,
-      "loss": 1.633,
-      "step": 20
+      "epoch": 0.07960199004975124,
+      "grad_norm": 0.19810840487480164,
+      "learning_rate": 4.7005e-06,
+      "loss": 1.4978,
+      "step": 80
+    },
+    {
+      "epoch": 0.08358208955223881,
+      "grad_norm": 0.19601857662200928,
+      "learning_rate": 4.938499999999999e-06,
+      "loss": 1.5918,
+      "step": 84
+    },
+    {
+      "epoch": 0.08756218905472637,
+      "grad_norm": 0.21324580907821655,
+      "learning_rate": 5.1764999999999995e-06,
+      "loss": 1.4612,
+      "step": 88
+    },
+    {
+      "epoch": 0.09154228855721393,
+      "grad_norm": 0.18543551862239838,
+      "learning_rate": 5.4144999999999996e-06,
+      "loss": 1.5037,
+      "step": 92
+    },
+    {
+      "epoch": 0.0955223880597015,
+      "grad_norm": 0.20870834589004517,
+      "learning_rate": 5.6525e-06,
+      "loss": 1.4631,
+      "step": 96
+    },
+    {
+      "epoch": 0.09950248756218906,
+      "grad_norm": 0.1726507693529129,
+      "learning_rate": 5.8905e-06,
+      "loss": 1.4945,
+      "step": 100
     }
   ],
-  "logging_steps": 1,
-  "max_steps": 20,
+  "logging_steps": 4,
+  "max_steps": 972,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 100,
@@ -162,12 +197,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop": true
+        "should_training_stop": false
       },
       "attributes": {}
     }
   },
-  "total_flos": 1.4722461264248832e+16,
+  "total_flos": 7.572895410683904e+16,
   "train_batch_size": 24,
   "trial_name": null,
   "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2e93fc2110e24c8c1cd5be545899591697cd37b015c7526d6a4204695dbcf135
+oid sha256:6954ea297f3f47bf7e65177726af21c61a398f05a898f3f8ce402fc696622cfa
 size 7697
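training_args.bin is a pickled TrainingArguments object rather than a tensor file, so its oid changes whenever any training hyperparameter changes. One way to inspect it is sketched below; weights_only=False is needed because the file holds a pickled Python object, so only load files from a source you trust.

```python
# Sketch only: dump the hyperparameters stored in the checkpoint's training_args.bin.
import torch

training_args = torch.load("last-checkpoint/training_args.bin", weights_only=False)
print(training_args)  # full TrainingArguments, including learning rate and scheduler type
```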