ajikadev committed on
Commit
4bbe7a2
·
verified ·
1 Parent(s): c1b2b3c

End of training

Browse files
Files changed (5) hide show
  1. README.md +5 -3
  2. all_results.json +11 -11
  3. eval_results.json +6 -6
  4. train_results.json +6 -6
  5. trainer_state.json +581 -121
README.md CHANGED
@@ -2,6 +2,8 @@
2
  library_name: transformers
3
  base_model: microsoft/wavlm-base
4
  tags:
 
 
5
  - generated_from_trainer
6
  metrics:
7
  - wer
@@ -15,10 +17,10 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # wavlm-salt-eng
17
 
18
- This model is a fine-tuned version of [microsoft/wavlm-base](https://huggingface.co/microsoft/wavlm-base) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.2243
21
- - Wer: 0.2204
22
 
23
  ## Model description
24
 
 
2
  library_name: transformers
3
  base_model: microsoft/wavlm-base
4
  tags:
5
+ - automatic-speech-recognition
6
+ - Sunbird/salt
7
  - generated_from_trainer
8
  metrics:
9
  - wer
 
17
 
18
  # wavlm-salt-eng
19
 
20
+ This model is a fine-tuned version of [microsoft/wavlm-base](https://huggingface.co/microsoft/wavlm-base) on the SUNBIRD/SALT - MULTISPEAKER-ENG dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.2244
23
+ - Wer: 0.2118
24
 
25
  ## Model description
26
 
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 33.22296173044925,
3
- "eval_loss": 0.20475824177265167,
4
- "eval_runtime": 5.1372,
5
  "eval_samples": 101,
6
- "eval_samples_per_second": 19.66,
7
- "eval_steps_per_second": 2.531,
8
- "eval_wer": 0.23010752688172043,
9
- "total_flos": 6.831383494195675e+18,
10
- "train_loss": 0.2052539484024048,
11
- "train_runtime": 9327.6943,
12
  "train_samples": 4804,
13
- "train_samples_per_second": 17.153,
14
- "train_steps_per_second": 1.072
15
  }
 
1
  {
2
+ "epoch": 99.66888519134775,
3
+ "eval_loss": 0.22443878650665283,
4
+ "eval_runtime": 5.3058,
5
  "eval_samples": 101,
6
+ "eval_samples_per_second": 19.036,
7
+ "eval_steps_per_second": 2.45,
8
+ "eval_wer": 0.2118279569892473,
9
+ "total_flos": 2.0486046325976072e+19,
10
+ "train_loss": 0.08741244434913,
11
+ "train_runtime": 29833.4209,
12
  "train_samples": 4804,
13
+ "train_samples_per_second": 16.089,
14
+ "train_steps_per_second": 1.006
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 33.22296173044925,
3
- "eval_loss": 0.20475824177265167,
4
- "eval_runtime": 5.1372,
5
  "eval_samples": 101,
6
- "eval_samples_per_second": 19.66,
7
- "eval_steps_per_second": 2.531,
8
- "eval_wer": 0.23010752688172043
9
  }
 
1
  {
2
+ "epoch": 99.66888519134775,
3
+ "eval_loss": 0.22443878650665283,
4
+ "eval_runtime": 5.3058,
5
  "eval_samples": 101,
6
+ "eval_samples_per_second": 19.036,
7
+ "eval_steps_per_second": 2.45,
8
+ "eval_wer": 0.2118279569892473
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 33.22296173044925,
3
- "total_flos": 6.831383494195675e+18,
4
- "train_loss": 0.2052539484024048,
5
- "train_runtime": 9327.6943,
6
  "train_samples": 4804,
7
- "train_samples_per_second": 17.153,
8
- "train_steps_per_second": 1.072
9
  }
 
1
  {
2
+ "epoch": 99.66888519134775,
3
+ "total_flos": 2.0486046325976072e+19,
4
+ "train_loss": 0.08741244434913,
5
+ "train_runtime": 29833.4209,
6
  "train_samples": 4804,
7
+ "train_samples_per_second": 16.089,
8
+ "train_steps_per_second": 1.006
9
  }
trainer_state.json CHANGED
@@ -2,257 +2,717 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 33.22296173044925,
6
  "eval_steps": 1000,
7
- "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.6622296173044924,
14
- "grad_norm": 2.9154744148254395,
15
  "learning_rate": 0.00029939999999999996,
16
- "loss": 2.5693,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 3.32279534109817,
21
- "grad_norm": 2.086198091506958,
22
- "learning_rate": 0.00028424210526315787,
23
- "loss": 0.3806,
24
  "step": 1000
25
  },
26
  {
27
  "epoch": 3.32279534109817,
28
- "eval_loss": 0.3095531761646271,
29
- "eval_runtime": 5.0217,
30
- "eval_samples_per_second": 20.113,
31
- "eval_steps_per_second": 2.589,
32
- "eval_wer": 0.3956989247311828,
33
  "step": 1000
34
  },
35
  {
36
  "epoch": 4.985024958402662,
37
- "grad_norm": 3.3510117530822754,
38
- "learning_rate": 0.00026845263157894737,
39
- "loss": 0.2254,
40
  "step": 1500
41
  },
42
  {
43
  "epoch": 6.64559068219634,
44
- "grad_norm": 2.1008408069610596,
45
- "learning_rate": 0.0002526631578947368,
46
- "loss": 0.156,
47
  "step": 2000
48
  },
49
  {
50
  "epoch": 6.64559068219634,
51
- "eval_loss": 0.27666544914245605,
52
- "eval_runtime": 5.0002,
53
- "eval_samples_per_second": 20.199,
54
- "eval_steps_per_second": 2.6,
55
- "eval_wer": 0.3247311827956989,
56
  "step": 2000
57
  },
58
  {
59
  "epoch": 8.306156405990016,
60
- "grad_norm": 2.587332010269165,
61
- "learning_rate": 0.00023687368421052628,
62
- "loss": 0.1241,
63
  "step": 2500
64
  },
65
  {
66
  "epoch": 9.96838602329451,
67
- "grad_norm": 0.9522386789321899,
68
- "learning_rate": 0.00022108421052631578,
69
- "loss": 0.1015,
70
  "step": 3000
71
  },
72
  {
73
  "epoch": 9.96838602329451,
74
- "eval_loss": 0.28304827213287354,
75
- "eval_runtime": 4.972,
76
- "eval_samples_per_second": 20.314,
77
- "eval_steps_per_second": 2.615,
78
- "eval_wer": 0.2838709677419355,
79
  "step": 3000
80
  },
81
  {
82
  "epoch": 11.628951747088186,
83
- "grad_norm": 1.9689347743988037,
84
- "learning_rate": 0.00020529473684210525,
85
- "loss": 0.0853,
86
  "step": 3500
87
  },
88
  {
89
  "epoch": 13.289517470881863,
90
- "grad_norm": 1.2812440395355225,
91
- "learning_rate": 0.0001895052631578947,
92
- "loss": 0.0755,
93
  "step": 4000
94
  },
95
  {
96
  "epoch": 13.289517470881863,
97
- "eval_loss": 0.27407756447792053,
98
- "eval_runtime": 4.9737,
99
- "eval_samples_per_second": 20.307,
100
- "eval_steps_per_second": 2.614,
101
- "eval_wer": 0.289247311827957,
102
  "step": 4000
103
  },
104
  {
105
  "epoch": 14.951747088186355,
106
- "grad_norm": 0.7250840067863464,
107
- "learning_rate": 0.0001737157894736842,
108
- "loss": 0.0641,
109
  "step": 4500
110
  },
111
  {
112
  "epoch": 16.612312811980033,
113
- "grad_norm": 0.5321822166442871,
114
- "learning_rate": 0.00015792631578947366,
115
- "loss": 0.0567,
116
  "step": 5000
117
  },
118
  {
119
  "epoch": 16.612312811980033,
120
- "eval_loss": 0.20905862748622894,
121
- "eval_runtime": 4.9679,
122
- "eval_samples_per_second": 20.331,
123
- "eval_steps_per_second": 2.617,
124
- "eval_wer": 0.24946236559139784,
125
  "step": 5000
126
  },
127
  {
128
  "epoch": 18.27287853577371,
129
- "grad_norm": 0.936040997505188,
130
- "learning_rate": 0.00014213684210526316,
131
- "loss": 0.0463,
132
  "step": 5500
133
  },
134
  {
135
  "epoch": 19.935108153078204,
136
- "grad_norm": 1.0212537050247192,
137
- "learning_rate": 0.00012634736842105263,
138
- "loss": 0.0386,
139
  "step": 6000
140
  },
141
  {
142
  "epoch": 19.935108153078204,
143
- "eval_loss": 0.22282364964485168,
144
- "eval_runtime": 4.9552,
145
- "eval_samples_per_second": 20.382,
146
- "eval_steps_per_second": 2.623,
147
- "eval_wer": 0.24731182795698925,
148
  "step": 6000
149
  },
150
  {
151
  "epoch": 21.59567387687188,
152
- "grad_norm": 0.5801821947097778,
153
- "learning_rate": 0.0001105578947368421,
154
- "loss": 0.0346,
155
  "step": 6500
156
  },
157
  {
158
  "epoch": 23.25623960066556,
159
- "grad_norm": 1.4400817155838013,
160
- "learning_rate": 9.476842105263157e-05,
161
- "loss": 0.0316,
162
  "step": 7000
163
  },
164
  {
165
  "epoch": 23.25623960066556,
166
- "eval_loss": 0.22444939613342285,
167
- "eval_runtime": 5.0325,
168
- "eval_samples_per_second": 20.069,
169
- "eval_steps_per_second": 2.583,
170
- "eval_wer": 0.26881720430107525,
171
  "step": 7000
172
  },
173
  {
174
  "epoch": 24.91846921797005,
175
- "grad_norm": 1.2605141401290894,
176
- "learning_rate": 7.897894736842106e-05,
177
- "loss": 0.0262,
178
  "step": 7500
179
  },
180
  {
181
  "epoch": 26.579034941763727,
182
- "grad_norm": 0.4129526615142822,
183
- "learning_rate": 6.318947368421052e-05,
184
- "loss": 0.0233,
185
  "step": 8000
186
  },
187
  {
188
  "epoch": 26.579034941763727,
189
- "eval_loss": 0.21599678695201874,
190
- "eval_runtime": 4.9966,
191
- "eval_samples_per_second": 20.214,
192
- "eval_steps_per_second": 2.602,
193
- "eval_wer": 0.23010752688172043,
194
  "step": 8000
195
  },
196
  {
197
  "epoch": 28.239600665557404,
198
- "grad_norm": 0.13726775348186493,
199
- "learning_rate": 4.7399999999999993e-05,
200
- "loss": 0.0187,
201
  "step": 8500
202
  },
203
  {
204
  "epoch": 29.901830282861898,
205
- "grad_norm": 0.28249457478523254,
206
- "learning_rate": 3.161052631578947e-05,
207
- "loss": 0.0186,
208
  "step": 9000
209
  },
210
  {
211
  "epoch": 29.901830282861898,
212
- "eval_loss": 0.2085915356874466,
213
- "eval_runtime": 4.9628,
214
- "eval_samples_per_second": 20.351,
215
- "eval_steps_per_second": 2.619,
216
- "eval_wer": 0.23225806451612904,
217
  "step": 9000
218
  },
219
  {
220
  "epoch": 31.562396006655575,
221
- "grad_norm": 0.8073873519897461,
222
- "learning_rate": 1.5821052631578945e-05,
223
- "loss": 0.0151,
224
  "step": 9500
225
  },
226
  {
227
  "epoch": 33.22296173044925,
228
- "grad_norm": 0.1773974597454071,
229
- "learning_rate": 3.157894736842105e-08,
230
- "loss": 0.0137,
231
  "step": 10000
232
  },
233
  {
234
  "epoch": 33.22296173044925,
235
- "eval_loss": 0.20346209406852722,
236
- "eval_runtime": 4.9853,
237
- "eval_samples_per_second": 20.259,
238
- "eval_steps_per_second": 2.608,
239
- "eval_wer": 0.23118279569892472,
240
  "step": 10000
241
  },
242
  {
243
- "epoch": 33.22296173044925,
244
- "step": 10000,
245
- "total_flos": 6.831383494195675e+18,
246
- "train_loss": 0.2052539484024048,
247
- "train_runtime": 9327.6943,
248
- "train_samples_per_second": 17.153,
249
- "train_steps_per_second": 1.072
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  }
251
  ],
252
  "logging_steps": 500,
253
- "max_steps": 10000,
254
  "num_input_tokens_seen": 0,
255
- "num_train_epochs": 34,
256
  "save_steps": 1000,
257
  "stateful_callbacks": {
258
  "TrainerControl": {
@@ -266,7 +726,7 @@
266
  "attributes": {}
267
  }
268
  },
269
- "total_flos": 6.831383494195675e+18,
270
  "train_batch_size": 8,
271
  "trial_name": null,
272
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 99.66888519134775,
6
  "eval_steps": 1000,
7
+ "global_step": 30000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 1.6622296173044924,
14
+ "grad_norm": 3.129899024963379,
15
  "learning_rate": 0.00029939999999999996,
16
+ "loss": 2.5636,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 3.32279534109817,
21
+ "grad_norm": 1.6870653629302979,
22
+ "learning_rate": 0.0002949254237288135,
23
+ "loss": 0.3803,
24
  "step": 1000
25
  },
26
  {
27
  "epoch": 3.32279534109817,
28
+ "eval_loss": 0.3107321560382843,
29
+ "eval_runtime": 5.3302,
30
+ "eval_samples_per_second": 18.949,
31
+ "eval_steps_per_second": 2.439,
32
+ "eval_wer": 0.36236559139784946,
33
  "step": 1000
34
  },
35
  {
36
  "epoch": 4.985024958402662,
37
+ "grad_norm": 4.024764537811279,
38
+ "learning_rate": 0.00028984067796610164,
39
+ "loss": 0.2315,
40
  "step": 1500
41
  },
42
  {
43
  "epoch": 6.64559068219634,
44
+ "grad_norm": 2.152501106262207,
45
+ "learning_rate": 0.0002847559322033898,
46
+ "loss": 0.1663,
47
  "step": 2000
48
  },
49
  {
50
  "epoch": 6.64559068219634,
51
+ "eval_loss": 0.2739505469799042,
52
+ "eval_runtime": 5.2893,
53
+ "eval_samples_per_second": 19.095,
54
+ "eval_steps_per_second": 2.458,
55
+ "eval_wer": 0.3096774193548387,
56
  "step": 2000
57
  },
58
  {
59
  "epoch": 8.306156405990016,
60
+ "grad_norm": 1.6033236980438232,
61
+ "learning_rate": 0.00027967118644067795,
62
+ "loss": 0.1381,
63
  "step": 2500
64
  },
65
  {
66
  "epoch": 9.96838602329451,
67
+ "grad_norm": 1.2841393947601318,
68
+ "learning_rate": 0.00027458644067796607,
69
+ "loss": 0.1206,
70
  "step": 3000
71
  },
72
  {
73
  "epoch": 9.96838602329451,
74
+ "eval_loss": 0.24860869348049164,
75
+ "eval_runtime": 5.3415,
76
+ "eval_samples_per_second": 18.909,
77
+ "eval_steps_per_second": 2.434,
78
+ "eval_wer": 0.2903225806451613,
79
  "step": 3000
80
  },
81
  {
82
  "epoch": 11.628951747088186,
83
+ "grad_norm": 1.436146855354309,
84
+ "learning_rate": 0.0002695016949152542,
85
+ "loss": 0.1053,
86
  "step": 3500
87
  },
88
  {
89
  "epoch": 13.289517470881863,
90
+ "grad_norm": 0.8757719397544861,
91
+ "learning_rate": 0.0002644169491525423,
92
+ "loss": 0.0938,
93
  "step": 4000
94
  },
95
  {
96
  "epoch": 13.289517470881863,
97
+ "eval_loss": 0.25879770517349243,
98
+ "eval_runtime": 5.2643,
99
+ "eval_samples_per_second": 19.186,
100
+ "eval_steps_per_second": 2.469,
101
+ "eval_wer": 0.28279569892473116,
102
  "step": 4000
103
  },
104
  {
105
  "epoch": 14.951747088186355,
106
+ "grad_norm": 0.9832671284675598,
107
+ "learning_rate": 0.0002593322033898305,
108
+ "loss": 0.0872,
109
  "step": 4500
110
  },
111
  {
112
  "epoch": 16.612312811980033,
113
+ "grad_norm": 0.9009350538253784,
114
+ "learning_rate": 0.00025424745762711863,
115
+ "loss": 0.0816,
116
  "step": 5000
117
  },
118
  {
119
  "epoch": 16.612312811980033,
120
+ "eval_loss": 0.27693644165992737,
121
+ "eval_runtime": 5.2922,
122
+ "eval_samples_per_second": 19.085,
123
+ "eval_steps_per_second": 2.456,
124
+ "eval_wer": 0.278494623655914,
125
  "step": 5000
126
  },
127
  {
128
  "epoch": 18.27287853577371,
129
+ "grad_norm": 1.4376908540725708,
130
+ "learning_rate": 0.00024916271186440676,
131
+ "loss": 0.0756,
132
  "step": 5500
133
  },
134
  {
135
  "epoch": 19.935108153078204,
136
+ "grad_norm": 0.4128202795982361,
137
+ "learning_rate": 0.0002440779661016949,
138
+ "loss": 0.0689,
139
  "step": 6000
140
  },
141
  {
142
  "epoch": 19.935108153078204,
143
+ "eval_loss": 0.24573881924152374,
144
+ "eval_runtime": 5.2488,
145
+ "eval_samples_per_second": 19.243,
146
+ "eval_steps_per_second": 2.477,
147
+ "eval_wer": 0.28817204301075267,
148
  "step": 6000
149
  },
150
  {
151
  "epoch": 21.59567387687188,
152
+ "grad_norm": 0.9727014899253845,
153
+ "learning_rate": 0.00023899322033898301,
154
+ "loss": 0.0634,
155
  "step": 6500
156
  },
157
  {
158
  "epoch": 23.25623960066556,
159
+ "grad_norm": 0.9723252058029175,
160
+ "learning_rate": 0.00023390847457627117,
161
+ "loss": 0.0642,
162
  "step": 7000
163
  },
164
  {
165
  "epoch": 23.25623960066556,
166
+ "eval_loss": 0.26387491822242737,
167
+ "eval_runtime": 5.2852,
168
+ "eval_samples_per_second": 19.11,
169
+ "eval_steps_per_second": 2.46,
170
+ "eval_wer": 0.2913978494623656,
171
  "step": 7000
172
  },
173
  {
174
  "epoch": 24.91846921797005,
175
+ "grad_norm": 0.8104386329650879,
176
+ "learning_rate": 0.00022882372881355932,
177
+ "loss": 0.0586,
178
  "step": 7500
179
  },
180
  {
181
  "epoch": 26.579034941763727,
182
+ "grad_norm": 0.31280040740966797,
183
+ "learning_rate": 0.00022373898305084742,
184
+ "loss": 0.0566,
185
  "step": 8000
186
  },
187
  {
188
  "epoch": 26.579034941763727,
189
+ "eval_loss": 0.2954213619232178,
190
+ "eval_runtime": 5.2555,
191
+ "eval_samples_per_second": 19.218,
192
+ "eval_steps_per_second": 2.474,
193
+ "eval_wer": 0.28279569892473116,
194
  "step": 8000
195
  },
196
  {
197
  "epoch": 28.239600665557404,
198
+ "grad_norm": 0.31530410051345825,
199
+ "learning_rate": 0.00021865423728813558,
200
+ "loss": 0.0509,
201
  "step": 8500
202
  },
203
  {
204
  "epoch": 29.901830282861898,
205
+ "grad_norm": 0.7447624802589417,
206
+ "learning_rate": 0.0002135694915254237,
207
+ "loss": 0.049,
208
  "step": 9000
209
  },
210
  {
211
  "epoch": 29.901830282861898,
212
+ "eval_loss": 0.31719881296157837,
213
+ "eval_runtime": 5.2417,
214
+ "eval_samples_per_second": 19.268,
215
+ "eval_steps_per_second": 2.48,
216
+ "eval_wer": 0.2763440860215054,
217
  "step": 9000
218
  },
219
  {
220
  "epoch": 31.562396006655575,
221
+ "grad_norm": 0.7232189774513245,
222
+ "learning_rate": 0.00020848474576271186,
223
+ "loss": 0.0464,
224
  "step": 9500
225
  },
226
  {
227
  "epoch": 33.22296173044925,
228
+ "grad_norm": 0.7363786101341248,
229
+ "learning_rate": 0.00020339999999999998,
230
+ "loss": 0.0454,
231
  "step": 10000
232
  },
233
  {
234
  "epoch": 33.22296173044925,
235
+ "eval_loss": 0.31861352920532227,
236
+ "eval_runtime": 5.2622,
237
+ "eval_samples_per_second": 19.194,
238
+ "eval_steps_per_second": 2.47,
239
+ "eval_wer": 0.28279569892473116,
240
  "step": 10000
241
  },
242
  {
243
+ "epoch": 34.88519134775375,
244
+ "grad_norm": 1.2368154525756836,
245
+ "learning_rate": 0.0001983152542372881,
246
+ "loss": 0.0419,
247
+ "step": 10500
248
+ },
249
+ {
250
+ "epoch": 36.54575707154742,
251
+ "grad_norm": 0.20310941338539124,
252
+ "learning_rate": 0.00019323050847457626,
253
+ "loss": 0.0395,
254
+ "step": 11000
255
+ },
256
+ {
257
+ "epoch": 36.54575707154742,
258
+ "eval_loss": 0.27824845910072327,
259
+ "eval_runtime": 5.2266,
260
+ "eval_samples_per_second": 19.324,
261
+ "eval_steps_per_second": 2.487,
262
+ "eval_wer": 0.2817204301075269,
263
+ "step": 11000
264
+ },
265
+ {
266
+ "epoch": 38.2063227953411,
267
+ "grad_norm": 0.7990397214889526,
268
+ "learning_rate": 0.0001881457627118644,
269
+ "loss": 0.0379,
270
+ "step": 11500
271
+ },
272
+ {
273
+ "epoch": 39.86855241264559,
274
+ "grad_norm": 1.0379022359848022,
275
+ "learning_rate": 0.00018306101694915252,
276
+ "loss": 0.0389,
277
+ "step": 12000
278
+ },
279
+ {
280
+ "epoch": 39.86855241264559,
281
+ "eval_loss": 0.28572770953178406,
282
+ "eval_runtime": 5.2471,
283
+ "eval_samples_per_second": 19.249,
284
+ "eval_steps_per_second": 2.478,
285
+ "eval_wer": 0.28279569892473116,
286
+ "step": 12000
287
+ },
288
+ {
289
+ "epoch": 41.529118136439266,
290
+ "grad_norm": 0.42599430680274963,
291
+ "learning_rate": 0.00017797627118644067,
292
+ "loss": 0.0338,
293
+ "step": 12500
294
+ },
295
+ {
296
+ "epoch": 43.18968386023295,
297
+ "grad_norm": 0.8438450694084167,
298
+ "learning_rate": 0.0001728915254237288,
299
+ "loss": 0.0321,
300
+ "step": 13000
301
+ },
302
+ {
303
+ "epoch": 43.18968386023295,
304
+ "eval_loss": 0.26923489570617676,
305
+ "eval_runtime": 5.1894,
306
+ "eval_samples_per_second": 19.463,
307
+ "eval_steps_per_second": 2.505,
308
+ "eval_wer": 0.25268817204301075,
309
+ "step": 13000
310
+ },
311
+ {
312
+ "epoch": 44.85191347753744,
313
+ "grad_norm": 0.27244409918785095,
314
+ "learning_rate": 0.00016780677966101695,
315
+ "loss": 0.0307,
316
+ "step": 13500
317
+ },
318
+ {
319
+ "epoch": 46.51247920133112,
320
+ "grad_norm": 0.5336557626724243,
321
+ "learning_rate": 0.00016272203389830505,
322
+ "loss": 0.0282,
323
+ "step": 14000
324
+ },
325
+ {
326
+ "epoch": 46.51247920133112,
327
+ "eval_loss": 0.2570391595363617,
328
+ "eval_runtime": 5.2068,
329
+ "eval_samples_per_second": 19.398,
330
+ "eval_steps_per_second": 2.497,
331
+ "eval_wer": 0.25591397849462366,
332
+ "step": 14000
333
+ },
334
+ {
335
+ "epoch": 48.17304492512479,
336
+ "grad_norm": 0.5201185941696167,
337
+ "learning_rate": 0.0001576372881355932,
338
+ "loss": 0.0276,
339
+ "step": 14500
340
+ },
341
+ {
342
+ "epoch": 49.83527454242928,
343
+ "grad_norm": 0.42062297463417053,
344
+ "learning_rate": 0.00015255254237288136,
345
+ "loss": 0.0269,
346
+ "step": 15000
347
+ },
348
+ {
349
+ "epoch": 49.83527454242928,
350
+ "eval_loss": 0.24461327493190765,
351
+ "eval_runtime": 5.222,
352
+ "eval_samples_per_second": 19.341,
353
+ "eval_steps_per_second": 2.489,
354
+ "eval_wer": 0.2623655913978495,
355
+ "step": 15000
356
+ },
357
+ {
358
+ "epoch": 51.49584026622296,
359
+ "grad_norm": 1.2455600500106812,
360
+ "learning_rate": 0.0001474677966101695,
361
+ "loss": 0.0253,
362
+ "step": 15500
363
+ },
364
+ {
365
+ "epoch": 53.15640599001664,
366
+ "grad_norm": 0.5616517066955566,
367
+ "learning_rate": 0.00014238305084745761,
368
+ "loss": 0.0233,
369
+ "step": 16000
370
+ },
371
+ {
372
+ "epoch": 53.15640599001664,
373
+ "eval_loss": 0.23834320902824402,
374
+ "eval_runtime": 5.2763,
375
+ "eval_samples_per_second": 19.142,
376
+ "eval_steps_per_second": 2.464,
377
+ "eval_wer": 0.24731182795698925,
378
+ "step": 16000
379
+ },
380
+ {
381
+ "epoch": 54.818635607321134,
382
+ "grad_norm": 0.6374333500862122,
383
+ "learning_rate": 0.00013729830508474577,
384
+ "loss": 0.023,
385
+ "step": 16500
386
+ },
387
+ {
388
+ "epoch": 56.47920133111481,
389
+ "grad_norm": 0.599651575088501,
390
+ "learning_rate": 0.0001322135593220339,
391
+ "loss": 0.0224,
392
+ "step": 17000
393
+ },
394
+ {
395
+ "epoch": 56.47920133111481,
396
+ "eval_loss": 0.28050878643989563,
397
+ "eval_runtime": 5.2647,
398
+ "eval_samples_per_second": 19.185,
399
+ "eval_steps_per_second": 2.469,
400
+ "eval_wer": 0.24731182795698925,
401
+ "step": 17000
402
+ },
403
+ {
404
+ "epoch": 58.13976705490849,
405
+ "grad_norm": 0.5658329129219055,
406
+ "learning_rate": 0.00012712881355932202,
407
+ "loss": 0.0202,
408
+ "step": 17500
409
+ },
410
+ {
411
+ "epoch": 59.80199667221298,
412
+ "grad_norm": 0.1811748892068863,
413
+ "learning_rate": 0.00012204406779661016,
414
+ "loss": 0.0198,
415
+ "step": 18000
416
+ },
417
+ {
418
+ "epoch": 59.80199667221298,
419
+ "eval_loss": 0.25546789169311523,
420
+ "eval_runtime": 5.2627,
421
+ "eval_samples_per_second": 19.192,
422
+ "eval_steps_per_second": 2.47,
423
+ "eval_wer": 0.25161290322580643,
424
+ "step": 18000
425
+ },
426
+ {
427
+ "epoch": 61.46256239600665,
428
+ "grad_norm": 0.3274936378002167,
429
+ "learning_rate": 0.00011695932203389829,
430
+ "loss": 0.0179,
431
+ "step": 18500
432
+ },
433
+ {
434
+ "epoch": 63.123128119800334,
435
+ "grad_norm": 0.4713875353336334,
436
+ "learning_rate": 0.00011187457627118644,
437
+ "loss": 0.0159,
438
+ "step": 19000
439
+ },
440
+ {
441
+ "epoch": 63.123128119800334,
442
+ "eval_loss": 0.20965830981731415,
443
+ "eval_runtime": 5.2591,
444
+ "eval_samples_per_second": 19.205,
445
+ "eval_steps_per_second": 2.472,
446
+ "eval_wer": 0.24086021505376345,
447
+ "step": 19000
448
+ },
449
+ {
450
+ "epoch": 64.78535773710483,
451
+ "grad_norm": 0.07249698787927628,
452
+ "learning_rate": 0.00010678983050847457,
453
+ "loss": 0.0162,
454
+ "step": 19500
455
+ },
456
+ {
457
+ "epoch": 66.4459234608985,
458
+ "grad_norm": 0.0907130241394043,
459
+ "learning_rate": 0.00010170508474576271,
460
+ "loss": 0.015,
461
+ "step": 20000
462
+ },
463
+ {
464
+ "epoch": 66.4459234608985,
465
+ "eval_loss": 0.23673121631145477,
466
+ "eval_runtime": 5.2393,
467
+ "eval_samples_per_second": 19.277,
468
+ "eval_steps_per_second": 2.481,
469
+ "eval_wer": 0.25053763440860216,
470
+ "step": 20000
471
+ },
472
+ {
473
+ "epoch": 68.10648918469218,
474
+ "grad_norm": 0.05233411118388176,
475
+ "learning_rate": 9.662033898305084e-05,
476
+ "loss": 0.0144,
477
+ "step": 20500
478
+ },
479
+ {
480
+ "epoch": 69.76871880199667,
481
+ "grad_norm": 0.10925977677106857,
482
+ "learning_rate": 9.153559322033896e-05,
483
+ "loss": 0.015,
484
+ "step": 21000
485
+ },
486
+ {
487
+ "epoch": 69.76871880199667,
488
+ "eval_loss": 0.24856378138065338,
489
+ "eval_runtime": 5.3437,
490
+ "eval_samples_per_second": 18.901,
491
+ "eval_steps_per_second": 2.433,
492
+ "eval_wer": 0.25268817204301075,
493
+ "step": 21000
494
+ },
495
+ {
496
+ "epoch": 71.42928452579035,
497
+ "grad_norm": 0.267115980386734,
498
+ "learning_rate": 8.64508474576271e-05,
499
+ "loss": 0.0117,
500
+ "step": 21500
501
+ },
502
+ {
503
+ "epoch": 73.08985024958403,
504
+ "grad_norm": 0.4500684440135956,
505
+ "learning_rate": 8.136610169491526e-05,
506
+ "loss": 0.0122,
507
+ "step": 22000
508
+ },
509
+ {
510
+ "epoch": 73.08985024958403,
511
+ "eval_loss": 0.24751359224319458,
512
+ "eval_runtime": 5.2943,
513
+ "eval_samples_per_second": 19.077,
514
+ "eval_steps_per_second": 2.455,
515
+ "eval_wer": 0.25268817204301075,
516
+ "step": 22000
517
+ },
518
+ {
519
+ "epoch": 74.75207986688852,
520
+ "grad_norm": 1.1715344190597534,
521
+ "learning_rate": 7.628135593220339e-05,
522
+ "loss": 0.0119,
523
+ "step": 22500
524
+ },
525
+ {
526
+ "epoch": 76.4126455906822,
527
+ "grad_norm": 0.22268928587436676,
528
+ "learning_rate": 7.119661016949153e-05,
529
+ "loss": 0.0104,
530
+ "step": 23000
531
+ },
532
+ {
533
+ "epoch": 76.4126455906822,
534
+ "eval_loss": 0.23766544461250305,
535
+ "eval_runtime": 5.275,
536
+ "eval_samples_per_second": 19.147,
537
+ "eval_steps_per_second": 2.464,
538
+ "eval_wer": 0.23440860215053763,
539
+ "step": 23000
540
+ },
541
+ {
542
+ "epoch": 78.07321131447587,
543
+ "grad_norm": 0.3640448749065399,
544
+ "learning_rate": 6.611186440677965e-05,
545
+ "loss": 0.0097,
546
+ "step": 23500
547
+ },
548
+ {
549
+ "epoch": 79.73544093178036,
550
+ "grad_norm": 0.13920682668685913,
551
+ "learning_rate": 6.102711864406779e-05,
552
+ "loss": 0.008,
553
+ "step": 24000
554
+ },
555
+ {
556
+ "epoch": 79.73544093178036,
557
+ "eval_loss": 0.23628441989421844,
558
+ "eval_runtime": 5.3789,
559
+ "eval_samples_per_second": 18.777,
560
+ "eval_steps_per_second": 2.417,
561
+ "eval_wer": 0.24408602150537634,
562
+ "step": 24000
563
+ },
564
+ {
565
+ "epoch": 81.39600665557404,
566
+ "grad_norm": 0.051646001636981964,
567
+ "learning_rate": 5.594237288135593e-05,
568
+ "loss": 0.0082,
569
+ "step": 24500
570
+ },
571
+ {
572
+ "epoch": 83.05657237936772,
573
+ "grad_norm": 0.034305017441511154,
574
+ "learning_rate": 5.085762711864406e-05,
575
+ "loss": 0.0081,
576
+ "step": 25000
577
+ },
578
+ {
579
+ "epoch": 83.05657237936772,
580
+ "eval_loss": 0.23471036553382874,
581
+ "eval_runtime": 5.3686,
582
+ "eval_samples_per_second": 18.813,
583
+ "eval_steps_per_second": 2.422,
584
+ "eval_wer": 0.23333333333333334,
585
+ "step": 25000
586
+ },
587
+ {
588
+ "epoch": 84.71880199667221,
589
+ "grad_norm": 0.006502960808575153,
590
+ "learning_rate": 4.57728813559322e-05,
591
+ "loss": 0.0057,
592
+ "step": 25500
593
+ },
594
+ {
595
+ "epoch": 86.3793677204659,
596
+ "grad_norm": 0.5881304144859314,
597
+ "learning_rate": 4.0688135593220334e-05,
598
+ "loss": 0.0072,
599
+ "step": 26000
600
+ },
601
+ {
602
+ "epoch": 86.3793677204659,
603
+ "eval_loss": 0.22321127355098724,
604
+ "eval_runtime": 5.3519,
605
+ "eval_samples_per_second": 18.872,
606
+ "eval_steps_per_second": 2.429,
607
+ "eval_wer": 0.22903225806451613,
608
+ "step": 26000
609
+ },
610
+ {
611
+ "epoch": 88.03993344425957,
612
+ "grad_norm": 0.2054450660943985,
613
+ "learning_rate": 3.560338983050847e-05,
614
+ "loss": 0.0063,
615
+ "step": 26500
616
+ },
617
+ {
618
+ "epoch": 89.70216306156406,
619
+ "grad_norm": 0.25417467951774597,
620
+ "learning_rate": 3.051864406779661e-05,
621
+ "loss": 0.0064,
622
+ "step": 27000
623
+ },
624
+ {
625
+ "epoch": 89.70216306156406,
626
+ "eval_loss": 0.22117015719413757,
627
+ "eval_runtime": 5.3469,
628
+ "eval_samples_per_second": 18.89,
629
+ "eval_steps_per_second": 2.431,
630
+ "eval_wer": 0.22795698924731184,
631
+ "step": 27000
632
+ },
633
+ {
634
+ "epoch": 91.36272878535773,
635
+ "grad_norm": 0.26413634419441223,
636
+ "learning_rate": 2.5433898305084745e-05,
637
+ "loss": 0.0052,
638
+ "step": 27500
639
+ },
640
+ {
641
+ "epoch": 93.02329450915141,
642
+ "grad_norm": 0.06111468747258186,
643
+ "learning_rate": 2.034915254237288e-05,
644
+ "loss": 0.0044,
645
+ "step": 28000
646
+ },
647
+ {
648
+ "epoch": 93.02329450915141,
649
+ "eval_loss": 0.22874999046325684,
650
+ "eval_runtime": 5.2229,
651
+ "eval_samples_per_second": 19.338,
652
+ "eval_steps_per_second": 2.489,
653
+ "eval_wer": 0.22580645161290322,
654
+ "step": 28000
655
+ },
656
+ {
657
+ "epoch": 94.6855241264559,
658
+ "grad_norm": 0.4941785931587219,
659
+ "learning_rate": 1.5264406779661016e-05,
660
+ "loss": 0.0041,
661
+ "step": 28500
662
+ },
663
+ {
664
+ "epoch": 96.34608985024958,
665
+ "grad_norm": 0.19530624151229858,
666
+ "learning_rate": 1.0179661016949151e-05,
667
+ "loss": 0.004,
668
+ "step": 29000
669
+ },
670
+ {
671
+ "epoch": 96.34608985024958,
672
+ "eval_loss": 0.22946567833423615,
673
+ "eval_runtime": 5.3077,
674
+ "eval_samples_per_second": 19.029,
675
+ "eval_steps_per_second": 2.449,
676
+ "eval_wer": 0.23440860215053763,
677
+ "step": 29000
678
+ },
679
+ {
680
+ "epoch": 98.00665557404326,
681
+ "grad_norm": 0.19543957710266113,
682
+ "learning_rate": 5.094915254237288e-06,
683
+ "loss": 0.0042,
684
+ "step": 29500
685
+ },
686
+ {
687
+ "epoch": 99.66888519134775,
688
+ "grad_norm": 0.6488747000694275,
689
+ "learning_rate": 1.0169491525423728e-08,
690
+ "loss": 0.0037,
691
+ "step": 30000
692
+ },
693
+ {
694
+ "epoch": 99.66888519134775,
695
+ "eval_loss": 0.22431735694408417,
696
+ "eval_runtime": 5.3048,
697
+ "eval_samples_per_second": 19.039,
698
+ "eval_steps_per_second": 2.451,
699
+ "eval_wer": 0.22043010752688172,
700
+ "step": 30000
701
+ },
702
+ {
703
+ "epoch": 99.66888519134775,
704
+ "step": 30000,
705
+ "total_flos": 2.0486046325976072e+19,
706
+ "train_loss": 0.08741244434913,
707
+ "train_runtime": 29833.4209,
708
+ "train_samples_per_second": 16.089,
709
+ "train_steps_per_second": 1.006
710
  }
711
  ],
712
  "logging_steps": 500,
713
+ "max_steps": 30000,
714
  "num_input_tokens_seen": 0,
715
+ "num_train_epochs": 100,
716
  "save_steps": 1000,
717
  "stateful_callbacks": {
718
  "TrainerControl": {
 
726
  "attributes": {}
727
  }
728
  },
729
+ "total_flos": 2.0486046325976072e+19,
730
  "train_batch_size": 8,
731
  "trial_name": null,
732
  "trial_params": null