musicgen-small-lora-minmaj-chords / trainer_state.json
luizapzbn's picture
End of training
2ac6358 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 2484,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004830917874396135,
"grad_norm": null,
"learning_rate": 0.0002,
"loss": 77.3398,
"step": 2
},
{
"epoch": 0.00966183574879227,
"grad_norm": 46.88166809082031,
"learning_rate": 0.0001998389694041868,
"loss": 74.1949,
"step": 4
},
{
"epoch": 0.014492753623188406,
"grad_norm": 57.10332489013672,
"learning_rate": 0.0001996779388083736,
"loss": 64.1131,
"step": 6
},
{
"epoch": 0.01932367149758454,
"grad_norm": 32.98695373535156,
"learning_rate": 0.0001995169082125604,
"loss": 50.8557,
"step": 8
},
{
"epoch": 0.024154589371980676,
"grad_norm": 42.64508056640625,
"learning_rate": 0.0001993558776167472,
"loss": 46.3401,
"step": 10
},
{
"epoch": 0.028985507246376812,
"grad_norm": 23.093393325805664,
"learning_rate": 0.00019919484702093397,
"loss": 42.264,
"step": 12
},
{
"epoch": 0.033816425120772944,
"grad_norm": 11.703313827514648,
"learning_rate": 0.00019903381642512078,
"loss": 40.6424,
"step": 14
},
{
"epoch": 0.03864734299516908,
"grad_norm": 12.010417938232422,
"learning_rate": 0.00019887278582930758,
"loss": 41.0451,
"step": 16
},
{
"epoch": 0.043478260869565216,
"grad_norm": 8.227810859680176,
"learning_rate": 0.00019871175523349436,
"loss": 40.8654,
"step": 18
},
{
"epoch": 0.04830917874396135,
"grad_norm": 8.531281471252441,
"learning_rate": 0.00019855072463768116,
"loss": 39.9403,
"step": 20
},
{
"epoch": 0.05314009661835749,
"grad_norm": 13.759291648864746,
"learning_rate": 0.00019838969404186796,
"loss": 40.4885,
"step": 22
},
{
"epoch": 0.057971014492753624,
"grad_norm": 12.24333381652832,
"learning_rate": 0.00019822866344605474,
"loss": 38.9464,
"step": 24
},
{
"epoch": 0.06280193236714976,
"grad_norm": 9.252888679504395,
"learning_rate": 0.00019806763285024154,
"loss": 37.4614,
"step": 26
},
{
"epoch": 0.06763285024154589,
"grad_norm": 12.859115600585938,
"learning_rate": 0.00019790660225442835,
"loss": 40.04,
"step": 28
},
{
"epoch": 0.07246376811594203,
"grad_norm": 8.288698196411133,
"learning_rate": 0.00019774557165861512,
"loss": 39.2962,
"step": 30
},
{
"epoch": 0.07729468599033816,
"grad_norm": 11.058815956115723,
"learning_rate": 0.00019758454106280193,
"loss": 38.5443,
"step": 32
},
{
"epoch": 0.0821256038647343,
"grad_norm": 16.540409088134766,
"learning_rate": 0.00019742351046698876,
"loss": 40.7756,
"step": 34
},
{
"epoch": 0.08695652173913043,
"grad_norm": 7.264046669006348,
"learning_rate": 0.00019726247987117553,
"loss": 40.0833,
"step": 36
},
{
"epoch": 0.09178743961352658,
"grad_norm": 7.761327743530273,
"learning_rate": 0.00019710144927536234,
"loss": 36.8212,
"step": 38
},
{
"epoch": 0.0966183574879227,
"grad_norm": 12.5891695022583,
"learning_rate": 0.00019694041867954914,
"loss": 37.6787,
"step": 40
},
{
"epoch": 0.10144927536231885,
"grad_norm": 8.316587448120117,
"learning_rate": 0.00019677938808373592,
"loss": 38.399,
"step": 42
},
{
"epoch": 0.10628019323671498,
"grad_norm": 6.445375442504883,
"learning_rate": 0.00019661835748792272,
"loss": 37.4109,
"step": 44
},
{
"epoch": 0.1111111111111111,
"grad_norm": 10.068939208984375,
"learning_rate": 0.00019645732689210952,
"loss": 36.8106,
"step": 46
},
{
"epoch": 0.11594202898550725,
"grad_norm": 10.530860900878906,
"learning_rate": 0.0001962962962962963,
"loss": 37.1642,
"step": 48
},
{
"epoch": 0.12077294685990338,
"grad_norm": 7.902243614196777,
"learning_rate": 0.0001961352657004831,
"loss": 39.0534,
"step": 50
},
{
"epoch": 0.12560386473429952,
"grad_norm": 7.551494598388672,
"learning_rate": 0.0001959742351046699,
"loss": 39.5417,
"step": 52
},
{
"epoch": 0.13043478260869565,
"grad_norm": 10.609959602355957,
"learning_rate": 0.00019581320450885668,
"loss": 38.9163,
"step": 54
},
{
"epoch": 0.13526570048309178,
"grad_norm": 11.915197372436523,
"learning_rate": 0.0001956521739130435,
"loss": 37.7346,
"step": 56
},
{
"epoch": 0.14009661835748793,
"grad_norm": 9.105422973632812,
"learning_rate": 0.0001954911433172303,
"loss": 37.7469,
"step": 58
},
{
"epoch": 0.14492753623188406,
"grad_norm": 10.62623119354248,
"learning_rate": 0.00019533011272141707,
"loss": 37.0263,
"step": 60
},
{
"epoch": 0.1497584541062802,
"grad_norm": 9.892080307006836,
"learning_rate": 0.00019516908212560387,
"loss": 39.5135,
"step": 62
},
{
"epoch": 0.15458937198067632,
"grad_norm": 12.9131441116333,
"learning_rate": 0.00019500805152979068,
"loss": 35.7463,
"step": 64
},
{
"epoch": 0.15942028985507245,
"grad_norm": 9.631657600402832,
"learning_rate": 0.00019484702093397745,
"loss": 36.228,
"step": 66
},
{
"epoch": 0.1642512077294686,
"grad_norm": 9.47088623046875,
"learning_rate": 0.00019468599033816426,
"loss": 38.1165,
"step": 68
},
{
"epoch": 0.16908212560386474,
"grad_norm": 10.864086151123047,
"learning_rate": 0.00019452495974235106,
"loss": 37.2582,
"step": 70
},
{
"epoch": 0.17391304347826086,
"grad_norm": 11.572696685791016,
"learning_rate": 0.00019436392914653784,
"loss": 36.3281,
"step": 72
},
{
"epoch": 0.178743961352657,
"grad_norm": 8.3622407913208,
"learning_rate": 0.00019420289855072464,
"loss": 37.2185,
"step": 74
},
{
"epoch": 0.18357487922705315,
"grad_norm": 10.799039840698242,
"learning_rate": 0.00019404186795491144,
"loss": 37.1632,
"step": 76
},
{
"epoch": 0.18840579710144928,
"grad_norm": 10.033112525939941,
"learning_rate": 0.00019388083735909825,
"loss": 36.5457,
"step": 78
},
{
"epoch": 0.1932367149758454,
"grad_norm": 14.008647918701172,
"learning_rate": 0.00019371980676328502,
"loss": 37.3024,
"step": 80
},
{
"epoch": 0.19806763285024154,
"grad_norm": 16.30805778503418,
"learning_rate": 0.00019355877616747183,
"loss": 33.9588,
"step": 82
},
{
"epoch": 0.2028985507246377,
"grad_norm": 10.949873924255371,
"learning_rate": 0.00019339774557165863,
"loss": 34.5954,
"step": 84
},
{
"epoch": 0.20772946859903382,
"grad_norm": 13.2377290725708,
"learning_rate": 0.0001932367149758454,
"loss": 35.3164,
"step": 86
},
{
"epoch": 0.21256038647342995,
"grad_norm": 16.080217361450195,
"learning_rate": 0.0001930756843800322,
"loss": 36.9439,
"step": 88
},
{
"epoch": 0.21739130434782608,
"grad_norm": 12.830262184143066,
"learning_rate": 0.000192914653784219,
"loss": 35.9063,
"step": 90
},
{
"epoch": 0.2222222222222222,
"grad_norm": 12.519986152648926,
"learning_rate": 0.0001927536231884058,
"loss": 36.0092,
"step": 92
},
{
"epoch": 0.22705314009661837,
"grad_norm": 11.222923278808594,
"learning_rate": 0.0001925925925925926,
"loss": 35.2874,
"step": 94
},
{
"epoch": 0.2318840579710145,
"grad_norm": 13.27009105682373,
"learning_rate": 0.0001924315619967794,
"loss": 33.1713,
"step": 96
},
{
"epoch": 0.23671497584541062,
"grad_norm": 10.449563026428223,
"learning_rate": 0.0001922705314009662,
"loss": 34.836,
"step": 98
},
{
"epoch": 0.24154589371980675,
"grad_norm": 17.162439346313477,
"learning_rate": 0.000192109500805153,
"loss": 33.4152,
"step": 100
},
{
"epoch": 0.2463768115942029,
"grad_norm": 11.21731185913086,
"learning_rate": 0.00019194847020933978,
"loss": 33.7839,
"step": 102
},
{
"epoch": 0.25120772946859904,
"grad_norm": 12.32532024383545,
"learning_rate": 0.00019178743961352658,
"loss": 34.1485,
"step": 104
},
{
"epoch": 0.2560386473429952,
"grad_norm": 10.481746673583984,
"learning_rate": 0.0001916264090177134,
"loss": 32.3468,
"step": 106
},
{
"epoch": 0.2608695652173913,
"grad_norm": 10.69057846069336,
"learning_rate": 0.00019146537842190016,
"loss": 33.281,
"step": 108
},
{
"epoch": 0.26570048309178745,
"grad_norm": 14.237508773803711,
"learning_rate": 0.00019130434782608697,
"loss": 33.0036,
"step": 110
},
{
"epoch": 0.27053140096618356,
"grad_norm": 8.754230499267578,
"learning_rate": 0.00019114331723027377,
"loss": 34.4538,
"step": 112
},
{
"epoch": 0.2753623188405797,
"grad_norm": 8.595375061035156,
"learning_rate": 0.00019098228663446057,
"loss": 32.7405,
"step": 114
},
{
"epoch": 0.28019323671497587,
"grad_norm": 10.565451622009277,
"learning_rate": 0.00019082125603864735,
"loss": 32.7687,
"step": 116
},
{
"epoch": 0.28502415458937197,
"grad_norm": 9.513022422790527,
"learning_rate": 0.00019066022544283415,
"loss": 32.8597,
"step": 118
},
{
"epoch": 0.2898550724637681,
"grad_norm": 12.073749542236328,
"learning_rate": 0.00019049919484702096,
"loss": 32.9837,
"step": 120
},
{
"epoch": 0.2946859903381642,
"grad_norm": 9.835869789123535,
"learning_rate": 0.00019033816425120773,
"loss": 33.7884,
"step": 122
},
{
"epoch": 0.2995169082125604,
"grad_norm": 9.995708465576172,
"learning_rate": 0.00019017713365539454,
"loss": 33.7684,
"step": 124
},
{
"epoch": 0.30434782608695654,
"grad_norm": 9.067010879516602,
"learning_rate": 0.00019001610305958134,
"loss": 30.8072,
"step": 126
},
{
"epoch": 0.30917874396135264,
"grad_norm": 9.235272407531738,
"learning_rate": 0.00018985507246376812,
"loss": 32.0888,
"step": 128
},
{
"epoch": 0.3140096618357488,
"grad_norm": 9.046205520629883,
"learning_rate": 0.00018969404186795492,
"loss": 33.0407,
"step": 130
},
{
"epoch": 0.3188405797101449,
"grad_norm": 9.927671432495117,
"learning_rate": 0.00018953301127214172,
"loss": 32.0351,
"step": 132
},
{
"epoch": 0.32367149758454106,
"grad_norm": 10.035076141357422,
"learning_rate": 0.0001893719806763285,
"loss": 32.5972,
"step": 134
},
{
"epoch": 0.3285024154589372,
"grad_norm": 10.489717483520508,
"learning_rate": 0.0001892109500805153,
"loss": 31.3804,
"step": 136
},
{
"epoch": 0.3333333333333333,
"grad_norm": 13.48115348815918,
"learning_rate": 0.0001890499194847021,
"loss": 32.5356,
"step": 138
},
{
"epoch": 0.33816425120772947,
"grad_norm": 8.694147109985352,
"learning_rate": 0.00018888888888888888,
"loss": 32.9306,
"step": 140
},
{
"epoch": 0.34299516908212563,
"grad_norm": 8.273658752441406,
"learning_rate": 0.0001887278582930757,
"loss": 32.2116,
"step": 142
},
{
"epoch": 0.34782608695652173,
"grad_norm": 10.635282516479492,
"learning_rate": 0.0001885668276972625,
"loss": 30.2346,
"step": 144
},
{
"epoch": 0.3526570048309179,
"grad_norm": 9.83012866973877,
"learning_rate": 0.00018840579710144927,
"loss": 32.6259,
"step": 146
},
{
"epoch": 0.357487922705314,
"grad_norm": 12.415063858032227,
"learning_rate": 0.00018824476650563607,
"loss": 32.0993,
"step": 148
},
{
"epoch": 0.36231884057971014,
"grad_norm": 11.103983879089355,
"learning_rate": 0.00018808373590982287,
"loss": 31.7034,
"step": 150
},
{
"epoch": 0.3671497584541063,
"grad_norm": 15.64197826385498,
"learning_rate": 0.00018792270531400965,
"loss": 31.6066,
"step": 152
},
{
"epoch": 0.3719806763285024,
"grad_norm": 10.493355751037598,
"learning_rate": 0.00018776167471819645,
"loss": 29.1212,
"step": 154
},
{
"epoch": 0.37681159420289856,
"grad_norm": 9.921483993530273,
"learning_rate": 0.00018760064412238326,
"loss": 31.0883,
"step": 156
},
{
"epoch": 0.38164251207729466,
"grad_norm": 9.639843940734863,
"learning_rate": 0.00018743961352657006,
"loss": 29.4677,
"step": 158
},
{
"epoch": 0.3864734299516908,
"grad_norm": 13.891378402709961,
"learning_rate": 0.00018727858293075687,
"loss": 31.5877,
"step": 160
},
{
"epoch": 0.391304347826087,
"grad_norm": 10.116133689880371,
"learning_rate": 0.00018711755233494367,
"loss": 29.9048,
"step": 162
},
{
"epoch": 0.3961352657004831,
"grad_norm": 7.1683173179626465,
"learning_rate": 0.00018695652173913045,
"loss": 30.6905,
"step": 164
},
{
"epoch": 0.40096618357487923,
"grad_norm": 11.81785774230957,
"learning_rate": 0.00018679549114331725,
"loss": 32.2774,
"step": 166
},
{
"epoch": 0.4057971014492754,
"grad_norm": 7.2079925537109375,
"learning_rate": 0.00018663446054750405,
"loss": 31.3603,
"step": 168
},
{
"epoch": 0.4106280193236715,
"grad_norm": 10.22714900970459,
"learning_rate": 0.00018647342995169083,
"loss": 29.7549,
"step": 170
},
{
"epoch": 0.41545893719806765,
"grad_norm": 8.35627269744873,
"learning_rate": 0.00018631239935587763,
"loss": 31.7175,
"step": 172
},
{
"epoch": 0.42028985507246375,
"grad_norm": 8.98567008972168,
"learning_rate": 0.00018615136876006444,
"loss": 31.4399,
"step": 174
},
{
"epoch": 0.4251207729468599,
"grad_norm": 10.814435958862305,
"learning_rate": 0.0001859903381642512,
"loss": 30.4269,
"step": 176
},
{
"epoch": 0.42995169082125606,
"grad_norm": 9.445025444030762,
"learning_rate": 0.00018582930756843802,
"loss": 29.7986,
"step": 178
},
{
"epoch": 0.43478260869565216,
"grad_norm": 11.667193412780762,
"learning_rate": 0.00018566827697262482,
"loss": 28.7911,
"step": 180
},
{
"epoch": 0.4396135265700483,
"grad_norm": 8.154279708862305,
"learning_rate": 0.0001855072463768116,
"loss": 32.8533,
"step": 182
},
{
"epoch": 0.4444444444444444,
"grad_norm": 9.40849781036377,
"learning_rate": 0.0001853462157809984,
"loss": 30.3062,
"step": 184
},
{
"epoch": 0.4492753623188406,
"grad_norm": 9.476842880249023,
"learning_rate": 0.0001851851851851852,
"loss": 29.9025,
"step": 186
},
{
"epoch": 0.45410628019323673,
"grad_norm": 9.150154113769531,
"learning_rate": 0.00018502415458937198,
"loss": 31.5757,
"step": 188
},
{
"epoch": 0.45893719806763283,
"grad_norm": 8.072809219360352,
"learning_rate": 0.00018486312399355878,
"loss": 32.186,
"step": 190
},
{
"epoch": 0.463768115942029,
"grad_norm": 11.970826148986816,
"learning_rate": 0.0001847020933977456,
"loss": 29.3784,
"step": 192
},
{
"epoch": 0.46859903381642515,
"grad_norm": 11.011039733886719,
"learning_rate": 0.00018454106280193236,
"loss": 28.1951,
"step": 194
},
{
"epoch": 0.47342995169082125,
"grad_norm": 10.958206176757812,
"learning_rate": 0.00018438003220611917,
"loss": 31.0484,
"step": 196
},
{
"epoch": 0.4782608695652174,
"grad_norm": 9.812915802001953,
"learning_rate": 0.00018421900161030597,
"loss": 31.784,
"step": 198
},
{
"epoch": 0.4830917874396135,
"grad_norm": 11.235363960266113,
"learning_rate": 0.00018405797101449275,
"loss": 31.3269,
"step": 200
},
{
"epoch": 0.48792270531400966,
"grad_norm": 14.048873901367188,
"learning_rate": 0.00018389694041867955,
"loss": 31.0424,
"step": 202
},
{
"epoch": 0.4927536231884058,
"grad_norm": 9.81869125366211,
"learning_rate": 0.00018373590982286635,
"loss": 32.6147,
"step": 204
},
{
"epoch": 0.4975845410628019,
"grad_norm": 8.801289558410645,
"learning_rate": 0.00018357487922705313,
"loss": 30.4718,
"step": 206
},
{
"epoch": 0.5024154589371981,
"grad_norm": 7.1190385818481445,
"learning_rate": 0.00018341384863123993,
"loss": 29.2369,
"step": 208
},
{
"epoch": 0.5072463768115942,
"grad_norm": 8.437512397766113,
"learning_rate": 0.00018325281803542674,
"loss": 30.7868,
"step": 210
},
{
"epoch": 0.5120772946859904,
"grad_norm": 6.539140224456787,
"learning_rate": 0.0001830917874396135,
"loss": 29.9767,
"step": 212
},
{
"epoch": 0.5169082125603864,
"grad_norm": 9.160558700561523,
"learning_rate": 0.00018293075684380032,
"loss": 29.9602,
"step": 214
},
{
"epoch": 0.5217391304347826,
"grad_norm": 7.8765177726745605,
"learning_rate": 0.00018276972624798712,
"loss": 30.8873,
"step": 216
},
{
"epoch": 0.5265700483091788,
"grad_norm": 8.778061866760254,
"learning_rate": 0.00018260869565217392,
"loss": 30.1888,
"step": 218
},
{
"epoch": 0.5314009661835749,
"grad_norm": 8.268914222717285,
"learning_rate": 0.00018244766505636073,
"loss": 31.7208,
"step": 220
},
{
"epoch": 0.5362318840579711,
"grad_norm": 8.659038543701172,
"learning_rate": 0.00018228663446054753,
"loss": 31.3643,
"step": 222
},
{
"epoch": 0.5410628019323671,
"grad_norm": 9.013368606567383,
"learning_rate": 0.0001821256038647343,
"loss": 30.1504,
"step": 224
},
{
"epoch": 0.5458937198067633,
"grad_norm": 9.309354782104492,
"learning_rate": 0.0001819645732689211,
"loss": 30.2107,
"step": 226
},
{
"epoch": 0.5507246376811594,
"grad_norm": 7.953092098236084,
"learning_rate": 0.00018180354267310791,
"loss": 29.3963,
"step": 228
},
{
"epoch": 0.5555555555555556,
"grad_norm": 9.035888671875,
"learning_rate": 0.0001816425120772947,
"loss": 31.3732,
"step": 230
},
{
"epoch": 0.5603864734299517,
"grad_norm": 10.098958969116211,
"learning_rate": 0.0001814814814814815,
"loss": 30.0866,
"step": 232
},
{
"epoch": 0.5652173913043478,
"grad_norm": 9.308027267456055,
"learning_rate": 0.0001813204508856683,
"loss": 31.168,
"step": 234
},
{
"epoch": 0.5700483091787439,
"grad_norm": 10.684345245361328,
"learning_rate": 0.00018115942028985507,
"loss": 30.3999,
"step": 236
},
{
"epoch": 0.5748792270531401,
"grad_norm": 8.09032917022705,
"learning_rate": 0.00018099838969404188,
"loss": 31.6167,
"step": 238
},
{
"epoch": 0.5797101449275363,
"grad_norm": 7.366332530975342,
"learning_rate": 0.00018083735909822868,
"loss": 30.2593,
"step": 240
},
{
"epoch": 0.5845410628019324,
"grad_norm": 7.711369514465332,
"learning_rate": 0.00018067632850241546,
"loss": 28.9501,
"step": 242
},
{
"epoch": 0.5893719806763285,
"grad_norm": 7.934360504150391,
"learning_rate": 0.00018051529790660226,
"loss": 28.5365,
"step": 244
},
{
"epoch": 0.5942028985507246,
"grad_norm": 8.121601104736328,
"learning_rate": 0.00018035426731078907,
"loss": 29.2618,
"step": 246
},
{
"epoch": 0.5990338164251208,
"grad_norm": 7.918673038482666,
"learning_rate": 0.00018019323671497584,
"loss": 30.0373,
"step": 248
},
{
"epoch": 0.6038647342995169,
"grad_norm": 11.193553924560547,
"learning_rate": 0.00018003220611916265,
"loss": 31.3798,
"step": 250
},
{
"epoch": 0.6086956521739131,
"grad_norm": 9.393643379211426,
"learning_rate": 0.00017987117552334945,
"loss": 31.2223,
"step": 252
},
{
"epoch": 0.6135265700483091,
"grad_norm": 8.13814926147461,
"learning_rate": 0.00017971014492753625,
"loss": 30.5097,
"step": 254
},
{
"epoch": 0.6183574879227053,
"grad_norm": 8.290206909179688,
"learning_rate": 0.00017954911433172303,
"loss": 27.856,
"step": 256
},
{
"epoch": 0.6231884057971014,
"grad_norm": 9.917459487915039,
"learning_rate": 0.00017938808373590983,
"loss": 28.664,
"step": 258
},
{
"epoch": 0.6280193236714976,
"grad_norm": 10.206878662109375,
"learning_rate": 0.00017922705314009664,
"loss": 31.3406,
"step": 260
},
{
"epoch": 0.6328502415458938,
"grad_norm": 9.776812553405762,
"learning_rate": 0.0001790660225442834,
"loss": 30.5843,
"step": 262
},
{
"epoch": 0.6376811594202898,
"grad_norm": 10.508336067199707,
"learning_rate": 0.00017890499194847022,
"loss": 30.3617,
"step": 264
},
{
"epoch": 0.642512077294686,
"grad_norm": 9.144083976745605,
"learning_rate": 0.00017874396135265702,
"loss": 30.0566,
"step": 266
},
{
"epoch": 0.6473429951690821,
"grad_norm": 9.019740104675293,
"learning_rate": 0.0001785829307568438,
"loss": 30.1384,
"step": 268
},
{
"epoch": 0.6521739130434783,
"grad_norm": 9.140926361083984,
"learning_rate": 0.0001784219001610306,
"loss": 28.6601,
"step": 270
},
{
"epoch": 0.6570048309178744,
"grad_norm": 9.820598602294922,
"learning_rate": 0.0001782608695652174,
"loss": 30.1565,
"step": 272
},
{
"epoch": 0.6618357487922706,
"grad_norm": 9.670087814331055,
"learning_rate": 0.00017809983896940418,
"loss": 30.156,
"step": 274
},
{
"epoch": 0.6666666666666666,
"grad_norm": 8.119627952575684,
"learning_rate": 0.00017793880837359098,
"loss": 29.5374,
"step": 276
},
{
"epoch": 0.6714975845410628,
"grad_norm": 8.52702522277832,
"learning_rate": 0.00017777777777777779,
"loss": 30.2013,
"step": 278
},
{
"epoch": 0.6763285024154589,
"grad_norm": 8.241043090820312,
"learning_rate": 0.00017761674718196456,
"loss": 29.2284,
"step": 280
},
{
"epoch": 0.6811594202898551,
"grad_norm": 9.305002212524414,
"learning_rate": 0.0001774557165861514,
"loss": 30.8417,
"step": 282
},
{
"epoch": 0.6859903381642513,
"grad_norm": 8.483264923095703,
"learning_rate": 0.00017729468599033817,
"loss": 28.3121,
"step": 284
},
{
"epoch": 0.6908212560386473,
"grad_norm": 8.674230575561523,
"learning_rate": 0.00017713365539452497,
"loss": 30.4354,
"step": 286
},
{
"epoch": 0.6956521739130435,
"grad_norm": 8.816984176635742,
"learning_rate": 0.00017697262479871178,
"loss": 29.0581,
"step": 288
},
{
"epoch": 0.7004830917874396,
"grad_norm": 8.081759452819824,
"learning_rate": 0.00017681159420289858,
"loss": 31.0066,
"step": 290
},
{
"epoch": 0.7053140096618358,
"grad_norm": 10.987712860107422,
"learning_rate": 0.00017665056360708536,
"loss": 28.4036,
"step": 292
},
{
"epoch": 0.7101449275362319,
"grad_norm": 9.358428955078125,
"learning_rate": 0.00017648953301127216,
"loss": 31.733,
"step": 294
},
{
"epoch": 0.714975845410628,
"grad_norm": 9.714231491088867,
"learning_rate": 0.00017632850241545896,
"loss": 28.3295,
"step": 296
},
{
"epoch": 0.7198067632850241,
"grad_norm": 10.079188346862793,
"learning_rate": 0.00017616747181964574,
"loss": 29.7857,
"step": 298
},
{
"epoch": 0.7246376811594203,
"grad_norm": 10.379854202270508,
"learning_rate": 0.00017600644122383254,
"loss": 28.7091,
"step": 300
},
{
"epoch": 0.7294685990338164,
"grad_norm": 9.6157808303833,
"learning_rate": 0.00017584541062801935,
"loss": 30.0664,
"step": 302
},
{
"epoch": 0.7342995169082126,
"grad_norm": 9.851590156555176,
"learning_rate": 0.00017568438003220612,
"loss": 30.5656,
"step": 304
},
{
"epoch": 0.7391304347826086,
"grad_norm": 9.500916481018066,
"learning_rate": 0.00017552334943639293,
"loss": 28.8709,
"step": 306
},
{
"epoch": 0.7439613526570048,
"grad_norm": 9.999371528625488,
"learning_rate": 0.00017536231884057973,
"loss": 29.5298,
"step": 308
},
{
"epoch": 0.748792270531401,
"grad_norm": 8.5446195602417,
"learning_rate": 0.0001752012882447665,
"loss": 29.5134,
"step": 310
},
{
"epoch": 0.7536231884057971,
"grad_norm": 9.369108200073242,
"learning_rate": 0.0001750402576489533,
"loss": 30.5918,
"step": 312
},
{
"epoch": 0.7584541062801933,
"grad_norm": 9.66053581237793,
"learning_rate": 0.00017487922705314011,
"loss": 29.5254,
"step": 314
},
{
"epoch": 0.7632850241545893,
"grad_norm": 9.699007034301758,
"learning_rate": 0.0001747181964573269,
"loss": 29.297,
"step": 316
},
{
"epoch": 0.7681159420289855,
"grad_norm": 6.751578330993652,
"learning_rate": 0.0001745571658615137,
"loss": 27.781,
"step": 318
},
{
"epoch": 0.7729468599033816,
"grad_norm": 8.00158977508545,
"learning_rate": 0.0001743961352657005,
"loss": 28.7844,
"step": 320
},
{
"epoch": 0.7777777777777778,
"grad_norm": 11.203788757324219,
"learning_rate": 0.00017423510466988727,
"loss": 29.4509,
"step": 322
},
{
"epoch": 0.782608695652174,
"grad_norm": 11.6134033203125,
"learning_rate": 0.00017407407407407408,
"loss": 28.4847,
"step": 324
},
{
"epoch": 0.7874396135265701,
"grad_norm": 8.184885025024414,
"learning_rate": 0.00017391304347826088,
"loss": 30.9662,
"step": 326
},
{
"epoch": 0.7922705314009661,
"grad_norm": 7.118366241455078,
"learning_rate": 0.00017375201288244766,
"loss": 30.3931,
"step": 328
},
{
"epoch": 0.7971014492753623,
"grad_norm": 8.378530502319336,
"learning_rate": 0.00017359098228663446,
"loss": 28.4438,
"step": 330
},
{
"epoch": 0.8019323671497585,
"grad_norm": 8.539013862609863,
"learning_rate": 0.00017342995169082126,
"loss": 30.293,
"step": 332
},
{
"epoch": 0.8067632850241546,
"grad_norm": 10.329437255859375,
"learning_rate": 0.00017326892109500804,
"loss": 28.9439,
"step": 334
},
{
"epoch": 0.8115942028985508,
"grad_norm": 7.267086982727051,
"learning_rate": 0.00017310789049919484,
"loss": 30.0624,
"step": 336
},
{
"epoch": 0.8164251207729468,
"grad_norm": 10.7781400680542,
"learning_rate": 0.00017294685990338165,
"loss": 30.338,
"step": 338
},
{
"epoch": 0.821256038647343,
"grad_norm": 11.077190399169922,
"learning_rate": 0.00017278582930756842,
"loss": 29.5852,
"step": 340
},
{
"epoch": 0.8260869565217391,
"grad_norm": 9.007209777832031,
"learning_rate": 0.00017262479871175523,
"loss": 29.4697,
"step": 342
},
{
"epoch": 0.8309178743961353,
"grad_norm": 8.706661224365234,
"learning_rate": 0.00017246376811594206,
"loss": 29.4373,
"step": 344
},
{
"epoch": 0.8357487922705314,
"grad_norm": 8.104077339172363,
"learning_rate": 0.00017230273752012884,
"loss": 31.1459,
"step": 346
},
{
"epoch": 0.8405797101449275,
"grad_norm": 8.499916076660156,
"learning_rate": 0.00017214170692431564,
"loss": 29.6039,
"step": 348
},
{
"epoch": 0.8454106280193237,
"grad_norm": 9.886308670043945,
"learning_rate": 0.00017198067632850244,
"loss": 28.4998,
"step": 350
},
{
"epoch": 0.8502415458937198,
"grad_norm": 6.680812835693359,
"learning_rate": 0.00017181964573268922,
"loss": 29.2374,
"step": 352
},
{
"epoch": 0.855072463768116,
"grad_norm": 7.037901401519775,
"learning_rate": 0.00017165861513687602,
"loss": 29.3442,
"step": 354
},
{
"epoch": 0.8599033816425121,
"grad_norm": 9.425200462341309,
"learning_rate": 0.00017149758454106283,
"loss": 28.2594,
"step": 356
},
{
"epoch": 0.8647342995169082,
"grad_norm": 10.08089828491211,
"learning_rate": 0.0001713365539452496,
"loss": 29.583,
"step": 358
},
{
"epoch": 0.8695652173913043,
"grad_norm": 8.83069133758545,
"learning_rate": 0.0001711755233494364,
"loss": 28.8106,
"step": 360
},
{
"epoch": 0.8743961352657005,
"grad_norm": 12.723852157592773,
"learning_rate": 0.0001710144927536232,
"loss": 28.6247,
"step": 362
},
{
"epoch": 0.8792270531400966,
"grad_norm": 7.244641304016113,
"learning_rate": 0.00017085346215780999,
"loss": 28.2318,
"step": 364
},
{
"epoch": 0.8840579710144928,
"grad_norm": 10.645294189453125,
"learning_rate": 0.0001706924315619968,
"loss": 30.3984,
"step": 366
},
{
"epoch": 0.8888888888888888,
"grad_norm": 8.675403594970703,
"learning_rate": 0.0001705314009661836,
"loss": 29.51,
"step": 368
},
{
"epoch": 0.893719806763285,
"grad_norm": 9.324760437011719,
"learning_rate": 0.00017037037037037037,
"loss": 29.173,
"step": 370
},
{
"epoch": 0.8985507246376812,
"grad_norm": 8.37873363494873,
"learning_rate": 0.00017020933977455717,
"loss": 28.2663,
"step": 372
},
{
"epoch": 0.9033816425120773,
"grad_norm": 7.841792583465576,
"learning_rate": 0.00017004830917874398,
"loss": 30.4364,
"step": 374
},
{
"epoch": 0.9082125603864735,
"grad_norm": 9.046091079711914,
"learning_rate": 0.00016988727858293075,
"loss": 30.9454,
"step": 376
},
{
"epoch": 0.9130434782608695,
"grad_norm": 8.812469482421875,
"learning_rate": 0.00016972624798711756,
"loss": 26.7434,
"step": 378
},
{
"epoch": 0.9178743961352657,
"grad_norm": 6.815216541290283,
"learning_rate": 0.00016956521739130436,
"loss": 30.5352,
"step": 380
},
{
"epoch": 0.9227053140096618,
"grad_norm": 9.451848983764648,
"learning_rate": 0.00016940418679549114,
"loss": 29.4241,
"step": 382
},
{
"epoch": 0.927536231884058,
"grad_norm": 8.97130298614502,
"learning_rate": 0.00016924315619967794,
"loss": 28.7862,
"step": 384
},
{
"epoch": 0.9323671497584541,
"grad_norm": 7.6972975730896,
"learning_rate": 0.00016908212560386474,
"loss": 29.5439,
"step": 386
},
{
"epoch": 0.9371980676328503,
"grad_norm": 7.955355167388916,
"learning_rate": 0.00016892109500805152,
"loss": 29.0917,
"step": 388
},
{
"epoch": 0.9420289855072463,
"grad_norm": 9.80173397064209,
"learning_rate": 0.00016876006441223832,
"loss": 26.3015,
"step": 390
},
{
"epoch": 0.9468599033816425,
"grad_norm": 9.457799911499023,
"learning_rate": 0.00016859903381642513,
"loss": 27.8037,
"step": 392
},
{
"epoch": 0.9516908212560387,
"grad_norm": 7.2435173988342285,
"learning_rate": 0.00016843800322061193,
"loss": 28.3055,
"step": 394
},
{
"epoch": 0.9565217391304348,
"grad_norm": 8.652717590332031,
"learning_rate": 0.0001682769726247987,
"loss": 29.4198,
"step": 396
},
{
"epoch": 0.961352657004831,
"grad_norm": 11.697986602783203,
"learning_rate": 0.0001681159420289855,
"loss": 29.3374,
"step": 398
},
{
"epoch": 0.966183574879227,
"grad_norm": 9.140453338623047,
"learning_rate": 0.00016795491143317231,
"loss": 28.7595,
"step": 400
},
{
"epoch": 0.9710144927536232,
"grad_norm": 8.438916206359863,
"learning_rate": 0.0001677938808373591,
"loss": 27.0845,
"step": 402
},
{
"epoch": 0.9758454106280193,
"grad_norm": 9.950366973876953,
"learning_rate": 0.0001676328502415459,
"loss": 28.7205,
"step": 404
},
{
"epoch": 0.9806763285024155,
"grad_norm": 7.97797155380249,
"learning_rate": 0.0001674718196457327,
"loss": 30.1159,
"step": 406
},
{
"epoch": 0.9855072463768116,
"grad_norm": 7.832582950592041,
"learning_rate": 0.0001673107890499195,
"loss": 29.9818,
"step": 408
},
{
"epoch": 0.9903381642512077,
"grad_norm": 9.50314998626709,
"learning_rate": 0.0001671497584541063,
"loss": 26.6032,
"step": 410
},
{
"epoch": 0.9951690821256038,
"grad_norm": 10.015514373779297,
"learning_rate": 0.00016698872785829308,
"loss": 30.9247,
"step": 412
},
{
"epoch": 1.0,
"grad_norm": 8.032495498657227,
"learning_rate": 0.00016682769726247988,
"loss": 29.0615,
"step": 414
},
{
"epoch": 1.0048309178743962,
"grad_norm": 7.304556846618652,
"learning_rate": 0.0001666666666666667,
"loss": 29.5321,
"step": 416
},
{
"epoch": 1.0096618357487923,
"grad_norm": 7.575999736785889,
"learning_rate": 0.00016650563607085346,
"loss": 26.892,
"step": 418
},
{
"epoch": 1.0144927536231885,
"grad_norm": 9.361268043518066,
"learning_rate": 0.00016634460547504027,
"loss": 28.0997,
"step": 420
},
{
"epoch": 1.0193236714975846,
"grad_norm": 8.8648099899292,
"learning_rate": 0.00016618357487922707,
"loss": 28.1403,
"step": 422
},
{
"epoch": 1.0241545893719808,
"grad_norm": 11.204512596130371,
"learning_rate": 0.00016602254428341385,
"loss": 29.3477,
"step": 424
},
{
"epoch": 1.0289855072463767,
"grad_norm": 8.673910140991211,
"learning_rate": 0.00016586151368760065,
"loss": 29.9338,
"step": 426
},
{
"epoch": 1.0338164251207729,
"grad_norm": 10.797616958618164,
"learning_rate": 0.00016570048309178746,
"loss": 28.3507,
"step": 428
},
{
"epoch": 1.038647342995169,
"grad_norm": 9.084686279296875,
"learning_rate": 0.00016553945249597426,
"loss": 27.9469,
"step": 430
},
{
"epoch": 1.0434782608695652,
"grad_norm": 9.642114639282227,
"learning_rate": 0.00016537842190016104,
"loss": 29.2968,
"step": 432
},
{
"epoch": 1.0483091787439613,
"grad_norm": 8.333573341369629,
"learning_rate": 0.00016521739130434784,
"loss": 27.1096,
"step": 434
},
{
"epoch": 1.0531400966183575,
"grad_norm": 10.562450408935547,
"learning_rate": 0.00016505636070853464,
"loss": 30.1957,
"step": 436
},
{
"epoch": 1.0579710144927537,
"grad_norm": 7.98309326171875,
"learning_rate": 0.00016489533011272142,
"loss": 29.5,
"step": 438
},
{
"epoch": 1.0628019323671498,
"grad_norm": 7.789132595062256,
"learning_rate": 0.00016473429951690822,
"loss": 29.6741,
"step": 440
},
{
"epoch": 1.067632850241546,
"grad_norm": 8.362640380859375,
"learning_rate": 0.00016457326892109503,
"loss": 29.4575,
"step": 442
},
{
"epoch": 1.0724637681159421,
"grad_norm": 7.407423973083496,
"learning_rate": 0.0001644122383252818,
"loss": 26.9575,
"step": 444
},
{
"epoch": 1.077294685990338,
"grad_norm": 8.499112129211426,
"learning_rate": 0.0001642512077294686,
"loss": 29.4598,
"step": 446
},
{
"epoch": 1.0821256038647342,
"grad_norm": 8.675498008728027,
"learning_rate": 0.0001640901771336554,
"loss": 26.3951,
"step": 448
},
{
"epoch": 1.0869565217391304,
"grad_norm": 9.390106201171875,
"learning_rate": 0.00016392914653784219,
"loss": 27.4861,
"step": 450
},
{
"epoch": 1.0917874396135265,
"grad_norm": 8.66092586517334,
"learning_rate": 0.000163768115942029,
"loss": 27.5384,
"step": 452
},
{
"epoch": 1.0966183574879227,
"grad_norm": 9.866594314575195,
"learning_rate": 0.0001636070853462158,
"loss": 28.5237,
"step": 454
},
{
"epoch": 1.1014492753623188,
"grad_norm": 8.653681755065918,
"learning_rate": 0.00016344605475040257,
"loss": 28.2452,
"step": 456
},
{
"epoch": 1.106280193236715,
"grad_norm": 7.964409351348877,
"learning_rate": 0.00016328502415458937,
"loss": 27.3373,
"step": 458
},
{
"epoch": 1.1111111111111112,
"grad_norm": 6.7314863204956055,
"learning_rate": 0.00016312399355877618,
"loss": 27.628,
"step": 460
},
{
"epoch": 1.1159420289855073,
"grad_norm": 8.670600891113281,
"learning_rate": 0.00016296296296296295,
"loss": 28.4199,
"step": 462
},
{
"epoch": 1.1207729468599035,
"grad_norm": 8.304594993591309,
"learning_rate": 0.00016280193236714976,
"loss": 28.3439,
"step": 464
},
{
"epoch": 1.1256038647342996,
"grad_norm": 8.142372131347656,
"learning_rate": 0.00016264090177133656,
"loss": 29.8356,
"step": 466
},
{
"epoch": 1.1304347826086956,
"grad_norm": 9.617864608764648,
"learning_rate": 0.00016247987117552336,
"loss": 26.8729,
"step": 468
},
{
"epoch": 1.1352657004830917,
"grad_norm": 11.739964485168457,
"learning_rate": 0.00016231884057971017,
"loss": 27.5099,
"step": 470
},
{
"epoch": 1.1400966183574879,
"grad_norm": 9.0482759475708,
"learning_rate": 0.00016215780998389697,
"loss": 25.6606,
"step": 472
},
{
"epoch": 1.144927536231884,
"grad_norm": 7.055074214935303,
"learning_rate": 0.00016199677938808375,
"loss": 27.0075,
"step": 474
},
{
"epoch": 1.1497584541062802,
"grad_norm": 9.319602012634277,
"learning_rate": 0.00016183574879227055,
"loss": 30.2885,
"step": 476
},
{
"epoch": 1.1545893719806763,
"grad_norm": 9.021683692932129,
"learning_rate": 0.00016167471819645735,
"loss": 28.8099,
"step": 478
},
{
"epoch": 1.1594202898550725,
"grad_norm": 6.554941177368164,
"learning_rate": 0.00016151368760064413,
"loss": 28.2995,
"step": 480
},
{
"epoch": 1.1642512077294687,
"grad_norm": 7.542542934417725,
"learning_rate": 0.00016135265700483093,
"loss": 28.35,
"step": 482
},
{
"epoch": 1.1690821256038648,
"grad_norm": 12.053621292114258,
"learning_rate": 0.00016119162640901774,
"loss": 30.3838,
"step": 484
},
{
"epoch": 1.1739130434782608,
"grad_norm": 8.615163803100586,
"learning_rate": 0.00016103059581320451,
"loss": 27.872,
"step": 486
},
{
"epoch": 1.178743961352657,
"grad_norm": 10.88862419128418,
"learning_rate": 0.00016086956521739132,
"loss": 26.9656,
"step": 488
},
{
"epoch": 1.183574879227053,
"grad_norm": 9.35364818572998,
"learning_rate": 0.00016070853462157812,
"loss": 27.791,
"step": 490
},
{
"epoch": 1.1884057971014492,
"grad_norm": 8.610274314880371,
"learning_rate": 0.0001605475040257649,
"loss": 27.1357,
"step": 492
},
{
"epoch": 1.1932367149758454,
"grad_norm": 8.759892463684082,
"learning_rate": 0.0001603864734299517,
"loss": 27.035,
"step": 494
},
{
"epoch": 1.1980676328502415,
"grad_norm": 10.015132904052734,
"learning_rate": 0.0001602254428341385,
"loss": 28.5965,
"step": 496
},
{
"epoch": 1.2028985507246377,
"grad_norm": 9.121025085449219,
"learning_rate": 0.00016006441223832528,
"loss": 28.5107,
"step": 498
},
{
"epoch": 1.2077294685990339,
"grad_norm": 9.401590347290039,
"learning_rate": 0.00015990338164251208,
"loss": 28.5304,
"step": 500
},
{
"epoch": 1.21256038647343,
"grad_norm": 8.708001136779785,
"learning_rate": 0.0001597423510466989,
"loss": 29.9226,
"step": 502
},
{
"epoch": 1.2173913043478262,
"grad_norm": 9.344232559204102,
"learning_rate": 0.00015958132045088566,
"loss": 27.7489,
"step": 504
},
{
"epoch": 1.2222222222222223,
"grad_norm": 7.874361991882324,
"learning_rate": 0.00015942028985507247,
"loss": 29.2094,
"step": 506
},
{
"epoch": 1.2270531400966185,
"grad_norm": 9.35866928100586,
"learning_rate": 0.00015925925925925927,
"loss": 28.5889,
"step": 508
},
{
"epoch": 1.2318840579710144,
"grad_norm": 9.740680694580078,
"learning_rate": 0.00015909822866344605,
"loss": 29.5747,
"step": 510
},
{
"epoch": 1.2367149758454106,
"grad_norm": 7.713297367095947,
"learning_rate": 0.00015893719806763285,
"loss": 28.9137,
"step": 512
},
{
"epoch": 1.2415458937198067,
"grad_norm": 8.05880355834961,
"learning_rate": 0.00015877616747181965,
"loss": 29.4705,
"step": 514
},
{
"epoch": 1.2463768115942029,
"grad_norm": 8.30479621887207,
"learning_rate": 0.00015861513687600643,
"loss": 29.1907,
"step": 516
},
{
"epoch": 1.251207729468599,
"grad_norm": 10.590409278869629,
"learning_rate": 0.00015845410628019323,
"loss": 27.9428,
"step": 518
},
{
"epoch": 1.2560386473429952,
"grad_norm": 8.054545402526855,
"learning_rate": 0.00015829307568438004,
"loss": 28.4886,
"step": 520
},
{
"epoch": 1.2608695652173914,
"grad_norm": 8.148458480834961,
"learning_rate": 0.00015813204508856681,
"loss": 27.5395,
"step": 522
},
{
"epoch": 1.2657004830917875,
"grad_norm": 8.747846603393555,
"learning_rate": 0.00015797101449275362,
"loss": 29.784,
"step": 524
},
{
"epoch": 1.2705314009661834,
"grad_norm": 8.56131362915039,
"learning_rate": 0.00015780998389694042,
"loss": 26.8683,
"step": 526
},
{
"epoch": 1.2753623188405796,
"grad_norm": 7.3210883140563965,
"learning_rate": 0.0001576489533011272,
"loss": 28.6462,
"step": 528
},
{
"epoch": 1.2801932367149758,
"grad_norm": 7.494152545928955,
"learning_rate": 0.00015748792270531403,
"loss": 28.5302,
"step": 530
},
{
"epoch": 1.285024154589372,
"grad_norm": 8.267993927001953,
"learning_rate": 0.00015732689210950083,
"loss": 28.3822,
"step": 532
},
{
"epoch": 1.289855072463768,
"grad_norm": 9.768172264099121,
"learning_rate": 0.0001571658615136876,
"loss": 27.6898,
"step": 534
},
{
"epoch": 1.2946859903381642,
"grad_norm": 6.865130424499512,
"learning_rate": 0.0001570048309178744,
"loss": 28.2198,
"step": 536
},
{
"epoch": 1.2995169082125604,
"grad_norm": 8.628961563110352,
"learning_rate": 0.00015684380032206122,
"loss": 26.5476,
"step": 538
},
{
"epoch": 1.3043478260869565,
"grad_norm": 9.150886535644531,
"learning_rate": 0.000156682769726248,
"loss": 29.4819,
"step": 540
},
{
"epoch": 1.3091787439613527,
"grad_norm": 8.535932540893555,
"learning_rate": 0.0001565217391304348,
"loss": 28.8612,
"step": 542
},
{
"epoch": 1.3140096618357489,
"grad_norm": 8.818495750427246,
"learning_rate": 0.0001563607085346216,
"loss": 29.2101,
"step": 544
},
{
"epoch": 1.318840579710145,
"grad_norm": 8.080242156982422,
"learning_rate": 0.00015619967793880838,
"loss": 26.2693,
"step": 546
},
{
"epoch": 1.3236714975845412,
"grad_norm": 7.340477466583252,
"learning_rate": 0.00015603864734299518,
"loss": 28.7229,
"step": 548
},
{
"epoch": 1.3285024154589373,
"grad_norm": 7.5151047706604,
"learning_rate": 0.00015587761674718198,
"loss": 28.9171,
"step": 550
},
{
"epoch": 1.3333333333333333,
"grad_norm": 8.710932731628418,
"learning_rate": 0.00015571658615136876,
"loss": 27.4353,
"step": 552
},
{
"epoch": 1.3381642512077294,
"grad_norm": 8.146522521972656,
"learning_rate": 0.00015555555555555556,
"loss": 27.0576,
"step": 554
},
{
"epoch": 1.3429951690821256,
"grad_norm": 9.677267074584961,
"learning_rate": 0.00015539452495974237,
"loss": 27.569,
"step": 556
},
{
"epoch": 1.3478260869565217,
"grad_norm": 8.272392272949219,
"learning_rate": 0.00015523349436392914,
"loss": 27.5188,
"step": 558
},
{
"epoch": 1.3526570048309179,
"grad_norm": 9.684012413024902,
"learning_rate": 0.00015507246376811595,
"loss": 29.1665,
"step": 560
},
{
"epoch": 1.357487922705314,
"grad_norm": 12.55364990234375,
"learning_rate": 0.00015491143317230275,
"loss": 28.0156,
"step": 562
},
{
"epoch": 1.3623188405797102,
"grad_norm": 8.099139213562012,
"learning_rate": 0.00015475040257648953,
"loss": 28.706,
"step": 564
},
{
"epoch": 1.3671497584541064,
"grad_norm": 9.807384490966797,
"learning_rate": 0.00015458937198067633,
"loss": 28.5265,
"step": 566
},
{
"epoch": 1.3719806763285023,
"grad_norm": 9.85666275024414,
"learning_rate": 0.00015442834138486313,
"loss": 27.7005,
"step": 568
},
{
"epoch": 1.3768115942028984,
"grad_norm": 7.128468990325928,
"learning_rate": 0.00015426731078904994,
"loss": 27.5816,
"step": 570
},
{
"epoch": 1.3816425120772946,
"grad_norm": 8.653708457946777,
"learning_rate": 0.0001541062801932367,
"loss": 28.4573,
"step": 572
},
{
"epoch": 1.3864734299516908,
"grad_norm": 7.988314151763916,
"learning_rate": 0.00015394524959742352,
"loss": 27.9508,
"step": 574
},
{
"epoch": 1.391304347826087,
"grad_norm": 10.148573875427246,
"learning_rate": 0.00015378421900161032,
"loss": 28.0399,
"step": 576
},
{
"epoch": 1.396135265700483,
"grad_norm": 8.33492660522461,
"learning_rate": 0.0001536231884057971,
"loss": 29.6252,
"step": 578
},
{
"epoch": 1.4009661835748792,
"grad_norm": 10.362284660339355,
"learning_rate": 0.0001534621578099839,
"loss": 28.2634,
"step": 580
},
{
"epoch": 1.4057971014492754,
"grad_norm": 11.445610046386719,
"learning_rate": 0.0001533011272141707,
"loss": 27.94,
"step": 582
},
{
"epoch": 1.4106280193236715,
"grad_norm": 5.8856916427612305,
"learning_rate": 0.00015314009661835748,
"loss": 28.6919,
"step": 584
},
{
"epoch": 1.4154589371980677,
"grad_norm": 8.040237426757812,
"learning_rate": 0.00015297906602254428,
"loss": 27.4428,
"step": 586
},
{
"epoch": 1.4202898550724639,
"grad_norm": 8.459455490112305,
"learning_rate": 0.0001528180354267311,
"loss": 28.883,
"step": 588
},
{
"epoch": 1.42512077294686,
"grad_norm": 9.862092971801758,
"learning_rate": 0.00015265700483091786,
"loss": 27.5396,
"step": 590
},
{
"epoch": 1.4299516908212562,
"grad_norm": 9.240866661071777,
"learning_rate": 0.0001524959742351047,
"loss": 28.5589,
"step": 592
},
{
"epoch": 1.434782608695652,
"grad_norm": 8.943296432495117,
"learning_rate": 0.00015233494363929147,
"loss": 28.6255,
"step": 594
},
{
"epoch": 1.4396135265700483,
"grad_norm": 9.087813377380371,
"learning_rate": 0.00015217391304347827,
"loss": 29.508,
"step": 596
},
{
"epoch": 1.4444444444444444,
"grad_norm": 8.143028259277344,
"learning_rate": 0.00015201288244766508,
"loss": 27.9508,
"step": 598
},
{
"epoch": 1.4492753623188406,
"grad_norm": 8.073821067810059,
"learning_rate": 0.00015185185185185185,
"loss": 28.4479,
"step": 600
},
{
"epoch": 1.4541062801932367,
"grad_norm": 7.678289413452148,
"learning_rate": 0.00015169082125603866,
"loss": 28.75,
"step": 602
},
{
"epoch": 1.458937198067633,
"grad_norm": 7.962745189666748,
"learning_rate": 0.00015152979066022546,
"loss": 28.074,
"step": 604
},
{
"epoch": 1.463768115942029,
"grad_norm": 8.225008010864258,
"learning_rate": 0.00015136876006441224,
"loss": 28.5051,
"step": 606
},
{
"epoch": 1.4685990338164252,
"grad_norm": 6.815709590911865,
"learning_rate": 0.00015120772946859904,
"loss": 29.1703,
"step": 608
},
{
"epoch": 1.4734299516908211,
"grad_norm": 7.653327465057373,
"learning_rate": 0.00015104669887278585,
"loss": 27.2951,
"step": 610
},
{
"epoch": 1.4782608695652173,
"grad_norm": 10.327927589416504,
"learning_rate": 0.00015088566827697265,
"loss": 28.1144,
"step": 612
},
{
"epoch": 1.4830917874396135,
"grad_norm": 8.612911224365234,
"learning_rate": 0.00015072463768115943,
"loss": 28.5057,
"step": 614
},
{
"epoch": 1.4879227053140096,
"grad_norm": 8.190404891967773,
"learning_rate": 0.00015056360708534623,
"loss": 27.3437,
"step": 616
},
{
"epoch": 1.4927536231884058,
"grad_norm": 7.556375980377197,
"learning_rate": 0.00015040257648953303,
"loss": 26.3396,
"step": 618
},
{
"epoch": 1.497584541062802,
"grad_norm": 8.995963096618652,
"learning_rate": 0.0001502415458937198,
"loss": 28.9629,
"step": 620
},
{
"epoch": 1.502415458937198,
"grad_norm": 13.403937339782715,
"learning_rate": 0.0001500805152979066,
"loss": 28.1381,
"step": 622
},
{
"epoch": 1.5072463768115942,
"grad_norm": 8.48337459564209,
"learning_rate": 0.00014991948470209342,
"loss": 28.8002,
"step": 624
},
{
"epoch": 1.5120772946859904,
"grad_norm": 7.916252613067627,
"learning_rate": 0.0001497584541062802,
"loss": 29.5661,
"step": 626
},
{
"epoch": 1.5169082125603865,
"grad_norm": 8.097860336303711,
"learning_rate": 0.000149597423510467,
"loss": 29.1108,
"step": 628
},
{
"epoch": 1.5217391304347827,
"grad_norm": 7.992598056793213,
"learning_rate": 0.0001494363929146538,
"loss": 25.8962,
"step": 630
},
{
"epoch": 1.5265700483091789,
"grad_norm": 6.601809501647949,
"learning_rate": 0.00014927536231884058,
"loss": 27.6647,
"step": 632
},
{
"epoch": 1.531400966183575,
"grad_norm": 10.532876014709473,
"learning_rate": 0.00014911433172302738,
"loss": 30.0814,
"step": 634
},
{
"epoch": 1.5362318840579712,
"grad_norm": 8.925707817077637,
"learning_rate": 0.00014895330112721418,
"loss": 28.0325,
"step": 636
},
{
"epoch": 1.541062801932367,
"grad_norm": 6.749852657318115,
"learning_rate": 0.00014879227053140096,
"loss": 27.5785,
"step": 638
},
{
"epoch": 1.5458937198067633,
"grad_norm": 9.3954439163208,
"learning_rate": 0.00014863123993558776,
"loss": 29.7051,
"step": 640
},
{
"epoch": 1.5507246376811594,
"grad_norm": 7.524625778198242,
"learning_rate": 0.00014847020933977457,
"loss": 29.8091,
"step": 642
},
{
"epoch": 1.5555555555555556,
"grad_norm": 8.303244590759277,
"learning_rate": 0.00014830917874396134,
"loss": 28.2779,
"step": 644
},
{
"epoch": 1.5603864734299517,
"grad_norm": 8.040205001831055,
"learning_rate": 0.00014814814814814815,
"loss": 29.3399,
"step": 646
},
{
"epoch": 1.5652173913043477,
"grad_norm": 6.253566265106201,
"learning_rate": 0.00014798711755233495,
"loss": 29.2561,
"step": 648
},
{
"epoch": 1.5700483091787438,
"grad_norm": 8.045578002929688,
"learning_rate": 0.00014782608695652173,
"loss": 26.8589,
"step": 650
},
{
"epoch": 1.57487922705314,
"grad_norm": 8.851433753967285,
"learning_rate": 0.00014766505636070853,
"loss": 28.2428,
"step": 652
},
{
"epoch": 1.5797101449275361,
"grad_norm": 8.400809288024902,
"learning_rate": 0.00014750402576489533,
"loss": 29.5603,
"step": 654
},
{
"epoch": 1.5845410628019323,
"grad_norm": 7.533260345458984,
"learning_rate": 0.00014734299516908214,
"loss": 26.993,
"step": 656
},
{
"epoch": 1.5893719806763285,
"grad_norm": 6.993838310241699,
"learning_rate": 0.00014718196457326894,
"loss": 28.0578,
"step": 658
},
{
"epoch": 1.5942028985507246,
"grad_norm": 9.009007453918457,
"learning_rate": 0.00014702093397745574,
"loss": 27.0416,
"step": 660
},
{
"epoch": 1.5990338164251208,
"grad_norm": 7.587328910827637,
"learning_rate": 0.00014685990338164252,
"loss": 27.896,
"step": 662
},
{
"epoch": 1.603864734299517,
"grad_norm": 7.423081398010254,
"learning_rate": 0.00014669887278582932,
"loss": 26.6892,
"step": 664
},
{
"epoch": 1.608695652173913,
"grad_norm": 8.408404350280762,
"learning_rate": 0.00014653784219001613,
"loss": 27.5191,
"step": 666
},
{
"epoch": 1.6135265700483092,
"grad_norm": 8.044210433959961,
"learning_rate": 0.0001463768115942029,
"loss": 30.3552,
"step": 668
},
{
"epoch": 1.6183574879227054,
"grad_norm": 8.7662935256958,
"learning_rate": 0.0001462157809983897,
"loss": 27.6742,
"step": 670
},
{
"epoch": 1.6231884057971016,
"grad_norm": 7.504002094268799,
"learning_rate": 0.0001460547504025765,
"loss": 26.1561,
"step": 672
},
{
"epoch": 1.6280193236714977,
"grad_norm": 7.4576826095581055,
"learning_rate": 0.0001458937198067633,
"loss": 27.9011,
"step": 674
},
{
"epoch": 1.6328502415458939,
"grad_norm": 7.216124057769775,
"learning_rate": 0.0001457326892109501,
"loss": 27.3678,
"step": 676
},
{
"epoch": 1.6376811594202898,
"grad_norm": 8.461156845092773,
"learning_rate": 0.0001455716586151369,
"loss": 28.0337,
"step": 678
},
{
"epoch": 1.642512077294686,
"grad_norm": 9.682413101196289,
"learning_rate": 0.00014541062801932367,
"loss": 27.1031,
"step": 680
},
{
"epoch": 1.6473429951690821,
"grad_norm": 6.8604817390441895,
"learning_rate": 0.00014524959742351047,
"loss": 28.9948,
"step": 682
},
{
"epoch": 1.6521739130434783,
"grad_norm": 8.835001945495605,
"learning_rate": 0.00014508856682769728,
"loss": 28.2096,
"step": 684
},
{
"epoch": 1.6570048309178744,
"grad_norm": 8.947821617126465,
"learning_rate": 0.00014492753623188405,
"loss": 28.8512,
"step": 686
},
{
"epoch": 1.6618357487922706,
"grad_norm": 7.301581859588623,
"learning_rate": 0.00014476650563607086,
"loss": 28.8649,
"step": 688
},
{
"epoch": 1.6666666666666665,
"grad_norm": 8.465940475463867,
"learning_rate": 0.00014460547504025766,
"loss": 28.6396,
"step": 690
},
{
"epoch": 1.6714975845410627,
"grad_norm": 9.281678199768066,
"learning_rate": 0.00014444444444444444,
"loss": 26.2167,
"step": 692
},
{
"epoch": 1.6763285024154588,
"grad_norm": 8.054730415344238,
"learning_rate": 0.00014428341384863124,
"loss": 30.5799,
"step": 694
},
{
"epoch": 1.681159420289855,
"grad_norm": 9.177703857421875,
"learning_rate": 0.00014412238325281804,
"loss": 27.6689,
"step": 696
},
{
"epoch": 1.6859903381642511,
"grad_norm": 7.34149169921875,
"learning_rate": 0.00014396135265700482,
"loss": 28.1966,
"step": 698
},
{
"epoch": 1.6908212560386473,
"grad_norm": 9.34843921661377,
"learning_rate": 0.00014380032206119162,
"loss": 28.2126,
"step": 700
},
{
"epoch": 1.6956521739130435,
"grad_norm": 8.255733489990234,
"learning_rate": 0.00014363929146537843,
"loss": 28.279,
"step": 702
},
{
"epoch": 1.7004830917874396,
"grad_norm": 7.138146877288818,
"learning_rate": 0.0001434782608695652,
"loss": 28.0335,
"step": 704
},
{
"epoch": 1.7053140096618358,
"grad_norm": 7.608633041381836,
"learning_rate": 0.000143317230273752,
"loss": 27.788,
"step": 706
},
{
"epoch": 1.710144927536232,
"grad_norm": 10.221348762512207,
"learning_rate": 0.0001431561996779388,
"loss": 26.4698,
"step": 708
},
{
"epoch": 1.714975845410628,
"grad_norm": 7.764200210571289,
"learning_rate": 0.0001429951690821256,
"loss": 27.5784,
"step": 710
},
{
"epoch": 1.7198067632850242,
"grad_norm": 7.295494079589844,
"learning_rate": 0.0001428341384863124,
"loss": 25.7921,
"step": 712
},
{
"epoch": 1.7246376811594204,
"grad_norm": 7.534460544586182,
"learning_rate": 0.0001426731078904992,
"loss": 27.5532,
"step": 714
},
{
"epoch": 1.7294685990338166,
"grad_norm": 7.3485002517700195,
"learning_rate": 0.000142512077294686,
"loss": 27.3069,
"step": 716
},
{
"epoch": 1.7342995169082127,
"grad_norm": 7.3418049812316895,
"learning_rate": 0.0001423510466988728,
"loss": 28.4993,
"step": 718
},
{
"epoch": 1.7391304347826086,
"grad_norm": 7.740454196929932,
"learning_rate": 0.0001421900161030596,
"loss": 27.7353,
"step": 720
},
{
"epoch": 1.7439613526570048,
"grad_norm": 6.945924282073975,
"learning_rate": 0.00014202898550724638,
"loss": 29.0447,
"step": 722
},
{
"epoch": 1.748792270531401,
"grad_norm": 10.651424407958984,
"learning_rate": 0.00014186795491143319,
"loss": 28.2643,
"step": 724
},
{
"epoch": 1.7536231884057971,
"grad_norm": 8.329526901245117,
"learning_rate": 0.00014170692431562,
"loss": 27.9287,
"step": 726
},
{
"epoch": 1.7584541062801933,
"grad_norm": 9.379905700683594,
"learning_rate": 0.00014154589371980677,
"loss": 29.7388,
"step": 728
},
{
"epoch": 1.7632850241545892,
"grad_norm": 8.386578559875488,
"learning_rate": 0.00014138486312399357,
"loss": 27.136,
"step": 730
},
{
"epoch": 1.7681159420289854,
"grad_norm": 7.3653388023376465,
"learning_rate": 0.00014122383252818037,
"loss": 27.7946,
"step": 732
},
{
"epoch": 1.7729468599033815,
"grad_norm": 8.317994117736816,
"learning_rate": 0.00014106280193236715,
"loss": 27.9617,
"step": 734
},
{
"epoch": 1.7777777777777777,
"grad_norm": 9.021920204162598,
"learning_rate": 0.00014090177133655395,
"loss": 27.6967,
"step": 736
},
{
"epoch": 1.7826086956521738,
"grad_norm": 6.490061283111572,
"learning_rate": 0.00014074074074074076,
"loss": 29.1245,
"step": 738
},
{
"epoch": 1.78743961352657,
"grad_norm": 8.023162841796875,
"learning_rate": 0.00014057971014492753,
"loss": 28.1596,
"step": 740
},
{
"epoch": 1.7922705314009661,
"grad_norm": 7.9419169425964355,
"learning_rate": 0.00014041867954911434,
"loss": 27.8295,
"step": 742
},
{
"epoch": 1.7971014492753623,
"grad_norm": 7.990035057067871,
"learning_rate": 0.00014025764895330114,
"loss": 26.8629,
"step": 744
},
{
"epoch": 1.8019323671497585,
"grad_norm": 8.936909675598145,
"learning_rate": 0.00014009661835748792,
"loss": 28.8046,
"step": 746
},
{
"epoch": 1.8067632850241546,
"grad_norm": 8.737541198730469,
"learning_rate": 0.00013993558776167472,
"loss": 28.1564,
"step": 748
},
{
"epoch": 1.8115942028985508,
"grad_norm": 6.9399518966674805,
"learning_rate": 0.00013977455716586152,
"loss": 28.085,
"step": 750
},
{
"epoch": 1.816425120772947,
"grad_norm": 7.811395645141602,
"learning_rate": 0.00013961352657004833,
"loss": 26.8601,
"step": 752
},
{
"epoch": 1.821256038647343,
"grad_norm": 7.366069793701172,
"learning_rate": 0.0001394524959742351,
"loss": 27.7022,
"step": 754
},
{
"epoch": 1.8260869565217392,
"grad_norm": 7.216097831726074,
"learning_rate": 0.0001392914653784219,
"loss": 27.8814,
"step": 756
},
{
"epoch": 1.8309178743961354,
"grad_norm": 7.748776912689209,
"learning_rate": 0.0001391304347826087,
"loss": 26.7868,
"step": 758
},
{
"epoch": 1.8357487922705316,
"grad_norm": 7.118618488311768,
"learning_rate": 0.0001389694041867955,
"loss": 27.519,
"step": 760
},
{
"epoch": 1.8405797101449275,
"grad_norm": 7.588200092315674,
"learning_rate": 0.0001388083735909823,
"loss": 27.8801,
"step": 762
},
{
"epoch": 1.8454106280193237,
"grad_norm": 8.082246780395508,
"learning_rate": 0.0001386473429951691,
"loss": 27.8767,
"step": 764
},
{
"epoch": 1.8502415458937198,
"grad_norm": 8.772019386291504,
"learning_rate": 0.00013848631239935587,
"loss": 27.0454,
"step": 766
},
{
"epoch": 1.855072463768116,
"grad_norm": 11.820154190063477,
"learning_rate": 0.00013832528180354267,
"loss": 27.4214,
"step": 768
},
{
"epoch": 1.8599033816425121,
"grad_norm": 7.21035623550415,
"learning_rate": 0.00013816425120772948,
"loss": 28.2132,
"step": 770
},
{
"epoch": 1.864734299516908,
"grad_norm": 7.833118438720703,
"learning_rate": 0.00013800322061191625,
"loss": 26.0298,
"step": 772
},
{
"epoch": 1.8695652173913042,
"grad_norm": 9.474292755126953,
"learning_rate": 0.00013784219001610306,
"loss": 27.4543,
"step": 774
},
{
"epoch": 1.8743961352657004,
"grad_norm": 8.790839195251465,
"learning_rate": 0.00013768115942028986,
"loss": 26.6007,
"step": 776
},
{
"epoch": 1.8792270531400965,
"grad_norm": 7.7932963371276855,
"learning_rate": 0.00013752012882447664,
"loss": 26.3272,
"step": 778
},
{
"epoch": 1.8840579710144927,
"grad_norm": 8.080236434936523,
"learning_rate": 0.00013735909822866347,
"loss": 27.8956,
"step": 780
},
{
"epoch": 1.8888888888888888,
"grad_norm": 8.07216739654541,
"learning_rate": 0.00013719806763285024,
"loss": 27.893,
"step": 782
},
{
"epoch": 1.893719806763285,
"grad_norm": 12.139753341674805,
"learning_rate": 0.00013703703703703705,
"loss": 29.3664,
"step": 784
},
{
"epoch": 1.8985507246376812,
"grad_norm": 8.131410598754883,
"learning_rate": 0.00013687600644122385,
"loss": 26.927,
"step": 786
},
{
"epoch": 1.9033816425120773,
"grad_norm": 7.748467922210693,
"learning_rate": 0.00013671497584541066,
"loss": 27.8784,
"step": 788
},
{
"epoch": 1.9082125603864735,
"grad_norm": 7.19915771484375,
"learning_rate": 0.00013655394524959743,
"loss": 26.3311,
"step": 790
},
{
"epoch": 1.9130434782608696,
"grad_norm": 7.374076843261719,
"learning_rate": 0.00013639291465378424,
"loss": 27.5376,
"step": 792
},
{
"epoch": 1.9178743961352658,
"grad_norm": 9.71866512298584,
"learning_rate": 0.00013623188405797104,
"loss": 25.3754,
"step": 794
},
{
"epoch": 1.922705314009662,
"grad_norm": 7.384367942810059,
"learning_rate": 0.00013607085346215782,
"loss": 27.993,
"step": 796
},
{
"epoch": 1.927536231884058,
"grad_norm": 8.255502700805664,
"learning_rate": 0.00013590982286634462,
"loss": 27.9883,
"step": 798
},
{
"epoch": 1.9323671497584543,
"grad_norm": 6.8607306480407715,
"learning_rate": 0.00013574879227053142,
"loss": 27.9741,
"step": 800
},
{
"epoch": 1.9371980676328504,
"grad_norm": 7.215616226196289,
"learning_rate": 0.0001355877616747182,
"loss": 28.1998,
"step": 802
},
{
"epoch": 1.9420289855072463,
"grad_norm": 7.920051574707031,
"learning_rate": 0.000135426731078905,
"loss": 27.6914,
"step": 804
},
{
"epoch": 1.9468599033816425,
"grad_norm": 7.799438953399658,
"learning_rate": 0.0001352657004830918,
"loss": 26.2782,
"step": 806
},
{
"epoch": 1.9516908212560387,
"grad_norm": 7.846622943878174,
"learning_rate": 0.00013510466988727858,
"loss": 27.5173,
"step": 808
},
{
"epoch": 1.9565217391304348,
"grad_norm": 7.30129861831665,
"learning_rate": 0.00013494363929146539,
"loss": 24.9119,
"step": 810
},
{
"epoch": 1.961352657004831,
"grad_norm": 7.13409948348999,
"learning_rate": 0.0001347826086956522,
"loss": 27.9657,
"step": 812
},
{
"epoch": 1.966183574879227,
"grad_norm": 9.307235717773438,
"learning_rate": 0.00013462157809983897,
"loss": 27.6966,
"step": 814
},
{
"epoch": 1.971014492753623,
"grad_norm": 7.8404741287231445,
"learning_rate": 0.00013446054750402577,
"loss": 26.6173,
"step": 816
},
{
"epoch": 1.9758454106280192,
"grad_norm": 8.165302276611328,
"learning_rate": 0.00013429951690821257,
"loss": 27.9967,
"step": 818
},
{
"epoch": 1.9806763285024154,
"grad_norm": 7.126535892486572,
"learning_rate": 0.00013413848631239935,
"loss": 28.0344,
"step": 820
},
{
"epoch": 1.9855072463768115,
"grad_norm": 9.40721321105957,
"learning_rate": 0.00013397745571658615,
"loss": 26.6568,
"step": 822
},
{
"epoch": 1.9903381642512077,
"grad_norm": 6.842724323272705,
"learning_rate": 0.00013381642512077296,
"loss": 27.3194,
"step": 824
},
{
"epoch": 1.9951690821256038,
"grad_norm": 6.537780284881592,
"learning_rate": 0.00013365539452495973,
"loss": 27.7764,
"step": 826
},
{
"epoch": 2.0,
"grad_norm": 6.422900199890137,
"learning_rate": 0.00013349436392914654,
"loss": 27.8163,
"step": 828
},
{
"epoch": 2.004830917874396,
"grad_norm": 7.0895466804504395,
"learning_rate": 0.00013333333333333334,
"loss": 28.3841,
"step": 830
},
{
"epoch": 2.0096618357487923,
"grad_norm": 6.439542293548584,
"learning_rate": 0.00013317230273752012,
"loss": 28.0774,
"step": 832
},
{
"epoch": 2.0144927536231885,
"grad_norm": 7.766908645629883,
"learning_rate": 0.00013301127214170692,
"loss": 26.6744,
"step": 834
},
{
"epoch": 2.0193236714975846,
"grad_norm": 9.178189277648926,
"learning_rate": 0.00013285024154589372,
"loss": 29.3364,
"step": 836
},
{
"epoch": 2.024154589371981,
"grad_norm": 6.916229248046875,
"learning_rate": 0.0001326892109500805,
"loss": 28.0622,
"step": 838
},
{
"epoch": 2.028985507246377,
"grad_norm": 7.51179838180542,
"learning_rate": 0.0001325281803542673,
"loss": 26.2849,
"step": 840
},
{
"epoch": 2.033816425120773,
"grad_norm": 8.321070671081543,
"learning_rate": 0.00013236714975845413,
"loss": 27.222,
"step": 842
},
{
"epoch": 2.0386473429951693,
"grad_norm": 6.450362205505371,
"learning_rate": 0.0001322061191626409,
"loss": 28.371,
"step": 844
},
{
"epoch": 2.0434782608695654,
"grad_norm": 9.631372451782227,
"learning_rate": 0.00013204508856682771,
"loss": 29.1135,
"step": 846
},
{
"epoch": 2.0483091787439616,
"grad_norm": 7.727206707000732,
"learning_rate": 0.00013188405797101452,
"loss": 27.9027,
"step": 848
},
{
"epoch": 2.0531400966183573,
"grad_norm": 8.837319374084473,
"learning_rate": 0.0001317230273752013,
"loss": 27.5687,
"step": 850
},
{
"epoch": 2.0579710144927534,
"grad_norm": 8.151753425598145,
"learning_rate": 0.0001315619967793881,
"loss": 27.0424,
"step": 852
},
{
"epoch": 2.0628019323671496,
"grad_norm": 7.2588605880737305,
"learning_rate": 0.0001314009661835749,
"loss": 26.2731,
"step": 854
},
{
"epoch": 2.0676328502415457,
"grad_norm": 9.428071975708008,
"learning_rate": 0.00013123993558776168,
"loss": 27.1224,
"step": 856
},
{
"epoch": 2.072463768115942,
"grad_norm": 8.864592552185059,
"learning_rate": 0.00013107890499194848,
"loss": 27.3137,
"step": 858
},
{
"epoch": 2.077294685990338,
"grad_norm": 9.21855640411377,
"learning_rate": 0.00013091787439613528,
"loss": 26.886,
"step": 860
},
{
"epoch": 2.082125603864734,
"grad_norm": 7.239558696746826,
"learning_rate": 0.00013075684380032206,
"loss": 28.2175,
"step": 862
},
{
"epoch": 2.0869565217391304,
"grad_norm": 8.155842781066895,
"learning_rate": 0.00013059581320450886,
"loss": 27.7151,
"step": 864
},
{
"epoch": 2.0917874396135265,
"grad_norm": 7.057051658630371,
"learning_rate": 0.00013043478260869567,
"loss": 26.3673,
"step": 866
},
{
"epoch": 2.0966183574879227,
"grad_norm": 7.664299488067627,
"learning_rate": 0.00013027375201288244,
"loss": 25.7326,
"step": 868
},
{
"epoch": 2.101449275362319,
"grad_norm": 6.310895919799805,
"learning_rate": 0.00013011272141706925,
"loss": 28.7024,
"step": 870
},
{
"epoch": 2.106280193236715,
"grad_norm": 7.707338809967041,
"learning_rate": 0.00012995169082125605,
"loss": 27.724,
"step": 872
},
{
"epoch": 2.111111111111111,
"grad_norm": 7.318761825561523,
"learning_rate": 0.00012979066022544283,
"loss": 27.2221,
"step": 874
},
{
"epoch": 2.1159420289855073,
"grad_norm": 9.668201446533203,
"learning_rate": 0.00012962962962962963,
"loss": 27.0287,
"step": 876
},
{
"epoch": 2.1207729468599035,
"grad_norm": 7.614035129547119,
"learning_rate": 0.00012946859903381643,
"loss": 26.1026,
"step": 878
},
{
"epoch": 2.1256038647342996,
"grad_norm": 8.675333023071289,
"learning_rate": 0.0001293075684380032,
"loss": 27.6808,
"step": 880
},
{
"epoch": 2.130434782608696,
"grad_norm": 6.966851234436035,
"learning_rate": 0.00012914653784219001,
"loss": 27.6239,
"step": 882
},
{
"epoch": 2.135265700483092,
"grad_norm": 6.5391974449157715,
"learning_rate": 0.00012898550724637682,
"loss": 27.9896,
"step": 884
},
{
"epoch": 2.140096618357488,
"grad_norm": 8.508500099182129,
"learning_rate": 0.0001288244766505636,
"loss": 27.918,
"step": 886
},
{
"epoch": 2.1449275362318843,
"grad_norm": 7.540635108947754,
"learning_rate": 0.0001286634460547504,
"loss": 28.694,
"step": 888
},
{
"epoch": 2.14975845410628,
"grad_norm": 8.311809539794922,
"learning_rate": 0.0001285024154589372,
"loss": 27.7563,
"step": 890
},
{
"epoch": 2.154589371980676,
"grad_norm": 7.208229064941406,
"learning_rate": 0.000128341384863124,
"loss": 28.0522,
"step": 892
},
{
"epoch": 2.1594202898550723,
"grad_norm": 7.324676036834717,
"learning_rate": 0.00012818035426731078,
"loss": 28.7856,
"step": 894
},
{
"epoch": 2.1642512077294684,
"grad_norm": 8.06933879852295,
"learning_rate": 0.00012801932367149759,
"loss": 26.2217,
"step": 896
},
{
"epoch": 2.1690821256038646,
"grad_norm": 7.0082902908325195,
"learning_rate": 0.0001278582930756844,
"loss": 28.2109,
"step": 898
},
{
"epoch": 2.1739130434782608,
"grad_norm": 6.494582176208496,
"learning_rate": 0.00012769726247987117,
"loss": 27.6243,
"step": 900
},
{
"epoch": 2.178743961352657,
"grad_norm": 6.218760967254639,
"learning_rate": 0.00012753623188405797,
"loss": 26.7612,
"step": 902
},
{
"epoch": 2.183574879227053,
"grad_norm": 9.239087104797363,
"learning_rate": 0.00012737520128824477,
"loss": 27.9478,
"step": 904
},
{
"epoch": 2.1884057971014492,
"grad_norm": 6.95756196975708,
"learning_rate": 0.00012721417069243158,
"loss": 28.2791,
"step": 906
},
{
"epoch": 2.1932367149758454,
"grad_norm": 7.1247944831848145,
"learning_rate": 0.00012705314009661838,
"loss": 26.1297,
"step": 908
},
{
"epoch": 2.1980676328502415,
"grad_norm": 9.735993385314941,
"learning_rate": 0.00012689210950080516,
"loss": 24.9398,
"step": 910
},
{
"epoch": 2.2028985507246377,
"grad_norm": 10.508362770080566,
"learning_rate": 0.00012673107890499196,
"loss": 26.488,
"step": 912
},
{
"epoch": 2.207729468599034,
"grad_norm": 8.847992897033691,
"learning_rate": 0.00012657004830917876,
"loss": 27.945,
"step": 914
},
{
"epoch": 2.21256038647343,
"grad_norm": 6.917768478393555,
"learning_rate": 0.00012640901771336554,
"loss": 28.1242,
"step": 916
},
{
"epoch": 2.217391304347826,
"grad_norm": 8.339996337890625,
"learning_rate": 0.00012624798711755234,
"loss": 26.9138,
"step": 918
},
{
"epoch": 2.2222222222222223,
"grad_norm": 6.435300827026367,
"learning_rate": 0.00012608695652173915,
"loss": 27.6529,
"step": 920
},
{
"epoch": 2.2270531400966185,
"grad_norm": 7.194887638092041,
"learning_rate": 0.00012592592592592592,
"loss": 27.1809,
"step": 922
},
{
"epoch": 2.2318840579710146,
"grad_norm": 9.154160499572754,
"learning_rate": 0.00012576489533011273,
"loss": 27.0501,
"step": 924
},
{
"epoch": 2.236714975845411,
"grad_norm": 7.581670761108398,
"learning_rate": 0.00012560386473429953,
"loss": 26.1787,
"step": 926
},
{
"epoch": 2.241545893719807,
"grad_norm": 8.077373504638672,
"learning_rate": 0.00012544283413848633,
"loss": 27.04,
"step": 928
},
{
"epoch": 2.246376811594203,
"grad_norm": 7.282364845275879,
"learning_rate": 0.0001252818035426731,
"loss": 27.0844,
"step": 930
},
{
"epoch": 2.2512077294685993,
"grad_norm": 7.848824501037598,
"learning_rate": 0.0001251207729468599,
"loss": 29.1671,
"step": 932
},
{
"epoch": 2.2560386473429954,
"grad_norm": 7.200251579284668,
"learning_rate": 0.00012495974235104672,
"loss": 26.1801,
"step": 934
},
{
"epoch": 2.260869565217391,
"grad_norm": 7.419154167175293,
"learning_rate": 0.0001247987117552335,
"loss": 27.1106,
"step": 936
},
{
"epoch": 2.2657004830917873,
"grad_norm": 8.16390609741211,
"learning_rate": 0.0001246376811594203,
"loss": 25.5017,
"step": 938
},
{
"epoch": 2.2705314009661834,
"grad_norm": 7.58992338180542,
"learning_rate": 0.0001244766505636071,
"loss": 27.0191,
"step": 940
},
{
"epoch": 2.2753623188405796,
"grad_norm": 8.532602310180664,
"learning_rate": 0.00012431561996779388,
"loss": 28.0053,
"step": 942
},
{
"epoch": 2.2801932367149758,
"grad_norm": 7.449092388153076,
"learning_rate": 0.00012415458937198068,
"loss": 25.7749,
"step": 944
},
{
"epoch": 2.285024154589372,
"grad_norm": 7.38059139251709,
"learning_rate": 0.00012399355877616748,
"loss": 28.2566,
"step": 946
},
{
"epoch": 2.289855072463768,
"grad_norm": 6.6862874031066895,
"learning_rate": 0.00012383252818035426,
"loss": 28.8852,
"step": 948
},
{
"epoch": 2.2946859903381642,
"grad_norm": 7.916528701782227,
"learning_rate": 0.00012367149758454106,
"loss": 27.8083,
"step": 950
},
{
"epoch": 2.2995169082125604,
"grad_norm": 6.143187522888184,
"learning_rate": 0.00012351046698872787,
"loss": 26.0691,
"step": 952
},
{
"epoch": 2.3043478260869565,
"grad_norm": 8.420724868774414,
"learning_rate": 0.00012334943639291464,
"loss": 28.0858,
"step": 954
},
{
"epoch": 2.3091787439613527,
"grad_norm": 7.883975505828857,
"learning_rate": 0.00012318840579710145,
"loss": 27.3439,
"step": 956
},
{
"epoch": 2.314009661835749,
"grad_norm": 7.242871284484863,
"learning_rate": 0.00012302737520128825,
"loss": 27.1341,
"step": 958
},
{
"epoch": 2.318840579710145,
"grad_norm": 7.858469009399414,
"learning_rate": 0.00012286634460547503,
"loss": 25.8494,
"step": 960
},
{
"epoch": 2.323671497584541,
"grad_norm": 7.365942478179932,
"learning_rate": 0.00012270531400966183,
"loss": 26.9695,
"step": 962
},
{
"epoch": 2.3285024154589373,
"grad_norm": 6.930251121520996,
"learning_rate": 0.00012254428341384863,
"loss": 25.7841,
"step": 964
},
{
"epoch": 2.3333333333333335,
"grad_norm": 6.728757858276367,
"learning_rate": 0.00012238325281803544,
"loss": 28.053,
"step": 966
},
{
"epoch": 2.3381642512077296,
"grad_norm": 6.711808681488037,
"learning_rate": 0.00012222222222222224,
"loss": 27.7962,
"step": 968
},
{
"epoch": 2.342995169082126,
"grad_norm": 7.4918951988220215,
"learning_rate": 0.00012206119162640903,
"loss": 27.0597,
"step": 970
},
{
"epoch": 2.3478260869565215,
"grad_norm": 8.181355476379395,
"learning_rate": 0.00012190016103059582,
"loss": 28.2665,
"step": 972
},
{
"epoch": 2.3526570048309177,
"grad_norm": 7.762918949127197,
"learning_rate": 0.00012173913043478263,
"loss": 28.546,
"step": 974
},
{
"epoch": 2.357487922705314,
"grad_norm": 7.8778276443481445,
"learning_rate": 0.00012157809983896942,
"loss": 27.1973,
"step": 976
},
{
"epoch": 2.36231884057971,
"grad_norm": 7.002277374267578,
"learning_rate": 0.0001214170692431562,
"loss": 26.2418,
"step": 978
},
{
"epoch": 2.367149758454106,
"grad_norm": 7.298165321350098,
"learning_rate": 0.00012125603864734301,
"loss": 28.2059,
"step": 980
},
{
"epoch": 2.3719806763285023,
"grad_norm": 7.899686336517334,
"learning_rate": 0.0001210950080515298,
"loss": 26.9666,
"step": 982
},
{
"epoch": 2.3768115942028984,
"grad_norm": 7.3516669273376465,
"learning_rate": 0.0001209339774557166,
"loss": 27.9299,
"step": 984
},
{
"epoch": 2.3816425120772946,
"grad_norm": 7.224858283996582,
"learning_rate": 0.00012077294685990339,
"loss": 25.816,
"step": 986
},
{
"epoch": 2.3864734299516908,
"grad_norm": 7.0076494216918945,
"learning_rate": 0.00012061191626409018,
"loss": 24.8251,
"step": 988
},
{
"epoch": 2.391304347826087,
"grad_norm": 6.74472188949585,
"learning_rate": 0.00012045088566827699,
"loss": 27.6622,
"step": 990
},
{
"epoch": 2.396135265700483,
"grad_norm": 6.549550533294678,
"learning_rate": 0.00012028985507246378,
"loss": 28.284,
"step": 992
},
{
"epoch": 2.4009661835748792,
"grad_norm": 6.806623458862305,
"learning_rate": 0.00012012882447665057,
"loss": 26.6694,
"step": 994
},
{
"epoch": 2.4057971014492754,
"grad_norm": 8.050207138061523,
"learning_rate": 0.00011996779388083737,
"loss": 28.3372,
"step": 996
},
{
"epoch": 2.4106280193236715,
"grad_norm": 7.284823417663574,
"learning_rate": 0.00011980676328502416,
"loss": 26.9082,
"step": 998
},
{
"epoch": 2.4154589371980677,
"grad_norm": 7.920591831207275,
"learning_rate": 0.00011964573268921095,
"loss": 29.7462,
"step": 1000
},
{
"epoch": 2.420289855072464,
"grad_norm": 8.616438865661621,
"learning_rate": 0.00011948470209339775,
"loss": 26.7905,
"step": 1002
},
{
"epoch": 2.42512077294686,
"grad_norm": 7.106829643249512,
"learning_rate": 0.00011932367149758454,
"loss": 27.5633,
"step": 1004
},
{
"epoch": 2.429951690821256,
"grad_norm": 8.117084503173828,
"learning_rate": 0.00011916264090177133,
"loss": 26.9659,
"step": 1006
},
{
"epoch": 2.4347826086956523,
"grad_norm": 7.732640743255615,
"learning_rate": 0.00011900161030595814,
"loss": 28.2114,
"step": 1008
},
{
"epoch": 2.4396135265700485,
"grad_norm": 7.36362361907959,
"learning_rate": 0.00011884057971014493,
"loss": 26.3716,
"step": 1010
},
{
"epoch": 2.4444444444444446,
"grad_norm": 8.114975929260254,
"learning_rate": 0.00011867954911433172,
"loss": 28.8353,
"step": 1012
},
{
"epoch": 2.449275362318841,
"grad_norm": 7.141117095947266,
"learning_rate": 0.00011851851851851852,
"loss": 25.7371,
"step": 1014
},
{
"epoch": 2.454106280193237,
"grad_norm": 7.491177558898926,
"learning_rate": 0.00011835748792270531,
"loss": 26.9146,
"step": 1016
},
{
"epoch": 2.4589371980676327,
"grad_norm": 6.710269451141357,
"learning_rate": 0.00011819645732689211,
"loss": 25.5321,
"step": 1018
},
{
"epoch": 2.463768115942029,
"grad_norm": 7.143400192260742,
"learning_rate": 0.0001180354267310789,
"loss": 29.9676,
"step": 1020
},
{
"epoch": 2.468599033816425,
"grad_norm": 8.246957778930664,
"learning_rate": 0.00011787439613526569,
"loss": 29.2592,
"step": 1022
},
{
"epoch": 2.473429951690821,
"grad_norm": 8.44863510131836,
"learning_rate": 0.0001177133655394525,
"loss": 26.0309,
"step": 1024
},
{
"epoch": 2.4782608695652173,
"grad_norm": 7.821875095367432,
"learning_rate": 0.00011755233494363929,
"loss": 26.8746,
"step": 1026
},
{
"epoch": 2.4830917874396135,
"grad_norm": 8.529960632324219,
"learning_rate": 0.0001173913043478261,
"loss": 27.0204,
"step": 1028
},
{
"epoch": 2.4879227053140096,
"grad_norm": 6.8329339027404785,
"learning_rate": 0.0001172302737520129,
"loss": 25.2555,
"step": 1030
},
{
"epoch": 2.4927536231884058,
"grad_norm": 6.804640769958496,
"learning_rate": 0.0001170692431561997,
"loss": 25.6537,
"step": 1032
},
{
"epoch": 2.497584541062802,
"grad_norm": 7.089588642120361,
"learning_rate": 0.00011690821256038649,
"loss": 25.2568,
"step": 1034
},
{
"epoch": 2.502415458937198,
"grad_norm": 11.241130828857422,
"learning_rate": 0.00011674718196457328,
"loss": 27.1132,
"step": 1036
},
{
"epoch": 2.5072463768115942,
"grad_norm": 7.47288703918457,
"learning_rate": 0.00011658615136876008,
"loss": 25.9993,
"step": 1038
},
{
"epoch": 2.5120772946859904,
"grad_norm": 8.372520446777344,
"learning_rate": 0.00011642512077294687,
"loss": 27.6641,
"step": 1040
},
{
"epoch": 2.5169082125603865,
"grad_norm": 8.117879867553711,
"learning_rate": 0.00011626409017713366,
"loss": 26.6226,
"step": 1042
},
{
"epoch": 2.5217391304347827,
"grad_norm": 8.319169044494629,
"learning_rate": 0.00011610305958132046,
"loss": 27.4311,
"step": 1044
},
{
"epoch": 2.526570048309179,
"grad_norm": 7.18233585357666,
"learning_rate": 0.00011594202898550725,
"loss": 27.6304,
"step": 1046
},
{
"epoch": 2.531400966183575,
"grad_norm": 7.594292640686035,
"learning_rate": 0.00011578099838969404,
"loss": 26.9063,
"step": 1048
},
{
"epoch": 2.536231884057971,
"grad_norm": 8.392667770385742,
"learning_rate": 0.00011561996779388085,
"loss": 27.2786,
"step": 1050
},
{
"epoch": 2.541062801932367,
"grad_norm": 6.698591709136963,
"learning_rate": 0.00011545893719806764,
"loss": 25.5416,
"step": 1052
},
{
"epoch": 2.545893719806763,
"grad_norm": 6.185670375823975,
"learning_rate": 0.00011529790660225444,
"loss": 26.9696,
"step": 1054
},
{
"epoch": 2.550724637681159,
"grad_norm": 7.676215648651123,
"learning_rate": 0.00011513687600644123,
"loss": 26.5383,
"step": 1056
},
{
"epoch": 2.5555555555555554,
"grad_norm": 6.880972385406494,
"learning_rate": 0.00011497584541062802,
"loss": 26.3302,
"step": 1058
},
{
"epoch": 2.5603864734299515,
"grad_norm": 8.553890228271484,
"learning_rate": 0.00011481481481481482,
"loss": 26.0391,
"step": 1060
},
{
"epoch": 2.5652173913043477,
"grad_norm": 6.153205394744873,
"learning_rate": 0.00011465378421900161,
"loss": 25.729,
"step": 1062
},
{
"epoch": 2.570048309178744,
"grad_norm": 8.465208053588867,
"learning_rate": 0.0001144927536231884,
"loss": 26.5018,
"step": 1064
},
{
"epoch": 2.57487922705314,
"grad_norm": 8.127817153930664,
"learning_rate": 0.00011433172302737521,
"loss": 26.3506,
"step": 1066
},
{
"epoch": 2.579710144927536,
"grad_norm": 9.615152359008789,
"learning_rate": 0.000114170692431562,
"loss": 25.9415,
"step": 1068
},
{
"epoch": 2.5845410628019323,
"grad_norm": 7.294039249420166,
"learning_rate": 0.00011400966183574879,
"loss": 26.7507,
"step": 1070
},
{
"epoch": 2.5893719806763285,
"grad_norm": 8.261009216308594,
"learning_rate": 0.00011384863123993559,
"loss": 26.7187,
"step": 1072
},
{
"epoch": 2.5942028985507246,
"grad_norm": 6.705962181091309,
"learning_rate": 0.00011368760064412238,
"loss": 26.6202,
"step": 1074
},
{
"epoch": 2.5990338164251208,
"grad_norm": 10.057275772094727,
"learning_rate": 0.00011352657004830917,
"loss": 26.6226,
"step": 1076
},
{
"epoch": 2.603864734299517,
"grad_norm": 8.795845031738281,
"learning_rate": 0.00011336553945249598,
"loss": 28.1032,
"step": 1078
},
{
"epoch": 2.608695652173913,
"grad_norm": 7.4816131591796875,
"learning_rate": 0.00011320450885668277,
"loss": 25.8255,
"step": 1080
},
{
"epoch": 2.6135265700483092,
"grad_norm": 7.060609340667725,
"learning_rate": 0.00011304347826086956,
"loss": 26.9353,
"step": 1082
},
{
"epoch": 2.6183574879227054,
"grad_norm": 7.140244960784912,
"learning_rate": 0.00011288244766505636,
"loss": 27.3619,
"step": 1084
},
{
"epoch": 2.6231884057971016,
"grad_norm": 7.22598934173584,
"learning_rate": 0.00011272141706924315,
"loss": 25.3791,
"step": 1086
},
{
"epoch": 2.6280193236714977,
"grad_norm": 7.098104953765869,
"learning_rate": 0.00011256038647342995,
"loss": 26.0269,
"step": 1088
},
{
"epoch": 2.632850241545894,
"grad_norm": 6.918243408203125,
"learning_rate": 0.00011239935587761677,
"loss": 26.9077,
"step": 1090
},
{
"epoch": 2.63768115942029,
"grad_norm": 7.557582378387451,
"learning_rate": 0.00011223832528180356,
"loss": 26.3413,
"step": 1092
},
{
"epoch": 2.642512077294686,
"grad_norm": 7.406020164489746,
"learning_rate": 0.00011207729468599035,
"loss": 25.9181,
"step": 1094
},
{
"epoch": 2.6473429951690823,
"grad_norm": 7.0549492835998535,
"learning_rate": 0.00011191626409017715,
"loss": 26.8606,
"step": 1096
},
{
"epoch": 2.6521739130434785,
"grad_norm": 6.645535469055176,
"learning_rate": 0.00011175523349436394,
"loss": 27.9375,
"step": 1098
},
{
"epoch": 2.6570048309178746,
"grad_norm": 7.90491247177124,
"learning_rate": 0.00011159420289855073,
"loss": 26.3062,
"step": 1100
},
{
"epoch": 2.661835748792271,
"grad_norm": 6.990922927856445,
"learning_rate": 0.00011143317230273754,
"loss": 28.5585,
"step": 1102
},
{
"epoch": 2.6666666666666665,
"grad_norm": 7.085525989532471,
"learning_rate": 0.00011127214170692433,
"loss": 25.2121,
"step": 1104
},
{
"epoch": 2.6714975845410627,
"grad_norm": 8.292244911193848,
"learning_rate": 0.00011111111111111112,
"loss": 26.5729,
"step": 1106
},
{
"epoch": 2.676328502415459,
"grad_norm": 7.650384426116943,
"learning_rate": 0.00011095008051529792,
"loss": 25.5093,
"step": 1108
},
{
"epoch": 2.681159420289855,
"grad_norm": 9.83218765258789,
"learning_rate": 0.00011078904991948471,
"loss": 25.2708,
"step": 1110
},
{
"epoch": 2.685990338164251,
"grad_norm": 6.258013725280762,
"learning_rate": 0.0001106280193236715,
"loss": 24.9544,
"step": 1112
},
{
"epoch": 2.6908212560386473,
"grad_norm": 7.423259258270264,
"learning_rate": 0.0001104669887278583,
"loss": 27.2744,
"step": 1114
},
{
"epoch": 2.6956521739130435,
"grad_norm": 7.9002814292907715,
"learning_rate": 0.0001103059581320451,
"loss": 26.9861,
"step": 1116
},
{
"epoch": 2.7004830917874396,
"grad_norm": 7.641670227050781,
"learning_rate": 0.00011014492753623188,
"loss": 27.426,
"step": 1118
},
{
"epoch": 2.7053140096618358,
"grad_norm": 7.658080577850342,
"learning_rate": 0.00010998389694041869,
"loss": 27.6252,
"step": 1120
},
{
"epoch": 2.710144927536232,
"grad_norm": 7.938218116760254,
"learning_rate": 0.00010982286634460548,
"loss": 26.1781,
"step": 1122
},
{
"epoch": 2.714975845410628,
"grad_norm": 7.96283483505249,
"learning_rate": 0.00010966183574879228,
"loss": 27.7596,
"step": 1124
},
{
"epoch": 2.7198067632850242,
"grad_norm": 10.215167045593262,
"learning_rate": 0.00010950080515297907,
"loss": 26.9451,
"step": 1126
},
{
"epoch": 2.7246376811594204,
"grad_norm": 7.972415924072266,
"learning_rate": 0.00010933977455716586,
"loss": 27.1329,
"step": 1128
},
{
"epoch": 2.7294685990338166,
"grad_norm": 5.932509899139404,
"learning_rate": 0.00010917874396135266,
"loss": 28.5013,
"step": 1130
},
{
"epoch": 2.7342995169082127,
"grad_norm": 8.786707878112793,
"learning_rate": 0.00010901771336553945,
"loss": 26.5279,
"step": 1132
},
{
"epoch": 2.7391304347826084,
"grad_norm": 6.930019855499268,
"learning_rate": 0.00010885668276972624,
"loss": 27.3484,
"step": 1134
},
{
"epoch": 2.7439613526570046,
"grad_norm": 7.4109015464782715,
"learning_rate": 0.00010869565217391305,
"loss": 26.4129,
"step": 1136
},
{
"epoch": 2.7487922705314007,
"grad_norm": 6.286072731018066,
"learning_rate": 0.00010853462157809984,
"loss": 26.3836,
"step": 1138
},
{
"epoch": 2.753623188405797,
"grad_norm": 8.696404457092285,
"learning_rate": 0.00010837359098228663,
"loss": 25.7786,
"step": 1140
},
{
"epoch": 2.758454106280193,
"grad_norm": 8.277897834777832,
"learning_rate": 0.00010821256038647343,
"loss": 27.2492,
"step": 1142
},
{
"epoch": 2.763285024154589,
"grad_norm": 7.653816223144531,
"learning_rate": 0.00010805152979066022,
"loss": 27.0198,
"step": 1144
},
{
"epoch": 2.7681159420289854,
"grad_norm": 7.8368144035339355,
"learning_rate": 0.00010789049919484701,
"loss": 28.3334,
"step": 1146
},
{
"epoch": 2.7729468599033815,
"grad_norm": 6.9786529541015625,
"learning_rate": 0.00010772946859903381,
"loss": 26.5917,
"step": 1148
},
{
"epoch": 2.7777777777777777,
"grad_norm": 7.004583358764648,
"learning_rate": 0.0001075684380032206,
"loss": 26.4706,
"step": 1150
},
{
"epoch": 2.782608695652174,
"grad_norm": 8.017105102539062,
"learning_rate": 0.00010740740740740742,
"loss": 28.0672,
"step": 1152
},
{
"epoch": 2.78743961352657,
"grad_norm": 6.233907699584961,
"learning_rate": 0.00010724637681159421,
"loss": 27.5043,
"step": 1154
},
{
"epoch": 2.792270531400966,
"grad_norm": 7.529089450836182,
"learning_rate": 0.00010708534621578102,
"loss": 25.2191,
"step": 1156
},
{
"epoch": 2.7971014492753623,
"grad_norm": 7.839463233947754,
"learning_rate": 0.0001069243156199678,
"loss": 25.6082,
"step": 1158
},
{
"epoch": 2.8019323671497585,
"grad_norm": 8.686691284179688,
"learning_rate": 0.00010676328502415461,
"loss": 27.9281,
"step": 1160
},
{
"epoch": 2.8067632850241546,
"grad_norm": 6.9186930656433105,
"learning_rate": 0.0001066022544283414,
"loss": 26.3933,
"step": 1162
},
{
"epoch": 2.8115942028985508,
"grad_norm": 7.170950889587402,
"learning_rate": 0.00010644122383252819,
"loss": 26.5526,
"step": 1164
},
{
"epoch": 2.816425120772947,
"grad_norm": 6.971534729003906,
"learning_rate": 0.00010628019323671499,
"loss": 26.1706,
"step": 1166
},
{
"epoch": 2.821256038647343,
"grad_norm": 7.302921295166016,
"learning_rate": 0.00010611916264090178,
"loss": 28.0723,
"step": 1168
},
{
"epoch": 2.8260869565217392,
"grad_norm": 7.918272495269775,
"learning_rate": 0.00010595813204508857,
"loss": 25.9546,
"step": 1170
},
{
"epoch": 2.8309178743961354,
"grad_norm": 8.934640884399414,
"learning_rate": 0.00010579710144927538,
"loss": 28.0027,
"step": 1172
},
{
"epoch": 2.8357487922705316,
"grad_norm": 9.624857902526855,
"learning_rate": 0.00010563607085346217,
"loss": 27.6072,
"step": 1174
},
{
"epoch": 2.8405797101449277,
"grad_norm": 7.182722091674805,
"learning_rate": 0.00010547504025764896,
"loss": 25.9444,
"step": 1176
},
{
"epoch": 2.845410628019324,
"grad_norm": 8.560644149780273,
"learning_rate": 0.00010531400966183576,
"loss": 24.5426,
"step": 1178
},
{
"epoch": 2.85024154589372,
"grad_norm": 7.0820088386535645,
"learning_rate": 0.00010515297906602255,
"loss": 27.1353,
"step": 1180
},
{
"epoch": 2.855072463768116,
"grad_norm": 7.135811805725098,
"learning_rate": 0.00010499194847020934,
"loss": 25.9438,
"step": 1182
},
{
"epoch": 2.8599033816425123,
"grad_norm": 7.968995571136475,
"learning_rate": 0.00010483091787439614,
"loss": 25.7914,
"step": 1184
},
{
"epoch": 2.864734299516908,
"grad_norm": 7.4556193351745605,
"learning_rate": 0.00010466988727858293,
"loss": 28.4208,
"step": 1186
},
{
"epoch": 2.869565217391304,
"grad_norm": 8.124032974243164,
"learning_rate": 0.00010450885668276972,
"loss": 26.6249,
"step": 1188
},
{
"epoch": 2.8743961352657004,
"grad_norm": 6.682657718658447,
"learning_rate": 0.00010434782608695653,
"loss": 27.7629,
"step": 1190
},
{
"epoch": 2.8792270531400965,
"grad_norm": 7.784018516540527,
"learning_rate": 0.00010418679549114332,
"loss": 26.3142,
"step": 1192
},
{
"epoch": 2.8840579710144927,
"grad_norm": 6.824240207672119,
"learning_rate": 0.00010402576489533012,
"loss": 26.4967,
"step": 1194
},
{
"epoch": 2.888888888888889,
"grad_norm": 6.703210353851318,
"learning_rate": 0.00010386473429951691,
"loss": 27.9698,
"step": 1196
},
{
"epoch": 2.893719806763285,
"grad_norm": 7.0591840744018555,
"learning_rate": 0.0001037037037037037,
"loss": 26.4026,
"step": 1198
},
{
"epoch": 2.898550724637681,
"grad_norm": 6.3246564865112305,
"learning_rate": 0.0001035426731078905,
"loss": 26.839,
"step": 1200
},
{
"epoch": 2.9033816425120773,
"grad_norm": 8.211289405822754,
"learning_rate": 0.00010338164251207729,
"loss": 27.0174,
"step": 1202
},
{
"epoch": 2.9082125603864735,
"grad_norm": 6.735382556915283,
"learning_rate": 0.00010322061191626408,
"loss": 26.3102,
"step": 1204
},
{
"epoch": 2.9130434782608696,
"grad_norm": 8.0295991897583,
"learning_rate": 0.00010305958132045089,
"loss": 25.7761,
"step": 1206
},
{
"epoch": 2.917874396135266,
"grad_norm": 8.097826957702637,
"learning_rate": 0.00010289855072463768,
"loss": 28.9129,
"step": 1208
},
{
"epoch": 2.922705314009662,
"grad_norm": 8.124273300170898,
"learning_rate": 0.00010273752012882447,
"loss": 26.1519,
"step": 1210
},
{
"epoch": 2.927536231884058,
"grad_norm": 8.470534324645996,
"learning_rate": 0.00010257648953301127,
"loss": 25.6004,
"step": 1212
},
{
"epoch": 2.9323671497584543,
"grad_norm": 7.348142147064209,
"learning_rate": 0.00010241545893719809,
"loss": 27.1859,
"step": 1214
},
{
"epoch": 2.9371980676328504,
"grad_norm": 8.258639335632324,
"learning_rate": 0.00010225442834138488,
"loss": 24.1802,
"step": 1216
},
{
"epoch": 2.942028985507246,
"grad_norm": 8.160893440246582,
"learning_rate": 0.00010209339774557167,
"loss": 26.3956,
"step": 1218
},
{
"epoch": 2.9468599033816423,
"grad_norm": 7.1116814613342285,
"learning_rate": 0.00010193236714975847,
"loss": 25.9712,
"step": 1220
},
{
"epoch": 2.9516908212560384,
"grad_norm": 6.059470176696777,
"learning_rate": 0.00010177133655394526,
"loss": 27.1363,
"step": 1222
},
{
"epoch": 2.9565217391304346,
"grad_norm": 7.71455192565918,
"learning_rate": 0.00010161030595813205,
"loss": 26.5133,
"step": 1224
},
{
"epoch": 2.9613526570048307,
"grad_norm": 9.131839752197266,
"learning_rate": 0.00010144927536231885,
"loss": 27.7297,
"step": 1226
},
{
"epoch": 2.966183574879227,
"grad_norm": 6.740046977996826,
"learning_rate": 0.00010128824476650564,
"loss": 25.8968,
"step": 1228
},
{
"epoch": 2.971014492753623,
"grad_norm": 7.255392074584961,
"learning_rate": 0.00010112721417069245,
"loss": 26.343,
"step": 1230
},
{
"epoch": 2.975845410628019,
"grad_norm": 7.241657733917236,
"learning_rate": 0.00010096618357487924,
"loss": 26.2671,
"step": 1232
},
{
"epoch": 2.9806763285024154,
"grad_norm": 8.625435829162598,
"learning_rate": 0.00010080515297906603,
"loss": 26.2536,
"step": 1234
},
{
"epoch": 2.9855072463768115,
"grad_norm": 7.044302940368652,
"learning_rate": 0.00010064412238325283,
"loss": 27.3368,
"step": 1236
},
{
"epoch": 2.9903381642512077,
"grad_norm": 7.077991485595703,
"learning_rate": 0.00010048309178743962,
"loss": 28.0877,
"step": 1238
},
{
"epoch": 2.995169082125604,
"grad_norm": 7.624186992645264,
"learning_rate": 0.00010032206119162641,
"loss": 27.3155,
"step": 1240
},
{
"epoch": 3.0,
"grad_norm": 7.315317630767822,
"learning_rate": 0.00010016103059581321,
"loss": 26.3495,
"step": 1242
},
{
"epoch": 3.004830917874396,
"grad_norm": 6.168877124786377,
"learning_rate": 0.0001,
"loss": 27.1989,
"step": 1244
},
{
"epoch": 3.0096618357487923,
"grad_norm": 7.338534832000732,
"learning_rate": 9.98389694041868e-05,
"loss": 26.6896,
"step": 1246
},
{
"epoch": 3.0144927536231885,
"grad_norm": 7.950836658477783,
"learning_rate": 9.96779388083736e-05,
"loss": 26.1743,
"step": 1248
},
{
"epoch": 3.0193236714975846,
"grad_norm": 7.836818218231201,
"learning_rate": 9.951690821256039e-05,
"loss": 24.6431,
"step": 1250
},
{
"epoch": 3.024154589371981,
"grad_norm": 7.391972064971924,
"learning_rate": 9.935587761674718e-05,
"loss": 26.9987,
"step": 1252
},
{
"epoch": 3.028985507246377,
"grad_norm": 6.927128314971924,
"learning_rate": 9.919484702093398e-05,
"loss": 26.3314,
"step": 1254
},
{
"epoch": 3.033816425120773,
"grad_norm": 6.5931267738342285,
"learning_rate": 9.903381642512077e-05,
"loss": 28.486,
"step": 1256
},
{
"epoch": 3.0386473429951693,
"grad_norm": 6.712624549865723,
"learning_rate": 9.887278582930756e-05,
"loss": 23.3735,
"step": 1258
},
{
"epoch": 3.0434782608695654,
"grad_norm": 7.244742393493652,
"learning_rate": 9.871175523349438e-05,
"loss": 28.1393,
"step": 1260
},
{
"epoch": 3.0483091787439616,
"grad_norm": 7.571489334106445,
"learning_rate": 9.855072463768117e-05,
"loss": 26.1208,
"step": 1262
},
{
"epoch": 3.0531400966183573,
"grad_norm": 7.6882643699646,
"learning_rate": 9.838969404186796e-05,
"loss": 25.3927,
"step": 1264
},
{
"epoch": 3.0579710144927534,
"grad_norm": 7.103066444396973,
"learning_rate": 9.822866344605476e-05,
"loss": 25.6778,
"step": 1266
},
{
"epoch": 3.0628019323671496,
"grad_norm": 7.564841270446777,
"learning_rate": 9.806763285024155e-05,
"loss": 26.3471,
"step": 1268
},
{
"epoch": 3.0676328502415457,
"grad_norm": 7.3738508224487305,
"learning_rate": 9.790660225442834e-05,
"loss": 26.4939,
"step": 1270
},
{
"epoch": 3.072463768115942,
"grad_norm": 8.300433158874512,
"learning_rate": 9.774557165861515e-05,
"loss": 27.4497,
"step": 1272
},
{
"epoch": 3.077294685990338,
"grad_norm": 6.373605251312256,
"learning_rate": 9.758454106280194e-05,
"loss": 27.1139,
"step": 1274
},
{
"epoch": 3.082125603864734,
"grad_norm": 7.21131706237793,
"learning_rate": 9.742351046698873e-05,
"loss": 25.3131,
"step": 1276
},
{
"epoch": 3.0869565217391304,
"grad_norm": 7.3897504806518555,
"learning_rate": 9.726247987117553e-05,
"loss": 24.7751,
"step": 1278
},
{
"epoch": 3.0917874396135265,
"grad_norm": 6.666619777679443,
"learning_rate": 9.710144927536232e-05,
"loss": 25.6616,
"step": 1280
},
{
"epoch": 3.0966183574879227,
"grad_norm": 6.16898250579834,
"learning_rate": 9.694041867954912e-05,
"loss": 23.4636,
"step": 1282
},
{
"epoch": 3.101449275362319,
"grad_norm": 6.940250396728516,
"learning_rate": 9.677938808373591e-05,
"loss": 27.0285,
"step": 1284
},
{
"epoch": 3.106280193236715,
"grad_norm": 8.428845405578613,
"learning_rate": 9.66183574879227e-05,
"loss": 26.6035,
"step": 1286
},
{
"epoch": 3.111111111111111,
"grad_norm": 7.685654640197754,
"learning_rate": 9.64573268921095e-05,
"loss": 27.0342,
"step": 1288
},
{
"epoch": 3.1159420289855073,
"grad_norm": 8.046797752380371,
"learning_rate": 9.62962962962963e-05,
"loss": 26.7352,
"step": 1290
},
{
"epoch": 3.1207729468599035,
"grad_norm": 7.739950180053711,
"learning_rate": 9.61352657004831e-05,
"loss": 27.607,
"step": 1292
},
{
"epoch": 3.1256038647342996,
"grad_norm": 8.301579475402832,
"learning_rate": 9.597423510466989e-05,
"loss": 26.7545,
"step": 1294
},
{
"epoch": 3.130434782608696,
"grad_norm": 7.416752338409424,
"learning_rate": 9.58132045088567e-05,
"loss": 25.5911,
"step": 1296
},
{
"epoch": 3.135265700483092,
"grad_norm": 7.364454746246338,
"learning_rate": 9.565217391304348e-05,
"loss": 25.8314,
"step": 1298
},
{
"epoch": 3.140096618357488,
"grad_norm": 7.930257797241211,
"learning_rate": 9.549114331723029e-05,
"loss": 23.9096,
"step": 1300
},
{
"epoch": 3.1449275362318843,
"grad_norm": 6.694441795349121,
"learning_rate": 9.533011272141708e-05,
"loss": 26.1153,
"step": 1302
},
{
"epoch": 3.14975845410628,
"grad_norm": 6.781352996826172,
"learning_rate": 9.516908212560387e-05,
"loss": 26.7928,
"step": 1304
},
{
"epoch": 3.154589371980676,
"grad_norm": 6.676225662231445,
"learning_rate": 9.500805152979067e-05,
"loss": 27.0461,
"step": 1306
},
{
"epoch": 3.1594202898550723,
"grad_norm": 7.4368767738342285,
"learning_rate": 9.484702093397746e-05,
"loss": 26.7284,
"step": 1308
},
{
"epoch": 3.1642512077294684,
"grad_norm": 7.008518695831299,
"learning_rate": 9.468599033816425e-05,
"loss": 27.4804,
"step": 1310
},
{
"epoch": 3.1690821256038646,
"grad_norm": 7.6441850662231445,
"learning_rate": 9.452495974235105e-05,
"loss": 27.3274,
"step": 1312
},
{
"epoch": 3.1739130434782608,
"grad_norm": 7.242411136627197,
"learning_rate": 9.436392914653784e-05,
"loss": 24.6985,
"step": 1314
},
{
"epoch": 3.178743961352657,
"grad_norm": 6.712805271148682,
"learning_rate": 9.420289855072463e-05,
"loss": 25.8327,
"step": 1316
},
{
"epoch": 3.183574879227053,
"grad_norm": 6.724958419799805,
"learning_rate": 9.404186795491144e-05,
"loss": 26.9393,
"step": 1318
},
{
"epoch": 3.1884057971014492,
"grad_norm": 7.451432228088379,
"learning_rate": 9.388083735909823e-05,
"loss": 25.1356,
"step": 1320
},
{
"epoch": 3.1932367149758454,
"grad_norm": 7.7775421142578125,
"learning_rate": 9.371980676328503e-05,
"loss": 26.6738,
"step": 1322
},
{
"epoch": 3.1980676328502415,
"grad_norm": 7.692890167236328,
"learning_rate": 9.355877616747183e-05,
"loss": 25.1647,
"step": 1324
},
{
"epoch": 3.2028985507246377,
"grad_norm": 7.3927812576293945,
"learning_rate": 9.339774557165862e-05,
"loss": 26.9764,
"step": 1326
},
{
"epoch": 3.207729468599034,
"grad_norm": 7.326320171356201,
"learning_rate": 9.323671497584541e-05,
"loss": 25.9006,
"step": 1328
},
{
"epoch": 3.21256038647343,
"grad_norm": 8.4861421585083,
"learning_rate": 9.307568438003222e-05,
"loss": 26.148,
"step": 1330
},
{
"epoch": 3.217391304347826,
"grad_norm": 8.520912170410156,
"learning_rate": 9.291465378421901e-05,
"loss": 26.3554,
"step": 1332
},
{
"epoch": 3.2222222222222223,
"grad_norm": 7.051355361938477,
"learning_rate": 9.27536231884058e-05,
"loss": 26.3572,
"step": 1334
},
{
"epoch": 3.2270531400966185,
"grad_norm": 9.287524223327637,
"learning_rate": 9.25925925925926e-05,
"loss": 25.9726,
"step": 1336
},
{
"epoch": 3.2318840579710146,
"grad_norm": 7.160129070281982,
"learning_rate": 9.243156199677939e-05,
"loss": 28.7179,
"step": 1338
},
{
"epoch": 3.236714975845411,
"grad_norm": 7.048616886138916,
"learning_rate": 9.227053140096618e-05,
"loss": 25.7061,
"step": 1340
},
{
"epoch": 3.241545893719807,
"grad_norm": 7.782952785491943,
"learning_rate": 9.210950080515299e-05,
"loss": 26.6252,
"step": 1342
},
{
"epoch": 3.246376811594203,
"grad_norm": 8.396957397460938,
"learning_rate": 9.194847020933978e-05,
"loss": 25.4261,
"step": 1344
},
{
"epoch": 3.2512077294685993,
"grad_norm": 7.221895217895508,
"learning_rate": 9.178743961352657e-05,
"loss": 25.6437,
"step": 1346
},
{
"epoch": 3.2560386473429954,
"grad_norm": 7.694455146789551,
"learning_rate": 9.162640901771337e-05,
"loss": 26.2562,
"step": 1348
},
{
"epoch": 3.260869565217391,
"grad_norm": 7.642673492431641,
"learning_rate": 9.146537842190016e-05,
"loss": 25.1317,
"step": 1350
},
{
"epoch": 3.2657004830917873,
"grad_norm": 6.599581241607666,
"learning_rate": 9.130434782608696e-05,
"loss": 25.9692,
"step": 1352
},
{
"epoch": 3.2705314009661834,
"grad_norm": 8.950820922851562,
"learning_rate": 9.114331723027377e-05,
"loss": 27.4472,
"step": 1354
},
{
"epoch": 3.2753623188405796,
"grad_norm": 6.30159854888916,
"learning_rate": 9.098228663446056e-05,
"loss": 25.9316,
"step": 1356
},
{
"epoch": 3.2801932367149758,
"grad_norm": 6.927635192871094,
"learning_rate": 9.082125603864735e-05,
"loss": 27.5305,
"step": 1358
},
{
"epoch": 3.285024154589372,
"grad_norm": 6.424526214599609,
"learning_rate": 9.066022544283415e-05,
"loss": 26.0866,
"step": 1360
},
{
"epoch": 3.289855072463768,
"grad_norm": 8.260842323303223,
"learning_rate": 9.049919484702094e-05,
"loss": 25.4734,
"step": 1362
},
{
"epoch": 3.2946859903381642,
"grad_norm": 6.600332736968994,
"learning_rate": 9.033816425120773e-05,
"loss": 25.1304,
"step": 1364
},
{
"epoch": 3.2995169082125604,
"grad_norm": 6.801137447357178,
"learning_rate": 9.017713365539453e-05,
"loss": 27.4591,
"step": 1366
},
{
"epoch": 3.3043478260869565,
"grad_norm": 7.686280250549316,
"learning_rate": 9.001610305958132e-05,
"loss": 26.2466,
"step": 1368
},
{
"epoch": 3.3091787439613527,
"grad_norm": 6.084709644317627,
"learning_rate": 8.985507246376813e-05,
"loss": 25.2827,
"step": 1370
},
{
"epoch": 3.314009661835749,
"grad_norm": 7.699804306030273,
"learning_rate": 8.969404186795492e-05,
"loss": 28.068,
"step": 1372
},
{
"epoch": 3.318840579710145,
"grad_norm": 8.359792709350586,
"learning_rate": 8.95330112721417e-05,
"loss": 28.9643,
"step": 1374
},
{
"epoch": 3.323671497584541,
"grad_norm": 7.701099872589111,
"learning_rate": 8.937198067632851e-05,
"loss": 26.1439,
"step": 1376
},
{
"epoch": 3.3285024154589373,
"grad_norm": 8.339729309082031,
"learning_rate": 8.92109500805153e-05,
"loss": 26.0983,
"step": 1378
},
{
"epoch": 3.3333333333333335,
"grad_norm": 8.924784660339355,
"learning_rate": 8.904991948470209e-05,
"loss": 25.8818,
"step": 1380
},
{
"epoch": 3.3381642512077296,
"grad_norm": 8.396602630615234,
"learning_rate": 8.888888888888889e-05,
"loss": 27.7536,
"step": 1382
},
{
"epoch": 3.342995169082126,
"grad_norm": 8.177582740783691,
"learning_rate": 8.87278582930757e-05,
"loss": 25.7908,
"step": 1384
},
{
"epoch": 3.3478260869565215,
"grad_norm": 6.711874008178711,
"learning_rate": 8.856682769726249e-05,
"loss": 27.9945,
"step": 1386
},
{
"epoch": 3.3526570048309177,
"grad_norm": 6.735175132751465,
"learning_rate": 8.840579710144929e-05,
"loss": 27.7595,
"step": 1388
},
{
"epoch": 3.357487922705314,
"grad_norm": 8.890625,
"learning_rate": 8.824476650563608e-05,
"loss": 25.7886,
"step": 1390
},
{
"epoch": 3.36231884057971,
"grad_norm": 7.918723106384277,
"learning_rate": 8.808373590982287e-05,
"loss": 27.3296,
"step": 1392
},
{
"epoch": 3.367149758454106,
"grad_norm": 8.405486106872559,
"learning_rate": 8.792270531400967e-05,
"loss": 24.8263,
"step": 1394
},
{
"epoch": 3.3719806763285023,
"grad_norm": 7.2000837326049805,
"learning_rate": 8.776167471819646e-05,
"loss": 27.6412,
"step": 1396
},
{
"epoch": 3.3768115942028984,
"grad_norm": 9.657790184020996,
"learning_rate": 8.760064412238325e-05,
"loss": 24.8264,
"step": 1398
},
{
"epoch": 3.3816425120772946,
"grad_norm": 7.06240177154541,
"learning_rate": 8.743961352657006e-05,
"loss": 26.162,
"step": 1400
},
{
"epoch": 3.3864734299516908,
"grad_norm": 7.3674116134643555,
"learning_rate": 8.727858293075685e-05,
"loss": 27.8042,
"step": 1402
},
{
"epoch": 3.391304347826087,
"grad_norm": 7.9507737159729,
"learning_rate": 8.711755233494364e-05,
"loss": 26.6252,
"step": 1404
},
{
"epoch": 3.396135265700483,
"grad_norm": 8.195547103881836,
"learning_rate": 8.695652173913044e-05,
"loss": 26.886,
"step": 1406
},
{
"epoch": 3.4009661835748792,
"grad_norm": 7.462141513824463,
"learning_rate": 8.679549114331723e-05,
"loss": 27.8522,
"step": 1408
},
{
"epoch": 3.4057971014492754,
"grad_norm": 7.903439521789551,
"learning_rate": 8.663446054750402e-05,
"loss": 26.1915,
"step": 1410
},
{
"epoch": 3.4106280193236715,
"grad_norm": 7.791518211364746,
"learning_rate": 8.647342995169082e-05,
"loss": 27.6484,
"step": 1412
},
{
"epoch": 3.4154589371980677,
"grad_norm": 7.624407768249512,
"learning_rate": 8.631239935587761e-05,
"loss": 28.3851,
"step": 1414
},
{
"epoch": 3.420289855072464,
"grad_norm": 7.524753570556641,
"learning_rate": 8.615136876006442e-05,
"loss": 25.3125,
"step": 1416
},
{
"epoch": 3.42512077294686,
"grad_norm": 8.102710723876953,
"learning_rate": 8.599033816425122e-05,
"loss": 24.9681,
"step": 1418
},
{
"epoch": 3.429951690821256,
"grad_norm": 6.52889347076416,
"learning_rate": 8.582930756843801e-05,
"loss": 27.6317,
"step": 1420
},
{
"epoch": 3.4347826086956523,
"grad_norm": 8.491759300231934,
"learning_rate": 8.56682769726248e-05,
"loss": 26.7627,
"step": 1422
},
{
"epoch": 3.4396135265700485,
"grad_norm": 8.082484245300293,
"learning_rate": 8.55072463768116e-05,
"loss": 25.5842,
"step": 1424
},
{
"epoch": 3.4444444444444446,
"grad_norm": 8.158738136291504,
"learning_rate": 8.53462157809984e-05,
"loss": 27.7775,
"step": 1426
},
{
"epoch": 3.449275362318841,
"grad_norm": 6.948888778686523,
"learning_rate": 8.518518518518518e-05,
"loss": 24.269,
"step": 1428
},
{
"epoch": 3.454106280193237,
"grad_norm": 7.217655181884766,
"learning_rate": 8.502415458937199e-05,
"loss": 27.2054,
"step": 1430
},
{
"epoch": 3.4589371980676327,
"grad_norm": 7.5419440269470215,
"learning_rate": 8.486312399355878e-05,
"loss": 25.9815,
"step": 1432
},
{
"epoch": 3.463768115942029,
"grad_norm": 7.58052921295166,
"learning_rate": 8.470209339774557e-05,
"loss": 27.4913,
"step": 1434
},
{
"epoch": 3.468599033816425,
"grad_norm": 7.221286296844482,
"learning_rate": 8.454106280193237e-05,
"loss": 26.7118,
"step": 1436
},
{
"epoch": 3.473429951690821,
"grad_norm": 7.131877899169922,
"learning_rate": 8.438003220611916e-05,
"loss": 27.0661,
"step": 1438
},
{
"epoch": 3.4782608695652173,
"grad_norm": 6.600888729095459,
"learning_rate": 8.421900161030597e-05,
"loss": 24.237,
"step": 1440
},
{
"epoch": 3.4830917874396135,
"grad_norm": 7.91683292388916,
"learning_rate": 8.405797101449276e-05,
"loss": 24.8586,
"step": 1442
},
{
"epoch": 3.4879227053140096,
"grad_norm": 6.824517250061035,
"learning_rate": 8.389694041867955e-05,
"loss": 26.9819,
"step": 1444
},
{
"epoch": 3.4927536231884058,
"grad_norm": 6.753680229187012,
"learning_rate": 8.373590982286635e-05,
"loss": 26.0397,
"step": 1446
},
{
"epoch": 3.497584541062802,
"grad_norm": 7.486673831939697,
"learning_rate": 8.357487922705315e-05,
"loss": 26.7425,
"step": 1448
},
{
"epoch": 3.502415458937198,
"grad_norm": 8.475358009338379,
"learning_rate": 8.341384863123994e-05,
"loss": 26.1292,
"step": 1450
},
{
"epoch": 3.5072463768115942,
"grad_norm": 6.859409332275391,
"learning_rate": 8.325281803542673e-05,
"loss": 26.4357,
"step": 1452
},
{
"epoch": 3.5120772946859904,
"grad_norm": 7.169741630554199,
"learning_rate": 8.309178743961354e-05,
"loss": 27.2822,
"step": 1454
},
{
"epoch": 3.5169082125603865,
"grad_norm": 8.31079387664795,
"learning_rate": 8.293075684380033e-05,
"loss": 27.7764,
"step": 1456
},
{
"epoch": 3.5217391304347827,
"grad_norm": 6.888429164886475,
"learning_rate": 8.276972624798713e-05,
"loss": 26.5406,
"step": 1458
},
{
"epoch": 3.526570048309179,
"grad_norm": 7.568389892578125,
"learning_rate": 8.260869565217392e-05,
"loss": 25.9647,
"step": 1460
},
{
"epoch": 3.531400966183575,
"grad_norm": 6.64613151550293,
"learning_rate": 8.244766505636071e-05,
"loss": 26.9271,
"step": 1462
},
{
"epoch": 3.536231884057971,
"grad_norm": 6.534989833831787,
"learning_rate": 8.228663446054751e-05,
"loss": 26.2684,
"step": 1464
},
{
"epoch": 3.541062801932367,
"grad_norm": 7.926050662994385,
"learning_rate": 8.21256038647343e-05,
"loss": 26.845,
"step": 1466
},
{
"epoch": 3.545893719806763,
"grad_norm": 7.452934741973877,
"learning_rate": 8.196457326892109e-05,
"loss": 25.8417,
"step": 1468
},
{
"epoch": 3.550724637681159,
"grad_norm": 7.26784086227417,
"learning_rate": 8.18035426731079e-05,
"loss": 26.0035,
"step": 1470
},
{
"epoch": 3.5555555555555554,
"grad_norm": 7.318904399871826,
"learning_rate": 8.164251207729469e-05,
"loss": 27.4574,
"step": 1472
},
{
"epoch": 3.5603864734299515,
"grad_norm": 6.999464511871338,
"learning_rate": 8.148148148148148e-05,
"loss": 25.949,
"step": 1474
},
{
"epoch": 3.5652173913043477,
"grad_norm": 7.244204044342041,
"learning_rate": 8.132045088566828e-05,
"loss": 26.636,
"step": 1476
},
{
"epoch": 3.570048309178744,
"grad_norm": 8.60554027557373,
"learning_rate": 8.115942028985508e-05,
"loss": 27.4116,
"step": 1478
},
{
"epoch": 3.57487922705314,
"grad_norm": 6.701752662658691,
"learning_rate": 8.099838969404187e-05,
"loss": 25.0194,
"step": 1480
},
{
"epoch": 3.579710144927536,
"grad_norm": 6.613931655883789,
"learning_rate": 8.083735909822868e-05,
"loss": 25.7569,
"step": 1482
},
{
"epoch": 3.5845410628019323,
"grad_norm": 7.828546524047852,
"learning_rate": 8.067632850241547e-05,
"loss": 27.1483,
"step": 1484
},
{
"epoch": 3.5893719806763285,
"grad_norm": 7.983916282653809,
"learning_rate": 8.051529790660226e-05,
"loss": 24.6017,
"step": 1486
},
{
"epoch": 3.5942028985507246,
"grad_norm": 8.500826835632324,
"learning_rate": 8.035426731078906e-05,
"loss": 26.6575,
"step": 1488
},
{
"epoch": 3.5990338164251208,
"grad_norm": 8.88049030303955,
"learning_rate": 8.019323671497585e-05,
"loss": 23.7421,
"step": 1490
},
{
"epoch": 3.603864734299517,
"grad_norm": 7.034642696380615,
"learning_rate": 8.003220611916264e-05,
"loss": 27.8291,
"step": 1492
},
{
"epoch": 3.608695652173913,
"grad_norm": 7.1023077964782715,
"learning_rate": 7.987117552334944e-05,
"loss": 26.7066,
"step": 1494
},
{
"epoch": 3.6135265700483092,
"grad_norm": 8.332448959350586,
"learning_rate": 7.971014492753623e-05,
"loss": 25.7769,
"step": 1496
},
{
"epoch": 3.6183574879227054,
"grad_norm": 7.105356693267822,
"learning_rate": 7.954911433172302e-05,
"loss": 25.7133,
"step": 1498
},
{
"epoch": 3.6231884057971016,
"grad_norm": 7.028257369995117,
"learning_rate": 7.938808373590983e-05,
"loss": 25.0051,
"step": 1500
},
{
"epoch": 3.6280193236714977,
"grad_norm": 7.71824312210083,
"learning_rate": 7.922705314009662e-05,
"loss": 25.9737,
"step": 1502
},
{
"epoch": 3.632850241545894,
"grad_norm": 5.497483253479004,
"learning_rate": 7.906602254428341e-05,
"loss": 27.4592,
"step": 1504
},
{
"epoch": 3.63768115942029,
"grad_norm": 8.458606719970703,
"learning_rate": 7.890499194847021e-05,
"loss": 24.0378,
"step": 1506
},
{
"epoch": 3.642512077294686,
"grad_norm": 8.406185150146484,
"learning_rate": 7.874396135265701e-05,
"loss": 26.3229,
"step": 1508
},
{
"epoch": 3.6473429951690823,
"grad_norm": 7.685035228729248,
"learning_rate": 7.85829307568438e-05,
"loss": 25.97,
"step": 1510
},
{
"epoch": 3.6521739130434785,
"grad_norm": 8.686131477355957,
"learning_rate": 7.842190016103061e-05,
"loss": 26.591,
"step": 1512
},
{
"epoch": 3.6570048309178746,
"grad_norm": 6.984585285186768,
"learning_rate": 7.82608695652174e-05,
"loss": 25.8358,
"step": 1514
},
{
"epoch": 3.661835748792271,
"grad_norm": 5.834330081939697,
"learning_rate": 7.809983896940419e-05,
"loss": 26.1456,
"step": 1516
},
{
"epoch": 3.6666666666666665,
"grad_norm": 6.367677688598633,
"learning_rate": 7.793880837359099e-05,
"loss": 26.5751,
"step": 1518
},
{
"epoch": 3.6714975845410627,
"grad_norm": 6.723855018615723,
"learning_rate": 7.777777777777778e-05,
"loss": 26.5181,
"step": 1520
},
{
"epoch": 3.676328502415459,
"grad_norm": 6.305589199066162,
"learning_rate": 7.761674718196457e-05,
"loss": 25.6111,
"step": 1522
},
{
"epoch": 3.681159420289855,
"grad_norm": 6.444118976593018,
"learning_rate": 7.745571658615138e-05,
"loss": 25.0445,
"step": 1524
},
{
"epoch": 3.685990338164251,
"grad_norm": 7.176147937774658,
"learning_rate": 7.729468599033817e-05,
"loss": 25.2998,
"step": 1526
},
{
"epoch": 3.6908212560386473,
"grad_norm": 8.422863006591797,
"learning_rate": 7.713365539452497e-05,
"loss": 27.3324,
"step": 1528
},
{
"epoch": 3.6956521739130435,
"grad_norm": 6.9630913734436035,
"learning_rate": 7.697262479871176e-05,
"loss": 25.7144,
"step": 1530
},
{
"epoch": 3.7004830917874396,
"grad_norm": 5.846348285675049,
"learning_rate": 7.681159420289855e-05,
"loss": 27.7589,
"step": 1532
},
{
"epoch": 3.7053140096618358,
"grad_norm": 7.343765735626221,
"learning_rate": 7.665056360708535e-05,
"loss": 25.8322,
"step": 1534
},
{
"epoch": 3.710144927536232,
"grad_norm": 6.997490882873535,
"learning_rate": 7.648953301127214e-05,
"loss": 28.2401,
"step": 1536
},
{
"epoch": 3.714975845410628,
"grad_norm": 6.830377101898193,
"learning_rate": 7.632850241545893e-05,
"loss": 25.1853,
"step": 1538
},
{
"epoch": 3.7198067632850242,
"grad_norm": 7.353569030761719,
"learning_rate": 7.616747181964574e-05,
"loss": 27.8896,
"step": 1540
},
{
"epoch": 3.7246376811594204,
"grad_norm": 6.923029899597168,
"learning_rate": 7.600644122383254e-05,
"loss": 27.466,
"step": 1542
},
{
"epoch": 3.7294685990338166,
"grad_norm": 7.982540607452393,
"learning_rate": 7.584541062801933e-05,
"loss": 26.5827,
"step": 1544
},
{
"epoch": 3.7342995169082127,
"grad_norm": 7.8132758140563965,
"learning_rate": 7.568438003220612e-05,
"loss": 25.5102,
"step": 1546
},
{
"epoch": 3.7391304347826084,
"grad_norm": 6.979062557220459,
"learning_rate": 7.552334943639292e-05,
"loss": 26.6007,
"step": 1548
},
{
"epoch": 3.7439613526570046,
"grad_norm": 6.988529682159424,
"learning_rate": 7.536231884057971e-05,
"loss": 27.2199,
"step": 1550
},
{
"epoch": 3.7487922705314007,
"grad_norm": 6.884960174560547,
"learning_rate": 7.520128824476652e-05,
"loss": 28.3951,
"step": 1552
},
{
"epoch": 3.753623188405797,
"grad_norm": 7.593159198760986,
"learning_rate": 7.50402576489533e-05,
"loss": 26.4899,
"step": 1554
},
{
"epoch": 3.758454106280193,
"grad_norm": 7.603058815002441,
"learning_rate": 7.48792270531401e-05,
"loss": 25.2797,
"step": 1556
},
{
"epoch": 3.763285024154589,
"grad_norm": 8.542155265808105,
"learning_rate": 7.47181964573269e-05,
"loss": 25.7991,
"step": 1558
},
{
"epoch": 3.7681159420289854,
"grad_norm": 7.652464389801025,
"learning_rate": 7.455716586151369e-05,
"loss": 26.4845,
"step": 1560
},
{
"epoch": 3.7729468599033815,
"grad_norm": 8.047564506530762,
"learning_rate": 7.439613526570048e-05,
"loss": 26.0127,
"step": 1562
},
{
"epoch": 3.7777777777777777,
"grad_norm": 6.38883113861084,
"learning_rate": 7.423510466988728e-05,
"loss": 27.0549,
"step": 1564
},
{
"epoch": 3.782608695652174,
"grad_norm": 6.353972434997559,
"learning_rate": 7.407407407407407e-05,
"loss": 25.2923,
"step": 1566
},
{
"epoch": 3.78743961352657,
"grad_norm": 6.962271690368652,
"learning_rate": 7.391304347826086e-05,
"loss": 27.8826,
"step": 1568
},
{
"epoch": 3.792270531400966,
"grad_norm": 6.521156311035156,
"learning_rate": 7.375201288244767e-05,
"loss": 28.1107,
"step": 1570
},
{
"epoch": 3.7971014492753623,
"grad_norm": 8.195451736450195,
"learning_rate": 7.359098228663447e-05,
"loss": 26.5253,
"step": 1572
},
{
"epoch": 3.8019323671497585,
"grad_norm": 6.806168556213379,
"learning_rate": 7.342995169082126e-05,
"loss": 27.1728,
"step": 1574
},
{
"epoch": 3.8067632850241546,
"grad_norm": 7.229825973510742,
"learning_rate": 7.326892109500806e-05,
"loss": 25.333,
"step": 1576
},
{
"epoch": 3.8115942028985508,
"grad_norm": 6.635615825653076,
"learning_rate": 7.310789049919485e-05,
"loss": 27.0733,
"step": 1578
},
{
"epoch": 3.816425120772947,
"grad_norm": 6.50180721282959,
"learning_rate": 7.294685990338164e-05,
"loss": 27.4529,
"step": 1580
},
{
"epoch": 3.821256038647343,
"grad_norm": 7.335048675537109,
"learning_rate": 7.278582930756845e-05,
"loss": 25.6855,
"step": 1582
},
{
"epoch": 3.8260869565217392,
"grad_norm": 6.961329460144043,
"learning_rate": 7.262479871175524e-05,
"loss": 26.0103,
"step": 1584
},
{
"epoch": 3.8309178743961354,
"grad_norm": 6.842545986175537,
"learning_rate": 7.246376811594203e-05,
"loss": 27.9322,
"step": 1586
},
{
"epoch": 3.8357487922705316,
"grad_norm": 6.83944845199585,
"learning_rate": 7.230273752012883e-05,
"loss": 27.5723,
"step": 1588
},
{
"epoch": 3.8405797101449277,
"grad_norm": 7.0610127449035645,
"learning_rate": 7.214170692431562e-05,
"loss": 26.011,
"step": 1590
},
{
"epoch": 3.845410628019324,
"grad_norm": 7.726437568664551,
"learning_rate": 7.198067632850241e-05,
"loss": 26.6807,
"step": 1592
},
{
"epoch": 3.85024154589372,
"grad_norm": 9.280223846435547,
"learning_rate": 7.181964573268921e-05,
"loss": 26.9886,
"step": 1594
},
{
"epoch": 3.855072463768116,
"grad_norm": 6.995485782623291,
"learning_rate": 7.1658615136876e-05,
"loss": 27.2315,
"step": 1596
},
{
"epoch": 3.8599033816425123,
"grad_norm": 7.200146198272705,
"learning_rate": 7.14975845410628e-05,
"loss": 25.7971,
"step": 1598
},
{
"epoch": 3.864734299516908,
"grad_norm": 7.404515743255615,
"learning_rate": 7.13365539452496e-05,
"loss": 25.3168,
"step": 1600
},
{
"epoch": 3.869565217391304,
"grad_norm": 7.142045497894287,
"learning_rate": 7.11755233494364e-05,
"loss": 24.6409,
"step": 1602
},
{
"epoch": 3.8743961352657004,
"grad_norm": 7.02120304107666,
"learning_rate": 7.101449275362319e-05,
"loss": 27.3518,
"step": 1604
},
{
"epoch": 3.8792270531400965,
"grad_norm": 7.604321002960205,
"learning_rate": 7.085346215781e-05,
"loss": 26.8926,
"step": 1606
},
{
"epoch": 3.8840579710144927,
"grad_norm": 7.089973449707031,
"learning_rate": 7.069243156199678e-05,
"loss": 24.8074,
"step": 1608
},
{
"epoch": 3.888888888888889,
"grad_norm": 8.049272537231445,
"learning_rate": 7.053140096618357e-05,
"loss": 25.8524,
"step": 1610
},
{
"epoch": 3.893719806763285,
"grad_norm": 7.1630144119262695,
"learning_rate": 7.037037037037038e-05,
"loss": 26.9071,
"step": 1612
},
{
"epoch": 3.898550724637681,
"grad_norm": 6.2005510330200195,
"learning_rate": 7.020933977455717e-05,
"loss": 27.107,
"step": 1614
},
{
"epoch": 3.9033816425120773,
"grad_norm": 8.320915222167969,
"learning_rate": 7.004830917874396e-05,
"loss": 27.0582,
"step": 1616
},
{
"epoch": 3.9082125603864735,
"grad_norm": 6.952855110168457,
"learning_rate": 6.988727858293076e-05,
"loss": 26.4762,
"step": 1618
},
{
"epoch": 3.9130434782608696,
"grad_norm": 7.927274227142334,
"learning_rate": 6.972624798711755e-05,
"loss": 24.2706,
"step": 1620
},
{
"epoch": 3.917874396135266,
"grad_norm": 7.922103404998779,
"learning_rate": 6.956521739130436e-05,
"loss": 25.8205,
"step": 1622
},
{
"epoch": 3.922705314009662,
"grad_norm": 7.824489116668701,
"learning_rate": 6.940418679549115e-05,
"loss": 26.4827,
"step": 1624
},
{
"epoch": 3.927536231884058,
"grad_norm": 6.419587135314941,
"learning_rate": 6.924315619967794e-05,
"loss": 28.2682,
"step": 1626
},
{
"epoch": 3.9323671497584543,
"grad_norm": 8.104780197143555,
"learning_rate": 6.908212560386474e-05,
"loss": 25.7621,
"step": 1628
},
{
"epoch": 3.9371980676328504,
"grad_norm": 7.307147979736328,
"learning_rate": 6.892109500805153e-05,
"loss": 27.1352,
"step": 1630
},
{
"epoch": 3.942028985507246,
"grad_norm": 7.069173812866211,
"learning_rate": 6.876006441223832e-05,
"loss": 25.538,
"step": 1632
},
{
"epoch": 3.9468599033816423,
"grad_norm": 7.971487522125244,
"learning_rate": 6.859903381642512e-05,
"loss": 26.754,
"step": 1634
},
{
"epoch": 3.9516908212560384,
"grad_norm": 7.200797080993652,
"learning_rate": 6.843800322061193e-05,
"loss": 25.5438,
"step": 1636
},
{
"epoch": 3.9565217391304346,
"grad_norm": 8.00469970703125,
"learning_rate": 6.827697262479872e-05,
"loss": 26.6568,
"step": 1638
},
{
"epoch": 3.9613526570048307,
"grad_norm": 6.9250359535217285,
"learning_rate": 6.811594202898552e-05,
"loss": 25.4743,
"step": 1640
},
{
"epoch": 3.966183574879227,
"grad_norm": 6.473790168762207,
"learning_rate": 6.795491143317231e-05,
"loss": 25.9443,
"step": 1642
},
{
"epoch": 3.971014492753623,
"grad_norm": 8.05759048461914,
"learning_rate": 6.77938808373591e-05,
"loss": 25.5339,
"step": 1644
},
{
"epoch": 3.975845410628019,
"grad_norm": 7.342809200286865,
"learning_rate": 6.76328502415459e-05,
"loss": 24.9969,
"step": 1646
},
{
"epoch": 3.9806763285024154,
"grad_norm": 7.265125274658203,
"learning_rate": 6.747181964573269e-05,
"loss": 27.3797,
"step": 1648
},
{
"epoch": 3.9855072463768115,
"grad_norm": 7.021026134490967,
"learning_rate": 6.731078904991948e-05,
"loss": 26.1091,
"step": 1650
},
{
"epoch": 3.9903381642512077,
"grad_norm": 7.2072529792785645,
"learning_rate": 6.714975845410629e-05,
"loss": 27.5467,
"step": 1652
},
{
"epoch": 3.995169082125604,
"grad_norm": 7.393160820007324,
"learning_rate": 6.698872785829308e-05,
"loss": 26.9415,
"step": 1654
},
{
"epoch": 4.0,
"grad_norm": 7.511723518371582,
"learning_rate": 6.682769726247987e-05,
"loss": 26.0607,
"step": 1656
},
{
"epoch": 4.004830917874396,
"grad_norm": 8.766012191772461,
"learning_rate": 6.666666666666667e-05,
"loss": 27.5218,
"step": 1658
},
{
"epoch": 4.009661835748792,
"grad_norm": 6.866961479187012,
"learning_rate": 6.650563607085346e-05,
"loss": 26.818,
"step": 1660
},
{
"epoch": 4.0144927536231885,
"grad_norm": 7.680884838104248,
"learning_rate": 6.634460547504025e-05,
"loss": 26.8022,
"step": 1662
},
{
"epoch": 4.019323671497585,
"grad_norm": 7.392796039581299,
"learning_rate": 6.618357487922707e-05,
"loss": 26.8426,
"step": 1664
},
{
"epoch": 4.024154589371981,
"grad_norm": 7.595928192138672,
"learning_rate": 6.602254428341386e-05,
"loss": 24.3986,
"step": 1666
},
{
"epoch": 4.028985507246377,
"grad_norm": 7.379922866821289,
"learning_rate": 6.586151368760065e-05,
"loss": 28.0114,
"step": 1668
},
{
"epoch": 4.033816425120773,
"grad_norm": 7.208115100860596,
"learning_rate": 6.570048309178745e-05,
"loss": 27.5758,
"step": 1670
},
{
"epoch": 4.038647342995169,
"grad_norm": 7.357963562011719,
"learning_rate": 6.553945249597424e-05,
"loss": 24.8787,
"step": 1672
},
{
"epoch": 4.043478260869565,
"grad_norm": 7.291189670562744,
"learning_rate": 6.537842190016103e-05,
"loss": 26.2749,
"step": 1674
},
{
"epoch": 4.048309178743962,
"grad_norm": 7.44353723526001,
"learning_rate": 6.521739130434783e-05,
"loss": 25.121,
"step": 1676
},
{
"epoch": 4.053140096618358,
"grad_norm": 6.338862419128418,
"learning_rate": 6.505636070853462e-05,
"loss": 25.7155,
"step": 1678
},
{
"epoch": 4.057971014492754,
"grad_norm": 6.6159162521362305,
"learning_rate": 6.489533011272141e-05,
"loss": 24.0727,
"step": 1680
},
{
"epoch": 4.06280193236715,
"grad_norm": 6.825524806976318,
"learning_rate": 6.473429951690822e-05,
"loss": 24.1144,
"step": 1682
},
{
"epoch": 4.067632850241546,
"grad_norm": 6.563850402832031,
"learning_rate": 6.457326892109501e-05,
"loss": 25.211,
"step": 1684
},
{
"epoch": 4.072463768115942,
"grad_norm": 6.340920925140381,
"learning_rate": 6.44122383252818e-05,
"loss": 25.9026,
"step": 1686
},
{
"epoch": 4.0772946859903385,
"grad_norm": 6.728626251220703,
"learning_rate": 6.42512077294686e-05,
"loss": 27.3648,
"step": 1688
},
{
"epoch": 4.082125603864735,
"grad_norm": 6.788083553314209,
"learning_rate": 6.409017713365539e-05,
"loss": 26.6329,
"step": 1690
},
{
"epoch": 4.086956521739131,
"grad_norm": 7.323519706726074,
"learning_rate": 6.39291465378422e-05,
"loss": 25.6293,
"step": 1692
},
{
"epoch": 4.091787439613527,
"grad_norm": 6.454324245452881,
"learning_rate": 6.376811594202898e-05,
"loss": 26.1033,
"step": 1694
},
{
"epoch": 4.096618357487923,
"grad_norm": 6.53643798828125,
"learning_rate": 6.360708534621579e-05,
"loss": 28.4655,
"step": 1696
},
{
"epoch": 4.101449275362318,
"grad_norm": 8.033370971679688,
"learning_rate": 6.344605475040258e-05,
"loss": 26.4287,
"step": 1698
},
{
"epoch": 4.106280193236715,
"grad_norm": 6.196560382843018,
"learning_rate": 6.328502415458938e-05,
"loss": 26.5747,
"step": 1700
},
{
"epoch": 4.111111111111111,
"grad_norm": 6.994458198547363,
"learning_rate": 6.312399355877617e-05,
"loss": 25.3307,
"step": 1702
},
{
"epoch": 4.115942028985507,
"grad_norm": 7.29825496673584,
"learning_rate": 6.296296296296296e-05,
"loss": 25.2931,
"step": 1704
},
{
"epoch": 4.120772946859903,
"grad_norm": 7.366706371307373,
"learning_rate": 6.280193236714976e-05,
"loss": 25.1327,
"step": 1706
},
{
"epoch": 4.125603864734299,
"grad_norm": 7.066011428833008,
"learning_rate": 6.264090177133655e-05,
"loss": 27.8359,
"step": 1708
},
{
"epoch": 4.130434782608695,
"grad_norm": 7.165285587310791,
"learning_rate": 6.247987117552336e-05,
"loss": 26.1166,
"step": 1710
},
{
"epoch": 4.1352657004830915,
"grad_norm": 6.823864936828613,
"learning_rate": 6.231884057971015e-05,
"loss": 27.5943,
"step": 1712
},
{
"epoch": 4.140096618357488,
"grad_norm": 7.767164707183838,
"learning_rate": 6.215780998389694e-05,
"loss": 24.9854,
"step": 1714
},
{
"epoch": 4.144927536231884,
"grad_norm": 6.458461284637451,
"learning_rate": 6.199677938808374e-05,
"loss": 26.8271,
"step": 1716
},
{
"epoch": 4.14975845410628,
"grad_norm": 7.082225322723389,
"learning_rate": 6.183574879227053e-05,
"loss": 25.678,
"step": 1718
},
{
"epoch": 4.154589371980676,
"grad_norm": 7.867661476135254,
"learning_rate": 6.167471819645732e-05,
"loss": 26.7575,
"step": 1720
},
{
"epoch": 4.159420289855072,
"grad_norm": 7.803908824920654,
"learning_rate": 6.151368760064413e-05,
"loss": 27.785,
"step": 1722
},
{
"epoch": 4.164251207729468,
"grad_norm": 7.704416751861572,
"learning_rate": 6.135265700483092e-05,
"loss": 26.4086,
"step": 1724
},
{
"epoch": 4.169082125603865,
"grad_norm": 7.166048049926758,
"learning_rate": 6.119162640901772e-05,
"loss": 25.6944,
"step": 1726
},
{
"epoch": 4.173913043478261,
"grad_norm": 7.665358066558838,
"learning_rate": 6.1030595813204516e-05,
"loss": 25.7421,
"step": 1728
},
{
"epoch": 4.178743961352657,
"grad_norm": 6.582197666168213,
"learning_rate": 6.086956521739131e-05,
"loss": 24.2085,
"step": 1730
},
{
"epoch": 4.183574879227053,
"grad_norm": 6.641133785247803,
"learning_rate": 6.07085346215781e-05,
"loss": 26.5714,
"step": 1732
},
{
"epoch": 4.188405797101449,
"grad_norm": 8.203088760375977,
"learning_rate": 6.05475040257649e-05,
"loss": 24.042,
"step": 1734
},
{
"epoch": 4.193236714975845,
"grad_norm": 7.593963146209717,
"learning_rate": 6.0386473429951696e-05,
"loss": 26.1024,
"step": 1736
},
{
"epoch": 4.1980676328502415,
"grad_norm": 6.2828450202941895,
"learning_rate": 6.022544283413849e-05,
"loss": 25.6751,
"step": 1738
},
{
"epoch": 4.202898550724638,
"grad_norm": 7.936067581176758,
"learning_rate": 6.006441223832528e-05,
"loss": 26.5624,
"step": 1740
},
{
"epoch": 4.207729468599034,
"grad_norm": 7.069867134094238,
"learning_rate": 5.990338164251208e-05,
"loss": 26.0663,
"step": 1742
},
{
"epoch": 4.21256038647343,
"grad_norm": 7.237870693206787,
"learning_rate": 5.9742351046698876e-05,
"loss": 26.6947,
"step": 1744
},
{
"epoch": 4.217391304347826,
"grad_norm": 6.671788692474365,
"learning_rate": 5.9581320450885666e-05,
"loss": 26.2133,
"step": 1746
},
{
"epoch": 4.222222222222222,
"grad_norm": 6.456491947174072,
"learning_rate": 5.942028985507246e-05,
"loss": 25.4374,
"step": 1748
},
{
"epoch": 4.2270531400966185,
"grad_norm": 6.428054332733154,
"learning_rate": 5.925925925925926e-05,
"loss": 27.421,
"step": 1750
},
{
"epoch": 4.231884057971015,
"grad_norm": 6.948849678039551,
"learning_rate": 5.9098228663446057e-05,
"loss": 24.7258,
"step": 1752
},
{
"epoch": 4.236714975845411,
"grad_norm": 7.914185047149658,
"learning_rate": 5.8937198067632847e-05,
"loss": 26.731,
"step": 1754
},
{
"epoch": 4.241545893719807,
"grad_norm": 6.79870080947876,
"learning_rate": 5.877616747181964e-05,
"loss": 27.7706,
"step": 1756
},
{
"epoch": 4.246376811594203,
"grad_norm": 7.053183078765869,
"learning_rate": 5.861513687600645e-05,
"loss": 25.8897,
"step": 1758
},
{
"epoch": 4.251207729468599,
"grad_norm": 7.341165065765381,
"learning_rate": 5.8454106280193244e-05,
"loss": 25.7238,
"step": 1760
},
{
"epoch": 4.256038647342995,
"grad_norm": 6.499047756195068,
"learning_rate": 5.829307568438004e-05,
"loss": 26.5338,
"step": 1762
},
{
"epoch": 4.260869565217392,
"grad_norm": 6.891699314117432,
"learning_rate": 5.813204508856683e-05,
"loss": 26.3659,
"step": 1764
},
{
"epoch": 4.265700483091788,
"grad_norm": 6.726503849029541,
"learning_rate": 5.797101449275363e-05,
"loss": 24.1357,
"step": 1766
},
{
"epoch": 4.270531400966184,
"grad_norm": 7.38776159286499,
"learning_rate": 5.7809983896940424e-05,
"loss": 25.8197,
"step": 1768
},
{
"epoch": 4.27536231884058,
"grad_norm": 6.880035400390625,
"learning_rate": 5.764895330112722e-05,
"loss": 26.0566,
"step": 1770
},
{
"epoch": 4.280193236714976,
"grad_norm": 6.925288677215576,
"learning_rate": 5.748792270531401e-05,
"loss": 27.931,
"step": 1772
},
{
"epoch": 4.285024154589372,
"grad_norm": 8.501145362854004,
"learning_rate": 5.732689210950081e-05,
"loss": 24.3897,
"step": 1774
},
{
"epoch": 4.2898550724637685,
"grad_norm": 7.33554744720459,
"learning_rate": 5.7165861513687604e-05,
"loss": 26.0568,
"step": 1776
},
{
"epoch": 4.294685990338165,
"grad_norm": 6.757916450500488,
"learning_rate": 5.7004830917874394e-05,
"loss": 26.4327,
"step": 1778
},
{
"epoch": 4.29951690821256,
"grad_norm": 7.093183517456055,
"learning_rate": 5.684380032206119e-05,
"loss": 25.8601,
"step": 1780
},
{
"epoch": 4.304347826086957,
"grad_norm": 5.872477054595947,
"learning_rate": 5.668276972624799e-05,
"loss": 26.353,
"step": 1782
},
{
"epoch": 4.309178743961352,
"grad_norm": 5.949990272521973,
"learning_rate": 5.652173913043478e-05,
"loss": 27.0481,
"step": 1784
},
{
"epoch": 4.314009661835748,
"grad_norm": 6.953137397766113,
"learning_rate": 5.6360708534621574e-05,
"loss": 26.569,
"step": 1786
},
{
"epoch": 4.318840579710145,
"grad_norm": 7.875227928161621,
"learning_rate": 5.6199677938808385e-05,
"loss": 23.7947,
"step": 1788
},
{
"epoch": 4.323671497584541,
"grad_norm": 6.187444686889648,
"learning_rate": 5.6038647342995175e-05,
"loss": 25.1237,
"step": 1790
},
{
"epoch": 4.328502415458937,
"grad_norm": 6.970160961151123,
"learning_rate": 5.587761674718197e-05,
"loss": 25.7053,
"step": 1792
},
{
"epoch": 4.333333333333333,
"grad_norm": 6.903000831604004,
"learning_rate": 5.571658615136877e-05,
"loss": 26.7737,
"step": 1794
},
{
"epoch": 4.338164251207729,
"grad_norm": 7.370026111602783,
"learning_rate": 5.555555555555556e-05,
"loss": 25.3098,
"step": 1796
},
{
"epoch": 4.342995169082125,
"grad_norm": 6.926233768463135,
"learning_rate": 5.5394524959742355e-05,
"loss": 24.9499,
"step": 1798
},
{
"epoch": 4.3478260869565215,
"grad_norm": 6.8403544425964355,
"learning_rate": 5.523349436392915e-05,
"loss": 25.6363,
"step": 1800
},
{
"epoch": 4.352657004830918,
"grad_norm": 7.1537089347839355,
"learning_rate": 5.507246376811594e-05,
"loss": 24.911,
"step": 1802
},
{
"epoch": 4.357487922705314,
"grad_norm": 6.798279285430908,
"learning_rate": 5.491143317230274e-05,
"loss": 27.4748,
"step": 1804
},
{
"epoch": 4.36231884057971,
"grad_norm": 5.993078231811523,
"learning_rate": 5.4750402576489535e-05,
"loss": 27.1362,
"step": 1806
},
{
"epoch": 4.367149758454106,
"grad_norm": 7.4096574783325195,
"learning_rate": 5.458937198067633e-05,
"loss": 26.0591,
"step": 1808
},
{
"epoch": 4.371980676328502,
"grad_norm": 6.903232574462891,
"learning_rate": 5.442834138486312e-05,
"loss": 26.8211,
"step": 1810
},
{
"epoch": 4.3768115942028984,
"grad_norm": 7.838393211364746,
"learning_rate": 5.426731078904992e-05,
"loss": 26.2384,
"step": 1812
},
{
"epoch": 4.381642512077295,
"grad_norm": 7.33106803894043,
"learning_rate": 5.4106280193236716e-05,
"loss": 26.5385,
"step": 1814
},
{
"epoch": 4.386473429951691,
"grad_norm": 6.619305610656738,
"learning_rate": 5.3945249597423505e-05,
"loss": 22.8647,
"step": 1816
},
{
"epoch": 4.391304347826087,
"grad_norm": 7.007352352142334,
"learning_rate": 5.37842190016103e-05,
"loss": 28.2365,
"step": 1818
},
{
"epoch": 4.396135265700483,
"grad_norm": 7.026554584503174,
"learning_rate": 5.3623188405797106e-05,
"loss": 25.9467,
"step": 1820
},
{
"epoch": 4.400966183574879,
"grad_norm": 8.395278930664062,
"learning_rate": 5.34621578099839e-05,
"loss": 24.3139,
"step": 1822
},
{
"epoch": 4.405797101449275,
"grad_norm": 6.9680495262146,
"learning_rate": 5.33011272141707e-05,
"loss": 24.9039,
"step": 1824
},
{
"epoch": 4.4106280193236715,
"grad_norm": 7.212375164031982,
"learning_rate": 5.3140096618357496e-05,
"loss": 25.4465,
"step": 1826
},
{
"epoch": 4.415458937198068,
"grad_norm": 6.966728210449219,
"learning_rate": 5.2979066022544286e-05,
"loss": 25.7792,
"step": 1828
},
{
"epoch": 4.420289855072464,
"grad_norm": 6.4454522132873535,
"learning_rate": 5.281803542673108e-05,
"loss": 25.9821,
"step": 1830
},
{
"epoch": 4.42512077294686,
"grad_norm": 7.032574653625488,
"learning_rate": 5.265700483091788e-05,
"loss": 26.4527,
"step": 1832
},
{
"epoch": 4.429951690821256,
"grad_norm": 7.715813159942627,
"learning_rate": 5.249597423510467e-05,
"loss": 25.1597,
"step": 1834
},
{
"epoch": 4.434782608695652,
"grad_norm": 8.366538047790527,
"learning_rate": 5.2334943639291466e-05,
"loss": 26.6993,
"step": 1836
},
{
"epoch": 4.4396135265700485,
"grad_norm": 6.7702484130859375,
"learning_rate": 5.217391304347826e-05,
"loss": 25.7026,
"step": 1838
},
{
"epoch": 4.444444444444445,
"grad_norm": 7.936936378479004,
"learning_rate": 5.201288244766506e-05,
"loss": 26.0788,
"step": 1840
},
{
"epoch": 4.449275362318841,
"grad_norm": 9.027806282043457,
"learning_rate": 5.185185185185185e-05,
"loss": 26.8397,
"step": 1842
},
{
"epoch": 4.454106280193237,
"grad_norm": 7.541802406311035,
"learning_rate": 5.1690821256038647e-05,
"loss": 25.2463,
"step": 1844
},
{
"epoch": 4.458937198067633,
"grad_norm": 6.402732849121094,
"learning_rate": 5.152979066022544e-05,
"loss": 26.2031,
"step": 1846
},
{
"epoch": 4.463768115942029,
"grad_norm": 9.23645305633545,
"learning_rate": 5.136876006441223e-05,
"loss": 25.5027,
"step": 1848
},
{
"epoch": 4.468599033816425,
"grad_norm": 7.548840045928955,
"learning_rate": 5.1207729468599044e-05,
"loss": 25.3803,
"step": 1850
},
{
"epoch": 4.473429951690822,
"grad_norm": 6.839424133300781,
"learning_rate": 5.1046698872785834e-05,
"loss": 25.4142,
"step": 1852
},
{
"epoch": 4.478260869565218,
"grad_norm": 6.8843512535095215,
"learning_rate": 5.088566827697263e-05,
"loss": 25.0163,
"step": 1854
},
{
"epoch": 4.483091787439614,
"grad_norm": 6.359217643737793,
"learning_rate": 5.072463768115943e-05,
"loss": 26.7153,
"step": 1856
},
{
"epoch": 4.48792270531401,
"grad_norm": 7.048843860626221,
"learning_rate": 5.0563607085346224e-05,
"loss": 25.6665,
"step": 1858
},
{
"epoch": 4.492753623188406,
"grad_norm": 7.086437702178955,
"learning_rate": 5.0402576489533014e-05,
"loss": 26.86,
"step": 1860
},
{
"epoch": 4.4975845410628015,
"grad_norm": 6.8362507820129395,
"learning_rate": 5.024154589371981e-05,
"loss": 26.5117,
"step": 1862
},
{
"epoch": 4.5024154589371985,
"grad_norm": 6.434200763702393,
"learning_rate": 5.008051529790661e-05,
"loss": 25.2265,
"step": 1864
},
{
"epoch": 4.507246376811594,
"grad_norm": 8.030712127685547,
"learning_rate": 4.99194847020934e-05,
"loss": 24.4644,
"step": 1866
},
{
"epoch": 4.512077294685991,
"grad_norm": 7.7696051597595215,
"learning_rate": 4.9758454106280194e-05,
"loss": 27.1793,
"step": 1868
},
{
"epoch": 4.516908212560386,
"grad_norm": 6.404499530792236,
"learning_rate": 4.959742351046699e-05,
"loss": 26.4276,
"step": 1870
},
{
"epoch": 4.521739130434782,
"grad_norm": 7.412373065948486,
"learning_rate": 4.943639291465378e-05,
"loss": 25.6971,
"step": 1872
},
{
"epoch": 4.526570048309178,
"grad_norm": 7.425329685211182,
"learning_rate": 4.9275362318840584e-05,
"loss": 23.4725,
"step": 1874
},
{
"epoch": 4.531400966183575,
"grad_norm": 6.722659587860107,
"learning_rate": 4.911433172302738e-05,
"loss": 26.158,
"step": 1876
},
{
"epoch": 4.536231884057971,
"grad_norm": 7.206009387969971,
"learning_rate": 4.895330112721417e-05,
"loss": 26.8682,
"step": 1878
},
{
"epoch": 4.541062801932367,
"grad_norm": 7.180261135101318,
"learning_rate": 4.879227053140097e-05,
"loss": 26.191,
"step": 1880
},
{
"epoch": 4.545893719806763,
"grad_norm": 7.371028900146484,
"learning_rate": 4.8631239935587765e-05,
"loss": 23.7948,
"step": 1882
},
{
"epoch": 4.550724637681159,
"grad_norm": 6.874049663543701,
"learning_rate": 4.847020933977456e-05,
"loss": 26.638,
"step": 1884
},
{
"epoch": 4.555555555555555,
"grad_norm": 7.5235795974731445,
"learning_rate": 4.830917874396135e-05,
"loss": 26.9283,
"step": 1886
},
{
"epoch": 4.5603864734299515,
"grad_norm": 7.371413707733154,
"learning_rate": 4.814814814814815e-05,
"loss": 26.5723,
"step": 1888
},
{
"epoch": 4.565217391304348,
"grad_norm": 6.487553119659424,
"learning_rate": 4.7987117552334945e-05,
"loss": 25.9711,
"step": 1890
},
{
"epoch": 4.570048309178744,
"grad_norm": 6.800736427307129,
"learning_rate": 4.782608695652174e-05,
"loss": 23.376,
"step": 1892
},
{
"epoch": 4.57487922705314,
"grad_norm": 7.149484634399414,
"learning_rate": 4.766505636070854e-05,
"loss": 26.0393,
"step": 1894
},
{
"epoch": 4.579710144927536,
"grad_norm": 7.532267093658447,
"learning_rate": 4.7504025764895335e-05,
"loss": 26.1861,
"step": 1896
},
{
"epoch": 4.584541062801932,
"grad_norm": 7.492485046386719,
"learning_rate": 4.7342995169082125e-05,
"loss": 26.4485,
"step": 1898
},
{
"epoch": 4.5893719806763285,
"grad_norm": 6.885655879974365,
"learning_rate": 4.718196457326892e-05,
"loss": 25.1794,
"step": 1900
},
{
"epoch": 4.594202898550725,
"grad_norm": 6.430235862731934,
"learning_rate": 4.702093397745572e-05,
"loss": 25.6459,
"step": 1902
},
{
"epoch": 4.599033816425121,
"grad_norm": 6.470332145690918,
"learning_rate": 4.6859903381642516e-05,
"loss": 26.0056,
"step": 1904
},
{
"epoch": 4.603864734299517,
"grad_norm": 6.93711519241333,
"learning_rate": 4.669887278582931e-05,
"loss": 26.8588,
"step": 1906
},
{
"epoch": 4.608695652173913,
"grad_norm": 7.658902168273926,
"learning_rate": 4.653784219001611e-05,
"loss": 25.5478,
"step": 1908
},
{
"epoch": 4.613526570048309,
"grad_norm": 7.67640495300293,
"learning_rate": 4.63768115942029e-05,
"loss": 26.2871,
"step": 1910
},
{
"epoch": 4.618357487922705,
"grad_norm": 7.06746244430542,
"learning_rate": 4.6215780998389696e-05,
"loss": 25.8891,
"step": 1912
},
{
"epoch": 4.6231884057971016,
"grad_norm": 7.047806739807129,
"learning_rate": 4.605475040257649e-05,
"loss": 27.2715,
"step": 1914
},
{
"epoch": 4.628019323671498,
"grad_norm": 7.097225189208984,
"learning_rate": 4.589371980676328e-05,
"loss": 24.1438,
"step": 1916
},
{
"epoch": 4.632850241545894,
"grad_norm": 7.487665176391602,
"learning_rate": 4.573268921095008e-05,
"loss": 25.5925,
"step": 1918
},
{
"epoch": 4.63768115942029,
"grad_norm": 6.561511516571045,
"learning_rate": 4.557165861513688e-05,
"loss": 27.0304,
"step": 1920
},
{
"epoch": 4.642512077294686,
"grad_norm": 7.644463539123535,
"learning_rate": 4.541062801932367e-05,
"loss": 26.007,
"step": 1922
},
{
"epoch": 4.647342995169082,
"grad_norm": 7.329721927642822,
"learning_rate": 4.524959742351047e-05,
"loss": 23.3239,
"step": 1924
},
{
"epoch": 4.6521739130434785,
"grad_norm": 6.725891590118408,
"learning_rate": 4.5088566827697266e-05,
"loss": 25.8835,
"step": 1926
},
{
"epoch": 4.657004830917875,
"grad_norm": 7.27399206161499,
"learning_rate": 4.492753623188406e-05,
"loss": 26.4273,
"step": 1928
},
{
"epoch": 4.661835748792271,
"grad_norm": 6.614084720611572,
"learning_rate": 4.476650563607085e-05,
"loss": 25.8323,
"step": 1930
},
{
"epoch": 4.666666666666667,
"grad_norm": 6.703570365905762,
"learning_rate": 4.460547504025765e-05,
"loss": 25.0444,
"step": 1932
},
{
"epoch": 4.671497584541063,
"grad_norm": 7.8840556144714355,
"learning_rate": 4.4444444444444447e-05,
"loss": 26.0548,
"step": 1934
},
{
"epoch": 4.676328502415459,
"grad_norm": 6.566593170166016,
"learning_rate": 4.428341384863124e-05,
"loss": 25.8758,
"step": 1936
},
{
"epoch": 4.681159420289855,
"grad_norm": 6.961997985839844,
"learning_rate": 4.412238325281804e-05,
"loss": 26.1125,
"step": 1938
},
{
"epoch": 4.685990338164252,
"grad_norm": 8.170991897583008,
"learning_rate": 4.396135265700484e-05,
"loss": 27.3513,
"step": 1940
},
{
"epoch": 4.690821256038648,
"grad_norm": 6.823581218719482,
"learning_rate": 4.380032206119163e-05,
"loss": 25.9433,
"step": 1942
},
{
"epoch": 4.695652173913043,
"grad_norm": 7.356668949127197,
"learning_rate": 4.3639291465378424e-05,
"loss": 27.2403,
"step": 1944
},
{
"epoch": 4.70048309178744,
"grad_norm": 7.08234977722168,
"learning_rate": 4.347826086956522e-05,
"loss": 25.7369,
"step": 1946
},
{
"epoch": 4.705314009661835,
"grad_norm": 6.981078147888184,
"learning_rate": 4.331723027375201e-05,
"loss": 25.5263,
"step": 1948
},
{
"epoch": 4.710144927536232,
"grad_norm": 6.724111080169678,
"learning_rate": 4.315619967793881e-05,
"loss": 25.9395,
"step": 1950
},
{
"epoch": 4.714975845410628,
"grad_norm": 6.105647563934326,
"learning_rate": 4.299516908212561e-05,
"loss": 26.302,
"step": 1952
},
{
"epoch": 4.719806763285024,
"grad_norm": 7.321731090545654,
"learning_rate": 4.28341384863124e-05,
"loss": 25.4126,
"step": 1954
},
{
"epoch": 4.72463768115942,
"grad_norm": 6.488819599151611,
"learning_rate": 4.26731078904992e-05,
"loss": 26.1355,
"step": 1956
},
{
"epoch": 4.729468599033816,
"grad_norm": 6.578047752380371,
"learning_rate": 4.2512077294685994e-05,
"loss": 27.0345,
"step": 1958
},
{
"epoch": 4.734299516908212,
"grad_norm": 6.070748805999756,
"learning_rate": 4.2351046698872784e-05,
"loss": 24.2139,
"step": 1960
},
{
"epoch": 4.739130434782608,
"grad_norm": 6.960094451904297,
"learning_rate": 4.219001610305958e-05,
"loss": 26.6307,
"step": 1962
},
{
"epoch": 4.743961352657005,
"grad_norm": 6.557458877563477,
"learning_rate": 4.202898550724638e-05,
"loss": 24.7468,
"step": 1964
},
{
"epoch": 4.748792270531401,
"grad_norm": 7.3893656730651855,
"learning_rate": 4.1867954911433174e-05,
"loss": 25.1312,
"step": 1966
},
{
"epoch": 4.753623188405797,
"grad_norm": 7.08898401260376,
"learning_rate": 4.170692431561997e-05,
"loss": 26.4543,
"step": 1968
},
{
"epoch": 4.758454106280193,
"grad_norm": 7.590085029602051,
"learning_rate": 4.154589371980677e-05,
"loss": 26.8427,
"step": 1970
},
{
"epoch": 4.763285024154589,
"grad_norm": 6.841743469238281,
"learning_rate": 4.1384863123993565e-05,
"loss": 25.9606,
"step": 1972
},
{
"epoch": 4.768115942028985,
"grad_norm": 7.612220764160156,
"learning_rate": 4.1223832528180355e-05,
"loss": 27.338,
"step": 1974
},
{
"epoch": 4.7729468599033815,
"grad_norm": 6.759093761444092,
"learning_rate": 4.106280193236715e-05,
"loss": 26.6189,
"step": 1976
},
{
"epoch": 4.777777777777778,
"grad_norm": 7.5177226066589355,
"learning_rate": 4.090177133655395e-05,
"loss": 26.6014,
"step": 1978
},
{
"epoch": 4.782608695652174,
"grad_norm": 6.755998611450195,
"learning_rate": 4.074074074074074e-05,
"loss": 24.8074,
"step": 1980
},
{
"epoch": 4.78743961352657,
"grad_norm": 7.969665050506592,
"learning_rate": 4.057971014492754e-05,
"loss": 26.8168,
"step": 1982
},
{
"epoch": 4.792270531400966,
"grad_norm": 6.537661552429199,
"learning_rate": 4.041867954911434e-05,
"loss": 26.8878,
"step": 1984
},
{
"epoch": 4.797101449275362,
"grad_norm": 7.462778091430664,
"learning_rate": 4.025764895330113e-05,
"loss": 26.6395,
"step": 1986
},
{
"epoch": 4.8019323671497585,
"grad_norm": 7.199199199676514,
"learning_rate": 4.0096618357487925e-05,
"loss": 26.6467,
"step": 1988
},
{
"epoch": 4.806763285024155,
"grad_norm": 6.970396995544434,
"learning_rate": 3.993558776167472e-05,
"loss": 27.3238,
"step": 1990
},
{
"epoch": 4.811594202898551,
"grad_norm": 6.526374340057373,
"learning_rate": 3.977455716586151e-05,
"loss": 26.7327,
"step": 1992
},
{
"epoch": 4.816425120772947,
"grad_norm": 7.019384384155273,
"learning_rate": 3.961352657004831e-05,
"loss": 24.6199,
"step": 1994
},
{
"epoch": 4.821256038647343,
"grad_norm": 7.474978923797607,
"learning_rate": 3.9452495974235105e-05,
"loss": 26.8535,
"step": 1996
},
{
"epoch": 4.826086956521739,
"grad_norm": 7.651355266571045,
"learning_rate": 3.92914653784219e-05,
"loss": 25.2036,
"step": 1998
},
{
"epoch": 4.830917874396135,
"grad_norm": 6.540372848510742,
"learning_rate": 3.91304347826087e-05,
"loss": 26.1222,
"step": 2000
},
{
"epoch": 4.835748792270532,
"grad_norm": 7.769553184509277,
"learning_rate": 3.8969404186795496e-05,
"loss": 25.3271,
"step": 2002
},
{
"epoch": 4.840579710144928,
"grad_norm": 7.059219837188721,
"learning_rate": 3.8808373590982286e-05,
"loss": 27.5878,
"step": 2004
},
{
"epoch": 4.845410628019324,
"grad_norm": 7.040493011474609,
"learning_rate": 3.864734299516908e-05,
"loss": 24.8298,
"step": 2006
},
{
"epoch": 4.85024154589372,
"grad_norm": 6.8158111572265625,
"learning_rate": 3.848631239935588e-05,
"loss": 25.9933,
"step": 2008
},
{
"epoch": 4.855072463768116,
"grad_norm": 6.576706886291504,
"learning_rate": 3.8325281803542676e-05,
"loss": 25.7341,
"step": 2010
},
{
"epoch": 4.859903381642512,
"grad_norm": 6.51364803314209,
"learning_rate": 3.8164251207729466e-05,
"loss": 27.156,
"step": 2012
},
{
"epoch": 4.8647342995169085,
"grad_norm": 7.035210609436035,
"learning_rate": 3.800322061191627e-05,
"loss": 25.8662,
"step": 2014
},
{
"epoch": 4.869565217391305,
"grad_norm": 8.57784366607666,
"learning_rate": 3.784219001610306e-05,
"loss": 27.2607,
"step": 2016
},
{
"epoch": 4.874396135265701,
"grad_norm": 7.060666084289551,
"learning_rate": 3.7681159420289856e-05,
"loss": 25.5058,
"step": 2018
},
{
"epoch": 4.879227053140097,
"grad_norm": 6.544167995452881,
"learning_rate": 3.752012882447665e-05,
"loss": 27.5042,
"step": 2020
},
{
"epoch": 4.884057971014493,
"grad_norm": 7.82602071762085,
"learning_rate": 3.735909822866345e-05,
"loss": 25.1871,
"step": 2022
},
{
"epoch": 4.888888888888889,
"grad_norm": 6.692302227020264,
"learning_rate": 3.719806763285024e-05,
"loss": 24.8878,
"step": 2024
},
{
"epoch": 4.8937198067632846,
"grad_norm": 6.907380104064941,
"learning_rate": 3.7037037037037037e-05,
"loss": 26.2569,
"step": 2026
},
{
"epoch": 4.898550724637682,
"grad_norm": 6.529886245727539,
"learning_rate": 3.687600644122383e-05,
"loss": 25.2621,
"step": 2028
},
{
"epoch": 4.903381642512077,
"grad_norm": 8.162117958068848,
"learning_rate": 3.671497584541063e-05,
"loss": 25.3648,
"step": 2030
},
{
"epoch": 4.908212560386474,
"grad_norm": 7.2825422286987305,
"learning_rate": 3.655394524959743e-05,
"loss": 25.0396,
"step": 2032
},
{
"epoch": 4.913043478260869,
"grad_norm": 7.4677886962890625,
"learning_rate": 3.6392914653784224e-05,
"loss": 26.3367,
"step": 2034
},
{
"epoch": 4.917874396135265,
"grad_norm": 6.709794521331787,
"learning_rate": 3.6231884057971014e-05,
"loss": 25.5507,
"step": 2036
},
{
"epoch": 4.9227053140096615,
"grad_norm": 6.555368423461914,
"learning_rate": 3.607085346215781e-05,
"loss": 24.5421,
"step": 2038
},
{
"epoch": 4.927536231884058,
"grad_norm": 6.405154705047607,
"learning_rate": 3.590982286634461e-05,
"loss": 25.699,
"step": 2040
},
{
"epoch": 4.932367149758454,
"grad_norm": 7.2418012619018555,
"learning_rate": 3.57487922705314e-05,
"loss": 25.3032,
"step": 2042
},
{
"epoch": 4.93719806763285,
"grad_norm": 7.165282726287842,
"learning_rate": 3.55877616747182e-05,
"loss": 27.57,
"step": 2044
},
{
"epoch": 4.942028985507246,
"grad_norm": 8.555087089538574,
"learning_rate": 3.5426731078905e-05,
"loss": 25.6545,
"step": 2046
},
{
"epoch": 4.946859903381642,
"grad_norm": 7.7885613441467285,
"learning_rate": 3.526570048309179e-05,
"loss": 25.3195,
"step": 2048
},
{
"epoch": 4.951690821256038,
"grad_norm": 6.383197784423828,
"learning_rate": 3.5104669887278584e-05,
"loss": 27.5458,
"step": 2050
},
{
"epoch": 4.956521739130435,
"grad_norm": 7.210457801818848,
"learning_rate": 3.494363929146538e-05,
"loss": 26.0986,
"step": 2052
},
{
"epoch": 4.961352657004831,
"grad_norm": 6.477179050445557,
"learning_rate": 3.478260869565218e-05,
"loss": 26.3113,
"step": 2054
},
{
"epoch": 4.966183574879227,
"grad_norm": 6.750316619873047,
"learning_rate": 3.462157809983897e-05,
"loss": 26.5696,
"step": 2056
},
{
"epoch": 4.971014492753623,
"grad_norm": 6.577611923217773,
"learning_rate": 3.4460547504025764e-05,
"loss": 26.4256,
"step": 2058
},
{
"epoch": 4.975845410628019,
"grad_norm": 7.024559020996094,
"learning_rate": 3.429951690821256e-05,
"loss": 24.9867,
"step": 2060
},
{
"epoch": 4.980676328502415,
"grad_norm": 7.051502704620361,
"learning_rate": 3.413848631239936e-05,
"loss": 25.3915,
"step": 2062
},
{
"epoch": 4.9855072463768115,
"grad_norm": 7.6836838722229,
"learning_rate": 3.3977455716586155e-05,
"loss": 24.8861,
"step": 2064
},
{
"epoch": 4.990338164251208,
"grad_norm": 7.69392204284668,
"learning_rate": 3.381642512077295e-05,
"loss": 26.2732,
"step": 2066
},
{
"epoch": 4.995169082125604,
"grad_norm": 7.139024257659912,
"learning_rate": 3.365539452495974e-05,
"loss": 24.6846,
"step": 2068
},
{
"epoch": 5.0,
"grad_norm": 6.70409631729126,
"learning_rate": 3.349436392914654e-05,
"loss": 27.4742,
"step": 2070
},
{
"epoch": 5.004830917874396,
"grad_norm": 6.803808212280273,
"learning_rate": 3.3333333333333335e-05,
"loss": 24.7629,
"step": 2072
},
{
"epoch": 5.009661835748792,
"grad_norm": 6.341485977172852,
"learning_rate": 3.3172302737520125e-05,
"loss": 27.7422,
"step": 2074
},
{
"epoch": 5.0144927536231885,
"grad_norm": 6.5449066162109375,
"learning_rate": 3.301127214170693e-05,
"loss": 26.3897,
"step": 2076
},
{
"epoch": 5.019323671497585,
"grad_norm": 6.326546669006348,
"learning_rate": 3.2850241545893725e-05,
"loss": 26.7782,
"step": 2078
},
{
"epoch": 5.024154589371981,
"grad_norm": 7.492796897888184,
"learning_rate": 3.2689210950080515e-05,
"loss": 25.2565,
"step": 2080
},
{
"epoch": 5.028985507246377,
"grad_norm": 7.679995536804199,
"learning_rate": 3.252818035426731e-05,
"loss": 25.2879,
"step": 2082
},
{
"epoch": 5.033816425120773,
"grad_norm": 6.634117126464844,
"learning_rate": 3.236714975845411e-05,
"loss": 27.5415,
"step": 2084
},
{
"epoch": 5.038647342995169,
"grad_norm": 6.707841873168945,
"learning_rate": 3.22061191626409e-05,
"loss": 26.2413,
"step": 2086
},
{
"epoch": 5.043478260869565,
"grad_norm": 7.303376197814941,
"learning_rate": 3.2045088566827695e-05,
"loss": 22.9713,
"step": 2088
},
{
"epoch": 5.048309178743962,
"grad_norm": 5.641716957092285,
"learning_rate": 3.188405797101449e-05,
"loss": 23.7231,
"step": 2090
},
{
"epoch": 5.053140096618358,
"grad_norm": 7.5472636222839355,
"learning_rate": 3.172302737520129e-05,
"loss": 26.3035,
"step": 2092
},
{
"epoch": 5.057971014492754,
"grad_norm": 6.629962921142578,
"learning_rate": 3.1561996779388086e-05,
"loss": 27.2519,
"step": 2094
},
{
"epoch": 5.06280193236715,
"grad_norm": 6.610307216644287,
"learning_rate": 3.140096618357488e-05,
"loss": 24.3596,
"step": 2096
},
{
"epoch": 5.067632850241546,
"grad_norm": 8.222330093383789,
"learning_rate": 3.123993558776168e-05,
"loss": 26.812,
"step": 2098
},
{
"epoch": 5.072463768115942,
"grad_norm": 7.391679763793945,
"learning_rate": 3.107890499194847e-05,
"loss": 23.9302,
"step": 2100
},
{
"epoch": 5.0772946859903385,
"grad_norm": 7.474515914916992,
"learning_rate": 3.0917874396135266e-05,
"loss": 26.0697,
"step": 2102
},
{
"epoch": 5.082125603864735,
"grad_norm": 6.373252868652344,
"learning_rate": 3.075684380032206e-05,
"loss": 26.3179,
"step": 2104
},
{
"epoch": 5.086956521739131,
"grad_norm": 7.464061260223389,
"learning_rate": 3.059581320450886e-05,
"loss": 25.6336,
"step": 2106
},
{
"epoch": 5.091787439613527,
"grad_norm": 6.995118618011475,
"learning_rate": 3.0434782608695656e-05,
"loss": 26.2471,
"step": 2108
},
{
"epoch": 5.096618357487923,
"grad_norm": 7.116311550140381,
"learning_rate": 3.027375201288245e-05,
"loss": 26.614,
"step": 2110
},
{
"epoch": 5.101449275362318,
"grad_norm": 6.943987846374512,
"learning_rate": 3.0112721417069246e-05,
"loss": 25.0339,
"step": 2112
},
{
"epoch": 5.106280193236715,
"grad_norm": 7.350955009460449,
"learning_rate": 2.995169082125604e-05,
"loss": 26.2694,
"step": 2114
},
{
"epoch": 5.111111111111111,
"grad_norm": 6.849686622619629,
"learning_rate": 2.9790660225442833e-05,
"loss": 27.1826,
"step": 2116
},
{
"epoch": 5.115942028985507,
"grad_norm": 7.7651567459106445,
"learning_rate": 2.962962962962963e-05,
"loss": 24.8541,
"step": 2118
},
{
"epoch": 5.120772946859903,
"grad_norm": 5.836477279663086,
"learning_rate": 2.9468599033816423e-05,
"loss": 25.0215,
"step": 2120
},
{
"epoch": 5.125603864734299,
"grad_norm": 6.189184665679932,
"learning_rate": 2.9307568438003223e-05,
"loss": 26.9194,
"step": 2122
},
{
"epoch": 5.130434782608695,
"grad_norm": 6.857696533203125,
"learning_rate": 2.914653784219002e-05,
"loss": 26.1156,
"step": 2124
},
{
"epoch": 5.1352657004830915,
"grad_norm": 6.773160934448242,
"learning_rate": 2.8985507246376814e-05,
"loss": 25.4986,
"step": 2126
},
{
"epoch": 5.140096618357488,
"grad_norm": 8.016234397888184,
"learning_rate": 2.882447665056361e-05,
"loss": 26.8887,
"step": 2128
},
{
"epoch": 5.144927536231884,
"grad_norm": 7.765948295593262,
"learning_rate": 2.8663446054750404e-05,
"loss": 24.2822,
"step": 2130
},
{
"epoch": 5.14975845410628,
"grad_norm": 7.044548511505127,
"learning_rate": 2.8502415458937197e-05,
"loss": 25.6158,
"step": 2132
},
{
"epoch": 5.154589371980676,
"grad_norm": 6.452057361602783,
"learning_rate": 2.8341384863123994e-05,
"loss": 24.7033,
"step": 2134
},
{
"epoch": 5.159420289855072,
"grad_norm": 6.443338394165039,
"learning_rate": 2.8180354267310787e-05,
"loss": 25.911,
"step": 2136
},
{
"epoch": 5.164251207729468,
"grad_norm": 7.172874450683594,
"learning_rate": 2.8019323671497587e-05,
"loss": 25.0313,
"step": 2138
},
{
"epoch": 5.169082125603865,
"grad_norm": 7.001052379608154,
"learning_rate": 2.7858293075684384e-05,
"loss": 27.4582,
"step": 2140
},
{
"epoch": 5.173913043478261,
"grad_norm": 6.618391513824463,
"learning_rate": 2.7697262479871177e-05,
"loss": 25.3195,
"step": 2142
},
{
"epoch": 5.178743961352657,
"grad_norm": 7.667540073394775,
"learning_rate": 2.753623188405797e-05,
"loss": 24.9384,
"step": 2144
},
{
"epoch": 5.183574879227053,
"grad_norm": 7.570556163787842,
"learning_rate": 2.7375201288244768e-05,
"loss": 25.1559,
"step": 2146
},
{
"epoch": 5.188405797101449,
"grad_norm": 8.569737434387207,
"learning_rate": 2.721417069243156e-05,
"loss": 24.5403,
"step": 2148
},
{
"epoch": 5.193236714975845,
"grad_norm": 6.5838623046875,
"learning_rate": 2.7053140096618358e-05,
"loss": 25.5745,
"step": 2150
},
{
"epoch": 5.1980676328502415,
"grad_norm": 6.626333713531494,
"learning_rate": 2.689210950080515e-05,
"loss": 25.6172,
"step": 2152
},
{
"epoch": 5.202898550724638,
"grad_norm": 7.9010186195373535,
"learning_rate": 2.673107890499195e-05,
"loss": 25.8519,
"step": 2154
},
{
"epoch": 5.207729468599034,
"grad_norm": 6.161978244781494,
"learning_rate": 2.6570048309178748e-05,
"loss": 24.2309,
"step": 2156
},
{
"epoch": 5.21256038647343,
"grad_norm": 6.870685577392578,
"learning_rate": 2.640901771336554e-05,
"loss": 27.1665,
"step": 2158
},
{
"epoch": 5.217391304347826,
"grad_norm": 7.303822040557861,
"learning_rate": 2.6247987117552335e-05,
"loss": 25.556,
"step": 2160
},
{
"epoch": 5.222222222222222,
"grad_norm": 6.584065914154053,
"learning_rate": 2.608695652173913e-05,
"loss": 25.468,
"step": 2162
},
{
"epoch": 5.2270531400966185,
"grad_norm": 7.221360683441162,
"learning_rate": 2.5925925925925925e-05,
"loss": 25.8624,
"step": 2164
},
{
"epoch": 5.231884057971015,
"grad_norm": 7.08326530456543,
"learning_rate": 2.576489533011272e-05,
"loss": 26.5428,
"step": 2166
},
{
"epoch": 5.236714975845411,
"grad_norm": 6.360510349273682,
"learning_rate": 2.5603864734299522e-05,
"loss": 26.695,
"step": 2168
},
{
"epoch": 5.241545893719807,
"grad_norm": 7.52411413192749,
"learning_rate": 2.5442834138486315e-05,
"loss": 25.9067,
"step": 2170
},
{
"epoch": 5.246376811594203,
"grad_norm": 6.968140602111816,
"learning_rate": 2.5281803542673112e-05,
"loss": 25.0371,
"step": 2172
},
{
"epoch": 5.251207729468599,
"grad_norm": 7.372687339782715,
"learning_rate": 2.5120772946859905e-05,
"loss": 26.3727,
"step": 2174
},
{
"epoch": 5.256038647342995,
"grad_norm": 7.292659759521484,
"learning_rate": 2.49597423510467e-05,
"loss": 26.8115,
"step": 2176
},
{
"epoch": 5.260869565217392,
"grad_norm": 6.425929546356201,
"learning_rate": 2.4798711755233495e-05,
"loss": 27.7444,
"step": 2178
},
{
"epoch": 5.265700483091788,
"grad_norm": 7.451976776123047,
"learning_rate": 2.4637681159420292e-05,
"loss": 26.5838,
"step": 2180
},
{
"epoch": 5.270531400966184,
"grad_norm": 7.282567024230957,
"learning_rate": 2.4476650563607086e-05,
"loss": 25.9177,
"step": 2182
},
{
"epoch": 5.27536231884058,
"grad_norm": 7.04587459564209,
"learning_rate": 2.4315619967793882e-05,
"loss": 25.8643,
"step": 2184
},
{
"epoch": 5.280193236714976,
"grad_norm": 7.137731075286865,
"learning_rate": 2.4154589371980676e-05,
"loss": 26.6203,
"step": 2186
},
{
"epoch": 5.285024154589372,
"grad_norm": 6.674662113189697,
"learning_rate": 2.3993558776167472e-05,
"loss": 25.3759,
"step": 2188
},
{
"epoch": 5.2898550724637685,
"grad_norm": 6.6438164710998535,
"learning_rate": 2.383252818035427e-05,
"loss": 24.2837,
"step": 2190
},
{
"epoch": 5.294685990338165,
"grad_norm": 7.651294708251953,
"learning_rate": 2.3671497584541063e-05,
"loss": 26.9551,
"step": 2192
},
{
"epoch": 5.29951690821256,
"grad_norm": 6.606574058532715,
"learning_rate": 2.351046698872786e-05,
"loss": 24.8967,
"step": 2194
},
{
"epoch": 5.304347826086957,
"grad_norm": 6.956263065338135,
"learning_rate": 2.3349436392914656e-05,
"loss": 26.3236,
"step": 2196
},
{
"epoch": 5.309178743961352,
"grad_norm": 7.141554832458496,
"learning_rate": 2.318840579710145e-05,
"loss": 26.4252,
"step": 2198
},
{
"epoch": 5.314009661835748,
"grad_norm": 6.030832290649414,
"learning_rate": 2.3027375201288246e-05,
"loss": 28.0596,
"step": 2200
},
{
"epoch": 5.318840579710145,
"grad_norm": 6.431146621704102,
"learning_rate": 2.286634460547504e-05,
"loss": 26.1359,
"step": 2202
},
{
"epoch": 5.323671497584541,
"grad_norm": 7.26776647567749,
"learning_rate": 2.2705314009661836e-05,
"loss": 23.1691,
"step": 2204
},
{
"epoch": 5.328502415458937,
"grad_norm": 7.198235988616943,
"learning_rate": 2.2544283413848633e-05,
"loss": 24.2558,
"step": 2206
},
{
"epoch": 5.333333333333333,
"grad_norm": 7.205248832702637,
"learning_rate": 2.2383252818035427e-05,
"loss": 26.1497,
"step": 2208
},
{
"epoch": 5.338164251207729,
"grad_norm": 6.834975242614746,
"learning_rate": 2.2222222222222223e-05,
"loss": 25.5315,
"step": 2210
},
{
"epoch": 5.342995169082125,
"grad_norm": 6.981115341186523,
"learning_rate": 2.206119162640902e-05,
"loss": 25.0263,
"step": 2212
},
{
"epoch": 5.3478260869565215,
"grad_norm": 6.798349380493164,
"learning_rate": 2.1900161030595813e-05,
"loss": 27.7364,
"step": 2214
},
{
"epoch": 5.352657004830918,
"grad_norm": 7.136117458343506,
"learning_rate": 2.173913043478261e-05,
"loss": 26.0488,
"step": 2216
},
{
"epoch": 5.357487922705314,
"grad_norm": 6.846739768981934,
"learning_rate": 2.1578099838969404e-05,
"loss": 24.9772,
"step": 2218
},
{
"epoch": 5.36231884057971,
"grad_norm": 7.294228553771973,
"learning_rate": 2.14170692431562e-05,
"loss": 27.0872,
"step": 2220
},
{
"epoch": 5.367149758454106,
"grad_norm": 7.222455978393555,
"learning_rate": 2.1256038647342997e-05,
"loss": 25.2418,
"step": 2222
},
{
"epoch": 5.371980676328502,
"grad_norm": 6.867911338806152,
"learning_rate": 2.109500805152979e-05,
"loss": 26.6485,
"step": 2224
},
{
"epoch": 5.3768115942028984,
"grad_norm": 7.119537353515625,
"learning_rate": 2.0933977455716587e-05,
"loss": 24.7069,
"step": 2226
},
{
"epoch": 5.381642512077295,
"grad_norm": 6.486376762390137,
"learning_rate": 2.0772946859903384e-05,
"loss": 25.5981,
"step": 2228
},
{
"epoch": 5.386473429951691,
"grad_norm": 6.030795097351074,
"learning_rate": 2.0611916264090177e-05,
"loss": 26.4169,
"step": 2230
},
{
"epoch": 5.391304347826087,
"grad_norm": 6.1018171310424805,
"learning_rate": 2.0450885668276974e-05,
"loss": 25.8114,
"step": 2232
},
{
"epoch": 5.396135265700483,
"grad_norm": 6.3123860359191895,
"learning_rate": 2.028985507246377e-05,
"loss": 27.2772,
"step": 2234
},
{
"epoch": 5.400966183574879,
"grad_norm": 7.111965179443359,
"learning_rate": 2.0128824476650564e-05,
"loss": 22.8083,
"step": 2236
},
{
"epoch": 5.405797101449275,
"grad_norm": 6.663313865661621,
"learning_rate": 1.996779388083736e-05,
"loss": 26.1006,
"step": 2238
},
{
"epoch": 5.4106280193236715,
"grad_norm": 7.1827287673950195,
"learning_rate": 1.9806763285024154e-05,
"loss": 25.9993,
"step": 2240
},
{
"epoch": 5.415458937198068,
"grad_norm": 6.989486217498779,
"learning_rate": 1.964573268921095e-05,
"loss": 27.1008,
"step": 2242
},
{
"epoch": 5.420289855072464,
"grad_norm": 7.407745361328125,
"learning_rate": 1.9484702093397748e-05,
"loss": 25.4639,
"step": 2244
},
{
"epoch": 5.42512077294686,
"grad_norm": 6.708901405334473,
"learning_rate": 1.932367149758454e-05,
"loss": 26.7837,
"step": 2246
},
{
"epoch": 5.429951690821256,
"grad_norm": 6.670323848724365,
"learning_rate": 1.9162640901771338e-05,
"loss": 25.7905,
"step": 2248
},
{
"epoch": 5.434782608695652,
"grad_norm": 7.481121063232422,
"learning_rate": 1.9001610305958135e-05,
"loss": 25.6084,
"step": 2250
},
{
"epoch": 5.4396135265700485,
"grad_norm": 7.1586480140686035,
"learning_rate": 1.8840579710144928e-05,
"loss": 25.4086,
"step": 2252
},
{
"epoch": 5.444444444444445,
"grad_norm": 6.693662166595459,
"learning_rate": 1.8679549114331725e-05,
"loss": 25.3542,
"step": 2254
},
{
"epoch": 5.449275362318841,
"grad_norm": 6.597439289093018,
"learning_rate": 1.8518518518518518e-05,
"loss": 24.7352,
"step": 2256
},
{
"epoch": 5.454106280193237,
"grad_norm": 6.5035400390625,
"learning_rate": 1.8357487922705315e-05,
"loss": 25.9761,
"step": 2258
},
{
"epoch": 5.458937198067633,
"grad_norm": 6.170787811279297,
"learning_rate": 1.8196457326892112e-05,
"loss": 26.8993,
"step": 2260
},
{
"epoch": 5.463768115942029,
"grad_norm": 6.216879367828369,
"learning_rate": 1.8035426731078905e-05,
"loss": 23.9469,
"step": 2262
},
{
"epoch": 5.468599033816425,
"grad_norm": 6.804856777191162,
"learning_rate": 1.78743961352657e-05,
"loss": 27.1758,
"step": 2264
},
{
"epoch": 5.473429951690822,
"grad_norm": 7.740478038787842,
"learning_rate": 1.77133655394525e-05,
"loss": 26.7605,
"step": 2266
},
{
"epoch": 5.478260869565218,
"grad_norm": 6.862391471862793,
"learning_rate": 1.7552334943639292e-05,
"loss": 26.0278,
"step": 2268
},
{
"epoch": 5.483091787439614,
"grad_norm": 6.675685882568359,
"learning_rate": 1.739130434782609e-05,
"loss": 24.291,
"step": 2270
},
{
"epoch": 5.48792270531401,
"grad_norm": 7.202348232269287,
"learning_rate": 1.7230273752012882e-05,
"loss": 25.3346,
"step": 2272
},
{
"epoch": 5.492753623188406,
"grad_norm": 7.335130214691162,
"learning_rate": 1.706924315619968e-05,
"loss": 24.2558,
"step": 2274
},
{
"epoch": 5.4975845410628015,
"grad_norm": 6.820517539978027,
"learning_rate": 1.6908212560386476e-05,
"loss": 24.8161,
"step": 2276
},
{
"epoch": 5.5024154589371985,
"grad_norm": 6.23611307144165,
"learning_rate": 1.674718196457327e-05,
"loss": 25.4897,
"step": 2278
},
{
"epoch": 5.507246376811594,
"grad_norm": 6.273251056671143,
"learning_rate": 1.6586151368760062e-05,
"loss": 25.4466,
"step": 2280
},
{
"epoch": 5.512077294685991,
"grad_norm": 6.126486301422119,
"learning_rate": 1.6425120772946863e-05,
"loss": 26.4718,
"step": 2282
},
{
"epoch": 5.516908212560386,
"grad_norm": 6.196963787078857,
"learning_rate": 1.6264090177133656e-05,
"loss": 25.5031,
"step": 2284
},
{
"epoch": 5.521739130434782,
"grad_norm": 6.553043842315674,
"learning_rate": 1.610305958132045e-05,
"loss": 26.3876,
"step": 2286
},
{
"epoch": 5.526570048309178,
"grad_norm": 6.308940887451172,
"learning_rate": 1.5942028985507246e-05,
"loss": 25.3395,
"step": 2288
},
{
"epoch": 5.531400966183575,
"grad_norm": 5.9868059158325195,
"learning_rate": 1.5780998389694043e-05,
"loss": 26.1367,
"step": 2290
},
{
"epoch": 5.536231884057971,
"grad_norm": 5.966738224029541,
"learning_rate": 1.561996779388084e-05,
"loss": 24.9832,
"step": 2292
},
{
"epoch": 5.541062801932367,
"grad_norm": 6.130259990692139,
"learning_rate": 1.5458937198067633e-05,
"loss": 26.0377,
"step": 2294
},
{
"epoch": 5.545893719806763,
"grad_norm": 6.351025104522705,
"learning_rate": 1.529790660225443e-05,
"loss": 25.387,
"step": 2296
},
{
"epoch": 5.550724637681159,
"grad_norm": 7.592315673828125,
"learning_rate": 1.5136876006441225e-05,
"loss": 25.8822,
"step": 2298
},
{
"epoch": 5.555555555555555,
"grad_norm": 7.366810321807861,
"learning_rate": 1.497584541062802e-05,
"loss": 25.716,
"step": 2300
},
{
"epoch": 5.5603864734299515,
"grad_norm": 6.494503974914551,
"learning_rate": 1.4814814814814815e-05,
"loss": 24.903,
"step": 2302
},
{
"epoch": 5.565217391304348,
"grad_norm": 6.354084491729736,
"learning_rate": 1.4653784219001612e-05,
"loss": 24.19,
"step": 2304
},
{
"epoch": 5.570048309178744,
"grad_norm": 6.83246374130249,
"learning_rate": 1.4492753623188407e-05,
"loss": 25.3202,
"step": 2306
},
{
"epoch": 5.57487922705314,
"grad_norm": 7.3366379737854,
"learning_rate": 1.4331723027375202e-05,
"loss": 26.5993,
"step": 2308
},
{
"epoch": 5.579710144927536,
"grad_norm": 6.854272842407227,
"learning_rate": 1.4170692431561997e-05,
"loss": 27.5142,
"step": 2310
},
{
"epoch": 5.584541062801932,
"grad_norm": 7.033668041229248,
"learning_rate": 1.4009661835748794e-05,
"loss": 24.9908,
"step": 2312
},
{
"epoch": 5.5893719806763285,
"grad_norm": 5.725836277008057,
"learning_rate": 1.3848631239935589e-05,
"loss": 26.9088,
"step": 2314
},
{
"epoch": 5.594202898550725,
"grad_norm": 6.002683162689209,
"learning_rate": 1.3687600644122384e-05,
"loss": 26.1845,
"step": 2316
},
{
"epoch": 5.599033816425121,
"grad_norm": 6.32890510559082,
"learning_rate": 1.3526570048309179e-05,
"loss": 24.4862,
"step": 2318
},
{
"epoch": 5.603864734299517,
"grad_norm": 6.316839694976807,
"learning_rate": 1.3365539452495976e-05,
"loss": 25.2277,
"step": 2320
},
{
"epoch": 5.608695652173913,
"grad_norm": 6.241401672363281,
"learning_rate": 1.320450885668277e-05,
"loss": 26.0008,
"step": 2322
},
{
"epoch": 5.613526570048309,
"grad_norm": 6.929868221282959,
"learning_rate": 1.3043478260869566e-05,
"loss": 24.5377,
"step": 2324
},
{
"epoch": 5.618357487922705,
"grad_norm": 6.343822956085205,
"learning_rate": 1.288244766505636e-05,
"loss": 24.4674,
"step": 2326
},
{
"epoch": 5.6231884057971016,
"grad_norm": 7.933018684387207,
"learning_rate": 1.2721417069243158e-05,
"loss": 25.4539,
"step": 2328
},
{
"epoch": 5.628019323671498,
"grad_norm": 6.561947345733643,
"learning_rate": 1.2560386473429953e-05,
"loss": 25.2955,
"step": 2330
},
{
"epoch": 5.632850241545894,
"grad_norm": 7.06411075592041,
"learning_rate": 1.2399355877616748e-05,
"loss": 26.2145,
"step": 2332
},
{
"epoch": 5.63768115942029,
"grad_norm": 8.267963409423828,
"learning_rate": 1.2238325281803543e-05,
"loss": 24.2516,
"step": 2334
},
{
"epoch": 5.642512077294686,
"grad_norm": 7.202125072479248,
"learning_rate": 1.2077294685990338e-05,
"loss": 27.0705,
"step": 2336
},
{
"epoch": 5.647342995169082,
"grad_norm": 6.419391632080078,
"learning_rate": 1.1916264090177135e-05,
"loss": 23.7174,
"step": 2338
},
{
"epoch": 5.6521739130434785,
"grad_norm": 6.510631561279297,
"learning_rate": 1.175523349436393e-05,
"loss": 26.1881,
"step": 2340
},
{
"epoch": 5.657004830917875,
"grad_norm": 7.408875465393066,
"learning_rate": 1.1594202898550725e-05,
"loss": 24.7877,
"step": 2342
},
{
"epoch": 5.661835748792271,
"grad_norm": 6.503478050231934,
"learning_rate": 1.143317230273752e-05,
"loss": 24.4056,
"step": 2344
},
{
"epoch": 5.666666666666667,
"grad_norm": 6.382200241088867,
"learning_rate": 1.1272141706924317e-05,
"loss": 25.4992,
"step": 2346
},
{
"epoch": 5.671497584541063,
"grad_norm": 6.437609672546387,
"learning_rate": 1.1111111111111112e-05,
"loss": 26.6456,
"step": 2348
},
{
"epoch": 5.676328502415459,
"grad_norm": 6.871528625488281,
"learning_rate": 1.0950080515297907e-05,
"loss": 25.0839,
"step": 2350
},
{
"epoch": 5.681159420289855,
"grad_norm": 8.16054630279541,
"learning_rate": 1.0789049919484702e-05,
"loss": 26.1491,
"step": 2352
},
{
"epoch": 5.685990338164252,
"grad_norm": 6.024045467376709,
"learning_rate": 1.0628019323671499e-05,
"loss": 25.0689,
"step": 2354
},
{
"epoch": 5.690821256038648,
"grad_norm": 7.976418972015381,
"learning_rate": 1.0466988727858294e-05,
"loss": 26.3663,
"step": 2356
},
{
"epoch": 5.695652173913043,
"grad_norm": 5.949817657470703,
"learning_rate": 1.0305958132045089e-05,
"loss": 27.0973,
"step": 2358
},
{
"epoch": 5.70048309178744,
"grad_norm": 6.103696823120117,
"learning_rate": 1.0144927536231885e-05,
"loss": 27.1756,
"step": 2360
},
{
"epoch": 5.705314009661835,
"grad_norm": 6.458801746368408,
"learning_rate": 9.98389694041868e-06,
"loss": 25.2463,
"step": 2362
},
{
"epoch": 5.710144927536232,
"grad_norm": 7.07081413269043,
"learning_rate": 9.822866344605476e-06,
"loss": 26.1127,
"step": 2364
},
{
"epoch": 5.714975845410628,
"grad_norm": 8.160017967224121,
"learning_rate": 9.66183574879227e-06,
"loss": 26.3955,
"step": 2366
},
{
"epoch": 5.719806763285024,
"grad_norm": 6.197200775146484,
"learning_rate": 9.500805152979067e-06,
"loss": 26.8253,
"step": 2368
},
{
"epoch": 5.72463768115942,
"grad_norm": 7.202108860015869,
"learning_rate": 9.339774557165862e-06,
"loss": 26.4504,
"step": 2370
},
{
"epoch": 5.729468599033816,
"grad_norm": 6.539680480957031,
"learning_rate": 9.178743961352658e-06,
"loss": 25.6419,
"step": 2372
},
{
"epoch": 5.734299516908212,
"grad_norm": 7.3082756996154785,
"learning_rate": 9.017713365539453e-06,
"loss": 23.5667,
"step": 2374
},
{
"epoch": 5.739130434782608,
"grad_norm": 6.585788726806641,
"learning_rate": 8.85668276972625e-06,
"loss": 27.0034,
"step": 2376
},
{
"epoch": 5.743961352657005,
"grad_norm": 8.16417121887207,
"learning_rate": 8.695652173913044e-06,
"loss": 25.4119,
"step": 2378
},
{
"epoch": 5.748792270531401,
"grad_norm": 6.153932571411133,
"learning_rate": 8.53462157809984e-06,
"loss": 26.3388,
"step": 2380
},
{
"epoch": 5.753623188405797,
"grad_norm": 7.043217182159424,
"learning_rate": 8.373590982286635e-06,
"loss": 27.3709,
"step": 2382
},
{
"epoch": 5.758454106280193,
"grad_norm": 6.4633588790893555,
"learning_rate": 8.212560386473431e-06,
"loss": 28.7053,
"step": 2384
},
{
"epoch": 5.763285024154589,
"grad_norm": 7.188209056854248,
"learning_rate": 8.051529790660225e-06,
"loss": 26.8206,
"step": 2386
},
{
"epoch": 5.768115942028985,
"grad_norm": 6.451449394226074,
"learning_rate": 7.890499194847021e-06,
"loss": 24.2479,
"step": 2388
},
{
"epoch": 5.7729468599033815,
"grad_norm": 6.818403720855713,
"learning_rate": 7.729468599033817e-06,
"loss": 25.5073,
"step": 2390
},
{
"epoch": 5.777777777777778,
"grad_norm": 7.128567218780518,
"learning_rate": 7.568438003220612e-06,
"loss": 25.6976,
"step": 2392
},
{
"epoch": 5.782608695652174,
"grad_norm": 7.445803165435791,
"learning_rate": 7.4074074074074075e-06,
"loss": 25.5917,
"step": 2394
},
{
"epoch": 5.78743961352657,
"grad_norm": 6.30618953704834,
"learning_rate": 7.246376811594203e-06,
"loss": 24.4957,
"step": 2396
},
{
"epoch": 5.792270531400966,
"grad_norm": 6.549522399902344,
"learning_rate": 7.0853462157809985e-06,
"loss": 24.7214,
"step": 2398
},
{
"epoch": 5.797101449275362,
"grad_norm": 7.38835334777832,
"learning_rate": 6.924315619967794e-06,
"loss": 25.5254,
"step": 2400
},
{
"epoch": 5.8019323671497585,
"grad_norm": 5.928407669067383,
"learning_rate": 6.7632850241545894e-06,
"loss": 26.4047,
"step": 2402
},
{
"epoch": 5.806763285024155,
"grad_norm": 6.4094014167785645,
"learning_rate": 6.602254428341385e-06,
"loss": 27.6215,
"step": 2404
},
{
"epoch": 5.811594202898551,
"grad_norm": 6.558480739593506,
"learning_rate": 6.44122383252818e-06,
"loss": 26.5965,
"step": 2406
},
{
"epoch": 5.816425120772947,
"grad_norm": 6.696255207061768,
"learning_rate": 6.280193236714976e-06,
"loss": 26.2955,
"step": 2408
},
{
"epoch": 5.821256038647343,
"grad_norm": 6.232416152954102,
"learning_rate": 6.119162640901771e-06,
"loss": 27.1097,
"step": 2410
},
{
"epoch": 5.826086956521739,
"grad_norm": 6.8521199226379395,
"learning_rate": 5.958132045088567e-06,
"loss": 22.8195,
"step": 2412
},
{
"epoch": 5.830917874396135,
"grad_norm": 6.833296298980713,
"learning_rate": 5.797101449275362e-06,
"loss": 26.155,
"step": 2414
},
{
"epoch": 5.835748792270532,
"grad_norm": 7.534513473510742,
"learning_rate": 5.636070853462158e-06,
"loss": 25.9633,
"step": 2416
},
{
"epoch": 5.840579710144928,
"grad_norm": 7.544939041137695,
"learning_rate": 5.475040257648953e-06,
"loss": 26.5301,
"step": 2418
},
{
"epoch": 5.845410628019324,
"grad_norm": 6.818538188934326,
"learning_rate": 5.314009661835749e-06,
"loss": 27.4522,
"step": 2420
},
{
"epoch": 5.85024154589372,
"grad_norm": 6.0586395263671875,
"learning_rate": 5.152979066022544e-06,
"loss": 24.9713,
"step": 2422
},
{
"epoch": 5.855072463768116,
"grad_norm": 6.871267318725586,
"learning_rate": 4.99194847020934e-06,
"loss": 24.1245,
"step": 2424
},
{
"epoch": 5.859903381642512,
"grad_norm": 6.431079387664795,
"learning_rate": 4.830917874396135e-06,
"loss": 26.3168,
"step": 2426
},
{
"epoch": 5.8647342995169085,
"grad_norm": 6.309189319610596,
"learning_rate": 4.669887278582931e-06,
"loss": 25.1819,
"step": 2428
},
{
"epoch": 5.869565217391305,
"grad_norm": 7.3601250648498535,
"learning_rate": 4.508856682769726e-06,
"loss": 25.487,
"step": 2430
},
{
"epoch": 5.874396135265701,
"grad_norm": 6.830559730529785,
"learning_rate": 4.347826086956522e-06,
"loss": 24.2465,
"step": 2432
},
{
"epoch": 5.879227053140097,
"grad_norm": 6.231956481933594,
"learning_rate": 4.186795491143317e-06,
"loss": 25.4524,
"step": 2434
},
{
"epoch": 5.884057971014493,
"grad_norm": 7.170751094818115,
"learning_rate": 4.025764895330112e-06,
"loss": 25.4575,
"step": 2436
},
{
"epoch": 5.888888888888889,
"grad_norm": 6.459787845611572,
"learning_rate": 3.864734299516908e-06,
"loss": 25.8052,
"step": 2438
},
{
"epoch": 5.8937198067632846,
"grad_norm": 7.013184070587158,
"learning_rate": 3.7037037037037037e-06,
"loss": 25.0231,
"step": 2440
},
{
"epoch": 5.898550724637682,
"grad_norm": 6.488290786743164,
"learning_rate": 3.5426731078904992e-06,
"loss": 26.5961,
"step": 2442
},
{
"epoch": 5.903381642512077,
"grad_norm": 6.819639205932617,
"learning_rate": 3.3816425120772947e-06,
"loss": 24.8935,
"step": 2444
},
{
"epoch": 5.908212560386474,
"grad_norm": 6.606305122375488,
"learning_rate": 3.22061191626409e-06,
"loss": 25.7707,
"step": 2446
},
{
"epoch": 5.913043478260869,
"grad_norm": 6.314495086669922,
"learning_rate": 3.0595813204508857e-06,
"loss": 25.5029,
"step": 2448
},
{
"epoch": 5.917874396135265,
"grad_norm": 6.191902160644531,
"learning_rate": 2.898550724637681e-06,
"loss": 24.0935,
"step": 2450
},
{
"epoch": 5.9227053140096615,
"grad_norm": 7.267618179321289,
"learning_rate": 2.7375201288244767e-06,
"loss": 26.7584,
"step": 2452
},
{
"epoch": 5.927536231884058,
"grad_norm": 6.361123561859131,
"learning_rate": 2.576489533011272e-06,
"loss": 26.5634,
"step": 2454
},
{
"epoch": 5.932367149758454,
"grad_norm": 6.8421173095703125,
"learning_rate": 2.4154589371980677e-06,
"loss": 25.2036,
"step": 2456
},
{
"epoch": 5.93719806763285,
"grad_norm": 6.87398099899292,
"learning_rate": 2.254428341384863e-06,
"loss": 26.766,
"step": 2458
},
{
"epoch": 5.942028985507246,
"grad_norm": 10.025320053100586,
"learning_rate": 2.0933977455716586e-06,
"loss": 24.7975,
"step": 2460
},
{
"epoch": 5.946859903381642,
"grad_norm": 6.717752933502197,
"learning_rate": 1.932367149758454e-06,
"loss": 26.9315,
"step": 2462
},
{
"epoch": 5.951690821256038,
"grad_norm": 6.499180793762207,
"learning_rate": 1.7713365539452496e-06,
"loss": 25.7196,
"step": 2464
},
{
"epoch": 5.956521739130435,
"grad_norm": 6.772797107696533,
"learning_rate": 1.610305958132045e-06,
"loss": 23.9813,
"step": 2466
},
{
"epoch": 5.961352657004831,
"grad_norm": 6.387327671051025,
"learning_rate": 1.4492753623188406e-06,
"loss": 24.631,
"step": 2468
},
{
"epoch": 5.966183574879227,
"grad_norm": 6.289485931396484,
"learning_rate": 1.288244766505636e-06,
"loss": 26.311,
"step": 2470
},
{
"epoch": 5.971014492753623,
"grad_norm": 6.260473251342773,
"learning_rate": 1.1272141706924316e-06,
"loss": 25.3955,
"step": 2472
},
{
"epoch": 5.975845410628019,
"grad_norm": 6.831587791442871,
"learning_rate": 9.66183574879227e-07,
"loss": 26.9048,
"step": 2474
},
{
"epoch": 5.980676328502415,
"grad_norm": 7.09013032913208,
"learning_rate": 8.051529790660226e-07,
"loss": 26.662,
"step": 2476
},
{
"epoch": 5.9855072463768115,
"grad_norm": 6.909030914306641,
"learning_rate": 6.44122383252818e-07,
"loss": 24.5214,
"step": 2478
},
{
"epoch": 5.990338164251208,
"grad_norm": 6.548914432525635,
"learning_rate": 4.830917874396135e-07,
"loss": 26.6767,
"step": 2480
},
{
"epoch": 5.995169082125604,
"grad_norm": 6.59926176071167,
"learning_rate": 3.22061191626409e-07,
"loss": 27.5884,
"step": 2482
},
{
"epoch": 6.0,
"grad_norm": 6.204819679260254,
"learning_rate": 1.610305958132045e-07,
"loss": 25.388,
"step": 2484
},
{
"epoch": 6.0,
"step": 2484,
"total_flos": 629711046062928.0,
"train_loss": 27.644595153857736,
"train_runtime": 6156.0959,
"train_samples_per_second": 6.456,
"train_steps_per_second": 0.404
}
],
"logging_steps": 2,
"max_steps": 2484,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 629711046062928.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}