VyvoTTS-LFM2-Heizou / trainer_state.json
kadirnar's picture
Upload folder using huggingface_hub
b103a4d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 676,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014792899408284023,
"grad_norm": 19.051719665527344,
"learning_rate": 0.0002,
"loss": 11.4654,
"step": 1
},
{
"epoch": 0.0029585798816568047,
"grad_norm": 23.727689743041992,
"learning_rate": 0.00019970414201183435,
"loss": 9.7769,
"step": 2
},
{
"epoch": 0.004437869822485207,
"grad_norm": 8.161388397216797,
"learning_rate": 0.00019940828402366864,
"loss": 8.0434,
"step": 3
},
{
"epoch": 0.005917159763313609,
"grad_norm": 7.546419620513916,
"learning_rate": 0.00019911242603550298,
"loss": 7.4566,
"step": 4
},
{
"epoch": 0.0073964497041420114,
"grad_norm": 5.024906158447266,
"learning_rate": 0.0001988165680473373,
"loss": 7.1297,
"step": 5
},
{
"epoch": 0.008875739644970414,
"grad_norm": 4.575231075286865,
"learning_rate": 0.0001985207100591716,
"loss": 6.7507,
"step": 6
},
{
"epoch": 0.010355029585798817,
"grad_norm": 3.720844268798828,
"learning_rate": 0.00019822485207100592,
"loss": 6.7162,
"step": 7
},
{
"epoch": 0.011834319526627219,
"grad_norm": 3.82381534576416,
"learning_rate": 0.00019792899408284023,
"loss": 6.4728,
"step": 8
},
{
"epoch": 0.013313609467455622,
"grad_norm": 3.960883140563965,
"learning_rate": 0.00019763313609467458,
"loss": 6.0401,
"step": 9
},
{
"epoch": 0.014792899408284023,
"grad_norm": 11.000751495361328,
"learning_rate": 0.0001973372781065089,
"loss": 6.2245,
"step": 10
},
{
"epoch": 0.016272189349112426,
"grad_norm": 6.994083404541016,
"learning_rate": 0.0001970414201183432,
"loss": 5.9574,
"step": 11
},
{
"epoch": 0.01775147928994083,
"grad_norm": 5.17557430267334,
"learning_rate": 0.00019674556213017752,
"loss": 5.8088,
"step": 12
},
{
"epoch": 0.019230769230769232,
"grad_norm": 5.766794681549072,
"learning_rate": 0.00019644970414201186,
"loss": 5.8526,
"step": 13
},
{
"epoch": 0.020710059171597635,
"grad_norm": 8.630630493164062,
"learning_rate": 0.00019615384615384615,
"loss": 5.7402,
"step": 14
},
{
"epoch": 0.022189349112426034,
"grad_norm": 8.114038467407227,
"learning_rate": 0.00019585798816568049,
"loss": 5.657,
"step": 15
},
{
"epoch": 0.023668639053254437,
"grad_norm": 4.028295516967773,
"learning_rate": 0.0001955621301775148,
"loss": 5.6914,
"step": 16
},
{
"epoch": 0.02514792899408284,
"grad_norm": 4.497015953063965,
"learning_rate": 0.00019526627218934911,
"loss": 5.5357,
"step": 17
},
{
"epoch": 0.026627218934911243,
"grad_norm": 6.016404628753662,
"learning_rate": 0.00019497041420118345,
"loss": 5.8588,
"step": 18
},
{
"epoch": 0.028106508875739646,
"grad_norm": 3.9681546688079834,
"learning_rate": 0.00019467455621301777,
"loss": 5.6267,
"step": 19
},
{
"epoch": 0.029585798816568046,
"grad_norm": 5.046078681945801,
"learning_rate": 0.00019437869822485208,
"loss": 5.8643,
"step": 20
},
{
"epoch": 0.03106508875739645,
"grad_norm": 5.350937366485596,
"learning_rate": 0.0001940828402366864,
"loss": 5.4822,
"step": 21
},
{
"epoch": 0.03254437869822485,
"grad_norm": 5.343432426452637,
"learning_rate": 0.0001937869822485207,
"loss": 5.5491,
"step": 22
},
{
"epoch": 0.034023668639053255,
"grad_norm": 5.352399826049805,
"learning_rate": 0.00019349112426035502,
"loss": 5.4912,
"step": 23
},
{
"epoch": 0.03550295857988166,
"grad_norm": 4.353022575378418,
"learning_rate": 0.00019319526627218937,
"loss": 5.4095,
"step": 24
},
{
"epoch": 0.03698224852071006,
"grad_norm": 5.44649076461792,
"learning_rate": 0.00019289940828402368,
"loss": 5.6072,
"step": 25
},
{
"epoch": 0.038461538461538464,
"grad_norm": 3.686049699783325,
"learning_rate": 0.000192603550295858,
"loss": 5.4506,
"step": 26
},
{
"epoch": 0.03994082840236687,
"grad_norm": 5.867905139923096,
"learning_rate": 0.00019230769230769233,
"loss": 5.0424,
"step": 27
},
{
"epoch": 0.04142011834319527,
"grad_norm": 4.650610446929932,
"learning_rate": 0.00019201183431952662,
"loss": 5.4751,
"step": 28
},
{
"epoch": 0.042899408284023666,
"grad_norm": 7.5463361740112305,
"learning_rate": 0.00019171597633136096,
"loss": 5.1818,
"step": 29
},
{
"epoch": 0.04437869822485207,
"grad_norm": 4.3337297439575195,
"learning_rate": 0.00019142011834319528,
"loss": 5.2133,
"step": 30
},
{
"epoch": 0.04585798816568047,
"grad_norm": 3.1711843013763428,
"learning_rate": 0.0001911242603550296,
"loss": 5.552,
"step": 31
},
{
"epoch": 0.047337278106508875,
"grad_norm": 3.5800342559814453,
"learning_rate": 0.00019082840236686393,
"loss": 5.4874,
"step": 32
},
{
"epoch": 0.04881656804733728,
"grad_norm": 3.3150734901428223,
"learning_rate": 0.00019053254437869822,
"loss": 5.5885,
"step": 33
},
{
"epoch": 0.05029585798816568,
"grad_norm": 4.20947790145874,
"learning_rate": 0.00019023668639053256,
"loss": 5.156,
"step": 34
},
{
"epoch": 0.051775147928994084,
"grad_norm": 8.529911041259766,
"learning_rate": 0.00018994082840236687,
"loss": 4.8622,
"step": 35
},
{
"epoch": 0.05325443786982249,
"grad_norm": 4.638500690460205,
"learning_rate": 0.0001896449704142012,
"loss": 5.4578,
"step": 36
},
{
"epoch": 0.05473372781065089,
"grad_norm": 4.114462375640869,
"learning_rate": 0.0001893491124260355,
"loss": 5.596,
"step": 37
},
{
"epoch": 0.05621301775147929,
"grad_norm": 3.748403310775757,
"learning_rate": 0.00018905325443786984,
"loss": 5.5348,
"step": 38
},
{
"epoch": 0.057692307692307696,
"grad_norm": 3.6923162937164307,
"learning_rate": 0.00018875739644970416,
"loss": 5.4819,
"step": 39
},
{
"epoch": 0.05917159763313609,
"grad_norm": 23.383670806884766,
"learning_rate": 0.00018846153846153847,
"loss": 5.0785,
"step": 40
},
{
"epoch": 0.060650887573964495,
"grad_norm": 3.344937324523926,
"learning_rate": 0.00018816568047337278,
"loss": 5.1104,
"step": 41
},
{
"epoch": 0.0621301775147929,
"grad_norm": 5.313603401184082,
"learning_rate": 0.0001878698224852071,
"loss": 5.5287,
"step": 42
},
{
"epoch": 0.06360946745562131,
"grad_norm": 4.41381311416626,
"learning_rate": 0.00018757396449704144,
"loss": 5.3752,
"step": 43
},
{
"epoch": 0.0650887573964497,
"grad_norm": 3.2099368572235107,
"learning_rate": 0.00018727810650887573,
"loss": 5.3049,
"step": 44
},
{
"epoch": 0.06656804733727811,
"grad_norm": 3.733710765838623,
"learning_rate": 0.00018698224852071007,
"loss": 5.1356,
"step": 45
},
{
"epoch": 0.06804733727810651,
"grad_norm": 3.271888494491577,
"learning_rate": 0.0001866863905325444,
"loss": 5.1147,
"step": 46
},
{
"epoch": 0.0695266272189349,
"grad_norm": 6.020442008972168,
"learning_rate": 0.0001863905325443787,
"loss": 5.7782,
"step": 47
},
{
"epoch": 0.07100591715976332,
"grad_norm": 3.2710890769958496,
"learning_rate": 0.00018609467455621304,
"loss": 5.3381,
"step": 48
},
{
"epoch": 0.07248520710059171,
"grad_norm": 3.6689212322235107,
"learning_rate": 0.00018579881656804735,
"loss": 5.0549,
"step": 49
},
{
"epoch": 0.07396449704142012,
"grad_norm": 3.1567306518554688,
"learning_rate": 0.00018550295857988166,
"loss": 5.0467,
"step": 50
},
{
"epoch": 0.07544378698224852,
"grad_norm": 4.436924934387207,
"learning_rate": 0.00018520710059171598,
"loss": 5.1077,
"step": 51
},
{
"epoch": 0.07692307692307693,
"grad_norm": 3.6421241760253906,
"learning_rate": 0.00018491124260355032,
"loss": 5.2728,
"step": 52
},
{
"epoch": 0.07840236686390532,
"grad_norm": 3.36057186126709,
"learning_rate": 0.00018461538461538463,
"loss": 4.8919,
"step": 53
},
{
"epoch": 0.07988165680473373,
"grad_norm": 3.2132999897003174,
"learning_rate": 0.00018431952662721895,
"loss": 5.1664,
"step": 54
},
{
"epoch": 0.08136094674556213,
"grad_norm": 3.4055607318878174,
"learning_rate": 0.00018402366863905326,
"loss": 5.193,
"step": 55
},
{
"epoch": 0.08284023668639054,
"grad_norm": 6.030857086181641,
"learning_rate": 0.00018372781065088757,
"loss": 5.1533,
"step": 56
},
{
"epoch": 0.08431952662721894,
"grad_norm": 3.724673271179199,
"learning_rate": 0.00018343195266272192,
"loss": 5.3483,
"step": 57
},
{
"epoch": 0.08579881656804733,
"grad_norm": 2.424447536468506,
"learning_rate": 0.0001831360946745562,
"loss": 5.2737,
"step": 58
},
{
"epoch": 0.08727810650887574,
"grad_norm": 4.9499430656433105,
"learning_rate": 0.00018284023668639054,
"loss": 4.7834,
"step": 59
},
{
"epoch": 0.08875739644970414,
"grad_norm": 2.8640456199645996,
"learning_rate": 0.00018254437869822488,
"loss": 5.204,
"step": 60
},
{
"epoch": 0.09023668639053255,
"grad_norm": 4.299962043762207,
"learning_rate": 0.00018224852071005917,
"loss": 4.9461,
"step": 61
},
{
"epoch": 0.09171597633136094,
"grad_norm": 3.068300724029541,
"learning_rate": 0.0001819526627218935,
"loss": 5.1019,
"step": 62
},
{
"epoch": 0.09319526627218935,
"grad_norm": 3.216639995574951,
"learning_rate": 0.00018165680473372783,
"loss": 5.43,
"step": 63
},
{
"epoch": 0.09467455621301775,
"grad_norm": 4.38277530670166,
"learning_rate": 0.00018136094674556214,
"loss": 5.3557,
"step": 64
},
{
"epoch": 0.09615384615384616,
"grad_norm": 2.8199565410614014,
"learning_rate": 0.00018106508875739645,
"loss": 5.4996,
"step": 65
},
{
"epoch": 0.09763313609467456,
"grad_norm": 4.008239269256592,
"learning_rate": 0.00018076923076923077,
"loss": 5.1045,
"step": 66
},
{
"epoch": 0.09911242603550297,
"grad_norm": 3.61327862739563,
"learning_rate": 0.00018047337278106508,
"loss": 5.2507,
"step": 67
},
{
"epoch": 0.10059171597633136,
"grad_norm": 4.783330917358398,
"learning_rate": 0.00018017751479289942,
"loss": 5.0782,
"step": 68
},
{
"epoch": 0.10207100591715976,
"grad_norm": 3.5379090309143066,
"learning_rate": 0.00017988165680473374,
"loss": 5.3468,
"step": 69
},
{
"epoch": 0.10355029585798817,
"grad_norm": 3.339912176132202,
"learning_rate": 0.00017958579881656805,
"loss": 5.2642,
"step": 70
},
{
"epoch": 0.10502958579881656,
"grad_norm": 3.801499605178833,
"learning_rate": 0.0001792899408284024,
"loss": 5.3676,
"step": 71
},
{
"epoch": 0.10650887573964497,
"grad_norm": 5.259742259979248,
"learning_rate": 0.00017899408284023668,
"loss": 5.0667,
"step": 72
},
{
"epoch": 0.10798816568047337,
"grad_norm": 3.887979507446289,
"learning_rate": 0.00017869822485207102,
"loss": 5.3508,
"step": 73
},
{
"epoch": 0.10946745562130178,
"grad_norm": 2.761725425720215,
"learning_rate": 0.00017840236686390533,
"loss": 5.3571,
"step": 74
},
{
"epoch": 0.11094674556213018,
"grad_norm": 4.331815719604492,
"learning_rate": 0.00017810650887573965,
"loss": 4.9632,
"step": 75
},
{
"epoch": 0.11242603550295859,
"grad_norm": 3.6547133922576904,
"learning_rate": 0.000177810650887574,
"loss": 5.1787,
"step": 76
},
{
"epoch": 0.11390532544378698,
"grad_norm": 3.22377347946167,
"learning_rate": 0.00017751479289940828,
"loss": 5.3842,
"step": 77
},
{
"epoch": 0.11538461538461539,
"grad_norm": 6.254403591156006,
"learning_rate": 0.00017721893491124262,
"loss": 5.1528,
"step": 78
},
{
"epoch": 0.11686390532544379,
"grad_norm": 3.523214101791382,
"learning_rate": 0.00017692307692307693,
"loss": 5.3192,
"step": 79
},
{
"epoch": 0.11834319526627218,
"grad_norm": 3.261544704437256,
"learning_rate": 0.00017662721893491124,
"loss": 5.0825,
"step": 80
},
{
"epoch": 0.11982248520710059,
"grad_norm": 3.994359016418457,
"learning_rate": 0.00017633136094674556,
"loss": 5.1593,
"step": 81
},
{
"epoch": 0.12130177514792899,
"grad_norm": 2.5975234508514404,
"learning_rate": 0.0001760355029585799,
"loss": 5.4471,
"step": 82
},
{
"epoch": 0.1227810650887574,
"grad_norm": 5.125651836395264,
"learning_rate": 0.0001757396449704142,
"loss": 4.8965,
"step": 83
},
{
"epoch": 0.1242603550295858,
"grad_norm": 4.02432918548584,
"learning_rate": 0.00017544378698224853,
"loss": 5.1502,
"step": 84
},
{
"epoch": 0.1257396449704142,
"grad_norm": 4.148054599761963,
"learning_rate": 0.00017514792899408287,
"loss": 4.9383,
"step": 85
},
{
"epoch": 0.12721893491124261,
"grad_norm": 4.669896125793457,
"learning_rate": 0.00017485207100591716,
"loss": 5.1252,
"step": 86
},
{
"epoch": 0.128698224852071,
"grad_norm": 3.3551809787750244,
"learning_rate": 0.0001745562130177515,
"loss": 4.8864,
"step": 87
},
{
"epoch": 0.1301775147928994,
"grad_norm": 4.218849182128906,
"learning_rate": 0.0001742603550295858,
"loss": 5.963,
"step": 88
},
{
"epoch": 0.13165680473372782,
"grad_norm": 3.2162094116210938,
"learning_rate": 0.00017396449704142012,
"loss": 5.0602,
"step": 89
},
{
"epoch": 0.13313609467455623,
"grad_norm": 3.9395463466644287,
"learning_rate": 0.00017366863905325447,
"loss": 5.1657,
"step": 90
},
{
"epoch": 0.1346153846153846,
"grad_norm": 3.0270235538482666,
"learning_rate": 0.00017337278106508875,
"loss": 4.956,
"step": 91
},
{
"epoch": 0.13609467455621302,
"grad_norm": 4.6423726081848145,
"learning_rate": 0.0001730769230769231,
"loss": 5.2184,
"step": 92
},
{
"epoch": 0.13757396449704143,
"grad_norm": 3.070094347000122,
"learning_rate": 0.0001727810650887574,
"loss": 5.3557,
"step": 93
},
{
"epoch": 0.1390532544378698,
"grad_norm": 4.294172286987305,
"learning_rate": 0.00017248520710059172,
"loss": 4.9325,
"step": 94
},
{
"epoch": 0.14053254437869822,
"grad_norm": 2.961416721343994,
"learning_rate": 0.00017218934911242603,
"loss": 5.0524,
"step": 95
},
{
"epoch": 0.14201183431952663,
"grad_norm": 2.807966470718384,
"learning_rate": 0.00017189349112426038,
"loss": 4.9914,
"step": 96
},
{
"epoch": 0.14349112426035504,
"grad_norm": 4.755406379699707,
"learning_rate": 0.0001715976331360947,
"loss": 5.295,
"step": 97
},
{
"epoch": 0.14497041420118342,
"grad_norm": 2.498399257659912,
"learning_rate": 0.000171301775147929,
"loss": 5.3439,
"step": 98
},
{
"epoch": 0.14644970414201183,
"grad_norm": 3.2769603729248047,
"learning_rate": 0.00017100591715976332,
"loss": 5.0286,
"step": 99
},
{
"epoch": 0.14792899408284024,
"grad_norm": 3.3242433071136475,
"learning_rate": 0.00017071005917159763,
"loss": 5.2284,
"step": 100
},
{
"epoch": 0.14940828402366865,
"grad_norm": 3.9057908058166504,
"learning_rate": 0.00017041420118343197,
"loss": 5.1046,
"step": 101
},
{
"epoch": 0.15088757396449703,
"grad_norm": 2.3905794620513916,
"learning_rate": 0.00017011834319526626,
"loss": 4.9429,
"step": 102
},
{
"epoch": 0.15236686390532544,
"grad_norm": 2.85048246383667,
"learning_rate": 0.0001698224852071006,
"loss": 5.1951,
"step": 103
},
{
"epoch": 0.15384615384615385,
"grad_norm": 3.160964012145996,
"learning_rate": 0.00016952662721893494,
"loss": 5.0117,
"step": 104
},
{
"epoch": 0.15532544378698224,
"grad_norm": 6.133135795593262,
"learning_rate": 0.00016923076923076923,
"loss": 5.3851,
"step": 105
},
{
"epoch": 0.15680473372781065,
"grad_norm": 4.323611259460449,
"learning_rate": 0.00016893491124260357,
"loss": 4.9845,
"step": 106
},
{
"epoch": 0.15828402366863906,
"grad_norm": 3.989495038986206,
"learning_rate": 0.00016863905325443788,
"loss": 5.0833,
"step": 107
},
{
"epoch": 0.15976331360946747,
"grad_norm": 2.9743452072143555,
"learning_rate": 0.0001683431952662722,
"loss": 5.4684,
"step": 108
},
{
"epoch": 0.16124260355029585,
"grad_norm": 3.270275592803955,
"learning_rate": 0.0001680473372781065,
"loss": 4.861,
"step": 109
},
{
"epoch": 0.16272189349112426,
"grad_norm": 3.555215358734131,
"learning_rate": 0.00016775147928994083,
"loss": 5.0709,
"step": 110
},
{
"epoch": 0.16420118343195267,
"grad_norm": 4.3059306144714355,
"learning_rate": 0.00016745562130177514,
"loss": 5.5366,
"step": 111
},
{
"epoch": 0.16568047337278108,
"grad_norm": 3.379615068435669,
"learning_rate": 0.00016715976331360948,
"loss": 4.9129,
"step": 112
},
{
"epoch": 0.16715976331360946,
"grad_norm": 5.602575302124023,
"learning_rate": 0.0001668639053254438,
"loss": 4.8603,
"step": 113
},
{
"epoch": 0.16863905325443787,
"grad_norm": 2.965013265609741,
"learning_rate": 0.0001665680473372781,
"loss": 5.2722,
"step": 114
},
{
"epoch": 0.17011834319526628,
"grad_norm": 2.8539206981658936,
"learning_rate": 0.00016627218934911245,
"loss": 5.217,
"step": 115
},
{
"epoch": 0.17159763313609466,
"grad_norm": 2.861959457397461,
"learning_rate": 0.00016597633136094674,
"loss": 4.9753,
"step": 116
},
{
"epoch": 0.17307692307692307,
"grad_norm": 3.451565742492676,
"learning_rate": 0.00016568047337278108,
"loss": 4.9347,
"step": 117
},
{
"epoch": 0.17455621301775148,
"grad_norm": 2.7960565090179443,
"learning_rate": 0.0001653846153846154,
"loss": 5.2664,
"step": 118
},
{
"epoch": 0.1760355029585799,
"grad_norm": 3.4076054096221924,
"learning_rate": 0.0001650887573964497,
"loss": 4.8754,
"step": 119
},
{
"epoch": 0.17751479289940827,
"grad_norm": 3.4976394176483154,
"learning_rate": 0.00016479289940828405,
"loss": 5.0685,
"step": 120
},
{
"epoch": 0.17899408284023668,
"grad_norm": 2.982083320617676,
"learning_rate": 0.00016449704142011836,
"loss": 5.2266,
"step": 121
},
{
"epoch": 0.1804733727810651,
"grad_norm": 3.0358190536499023,
"learning_rate": 0.00016420118343195267,
"loss": 5.1363,
"step": 122
},
{
"epoch": 0.1819526627218935,
"grad_norm": 3.6040701866149902,
"learning_rate": 0.000163905325443787,
"loss": 4.8832,
"step": 123
},
{
"epoch": 0.1834319526627219,
"grad_norm": 4.876418590545654,
"learning_rate": 0.0001636094674556213,
"loss": 4.7069,
"step": 124
},
{
"epoch": 0.1849112426035503,
"grad_norm": 6.96181058883667,
"learning_rate": 0.00016331360946745562,
"loss": 4.6315,
"step": 125
},
{
"epoch": 0.1863905325443787,
"grad_norm": 2.648008108139038,
"learning_rate": 0.00016301775147928996,
"loss": 5.3977,
"step": 126
},
{
"epoch": 0.1878698224852071,
"grad_norm": 3.0898566246032715,
"learning_rate": 0.00016272189349112427,
"loss": 4.9533,
"step": 127
},
{
"epoch": 0.1893491124260355,
"grad_norm": 3.143807888031006,
"learning_rate": 0.00016242603550295858,
"loss": 5.3733,
"step": 128
},
{
"epoch": 0.1908284023668639,
"grad_norm": 3.230045795440674,
"learning_rate": 0.00016213017751479293,
"loss": 4.8937,
"step": 129
},
{
"epoch": 0.19230769230769232,
"grad_norm": 3.5977890491485596,
"learning_rate": 0.0001618343195266272,
"loss": 5.0298,
"step": 130
},
{
"epoch": 0.1937869822485207,
"grad_norm": 3.7372870445251465,
"learning_rate": 0.00016153846153846155,
"loss": 4.8582,
"step": 131
},
{
"epoch": 0.1952662721893491,
"grad_norm": 3.4929022789001465,
"learning_rate": 0.00016124260355029587,
"loss": 5.043,
"step": 132
},
{
"epoch": 0.19674556213017752,
"grad_norm": 3.389169931411743,
"learning_rate": 0.00016094674556213018,
"loss": 5.196,
"step": 133
},
{
"epoch": 0.19822485207100593,
"grad_norm": 5.478149890899658,
"learning_rate": 0.00016065088757396452,
"loss": 4.8719,
"step": 134
},
{
"epoch": 0.1997041420118343,
"grad_norm": 2.303262233734131,
"learning_rate": 0.0001603550295857988,
"loss": 5.1638,
"step": 135
},
{
"epoch": 0.20118343195266272,
"grad_norm": 3.297595739364624,
"learning_rate": 0.00016005917159763315,
"loss": 4.8086,
"step": 136
},
{
"epoch": 0.20266272189349113,
"grad_norm": 3.509148120880127,
"learning_rate": 0.00015976331360946746,
"loss": 4.8932,
"step": 137
},
{
"epoch": 0.20414201183431951,
"grad_norm": 3.4905240535736084,
"learning_rate": 0.00015946745562130178,
"loss": 5.0621,
"step": 138
},
{
"epoch": 0.20562130177514792,
"grad_norm": 3.4399490356445312,
"learning_rate": 0.0001591715976331361,
"loss": 5.1894,
"step": 139
},
{
"epoch": 0.20710059171597633,
"grad_norm": 3.0734777450561523,
"learning_rate": 0.00015887573964497043,
"loss": 4.9211,
"step": 140
},
{
"epoch": 0.20857988165680474,
"grad_norm": 3.179959297180176,
"learning_rate": 0.00015857988165680475,
"loss": 5.2447,
"step": 141
},
{
"epoch": 0.21005917159763313,
"grad_norm": 4.863833427429199,
"learning_rate": 0.00015828402366863906,
"loss": 5.0381,
"step": 142
},
{
"epoch": 0.21153846153846154,
"grad_norm": 5.0287556648254395,
"learning_rate": 0.00015798816568047337,
"loss": 4.7826,
"step": 143
},
{
"epoch": 0.21301775147928995,
"grad_norm": 2.806042194366455,
"learning_rate": 0.0001576923076923077,
"loss": 5.1359,
"step": 144
},
{
"epoch": 0.21449704142011836,
"grad_norm": 3.897864818572998,
"learning_rate": 0.00015739644970414203,
"loss": 5.2164,
"step": 145
},
{
"epoch": 0.21597633136094674,
"grad_norm": 3.212893009185791,
"learning_rate": 0.00015710059171597634,
"loss": 4.9873,
"step": 146
},
{
"epoch": 0.21745562130177515,
"grad_norm": 3.277351140975952,
"learning_rate": 0.00015680473372781066,
"loss": 4.9393,
"step": 147
},
{
"epoch": 0.21893491124260356,
"grad_norm": 3.687052011489868,
"learning_rate": 0.000156508875739645,
"loss": 5.0331,
"step": 148
},
{
"epoch": 0.22041420118343194,
"grad_norm": 2.938491106033325,
"learning_rate": 0.00015621301775147929,
"loss": 4.9259,
"step": 149
},
{
"epoch": 0.22189349112426035,
"grad_norm": 3.9487521648406982,
"learning_rate": 0.00015591715976331363,
"loss": 4.7096,
"step": 150
},
{
"epoch": 0.22337278106508876,
"grad_norm": 3.2990753650665283,
"learning_rate": 0.00015562130177514794,
"loss": 5.2393,
"step": 151
},
{
"epoch": 0.22485207100591717,
"grad_norm": 4.3339762687683105,
"learning_rate": 0.00015532544378698225,
"loss": 4.6888,
"step": 152
},
{
"epoch": 0.22633136094674555,
"grad_norm": 2.981172561645508,
"learning_rate": 0.00015502958579881657,
"loss": 5.1174,
"step": 153
},
{
"epoch": 0.22781065088757396,
"grad_norm": 3.2095184326171875,
"learning_rate": 0.0001547337278106509,
"loss": 5.5025,
"step": 154
},
{
"epoch": 0.22928994082840237,
"grad_norm": 3.3522627353668213,
"learning_rate": 0.0001544378698224852,
"loss": 5.0102,
"step": 155
},
{
"epoch": 0.23076923076923078,
"grad_norm": 3.347395658493042,
"learning_rate": 0.00015414201183431954,
"loss": 5.3176,
"step": 156
},
{
"epoch": 0.23224852071005916,
"grad_norm": 4.909267902374268,
"learning_rate": 0.00015384615384615385,
"loss": 4.7972,
"step": 157
},
{
"epoch": 0.23372781065088757,
"grad_norm": 3.8333449363708496,
"learning_rate": 0.00015355029585798817,
"loss": 4.829,
"step": 158
},
{
"epoch": 0.23520710059171598,
"grad_norm": 4.241271495819092,
"learning_rate": 0.0001532544378698225,
"loss": 4.9222,
"step": 159
},
{
"epoch": 0.23668639053254437,
"grad_norm": 2.662621021270752,
"learning_rate": 0.0001529585798816568,
"loss": 4.8007,
"step": 160
},
{
"epoch": 0.23816568047337278,
"grad_norm": 4.020458221435547,
"learning_rate": 0.00015266272189349113,
"loss": 5.0829,
"step": 161
},
{
"epoch": 0.23964497041420119,
"grad_norm": 3.640207052230835,
"learning_rate": 0.00015236686390532545,
"loss": 5.0856,
"step": 162
},
{
"epoch": 0.2411242603550296,
"grad_norm": 3.3714780807495117,
"learning_rate": 0.00015207100591715976,
"loss": 5.0783,
"step": 163
},
{
"epoch": 0.24260355029585798,
"grad_norm": 3.97548508644104,
"learning_rate": 0.0001517751479289941,
"loss": 4.8682,
"step": 164
},
{
"epoch": 0.2440828402366864,
"grad_norm": 2.2095658779144287,
"learning_rate": 0.00015147928994082842,
"loss": 5.2263,
"step": 165
},
{
"epoch": 0.2455621301775148,
"grad_norm": 3.80568265914917,
"learning_rate": 0.00015118343195266273,
"loss": 4.7762,
"step": 166
},
{
"epoch": 0.2470414201183432,
"grad_norm": 3.4083971977233887,
"learning_rate": 0.00015088757396449705,
"loss": 5.2287,
"step": 167
},
{
"epoch": 0.2485207100591716,
"grad_norm": 5.997743606567383,
"learning_rate": 0.00015059171597633136,
"loss": 4.7403,
"step": 168
},
{
"epoch": 0.25,
"grad_norm": 3.7599215507507324,
"learning_rate": 0.00015029585798816567,
"loss": 4.7957,
"step": 169
},
{
"epoch": 0.2514792899408284,
"grad_norm": 2.9996368885040283,
"learning_rate": 0.00015000000000000001,
"loss": 5.0695,
"step": 170
},
{
"epoch": 0.2529585798816568,
"grad_norm": 3.03244686126709,
"learning_rate": 0.00014970414201183433,
"loss": 4.7598,
"step": 171
},
{
"epoch": 0.25443786982248523,
"grad_norm": 2.692690372467041,
"learning_rate": 0.00014940828402366864,
"loss": 5.17,
"step": 172
},
{
"epoch": 0.2559171597633136,
"grad_norm": 4.609525203704834,
"learning_rate": 0.00014911242603550298,
"loss": 4.8048,
"step": 173
},
{
"epoch": 0.257396449704142,
"grad_norm": 4.3488264083862305,
"learning_rate": 0.00014881656804733727,
"loss": 5.0132,
"step": 174
},
{
"epoch": 0.2588757396449704,
"grad_norm": 4.0892558097839355,
"learning_rate": 0.0001485207100591716,
"loss": 5.0826,
"step": 175
},
{
"epoch": 0.2603550295857988,
"grad_norm": 3.7176010608673096,
"learning_rate": 0.00014822485207100592,
"loss": 5.0371,
"step": 176
},
{
"epoch": 0.2618343195266272,
"grad_norm": 2.8590428829193115,
"learning_rate": 0.00014792899408284024,
"loss": 5.201,
"step": 177
},
{
"epoch": 0.26331360946745563,
"grad_norm": 4.960610866546631,
"learning_rate": 0.00014763313609467458,
"loss": 4.7447,
"step": 178
},
{
"epoch": 0.26479289940828404,
"grad_norm": 4.061326503753662,
"learning_rate": 0.0001473372781065089,
"loss": 5.2911,
"step": 179
},
{
"epoch": 0.26627218934911245,
"grad_norm": 3.747265338897705,
"learning_rate": 0.0001470414201183432,
"loss": 4.8774,
"step": 180
},
{
"epoch": 0.2677514792899408,
"grad_norm": 3.293882369995117,
"learning_rate": 0.00014674556213017752,
"loss": 5.1072,
"step": 181
},
{
"epoch": 0.2692307692307692,
"grad_norm": 3.514312505722046,
"learning_rate": 0.00014644970414201184,
"loss": 4.8875,
"step": 182
},
{
"epoch": 0.27071005917159763,
"grad_norm": 3.192707061767578,
"learning_rate": 0.00014615384615384615,
"loss": 4.7942,
"step": 183
},
{
"epoch": 0.27218934911242604,
"grad_norm": 4.239992141723633,
"learning_rate": 0.0001458579881656805,
"loss": 4.8851,
"step": 184
},
{
"epoch": 0.27366863905325445,
"grad_norm": 3.4382450580596924,
"learning_rate": 0.0001455621301775148,
"loss": 5.0304,
"step": 185
},
{
"epoch": 0.27514792899408286,
"grad_norm": 3.2028238773345947,
"learning_rate": 0.00014526627218934912,
"loss": 4.947,
"step": 186
},
{
"epoch": 0.27662721893491127,
"grad_norm": 3.0747134685516357,
"learning_rate": 0.00014497041420118346,
"loss": 4.9979,
"step": 187
},
{
"epoch": 0.2781065088757396,
"grad_norm": 3.7282187938690186,
"learning_rate": 0.00014467455621301775,
"loss": 4.8038,
"step": 188
},
{
"epoch": 0.27958579881656803,
"grad_norm": 3.0893115997314453,
"learning_rate": 0.0001443786982248521,
"loss": 5.1064,
"step": 189
},
{
"epoch": 0.28106508875739644,
"grad_norm": 4.98429012298584,
"learning_rate": 0.0001440828402366864,
"loss": 5.7315,
"step": 190
},
{
"epoch": 0.28254437869822485,
"grad_norm": 3.111341714859009,
"learning_rate": 0.00014378698224852072,
"loss": 5.0315,
"step": 191
},
{
"epoch": 0.28402366863905326,
"grad_norm": 3.6870968341827393,
"learning_rate": 0.00014349112426035503,
"loss": 5.0105,
"step": 192
},
{
"epoch": 0.28550295857988167,
"grad_norm": 4.709840774536133,
"learning_rate": 0.00014319526627218934,
"loss": 5.0156,
"step": 193
},
{
"epoch": 0.2869822485207101,
"grad_norm": 2.8306596279144287,
"learning_rate": 0.00014289940828402368,
"loss": 4.9832,
"step": 194
},
{
"epoch": 0.28846153846153844,
"grad_norm": 4.066086292266846,
"learning_rate": 0.000142603550295858,
"loss": 4.886,
"step": 195
},
{
"epoch": 0.28994082840236685,
"grad_norm": 3.7111706733703613,
"learning_rate": 0.0001423076923076923,
"loss": 4.868,
"step": 196
},
{
"epoch": 0.29142011834319526,
"grad_norm": 2.9184136390686035,
"learning_rate": 0.00014201183431952663,
"loss": 5.1351,
"step": 197
},
{
"epoch": 0.29289940828402367,
"grad_norm": 3.7855818271636963,
"learning_rate": 0.00014171597633136097,
"loss": 4.6282,
"step": 198
},
{
"epoch": 0.2943786982248521,
"grad_norm": 2.7705442905426025,
"learning_rate": 0.00014142011834319525,
"loss": 4.9413,
"step": 199
},
{
"epoch": 0.2958579881656805,
"grad_norm": 2.315896511077881,
"learning_rate": 0.0001411242603550296,
"loss": 5.1848,
"step": 200
},
{
"epoch": 0.2973372781065089,
"grad_norm": 2.548875331878662,
"learning_rate": 0.0001408284023668639,
"loss": 5.1358,
"step": 201
},
{
"epoch": 0.2988165680473373,
"grad_norm": 2.73306941986084,
"learning_rate": 0.00014053254437869822,
"loss": 4.9356,
"step": 202
},
{
"epoch": 0.30029585798816566,
"grad_norm": 5.324806213378906,
"learning_rate": 0.00014023668639053256,
"loss": 4.8917,
"step": 203
},
{
"epoch": 0.30177514792899407,
"grad_norm": 3.943690299987793,
"learning_rate": 0.00013994082840236685,
"loss": 4.8406,
"step": 204
},
{
"epoch": 0.3032544378698225,
"grad_norm": 3.848397970199585,
"learning_rate": 0.0001396449704142012,
"loss": 4.7171,
"step": 205
},
{
"epoch": 0.3047337278106509,
"grad_norm": 3.5584487915039062,
"learning_rate": 0.0001393491124260355,
"loss": 4.8052,
"step": 206
},
{
"epoch": 0.3062130177514793,
"grad_norm": 3.0798633098602295,
"learning_rate": 0.00013905325443786982,
"loss": 4.959,
"step": 207
},
{
"epoch": 0.3076923076923077,
"grad_norm": 4.075172424316406,
"learning_rate": 0.00013875739644970416,
"loss": 4.6734,
"step": 208
},
{
"epoch": 0.3091715976331361,
"grad_norm": 2.977128505706787,
"learning_rate": 0.00013846153846153847,
"loss": 5.128,
"step": 209
},
{
"epoch": 0.3106508875739645,
"grad_norm": 2.536226987838745,
"learning_rate": 0.0001381656804733728,
"loss": 4.9083,
"step": 210
},
{
"epoch": 0.3121301775147929,
"grad_norm": 2.7597198486328125,
"learning_rate": 0.0001378698224852071,
"loss": 5.0091,
"step": 211
},
{
"epoch": 0.3136094674556213,
"grad_norm": 4.09480619430542,
"learning_rate": 0.00013757396449704144,
"loss": 4.6064,
"step": 212
},
{
"epoch": 0.3150887573964497,
"grad_norm": 4.707205772399902,
"learning_rate": 0.00013727810650887573,
"loss": 5.1816,
"step": 213
},
{
"epoch": 0.3165680473372781,
"grad_norm": 3.8236589431762695,
"learning_rate": 0.00013698224852071007,
"loss": 4.8127,
"step": 214
},
{
"epoch": 0.3180473372781065,
"grad_norm": 3.736663341522217,
"learning_rate": 0.00013668639053254439,
"loss": 5.0851,
"step": 215
},
{
"epoch": 0.31952662721893493,
"grad_norm": 3.5251119136810303,
"learning_rate": 0.0001363905325443787,
"loss": 4.7787,
"step": 216
},
{
"epoch": 0.3210059171597633,
"grad_norm": 4.254045009613037,
"learning_rate": 0.00013609467455621304,
"loss": 4.6852,
"step": 217
},
{
"epoch": 0.3224852071005917,
"grad_norm": 2.4598679542541504,
"learning_rate": 0.00013579881656804733,
"loss": 5.1052,
"step": 218
},
{
"epoch": 0.3239644970414201,
"grad_norm": 3.000013828277588,
"learning_rate": 0.00013550295857988167,
"loss": 5.092,
"step": 219
},
{
"epoch": 0.3254437869822485,
"grad_norm": 3.0184524059295654,
"learning_rate": 0.00013520710059171598,
"loss": 5.1709,
"step": 220
},
{
"epoch": 0.3269230769230769,
"grad_norm": 4.693943977355957,
"learning_rate": 0.0001349112426035503,
"loss": 5.0312,
"step": 221
},
{
"epoch": 0.32840236686390534,
"grad_norm": 4.594445705413818,
"learning_rate": 0.00013461538461538464,
"loss": 4.884,
"step": 222
},
{
"epoch": 0.32988165680473375,
"grad_norm": 2.1385953426361084,
"learning_rate": 0.00013431952662721895,
"loss": 5.1498,
"step": 223
},
{
"epoch": 0.33136094674556216,
"grad_norm": 3.80926251411438,
"learning_rate": 0.00013402366863905326,
"loss": 4.7281,
"step": 224
},
{
"epoch": 0.3328402366863905,
"grad_norm": 2.7746124267578125,
"learning_rate": 0.00013372781065088758,
"loss": 5.1611,
"step": 225
},
{
"epoch": 0.3343195266272189,
"grad_norm": 4.454751014709473,
"learning_rate": 0.0001334319526627219,
"loss": 4.6656,
"step": 226
},
{
"epoch": 0.33579881656804733,
"grad_norm": 3.0681076049804688,
"learning_rate": 0.0001331360946745562,
"loss": 4.947,
"step": 227
},
{
"epoch": 0.33727810650887574,
"grad_norm": 4.087713718414307,
"learning_rate": 0.00013284023668639055,
"loss": 4.659,
"step": 228
},
{
"epoch": 0.33875739644970415,
"grad_norm": 2.7272188663482666,
"learning_rate": 0.00013254437869822486,
"loss": 4.875,
"step": 229
},
{
"epoch": 0.34023668639053256,
"grad_norm": 4.372305393218994,
"learning_rate": 0.00013224852071005918,
"loss": 5.0376,
"step": 230
},
{
"epoch": 0.34171597633136097,
"grad_norm": 2.4269564151763916,
"learning_rate": 0.00013195266272189352,
"loss": 4.865,
"step": 231
},
{
"epoch": 0.3431952662721893,
"grad_norm": 2.970414876937866,
"learning_rate": 0.0001316568047337278,
"loss": 5.1939,
"step": 232
},
{
"epoch": 0.34467455621301774,
"grad_norm": 2.476336717605591,
"learning_rate": 0.00013136094674556214,
"loss": 4.8974,
"step": 233
},
{
"epoch": 0.34615384615384615,
"grad_norm": 2.5719239711761475,
"learning_rate": 0.00013106508875739646,
"loss": 4.7806,
"step": 234
},
{
"epoch": 0.34763313609467456,
"grad_norm": 3.170459747314453,
"learning_rate": 0.00013076923076923077,
"loss": 4.841,
"step": 235
},
{
"epoch": 0.34911242603550297,
"grad_norm": 3.061983108520508,
"learning_rate": 0.00013047337278106509,
"loss": 5.1059,
"step": 236
},
{
"epoch": 0.3505917159763314,
"grad_norm": 3.1553242206573486,
"learning_rate": 0.0001301775147928994,
"loss": 4.8559,
"step": 237
},
{
"epoch": 0.3520710059171598,
"grad_norm": 4.066651344299316,
"learning_rate": 0.00012988165680473374,
"loss": 4.8189,
"step": 238
},
{
"epoch": 0.35355029585798814,
"grad_norm": 2.3812334537506104,
"learning_rate": 0.00012958579881656806,
"loss": 4.8804,
"step": 239
},
{
"epoch": 0.35502958579881655,
"grad_norm": 4.674489974975586,
"learning_rate": 0.00012928994082840237,
"loss": 5.0399,
"step": 240
},
{
"epoch": 0.35650887573964496,
"grad_norm": 3.953132390975952,
"learning_rate": 0.00012899408284023668,
"loss": 5.3266,
"step": 241
},
{
"epoch": 0.35798816568047337,
"grad_norm": 3.108064651489258,
"learning_rate": 0.00012869822485207102,
"loss": 5.032,
"step": 242
},
{
"epoch": 0.3594674556213018,
"grad_norm": 3.2447121143341064,
"learning_rate": 0.0001284023668639053,
"loss": 4.9344,
"step": 243
},
{
"epoch": 0.3609467455621302,
"grad_norm": 6.096102714538574,
"learning_rate": 0.00012810650887573965,
"loss": 4.7905,
"step": 244
},
{
"epoch": 0.3624260355029586,
"grad_norm": 3.0096826553344727,
"learning_rate": 0.00012781065088757397,
"loss": 4.663,
"step": 245
},
{
"epoch": 0.363905325443787,
"grad_norm": 3.5750575065612793,
"learning_rate": 0.00012751479289940828,
"loss": 4.7274,
"step": 246
},
{
"epoch": 0.36538461538461536,
"grad_norm": 2.610539674758911,
"learning_rate": 0.00012721893491124262,
"loss": 4.9819,
"step": 247
},
{
"epoch": 0.3668639053254438,
"grad_norm": 2.911921977996826,
"learning_rate": 0.00012692307692307693,
"loss": 4.5621,
"step": 248
},
{
"epoch": 0.3683431952662722,
"grad_norm": 3.698395252227783,
"learning_rate": 0.00012662721893491125,
"loss": 4.8973,
"step": 249
},
{
"epoch": 0.3698224852071006,
"grad_norm": 2.5374014377593994,
"learning_rate": 0.00012633136094674556,
"loss": 5.0435,
"step": 250
},
{
"epoch": 0.371301775147929,
"grad_norm": 2.8798906803131104,
"learning_rate": 0.00012603550295857988,
"loss": 4.928,
"step": 251
},
{
"epoch": 0.3727810650887574,
"grad_norm": 3.2215588092803955,
"learning_rate": 0.00012573964497041422,
"loss": 5.0017,
"step": 252
},
{
"epoch": 0.3742603550295858,
"grad_norm": 3.9945900440216064,
"learning_rate": 0.00012544378698224853,
"loss": 4.9283,
"step": 253
},
{
"epoch": 0.3757396449704142,
"grad_norm": 3.116191864013672,
"learning_rate": 0.00012514792899408285,
"loss": 5.1576,
"step": 254
},
{
"epoch": 0.3772189349112426,
"grad_norm": 4.140561103820801,
"learning_rate": 0.00012485207100591716,
"loss": 4.6843,
"step": 255
},
{
"epoch": 0.378698224852071,
"grad_norm": 3.4369871616363525,
"learning_rate": 0.0001245562130177515,
"loss": 4.7908,
"step": 256
},
{
"epoch": 0.3801775147928994,
"grad_norm": 3.218079090118408,
"learning_rate": 0.0001242603550295858,
"loss": 5.1624,
"step": 257
},
{
"epoch": 0.3816568047337278,
"grad_norm": 2.8588600158691406,
"learning_rate": 0.00012396449704142013,
"loss": 4.9308,
"step": 258
},
{
"epoch": 0.3831360946745562,
"grad_norm": 3.2073476314544678,
"learning_rate": 0.00012366863905325444,
"loss": 5.0278,
"step": 259
},
{
"epoch": 0.38461538461538464,
"grad_norm": 3.6556711196899414,
"learning_rate": 0.00012337278106508876,
"loss": 4.6337,
"step": 260
},
{
"epoch": 0.386094674556213,
"grad_norm": 3.5764808654785156,
"learning_rate": 0.0001230769230769231,
"loss": 4.7867,
"step": 261
},
{
"epoch": 0.3875739644970414,
"grad_norm": 2.8595478534698486,
"learning_rate": 0.00012278106508875738,
"loss": 4.7535,
"step": 262
},
{
"epoch": 0.3890532544378698,
"grad_norm": 2.653027296066284,
"learning_rate": 0.00012248520710059173,
"loss": 5.0196,
"step": 263
},
{
"epoch": 0.3905325443786982,
"grad_norm": 5.582137584686279,
"learning_rate": 0.00012218934911242604,
"loss": 5.0398,
"step": 264
},
{
"epoch": 0.39201183431952663,
"grad_norm": 2.5141005516052246,
"learning_rate": 0.00012189349112426037,
"loss": 4.7242,
"step": 265
},
{
"epoch": 0.39349112426035504,
"grad_norm": 2.708251476287842,
"learning_rate": 0.0001215976331360947,
"loss": 5.2731,
"step": 266
},
{
"epoch": 0.39497041420118345,
"grad_norm": 3.10066294670105,
"learning_rate": 0.000121301775147929,
"loss": 5.0398,
"step": 267
},
{
"epoch": 0.39644970414201186,
"grad_norm": 3.866253137588501,
"learning_rate": 0.00012100591715976332,
"loss": 4.7469,
"step": 268
},
{
"epoch": 0.3979289940828402,
"grad_norm": 1.983142614364624,
"learning_rate": 0.00012071005917159764,
"loss": 5.1332,
"step": 269
},
{
"epoch": 0.3994082840236686,
"grad_norm": 4.262152671813965,
"learning_rate": 0.00012041420118343196,
"loss": 4.8862,
"step": 270
},
{
"epoch": 0.40088757396449703,
"grad_norm": 3.2552340030670166,
"learning_rate": 0.00012011834319526626,
"loss": 4.7106,
"step": 271
},
{
"epoch": 0.40236686390532544,
"grad_norm": 3.415714740753174,
"learning_rate": 0.0001198224852071006,
"loss": 4.7708,
"step": 272
},
{
"epoch": 0.40384615384615385,
"grad_norm": 3.799057960510254,
"learning_rate": 0.00011952662721893493,
"loss": 4.8023,
"step": 273
},
{
"epoch": 0.40532544378698226,
"grad_norm": 4.48440408706665,
"learning_rate": 0.00011923076923076923,
"loss": 4.6143,
"step": 274
},
{
"epoch": 0.4068047337278107,
"grad_norm": 2.993614435195923,
"learning_rate": 0.00011893491124260356,
"loss": 5.0484,
"step": 275
},
{
"epoch": 0.40828402366863903,
"grad_norm": 3.1117966175079346,
"learning_rate": 0.00011863905325443787,
"loss": 5.0049,
"step": 276
},
{
"epoch": 0.40976331360946744,
"grad_norm": 2.981684446334839,
"learning_rate": 0.0001183431952662722,
"loss": 4.9651,
"step": 277
},
{
"epoch": 0.41124260355029585,
"grad_norm": 4.644701957702637,
"learning_rate": 0.0001180473372781065,
"loss": 5.1983,
"step": 278
},
{
"epoch": 0.41272189349112426,
"grad_norm": 3.9957146644592285,
"learning_rate": 0.00011775147928994083,
"loss": 4.7752,
"step": 279
},
{
"epoch": 0.41420118343195267,
"grad_norm": 2.156813144683838,
"learning_rate": 0.00011745562130177514,
"loss": 4.9941,
"step": 280
},
{
"epoch": 0.4156804733727811,
"grad_norm": 3.0435688495635986,
"learning_rate": 0.00011715976331360947,
"loss": 5.1542,
"step": 281
},
{
"epoch": 0.4171597633136095,
"grad_norm": 2.6798031330108643,
"learning_rate": 0.0001168639053254438,
"loss": 5.0989,
"step": 282
},
{
"epoch": 0.41863905325443784,
"grad_norm": 4.525575637817383,
"learning_rate": 0.00011656804733727811,
"loss": 4.8101,
"step": 283
},
{
"epoch": 0.42011834319526625,
"grad_norm": 2.770167827606201,
"learning_rate": 0.00011627218934911244,
"loss": 4.8678,
"step": 284
},
{
"epoch": 0.42159763313609466,
"grad_norm": 3.7138099670410156,
"learning_rate": 0.00011597633136094674,
"loss": 4.5939,
"step": 285
},
{
"epoch": 0.4230769230769231,
"grad_norm": 2.398003339767456,
"learning_rate": 0.00011568047337278107,
"loss": 4.8854,
"step": 286
},
{
"epoch": 0.4245562130177515,
"grad_norm": 2.4538278579711914,
"learning_rate": 0.00011538461538461538,
"loss": 4.8346,
"step": 287
},
{
"epoch": 0.4260355029585799,
"grad_norm": 2.757596969604492,
"learning_rate": 0.00011508875739644971,
"loss": 4.8026,
"step": 288
},
{
"epoch": 0.4275147928994083,
"grad_norm": 2.84289813041687,
"learning_rate": 0.00011479289940828404,
"loss": 5.1102,
"step": 289
},
{
"epoch": 0.4289940828402367,
"grad_norm": 3.8550050258636475,
"learning_rate": 0.00011449704142011835,
"loss": 4.7836,
"step": 290
},
{
"epoch": 0.43047337278106507,
"grad_norm": 2.811088800430298,
"learning_rate": 0.00011420118343195268,
"loss": 4.7735,
"step": 291
},
{
"epoch": 0.4319526627218935,
"grad_norm": 3.845970392227173,
"learning_rate": 0.00011390532544378698,
"loss": 4.5021,
"step": 292
},
{
"epoch": 0.4334319526627219,
"grad_norm": 2.977811574935913,
"learning_rate": 0.0001136094674556213,
"loss": 5.0106,
"step": 293
},
{
"epoch": 0.4349112426035503,
"grad_norm": 3.136218309402466,
"learning_rate": 0.00011331360946745562,
"loss": 4.9972,
"step": 294
},
{
"epoch": 0.4363905325443787,
"grad_norm": 3.3354506492614746,
"learning_rate": 0.00011301775147928995,
"loss": 4.7461,
"step": 295
},
{
"epoch": 0.4378698224852071,
"grad_norm": 2.847717761993408,
"learning_rate": 0.00011272189349112428,
"loss": 5.0909,
"step": 296
},
{
"epoch": 0.4393491124260355,
"grad_norm": 3.3575217723846436,
"learning_rate": 0.00011242603550295858,
"loss": 4.6807,
"step": 297
},
{
"epoch": 0.4408284023668639,
"grad_norm": 4.795094966888428,
"learning_rate": 0.00011213017751479292,
"loss": 4.6449,
"step": 298
},
{
"epoch": 0.4423076923076923,
"grad_norm": 3.1832058429718018,
"learning_rate": 0.00011183431952662722,
"loss": 5.1715,
"step": 299
},
{
"epoch": 0.4437869822485207,
"grad_norm": 2.93605899810791,
"learning_rate": 0.00011153846153846154,
"loss": 4.743,
"step": 300
},
{
"epoch": 0.4452662721893491,
"grad_norm": 3.169785737991333,
"learning_rate": 0.00011124260355029586,
"loss": 4.8851,
"step": 301
},
{
"epoch": 0.4467455621301775,
"grad_norm": 3.1830434799194336,
"learning_rate": 0.00011094674556213019,
"loss": 4.6635,
"step": 302
},
{
"epoch": 0.44822485207100593,
"grad_norm": 3.0651752948760986,
"learning_rate": 0.00011065088757396451,
"loss": 4.9519,
"step": 303
},
{
"epoch": 0.44970414201183434,
"grad_norm": 2.6180307865142822,
"learning_rate": 0.00011035502958579881,
"loss": 4.8693,
"step": 304
},
{
"epoch": 0.4511834319526627,
"grad_norm": 2.6694605350494385,
"learning_rate": 0.00011005917159763315,
"loss": 4.9804,
"step": 305
},
{
"epoch": 0.4526627218934911,
"grad_norm": 4.2127299308776855,
"learning_rate": 0.00010976331360946746,
"loss": 4.9376,
"step": 306
},
{
"epoch": 0.4541420118343195,
"grad_norm": 3.186117172241211,
"learning_rate": 0.00010946745562130178,
"loss": 4.9848,
"step": 307
},
{
"epoch": 0.4556213017751479,
"grad_norm": 5.589068412780762,
"learning_rate": 0.0001091715976331361,
"loss": 4.603,
"step": 308
},
{
"epoch": 0.45710059171597633,
"grad_norm": 3.5685253143310547,
"learning_rate": 0.00010887573964497042,
"loss": 4.6498,
"step": 309
},
{
"epoch": 0.45857988165680474,
"grad_norm": 3.711947441101074,
"learning_rate": 0.00010857988165680475,
"loss": 4.8806,
"step": 310
},
{
"epoch": 0.46005917159763315,
"grad_norm": 3.018235683441162,
"learning_rate": 0.00010828402366863905,
"loss": 4.9364,
"step": 311
},
{
"epoch": 0.46153846153846156,
"grad_norm": 3.832859754562378,
"learning_rate": 0.00010798816568047338,
"loss": 4.5133,
"step": 312
},
{
"epoch": 0.4630177514792899,
"grad_norm": 3.1703977584838867,
"learning_rate": 0.0001076923076923077,
"loss": 4.9799,
"step": 313
},
{
"epoch": 0.46449704142011833,
"grad_norm": 3.1763527393341064,
"learning_rate": 0.00010739644970414202,
"loss": 5.1171,
"step": 314
},
{
"epoch": 0.46597633136094674,
"grad_norm": 3.3300058841705322,
"learning_rate": 0.00010710059171597633,
"loss": 5.0527,
"step": 315
},
{
"epoch": 0.46745562130177515,
"grad_norm": 4.077115535736084,
"learning_rate": 0.00010680473372781066,
"loss": 5.1883,
"step": 316
},
{
"epoch": 0.46893491124260356,
"grad_norm": 2.77453875541687,
"learning_rate": 0.00010650887573964499,
"loss": 4.7195,
"step": 317
},
{
"epoch": 0.47041420118343197,
"grad_norm": 3.7889904975891113,
"learning_rate": 0.00010621301775147929,
"loss": 4.6898,
"step": 318
},
{
"epoch": 0.4718934911242604,
"grad_norm": 3.072606325149536,
"learning_rate": 0.00010591715976331362,
"loss": 4.6968,
"step": 319
},
{
"epoch": 0.47337278106508873,
"grad_norm": 3.511852741241455,
"learning_rate": 0.00010562130177514793,
"loss": 5.2093,
"step": 320
},
{
"epoch": 0.47485207100591714,
"grad_norm": 3.1733694076538086,
"learning_rate": 0.00010532544378698226,
"loss": 4.708,
"step": 321
},
{
"epoch": 0.47633136094674555,
"grad_norm": 3.6563074588775635,
"learning_rate": 0.00010502958579881656,
"loss": 4.9192,
"step": 322
},
{
"epoch": 0.47781065088757396,
"grad_norm": 3.6182966232299805,
"learning_rate": 0.0001047337278106509,
"loss": 4.4543,
"step": 323
},
{
"epoch": 0.47928994082840237,
"grad_norm": 2.6363067626953125,
"learning_rate": 0.0001044378698224852,
"loss": 4.8366,
"step": 324
},
{
"epoch": 0.4807692307692308,
"grad_norm": 3.9731101989746094,
"learning_rate": 0.00010414201183431953,
"loss": 4.3,
"step": 325
},
{
"epoch": 0.4822485207100592,
"grad_norm": 3.4638442993164062,
"learning_rate": 0.00010384615384615386,
"loss": 4.7362,
"step": 326
},
{
"epoch": 0.48372781065088755,
"grad_norm": 3.5232033729553223,
"learning_rate": 0.00010355029585798817,
"loss": 4.6581,
"step": 327
},
{
"epoch": 0.48520710059171596,
"grad_norm": 3.3872711658477783,
"learning_rate": 0.0001032544378698225,
"loss": 5.115,
"step": 328
},
{
"epoch": 0.48668639053254437,
"grad_norm": 4.918847560882568,
"learning_rate": 0.0001029585798816568,
"loss": 4.7896,
"step": 329
},
{
"epoch": 0.4881656804733728,
"grad_norm": 2.2880265712738037,
"learning_rate": 0.00010266272189349113,
"loss": 4.9476,
"step": 330
},
{
"epoch": 0.4896449704142012,
"grad_norm": 2.6146740913391113,
"learning_rate": 0.00010236686390532544,
"loss": 4.8704,
"step": 331
},
{
"epoch": 0.4911242603550296,
"grad_norm": 4.03132438659668,
"learning_rate": 0.00010207100591715977,
"loss": 4.6685,
"step": 332
},
{
"epoch": 0.492603550295858,
"grad_norm": 2.9792962074279785,
"learning_rate": 0.0001017751479289941,
"loss": 4.6436,
"step": 333
},
{
"epoch": 0.4940828402366864,
"grad_norm": 4.724271774291992,
"learning_rate": 0.00010147928994082841,
"loss": 4.7177,
"step": 334
},
{
"epoch": 0.49556213017751477,
"grad_norm": 2.644087076187134,
"learning_rate": 0.00010118343195266274,
"loss": 4.8767,
"step": 335
},
{
"epoch": 0.4970414201183432,
"grad_norm": 3.115257740020752,
"learning_rate": 0.00010088757396449704,
"loss": 4.789,
"step": 336
},
{
"epoch": 0.4985207100591716,
"grad_norm": 2.866811752319336,
"learning_rate": 0.00010059171597633136,
"loss": 4.7059,
"step": 337
},
{
"epoch": 0.5,
"grad_norm": 2.982800245285034,
"learning_rate": 0.00010029585798816568,
"loss": 4.9884,
"step": 338
},
{
"epoch": 0.5014792899408284,
"grad_norm": 5.207017421722412,
"learning_rate": 0.0001,
"loss": 4.7667,
"step": 339
},
{
"epoch": 0.5029585798816568,
"grad_norm": 4.413080215454102,
"learning_rate": 9.970414201183432e-05,
"loss": 4.4587,
"step": 340
},
{
"epoch": 0.5044378698224852,
"grad_norm": 3.2763004302978516,
"learning_rate": 9.940828402366865e-05,
"loss": 4.8882,
"step": 341
},
{
"epoch": 0.5059171597633136,
"grad_norm": 4.111000061035156,
"learning_rate": 9.911242603550296e-05,
"loss": 4.9992,
"step": 342
},
{
"epoch": 0.507396449704142,
"grad_norm": 3.3207952976226807,
"learning_rate": 9.881656804733729e-05,
"loss": 4.6362,
"step": 343
},
{
"epoch": 0.5088757396449705,
"grad_norm": 4.325817108154297,
"learning_rate": 9.85207100591716e-05,
"loss": 4.3929,
"step": 344
},
{
"epoch": 0.5103550295857988,
"grad_norm": 2.3893871307373047,
"learning_rate": 9.822485207100593e-05,
"loss": 4.7993,
"step": 345
},
{
"epoch": 0.5118343195266272,
"grad_norm": 2.4651639461517334,
"learning_rate": 9.792899408284024e-05,
"loss": 4.871,
"step": 346
},
{
"epoch": 0.5133136094674556,
"grad_norm": 3.43371319770813,
"learning_rate": 9.763313609467456e-05,
"loss": 4.831,
"step": 347
},
{
"epoch": 0.514792899408284,
"grad_norm": 4.255308151245117,
"learning_rate": 9.733727810650888e-05,
"loss": 4.6105,
"step": 348
},
{
"epoch": 0.5162721893491125,
"grad_norm": 2.4571831226348877,
"learning_rate": 9.70414201183432e-05,
"loss": 4.5852,
"step": 349
},
{
"epoch": 0.5177514792899408,
"grad_norm": 3.8601267337799072,
"learning_rate": 9.674556213017751e-05,
"loss": 5.0089,
"step": 350
},
{
"epoch": 0.5192307692307693,
"grad_norm": 4.343974590301514,
"learning_rate": 9.644970414201184e-05,
"loss": 4.585,
"step": 351
},
{
"epoch": 0.5207100591715976,
"grad_norm": 3.3244740962982178,
"learning_rate": 9.615384615384617e-05,
"loss": 4.6961,
"step": 352
},
{
"epoch": 0.522189349112426,
"grad_norm": 3.011272668838501,
"learning_rate": 9.585798816568048e-05,
"loss": 4.7542,
"step": 353
},
{
"epoch": 0.5236686390532544,
"grad_norm": 4.02368688583374,
"learning_rate": 9.55621301775148e-05,
"loss": 4.605,
"step": 354
},
{
"epoch": 0.5251479289940828,
"grad_norm": 2.5176520347595215,
"learning_rate": 9.526627218934911e-05,
"loss": 5.057,
"step": 355
},
{
"epoch": 0.5266272189349113,
"grad_norm": 3.331259250640869,
"learning_rate": 9.497041420118344e-05,
"loss": 4.8883,
"step": 356
},
{
"epoch": 0.5281065088757396,
"grad_norm": 4.451882362365723,
"learning_rate": 9.467455621301775e-05,
"loss": 3.7867,
"step": 357
},
{
"epoch": 0.5295857988165681,
"grad_norm": 3.5634541511535645,
"learning_rate": 9.437869822485208e-05,
"loss": 4.6808,
"step": 358
},
{
"epoch": 0.5310650887573964,
"grad_norm": 4.832805156707764,
"learning_rate": 9.408284023668639e-05,
"loss": 4.8233,
"step": 359
},
{
"epoch": 0.5325443786982249,
"grad_norm": 3.726456642150879,
"learning_rate": 9.378698224852072e-05,
"loss": 5.0317,
"step": 360
},
{
"epoch": 0.5340236686390533,
"grad_norm": 5.337479114532471,
"learning_rate": 9.349112426035503e-05,
"loss": 4.701,
"step": 361
},
{
"epoch": 0.5355029585798816,
"grad_norm": 3.692549228668213,
"learning_rate": 9.319526627218935e-05,
"loss": 4.8653,
"step": 362
},
{
"epoch": 0.5369822485207101,
"grad_norm": 3.613229990005493,
"learning_rate": 9.289940828402367e-05,
"loss": 4.8845,
"step": 363
},
{
"epoch": 0.5384615384615384,
"grad_norm": 3.317173957824707,
"learning_rate": 9.260355029585799e-05,
"loss": 4.9675,
"step": 364
},
{
"epoch": 0.5399408284023669,
"grad_norm": 5.250802516937256,
"learning_rate": 9.230769230769232e-05,
"loss": 4.7967,
"step": 365
},
{
"epoch": 0.5414201183431953,
"grad_norm": 2.8899145126342773,
"learning_rate": 9.201183431952663e-05,
"loss": 4.7554,
"step": 366
},
{
"epoch": 0.5428994082840237,
"grad_norm": 2.3002591133117676,
"learning_rate": 9.171597633136096e-05,
"loss": 4.9671,
"step": 367
},
{
"epoch": 0.5443786982248521,
"grad_norm": 2.672611951828003,
"learning_rate": 9.142011834319527e-05,
"loss": 4.9408,
"step": 368
},
{
"epoch": 0.5458579881656804,
"grad_norm": 2.823984146118164,
"learning_rate": 9.112426035502959e-05,
"loss": 4.6108,
"step": 369
},
{
"epoch": 0.5473372781065089,
"grad_norm": 3.269995927810669,
"learning_rate": 9.082840236686391e-05,
"loss": 4.6865,
"step": 370
},
{
"epoch": 0.5488165680473372,
"grad_norm": 2.842123031616211,
"learning_rate": 9.053254437869823e-05,
"loss": 5.0082,
"step": 371
},
{
"epoch": 0.5502958579881657,
"grad_norm": 3.5125341415405273,
"learning_rate": 9.023668639053254e-05,
"loss": 4.6579,
"step": 372
},
{
"epoch": 0.5517751479289941,
"grad_norm": 2.400810718536377,
"learning_rate": 8.994082840236687e-05,
"loss": 4.9984,
"step": 373
},
{
"epoch": 0.5532544378698225,
"grad_norm": 2.6020796298980713,
"learning_rate": 8.96449704142012e-05,
"loss": 4.6243,
"step": 374
},
{
"epoch": 0.5547337278106509,
"grad_norm": 2.897449493408203,
"learning_rate": 8.934911242603551e-05,
"loss": 4.9226,
"step": 375
},
{
"epoch": 0.5562130177514792,
"grad_norm": 3.834491491317749,
"learning_rate": 8.905325443786982e-05,
"loss": 4.8789,
"step": 376
},
{
"epoch": 0.5576923076923077,
"grad_norm": 2.6195783615112305,
"learning_rate": 8.875739644970414e-05,
"loss": 4.9757,
"step": 377
},
{
"epoch": 0.5591715976331361,
"grad_norm": 2.357797861099243,
"learning_rate": 8.846153846153847e-05,
"loss": 4.8308,
"step": 378
},
{
"epoch": 0.5606508875739645,
"grad_norm": 4.5268659591674805,
"learning_rate": 8.816568047337278e-05,
"loss": 5.017,
"step": 379
},
{
"epoch": 0.5621301775147929,
"grad_norm": 3.124450445175171,
"learning_rate": 8.78698224852071e-05,
"loss": 4.902,
"step": 380
},
{
"epoch": 0.5636094674556213,
"grad_norm": 3.822741985321045,
"learning_rate": 8.757396449704143e-05,
"loss": 4.7571,
"step": 381
},
{
"epoch": 0.5650887573964497,
"grad_norm": 3.3395605087280273,
"learning_rate": 8.727810650887575e-05,
"loss": 4.3808,
"step": 382
},
{
"epoch": 0.5665680473372781,
"grad_norm": 3.0946218967437744,
"learning_rate": 8.698224852071006e-05,
"loss": 4.853,
"step": 383
},
{
"epoch": 0.5680473372781065,
"grad_norm": 2.6763172149658203,
"learning_rate": 8.668639053254438e-05,
"loss": 4.7149,
"step": 384
},
{
"epoch": 0.5695266272189349,
"grad_norm": 3.2569401264190674,
"learning_rate": 8.63905325443787e-05,
"loss": 4.7693,
"step": 385
},
{
"epoch": 0.5710059171597633,
"grad_norm": 2.8756442070007324,
"learning_rate": 8.609467455621302e-05,
"loss": 4.737,
"step": 386
},
{
"epoch": 0.5724852071005917,
"grad_norm": 3.0808393955230713,
"learning_rate": 8.579881656804734e-05,
"loss": 5.1146,
"step": 387
},
{
"epoch": 0.5739644970414202,
"grad_norm": 2.7643120288848877,
"learning_rate": 8.550295857988166e-05,
"loss": 4.9102,
"step": 388
},
{
"epoch": 0.5754437869822485,
"grad_norm": 4.045496940612793,
"learning_rate": 8.520710059171599e-05,
"loss": 4.4583,
"step": 389
},
{
"epoch": 0.5769230769230769,
"grad_norm": 2.765378952026367,
"learning_rate": 8.49112426035503e-05,
"loss": 4.6736,
"step": 390
},
{
"epoch": 0.5784023668639053,
"grad_norm": 2.4997928142547607,
"learning_rate": 8.461538461538461e-05,
"loss": 4.9013,
"step": 391
},
{
"epoch": 0.5798816568047337,
"grad_norm": 6.506393909454346,
"learning_rate": 8.431952662721894e-05,
"loss": 4.4212,
"step": 392
},
{
"epoch": 0.5813609467455622,
"grad_norm": 3.8824899196624756,
"learning_rate": 8.402366863905326e-05,
"loss": 4.8729,
"step": 393
},
{
"epoch": 0.5828402366863905,
"grad_norm": 2.4025533199310303,
"learning_rate": 8.372781065088757e-05,
"loss": 4.6856,
"step": 394
},
{
"epoch": 0.584319526627219,
"grad_norm": 2.9260175228118896,
"learning_rate": 8.34319526627219e-05,
"loss": 4.6581,
"step": 395
},
{
"epoch": 0.5857988165680473,
"grad_norm": 2.606157064437866,
"learning_rate": 8.313609467455622e-05,
"loss": 4.7248,
"step": 396
},
{
"epoch": 0.5872781065088757,
"grad_norm": 3.9847888946533203,
"learning_rate": 8.284023668639054e-05,
"loss": 4.4983,
"step": 397
},
{
"epoch": 0.5887573964497042,
"grad_norm": 4.147398471832275,
"learning_rate": 8.254437869822485e-05,
"loss": 4.6558,
"step": 398
},
{
"epoch": 0.5902366863905325,
"grad_norm": 3.062237501144409,
"learning_rate": 8.224852071005918e-05,
"loss": 4.6806,
"step": 399
},
{
"epoch": 0.591715976331361,
"grad_norm": 2.655552625656128,
"learning_rate": 8.19526627218935e-05,
"loss": 4.7624,
"step": 400
},
{
"epoch": 0.5931952662721893,
"grad_norm": 2.5713272094726562,
"learning_rate": 8.165680473372781e-05,
"loss": 4.4758,
"step": 401
},
{
"epoch": 0.5946745562130178,
"grad_norm": 4.519266605377197,
"learning_rate": 8.136094674556214e-05,
"loss": 4.1983,
"step": 402
},
{
"epoch": 0.5961538461538461,
"grad_norm": 2.5204124450683594,
"learning_rate": 8.106508875739646e-05,
"loss": 4.9982,
"step": 403
},
{
"epoch": 0.5976331360946746,
"grad_norm": 2.2742385864257812,
"learning_rate": 8.076923076923078e-05,
"loss": 4.8676,
"step": 404
},
{
"epoch": 0.599112426035503,
"grad_norm": 3.3117566108703613,
"learning_rate": 8.047337278106509e-05,
"loss": 4.5992,
"step": 405
},
{
"epoch": 0.6005917159763313,
"grad_norm": 2.826481819152832,
"learning_rate": 8.01775147928994e-05,
"loss": 4.8464,
"step": 406
},
{
"epoch": 0.6020710059171598,
"grad_norm": 2.4774935245513916,
"learning_rate": 7.988165680473373e-05,
"loss": 4.6491,
"step": 407
},
{
"epoch": 0.6035502958579881,
"grad_norm": 3.357351779937744,
"learning_rate": 7.958579881656805e-05,
"loss": 4.8258,
"step": 408
},
{
"epoch": 0.6050295857988166,
"grad_norm": 3.904404640197754,
"learning_rate": 7.928994082840237e-05,
"loss": 5.0641,
"step": 409
},
{
"epoch": 0.606508875739645,
"grad_norm": 3.978461742401123,
"learning_rate": 7.899408284023669e-05,
"loss": 4.8184,
"step": 410
},
{
"epoch": 0.6079881656804734,
"grad_norm": 2.7608234882354736,
"learning_rate": 7.869822485207101e-05,
"loss": 5.0468,
"step": 411
},
{
"epoch": 0.6094674556213018,
"grad_norm": 4.422279357910156,
"learning_rate": 7.840236686390533e-05,
"loss": 4.6623,
"step": 412
},
{
"epoch": 0.6109467455621301,
"grad_norm": 2.7389438152313232,
"learning_rate": 7.810650887573964e-05,
"loss": 4.6685,
"step": 413
},
{
"epoch": 0.6124260355029586,
"grad_norm": 3.2453866004943848,
"learning_rate": 7.781065088757397e-05,
"loss": 4.796,
"step": 414
},
{
"epoch": 0.613905325443787,
"grad_norm": 4.576845645904541,
"learning_rate": 7.751479289940828e-05,
"loss": 5.1346,
"step": 415
},
{
"epoch": 0.6153846153846154,
"grad_norm": 3.6460392475128174,
"learning_rate": 7.72189349112426e-05,
"loss": 4.3795,
"step": 416
},
{
"epoch": 0.6168639053254438,
"grad_norm": 3.7447457313537598,
"learning_rate": 7.692307692307693e-05,
"loss": 4.6371,
"step": 417
},
{
"epoch": 0.6183431952662722,
"grad_norm": 5.934640407562256,
"learning_rate": 7.662721893491125e-05,
"loss": 4.3938,
"step": 418
},
{
"epoch": 0.6198224852071006,
"grad_norm": 2.61598801612854,
"learning_rate": 7.633136094674557e-05,
"loss": 4.6006,
"step": 419
},
{
"epoch": 0.621301775147929,
"grad_norm": 2.915923833847046,
"learning_rate": 7.603550295857988e-05,
"loss": 4.5953,
"step": 420
},
{
"epoch": 0.6227810650887574,
"grad_norm": 3.000753402709961,
"learning_rate": 7.573964497041421e-05,
"loss": 4.7139,
"step": 421
},
{
"epoch": 0.6242603550295858,
"grad_norm": 2.5539729595184326,
"learning_rate": 7.544378698224852e-05,
"loss": 5.232,
"step": 422
},
{
"epoch": 0.6257396449704142,
"grad_norm": 3.208082675933838,
"learning_rate": 7.514792899408284e-05,
"loss": 4.6739,
"step": 423
},
{
"epoch": 0.6272189349112426,
"grad_norm": 4.210474491119385,
"learning_rate": 7.485207100591716e-05,
"loss": 4.7495,
"step": 424
},
{
"epoch": 0.628698224852071,
"grad_norm": 3.1150636672973633,
"learning_rate": 7.455621301775149e-05,
"loss": 4.4906,
"step": 425
},
{
"epoch": 0.6301775147928994,
"grad_norm": 4.412407398223877,
"learning_rate": 7.42603550295858e-05,
"loss": 4.0822,
"step": 426
},
{
"epoch": 0.6316568047337278,
"grad_norm": 2.090398073196411,
"learning_rate": 7.396449704142012e-05,
"loss": 4.8458,
"step": 427
},
{
"epoch": 0.6331360946745562,
"grad_norm": 2.350902795791626,
"learning_rate": 7.366863905325445e-05,
"loss": 4.761,
"step": 428
},
{
"epoch": 0.6346153846153846,
"grad_norm": 3.2042136192321777,
"learning_rate": 7.337278106508876e-05,
"loss": 4.796,
"step": 429
},
{
"epoch": 0.636094674556213,
"grad_norm": 3.3418521881103516,
"learning_rate": 7.307692307692307e-05,
"loss": 4.7222,
"step": 430
},
{
"epoch": 0.6375739644970414,
"grad_norm": 3.0160398483276367,
"learning_rate": 7.27810650887574e-05,
"loss": 4.4557,
"step": 431
},
{
"epoch": 0.6390532544378699,
"grad_norm": 2.5732924938201904,
"learning_rate": 7.248520710059173e-05,
"loss": 4.6779,
"step": 432
},
{
"epoch": 0.6405325443786982,
"grad_norm": 2.399313449859619,
"learning_rate": 7.218934911242604e-05,
"loss": 4.5695,
"step": 433
},
{
"epoch": 0.6420118343195266,
"grad_norm": 2.9960498809814453,
"learning_rate": 7.189349112426036e-05,
"loss": 4.8253,
"step": 434
},
{
"epoch": 0.643491124260355,
"grad_norm": 2.1950154304504395,
"learning_rate": 7.159763313609467e-05,
"loss": 4.492,
"step": 435
},
{
"epoch": 0.6449704142011834,
"grad_norm": 3.8510971069335938,
"learning_rate": 7.1301775147929e-05,
"loss": 4.6542,
"step": 436
},
{
"epoch": 0.6464497041420119,
"grad_norm": 3.028498649597168,
"learning_rate": 7.100591715976331e-05,
"loss": 4.5609,
"step": 437
},
{
"epoch": 0.6479289940828402,
"grad_norm": 3.7498788833618164,
"learning_rate": 7.071005917159763e-05,
"loss": 4.6621,
"step": 438
},
{
"epoch": 0.6494082840236687,
"grad_norm": 3.1128766536712646,
"learning_rate": 7.041420118343195e-05,
"loss": 5.0167,
"step": 439
},
{
"epoch": 0.650887573964497,
"grad_norm": 3.3049442768096924,
"learning_rate": 7.011834319526628e-05,
"loss": 4.5484,
"step": 440
},
{
"epoch": 0.6523668639053254,
"grad_norm": 2.101262331008911,
"learning_rate": 6.98224852071006e-05,
"loss": 4.7794,
"step": 441
},
{
"epoch": 0.6538461538461539,
"grad_norm": 3.5438482761383057,
"learning_rate": 6.952662721893491e-05,
"loss": 4.9424,
"step": 442
},
{
"epoch": 0.6553254437869822,
"grad_norm": 4.34566593170166,
"learning_rate": 6.923076923076924e-05,
"loss": 5.0576,
"step": 443
},
{
"epoch": 0.6568047337278107,
"grad_norm": 2.580064296722412,
"learning_rate": 6.893491124260355e-05,
"loss": 4.7086,
"step": 444
},
{
"epoch": 0.658284023668639,
"grad_norm": 4.950741291046143,
"learning_rate": 6.863905325443787e-05,
"loss": 4.877,
"step": 445
},
{
"epoch": 0.6597633136094675,
"grad_norm": 4.225985527038574,
"learning_rate": 6.834319526627219e-05,
"loss": 4.7599,
"step": 446
},
{
"epoch": 0.6612426035502958,
"grad_norm": 2.690343141555786,
"learning_rate": 6.804733727810652e-05,
"loss": 4.604,
"step": 447
},
{
"epoch": 0.6627218934911243,
"grad_norm": 2.512183904647827,
"learning_rate": 6.775147928994083e-05,
"loss": 5.0561,
"step": 448
},
{
"epoch": 0.6642011834319527,
"grad_norm": 4.449777126312256,
"learning_rate": 6.745562130177515e-05,
"loss": 4.8173,
"step": 449
},
{
"epoch": 0.665680473372781,
"grad_norm": 3.311457633972168,
"learning_rate": 6.715976331360948e-05,
"loss": 4.3368,
"step": 450
},
{
"epoch": 0.6671597633136095,
"grad_norm": 3.376208543777466,
"learning_rate": 6.686390532544379e-05,
"loss": 4.5102,
"step": 451
},
{
"epoch": 0.6686390532544378,
"grad_norm": 5.265395641326904,
"learning_rate": 6.65680473372781e-05,
"loss": 5.0219,
"step": 452
},
{
"epoch": 0.6701183431952663,
"grad_norm": 2.1007046699523926,
"learning_rate": 6.627218934911243e-05,
"loss": 5.1141,
"step": 453
},
{
"epoch": 0.6715976331360947,
"grad_norm": 4.058145046234131,
"learning_rate": 6.597633136094676e-05,
"loss": 4.422,
"step": 454
},
{
"epoch": 0.6730769230769231,
"grad_norm": 3.761136770248413,
"learning_rate": 6.568047337278107e-05,
"loss": 4.6039,
"step": 455
},
{
"epoch": 0.6745562130177515,
"grad_norm": 3.9088778495788574,
"learning_rate": 6.538461538461539e-05,
"loss": 4.5297,
"step": 456
},
{
"epoch": 0.6760355029585798,
"grad_norm": 3.4910857677459717,
"learning_rate": 6.50887573964497e-05,
"loss": 4.6043,
"step": 457
},
{
"epoch": 0.6775147928994083,
"grad_norm": 2.457960844039917,
"learning_rate": 6.479289940828403e-05,
"loss": 4.83,
"step": 458
},
{
"epoch": 0.6789940828402367,
"grad_norm": 3.0204050540924072,
"learning_rate": 6.449704142011834e-05,
"loss": 4.6305,
"step": 459
},
{
"epoch": 0.6804733727810651,
"grad_norm": 2.5136165618896484,
"learning_rate": 6.420118343195266e-05,
"loss": 4.79,
"step": 460
},
{
"epoch": 0.6819526627218935,
"grad_norm": 3.7918813228607178,
"learning_rate": 6.390532544378698e-05,
"loss": 5.0849,
"step": 461
},
{
"epoch": 0.6834319526627219,
"grad_norm": 3.5558226108551025,
"learning_rate": 6.360946745562131e-05,
"loss": 4.6819,
"step": 462
},
{
"epoch": 0.6849112426035503,
"grad_norm": 3.061944007873535,
"learning_rate": 6.331360946745562e-05,
"loss": 4.7948,
"step": 463
},
{
"epoch": 0.6863905325443787,
"grad_norm": 2.5074996948242188,
"learning_rate": 6.301775147928994e-05,
"loss": 4.6494,
"step": 464
},
{
"epoch": 0.6878698224852071,
"grad_norm": 3.643146276473999,
"learning_rate": 6.272189349112427e-05,
"loss": 4.221,
"step": 465
},
{
"epoch": 0.6893491124260355,
"grad_norm": 3.808267831802368,
"learning_rate": 6.242603550295858e-05,
"loss": 5.0687,
"step": 466
},
{
"epoch": 0.6908284023668639,
"grad_norm": 2.975130558013916,
"learning_rate": 6.21301775147929e-05,
"loss": 4.7507,
"step": 467
},
{
"epoch": 0.6923076923076923,
"grad_norm": 3.661033868789673,
"learning_rate": 6.183431952662722e-05,
"loss": 4.4896,
"step": 468
},
{
"epoch": 0.6937869822485208,
"grad_norm": 3.4433846473693848,
"learning_rate": 6.153846153846155e-05,
"loss": 5.1373,
"step": 469
},
{
"epoch": 0.6952662721893491,
"grad_norm": 2.975217342376709,
"learning_rate": 6.124260355029586e-05,
"loss": 4.5473,
"step": 470
},
{
"epoch": 0.6967455621301775,
"grad_norm": 3.5943000316619873,
"learning_rate": 6.094674556213018e-05,
"loss": 4.5376,
"step": 471
},
{
"epoch": 0.6982248520710059,
"grad_norm": 2.9393863677978516,
"learning_rate": 6.06508875739645e-05,
"loss": 4.6031,
"step": 472
},
{
"epoch": 0.6997041420118343,
"grad_norm": 4.864683151245117,
"learning_rate": 6.035502958579882e-05,
"loss": 4.7268,
"step": 473
},
{
"epoch": 0.7011834319526628,
"grad_norm": 2.9984517097473145,
"learning_rate": 6.005917159763313e-05,
"loss": 4.9315,
"step": 474
},
{
"epoch": 0.7026627218934911,
"grad_norm": 3.4088656902313232,
"learning_rate": 5.9763313609467466e-05,
"loss": 4.4509,
"step": 475
},
{
"epoch": 0.7041420118343196,
"grad_norm": 2.966920852661133,
"learning_rate": 5.946745562130178e-05,
"loss": 4.8591,
"step": 476
},
{
"epoch": 0.7056213017751479,
"grad_norm": 3.441845178604126,
"learning_rate": 5.91715976331361e-05,
"loss": 4.6297,
"step": 477
},
{
"epoch": 0.7071005917159763,
"grad_norm": 3.430345296859741,
"learning_rate": 5.8875739644970415e-05,
"loss": 4.7492,
"step": 478
},
{
"epoch": 0.7085798816568047,
"grad_norm": 3.0503721237182617,
"learning_rate": 5.8579881656804736e-05,
"loss": 4.4038,
"step": 479
},
{
"epoch": 0.7100591715976331,
"grad_norm": 3.72664213180542,
"learning_rate": 5.8284023668639056e-05,
"loss": 4.5192,
"step": 480
},
{
"epoch": 0.7115384615384616,
"grad_norm": 2.554041862487793,
"learning_rate": 5.798816568047337e-05,
"loss": 4.6383,
"step": 481
},
{
"epoch": 0.7130177514792899,
"grad_norm": 2.4178683757781982,
"learning_rate": 5.769230769230769e-05,
"loss": 4.706,
"step": 482
},
{
"epoch": 0.7144970414201184,
"grad_norm": 2.1827499866485596,
"learning_rate": 5.739644970414202e-05,
"loss": 4.6598,
"step": 483
},
{
"epoch": 0.7159763313609467,
"grad_norm": 3.2616827487945557,
"learning_rate": 5.710059171597634e-05,
"loss": 4.1314,
"step": 484
},
{
"epoch": 0.7174556213017751,
"grad_norm": 4.249692440032959,
"learning_rate": 5.680473372781065e-05,
"loss": 4.311,
"step": 485
},
{
"epoch": 0.7189349112426036,
"grad_norm": 3.310980796813965,
"learning_rate": 5.6508875739644974e-05,
"loss": 4.5443,
"step": 486
},
{
"epoch": 0.7204142011834319,
"grad_norm": 3.357168436050415,
"learning_rate": 5.621301775147929e-05,
"loss": 4.3564,
"step": 487
},
{
"epoch": 0.7218934911242604,
"grad_norm": 4.302040100097656,
"learning_rate": 5.591715976331361e-05,
"loss": 4.4839,
"step": 488
},
{
"epoch": 0.7233727810650887,
"grad_norm": 3.120237112045288,
"learning_rate": 5.562130177514793e-05,
"loss": 4.335,
"step": 489
},
{
"epoch": 0.7248520710059172,
"grad_norm": 3.0357606410980225,
"learning_rate": 5.532544378698226e-05,
"loss": 4.6092,
"step": 490
},
{
"epoch": 0.7263313609467456,
"grad_norm": 3.9619128704071045,
"learning_rate": 5.502958579881658e-05,
"loss": 4.5274,
"step": 491
},
{
"epoch": 0.727810650887574,
"grad_norm": 3.0399205684661865,
"learning_rate": 5.473372781065089e-05,
"loss": 4.3014,
"step": 492
},
{
"epoch": 0.7292899408284024,
"grad_norm": 2.8128726482391357,
"learning_rate": 5.443786982248521e-05,
"loss": 4.6141,
"step": 493
},
{
"epoch": 0.7307692307692307,
"grad_norm": 4.735112190246582,
"learning_rate": 5.4142011834319526e-05,
"loss": 4.8622,
"step": 494
},
{
"epoch": 0.7322485207100592,
"grad_norm": 5.150043487548828,
"learning_rate": 5.384615384615385e-05,
"loss": 4.9381,
"step": 495
},
{
"epoch": 0.7337278106508875,
"grad_norm": 3.8757338523864746,
"learning_rate": 5.355029585798817e-05,
"loss": 4.6672,
"step": 496
},
{
"epoch": 0.735207100591716,
"grad_norm": 3.1939332485198975,
"learning_rate": 5.3254437869822495e-05,
"loss": 4.2007,
"step": 497
},
{
"epoch": 0.7366863905325444,
"grad_norm": 2.566030979156494,
"learning_rate": 5.295857988165681e-05,
"loss": 4.8573,
"step": 498
},
{
"epoch": 0.7381656804733728,
"grad_norm": 4.250690460205078,
"learning_rate": 5.266272189349113e-05,
"loss": 5.2471,
"step": 499
},
{
"epoch": 0.7396449704142012,
"grad_norm": 3.5470082759857178,
"learning_rate": 5.236686390532545e-05,
"loss": 5.0067,
"step": 500
},
{
"epoch": 0.7411242603550295,
"grad_norm": 3.6443684101104736,
"learning_rate": 5.2071005917159764e-05,
"loss": 4.5634,
"step": 501
},
{
"epoch": 0.742603550295858,
"grad_norm": 3.061985492706299,
"learning_rate": 5.1775147928994085e-05,
"loss": 4.5597,
"step": 502
},
{
"epoch": 0.7440828402366864,
"grad_norm": 4.4746623039245605,
"learning_rate": 5.14792899408284e-05,
"loss": 4.298,
"step": 503
},
{
"epoch": 0.7455621301775148,
"grad_norm": 3.3475050926208496,
"learning_rate": 5.118343195266272e-05,
"loss": 4.733,
"step": 504
},
{
"epoch": 0.7470414201183432,
"grad_norm": 2.60097074508667,
"learning_rate": 5.088757396449705e-05,
"loss": 4.5727,
"step": 505
},
{
"epoch": 0.7485207100591716,
"grad_norm": 3.4892466068267822,
"learning_rate": 5.059171597633137e-05,
"loss": 4.8583,
"step": 506
},
{
"epoch": 0.75,
"grad_norm": 3.30724835395813,
"learning_rate": 5.029585798816568e-05,
"loss": 4.3811,
"step": 507
},
{
"epoch": 0.7514792899408284,
"grad_norm": 2.1683871746063232,
"learning_rate": 5e-05,
"loss": 4.5991,
"step": 508
},
{
"epoch": 0.7529585798816568,
"grad_norm": 2.1917285919189453,
"learning_rate": 4.970414201183432e-05,
"loss": 4.8409,
"step": 509
},
{
"epoch": 0.7544378698224852,
"grad_norm": 3.028266668319702,
"learning_rate": 4.9408284023668644e-05,
"loss": 4.5322,
"step": 510
},
{
"epoch": 0.7559171597633136,
"grad_norm": 3.031081438064575,
"learning_rate": 4.9112426035502965e-05,
"loss": 4.7952,
"step": 511
},
{
"epoch": 0.757396449704142,
"grad_norm": 3.272369861602783,
"learning_rate": 4.881656804733728e-05,
"loss": 4.3832,
"step": 512
},
{
"epoch": 0.7588757396449705,
"grad_norm": 4.823290824890137,
"learning_rate": 4.85207100591716e-05,
"loss": 4.6106,
"step": 513
},
{
"epoch": 0.7603550295857988,
"grad_norm": 2.7338478565216064,
"learning_rate": 4.822485207100592e-05,
"loss": 4.5112,
"step": 514
},
{
"epoch": 0.7618343195266272,
"grad_norm": 2.4823102951049805,
"learning_rate": 4.792899408284024e-05,
"loss": 4.6146,
"step": 515
},
{
"epoch": 0.7633136094674556,
"grad_norm": 3.007741689682007,
"learning_rate": 4.7633136094674555e-05,
"loss": 4.6655,
"step": 516
},
{
"epoch": 0.764792899408284,
"grad_norm": 4.28762674331665,
"learning_rate": 4.7337278106508875e-05,
"loss": 4.3793,
"step": 517
},
{
"epoch": 0.7662721893491125,
"grad_norm": 3.1191656589508057,
"learning_rate": 4.7041420118343196e-05,
"loss": 4.4016,
"step": 518
},
{
"epoch": 0.7677514792899408,
"grad_norm": 3.467637062072754,
"learning_rate": 4.674556213017752e-05,
"loss": 4.7173,
"step": 519
},
{
"epoch": 0.7692307692307693,
"grad_norm": 3.6230361461639404,
"learning_rate": 4.644970414201184e-05,
"loss": 4.5995,
"step": 520
},
{
"epoch": 0.7707100591715976,
"grad_norm": 3.44942045211792,
"learning_rate": 4.615384615384616e-05,
"loss": 4.6219,
"step": 521
},
{
"epoch": 0.772189349112426,
"grad_norm": 3.29604172706604,
"learning_rate": 4.585798816568048e-05,
"loss": 4.094,
"step": 522
},
{
"epoch": 0.7736686390532544,
"grad_norm": 2.802762031555176,
"learning_rate": 4.556213017751479e-05,
"loss": 4.7087,
"step": 523
},
{
"epoch": 0.7751479289940828,
"grad_norm": 4.227675437927246,
"learning_rate": 4.5266272189349114e-05,
"loss": 4.3078,
"step": 524
},
{
"epoch": 0.7766272189349113,
"grad_norm": 3.0525543689727783,
"learning_rate": 4.4970414201183434e-05,
"loss": 4.7471,
"step": 525
},
{
"epoch": 0.7781065088757396,
"grad_norm": 3.044851064682007,
"learning_rate": 4.4674556213017755e-05,
"loss": 4.9371,
"step": 526
},
{
"epoch": 0.7795857988165681,
"grad_norm": 3.847062826156616,
"learning_rate": 4.437869822485207e-05,
"loss": 4.3949,
"step": 527
},
{
"epoch": 0.7810650887573964,
"grad_norm": 2.669889211654663,
"learning_rate": 4.408284023668639e-05,
"loss": 4.5557,
"step": 528
},
{
"epoch": 0.7825443786982249,
"grad_norm": 2.970334768295288,
"learning_rate": 4.378698224852072e-05,
"loss": 4.5767,
"step": 529
},
{
"epoch": 0.7840236686390533,
"grad_norm": 2.5080573558807373,
"learning_rate": 4.349112426035503e-05,
"loss": 4.5653,
"step": 530
},
{
"epoch": 0.7855029585798816,
"grad_norm": 2.9842259883880615,
"learning_rate": 4.319526627218935e-05,
"loss": 4.7996,
"step": 531
},
{
"epoch": 0.7869822485207101,
"grad_norm": 3.3907594680786133,
"learning_rate": 4.289940828402367e-05,
"loss": 4.9064,
"step": 532
},
{
"epoch": 0.7884615384615384,
"grad_norm": 4.384812355041504,
"learning_rate": 4.260355029585799e-05,
"loss": 4.2071,
"step": 533
},
{
"epoch": 0.7899408284023669,
"grad_norm": 2.564908266067505,
"learning_rate": 4.230769230769231e-05,
"loss": 4.5635,
"step": 534
},
{
"epoch": 0.7914201183431953,
"grad_norm": 4.1615753173828125,
"learning_rate": 4.201183431952663e-05,
"loss": 4.3675,
"step": 535
},
{
"epoch": 0.7928994082840237,
"grad_norm": 3.0495238304138184,
"learning_rate": 4.171597633136095e-05,
"loss": 4.3898,
"step": 536
},
{
"epoch": 0.7943786982248521,
"grad_norm": 4.264047145843506,
"learning_rate": 4.142011834319527e-05,
"loss": 4.6589,
"step": 537
},
{
"epoch": 0.7958579881656804,
"grad_norm": 2.76383113861084,
"learning_rate": 4.112426035502959e-05,
"loss": 4.6825,
"step": 538
},
{
"epoch": 0.7973372781065089,
"grad_norm": 4.031206130981445,
"learning_rate": 4.0828402366863904e-05,
"loss": 5.0627,
"step": 539
},
{
"epoch": 0.7988165680473372,
"grad_norm": 4.084919452667236,
"learning_rate": 4.053254437869823e-05,
"loss": 4.1003,
"step": 540
},
{
"epoch": 0.8002958579881657,
"grad_norm": 2.7242236137390137,
"learning_rate": 4.0236686390532545e-05,
"loss": 5.074,
"step": 541
},
{
"epoch": 0.8017751479289941,
"grad_norm": 3.6945011615753174,
"learning_rate": 3.9940828402366866e-05,
"loss": 4.3444,
"step": 542
},
{
"epoch": 0.8032544378698225,
"grad_norm": 2.436042308807373,
"learning_rate": 3.964497041420119e-05,
"loss": 4.4678,
"step": 543
},
{
"epoch": 0.8047337278106509,
"grad_norm": 3.4951744079589844,
"learning_rate": 3.934911242603551e-05,
"loss": 4.4405,
"step": 544
},
{
"epoch": 0.8062130177514792,
"grad_norm": 3.391136646270752,
"learning_rate": 3.905325443786982e-05,
"loss": 4.7685,
"step": 545
},
{
"epoch": 0.8076923076923077,
"grad_norm": 3.002776861190796,
"learning_rate": 3.875739644970414e-05,
"loss": 4.754,
"step": 546
},
{
"epoch": 0.8091715976331361,
"grad_norm": 2.6767425537109375,
"learning_rate": 3.846153846153846e-05,
"loss": 4.8214,
"step": 547
},
{
"epoch": 0.8106508875739645,
"grad_norm": 4.293323516845703,
"learning_rate": 3.8165680473372784e-05,
"loss": 4.7054,
"step": 548
},
{
"epoch": 0.8121301775147929,
"grad_norm": 3.0231809616088867,
"learning_rate": 3.7869822485207104e-05,
"loss": 4.3928,
"step": 549
},
{
"epoch": 0.8136094674556213,
"grad_norm": 2.637932300567627,
"learning_rate": 3.757396449704142e-05,
"loss": 4.6446,
"step": 550
},
{
"epoch": 0.8150887573964497,
"grad_norm": 6.3277974128723145,
"learning_rate": 3.7278106508875746e-05,
"loss": 4.3953,
"step": 551
},
{
"epoch": 0.8165680473372781,
"grad_norm": 2.680094003677368,
"learning_rate": 3.698224852071006e-05,
"loss": 4.3732,
"step": 552
},
{
"epoch": 0.8180473372781065,
"grad_norm": 1.9856054782867432,
"learning_rate": 3.668639053254438e-05,
"loss": 4.6402,
"step": 553
},
{
"epoch": 0.8195266272189349,
"grad_norm": 2.212771415710449,
"learning_rate": 3.63905325443787e-05,
"loss": 4.7445,
"step": 554
},
{
"epoch": 0.8210059171597633,
"grad_norm": 2.5814425945281982,
"learning_rate": 3.609467455621302e-05,
"loss": 4.5684,
"step": 555
},
{
"epoch": 0.8224852071005917,
"grad_norm": 3.313979387283325,
"learning_rate": 3.5798816568047336e-05,
"loss": 4.6471,
"step": 556
},
{
"epoch": 0.8239644970414202,
"grad_norm": 2.2291247844696045,
"learning_rate": 3.5502958579881656e-05,
"loss": 4.77,
"step": 557
},
{
"epoch": 0.8254437869822485,
"grad_norm": 3.7490108013153076,
"learning_rate": 3.520710059171598e-05,
"loss": 4.8479,
"step": 558
},
{
"epoch": 0.8269230769230769,
"grad_norm": 2.6882386207580566,
"learning_rate": 3.49112426035503e-05,
"loss": 5.0244,
"step": 559
},
{
"epoch": 0.8284023668639053,
"grad_norm": 3.1564576625823975,
"learning_rate": 3.461538461538462e-05,
"loss": 4.6782,
"step": 560
},
{
"epoch": 0.8298816568047337,
"grad_norm": 2.728583335876465,
"learning_rate": 3.431952662721893e-05,
"loss": 4.8741,
"step": 561
},
{
"epoch": 0.8313609467455622,
"grad_norm": 3.116046190261841,
"learning_rate": 3.402366863905326e-05,
"loss": 4.6757,
"step": 562
},
{
"epoch": 0.8328402366863905,
"grad_norm": 3.0283756256103516,
"learning_rate": 3.3727810650887574e-05,
"loss": 4.7774,
"step": 563
},
{
"epoch": 0.834319526627219,
"grad_norm": 4.741537094116211,
"learning_rate": 3.3431952662721895e-05,
"loss": 5.1097,
"step": 564
},
{
"epoch": 0.8357988165680473,
"grad_norm": 3.0617265701293945,
"learning_rate": 3.3136094674556215e-05,
"loss": 4.6326,
"step": 565
},
{
"epoch": 0.8372781065088757,
"grad_norm": 3.294005870819092,
"learning_rate": 3.2840236686390536e-05,
"loss": 4.4754,
"step": 566
},
{
"epoch": 0.8387573964497042,
"grad_norm": 3.0990583896636963,
"learning_rate": 3.254437869822485e-05,
"loss": 5.0362,
"step": 567
},
{
"epoch": 0.8402366863905325,
"grad_norm": 3.523688316345215,
"learning_rate": 3.224852071005917e-05,
"loss": 5.065,
"step": 568
},
{
"epoch": 0.841715976331361,
"grad_norm": 2.907668113708496,
"learning_rate": 3.195266272189349e-05,
"loss": 4.6707,
"step": 569
},
{
"epoch": 0.8431952662721893,
"grad_norm": 2.66795015335083,
"learning_rate": 3.165680473372781e-05,
"loss": 4.7274,
"step": 570
},
{
"epoch": 0.8446745562130178,
"grad_norm": 3.600651502609253,
"learning_rate": 3.136094674556213e-05,
"loss": 4.7253,
"step": 571
},
{
"epoch": 0.8461538461538461,
"grad_norm": 2.160475730895996,
"learning_rate": 3.106508875739645e-05,
"loss": 4.7706,
"step": 572
},
{
"epoch": 0.8476331360946746,
"grad_norm": 2.354764223098755,
"learning_rate": 3.0769230769230774e-05,
"loss": 4.4115,
"step": 573
},
{
"epoch": 0.849112426035503,
"grad_norm": 3.3605852127075195,
"learning_rate": 3.047337278106509e-05,
"loss": 4.5752,
"step": 574
},
{
"epoch": 0.8505917159763313,
"grad_norm": 2.4779610633850098,
"learning_rate": 3.017751479289941e-05,
"loss": 4.4164,
"step": 575
},
{
"epoch": 0.8520710059171598,
"grad_norm": 4.133167266845703,
"learning_rate": 2.9881656804733733e-05,
"loss": 4.7699,
"step": 576
},
{
"epoch": 0.8535502958579881,
"grad_norm": 3.904437303543091,
"learning_rate": 2.958579881656805e-05,
"loss": 4.8485,
"step": 577
},
{
"epoch": 0.8550295857988166,
"grad_norm": 2.1680612564086914,
"learning_rate": 2.9289940828402368e-05,
"loss": 4.5462,
"step": 578
},
{
"epoch": 0.856508875739645,
"grad_norm": 2.6322898864746094,
"learning_rate": 2.8994082840236685e-05,
"loss": 4.5086,
"step": 579
},
{
"epoch": 0.8579881656804734,
"grad_norm": 3.004564046859741,
"learning_rate": 2.869822485207101e-05,
"loss": 4.2941,
"step": 580
},
{
"epoch": 0.8594674556213018,
"grad_norm": 3.2100727558135986,
"learning_rate": 2.8402366863905327e-05,
"loss": 4.918,
"step": 581
},
{
"epoch": 0.8609467455621301,
"grad_norm": 3.243868112564087,
"learning_rate": 2.8106508875739644e-05,
"loss": 4.8079,
"step": 582
},
{
"epoch": 0.8624260355029586,
"grad_norm": 1.9000004529953003,
"learning_rate": 2.7810650887573965e-05,
"loss": 4.6606,
"step": 583
},
{
"epoch": 0.863905325443787,
"grad_norm": 4.0184502601623535,
"learning_rate": 2.751479289940829e-05,
"loss": 4.206,
"step": 584
},
{
"epoch": 0.8653846153846154,
"grad_norm": 3.3465614318847656,
"learning_rate": 2.7218934911242606e-05,
"loss": 4.6692,
"step": 585
},
{
"epoch": 0.8668639053254438,
"grad_norm": 3.0921947956085205,
"learning_rate": 2.6923076923076923e-05,
"loss": 4.3901,
"step": 586
},
{
"epoch": 0.8683431952662722,
"grad_norm": 4.194738864898682,
"learning_rate": 2.6627218934911247e-05,
"loss": 4.6715,
"step": 587
},
{
"epoch": 0.8698224852071006,
"grad_norm": 4.712646484375,
"learning_rate": 2.6331360946745565e-05,
"loss": 4.2221,
"step": 588
},
{
"epoch": 0.871301775147929,
"grad_norm": 4.114252090454102,
"learning_rate": 2.6035502958579882e-05,
"loss": 4.3563,
"step": 589
},
{
"epoch": 0.8727810650887574,
"grad_norm": 3.0066916942596436,
"learning_rate": 2.57396449704142e-05,
"loss": 4.4031,
"step": 590
},
{
"epoch": 0.8742603550295858,
"grad_norm": 4.690349102020264,
"learning_rate": 2.5443786982248524e-05,
"loss": 4.2049,
"step": 591
},
{
"epoch": 0.8757396449704142,
"grad_norm": 2.729142904281616,
"learning_rate": 2.514792899408284e-05,
"loss": 4.3712,
"step": 592
},
{
"epoch": 0.8772189349112426,
"grad_norm": 3.8181116580963135,
"learning_rate": 2.485207100591716e-05,
"loss": 4.8115,
"step": 593
},
{
"epoch": 0.878698224852071,
"grad_norm": 4.084876537322998,
"learning_rate": 2.4556213017751482e-05,
"loss": 4.778,
"step": 594
},
{
"epoch": 0.8801775147928994,
"grad_norm": 2.857482671737671,
"learning_rate": 2.42603550295858e-05,
"loss": 4.3799,
"step": 595
},
{
"epoch": 0.8816568047337278,
"grad_norm": 4.242053985595703,
"learning_rate": 2.396449704142012e-05,
"loss": 4.3937,
"step": 596
},
{
"epoch": 0.8831360946745562,
"grad_norm": 2.85383677482605,
"learning_rate": 2.3668639053254438e-05,
"loss": 4.1794,
"step": 597
},
{
"epoch": 0.8846153846153846,
"grad_norm": 2.333695411682129,
"learning_rate": 2.337278106508876e-05,
"loss": 4.6517,
"step": 598
},
{
"epoch": 0.886094674556213,
"grad_norm": 2.6515889167785645,
"learning_rate": 2.307692307692308e-05,
"loss": 4.4109,
"step": 599
},
{
"epoch": 0.8875739644970414,
"grad_norm": 3.1488544940948486,
"learning_rate": 2.2781065088757396e-05,
"loss": 4.4439,
"step": 600
},
{
"epoch": 0.8890532544378699,
"grad_norm": 2.4748241901397705,
"learning_rate": 2.2485207100591717e-05,
"loss": 4.4223,
"step": 601
},
{
"epoch": 0.8905325443786982,
"grad_norm": 2.4698967933654785,
"learning_rate": 2.2189349112426034e-05,
"loss": 4.5374,
"step": 602
},
{
"epoch": 0.8920118343195266,
"grad_norm": 4.023700714111328,
"learning_rate": 2.189349112426036e-05,
"loss": 4.6281,
"step": 603
},
{
"epoch": 0.893491124260355,
"grad_norm": 2.4827523231506348,
"learning_rate": 2.1597633136094676e-05,
"loss": 4.6307,
"step": 604
},
{
"epoch": 0.8949704142011834,
"grad_norm": 3.7092514038085938,
"learning_rate": 2.1301775147928997e-05,
"loss": 4.9738,
"step": 605
},
{
"epoch": 0.8964497041420119,
"grad_norm": 3.7633941173553467,
"learning_rate": 2.1005917159763314e-05,
"loss": 4.3193,
"step": 606
},
{
"epoch": 0.8979289940828402,
"grad_norm": 3.2077150344848633,
"learning_rate": 2.0710059171597635e-05,
"loss": 4.2583,
"step": 607
},
{
"epoch": 0.8994082840236687,
"grad_norm": 4.071720600128174,
"learning_rate": 2.0414201183431952e-05,
"loss": 4.3272,
"step": 608
},
{
"epoch": 0.900887573964497,
"grad_norm": 3.028972864151001,
"learning_rate": 2.0118343195266273e-05,
"loss": 4.9391,
"step": 609
},
{
"epoch": 0.9023668639053254,
"grad_norm": 2.4915459156036377,
"learning_rate": 1.9822485207100593e-05,
"loss": 4.5273,
"step": 610
},
{
"epoch": 0.9038461538461539,
"grad_norm": 3.3177757263183594,
"learning_rate": 1.952662721893491e-05,
"loss": 4.6075,
"step": 611
},
{
"epoch": 0.9053254437869822,
"grad_norm": 2.9359259605407715,
"learning_rate": 1.923076923076923e-05,
"loss": 4.4901,
"step": 612
},
{
"epoch": 0.9068047337278107,
"grad_norm": 3.027765989303589,
"learning_rate": 1.8934911242603552e-05,
"loss": 4.5698,
"step": 613
},
{
"epoch": 0.908284023668639,
"grad_norm": 3.5717012882232666,
"learning_rate": 1.8639053254437873e-05,
"loss": 4.1682,
"step": 614
},
{
"epoch": 0.9097633136094675,
"grad_norm": 3.882838487625122,
"learning_rate": 1.834319526627219e-05,
"loss": 4.551,
"step": 615
},
{
"epoch": 0.9112426035502958,
"grad_norm": 4.922489166259766,
"learning_rate": 1.804733727810651e-05,
"loss": 4.3742,
"step": 616
},
{
"epoch": 0.9127218934911243,
"grad_norm": 2.828356981277466,
"learning_rate": 1.7751479289940828e-05,
"loss": 4.8154,
"step": 617
},
{
"epoch": 0.9142011834319527,
"grad_norm": 2.8683178424835205,
"learning_rate": 1.745562130177515e-05,
"loss": 4.512,
"step": 618
},
{
"epoch": 0.915680473372781,
"grad_norm": 2.6329286098480225,
"learning_rate": 1.7159763313609466e-05,
"loss": 4.414,
"step": 619
},
{
"epoch": 0.9171597633136095,
"grad_norm": 4.237077236175537,
"learning_rate": 1.6863905325443787e-05,
"loss": 4.7751,
"step": 620
},
{
"epoch": 0.9186390532544378,
"grad_norm": 2.7668392658233643,
"learning_rate": 1.6568047337278108e-05,
"loss": 4.4033,
"step": 621
},
{
"epoch": 0.9201183431952663,
"grad_norm": 3.169762134552002,
"learning_rate": 1.6272189349112425e-05,
"loss": 4.5085,
"step": 622
},
{
"epoch": 0.9215976331360947,
"grad_norm": 4.470832824707031,
"learning_rate": 1.5976331360946746e-05,
"loss": 4.3483,
"step": 623
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.081108808517456,
"learning_rate": 1.5680473372781066e-05,
"loss": 4.7229,
"step": 624
},
{
"epoch": 0.9245562130177515,
"grad_norm": 2.9792003631591797,
"learning_rate": 1.5384615384615387e-05,
"loss": 4.4489,
"step": 625
},
{
"epoch": 0.9260355029585798,
"grad_norm": 3.471381902694702,
"learning_rate": 1.5088757396449705e-05,
"loss": 4.3747,
"step": 626
},
{
"epoch": 0.9275147928994083,
"grad_norm": 2.503862142562866,
"learning_rate": 1.4792899408284025e-05,
"loss": 4.3829,
"step": 627
},
{
"epoch": 0.9289940828402367,
"grad_norm": 3.348893165588379,
"learning_rate": 1.4497041420118343e-05,
"loss": 4.5536,
"step": 628
},
{
"epoch": 0.9304733727810651,
"grad_norm": 2.7751362323760986,
"learning_rate": 1.4201183431952663e-05,
"loss": 4.8366,
"step": 629
},
{
"epoch": 0.9319526627218935,
"grad_norm": 3.739258289337158,
"learning_rate": 1.3905325443786982e-05,
"loss": 5.409,
"step": 630
},
{
"epoch": 0.9334319526627219,
"grad_norm": 2.438932180404663,
"learning_rate": 1.3609467455621303e-05,
"loss": 5.0738,
"step": 631
},
{
"epoch": 0.9349112426035503,
"grad_norm": 3.888420820236206,
"learning_rate": 1.3313609467455624e-05,
"loss": 4.4,
"step": 632
},
{
"epoch": 0.9363905325443787,
"grad_norm": 3.376814842224121,
"learning_rate": 1.3017751479289941e-05,
"loss": 4.7996,
"step": 633
},
{
"epoch": 0.9378698224852071,
"grad_norm": 2.792365550994873,
"learning_rate": 1.2721893491124262e-05,
"loss": 4.5187,
"step": 634
},
{
"epoch": 0.9393491124260355,
"grad_norm": 2.4097423553466797,
"learning_rate": 1.242603550295858e-05,
"loss": 4.6622,
"step": 635
},
{
"epoch": 0.9408284023668639,
"grad_norm": 2.6368215084075928,
"learning_rate": 1.21301775147929e-05,
"loss": 4.6149,
"step": 636
},
{
"epoch": 0.9423076923076923,
"grad_norm": 3.4069018363952637,
"learning_rate": 1.1834319526627219e-05,
"loss": 4.1836,
"step": 637
},
{
"epoch": 0.9437869822485208,
"grad_norm": 3.1973531246185303,
"learning_rate": 1.153846153846154e-05,
"loss": 4.5128,
"step": 638
},
{
"epoch": 0.9452662721893491,
"grad_norm": 3.2625622749328613,
"learning_rate": 1.1242603550295859e-05,
"loss": 4.4204,
"step": 639
},
{
"epoch": 0.9467455621301775,
"grad_norm": 4.215187072753906,
"learning_rate": 1.094674556213018e-05,
"loss": 4.8363,
"step": 640
},
{
"epoch": 0.9482248520710059,
"grad_norm": 3.0874249935150146,
"learning_rate": 1.0650887573964498e-05,
"loss": 4.5997,
"step": 641
},
{
"epoch": 0.9497041420118343,
"grad_norm": 3.4833452701568604,
"learning_rate": 1.0355029585798817e-05,
"loss": 4.6514,
"step": 642
},
{
"epoch": 0.9511834319526628,
"grad_norm": 2.5931572914123535,
"learning_rate": 1.0059171597633136e-05,
"loss": 4.5665,
"step": 643
},
{
"epoch": 0.9526627218934911,
"grad_norm": 4.655201435089111,
"learning_rate": 9.763313609467455e-06,
"loss": 4.442,
"step": 644
},
{
"epoch": 0.9541420118343196,
"grad_norm": 4.815556526184082,
"learning_rate": 9.467455621301776e-06,
"loss": 4.4401,
"step": 645
},
{
"epoch": 0.9556213017751479,
"grad_norm": 2.570767641067505,
"learning_rate": 9.171597633136095e-06,
"loss": 4.4435,
"step": 646
},
{
"epoch": 0.9571005917159763,
"grad_norm": 3.6513237953186035,
"learning_rate": 8.875739644970414e-06,
"loss": 4.1544,
"step": 647
},
{
"epoch": 0.9585798816568047,
"grad_norm": 2.992757797241211,
"learning_rate": 8.579881656804733e-06,
"loss": 4.5563,
"step": 648
},
{
"epoch": 0.9600591715976331,
"grad_norm": 3.3865644931793213,
"learning_rate": 8.284023668639054e-06,
"loss": 4.5594,
"step": 649
},
{
"epoch": 0.9615384615384616,
"grad_norm": 3.6119096279144287,
"learning_rate": 7.988165680473373e-06,
"loss": 4.6004,
"step": 650
},
{
"epoch": 0.9630177514792899,
"grad_norm": 2.800621509552002,
"learning_rate": 7.692307692307694e-06,
"loss": 4.6929,
"step": 651
},
{
"epoch": 0.9644970414201184,
"grad_norm": 3.412346601486206,
"learning_rate": 7.396449704142013e-06,
"loss": 4.5545,
"step": 652
},
{
"epoch": 0.9659763313609467,
"grad_norm": 3.1978702545166016,
"learning_rate": 7.100591715976332e-06,
"loss": 4.8381,
"step": 653
},
{
"epoch": 0.9674556213017751,
"grad_norm": 2.437225341796875,
"learning_rate": 6.8047337278106515e-06,
"loss": 4.8636,
"step": 654
},
{
"epoch": 0.9689349112426036,
"grad_norm": 2.7770652770996094,
"learning_rate": 6.5088757396449705e-06,
"loss": 4.4207,
"step": 655
},
{
"epoch": 0.9704142011834319,
"grad_norm": 4.442773342132568,
"learning_rate": 6.21301775147929e-06,
"loss": 4.2258,
"step": 656
},
{
"epoch": 0.9718934911242604,
"grad_norm": 3.711191177368164,
"learning_rate": 5.917159763313609e-06,
"loss": 4.4003,
"step": 657
},
{
"epoch": 0.9733727810650887,
"grad_norm": 3.1590025424957275,
"learning_rate": 5.621301775147929e-06,
"loss": 4.499,
"step": 658
},
{
"epoch": 0.9748520710059172,
"grad_norm": 2.811072587966919,
"learning_rate": 5.325443786982249e-06,
"loss": 4.9061,
"step": 659
},
{
"epoch": 0.9763313609467456,
"grad_norm": 3.6592109203338623,
"learning_rate": 5.029585798816568e-06,
"loss": 4.1664,
"step": 660
},
{
"epoch": 0.977810650887574,
"grad_norm": 2.923866033554077,
"learning_rate": 4.733727810650888e-06,
"loss": 4.5202,
"step": 661
},
{
"epoch": 0.9792899408284024,
"grad_norm": 2.50335431098938,
"learning_rate": 4.437869822485207e-06,
"loss": 4.5732,
"step": 662
},
{
"epoch": 0.9807692307692307,
"grad_norm": 2.5192694664001465,
"learning_rate": 4.142011834319527e-06,
"loss": 5.3358,
"step": 663
},
{
"epoch": 0.9822485207100592,
"grad_norm": 2.9927144050598145,
"learning_rate": 3.846153846153847e-06,
"loss": 4.7768,
"step": 664
},
{
"epoch": 0.9837278106508875,
"grad_norm": 2.4072325229644775,
"learning_rate": 3.550295857988166e-06,
"loss": 4.7046,
"step": 665
},
{
"epoch": 0.985207100591716,
"grad_norm": 3.98689866065979,
"learning_rate": 3.2544378698224853e-06,
"loss": 4.1346,
"step": 666
},
{
"epoch": 0.9866863905325444,
"grad_norm": 4.064346790313721,
"learning_rate": 2.9585798816568047e-06,
"loss": 4.5276,
"step": 667
},
{
"epoch": 0.9881656804733728,
"grad_norm": 3.102790117263794,
"learning_rate": 2.6627218934911246e-06,
"loss": 4.5344,
"step": 668
},
{
"epoch": 0.9896449704142012,
"grad_norm": 3.3671998977661133,
"learning_rate": 2.366863905325444e-06,
"loss": 4.3674,
"step": 669
},
{
"epoch": 0.9911242603550295,
"grad_norm": 4.128759384155273,
"learning_rate": 2.0710059171597635e-06,
"loss": 4.7371,
"step": 670
},
{
"epoch": 0.992603550295858,
"grad_norm": 2.914989709854126,
"learning_rate": 1.775147928994083e-06,
"loss": 4.5296,
"step": 671
},
{
"epoch": 0.9940828402366864,
"grad_norm": 2.991652727127075,
"learning_rate": 1.4792899408284024e-06,
"loss": 4.6279,
"step": 672
},
{
"epoch": 0.9955621301775148,
"grad_norm": 3.1215224266052246,
"learning_rate": 1.183431952662722e-06,
"loss": 4.3423,
"step": 673
},
{
"epoch": 0.9970414201183432,
"grad_norm": 4.466518878936768,
"learning_rate": 8.875739644970415e-07,
"loss": 4.348,
"step": 674
},
{
"epoch": 0.9985207100591716,
"grad_norm": 3.410566568374634,
"learning_rate": 5.91715976331361e-07,
"loss": 4.4988,
"step": 675
},
{
"epoch": 1.0,
"grad_norm": 4.770328044891357,
"learning_rate": 2.958579881656805e-07,
"loss": 4.2684,
"step": 676
}
],
"logging_steps": 1,
"max_steps": 676,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 719100446989824.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}