{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1498,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033377837116154874,
"grad_norm": 10.97334621037521,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.0976,
"step": 5
},
{
"epoch": 0.006675567423230975,
"grad_norm": 8.500612119645204,
"learning_rate": 3e-06,
"loss": 1.0533,
"step": 10
},
{
"epoch": 0.010013351134846462,
"grad_norm": 4.4978733433398705,
"learning_rate": 4.666666666666667e-06,
"loss": 0.9037,
"step": 15
},
{
"epoch": 0.01335113484646195,
"grad_norm": 3.8887423585372596,
"learning_rate": 6.333333333333334e-06,
"loss": 0.7853,
"step": 20
},
{
"epoch": 0.016688918558077435,
"grad_norm": 3.1870823314914474,
"learning_rate": 8.000000000000001e-06,
"loss": 0.774,
"step": 25
},
{
"epoch": 0.020026702269692925,
"grad_norm": 3.6334777762827684,
"learning_rate": 9.666666666666667e-06,
"loss": 0.7773,
"step": 30
},
{
"epoch": 0.02336448598130841,
"grad_norm": 3.683322604774294,
"learning_rate": 1.1333333333333334e-05,
"loss": 0.7457,
"step": 35
},
{
"epoch": 0.0267022696929239,
"grad_norm": 3.6697978407247605,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.7103,
"step": 40
},
{
"epoch": 0.030040053404539385,
"grad_norm": 3.651275266228758,
"learning_rate": 1.4666666666666668e-05,
"loss": 0.7227,
"step": 45
},
{
"epoch": 0.03337783711615487,
"grad_norm": 3.8445471248297265,
"learning_rate": 1.6333333333333335e-05,
"loss": 0.6611,
"step": 50
},
{
"epoch": 0.036715620827770364,
"grad_norm": 3.5025746500731545,
"learning_rate": 1.8e-05,
"loss": 0.6182,
"step": 55
},
{
"epoch": 0.04005340453938585,
"grad_norm": 3.8698798018604577,
"learning_rate": 1.9666666666666666e-05,
"loss": 0.6551,
"step": 60
},
{
"epoch": 0.043391188251001335,
"grad_norm": 2.990673727539358,
"learning_rate": 2.1333333333333335e-05,
"loss": 0.6866,
"step": 65
},
{
"epoch": 0.04672897196261682,
"grad_norm": 3.2866308072265213,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.6887,
"step": 70
},
{
"epoch": 0.050066755674232306,
"grad_norm": 3.271721003686755,
"learning_rate": 2.466666666666667e-05,
"loss": 0.7327,
"step": 75
},
{
"epoch": 0.0534045393858478,
"grad_norm": 3.0753677314727743,
"learning_rate": 2.633333333333333e-05,
"loss": 0.7003,
"step": 80
},
{
"epoch": 0.056742323097463285,
"grad_norm": 2.6258441723789607,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.7144,
"step": 85
},
{
"epoch": 0.06008010680907877,
"grad_norm": 2.434906123559183,
"learning_rate": 2.9666666666666672e-05,
"loss": 0.704,
"step": 90
},
{
"epoch": 0.06341789052069426,
"grad_norm": 2.760855540008962,
"learning_rate": 3.1333333333333334e-05,
"loss": 0.7329,
"step": 95
},
{
"epoch": 0.06675567423230974,
"grad_norm": 2.215504245536271,
"learning_rate": 3.3e-05,
"loss": 0.7367,
"step": 100
},
{
"epoch": 0.07009345794392523,
"grad_norm": 2.6569341544736713,
"learning_rate": 3.466666666666667e-05,
"loss": 0.7713,
"step": 105
},
{
"epoch": 0.07343124165554073,
"grad_norm": 2.6507368382475973,
"learning_rate": 3.633333333333333e-05,
"loss": 0.7772,
"step": 110
},
{
"epoch": 0.07676902536715621,
"grad_norm": 2.1348618645661483,
"learning_rate": 3.8e-05,
"loss": 0.7712,
"step": 115
},
{
"epoch": 0.0801068090787717,
"grad_norm": 2.264768791536783,
"learning_rate": 3.966666666666667e-05,
"loss": 0.7594,
"step": 120
},
{
"epoch": 0.08344459279038718,
"grad_norm": 1.9771816174965278,
"learning_rate": 4.133333333333333e-05,
"loss": 0.7945,
"step": 125
},
{
"epoch": 0.08678237650200267,
"grad_norm": 1.8584709556275458,
"learning_rate": 4.3e-05,
"loss": 0.7918,
"step": 130
},
{
"epoch": 0.09012016021361816,
"grad_norm": 1.6443791716346257,
"learning_rate": 4.466666666666667e-05,
"loss": 0.7961,
"step": 135
},
{
"epoch": 0.09345794392523364,
"grad_norm": 1.6647164088912758,
"learning_rate": 4.633333333333333e-05,
"loss": 0.8114,
"step": 140
},
{
"epoch": 0.09679572763684913,
"grad_norm": 1.9000184800356008,
"learning_rate": 4.8e-05,
"loss": 0.8218,
"step": 145
},
{
"epoch": 0.10013351134846461,
"grad_norm": 1.5342772082491383,
"learning_rate": 4.966666666666667e-05,
"loss": 0.8185,
"step": 150
},
{
"epoch": 0.10347129506008011,
"grad_norm": 1.4750796923574772,
"learning_rate": 4.9851632047477745e-05,
"loss": 0.8223,
"step": 155
},
{
"epoch": 0.1068090787716956,
"grad_norm": 1.6148204398582195,
"learning_rate": 4.966617210682493e-05,
"loss": 0.8303,
"step": 160
},
{
"epoch": 0.11014686248331108,
"grad_norm": 1.4390159182334255,
"learning_rate": 4.948071216617211e-05,
"loss": 0.7991,
"step": 165
},
{
"epoch": 0.11348464619492657,
"grad_norm": 1.2663492608255318,
"learning_rate": 4.929525222551929e-05,
"loss": 0.8523,
"step": 170
},
{
"epoch": 0.11682242990654206,
"grad_norm": 1.5497168860322745,
"learning_rate": 4.910979228486647e-05,
"loss": 0.8384,
"step": 175
},
{
"epoch": 0.12016021361815754,
"grad_norm": 1.3218119424151125,
"learning_rate": 4.8924332344213654e-05,
"loss": 0.8334,
"step": 180
},
{
"epoch": 0.12349799732977303,
"grad_norm": 1.3314085264665803,
"learning_rate": 4.873887240356083e-05,
"loss": 0.8775,
"step": 185
},
{
"epoch": 0.1268357810413885,
"grad_norm": 1.417575113114604,
"learning_rate": 4.855341246290801e-05,
"loss": 0.8604,
"step": 190
},
{
"epoch": 0.130173564753004,
"grad_norm": 1.366705643461715,
"learning_rate": 4.8367952522255196e-05,
"loss": 0.8578,
"step": 195
},
{
"epoch": 0.13351134846461948,
"grad_norm": 1.3085988961094133,
"learning_rate": 4.818249258160238e-05,
"loss": 0.8493,
"step": 200
},
{
"epoch": 0.13684913217623498,
"grad_norm": 1.3664654247475687,
"learning_rate": 4.7997032640949556e-05,
"loss": 0.8739,
"step": 205
},
{
"epoch": 0.14018691588785046,
"grad_norm": 1.2448228793197444,
"learning_rate": 4.781157270029674e-05,
"loss": 0.8758,
"step": 210
},
{
"epoch": 0.14352469959946595,
"grad_norm": 1.2166707854427619,
"learning_rate": 4.762611275964392e-05,
"loss": 0.8621,
"step": 215
},
{
"epoch": 0.14686248331108145,
"grad_norm": 1.2528234662317728,
"learning_rate": 4.74406528189911e-05,
"loss": 0.8358,
"step": 220
},
{
"epoch": 0.15020026702269693,
"grad_norm": 1.2078820039150766,
"learning_rate": 4.725519287833828e-05,
"loss": 0.8527,
"step": 225
},
{
"epoch": 0.15353805073431243,
"grad_norm": 1.1953586214155212,
"learning_rate": 4.7069732937685464e-05,
"loss": 0.86,
"step": 230
},
{
"epoch": 0.1568758344459279,
"grad_norm": 1.239697699729331,
"learning_rate": 4.688427299703264e-05,
"loss": 0.8512,
"step": 235
},
{
"epoch": 0.1602136181575434,
"grad_norm": 1.0943965020451794,
"learning_rate": 4.6698813056379824e-05,
"loss": 0.8485,
"step": 240
},
{
"epoch": 0.16355140186915887,
"grad_norm": 1.1753697981159985,
"learning_rate": 4.651335311572701e-05,
"loss": 0.8574,
"step": 245
},
{
"epoch": 0.16688918558077437,
"grad_norm": 1.2273163170418067,
"learning_rate": 4.632789317507419e-05,
"loss": 0.8755,
"step": 250
},
{
"epoch": 0.17022696929238984,
"grad_norm": 1.1408298234921244,
"learning_rate": 4.6142433234421366e-05,
"loss": 0.8656,
"step": 255
},
{
"epoch": 0.17356475300400534,
"grad_norm": 1.099138875783624,
"learning_rate": 4.595697329376854e-05,
"loss": 0.8641,
"step": 260
},
{
"epoch": 0.17690253671562084,
"grad_norm": 1.1726455447900384,
"learning_rate": 4.577151335311573e-05,
"loss": 0.8428,
"step": 265
},
{
"epoch": 0.1802403204272363,
"grad_norm": 1.1238710518885906,
"learning_rate": 4.558605341246291e-05,
"loss": 0.8564,
"step": 270
},
{
"epoch": 0.1835781041388518,
"grad_norm": 1.2209709343561501,
"learning_rate": 4.540059347181009e-05,
"loss": 0.8841,
"step": 275
},
{
"epoch": 0.18691588785046728,
"grad_norm": 1.0719606532900603,
"learning_rate": 4.5215133531157275e-05,
"loss": 0.8564,
"step": 280
},
{
"epoch": 0.19025367156208278,
"grad_norm": 1.1632077631864237,
"learning_rate": 4.502967359050445e-05,
"loss": 0.8336,
"step": 285
},
{
"epoch": 0.19359145527369825,
"grad_norm": 1.0912246571194697,
"learning_rate": 4.4844213649851635e-05,
"loss": 0.8386,
"step": 290
},
{
"epoch": 0.19692923898531375,
"grad_norm": 1.182192263363281,
"learning_rate": 4.465875370919881e-05,
"loss": 0.8725,
"step": 295
},
{
"epoch": 0.20026702269692923,
"grad_norm": 1.1652841802413654,
"learning_rate": 4.4473293768546e-05,
"loss": 0.8673,
"step": 300
},
{
"epoch": 0.20360480640854473,
"grad_norm": 1.006954327768831,
"learning_rate": 4.428783382789318e-05,
"loss": 0.8428,
"step": 305
},
{
"epoch": 0.20694259012016022,
"grad_norm": 1.043615772924013,
"learning_rate": 4.4102373887240354e-05,
"loss": 0.8826,
"step": 310
},
{
"epoch": 0.2102803738317757,
"grad_norm": 1.0218351388644316,
"learning_rate": 4.391691394658754e-05,
"loss": 0.8604,
"step": 315
},
{
"epoch": 0.2136181575433912,
"grad_norm": 1.093123947434233,
"learning_rate": 4.373145400593472e-05,
"loss": 0.8471,
"step": 320
},
{
"epoch": 0.21695594125500667,
"grad_norm": 1.0437185361786059,
"learning_rate": 4.35459940652819e-05,
"loss": 0.8365,
"step": 325
},
{
"epoch": 0.22029372496662217,
"grad_norm": 1.1728538235708217,
"learning_rate": 4.336053412462908e-05,
"loss": 0.8456,
"step": 330
},
{
"epoch": 0.22363150867823764,
"grad_norm": 1.072793065927316,
"learning_rate": 4.317507418397626e-05,
"loss": 0.8574,
"step": 335
},
{
"epoch": 0.22696929238985314,
"grad_norm": 0.9434393974539138,
"learning_rate": 4.2989614243323446e-05,
"loss": 0.8342,
"step": 340
},
{
"epoch": 0.23030707610146864,
"grad_norm": 1.0832629194902035,
"learning_rate": 4.280415430267062e-05,
"loss": 0.8509,
"step": 345
},
{
"epoch": 0.2336448598130841,
"grad_norm": 0.9763109475541231,
"learning_rate": 4.2618694362017805e-05,
"loss": 0.8515,
"step": 350
},
{
"epoch": 0.2369826435246996,
"grad_norm": 0.9622690418923676,
"learning_rate": 4.243323442136499e-05,
"loss": 0.8451,
"step": 355
},
{
"epoch": 0.24032042723631508,
"grad_norm": 1.0311536354503212,
"learning_rate": 4.2247774480712165e-05,
"loss": 0.8793,
"step": 360
},
{
"epoch": 0.24365821094793058,
"grad_norm": 1.0616057184675154,
"learning_rate": 4.206231454005935e-05,
"loss": 0.8496,
"step": 365
},
{
"epoch": 0.24699599465954605,
"grad_norm": 0.9694776849343971,
"learning_rate": 4.187685459940653e-05,
"loss": 0.8276,
"step": 370
},
{
"epoch": 0.25033377837116155,
"grad_norm": 1.0669771488195197,
"learning_rate": 4.1691394658753714e-05,
"loss": 0.8608,
"step": 375
},
{
"epoch": 0.253671562082777,
"grad_norm": 1.0330982860266757,
"learning_rate": 4.150593471810089e-05,
"loss": 0.8626,
"step": 380
},
{
"epoch": 0.2570093457943925,
"grad_norm": 1.0894982304893939,
"learning_rate": 4.132047477744807e-05,
"loss": 0.8456,
"step": 385
},
{
"epoch": 0.260347129506008,
"grad_norm": 1.0233378411857879,
"learning_rate": 4.1135014836795256e-05,
"loss": 0.858,
"step": 390
},
{
"epoch": 0.2636849132176235,
"grad_norm": 0.9532755270759551,
"learning_rate": 4.094955489614243e-05,
"loss": 0.8325,
"step": 395
},
{
"epoch": 0.26702269692923897,
"grad_norm": 1.012705355808147,
"learning_rate": 4.0764094955489616e-05,
"loss": 0.8603,
"step": 400
},
{
"epoch": 0.2703604806408545,
"grad_norm": 1.0304787118053764,
"learning_rate": 4.05786350148368e-05,
"loss": 0.8481,
"step": 405
},
{
"epoch": 0.27369826435246997,
"grad_norm": 1.0003080869140883,
"learning_rate": 4.039317507418398e-05,
"loss": 0.841,
"step": 410
},
{
"epoch": 0.27703604806408544,
"grad_norm": 1.0009426776633654,
"learning_rate": 4.020771513353116e-05,
"loss": 0.8504,
"step": 415
},
{
"epoch": 0.2803738317757009,
"grad_norm": 0.9644210719992499,
"learning_rate": 4.002225519287834e-05,
"loss": 0.8481,
"step": 420
},
{
"epoch": 0.28371161548731644,
"grad_norm": 1.0494436578225004,
"learning_rate": 3.9836795252225525e-05,
"loss": 0.8354,
"step": 425
},
{
"epoch": 0.2870493991989319,
"grad_norm": 0.8887585557590956,
"learning_rate": 3.96513353115727e-05,
"loss": 0.83,
"step": 430
},
{
"epoch": 0.2903871829105474,
"grad_norm": 0.9747027542446707,
"learning_rate": 3.9465875370919884e-05,
"loss": 0.8307,
"step": 435
},
{
"epoch": 0.2937249666221629,
"grad_norm": 0.9383108633240661,
"learning_rate": 3.928041543026707e-05,
"loss": 0.8135,
"step": 440
},
{
"epoch": 0.2970627503337784,
"grad_norm": 1.0410990959669617,
"learning_rate": 3.9094955489614244e-05,
"loss": 0.8485,
"step": 445
},
{
"epoch": 0.30040053404539385,
"grad_norm": 0.9780902917654535,
"learning_rate": 3.890949554896143e-05,
"loss": 0.8196,
"step": 450
},
{
"epoch": 0.3037383177570093,
"grad_norm": 0.9442982752168075,
"learning_rate": 3.87240356083086e-05,
"loss": 0.808,
"step": 455
},
{
"epoch": 0.30707610146862485,
"grad_norm": 1.0040732428090156,
"learning_rate": 3.853857566765579e-05,
"loss": 0.8478,
"step": 460
},
{
"epoch": 0.3104138851802403,
"grad_norm": 0.9957954242378592,
"learning_rate": 3.835311572700297e-05,
"loss": 0.8243,
"step": 465
},
{
"epoch": 0.3137516688918558,
"grad_norm": 0.9906243125042739,
"learning_rate": 3.8167655786350146e-05,
"loss": 0.8198,
"step": 470
},
{
"epoch": 0.3170894526034713,
"grad_norm": 0.9245770524956366,
"learning_rate": 3.7982195845697336e-05,
"loss": 0.8225,
"step": 475
},
{
"epoch": 0.3204272363150868,
"grad_norm": 0.9689880972401121,
"learning_rate": 3.779673590504451e-05,
"loss": 0.8081,
"step": 480
},
{
"epoch": 0.32376502002670227,
"grad_norm": 0.9050315548245393,
"learning_rate": 3.7611275964391695e-05,
"loss": 0.8257,
"step": 485
},
{
"epoch": 0.32710280373831774,
"grad_norm": 0.965523047639,
"learning_rate": 3.742581602373887e-05,
"loss": 0.8351,
"step": 490
},
{
"epoch": 0.33044058744993327,
"grad_norm": 1.0138619406917988,
"learning_rate": 3.7240356083086054e-05,
"loss": 0.8297,
"step": 495
},
{
"epoch": 0.33377837116154874,
"grad_norm": 0.9226993749632075,
"learning_rate": 3.705489614243324e-05,
"loss": 0.822,
"step": 500
},
{
"epoch": 0.3371161548731642,
"grad_norm": 0.932421956217793,
"learning_rate": 3.6869436201780414e-05,
"loss": 0.8131,
"step": 505
},
{
"epoch": 0.3404539385847797,
"grad_norm": 0.8929781381117078,
"learning_rate": 3.6683976261127604e-05,
"loss": 0.8144,
"step": 510
},
{
"epoch": 0.3437917222963952,
"grad_norm": 0.922998838382151,
"learning_rate": 3.649851632047478e-05,
"loss": 0.8624,
"step": 515
},
{
"epoch": 0.3471295060080107,
"grad_norm": 0.93469331823402,
"learning_rate": 3.6313056379821956e-05,
"loss": 0.8259,
"step": 520
},
{
"epoch": 0.35046728971962615,
"grad_norm": 1.0259483980065804,
"learning_rate": 3.612759643916914e-05,
"loss": 0.85,
"step": 525
},
{
"epoch": 0.3538050734312417,
"grad_norm": 0.9757464996869175,
"learning_rate": 3.594213649851632e-05,
"loss": 0.8322,
"step": 530
},
{
"epoch": 0.35714285714285715,
"grad_norm": 1.0326434851152306,
"learning_rate": 3.5756676557863506e-05,
"loss": 0.8152,
"step": 535
},
{
"epoch": 0.3604806408544726,
"grad_norm": 0.8969653625459288,
"learning_rate": 3.557121661721068e-05,
"loss": 0.8342,
"step": 540
},
{
"epoch": 0.3638184245660881,
"grad_norm": 0.9685090624506036,
"learning_rate": 3.5385756676557865e-05,
"loss": 0.8454,
"step": 545
},
{
"epoch": 0.3671562082777036,
"grad_norm": 0.9464787707831517,
"learning_rate": 3.520029673590505e-05,
"loss": 0.8144,
"step": 550
},
{
"epoch": 0.3704939919893191,
"grad_norm": 0.9715120332083621,
"learning_rate": 3.5014836795252225e-05,
"loss": 0.8075,
"step": 555
},
{
"epoch": 0.37383177570093457,
"grad_norm": 1.0181655310980833,
"learning_rate": 3.482937685459941e-05,
"loss": 0.8433,
"step": 560
},
{
"epoch": 0.3771695594125501,
"grad_norm": 1.0013772700433445,
"learning_rate": 3.464391691394659e-05,
"loss": 0.8253,
"step": 565
},
{
"epoch": 0.38050734312416556,
"grad_norm": 0.9482215787610626,
"learning_rate": 3.445845697329377e-05,
"loss": 0.8195,
"step": 570
},
{
"epoch": 0.38384512683578104,
"grad_norm": 1.0461820886125337,
"learning_rate": 3.427299703264095e-05,
"loss": 0.831,
"step": 575
},
{
"epoch": 0.3871829105473965,
"grad_norm": 0.9198239920283778,
"learning_rate": 3.4087537091988134e-05,
"loss": 0.8152,
"step": 580
},
{
"epoch": 0.39052069425901204,
"grad_norm": 0.9782331163351092,
"learning_rate": 3.390207715133532e-05,
"loss": 0.8154,
"step": 585
},
{
"epoch": 0.3938584779706275,
"grad_norm": 0.9186397229393198,
"learning_rate": 3.371661721068249e-05,
"loss": 0.8153,
"step": 590
},
{
"epoch": 0.397196261682243,
"grad_norm": 0.9337443617921134,
"learning_rate": 3.3531157270029676e-05,
"loss": 0.8233,
"step": 595
},
{
"epoch": 0.40053404539385845,
"grad_norm": 0.9434322651580768,
"learning_rate": 3.334569732937686e-05,
"loss": 0.8345,
"step": 600
},
{
"epoch": 0.403871829105474,
"grad_norm": 1.0512846063850414,
"learning_rate": 3.3160237388724036e-05,
"loss": 0.8173,
"step": 605
},
{
"epoch": 0.40720961281708945,
"grad_norm": 0.9350959223867034,
"learning_rate": 3.297477744807122e-05,
"loss": 0.8372,
"step": 610
},
{
"epoch": 0.4105473965287049,
"grad_norm": 0.9197353611822743,
"learning_rate": 3.27893175074184e-05,
"loss": 0.8215,
"step": 615
},
{
"epoch": 0.41388518024032045,
"grad_norm": 0.8518070420704498,
"learning_rate": 3.260385756676558e-05,
"loss": 0.7951,
"step": 620
},
{
"epoch": 0.4172229639519359,
"grad_norm": 0.9858909592901012,
"learning_rate": 3.241839762611276e-05,
"loss": 0.8035,
"step": 625
},
{
"epoch": 0.4205607476635514,
"grad_norm": 1.071276470614738,
"learning_rate": 3.223293768545994e-05,
"loss": 0.8121,
"step": 630
},
{
"epoch": 0.42389853137516686,
"grad_norm": 0.9226349965551451,
"learning_rate": 3.204747774480713e-05,
"loss": 0.8024,
"step": 635
},
{
"epoch": 0.4272363150867824,
"grad_norm": 0.9911335494782234,
"learning_rate": 3.1862017804154304e-05,
"loss": 0.7998,
"step": 640
},
{
"epoch": 0.43057409879839786,
"grad_norm": 0.857226373613729,
"learning_rate": 3.167655786350148e-05,
"loss": 0.7985,
"step": 645
},
{
"epoch": 0.43391188251001334,
"grad_norm": 0.9012240464805917,
"learning_rate": 3.149109792284867e-05,
"loss": 0.8109,
"step": 650
},
{
"epoch": 0.43724966622162886,
"grad_norm": 0.9124837740946565,
"learning_rate": 3.1305637982195846e-05,
"loss": 0.8015,
"step": 655
},
{
"epoch": 0.44058744993324434,
"grad_norm": 0.9907278141668688,
"learning_rate": 3.112017804154303e-05,
"loss": 0.8102,
"step": 660
},
{
"epoch": 0.4439252336448598,
"grad_norm": 0.9447867252541866,
"learning_rate": 3.0934718100890206e-05,
"loss": 0.8308,
"step": 665
},
{
"epoch": 0.4472630173564753,
"grad_norm": 0.9514834392779774,
"learning_rate": 3.074925816023739e-05,
"loss": 0.8108,
"step": 670
},
{
"epoch": 0.4506008010680908,
"grad_norm": 0.8898801356986638,
"learning_rate": 3.056379821958457e-05,
"loss": 0.7952,
"step": 675
},
{
"epoch": 0.4539385847797063,
"grad_norm": 0.9121421167479317,
"learning_rate": 3.0378338278931752e-05,
"loss": 0.7766,
"step": 680
},
{
"epoch": 0.45727636849132175,
"grad_norm": 0.9206580766916015,
"learning_rate": 3.0192878338278935e-05,
"loss": 0.7976,
"step": 685
},
{
"epoch": 0.4606141522029373,
"grad_norm": 0.8875003615985043,
"learning_rate": 3.0007418397626115e-05,
"loss": 0.7965,
"step": 690
},
{
"epoch": 0.46395193591455275,
"grad_norm": 1.0057518919464419,
"learning_rate": 2.9821958456973298e-05,
"loss": 0.7745,
"step": 695
},
{
"epoch": 0.4672897196261682,
"grad_norm": 0.9890615318492613,
"learning_rate": 2.9636498516320477e-05,
"loss": 0.8021,
"step": 700
},
{
"epoch": 0.4706275033377837,
"grad_norm": 0.9958806046763854,
"learning_rate": 2.9451038575667654e-05,
"loss": 0.7948,
"step": 705
},
{
"epoch": 0.4739652870493992,
"grad_norm": 0.9427315789994021,
"learning_rate": 2.926557863501484e-05,
"loss": 0.7995,
"step": 710
},
{
"epoch": 0.4773030707610147,
"grad_norm": 0.9054405934645217,
"learning_rate": 2.908011869436202e-05,
"loss": 0.7941,
"step": 715
},
{
"epoch": 0.48064085447263016,
"grad_norm": 0.9373946350999136,
"learning_rate": 2.8894658753709203e-05,
"loss": 0.8178,
"step": 720
},
{
"epoch": 0.48397863818424564,
"grad_norm": 0.9693149989780067,
"learning_rate": 2.8709198813056383e-05,
"loss": 0.8044,
"step": 725
},
{
"epoch": 0.48731642189586116,
"grad_norm": 0.8575646876326481,
"learning_rate": 2.852373887240356e-05,
"loss": 0.789,
"step": 730
},
{
"epoch": 0.49065420560747663,
"grad_norm": 0.986208895766066,
"learning_rate": 2.8338278931750746e-05,
"loss": 0.8155,
"step": 735
},
{
"epoch": 0.4939919893190921,
"grad_norm": 0.9731059895113887,
"learning_rate": 2.8152818991097922e-05,
"loss": 0.8092,
"step": 740
},
{
"epoch": 0.49732977303070763,
"grad_norm": 1.0244020266746006,
"learning_rate": 2.796735905044511e-05,
"loss": 0.8086,
"step": 745
},
{
"epoch": 0.5006675567423231,
"grad_norm": 0.8755711588079153,
"learning_rate": 2.7781899109792285e-05,
"loss": 0.7978,
"step": 750
},
{
"epoch": 0.5040053404539386,
"grad_norm": 0.8857975744161016,
"learning_rate": 2.7596439169139465e-05,
"loss": 0.7945,
"step": 755
},
{
"epoch": 0.507343124165554,
"grad_norm": 1.0437729771562312,
"learning_rate": 2.741097922848665e-05,
"loss": 0.8,
"step": 760
},
{
"epoch": 0.5106809078771696,
"grad_norm": 0.8938452774095116,
"learning_rate": 2.7225519287833828e-05,
"loss": 0.7919,
"step": 765
},
{
"epoch": 0.514018691588785,
"grad_norm": 0.9507910351515566,
"learning_rate": 2.7040059347181014e-05,
"loss": 0.811,
"step": 770
},
{
"epoch": 0.5173564753004005,
"grad_norm": 0.9446476391640979,
"learning_rate": 2.685459940652819e-05,
"loss": 0.812,
"step": 775
},
{
"epoch": 0.520694259012016,
"grad_norm": 0.8939815937066375,
"learning_rate": 2.666913946587537e-05,
"loss": 0.7968,
"step": 780
},
{
"epoch": 0.5240320427236315,
"grad_norm": 0.9078803372635648,
"learning_rate": 2.6483679525222553e-05,
"loss": 0.816,
"step": 785
},
{
"epoch": 0.527369826435247,
"grad_norm": 0.8194365993553059,
"learning_rate": 2.6298219584569733e-05,
"loss": 0.7861,
"step": 790
},
{
"epoch": 0.5307076101468625,
"grad_norm": 0.8408501195751673,
"learning_rate": 2.6112759643916916e-05,
"loss": 0.7743,
"step": 795
},
{
"epoch": 0.5340453938584779,
"grad_norm": 0.8531884984260966,
"learning_rate": 2.5927299703264096e-05,
"loss": 0.8047,
"step": 800
},
{
"epoch": 0.5373831775700935,
"grad_norm": 0.8791442187279712,
"learning_rate": 2.5741839762611276e-05,
"loss": 0.7854,
"step": 805
},
{
"epoch": 0.540720961281709,
"grad_norm": 0.9753162027608423,
"learning_rate": 2.555637982195846e-05,
"loss": 0.8098,
"step": 810
},
{
"epoch": 0.5440587449933244,
"grad_norm": 0.8625073065074,
"learning_rate": 2.537091988130564e-05,
"loss": 0.7967,
"step": 815
},
{
"epoch": 0.5473965287049399,
"grad_norm": 0.9742653975215142,
"learning_rate": 2.518545994065282e-05,
"loss": 0.7768,
"step": 820
},
{
"epoch": 0.5507343124165555,
"grad_norm": 0.8892913090889087,
"learning_rate": 2.5e-05,
"loss": 0.7603,
"step": 825
},
{
"epoch": 0.5540720961281709,
"grad_norm": 0.8685382218468735,
"learning_rate": 2.4814540059347184e-05,
"loss": 0.8061,
"step": 830
},
{
"epoch": 0.5574098798397864,
"grad_norm": 0.8609646194613518,
"learning_rate": 2.4629080118694364e-05,
"loss": 0.8238,
"step": 835
},
{
"epoch": 0.5607476635514018,
"grad_norm": 0.9013458211054559,
"learning_rate": 2.4443620178041544e-05,
"loss": 0.7874,
"step": 840
},
{
"epoch": 0.5640854472630173,
"grad_norm": 0.9448632241585405,
"learning_rate": 2.4258160237388723e-05,
"loss": 0.777,
"step": 845
},
{
"epoch": 0.5674232309746329,
"grad_norm": 0.8652639715383189,
"learning_rate": 2.4072700296735907e-05,
"loss": 0.7781,
"step": 850
},
{
"epoch": 0.5707610146862483,
"grad_norm": 0.885349438416903,
"learning_rate": 2.3887240356083086e-05,
"loss": 0.7852,
"step": 855
},
{
"epoch": 0.5740987983978638,
"grad_norm": 0.9226549883190552,
"learning_rate": 2.370178041543027e-05,
"loss": 0.7937,
"step": 860
},
{
"epoch": 0.5774365821094793,
"grad_norm": 0.8894377701419424,
"learning_rate": 2.351632047477745e-05,
"loss": 0.7858,
"step": 865
},
{
"epoch": 0.5807743658210948,
"grad_norm": 0.9417967618419559,
"learning_rate": 2.333086053412463e-05,
"loss": 0.7721,
"step": 870
},
{
"epoch": 0.5841121495327103,
"grad_norm": 0.9222488011231172,
"learning_rate": 2.3145400593471812e-05,
"loss": 0.7511,
"step": 875
},
{
"epoch": 0.5874499332443258,
"grad_norm": 0.8993397519730585,
"learning_rate": 2.2959940652818992e-05,
"loss": 0.7827,
"step": 880
},
{
"epoch": 0.5907877169559412,
"grad_norm": 0.9136313639945539,
"learning_rate": 2.2774480712166175e-05,
"loss": 0.7714,
"step": 885
},
{
"epoch": 0.5941255006675568,
"grad_norm": 0.9199729414745823,
"learning_rate": 2.258902077151335e-05,
"loss": 0.7761,
"step": 890
},
{
"epoch": 0.5974632843791722,
"grad_norm": 0.8409864114208272,
"learning_rate": 2.2403560830860534e-05,
"loss": 0.7758,
"step": 895
},
{
"epoch": 0.6008010680907877,
"grad_norm": 0.8927435513620092,
"learning_rate": 2.2218100890207717e-05,
"loss": 0.8088,
"step": 900
},
{
"epoch": 0.6041388518024032,
"grad_norm": 1.0111242127600466,
"learning_rate": 2.2032640949554897e-05,
"loss": 0.787,
"step": 905
},
{
"epoch": 0.6074766355140186,
"grad_norm": 0.9750007023233266,
"learning_rate": 2.184718100890208e-05,
"loss": 0.7951,
"step": 910
},
{
"epoch": 0.6108144192256342,
"grad_norm": 0.8947817876635858,
"learning_rate": 2.166172106824926e-05,
"loss": 0.7718,
"step": 915
},
{
"epoch": 0.6141522029372497,
"grad_norm": 0.8937079235831037,
"learning_rate": 2.147626112759644e-05,
"loss": 0.7653,
"step": 920
},
{
"epoch": 0.6174899866488651,
"grad_norm": 0.9633412553738314,
"learning_rate": 2.129080118694362e-05,
"loss": 0.7588,
"step": 925
},
{
"epoch": 0.6208277703604806,
"grad_norm": 0.9697623965265878,
"learning_rate": 2.1105341246290803e-05,
"loss": 0.7743,
"step": 930
},
{
"epoch": 0.6241655540720962,
"grad_norm": 0.9170590153248661,
"learning_rate": 2.0919881305637982e-05,
"loss": 0.7939,
"step": 935
},
{
"epoch": 0.6275033377837116,
"grad_norm": 0.8884986167305851,
"learning_rate": 2.0734421364985165e-05,
"loss": 0.7609,
"step": 940
},
{
"epoch": 0.6308411214953271,
"grad_norm": 0.9117598511296207,
"learning_rate": 2.0548961424332345e-05,
"loss": 0.7816,
"step": 945
},
{
"epoch": 0.6341789052069426,
"grad_norm": 0.9143844046049939,
"learning_rate": 2.0363501483679525e-05,
"loss": 0.7682,
"step": 950
},
{
"epoch": 0.6375166889185581,
"grad_norm": 1.001882223179808,
"learning_rate": 2.0178041543026708e-05,
"loss": 0.7859,
"step": 955
},
{
"epoch": 0.6408544726301736,
"grad_norm": 0.9515527243629021,
"learning_rate": 1.9992581602373888e-05,
"loss": 0.771,
"step": 960
},
{
"epoch": 0.644192256341789,
"grad_norm": 0.9987405390627165,
"learning_rate": 1.980712166172107e-05,
"loss": 0.7882,
"step": 965
},
{
"epoch": 0.6475300400534045,
"grad_norm": 0.902559098376266,
"learning_rate": 1.962166172106825e-05,
"loss": 0.7865,
"step": 970
},
{
"epoch": 0.6508678237650201,
"grad_norm": 0.9530475037552353,
"learning_rate": 1.943620178041543e-05,
"loss": 0.7651,
"step": 975
},
{
"epoch": 0.6542056074766355,
"grad_norm": 0.9044360545717226,
"learning_rate": 1.9250741839762613e-05,
"loss": 0.7611,
"step": 980
},
{
"epoch": 0.657543391188251,
"grad_norm": 0.8949422084811579,
"learning_rate": 1.9065281899109793e-05,
"loss": 0.7535,
"step": 985
},
{
"epoch": 0.6608811748998665,
"grad_norm": 0.9212385076203463,
"learning_rate": 1.8879821958456976e-05,
"loss": 0.7828,
"step": 990
},
{
"epoch": 0.664218958611482,
"grad_norm": 0.9685512993064703,
"learning_rate": 1.8694362017804153e-05,
"loss": 0.7598,
"step": 995
},
{
"epoch": 0.6675567423230975,
"grad_norm": 0.8371458023739065,
"learning_rate": 1.8508902077151336e-05,
"loss": 0.7593,
"step": 1000
},
{
"epoch": 0.670894526034713,
"grad_norm": 0.9561174634421302,
"learning_rate": 1.8323442136498515e-05,
"loss": 0.7918,
"step": 1005
},
{
"epoch": 0.6742323097463284,
"grad_norm": 0.988014946142732,
"learning_rate": 1.81379821958457e-05,
"loss": 0.7752,
"step": 1010
},
{
"epoch": 0.677570093457944,
"grad_norm": 0.9594260489502082,
"learning_rate": 1.795252225519288e-05,
"loss": 0.7555,
"step": 1015
},
{
"epoch": 0.6809078771695594,
"grad_norm": 0.9047734227550646,
"learning_rate": 1.7767062314540058e-05,
"loss": 0.7616,
"step": 1020
},
{
"epoch": 0.6842456608811749,
"grad_norm": 0.8663553522012484,
"learning_rate": 1.758160237388724e-05,
"loss": 0.7796,
"step": 1025
},
{
"epoch": 0.6875834445927904,
"grad_norm": 0.921325208555024,
"learning_rate": 1.739614243323442e-05,
"loss": 0.7641,
"step": 1030
},
{
"epoch": 0.6909212283044058,
"grad_norm": 0.8876679055429972,
"learning_rate": 1.7210682492581604e-05,
"loss": 0.7676,
"step": 1035
},
{
"epoch": 0.6942590120160214,
"grad_norm": 0.8958203531086095,
"learning_rate": 1.7025222551928784e-05,
"loss": 0.7444,
"step": 1040
},
{
"epoch": 0.6975967957276369,
"grad_norm": 0.8697101652022063,
"learning_rate": 1.6839762611275967e-05,
"loss": 0.7994,
"step": 1045
},
{
"epoch": 0.7009345794392523,
"grad_norm": 1.088915969712606,
"learning_rate": 1.6654302670623147e-05,
"loss": 0.7664,
"step": 1050
},
{
"epoch": 0.7042723631508678,
"grad_norm": 0.9166954419190961,
"learning_rate": 1.6468842729970326e-05,
"loss": 0.7576,
"step": 1055
},
{
"epoch": 0.7076101468624834,
"grad_norm": 1.0111723788145828,
"learning_rate": 1.628338278931751e-05,
"loss": 0.7437,
"step": 1060
},
{
"epoch": 0.7109479305740988,
"grad_norm": 0.8935454904692272,
"learning_rate": 1.609792284866469e-05,
"loss": 0.7651,
"step": 1065
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.9796810043507851,
"learning_rate": 1.5912462908011872e-05,
"loss": 0.7453,
"step": 1070
},
{
"epoch": 0.7176234979973297,
"grad_norm": 0.9497605242721375,
"learning_rate": 1.572700296735905e-05,
"loss": 0.7518,
"step": 1075
},
{
"epoch": 0.7209612817089452,
"grad_norm": 0.9261889971154708,
"learning_rate": 1.5541543026706232e-05,
"loss": 0.7509,
"step": 1080
},
{
"epoch": 0.7242990654205608,
"grad_norm": 0.9221518501402585,
"learning_rate": 1.5356083086053415e-05,
"loss": 0.7383,
"step": 1085
},
{
"epoch": 0.7276368491321762,
"grad_norm": 0.9905384423101319,
"learning_rate": 1.5170623145400595e-05,
"loss": 0.7651,
"step": 1090
},
{
"epoch": 0.7309746328437917,
"grad_norm": 0.9591889790233724,
"learning_rate": 1.4985163204747776e-05,
"loss": 0.7631,
"step": 1095
},
{
"epoch": 0.7343124165554072,
"grad_norm": 0.9087058012685387,
"learning_rate": 1.4799703264094956e-05,
"loss": 0.7535,
"step": 1100
},
{
"epoch": 0.7376502002670227,
"grad_norm": 0.8692391342259392,
"learning_rate": 1.4614243323442137e-05,
"loss": 0.7374,
"step": 1105
},
{
"epoch": 0.7409879839786382,
"grad_norm": 0.9298588940493733,
"learning_rate": 1.4428783382789319e-05,
"loss": 0.7423,
"step": 1110
},
{
"epoch": 0.7443257676902537,
"grad_norm": 0.9381934382110304,
"learning_rate": 1.42433234421365e-05,
"loss": 0.7556,
"step": 1115
},
{
"epoch": 0.7476635514018691,
"grad_norm": 0.9063091001621449,
"learning_rate": 1.4057863501483681e-05,
"loss": 0.742,
"step": 1120
},
{
"epoch": 0.7510013351134847,
"grad_norm": 0.9344982528376456,
"learning_rate": 1.387240356083086e-05,
"loss": 0.7554,
"step": 1125
},
{
"epoch": 0.7543391188251002,
"grad_norm": 0.9193075340785176,
"learning_rate": 1.3686943620178041e-05,
"loss": 0.7673,
"step": 1130
},
{
"epoch": 0.7576769025367156,
"grad_norm": 0.9336576279968588,
"learning_rate": 1.3501483679525222e-05,
"loss": 0.7023,
"step": 1135
},
{
"epoch": 0.7610146862483311,
"grad_norm": 0.9768530487657828,
"learning_rate": 1.3316023738872405e-05,
"loss": 0.7782,
"step": 1140
},
{
"epoch": 0.7643524699599465,
"grad_norm": 0.9679956607339216,
"learning_rate": 1.3130563798219587e-05,
"loss": 0.763,
"step": 1145
},
{
"epoch": 0.7676902536715621,
"grad_norm": 0.869544797745516,
"learning_rate": 1.2945103857566765e-05,
"loss": 0.7654,
"step": 1150
},
{
"epoch": 0.7710280373831776,
"grad_norm": 0.985241850392532,
"learning_rate": 1.2759643916913946e-05,
"loss": 0.7699,
"step": 1155
},
{
"epoch": 0.774365821094793,
"grad_norm": 0.9549632229431664,
"learning_rate": 1.2574183976261128e-05,
"loss": 0.7446,
"step": 1160
},
{
"epoch": 0.7777036048064085,
"grad_norm": 0.9575556607619793,
"learning_rate": 1.2388724035608309e-05,
"loss": 0.761,
"step": 1165
},
{
"epoch": 0.7810413885180241,
"grad_norm": 0.9994928229391189,
"learning_rate": 1.2203264094955489e-05,
"loss": 0.7493,
"step": 1170
},
{
"epoch": 0.7843791722296395,
"grad_norm": 0.9965021892630818,
"learning_rate": 1.2017804154302672e-05,
"loss": 0.7602,
"step": 1175
},
{
"epoch": 0.787716955941255,
"grad_norm": 0.9615733998987331,
"learning_rate": 1.1832344213649853e-05,
"loss": 0.7575,
"step": 1180
},
{
"epoch": 0.7910547396528705,
"grad_norm": 0.995773743198864,
"learning_rate": 1.1646884272997033e-05,
"loss": 0.7411,
"step": 1185
},
{
"epoch": 0.794392523364486,
"grad_norm": 0.8979251525490485,
"learning_rate": 1.1461424332344215e-05,
"loss": 0.7396,
"step": 1190
},
{
"epoch": 0.7977303070761015,
"grad_norm": 0.8587947464461109,
"learning_rate": 1.1275964391691394e-05,
"loss": 0.7534,
"step": 1195
},
{
"epoch": 0.8010680907877169,
"grad_norm": 0.9219949726663529,
"learning_rate": 1.1090504451038576e-05,
"loss": 0.7472,
"step": 1200
},
{
"epoch": 0.8044058744993324,
"grad_norm": 0.9232814244897973,
"learning_rate": 1.0905044510385757e-05,
"loss": 0.7361,
"step": 1205
},
{
"epoch": 0.807743658210948,
"grad_norm": 0.9036693012652329,
"learning_rate": 1.0719584569732939e-05,
"loss": 0.7714,
"step": 1210
},
{
"epoch": 0.8110814419225634,
"grad_norm": 0.8986092027119317,
"learning_rate": 1.053412462908012e-05,
"loss": 0.7291,
"step": 1215
},
{
"epoch": 0.8144192256341789,
"grad_norm": 0.9600751192343961,
"learning_rate": 1.0348664688427301e-05,
"loss": 0.7406,
"step": 1220
},
{
"epoch": 0.8177570093457944,
"grad_norm": 0.9765858612379583,
"learning_rate": 1.0163204747774481e-05,
"loss": 0.744,
"step": 1225
},
{
"epoch": 0.8210947930574098,
"grad_norm": 0.9436664489477504,
"learning_rate": 9.977744807121663e-06,
"loss": 0.7537,
"step": 1230
},
{
"epoch": 0.8244325767690254,
"grad_norm": 0.9348268117808438,
"learning_rate": 9.792284866468842e-06,
"loss": 0.7466,
"step": 1235
},
{
"epoch": 0.8277703604806409,
"grad_norm": 0.8839373635727904,
"learning_rate": 9.606824925816024e-06,
"loss": 0.7225,
"step": 1240
},
{
"epoch": 0.8311081441922563,
"grad_norm": 0.9350825333412003,
"learning_rate": 9.421364985163205e-06,
"loss": 0.7535,
"step": 1245
},
{
"epoch": 0.8344459279038718,
"grad_norm": 0.9170815856659084,
"learning_rate": 9.235905044510387e-06,
"loss": 0.7129,
"step": 1250
},
{
"epoch": 0.8377837116154874,
"grad_norm": 0.8931728840330895,
"learning_rate": 9.050445103857568e-06,
"loss": 0.7312,
"step": 1255
},
{
"epoch": 0.8411214953271028,
"grad_norm": 0.947606427769052,
"learning_rate": 8.864985163204748e-06,
"loss": 0.7198,
"step": 1260
},
{
"epoch": 0.8444592790387183,
"grad_norm": 0.9867265496764112,
"learning_rate": 8.679525222551929e-06,
"loss": 0.7627,
"step": 1265
},
{
"epoch": 0.8477970627503337,
"grad_norm": 1.0256443772674286,
"learning_rate": 8.49406528189911e-06,
"loss": 0.7466,
"step": 1270
},
{
"epoch": 0.8511348464619493,
"grad_norm": 1.0226735020250939,
"learning_rate": 8.30860534124629e-06,
"loss": 0.7524,
"step": 1275
},
{
"epoch": 0.8544726301735648,
"grad_norm": 0.918655948279863,
"learning_rate": 8.123145400593472e-06,
"loss": 0.7395,
"step": 1280
},
{
"epoch": 0.8578104138851802,
"grad_norm": 0.9337021354955276,
"learning_rate": 7.937685459940653e-06,
"loss": 0.7311,
"step": 1285
},
{
"epoch": 0.8611481975967957,
"grad_norm": 1.0290658689780736,
"learning_rate": 7.752225519287835e-06,
"loss": 0.7533,
"step": 1290
},
{
"epoch": 0.8644859813084113,
"grad_norm": 0.9244023648001366,
"learning_rate": 7.566765578635016e-06,
"loss": 0.7366,
"step": 1295
},
{
"epoch": 0.8678237650200267,
"grad_norm": 0.9537206897694197,
"learning_rate": 7.381305637982196e-06,
"loss": 0.7399,
"step": 1300
},
{
"epoch": 0.8711615487316422,
"grad_norm": 0.8743472670723075,
"learning_rate": 7.195845697329377e-06,
"loss": 0.7296,
"step": 1305
},
{
"epoch": 0.8744993324432577,
"grad_norm": 0.8934119128550586,
"learning_rate": 7.0103857566765585e-06,
"loss": 0.7416,
"step": 1310
},
{
"epoch": 0.8778371161548731,
"grad_norm": 0.9618800479933417,
"learning_rate": 6.824925816023739e-06,
"loss": 0.7322,
"step": 1315
},
{
"epoch": 0.8811748998664887,
"grad_norm": 0.830157637044439,
"learning_rate": 6.6394658753709205e-06,
"loss": 0.7292,
"step": 1320
},
{
"epoch": 0.8845126835781041,
"grad_norm": 0.9363830426057013,
"learning_rate": 6.4540059347181e-06,
"loss": 0.734,
"step": 1325
},
{
"epoch": 0.8878504672897196,
"grad_norm": 0.9442444285959836,
"learning_rate": 6.2685459940652825e-06,
"loss": 0.7297,
"step": 1330
},
{
"epoch": 0.8911882510013351,
"grad_norm": 0.9348111437754156,
"learning_rate": 6.083086053412463e-06,
"loss": 0.7382,
"step": 1335
},
{
"epoch": 0.8945260347129506,
"grad_norm": 0.894299426623852,
"learning_rate": 5.8976261127596445e-06,
"loss": 0.7125,
"step": 1340
},
{
"epoch": 0.8978638184245661,
"grad_norm": 0.9736360020611752,
"learning_rate": 5.712166172106825e-06,
"loss": 0.7277,
"step": 1345
},
{
"epoch": 0.9012016021361816,
"grad_norm": 0.9591002259783217,
"learning_rate": 5.5267062314540065e-06,
"loss": 0.7258,
"step": 1350
},
{
"epoch": 0.904539385847797,
"grad_norm": 0.9710736921612175,
"learning_rate": 5.341246290801187e-06,
"loss": 0.7381,
"step": 1355
},
{
"epoch": 0.9078771695594126,
"grad_norm": 0.8730608837945383,
"learning_rate": 5.155786350148368e-06,
"loss": 0.7251,
"step": 1360
},
{
"epoch": 0.9112149532710281,
"grad_norm": 0.903052568907302,
"learning_rate": 4.970326409495549e-06,
"loss": 0.7094,
"step": 1365
},
{
"epoch": 0.9145527369826435,
"grad_norm": 0.9621382332532343,
"learning_rate": 4.7848664688427305e-06,
"loss": 0.7227,
"step": 1370
},
{
"epoch": 0.917890520694259,
"grad_norm": 0.9252627526168606,
"learning_rate": 4.599406528189911e-06,
"loss": 0.7271,
"step": 1375
},
{
"epoch": 0.9212283044058746,
"grad_norm": 0.9417940189754115,
"learning_rate": 4.413946587537092e-06,
"loss": 0.7421,
"step": 1380
},
{
"epoch": 0.92456608811749,
"grad_norm": 0.9682630131774531,
"learning_rate": 4.228486646884274e-06,
"loss": 0.7421,
"step": 1385
},
{
"epoch": 0.9279038718291055,
"grad_norm": 1.0307114272788542,
"learning_rate": 4.0430267062314545e-06,
"loss": 0.7521,
"step": 1390
},
{
"epoch": 0.9312416555407209,
"grad_norm": 0.8832770378598581,
"learning_rate": 3.857566765578635e-06,
"loss": 0.7124,
"step": 1395
},
{
"epoch": 0.9345794392523364,
"grad_norm": 0.9746407857253272,
"learning_rate": 3.672106824925816e-06,
"loss": 0.7302,
"step": 1400
},
{
"epoch": 0.937917222963952,
"grad_norm": 0.9499407254108757,
"learning_rate": 3.4866468842729975e-06,
"loss": 0.7128,
"step": 1405
},
{
"epoch": 0.9412550066755674,
"grad_norm": 0.9145822943128387,
"learning_rate": 3.3011869436201785e-06,
"loss": 0.7322,
"step": 1410
},
{
"epoch": 0.9445927903871829,
"grad_norm": 0.9757309463266676,
"learning_rate": 3.115727002967359e-06,
"loss": 0.7336,
"step": 1415
},
{
"epoch": 0.9479305740987984,
"grad_norm": 0.9534792908684466,
"learning_rate": 2.93026706231454e-06,
"loss": 0.7312,
"step": 1420
},
{
"epoch": 0.9512683578104139,
"grad_norm": 0.9679148570410533,
"learning_rate": 2.744807121661721e-06,
"loss": 0.7128,
"step": 1425
},
{
"epoch": 0.9546061415220294,
"grad_norm": 0.9493106907067503,
"learning_rate": 2.559347181008902e-06,
"loss": 0.7306,
"step": 1430
},
{
"epoch": 0.9579439252336449,
"grad_norm": 0.9679382523098454,
"learning_rate": 2.3738872403560835e-06,
"loss": 0.7328,
"step": 1435
},
{
"epoch": 0.9612817089452603,
"grad_norm": 0.9317606747634454,
"learning_rate": 2.188427299703264e-06,
"loss": 0.7238,
"step": 1440
},
{
"epoch": 0.9646194926568759,
"grad_norm": 0.9275615394707037,
"learning_rate": 2.0029673590504455e-06,
"loss": 0.7171,
"step": 1445
},
{
"epoch": 0.9679572763684913,
"grad_norm": 0.9633454475236382,
"learning_rate": 1.8175074183976263e-06,
"loss": 0.708,
"step": 1450
},
{
"epoch": 0.9712950600801068,
"grad_norm": 0.9399727708328366,
"learning_rate": 1.6320474777448073e-06,
"loss": 0.7291,
"step": 1455
},
{
"epoch": 0.9746328437917223,
"grad_norm": 1.020554951634919,
"learning_rate": 1.4465875370919883e-06,
"loss": 0.7461,
"step": 1460
},
{
"epoch": 0.9779706275033377,
"grad_norm": 0.985081620856249,
"learning_rate": 1.2611275964391693e-06,
"loss": 0.7228,
"step": 1465
},
{
"epoch": 0.9813084112149533,
"grad_norm": 0.9385893286244219,
"learning_rate": 1.0756676557863502e-06,
"loss": 0.72,
"step": 1470
},
{
"epoch": 0.9846461949265688,
"grad_norm": 1.0049513545701543,
"learning_rate": 8.902077151335312e-07,
"loss": 0.7147,
"step": 1475
},
{
"epoch": 0.9879839786381842,
"grad_norm": 1.0099532146134622,
"learning_rate": 7.047477744807121e-07,
"loss": 0.7282,
"step": 1480
},
{
"epoch": 0.9913217623497997,
"grad_norm": 0.8888912225564214,
"learning_rate": 5.192878338278931e-07,
"loss": 0.7332,
"step": 1485
},
{
"epoch": 0.9946595460614153,
"grad_norm": 0.9997610443207388,
"learning_rate": 3.338278931750742e-07,
"loss": 0.7225,
"step": 1490
},
{
"epoch": 0.9979973297730307,
"grad_norm": 0.9412531346267364,
"learning_rate": 1.4836795252225522e-07,
"loss": 0.7318,
"step": 1495
},
{
"epoch": 1.0,
"step": 1498,
"total_flos": 92324696948736.0,
"train_loss": 0.788116905017593,
"train_runtime": 3845.5406,
"train_samples_per_second": 49.839,
"train_steps_per_second": 0.39
}
],
"logging_steps": 5,
"max_steps": 1498,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 180000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 92324696948736.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}