{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1498, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033377837116154874, "grad_norm": 10.97334621037521, "learning_rate": 1.3333333333333334e-06, "loss": 1.0976, "step": 5 }, { "epoch": 0.006675567423230975, "grad_norm": 8.500612119645204, "learning_rate": 3e-06, "loss": 1.0533, "step": 10 }, { "epoch": 0.010013351134846462, "grad_norm": 4.4978733433398705, "learning_rate": 4.666666666666667e-06, "loss": 0.9037, "step": 15 }, { "epoch": 0.01335113484646195, "grad_norm": 3.8887423585372596, "learning_rate": 6.333333333333334e-06, "loss": 0.7853, "step": 20 }, { "epoch": 0.016688918558077435, "grad_norm": 3.1870823314914474, "learning_rate": 8.000000000000001e-06, "loss": 0.774, "step": 25 }, { "epoch": 0.020026702269692925, "grad_norm": 3.6334777762827684, "learning_rate": 9.666666666666667e-06, "loss": 0.7773, "step": 30 }, { "epoch": 0.02336448598130841, "grad_norm": 3.683322604774294, "learning_rate": 1.1333333333333334e-05, "loss": 0.7457, "step": 35 }, { "epoch": 0.0267022696929239, "grad_norm": 3.6697978407247605, "learning_rate": 1.3000000000000001e-05, "loss": 0.7103, "step": 40 }, { "epoch": 0.030040053404539385, "grad_norm": 3.651275266228758, "learning_rate": 1.4666666666666668e-05, "loss": 0.7227, "step": 45 }, { "epoch": 0.03337783711615487, "grad_norm": 3.8445471248297265, "learning_rate": 1.6333333333333335e-05, "loss": 0.6611, "step": 50 }, { "epoch": 0.036715620827770364, "grad_norm": 3.5025746500731545, "learning_rate": 1.8e-05, "loss": 0.6182, "step": 55 }, { "epoch": 0.04005340453938585, "grad_norm": 3.8698798018604577, "learning_rate": 1.9666666666666666e-05, "loss": 0.6551, "step": 60 }, { "epoch": 0.043391188251001335, "grad_norm": 2.990673727539358, "learning_rate": 2.1333333333333335e-05, "loss": 0.6866, "step": 65 }, { "epoch": 0.04672897196261682, "grad_norm": 3.2866308072265213, "learning_rate": 2.3000000000000003e-05, "loss": 0.6887, "step": 70 }, { "epoch": 0.050066755674232306, "grad_norm": 3.271721003686755, "learning_rate": 2.466666666666667e-05, "loss": 0.7327, "step": 75 }, { "epoch": 0.0534045393858478, "grad_norm": 3.0753677314727743, "learning_rate": 2.633333333333333e-05, "loss": 0.7003, "step": 80 }, { "epoch": 0.056742323097463285, "grad_norm": 2.6258441723789607, "learning_rate": 2.8000000000000003e-05, "loss": 0.7144, "step": 85 }, { "epoch": 0.06008010680907877, "grad_norm": 2.434906123559183, "learning_rate": 2.9666666666666672e-05, "loss": 0.704, "step": 90 }, { "epoch": 0.06341789052069426, "grad_norm": 2.760855540008962, "learning_rate": 3.1333333333333334e-05, "loss": 0.7329, "step": 95 }, { "epoch": 0.06675567423230974, "grad_norm": 2.215504245536271, "learning_rate": 3.3e-05, "loss": 0.7367, "step": 100 }, { "epoch": 0.07009345794392523, "grad_norm": 2.6569341544736713, "learning_rate": 3.466666666666667e-05, "loss": 0.7713, "step": 105 }, { "epoch": 0.07343124165554073, "grad_norm": 2.6507368382475973, "learning_rate": 3.633333333333333e-05, "loss": 0.7772, "step": 110 }, { "epoch": 0.07676902536715621, "grad_norm": 2.1348618645661483, "learning_rate": 3.8e-05, "loss": 0.7712, "step": 115 }, { "epoch": 0.0801068090787717, "grad_norm": 2.264768791536783, "learning_rate": 3.966666666666667e-05, "loss": 0.7594, "step": 120 }, { "epoch": 0.08344459279038718, "grad_norm": 1.9771816174965278, "learning_rate": 4.133333333333333e-05, "loss": 0.7945, "step": 125 }, { "epoch": 0.08678237650200267, "grad_norm": 1.8584709556275458, "learning_rate": 4.3e-05, "loss": 0.7918, "step": 130 }, { "epoch": 0.09012016021361816, "grad_norm": 1.6443791716346257, "learning_rate": 4.466666666666667e-05, "loss": 0.7961, "step": 135 }, { "epoch": 0.09345794392523364, "grad_norm": 1.6647164088912758, "learning_rate": 4.633333333333333e-05, "loss": 0.8114, "step": 140 }, { "epoch": 0.09679572763684913, "grad_norm": 1.9000184800356008, "learning_rate": 4.8e-05, "loss": 0.8218, "step": 145 }, { "epoch": 0.10013351134846461, "grad_norm": 1.5342772082491383, "learning_rate": 4.966666666666667e-05, "loss": 0.8185, "step": 150 }, { "epoch": 0.10347129506008011, "grad_norm": 1.4750796923574772, "learning_rate": 4.9851632047477745e-05, "loss": 0.8223, "step": 155 }, { "epoch": 0.1068090787716956, "grad_norm": 1.6148204398582195, "learning_rate": 4.966617210682493e-05, "loss": 0.8303, "step": 160 }, { "epoch": 0.11014686248331108, "grad_norm": 1.4390159182334255, "learning_rate": 4.948071216617211e-05, "loss": 0.7991, "step": 165 }, { "epoch": 0.11348464619492657, "grad_norm": 1.2663492608255318, "learning_rate": 4.929525222551929e-05, "loss": 0.8523, "step": 170 }, { "epoch": 0.11682242990654206, "grad_norm": 1.5497168860322745, "learning_rate": 4.910979228486647e-05, "loss": 0.8384, "step": 175 }, { "epoch": 0.12016021361815754, "grad_norm": 1.3218119424151125, "learning_rate": 4.8924332344213654e-05, "loss": 0.8334, "step": 180 }, { "epoch": 0.12349799732977303, "grad_norm": 1.3314085264665803, "learning_rate": 4.873887240356083e-05, "loss": 0.8775, "step": 185 }, { "epoch": 0.1268357810413885, "grad_norm": 1.417575113114604, "learning_rate": 4.855341246290801e-05, "loss": 0.8604, "step": 190 }, { "epoch": 0.130173564753004, "grad_norm": 1.366705643461715, "learning_rate": 4.8367952522255196e-05, "loss": 0.8578, "step": 195 }, { "epoch": 0.13351134846461948, "grad_norm": 1.3085988961094133, "learning_rate": 4.818249258160238e-05, "loss": 0.8493, "step": 200 }, { "epoch": 0.13684913217623498, "grad_norm": 1.3664654247475687, "learning_rate": 4.7997032640949556e-05, "loss": 0.8739, "step": 205 }, { "epoch": 0.14018691588785046, "grad_norm": 1.2448228793197444, "learning_rate": 4.781157270029674e-05, "loss": 0.8758, "step": 210 }, { "epoch": 0.14352469959946595, "grad_norm": 1.2166707854427619, "learning_rate": 4.762611275964392e-05, "loss": 0.8621, "step": 215 }, { "epoch": 0.14686248331108145, "grad_norm": 1.2528234662317728, "learning_rate": 4.74406528189911e-05, "loss": 0.8358, "step": 220 }, { "epoch": 0.15020026702269693, "grad_norm": 1.2078820039150766, "learning_rate": 4.725519287833828e-05, "loss": 0.8527, "step": 225 }, { "epoch": 0.15353805073431243, "grad_norm": 1.1953586214155212, "learning_rate": 4.7069732937685464e-05, "loss": 0.86, "step": 230 }, { "epoch": 0.1568758344459279, "grad_norm": 1.239697699729331, "learning_rate": 4.688427299703264e-05, "loss": 0.8512, "step": 235 }, { "epoch": 0.1602136181575434, "grad_norm": 1.0943965020451794, "learning_rate": 4.6698813056379824e-05, "loss": 0.8485, "step": 240 }, { "epoch": 0.16355140186915887, "grad_norm": 1.1753697981159985, "learning_rate": 4.651335311572701e-05, "loss": 0.8574, "step": 245 }, { "epoch": 0.16688918558077437, "grad_norm": 1.2273163170418067, "learning_rate": 4.632789317507419e-05, "loss": 0.8755, "step": 250 }, { "epoch": 0.17022696929238984, "grad_norm": 1.1408298234921244, "learning_rate": 4.6142433234421366e-05, "loss": 0.8656, "step": 255 }, { "epoch": 0.17356475300400534, "grad_norm": 1.099138875783624, "learning_rate": 4.595697329376854e-05, "loss": 0.8641, "step": 260 }, { "epoch": 0.17690253671562084, "grad_norm": 1.1726455447900384, "learning_rate": 4.577151335311573e-05, "loss": 0.8428, "step": 265 }, { "epoch": 0.1802403204272363, "grad_norm": 1.1238710518885906, "learning_rate": 4.558605341246291e-05, "loss": 0.8564, "step": 270 }, { "epoch": 0.1835781041388518, "grad_norm": 1.2209709343561501, "learning_rate": 4.540059347181009e-05, "loss": 0.8841, "step": 275 }, { "epoch": 0.18691588785046728, "grad_norm": 1.0719606532900603, "learning_rate": 4.5215133531157275e-05, "loss": 0.8564, "step": 280 }, { "epoch": 0.19025367156208278, "grad_norm": 1.1632077631864237, "learning_rate": 4.502967359050445e-05, "loss": 0.8336, "step": 285 }, { "epoch": 0.19359145527369825, "grad_norm": 1.0912246571194697, "learning_rate": 4.4844213649851635e-05, "loss": 0.8386, "step": 290 }, { "epoch": 0.19692923898531375, "grad_norm": 1.182192263363281, "learning_rate": 4.465875370919881e-05, "loss": 0.8725, "step": 295 }, { "epoch": 0.20026702269692923, "grad_norm": 1.1652841802413654, "learning_rate": 4.4473293768546e-05, "loss": 0.8673, "step": 300 }, { "epoch": 0.20360480640854473, "grad_norm": 1.006954327768831, "learning_rate": 4.428783382789318e-05, "loss": 0.8428, "step": 305 }, { "epoch": 0.20694259012016022, "grad_norm": 1.043615772924013, "learning_rate": 4.4102373887240354e-05, "loss": 0.8826, "step": 310 }, { "epoch": 0.2102803738317757, "grad_norm": 1.0218351388644316, "learning_rate": 4.391691394658754e-05, "loss": 0.8604, "step": 315 }, { "epoch": 0.2136181575433912, "grad_norm": 1.093123947434233, "learning_rate": 4.373145400593472e-05, "loss": 0.8471, "step": 320 }, { "epoch": 0.21695594125500667, "grad_norm": 1.0437185361786059, "learning_rate": 4.35459940652819e-05, "loss": 0.8365, "step": 325 }, { "epoch": 0.22029372496662217, "grad_norm": 1.1728538235708217, "learning_rate": 4.336053412462908e-05, "loss": 0.8456, "step": 330 }, { "epoch": 0.22363150867823764, "grad_norm": 1.072793065927316, "learning_rate": 4.317507418397626e-05, "loss": 0.8574, "step": 335 }, { "epoch": 0.22696929238985314, "grad_norm": 0.9434393974539138, "learning_rate": 4.2989614243323446e-05, "loss": 0.8342, "step": 340 }, { "epoch": 0.23030707610146864, "grad_norm": 1.0832629194902035, "learning_rate": 4.280415430267062e-05, "loss": 0.8509, "step": 345 }, { "epoch": 0.2336448598130841, "grad_norm": 0.9763109475541231, "learning_rate": 4.2618694362017805e-05, "loss": 0.8515, "step": 350 }, { "epoch": 0.2369826435246996, "grad_norm": 0.9622690418923676, "learning_rate": 4.243323442136499e-05, "loss": 0.8451, "step": 355 }, { "epoch": 0.24032042723631508, "grad_norm": 1.0311536354503212, "learning_rate": 4.2247774480712165e-05, "loss": 0.8793, "step": 360 }, { "epoch": 0.24365821094793058, "grad_norm": 1.0616057184675154, "learning_rate": 4.206231454005935e-05, "loss": 0.8496, "step": 365 }, { "epoch": 0.24699599465954605, "grad_norm": 0.9694776849343971, "learning_rate": 4.187685459940653e-05, "loss": 0.8276, "step": 370 }, { "epoch": 0.25033377837116155, "grad_norm": 1.0669771488195197, "learning_rate": 4.1691394658753714e-05, "loss": 0.8608, "step": 375 }, { "epoch": 0.253671562082777, "grad_norm": 1.0330982860266757, "learning_rate": 4.150593471810089e-05, "loss": 0.8626, "step": 380 }, { "epoch": 0.2570093457943925, "grad_norm": 1.0894982304893939, "learning_rate": 4.132047477744807e-05, "loss": 0.8456, "step": 385 }, { "epoch": 0.260347129506008, "grad_norm": 1.0233378411857879, "learning_rate": 4.1135014836795256e-05, "loss": 0.858, "step": 390 }, { "epoch": 0.2636849132176235, "grad_norm": 0.9532755270759551, "learning_rate": 4.094955489614243e-05, "loss": 0.8325, "step": 395 }, { "epoch": 0.26702269692923897, "grad_norm": 1.012705355808147, "learning_rate": 4.0764094955489616e-05, "loss": 0.8603, "step": 400 }, { "epoch": 0.2703604806408545, "grad_norm": 1.0304787118053764, "learning_rate": 4.05786350148368e-05, "loss": 0.8481, "step": 405 }, { "epoch": 0.27369826435246997, "grad_norm": 1.0003080869140883, "learning_rate": 4.039317507418398e-05, "loss": 0.841, "step": 410 }, { "epoch": 0.27703604806408544, "grad_norm": 1.0009426776633654, "learning_rate": 4.020771513353116e-05, "loss": 0.8504, "step": 415 }, { "epoch": 0.2803738317757009, "grad_norm": 0.9644210719992499, "learning_rate": 4.002225519287834e-05, "loss": 0.8481, "step": 420 }, { "epoch": 0.28371161548731644, "grad_norm": 1.0494436578225004, "learning_rate": 3.9836795252225525e-05, "loss": 0.8354, "step": 425 }, { "epoch": 0.2870493991989319, "grad_norm": 0.8887585557590956, "learning_rate": 3.96513353115727e-05, "loss": 0.83, "step": 430 }, { "epoch": 0.2903871829105474, "grad_norm": 0.9747027542446707, "learning_rate": 3.9465875370919884e-05, "loss": 0.8307, "step": 435 }, { "epoch": 0.2937249666221629, "grad_norm": 0.9383108633240661, "learning_rate": 3.928041543026707e-05, "loss": 0.8135, "step": 440 }, { "epoch": 0.2970627503337784, "grad_norm": 1.0410990959669617, "learning_rate": 3.9094955489614244e-05, "loss": 0.8485, "step": 445 }, { "epoch": 0.30040053404539385, "grad_norm": 0.9780902917654535, "learning_rate": 3.890949554896143e-05, "loss": 0.8196, "step": 450 }, { "epoch": 0.3037383177570093, "grad_norm": 0.9442982752168075, "learning_rate": 3.87240356083086e-05, "loss": 0.808, "step": 455 }, { "epoch": 0.30707610146862485, "grad_norm": 1.0040732428090156, "learning_rate": 3.853857566765579e-05, "loss": 0.8478, "step": 460 }, { "epoch": 0.3104138851802403, "grad_norm": 0.9957954242378592, "learning_rate": 3.835311572700297e-05, "loss": 0.8243, "step": 465 }, { "epoch": 0.3137516688918558, "grad_norm": 0.9906243125042739, "learning_rate": 3.8167655786350146e-05, "loss": 0.8198, "step": 470 }, { "epoch": 0.3170894526034713, "grad_norm": 0.9245770524956366, "learning_rate": 3.7982195845697336e-05, "loss": 0.8225, "step": 475 }, { "epoch": 0.3204272363150868, "grad_norm": 0.9689880972401121, "learning_rate": 3.779673590504451e-05, "loss": 0.8081, "step": 480 }, { "epoch": 0.32376502002670227, "grad_norm": 0.9050315548245393, "learning_rate": 3.7611275964391695e-05, "loss": 0.8257, "step": 485 }, { "epoch": 0.32710280373831774, "grad_norm": 0.965523047639, "learning_rate": 3.742581602373887e-05, "loss": 0.8351, "step": 490 }, { "epoch": 0.33044058744993327, "grad_norm": 1.0138619406917988, "learning_rate": 3.7240356083086054e-05, "loss": 0.8297, "step": 495 }, { "epoch": 0.33377837116154874, "grad_norm": 0.9226993749632075, "learning_rate": 3.705489614243324e-05, "loss": 0.822, "step": 500 }, { "epoch": 0.3371161548731642, "grad_norm": 0.932421956217793, "learning_rate": 3.6869436201780414e-05, "loss": 0.8131, "step": 505 }, { "epoch": 0.3404539385847797, "grad_norm": 0.8929781381117078, "learning_rate": 3.6683976261127604e-05, "loss": 0.8144, "step": 510 }, { "epoch": 0.3437917222963952, "grad_norm": 0.922998838382151, "learning_rate": 3.649851632047478e-05, "loss": 0.8624, "step": 515 }, { "epoch": 0.3471295060080107, "grad_norm": 0.93469331823402, "learning_rate": 3.6313056379821956e-05, "loss": 0.8259, "step": 520 }, { "epoch": 0.35046728971962615, "grad_norm": 1.0259483980065804, "learning_rate": 3.612759643916914e-05, "loss": 0.85, "step": 525 }, { "epoch": 0.3538050734312417, "grad_norm": 0.9757464996869175, "learning_rate": 3.594213649851632e-05, "loss": 0.8322, "step": 530 }, { "epoch": 0.35714285714285715, "grad_norm": 1.0326434851152306, "learning_rate": 3.5756676557863506e-05, "loss": 0.8152, "step": 535 }, { "epoch": 0.3604806408544726, "grad_norm": 0.8969653625459288, "learning_rate": 3.557121661721068e-05, "loss": 0.8342, "step": 540 }, { "epoch": 0.3638184245660881, "grad_norm": 0.9685090624506036, "learning_rate": 3.5385756676557865e-05, "loss": 0.8454, "step": 545 }, { "epoch": 0.3671562082777036, "grad_norm": 0.9464787707831517, "learning_rate": 3.520029673590505e-05, "loss": 0.8144, "step": 550 }, { "epoch": 0.3704939919893191, "grad_norm": 0.9715120332083621, "learning_rate": 3.5014836795252225e-05, "loss": 0.8075, "step": 555 }, { "epoch": 0.37383177570093457, "grad_norm": 1.0181655310980833, "learning_rate": 3.482937685459941e-05, "loss": 0.8433, "step": 560 }, { "epoch": 0.3771695594125501, "grad_norm": 1.0013772700433445, "learning_rate": 3.464391691394659e-05, "loss": 0.8253, "step": 565 }, { "epoch": 0.38050734312416556, "grad_norm": 0.9482215787610626, "learning_rate": 3.445845697329377e-05, "loss": 0.8195, "step": 570 }, { "epoch": 0.38384512683578104, "grad_norm": 1.0461820886125337, "learning_rate": 3.427299703264095e-05, "loss": 0.831, "step": 575 }, { "epoch": 0.3871829105473965, "grad_norm": 0.9198239920283778, "learning_rate": 3.4087537091988134e-05, "loss": 0.8152, "step": 580 }, { "epoch": 0.39052069425901204, "grad_norm": 0.9782331163351092, "learning_rate": 3.390207715133532e-05, "loss": 0.8154, "step": 585 }, { "epoch": 0.3938584779706275, "grad_norm": 0.9186397229393198, "learning_rate": 3.371661721068249e-05, "loss": 0.8153, "step": 590 }, { "epoch": 0.397196261682243, "grad_norm": 0.9337443617921134, "learning_rate": 3.3531157270029676e-05, "loss": 0.8233, "step": 595 }, { "epoch": 0.40053404539385845, "grad_norm": 0.9434322651580768, "learning_rate": 3.334569732937686e-05, "loss": 0.8345, "step": 600 }, { "epoch": 0.403871829105474, "grad_norm": 1.0512846063850414, "learning_rate": 3.3160237388724036e-05, "loss": 0.8173, "step": 605 }, { "epoch": 0.40720961281708945, "grad_norm": 0.9350959223867034, "learning_rate": 3.297477744807122e-05, "loss": 0.8372, "step": 610 }, { "epoch": 0.4105473965287049, "grad_norm": 0.9197353611822743, "learning_rate": 3.27893175074184e-05, "loss": 0.8215, "step": 615 }, { "epoch": 0.41388518024032045, "grad_norm": 0.8518070420704498, "learning_rate": 3.260385756676558e-05, "loss": 0.7951, "step": 620 }, { "epoch": 0.4172229639519359, "grad_norm": 0.9858909592901012, "learning_rate": 3.241839762611276e-05, "loss": 0.8035, "step": 625 }, { "epoch": 0.4205607476635514, "grad_norm": 1.071276470614738, "learning_rate": 3.223293768545994e-05, "loss": 0.8121, "step": 630 }, { "epoch": 0.42389853137516686, "grad_norm": 0.9226349965551451, "learning_rate": 3.204747774480713e-05, "loss": 0.8024, "step": 635 }, { "epoch": 0.4272363150867824, "grad_norm": 0.9911335494782234, "learning_rate": 3.1862017804154304e-05, "loss": 0.7998, "step": 640 }, { "epoch": 0.43057409879839786, "grad_norm": 0.857226373613729, "learning_rate": 3.167655786350148e-05, "loss": 0.7985, "step": 645 }, { "epoch": 0.43391188251001334, "grad_norm": 0.9012240464805917, "learning_rate": 3.149109792284867e-05, "loss": 0.8109, "step": 650 }, { "epoch": 0.43724966622162886, "grad_norm": 0.9124837740946565, "learning_rate": 3.1305637982195846e-05, "loss": 0.8015, "step": 655 }, { "epoch": 0.44058744993324434, "grad_norm": 0.9907278141668688, "learning_rate": 3.112017804154303e-05, "loss": 0.8102, "step": 660 }, { "epoch": 0.4439252336448598, "grad_norm": 0.9447867252541866, "learning_rate": 3.0934718100890206e-05, "loss": 0.8308, "step": 665 }, { "epoch": 0.4472630173564753, "grad_norm": 0.9514834392779774, "learning_rate": 3.074925816023739e-05, "loss": 0.8108, "step": 670 }, { "epoch": 0.4506008010680908, "grad_norm": 0.8898801356986638, "learning_rate": 3.056379821958457e-05, "loss": 0.7952, "step": 675 }, { "epoch": 0.4539385847797063, "grad_norm": 0.9121421167479317, "learning_rate": 3.0378338278931752e-05, "loss": 0.7766, "step": 680 }, { "epoch": 0.45727636849132175, "grad_norm": 0.9206580766916015, "learning_rate": 3.0192878338278935e-05, "loss": 0.7976, "step": 685 }, { "epoch": 0.4606141522029373, "grad_norm": 0.8875003615985043, "learning_rate": 3.0007418397626115e-05, "loss": 0.7965, "step": 690 }, { "epoch": 0.46395193591455275, "grad_norm": 1.0057518919464419, "learning_rate": 2.9821958456973298e-05, "loss": 0.7745, "step": 695 }, { "epoch": 0.4672897196261682, "grad_norm": 0.9890615318492613, "learning_rate": 2.9636498516320477e-05, "loss": 0.8021, "step": 700 }, { "epoch": 0.4706275033377837, "grad_norm": 0.9958806046763854, "learning_rate": 2.9451038575667654e-05, "loss": 0.7948, "step": 705 }, { "epoch": 0.4739652870493992, "grad_norm": 0.9427315789994021, "learning_rate": 2.926557863501484e-05, "loss": 0.7995, "step": 710 }, { "epoch": 0.4773030707610147, "grad_norm": 0.9054405934645217, "learning_rate": 2.908011869436202e-05, "loss": 0.7941, "step": 715 }, { "epoch": 0.48064085447263016, "grad_norm": 0.9373946350999136, "learning_rate": 2.8894658753709203e-05, "loss": 0.8178, "step": 720 }, { "epoch": 0.48397863818424564, "grad_norm": 0.9693149989780067, "learning_rate": 2.8709198813056383e-05, "loss": 0.8044, "step": 725 }, { "epoch": 0.48731642189586116, "grad_norm": 0.8575646876326481, "learning_rate": 2.852373887240356e-05, "loss": 0.789, "step": 730 }, { "epoch": 0.49065420560747663, "grad_norm": 0.986208895766066, "learning_rate": 2.8338278931750746e-05, "loss": 0.8155, "step": 735 }, { "epoch": 0.4939919893190921, "grad_norm": 0.9731059895113887, "learning_rate": 2.8152818991097922e-05, "loss": 0.8092, "step": 740 }, { "epoch": 0.49732977303070763, "grad_norm": 1.0244020266746006, "learning_rate": 2.796735905044511e-05, "loss": 0.8086, "step": 745 }, { "epoch": 0.5006675567423231, "grad_norm": 0.8755711588079153, "learning_rate": 2.7781899109792285e-05, "loss": 0.7978, "step": 750 }, { "epoch": 0.5040053404539386, "grad_norm": 0.8857975744161016, "learning_rate": 2.7596439169139465e-05, "loss": 0.7945, "step": 755 }, { "epoch": 0.507343124165554, "grad_norm": 1.0437729771562312, "learning_rate": 2.741097922848665e-05, "loss": 0.8, "step": 760 }, { "epoch": 0.5106809078771696, "grad_norm": 0.8938452774095116, "learning_rate": 2.7225519287833828e-05, "loss": 0.7919, "step": 765 }, { "epoch": 0.514018691588785, "grad_norm": 0.9507910351515566, "learning_rate": 2.7040059347181014e-05, "loss": 0.811, "step": 770 }, { "epoch": 0.5173564753004005, "grad_norm": 0.9446476391640979, "learning_rate": 2.685459940652819e-05, "loss": 0.812, "step": 775 }, { "epoch": 0.520694259012016, "grad_norm": 0.8939815937066375, "learning_rate": 2.666913946587537e-05, "loss": 0.7968, "step": 780 }, { "epoch": 0.5240320427236315, "grad_norm": 0.9078803372635648, "learning_rate": 2.6483679525222553e-05, "loss": 0.816, "step": 785 }, { "epoch": 0.527369826435247, "grad_norm": 0.8194365993553059, "learning_rate": 2.6298219584569733e-05, "loss": 0.7861, "step": 790 }, { "epoch": 0.5307076101468625, "grad_norm": 0.8408501195751673, "learning_rate": 2.6112759643916916e-05, "loss": 0.7743, "step": 795 }, { "epoch": 0.5340453938584779, "grad_norm": 0.8531884984260966, "learning_rate": 2.5927299703264096e-05, "loss": 0.8047, "step": 800 }, { "epoch": 0.5373831775700935, "grad_norm": 0.8791442187279712, "learning_rate": 2.5741839762611276e-05, "loss": 0.7854, "step": 805 }, { "epoch": 0.540720961281709, "grad_norm": 0.9753162027608423, "learning_rate": 2.555637982195846e-05, "loss": 0.8098, "step": 810 }, { "epoch": 0.5440587449933244, "grad_norm": 0.8625073065074, "learning_rate": 2.537091988130564e-05, "loss": 0.7967, "step": 815 }, { "epoch": 0.5473965287049399, "grad_norm": 0.9742653975215142, "learning_rate": 2.518545994065282e-05, "loss": 0.7768, "step": 820 }, { "epoch": 0.5507343124165555, "grad_norm": 0.8892913090889087, "learning_rate": 2.5e-05, "loss": 0.7603, "step": 825 }, { "epoch": 0.5540720961281709, "grad_norm": 0.8685382218468735, "learning_rate": 2.4814540059347184e-05, "loss": 0.8061, "step": 830 }, { "epoch": 0.5574098798397864, "grad_norm": 0.8609646194613518, "learning_rate": 2.4629080118694364e-05, "loss": 0.8238, "step": 835 }, { "epoch": 0.5607476635514018, "grad_norm": 0.9013458211054559, "learning_rate": 2.4443620178041544e-05, "loss": 0.7874, "step": 840 }, { "epoch": 0.5640854472630173, "grad_norm": 0.9448632241585405, "learning_rate": 2.4258160237388723e-05, "loss": 0.777, "step": 845 }, { "epoch": 0.5674232309746329, "grad_norm": 0.8652639715383189, "learning_rate": 2.4072700296735907e-05, "loss": 0.7781, "step": 850 }, { "epoch": 0.5707610146862483, "grad_norm": 0.885349438416903, "learning_rate": 2.3887240356083086e-05, "loss": 0.7852, "step": 855 }, { "epoch": 0.5740987983978638, "grad_norm": 0.9226549883190552, "learning_rate": 2.370178041543027e-05, "loss": 0.7937, "step": 860 }, { "epoch": 0.5774365821094793, "grad_norm": 0.8894377701419424, "learning_rate": 2.351632047477745e-05, "loss": 0.7858, "step": 865 }, { "epoch": 0.5807743658210948, "grad_norm": 0.9417967618419559, "learning_rate": 2.333086053412463e-05, "loss": 0.7721, "step": 870 }, { "epoch": 0.5841121495327103, "grad_norm": 0.9222488011231172, "learning_rate": 2.3145400593471812e-05, "loss": 0.7511, "step": 875 }, { "epoch": 0.5874499332443258, "grad_norm": 0.8993397519730585, "learning_rate": 2.2959940652818992e-05, "loss": 0.7827, "step": 880 }, { "epoch": 0.5907877169559412, "grad_norm": 0.9136313639945539, "learning_rate": 2.2774480712166175e-05, "loss": 0.7714, "step": 885 }, { "epoch": 0.5941255006675568, "grad_norm": 0.9199729414745823, "learning_rate": 2.258902077151335e-05, "loss": 0.7761, "step": 890 }, { "epoch": 0.5974632843791722, "grad_norm": 0.8409864114208272, "learning_rate": 2.2403560830860534e-05, "loss": 0.7758, "step": 895 }, { "epoch": 0.6008010680907877, "grad_norm": 0.8927435513620092, "learning_rate": 2.2218100890207717e-05, "loss": 0.8088, "step": 900 }, { "epoch": 0.6041388518024032, "grad_norm": 1.0111242127600466, "learning_rate": 2.2032640949554897e-05, "loss": 0.787, "step": 905 }, { "epoch": 0.6074766355140186, "grad_norm": 0.9750007023233266, "learning_rate": 2.184718100890208e-05, "loss": 0.7951, "step": 910 }, { "epoch": 0.6108144192256342, "grad_norm": 0.8947817876635858, "learning_rate": 2.166172106824926e-05, "loss": 0.7718, "step": 915 }, { "epoch": 0.6141522029372497, "grad_norm": 0.8937079235831037, "learning_rate": 2.147626112759644e-05, "loss": 0.7653, "step": 920 }, { "epoch": 0.6174899866488651, "grad_norm": 0.9633412553738314, "learning_rate": 2.129080118694362e-05, "loss": 0.7588, "step": 925 }, { "epoch": 0.6208277703604806, "grad_norm": 0.9697623965265878, "learning_rate": 2.1105341246290803e-05, "loss": 0.7743, "step": 930 }, { "epoch": 0.6241655540720962, "grad_norm": 0.9170590153248661, "learning_rate": 2.0919881305637982e-05, "loss": 0.7939, "step": 935 }, { "epoch": 0.6275033377837116, "grad_norm": 0.8884986167305851, "learning_rate": 2.0734421364985165e-05, "loss": 0.7609, "step": 940 }, { "epoch": 0.6308411214953271, "grad_norm": 0.9117598511296207, "learning_rate": 2.0548961424332345e-05, "loss": 0.7816, "step": 945 }, { "epoch": 0.6341789052069426, "grad_norm": 0.9143844046049939, "learning_rate": 2.0363501483679525e-05, "loss": 0.7682, "step": 950 }, { "epoch": 0.6375166889185581, "grad_norm": 1.001882223179808, "learning_rate": 2.0178041543026708e-05, "loss": 0.7859, "step": 955 }, { "epoch": 0.6408544726301736, "grad_norm": 0.9515527243629021, "learning_rate": 1.9992581602373888e-05, "loss": 0.771, "step": 960 }, { "epoch": 0.644192256341789, "grad_norm": 0.9987405390627165, "learning_rate": 1.980712166172107e-05, "loss": 0.7882, "step": 965 }, { "epoch": 0.6475300400534045, "grad_norm": 0.902559098376266, "learning_rate": 1.962166172106825e-05, "loss": 0.7865, "step": 970 }, { "epoch": 0.6508678237650201, "grad_norm": 0.9530475037552353, "learning_rate": 1.943620178041543e-05, "loss": 0.7651, "step": 975 }, { "epoch": 0.6542056074766355, "grad_norm": 0.9044360545717226, "learning_rate": 1.9250741839762613e-05, "loss": 0.7611, "step": 980 }, { "epoch": 0.657543391188251, "grad_norm": 0.8949422084811579, "learning_rate": 1.9065281899109793e-05, "loss": 0.7535, "step": 985 }, { "epoch": 0.6608811748998665, "grad_norm": 0.9212385076203463, "learning_rate": 1.8879821958456976e-05, "loss": 0.7828, "step": 990 }, { "epoch": 0.664218958611482, "grad_norm": 0.9685512993064703, "learning_rate": 1.8694362017804153e-05, "loss": 0.7598, "step": 995 }, { "epoch": 0.6675567423230975, "grad_norm": 0.8371458023739065, "learning_rate": 1.8508902077151336e-05, "loss": 0.7593, "step": 1000 }, { "epoch": 0.670894526034713, "grad_norm": 0.9561174634421302, "learning_rate": 1.8323442136498515e-05, "loss": 0.7918, "step": 1005 }, { "epoch": 0.6742323097463284, "grad_norm": 0.988014946142732, "learning_rate": 1.81379821958457e-05, "loss": 0.7752, "step": 1010 }, { "epoch": 0.677570093457944, "grad_norm": 0.9594260489502082, "learning_rate": 1.795252225519288e-05, "loss": 0.7555, "step": 1015 }, { "epoch": 0.6809078771695594, "grad_norm": 0.9047734227550646, "learning_rate": 1.7767062314540058e-05, "loss": 0.7616, "step": 1020 }, { "epoch": 0.6842456608811749, "grad_norm": 0.8663553522012484, "learning_rate": 1.758160237388724e-05, "loss": 0.7796, "step": 1025 }, { "epoch": 0.6875834445927904, "grad_norm": 0.921325208555024, "learning_rate": 1.739614243323442e-05, "loss": 0.7641, "step": 1030 }, { "epoch": 0.6909212283044058, "grad_norm": 0.8876679055429972, "learning_rate": 1.7210682492581604e-05, "loss": 0.7676, "step": 1035 }, { "epoch": 0.6942590120160214, "grad_norm": 0.8958203531086095, "learning_rate": 1.7025222551928784e-05, "loss": 0.7444, "step": 1040 }, { "epoch": 0.6975967957276369, "grad_norm": 0.8697101652022063, "learning_rate": 1.6839762611275967e-05, "loss": 0.7994, "step": 1045 }, { "epoch": 0.7009345794392523, "grad_norm": 1.088915969712606, "learning_rate": 1.6654302670623147e-05, "loss": 0.7664, "step": 1050 }, { "epoch": 0.7042723631508678, "grad_norm": 0.9166954419190961, "learning_rate": 1.6468842729970326e-05, "loss": 0.7576, "step": 1055 }, { "epoch": 0.7076101468624834, "grad_norm": 1.0111723788145828, "learning_rate": 1.628338278931751e-05, "loss": 0.7437, "step": 1060 }, { "epoch": 0.7109479305740988, "grad_norm": 0.8935454904692272, "learning_rate": 1.609792284866469e-05, "loss": 0.7651, "step": 1065 }, { "epoch": 0.7142857142857143, "grad_norm": 0.9796810043507851, "learning_rate": 1.5912462908011872e-05, "loss": 0.7453, "step": 1070 }, { "epoch": 0.7176234979973297, "grad_norm": 0.9497605242721375, "learning_rate": 1.572700296735905e-05, "loss": 0.7518, "step": 1075 }, { "epoch": 0.7209612817089452, "grad_norm": 0.9261889971154708, "learning_rate": 1.5541543026706232e-05, "loss": 0.7509, "step": 1080 }, { "epoch": 0.7242990654205608, "grad_norm": 0.9221518501402585, "learning_rate": 1.5356083086053415e-05, "loss": 0.7383, "step": 1085 }, { "epoch": 0.7276368491321762, "grad_norm": 0.9905384423101319, "learning_rate": 1.5170623145400595e-05, "loss": 0.7651, "step": 1090 }, { "epoch": 0.7309746328437917, "grad_norm": 0.9591889790233724, "learning_rate": 1.4985163204747776e-05, "loss": 0.7631, "step": 1095 }, { "epoch": 0.7343124165554072, "grad_norm": 0.9087058012685387, "learning_rate": 1.4799703264094956e-05, "loss": 0.7535, "step": 1100 }, { "epoch": 0.7376502002670227, "grad_norm": 0.8692391342259392, "learning_rate": 1.4614243323442137e-05, "loss": 0.7374, "step": 1105 }, { "epoch": 0.7409879839786382, "grad_norm": 0.9298588940493733, "learning_rate": 1.4428783382789319e-05, "loss": 0.7423, "step": 1110 }, { "epoch": 0.7443257676902537, "grad_norm": 0.9381934382110304, "learning_rate": 1.42433234421365e-05, "loss": 0.7556, "step": 1115 }, { "epoch": 0.7476635514018691, "grad_norm": 0.9063091001621449, "learning_rate": 1.4057863501483681e-05, "loss": 0.742, "step": 1120 }, { "epoch": 0.7510013351134847, "grad_norm": 0.9344982528376456, "learning_rate": 1.387240356083086e-05, "loss": 0.7554, "step": 1125 }, { "epoch": 0.7543391188251002, "grad_norm": 0.9193075340785176, "learning_rate": 1.3686943620178041e-05, "loss": 0.7673, "step": 1130 }, { "epoch": 0.7576769025367156, "grad_norm": 0.9336576279968588, "learning_rate": 1.3501483679525222e-05, "loss": 0.7023, "step": 1135 }, { "epoch": 0.7610146862483311, "grad_norm": 0.9768530487657828, "learning_rate": 1.3316023738872405e-05, "loss": 0.7782, "step": 1140 }, { "epoch": 0.7643524699599465, "grad_norm": 0.9679956607339216, "learning_rate": 1.3130563798219587e-05, "loss": 0.763, "step": 1145 }, { "epoch": 0.7676902536715621, "grad_norm": 0.869544797745516, "learning_rate": 1.2945103857566765e-05, "loss": 0.7654, "step": 1150 }, { "epoch": 0.7710280373831776, "grad_norm": 0.985241850392532, "learning_rate": 1.2759643916913946e-05, "loss": 0.7699, "step": 1155 }, { "epoch": 0.774365821094793, "grad_norm": 0.9549632229431664, "learning_rate": 1.2574183976261128e-05, "loss": 0.7446, "step": 1160 }, { "epoch": 0.7777036048064085, "grad_norm": 0.9575556607619793, "learning_rate": 1.2388724035608309e-05, "loss": 0.761, "step": 1165 }, { "epoch": 0.7810413885180241, "grad_norm": 0.9994928229391189, "learning_rate": 1.2203264094955489e-05, "loss": 0.7493, "step": 1170 }, { "epoch": 0.7843791722296395, "grad_norm": 0.9965021892630818, "learning_rate": 1.2017804154302672e-05, "loss": 0.7602, "step": 1175 }, { "epoch": 0.787716955941255, "grad_norm": 0.9615733998987331, "learning_rate": 1.1832344213649853e-05, "loss": 0.7575, "step": 1180 }, { "epoch": 0.7910547396528705, "grad_norm": 0.995773743198864, "learning_rate": 1.1646884272997033e-05, "loss": 0.7411, "step": 1185 }, { "epoch": 0.794392523364486, "grad_norm": 0.8979251525490485, "learning_rate": 1.1461424332344215e-05, "loss": 0.7396, "step": 1190 }, { "epoch": 0.7977303070761015, "grad_norm": 0.8587947464461109, "learning_rate": 1.1275964391691394e-05, "loss": 0.7534, "step": 1195 }, { "epoch": 0.8010680907877169, "grad_norm": 0.9219949726663529, "learning_rate": 1.1090504451038576e-05, "loss": 0.7472, "step": 1200 }, { "epoch": 0.8044058744993324, "grad_norm": 0.9232814244897973, "learning_rate": 1.0905044510385757e-05, "loss": 0.7361, "step": 1205 }, { "epoch": 0.807743658210948, "grad_norm": 0.9036693012652329, "learning_rate": 1.0719584569732939e-05, "loss": 0.7714, "step": 1210 }, { "epoch": 0.8110814419225634, "grad_norm": 0.8986092027119317, "learning_rate": 1.053412462908012e-05, "loss": 0.7291, "step": 1215 }, { "epoch": 0.8144192256341789, "grad_norm": 0.9600751192343961, "learning_rate": 1.0348664688427301e-05, "loss": 0.7406, "step": 1220 }, { "epoch": 0.8177570093457944, "grad_norm": 0.9765858612379583, "learning_rate": 1.0163204747774481e-05, "loss": 0.744, "step": 1225 }, { "epoch": 0.8210947930574098, "grad_norm": 0.9436664489477504, "learning_rate": 9.977744807121663e-06, "loss": 0.7537, "step": 1230 }, { "epoch": 0.8244325767690254, "grad_norm": 0.9348268117808438, "learning_rate": 9.792284866468842e-06, "loss": 0.7466, "step": 1235 }, { "epoch": 0.8277703604806409, "grad_norm": 0.8839373635727904, "learning_rate": 9.606824925816024e-06, "loss": 0.7225, "step": 1240 }, { "epoch": 0.8311081441922563, "grad_norm": 0.9350825333412003, "learning_rate": 9.421364985163205e-06, "loss": 0.7535, "step": 1245 }, { "epoch": 0.8344459279038718, "grad_norm": 0.9170815856659084, "learning_rate": 9.235905044510387e-06, "loss": 0.7129, "step": 1250 }, { "epoch": 0.8377837116154874, "grad_norm": 0.8931728840330895, "learning_rate": 9.050445103857568e-06, "loss": 0.7312, "step": 1255 }, { "epoch": 0.8411214953271028, "grad_norm": 0.947606427769052, "learning_rate": 8.864985163204748e-06, "loss": 0.7198, "step": 1260 }, { "epoch": 0.8444592790387183, "grad_norm": 0.9867265496764112, "learning_rate": 8.679525222551929e-06, "loss": 0.7627, "step": 1265 }, { "epoch": 0.8477970627503337, "grad_norm": 1.0256443772674286, "learning_rate": 8.49406528189911e-06, "loss": 0.7466, "step": 1270 }, { "epoch": 0.8511348464619493, "grad_norm": 1.0226735020250939, "learning_rate": 8.30860534124629e-06, "loss": 0.7524, "step": 1275 }, { "epoch": 0.8544726301735648, "grad_norm": 0.918655948279863, "learning_rate": 8.123145400593472e-06, "loss": 0.7395, "step": 1280 }, { "epoch": 0.8578104138851802, "grad_norm": 0.9337021354955276, "learning_rate": 7.937685459940653e-06, "loss": 0.7311, "step": 1285 }, { "epoch": 0.8611481975967957, "grad_norm": 1.0290658689780736, "learning_rate": 7.752225519287835e-06, "loss": 0.7533, "step": 1290 }, { "epoch": 0.8644859813084113, "grad_norm": 0.9244023648001366, "learning_rate": 7.566765578635016e-06, "loss": 0.7366, "step": 1295 }, { "epoch": 0.8678237650200267, "grad_norm": 0.9537206897694197, "learning_rate": 7.381305637982196e-06, "loss": 0.7399, "step": 1300 }, { "epoch": 0.8711615487316422, "grad_norm": 0.8743472670723075, "learning_rate": 7.195845697329377e-06, "loss": 0.7296, "step": 1305 }, { "epoch": 0.8744993324432577, "grad_norm": 0.8934119128550586, "learning_rate": 7.0103857566765585e-06, "loss": 0.7416, "step": 1310 }, { "epoch": 0.8778371161548731, "grad_norm": 0.9618800479933417, "learning_rate": 6.824925816023739e-06, "loss": 0.7322, "step": 1315 }, { "epoch": 0.8811748998664887, "grad_norm": 0.830157637044439, "learning_rate": 6.6394658753709205e-06, "loss": 0.7292, "step": 1320 }, { "epoch": 0.8845126835781041, "grad_norm": 0.9363830426057013, "learning_rate": 6.4540059347181e-06, "loss": 0.734, "step": 1325 }, { "epoch": 0.8878504672897196, "grad_norm": 0.9442444285959836, "learning_rate": 6.2685459940652825e-06, "loss": 0.7297, "step": 1330 }, { "epoch": 0.8911882510013351, "grad_norm": 0.9348111437754156, "learning_rate": 6.083086053412463e-06, "loss": 0.7382, "step": 1335 }, { "epoch": 0.8945260347129506, "grad_norm": 0.894299426623852, "learning_rate": 5.8976261127596445e-06, "loss": 0.7125, "step": 1340 }, { "epoch": 0.8978638184245661, "grad_norm": 0.9736360020611752, "learning_rate": 5.712166172106825e-06, "loss": 0.7277, "step": 1345 }, { "epoch": 0.9012016021361816, "grad_norm": 0.9591002259783217, "learning_rate": 5.5267062314540065e-06, "loss": 0.7258, "step": 1350 }, { "epoch": 0.904539385847797, "grad_norm": 0.9710736921612175, "learning_rate": 5.341246290801187e-06, "loss": 0.7381, "step": 1355 }, { "epoch": 0.9078771695594126, "grad_norm": 0.8730608837945383, "learning_rate": 5.155786350148368e-06, "loss": 0.7251, "step": 1360 }, { "epoch": 0.9112149532710281, "grad_norm": 0.903052568907302, "learning_rate": 4.970326409495549e-06, "loss": 0.7094, "step": 1365 }, { "epoch": 0.9145527369826435, "grad_norm": 0.9621382332532343, "learning_rate": 4.7848664688427305e-06, "loss": 0.7227, "step": 1370 }, { "epoch": 0.917890520694259, "grad_norm": 0.9252627526168606, "learning_rate": 4.599406528189911e-06, "loss": 0.7271, "step": 1375 }, { "epoch": 0.9212283044058746, "grad_norm": 0.9417940189754115, "learning_rate": 4.413946587537092e-06, "loss": 0.7421, "step": 1380 }, { "epoch": 0.92456608811749, "grad_norm": 0.9682630131774531, "learning_rate": 4.228486646884274e-06, "loss": 0.7421, "step": 1385 }, { "epoch": 0.9279038718291055, "grad_norm": 1.0307114272788542, "learning_rate": 4.0430267062314545e-06, "loss": 0.7521, "step": 1390 }, { "epoch": 0.9312416555407209, "grad_norm": 0.8832770378598581, "learning_rate": 3.857566765578635e-06, "loss": 0.7124, "step": 1395 }, { "epoch": 0.9345794392523364, "grad_norm": 0.9746407857253272, "learning_rate": 3.672106824925816e-06, "loss": 0.7302, "step": 1400 }, { "epoch": 0.937917222963952, "grad_norm": 0.9499407254108757, "learning_rate": 3.4866468842729975e-06, "loss": 0.7128, "step": 1405 }, { "epoch": 0.9412550066755674, "grad_norm": 0.9145822943128387, "learning_rate": 3.3011869436201785e-06, "loss": 0.7322, "step": 1410 }, { "epoch": 0.9445927903871829, "grad_norm": 0.9757309463266676, "learning_rate": 3.115727002967359e-06, "loss": 0.7336, "step": 1415 }, { "epoch": 0.9479305740987984, "grad_norm": 0.9534792908684466, "learning_rate": 2.93026706231454e-06, "loss": 0.7312, "step": 1420 }, { "epoch": 0.9512683578104139, "grad_norm": 0.9679148570410533, "learning_rate": 2.744807121661721e-06, "loss": 0.7128, "step": 1425 }, { "epoch": 0.9546061415220294, "grad_norm": 0.9493106907067503, "learning_rate": 2.559347181008902e-06, "loss": 0.7306, "step": 1430 }, { "epoch": 0.9579439252336449, "grad_norm": 0.9679382523098454, "learning_rate": 2.3738872403560835e-06, "loss": 0.7328, "step": 1435 }, { "epoch": 0.9612817089452603, "grad_norm": 0.9317606747634454, "learning_rate": 2.188427299703264e-06, "loss": 0.7238, "step": 1440 }, { "epoch": 0.9646194926568759, "grad_norm": 0.9275615394707037, "learning_rate": 2.0029673590504455e-06, "loss": 0.7171, "step": 1445 }, { "epoch": 0.9679572763684913, "grad_norm": 0.9633454475236382, "learning_rate": 1.8175074183976263e-06, "loss": 0.708, "step": 1450 }, { "epoch": 0.9712950600801068, "grad_norm": 0.9399727708328366, "learning_rate": 1.6320474777448073e-06, "loss": 0.7291, "step": 1455 }, { "epoch": 0.9746328437917223, "grad_norm": 1.020554951634919, "learning_rate": 1.4465875370919883e-06, "loss": 0.7461, "step": 1460 }, { "epoch": 0.9779706275033377, "grad_norm": 0.985081620856249, "learning_rate": 1.2611275964391693e-06, "loss": 0.7228, "step": 1465 }, { "epoch": 0.9813084112149533, "grad_norm": 0.9385893286244219, "learning_rate": 1.0756676557863502e-06, "loss": 0.72, "step": 1470 }, { "epoch": 0.9846461949265688, "grad_norm": 1.0049513545701543, "learning_rate": 8.902077151335312e-07, "loss": 0.7147, "step": 1475 }, { "epoch": 0.9879839786381842, "grad_norm": 1.0099532146134622, "learning_rate": 7.047477744807121e-07, "loss": 0.7282, "step": 1480 }, { "epoch": 0.9913217623497997, "grad_norm": 0.8888912225564214, "learning_rate": 5.192878338278931e-07, "loss": 0.7332, "step": 1485 }, { "epoch": 0.9946595460614153, "grad_norm": 0.9997610443207388, "learning_rate": 3.338278931750742e-07, "loss": 0.7225, "step": 1490 }, { "epoch": 0.9979973297730307, "grad_norm": 0.9412531346267364, "learning_rate": 1.4836795252225522e-07, "loss": 0.7318, "step": 1495 }, { "epoch": 1.0, "step": 1498, "total_flos": 92324696948736.0, "train_loss": 0.788116905017593, "train_runtime": 3845.5406, "train_samples_per_second": 49.839, "train_steps_per_second": 0.39 } ], "logging_steps": 5, "max_steps": 1498, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 180000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 92324696948736.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }