{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 3.097726066927376, "learning_rate": 2.875399361022364e-07, "loss": 0.4182, "step": 10 }, { "epoch": 0.0064, "grad_norm": 2.6698471287095344, "learning_rate": 6.070287539936103e-07, "loss": 0.3969, "step": 20 }, { "epoch": 0.0096, "grad_norm": 2.3652285610261403, "learning_rate": 9.265175718849841e-07, "loss": 0.3746, "step": 30 }, { "epoch": 0.0128, "grad_norm": 1.4765456643161126, "learning_rate": 1.2460063897763578e-06, "loss": 0.3695, "step": 40 }, { "epoch": 0.016, "grad_norm": 2.191519526172484, "learning_rate": 1.565495207667732e-06, "loss": 0.3449, "step": 50 }, { "epoch": 0.0192, "grad_norm": 1.6285826522074962, "learning_rate": 1.8849840255591056e-06, "loss": 0.3181, "step": 60 }, { "epoch": 0.0224, "grad_norm": 1.8683549636521075, "learning_rate": 2.2044728434504793e-06, "loss": 0.322, "step": 70 }, { "epoch": 0.0256, "grad_norm": 1.5301019294828755, "learning_rate": 2.5239616613418532e-06, "loss": 0.3009, "step": 80 }, { "epoch": 0.0288, "grad_norm": 1.538896137107791, "learning_rate": 2.8434504792332267e-06, "loss": 0.293, "step": 90 }, { "epoch": 0.032, "grad_norm": 1.6984302868653567, "learning_rate": 3.162939297124601e-06, "loss": 0.2999, "step": 100 }, { "epoch": 0.0352, "grad_norm": 1.7119603328381132, "learning_rate": 3.482428115015975e-06, "loss": 0.2945, "step": 110 }, { "epoch": 0.0384, "grad_norm": 1.6069777286559, "learning_rate": 3.8019169329073485e-06, "loss": 0.2895, "step": 120 }, { "epoch": 0.0416, "grad_norm": 1.7566779999753588, "learning_rate": 4.121405750798722e-06, "loss": 0.2923, "step": 130 }, { "epoch": 0.0448, "grad_norm": 1.7352248180581427, "learning_rate": 4.440894568690096e-06, "loss": 0.3638, "step": 140 }, { "epoch": 0.048, "grad_norm": 1.8483046207545708, "learning_rate": 4.76038338658147e-06, "loss": 0.2861, "step": 150 }, { "epoch": 0.0512, "grad_norm": 1.6829534856608535, "learning_rate": 5.079872204472844e-06, "loss": 0.5296, "step": 160 }, { "epoch": 0.0544, "grad_norm": 1.7746050883459261, "learning_rate": 5.399361022364218e-06, "loss": 0.2934, "step": 170 }, { "epoch": 0.0576, "grad_norm": 1.7179977616371749, "learning_rate": 5.718849840255591e-06, "loss": 0.2713, "step": 180 }, { "epoch": 0.0608, "grad_norm": 1.5706330739618684, "learning_rate": 6.038338658146965e-06, "loss": 0.2779, "step": 190 }, { "epoch": 0.064, "grad_norm": 1.8232979139526881, "learning_rate": 6.35782747603834e-06, "loss": 0.2835, "step": 200 }, { "epoch": 0.0672, "grad_norm": 1.755606637900451, "learning_rate": 6.677316293929713e-06, "loss": 0.3767, "step": 210 }, { "epoch": 0.0704, "grad_norm": 1.7142425655315976, "learning_rate": 6.996805111821087e-06, "loss": 0.2864, "step": 220 }, { "epoch": 0.0736, "grad_norm": 1.6051059574805915, "learning_rate": 7.316293929712461e-06, "loss": 0.2767, "step": 230 }, { "epoch": 0.0768, "grad_norm": 1.49334442261346, "learning_rate": 7.635782747603835e-06, "loss": 0.2689, "step": 240 }, { "epoch": 0.08, "grad_norm": 1.6270294243342234, "learning_rate": 7.955271565495208e-06, "loss": 0.2857, "step": 250 }, { "epoch": 0.0832, "grad_norm": 1.4820640434767234, "learning_rate": 8.274760383386582e-06, "loss": 0.2838, "step": 260 }, { "epoch": 0.0864, "grad_norm": 1.4755509754091296, "learning_rate": 8.594249201277956e-06, "loss": 0.2829, "step": 270 }, { "epoch": 0.0896, "grad_norm": 1.5750758252112724, "learning_rate": 8.91373801916933e-06, "loss": 0.2716, "step": 280 }, { "epoch": 0.0928, "grad_norm": 1.8211062452729152, "learning_rate": 9.233226837060704e-06, "loss": 0.2985, "step": 290 }, { "epoch": 0.096, "grad_norm": 1.470802373475764, "learning_rate": 9.552715654952077e-06, "loss": 0.2722, "step": 300 }, { "epoch": 0.0992, "grad_norm": 1.5951743640227587, "learning_rate": 9.87220447284345e-06, "loss": 0.2834, "step": 310 }, { "epoch": 0.1024, "grad_norm": 1.5085331338575516, "learning_rate": 9.999887666317538e-06, "loss": 0.2866, "step": 320 }, { "epoch": 0.1056, "grad_norm": 1.5890905665931712, "learning_rate": 9.999201200981566e-06, "loss": 0.2774, "step": 330 }, { "epoch": 0.1088, "grad_norm": 1.4824327210922925, "learning_rate": 9.997890763487869e-06, "loss": 0.2865, "step": 340 }, { "epoch": 0.112, "grad_norm": 1.4179211935288807, "learning_rate": 9.995956517397884e-06, "loss": 0.3445, "step": 350 }, { "epoch": 0.1152, "grad_norm": 1.6311477640409462, "learning_rate": 9.993398704133318e-06, "loss": 0.2961, "step": 360 }, { "epoch": 0.1184, "grad_norm": 1.3493949424853904, "learning_rate": 9.990217642946028e-06, "loss": 0.2693, "step": 370 }, { "epoch": 0.1216, "grad_norm": 1.5074472178009761, "learning_rate": 9.986413730878168e-06, "loss": 0.2908, "step": 380 }, { "epoch": 0.1248, "grad_norm": 1.5552421320809011, "learning_rate": 9.981987442712634e-06, "loss": 0.3047, "step": 390 }, { "epoch": 0.128, "grad_norm": 1.5085252753416616, "learning_rate": 9.976939330913801e-06, "loss": 0.2761, "step": 400 }, { "epoch": 0.1312, "grad_norm": 1.3894985533924997, "learning_rate": 9.971270025558576e-06, "loss": 0.2827, "step": 410 }, { "epoch": 0.1344, "grad_norm": 1.376517568828387, "learning_rate": 9.96498023425774e-06, "loss": 0.282, "step": 420 }, { "epoch": 0.1376, "grad_norm": 1.293372373569424, "learning_rate": 9.958070742067649e-06, "loss": 0.2864, "step": 430 }, { "epoch": 0.1408, "grad_norm": 1.3933504307268871, "learning_rate": 9.95054241139223e-06, "loss": 0.2886, "step": 440 }, { "epoch": 0.144, "grad_norm": 1.5512528120422318, "learning_rate": 9.942396181875342e-06, "loss": 0.2792, "step": 450 }, { "epoch": 0.1472, "grad_norm": 1.3955106828693524, "learning_rate": 9.933633070283512e-06, "loss": 0.2823, "step": 460 }, { "epoch": 0.1504, "grad_norm": 1.6468424852669425, "learning_rate": 9.924254170379007e-06, "loss": 0.2984, "step": 470 }, { "epoch": 0.1536, "grad_norm": 1.5159031541701018, "learning_rate": 9.914260652783323e-06, "loss": 0.271, "step": 480 }, { "epoch": 0.1568, "grad_norm": 1.5805212725674178, "learning_rate": 9.903653764831088e-06, "loss": 0.2957, "step": 490 }, { "epoch": 0.16, "grad_norm": 1.1956547216731315, "learning_rate": 9.892434830414354e-06, "loss": 0.2684, "step": 500 }, { "epoch": 0.1632, "grad_norm": 1.4268809698190317, "learning_rate": 9.880605249817377e-06, "loss": 0.3657, "step": 510 }, { "epoch": 0.1664, "grad_norm": 1.4179964964215248, "learning_rate": 9.868166499541824e-06, "loss": 0.2751, "step": 520 }, { "epoch": 0.1696, "grad_norm": 1.2892232060188915, "learning_rate": 9.855120132122503e-06, "loss": 0.2729, "step": 530 }, { "epoch": 0.1728, "grad_norm": 1.261114838277135, "learning_rate": 9.841467775933566e-06, "loss": 0.268, "step": 540 }, { "epoch": 0.176, "grad_norm": 1.430271043321, "learning_rate": 9.827211134985273e-06, "loss": 0.4323, "step": 550 }, { "epoch": 0.1792, "grad_norm": 1.2954706652401755, "learning_rate": 9.812351988711312e-06, "loss": 0.2622, "step": 560 }, { "epoch": 0.1824, "grad_norm": 1.7636694848560834, "learning_rate": 9.79689219174669e-06, "loss": 0.3947, "step": 570 }, { "epoch": 0.1856, "grad_norm": 1.5362869932266252, "learning_rate": 9.780833673696255e-06, "loss": 0.261, "step": 580 }, { "epoch": 0.1888, "grad_norm": 1.2516741612873126, "learning_rate": 9.76417843889385e-06, "loss": 0.2632, "step": 590 }, { "epoch": 0.192, "grad_norm": 1.330313742879521, "learning_rate": 9.746928566152148e-06, "loss": 0.2612, "step": 600 }, { "epoch": 0.1952, "grad_norm": 1.369062918644532, "learning_rate": 9.729086208503174e-06, "loss": 0.2672, "step": 610 }, { "epoch": 0.1984, "grad_norm": 1.4183586038757179, "learning_rate": 9.710653592929595e-06, "loss": 0.2738, "step": 620 }, { "epoch": 0.2016, "grad_norm": 1.52682494954233, "learning_rate": 9.691633020086745e-06, "loss": 0.3808, "step": 630 }, { "epoch": 0.2048, "grad_norm": 1.4376998703858688, "learning_rate": 9.672026864015476e-06, "loss": 0.2643, "step": 640 }, { "epoch": 0.208, "grad_norm": 1.583565617875829, "learning_rate": 9.651837571845842e-06, "loss": 0.2772, "step": 650 }, { "epoch": 0.2112, "grad_norm": 1.4806395536482786, "learning_rate": 9.631067663491663e-06, "loss": 0.3377, "step": 660 }, { "epoch": 0.2144, "grad_norm": 1.2496121580498412, "learning_rate": 9.609719731336005e-06, "loss": 0.2752, "step": 670 }, { "epoch": 0.2176, "grad_norm": 1.3439013535339106, "learning_rate": 9.587796439907609e-06, "loss": 0.2685, "step": 680 }, { "epoch": 0.2208, "grad_norm": 1.4305791278136326, "learning_rate": 9.565300525548327e-06, "loss": 0.2754, "step": 690 }, { "epoch": 0.224, "grad_norm": 1.4621614330412347, "learning_rate": 9.542234796071577e-06, "loss": 0.2627, "step": 700 }, { "epoch": 0.2272, "grad_norm": 1.3498296779237806, "learning_rate": 9.518602130411894e-06, "loss": 0.2861, "step": 710 }, { "epoch": 0.2304, "grad_norm": 1.3530290691335103, "learning_rate": 9.4944054782656e-06, "loss": 0.2668, "step": 720 }, { "epoch": 0.2336, "grad_norm": 1.5377603964582789, "learning_rate": 9.469647859722634e-06, "loss": 0.2679, "step": 730 }, { "epoch": 0.2368, "grad_norm": 1.225090999497597, "learning_rate": 9.444332364889603e-06, "loss": 0.2491, "step": 740 }, { "epoch": 0.24, "grad_norm": 1.2724375938152441, "learning_rate": 9.41846215350409e-06, "loss": 0.2756, "step": 750 }, { "epoch": 0.2432, "grad_norm": 1.3649187680293995, "learning_rate": 9.392040454540284e-06, "loss": 0.2685, "step": 760 }, { "epoch": 0.2464, "grad_norm": 1.506689299970985, "learning_rate": 9.365070565805941e-06, "loss": 0.2727, "step": 770 }, { "epoch": 0.2496, "grad_norm": 1.5365021125318565, "learning_rate": 9.337555853530785e-06, "loss": 0.2695, "step": 780 }, { "epoch": 0.2528, "grad_norm": 1.4793118284689457, "learning_rate": 9.309499751946345e-06, "loss": 0.2712, "step": 790 }, { "epoch": 0.256, "grad_norm": 1.3520775224079904, "learning_rate": 9.280905762857315e-06, "loss": 0.2658, "step": 800 }, { "epoch": 0.2592, "grad_norm": 1.0943480269165575, "learning_rate": 9.251777455204485e-06, "loss": 0.2699, "step": 810 }, { "epoch": 0.2624, "grad_norm": 1.4272492824528293, "learning_rate": 9.222118464619278e-06, "loss": 0.2552, "step": 820 }, { "epoch": 0.2656, "grad_norm": 2.210970619104712, "learning_rate": 9.191932492969972e-06, "loss": 0.2609, "step": 830 }, { "epoch": 0.2688, "grad_norm": 1.3860115638657557, "learning_rate": 9.161223307899659e-06, "loss": 0.2614, "step": 840 }, { "epoch": 0.272, "grad_norm": 1.3097054200264115, "learning_rate": 9.129994742355985e-06, "loss": 0.2558, "step": 850 }, { "epoch": 0.2752, "grad_norm": 1.2863046042758517, "learning_rate": 9.09825069411274e-06, "loss": 0.3506, "step": 860 }, { "epoch": 0.2784, "grad_norm": 1.4154177255509635, "learning_rate": 9.065995125283367e-06, "loss": 0.3483, "step": 870 }, { "epoch": 0.2816, "grad_norm": 1.2614915111427545, "learning_rate": 9.033232061826428e-06, "loss": 0.3094, "step": 880 }, { "epoch": 0.2848, "grad_norm": 1.392215585602565, "learning_rate": 8.999965593043113e-06, "loss": 0.2552, "step": 890 }, { "epoch": 0.288, "grad_norm": 1.2787168993445066, "learning_rate": 8.96619987106682e-06, "loss": 0.2635, "step": 900 }, { "epoch": 0.2912, "grad_norm": 1.1874290348203185, "learning_rate": 8.931939110344935e-06, "loss": 0.3925, "step": 910 }, { "epoch": 0.2944, "grad_norm": 1.2892860612183799, "learning_rate": 8.897187587112783e-06, "loss": 0.472, "step": 920 }, { "epoch": 0.2976, "grad_norm": 1.1180916115878632, "learning_rate": 8.861949638859908e-06, "loss": 0.4072, "step": 930 }, { "epoch": 0.3008, "grad_norm": 1.311746880897253, "learning_rate": 8.826229663788688e-06, "loss": 0.2632, "step": 940 }, { "epoch": 0.304, "grad_norm": 1.2164327029930522, "learning_rate": 8.790032120265373e-06, "loss": 0.2532, "step": 950 }, { "epoch": 0.3072, "grad_norm": 1.436604570629404, "learning_rate": 8.753361526263622e-06, "loss": 0.2591, "step": 960 }, { "epoch": 0.3104, "grad_norm": 1.2626583454725855, "learning_rate": 8.716222458800591e-06, "loss": 0.2617, "step": 970 }, { "epoch": 0.3136, "grad_norm": 1.5056105724631803, "learning_rate": 8.67861955336566e-06, "loss": 0.2622, "step": 980 }, { "epoch": 0.3168, "grad_norm": 1.4940608590847797, "learning_rate": 8.640557503341843e-06, "loss": 0.2689, "step": 990 }, { "epoch": 0.32, "grad_norm": 1.4135983918557036, "learning_rate": 8.602041059420017e-06, "loss": 0.2599, "step": 1000 }, { "epoch": 0.3232, "grad_norm": 1.2654674588600259, "learning_rate": 8.563075029005924e-06, "loss": 0.2598, "step": 1010 }, { "epoch": 0.3264, "grad_norm": 1.5394425180175606, "learning_rate": 8.523664275620185e-06, "loss": 0.284, "step": 1020 }, { "epoch": 0.3296, "grad_norm": 1.2453557042164298, "learning_rate": 8.483813718291223e-06, "loss": 0.2475, "step": 1030 }, { "epoch": 0.3328, "grad_norm": 1.3932335036293408, "learning_rate": 8.443528330941322e-06, "loss": 0.2733, "step": 1040 }, { "epoch": 0.336, "grad_norm": 1.3326668118526488, "learning_rate": 8.402813141765796e-06, "loss": 0.2706, "step": 1050 }, { "epoch": 0.3392, "grad_norm": 1.4498463244679263, "learning_rate": 8.361673232605408e-06, "loss": 0.2599, "step": 1060 }, { "epoch": 0.3424, "grad_norm": 1.5505084458466836, "learning_rate": 8.320113738312081e-06, "loss": 0.2582, "step": 1070 }, { "epoch": 0.3456, "grad_norm": 1.3933287663129275, "learning_rate": 8.27813984610799e-06, "loss": 0.251, "step": 1080 }, { "epoch": 0.3488, "grad_norm": 1.3411841017130814, "learning_rate": 8.235756794938123e-06, "loss": 0.2545, "step": 1090 }, { "epoch": 0.352, "grad_norm": 1.2762415468326629, "learning_rate": 8.19296987481639e-06, "loss": 0.2511, "step": 1100 }, { "epoch": 0.3552, "grad_norm": 1.3756624147468053, "learning_rate": 8.149784426165351e-06, "loss": 0.2471, "step": 1110 }, { "epoch": 0.3584, "grad_norm": 1.2367975395806792, "learning_rate": 8.106205839149653e-06, "loss": 0.2398, "step": 1120 }, { "epoch": 0.3616, "grad_norm": 1.2795711630605267, "learning_rate": 8.06223955300326e-06, "loss": 0.25, "step": 1130 }, { "epoch": 0.3648, "grad_norm": 1.181366173079793, "learning_rate": 8.017891055350563e-06, "loss": 0.3787, "step": 1140 }, { "epoch": 0.368, "grad_norm": 1.235985789921985, "learning_rate": 7.973165881521435e-06, "loss": 0.2489, "step": 1150 }, { "epoch": 0.3712, "grad_norm": 1.3716107886978977, "learning_rate": 7.928069613860357e-06, "loss": 0.2638, "step": 1160 }, { "epoch": 0.3744, "grad_norm": 1.3314537160679019, "learning_rate": 7.882607881029652e-06, "loss": 0.2666, "step": 1170 }, { "epoch": 0.3776, "grad_norm": 1.47532226068613, "learning_rate": 7.836786357306943e-06, "loss": 0.2469, "step": 1180 }, { "epoch": 0.3808, "grad_norm": 1.2910123474743094, "learning_rate": 7.790610761876936e-06, "loss": 0.245, "step": 1190 }, { "epoch": 0.384, "grad_norm": 1.219675861544086, "learning_rate": 7.744086858117565e-06, "loss": 0.2532, "step": 1200 }, { "epoch": 0.3872, "grad_norm": 1.2663481766467186, "learning_rate": 7.69722045288066e-06, "loss": 0.2648, "step": 1210 }, { "epoch": 0.3904, "grad_norm": 1.2601430683107997, "learning_rate": 7.650017395767149e-06, "loss": 0.2428, "step": 1220 }, { "epoch": 0.3936, "grad_norm": 1.306765832939207, "learning_rate": 7.602483578396955e-06, "loss": 0.244, "step": 1230 }, { "epoch": 0.3968, "grad_norm": 1.3723024837409241, "learning_rate": 7.554624933673638e-06, "loss": 0.3634, "step": 1240 }, { "epoch": 0.4, "grad_norm": 1.5507546450379777, "learning_rate": 7.5064474350438755e-06, "loss": 0.2618, "step": 1250 }, { "epoch": 0.4032, "grad_norm": 1.366012809594849, "learning_rate": 7.457957095751896e-06, "loss": 0.2487, "step": 1260 }, { "epoch": 0.4064, "grad_norm": 1.1873848911132183, "learning_rate": 7.4091599680889425e-06, "loss": 0.2553, "step": 1270 }, { "epoch": 0.4096, "grad_norm": 1.4014484490952397, "learning_rate": 7.3600621426378515e-06, "loss": 0.2402, "step": 1280 }, { "epoch": 0.4128, "grad_norm": 1.1617991472531028, "learning_rate": 7.3106697475128655e-06, "loss": 0.2569, "step": 1290 }, { "epoch": 0.416, "grad_norm": 1.3751891734766228, "learning_rate": 7.260988947594759e-06, "loss": 0.2488, "step": 1300 }, { "epoch": 0.4192, "grad_norm": 1.237845283871077, "learning_rate": 7.211025943761367e-06, "loss": 0.2454, "step": 1310 }, { "epoch": 0.4224, "grad_norm": 1.1628031806651409, "learning_rate": 7.160786972113627e-06, "loss": 0.2554, "step": 1320 }, { "epoch": 0.4256, "grad_norm": 1.3404586293984995, "learning_rate": 7.1102783031972326e-06, "loss": 0.2551, "step": 1330 }, { "epoch": 0.4288, "grad_norm": 1.4670462801897188, "learning_rate": 7.059506241219964e-06, "loss": 0.2502, "step": 1340 }, { "epoch": 0.432, "grad_norm": 1.0302147200635003, "learning_rate": 7.008477123264849e-06, "loss": 0.2295, "step": 1350 }, { "epoch": 0.4352, "grad_norm": 1.2824619758646878, "learning_rate": 6.957197318499187e-06, "loss": 0.2539, "step": 1360 }, { "epoch": 0.4384, "grad_norm": 1.3608888568790518, "learning_rate": 6.905673227379606e-06, "loss": 0.342, "step": 1370 }, { "epoch": 0.4416, "grad_norm": 1.2538554464958942, "learning_rate": 6.853911280853168e-06, "loss": 0.2405, "step": 1380 }, { "epoch": 0.4448, "grad_norm": 1.377080653341555, "learning_rate": 6.801917939554721e-06, "loss": 0.246, "step": 1390 }, { "epoch": 0.448, "grad_norm": 1.295752065257539, "learning_rate": 6.749699693000495e-06, "loss": 0.238, "step": 1400 }, { "epoch": 0.4512, "grad_norm": 1.1623556510041717, "learning_rate": 6.6972630587781385e-06, "loss": 0.3522, "step": 1410 }, { "epoch": 0.4544, "grad_norm": 1.0514412139989395, "learning_rate": 6.6446145817332105e-06, "loss": 0.2448, "step": 1420 }, { "epoch": 0.4576, "grad_norm": 1.3006684707343785, "learning_rate": 6.591760833152306e-06, "loss": 0.2388, "step": 1430 }, { "epoch": 0.4608, "grad_norm": 1.3688639995228327, "learning_rate": 6.538708409942854e-06, "loss": 0.2484, "step": 1440 }, { "epoch": 0.464, "grad_norm": 1.182581571604436, "learning_rate": 6.48546393380973e-06, "loss": 0.2411, "step": 1450 }, { "epoch": 0.4672, "grad_norm": 1.2229123644507025, "learning_rate": 6.4320340504287825e-06, "loss": 0.2913, "step": 1460 }, { "epoch": 0.4704, "grad_norm": 1.114177546269806, "learning_rate": 6.378425428617343e-06, "loss": 0.5038, "step": 1470 }, { "epoch": 0.4736, "grad_norm": 1.400649143081864, "learning_rate": 6.324644759501869e-06, "loss": 0.2514, "step": 1480 }, { "epoch": 0.4768, "grad_norm": 1.2414663469351401, "learning_rate": 6.270698755682792e-06, "loss": 0.2434, "step": 1490 }, { "epoch": 0.48, "grad_norm": 1.3094558123042377, "learning_rate": 6.2165941503966995e-06, "loss": 0.2486, "step": 1500 }, { "epoch": 0.4832, "grad_norm": 1.1308739268431762, "learning_rate": 6.162337696675909e-06, "loss": 0.224, "step": 1510 }, { "epoch": 0.4864, "grad_norm": 1.2921131535571986, "learning_rate": 6.107936166505615e-06, "loss": 0.25, "step": 1520 }, { "epoch": 0.4896, "grad_norm": 1.277856644085611, "learning_rate": 6.053396349978632e-06, "loss": 0.2418, "step": 1530 }, { "epoch": 0.4928, "grad_norm": 1.207819320049489, "learning_rate": 5.998725054447904e-06, "loss": 0.2429, "step": 1540 }, { "epoch": 0.496, "grad_norm": 1.3964622577997952, "learning_rate": 5.943929103676839e-06, "loss": 0.2339, "step": 1550 }, { "epoch": 0.4992, "grad_norm": 1.0444429321557864, "learning_rate": 5.889015336987614e-06, "loss": 0.2296, "step": 1560 }, { "epoch": 0.5024, "grad_norm": 1.000405925226846, "learning_rate": 5.833990608407525e-06, "loss": 0.2227, "step": 1570 }, { "epoch": 0.5056, "grad_norm": 1.243996070580087, "learning_rate": 5.778861785813508e-06, "loss": 0.228, "step": 1580 }, { "epoch": 0.5088, "grad_norm": 1.4163588613795817, "learning_rate": 5.723635750074924e-06, "loss": 0.262, "step": 1590 }, { "epoch": 0.512, "grad_norm": 1.2497990764938063, "learning_rate": 5.6683193941947365e-06, "loss": 0.2354, "step": 1600 }, { "epoch": 0.5152, "grad_norm": 1.0918427100928676, "learning_rate": 5.61291962244916e-06, "loss": 0.2343, "step": 1610 }, { "epoch": 0.5184, "grad_norm": 1.2313539946902519, "learning_rate": 5.5574433495259015e-06, "loss": 0.2224, "step": 1620 }, { "epoch": 0.5216, "grad_norm": 1.3110693967381668, "learning_rate": 5.501897499661123e-06, "loss": 0.2353, "step": 1630 }, { "epoch": 0.5248, "grad_norm": 1.3211565707814268, "learning_rate": 5.446289005775185e-06, "loss": 0.2448, "step": 1640 }, { "epoch": 0.528, "grad_norm": 1.336496780231579, "learning_rate": 5.390624808607321e-06, "loss": 0.2353, "step": 1650 }, { "epoch": 0.5312, "grad_norm": 1.4349961842702135, "learning_rate": 5.334911855849334e-06, "loss": 0.2344, "step": 1660 }, { "epoch": 0.5344, "grad_norm": 1.300276553470609, "learning_rate": 5.279157101278433e-06, "loss": 0.2488, "step": 1670 }, { "epoch": 0.5376, "grad_norm": 1.3938747275358123, "learning_rate": 5.2233675038892815e-06, "loss": 0.235, "step": 1680 }, { "epoch": 0.5408, "grad_norm": 1.0695710070831115, "learning_rate": 5.1675500270254385e-06, "loss": 0.2358, "step": 1690 }, { "epoch": 0.544, "grad_norm": 1.3464266574587438, "learning_rate": 5.111711637510216e-06, "loss": 0.262, "step": 1700 }, { "epoch": 0.5472, "grad_norm": 1.2054969480183788, "learning_rate": 5.055859304777127e-06, "loss": 0.2346, "step": 1710 }, { "epoch": 0.5504, "grad_norm": 1.592945506341502, "learning_rate": 5e-06, "loss": 0.2491, "step": 1720 }, { "epoch": 0.5536, "grad_norm": 1.0841849724930561, "learning_rate": 4.944140695222874e-06, "loss": 0.2373, "step": 1730 }, { "epoch": 0.5568, "grad_norm": 1.1462434966725683, "learning_rate": 4.888288362489786e-06, "loss": 0.2245, "step": 1740 }, { "epoch": 0.56, "grad_norm": 1.2006687090364805, "learning_rate": 4.832449972974564e-06, "loss": 0.2281, "step": 1750 }, { "epoch": 0.5632, "grad_norm": 1.31046822802538, "learning_rate": 4.776632496110721e-06, "loss": 0.2343, "step": 1760 }, { "epoch": 0.5664, "grad_norm": 1.3130856213140962, "learning_rate": 4.720842898721569e-06, "loss": 0.3716, "step": 1770 }, { "epoch": 0.5696, "grad_norm": 1.3909345006491758, "learning_rate": 4.665088144150666e-06, "loss": 0.2344, "step": 1780 }, { "epoch": 0.5728, "grad_norm": 1.103425988971943, "learning_rate": 4.60937519139268e-06, "loss": 0.2207, "step": 1790 }, { "epoch": 0.576, "grad_norm": 1.3110124719599046, "learning_rate": 4.553710994224816e-06, "loss": 0.2584, "step": 1800 }, { "epoch": 0.5792, "grad_norm": 1.1358436006113255, "learning_rate": 4.498102500338879e-06, "loss": 0.2408, "step": 1810 }, { "epoch": 0.5824, "grad_norm": 1.1712875527581932, "learning_rate": 4.442556650474099e-06, "loss": 0.2338, "step": 1820 }, { "epoch": 0.5856, "grad_norm": 1.4200454612891542, "learning_rate": 4.387080377550843e-06, "loss": 0.3351, "step": 1830 }, { "epoch": 0.5888, "grad_norm": 1.1792392310210245, "learning_rate": 4.331680605805264e-06, "loss": 0.2284, "step": 1840 }, { "epoch": 0.592, "grad_norm": 1.4269025040790715, "learning_rate": 4.2763642499250765e-06, "loss": 0.2831, "step": 1850 }, { "epoch": 0.5952, "grad_norm": 1.2178218832805392, "learning_rate": 4.221138214186493e-06, "loss": 0.4656, "step": 1860 }, { "epoch": 0.5984, "grad_norm": 1.381880289226324, "learning_rate": 4.166009391592476e-06, "loss": 0.2226, "step": 1870 }, { "epoch": 0.6016, "grad_norm": 1.2861868856506182, "learning_rate": 4.110984663012388e-06, "loss": 0.2436, "step": 1880 }, { "epoch": 0.6048, "grad_norm": 1.460603893087498, "learning_rate": 4.056070896323163e-06, "loss": 0.2348, "step": 1890 }, { "epoch": 0.608, "grad_norm": 1.3411148998390934, "learning_rate": 4.001274945552098e-06, "loss": 0.2294, "step": 1900 }, { "epoch": 0.6112, "grad_norm": 1.2503467379311224, "learning_rate": 3.94660365002137e-06, "loss": 0.2179, "step": 1910 }, { "epoch": 0.6144, "grad_norm": 0.8844502114106245, "learning_rate": 3.892063833494387e-06, "loss": 0.2293, "step": 1920 }, { "epoch": 0.6176, "grad_norm": 1.323943188387191, "learning_rate": 3.837662303324093e-06, "loss": 0.2395, "step": 1930 }, { "epoch": 0.6208, "grad_norm": 1.3150182927928105, "learning_rate": 3.783405849603302e-06, "loss": 0.2265, "step": 1940 }, { "epoch": 0.624, "grad_norm": 1.5251447223837293, "learning_rate": 3.729301244317208e-06, "loss": 0.2445, "step": 1950 }, { "epoch": 0.6272, "grad_norm": 1.2518632060745116, "learning_rate": 3.675355240498133e-06, "loss": 0.2229, "step": 1960 }, { "epoch": 0.6304, "grad_norm": 1.4410331274554835, "learning_rate": 3.6215745713826585e-06, "loss": 0.2269, "step": 1970 }, { "epoch": 0.6336, "grad_norm": 1.2681938269526776, "learning_rate": 3.567965949571219e-06, "loss": 0.2216, "step": 1980 }, { "epoch": 0.6368, "grad_norm": 1.2119772204720247, "learning_rate": 3.5145360661902717e-06, "loss": 0.2029, "step": 1990 }, { "epoch": 0.64, "grad_norm": 1.2769167413279376, "learning_rate": 3.4612915900571493e-06, "loss": 0.2234, "step": 2000 }, { "epoch": 0.6432, "grad_norm": 1.1935986503511073, "learning_rate": 3.408239166847696e-06, "loss": 0.2313, "step": 2010 }, { "epoch": 0.6464, "grad_norm": 1.0781337815097756, "learning_rate": 3.355385418266792e-06, "loss": 0.2292, "step": 2020 }, { "epoch": 0.6496, "grad_norm": 1.2668633413118764, "learning_rate": 3.3027369412218623e-06, "loss": 0.2231, "step": 2030 }, { "epoch": 0.6528, "grad_norm": 1.2243691251251998, "learning_rate": 3.2503003069995057e-06, "loss": 0.2127, "step": 2040 }, { "epoch": 0.656, "grad_norm": 1.1678450454685523, "learning_rate": 3.198082060445281e-06, "loss": 0.2286, "step": 2050 }, { "epoch": 0.6592, "grad_norm": 1.3654151111307413, "learning_rate": 3.1460887191468324e-06, "loss": 0.2262, "step": 2060 }, { "epoch": 0.6624, "grad_norm": 1.1453122239206175, "learning_rate": 3.0943267726203965e-06, "loss": 0.2251, "step": 2070 }, { "epoch": 0.6656, "grad_norm": 1.4758563467786459, "learning_rate": 3.042802681500814e-06, "loss": 0.2293, "step": 2080 }, { "epoch": 0.6688, "grad_norm": 1.1793224562603977, "learning_rate": 2.991522876735154e-06, "loss": 0.2205, "step": 2090 }, { "epoch": 0.672, "grad_norm": 1.2381651466593462, "learning_rate": 2.9404937587800374e-06, "loss": 0.2189, "step": 2100 }, { "epoch": 0.6752, "grad_norm": 19.66766001236314, "learning_rate": 2.889721696802768e-06, "loss": 0.2491, "step": 2110 }, { "epoch": 0.6784, "grad_norm": 1.0721784074518568, "learning_rate": 2.839213027886373e-06, "loss": 0.2266, "step": 2120 }, { "epoch": 0.6816, "grad_norm": 1.0952730571552138, "learning_rate": 2.7889740562386357e-06, "loss": 0.2156, "step": 2130 }, { "epoch": 0.6848, "grad_norm": 1.0572399244670028, "learning_rate": 2.7390110524052415e-06, "loss": 0.2355, "step": 2140 }, { "epoch": 0.688, "grad_norm": 1.0010243056654815, "learning_rate": 2.6893302524871357e-06, "loss": 0.2339, "step": 2150 }, { "epoch": 0.6912, "grad_norm": 1.1936155230445462, "learning_rate": 2.6399378573621493e-06, "loss": 0.2057, "step": 2160 }, { "epoch": 0.6944, "grad_norm": 1.4177935849928514, "learning_rate": 2.5908400319110588e-06, "loss": 0.2845, "step": 2170 }, { "epoch": 0.6976, "grad_norm": 1.3791416946344361, "learning_rate": 2.5420429042481054e-06, "loss": 0.287, "step": 2180 }, { "epoch": 0.7008, "grad_norm": 1.4748810133302899, "learning_rate": 2.493552564956126e-06, "loss": 0.2317, "step": 2190 }, { "epoch": 0.704, "grad_norm": 1.1836491485069627, "learning_rate": 2.445375066326362e-06, "loss": 0.2172, "step": 2200 }, { "epoch": 0.7072, "grad_norm": 1.2813045848755424, "learning_rate": 2.3975164216030456e-06, "loss": 0.2232, "step": 2210 }, { "epoch": 0.7104, "grad_norm": 1.3145112714821583, "learning_rate": 2.349982604232851e-06, "loss": 0.2343, "step": 2220 }, { "epoch": 0.7136, "grad_norm": 1.1808875801974772, "learning_rate": 2.3027795471193404e-06, "loss": 0.2465, "step": 2230 }, { "epoch": 0.7168, "grad_norm": 1.0981462474429815, "learning_rate": 2.255913141882436e-06, "loss": 0.3201, "step": 2240 }, { "epoch": 0.72, "grad_norm": 1.230734760390504, "learning_rate": 2.209389238123066e-06, "loss": 0.2201, "step": 2250 }, { "epoch": 0.7232, "grad_norm": 1.2070256506474353, "learning_rate": 2.163213642693059e-06, "loss": 0.2218, "step": 2260 }, { "epoch": 0.7264, "grad_norm": 1.2704909154384314, "learning_rate": 2.1173921189703523e-06, "loss": 0.3296, "step": 2270 }, { "epoch": 0.7296, "grad_norm": 1.2731910215448914, "learning_rate": 2.0719303861396435e-06, "loss": 0.2509, "step": 2280 }, { "epoch": 0.7328, "grad_norm": 1.2952080019044347, "learning_rate": 2.0268341184785674e-06, "loss": 0.2296, "step": 2290 }, { "epoch": 0.736, "grad_norm": 1.1752545729106638, "learning_rate": 1.982108944649441e-06, "loss": 0.2502, "step": 2300 }, { "epoch": 0.7392, "grad_norm": 1.3810746184648486, "learning_rate": 1.937760446996741e-06, "loss": 0.2234, "step": 2310 }, { "epoch": 0.7424, "grad_norm": 1.24815248782901, "learning_rate": 1.8937941608503484e-06, "loss": 0.3464, "step": 2320 }, { "epoch": 0.7456, "grad_norm": 1.1328500128827523, "learning_rate": 1.8502155738346488e-06, "loss": 0.2319, "step": 2330 }, { "epoch": 0.7488, "grad_norm": 1.1922344333046615, "learning_rate": 1.8070301251836108e-06, "loss": 0.2231, "step": 2340 }, { "epoch": 0.752, "grad_norm": 1.2519090435571332, "learning_rate": 1.764243205061879e-06, "loss": 0.2213, "step": 2350 }, { "epoch": 0.7552, "grad_norm": 1.2296446772094094, "learning_rate": 1.721860153892011e-06, "loss": 0.2334, "step": 2360 }, { "epoch": 0.7584, "grad_norm": 1.1405936822992957, "learning_rate": 1.6798862616879185e-06, "loss": 0.2139, "step": 2370 }, { "epoch": 0.7616, "grad_norm": 1.3260125376610297, "learning_rate": 1.6383267673945925e-06, "loss": 0.222, "step": 2380 }, { "epoch": 0.7648, "grad_norm": 1.1316011229474554, "learning_rate": 1.5971868582342047e-06, "loss": 0.2026, "step": 2390 }, { "epoch": 0.768, "grad_norm": 1.1138427441279388, "learning_rate": 1.55647166905868e-06, "loss": 0.2136, "step": 2400 }, { "epoch": 0.7712, "grad_norm": 1.3571505881357013, "learning_rate": 1.516186281708778e-06, "loss": 0.2216, "step": 2410 }, { "epoch": 0.7744, "grad_norm": 1.2375006700475994, "learning_rate": 1.4763357243798154e-06, "loss": 0.2139, "step": 2420 }, { "epoch": 0.7776, "grad_norm": 1.1212490843977576, "learning_rate": 1.4369249709940759e-06, "loss": 0.2067, "step": 2430 }, { "epoch": 0.7808, "grad_norm": 1.3356196332358288, "learning_rate": 1.3979589405799865e-06, "loss": 0.207, "step": 2440 }, { "epoch": 0.784, "grad_norm": 1.28648624602483, "learning_rate": 1.3594424966581555e-06, "loss": 0.2186, "step": 2450 }, { "epoch": 0.7872, "grad_norm": 1.1736305743675295, "learning_rate": 1.321380446634342e-06, "loss": 0.2116, "step": 2460 }, { "epoch": 0.7904, "grad_norm": 1.2598458372212618, "learning_rate": 1.2837775411994092e-06, "loss": 0.2228, "step": 2470 }, { "epoch": 0.7936, "grad_norm": 1.0979929469461585, "learning_rate": 1.246638473736378e-06, "loss": 0.2067, "step": 2480 }, { "epoch": 0.7968, "grad_norm": 1.6135875508628044, "learning_rate": 1.2099678797346282e-06, "loss": 0.3298, "step": 2490 }, { "epoch": 0.8, "grad_norm": 1.230081395796401, "learning_rate": 1.1737703362113134e-06, "loss": 0.2149, "step": 2500 }, { "epoch": 0.8032, "grad_norm": 1.191155602222997, "learning_rate": 1.1380503611400933e-06, "loss": 0.2114, "step": 2510 }, { "epoch": 0.8064, "grad_norm": 1.0941583722626504, "learning_rate": 1.1028124128872191e-06, "loss": 0.2197, "step": 2520 }, { "epoch": 0.8096, "grad_norm": 1.3743010322287874, "learning_rate": 1.068060889655066e-06, "loss": 0.2201, "step": 2530 }, { "epoch": 0.8128, "grad_norm": 1.367609495826361, "learning_rate": 1.033800128933179e-06, "loss": 0.2263, "step": 2540 }, { "epoch": 0.816, "grad_norm": 1.2841195283902214, "learning_rate": 1.0000344069568885e-06, "loss": 0.2108, "step": 2550 }, { "epoch": 0.8192, "grad_norm": 1.262506622679509, "learning_rate": 9.667679381735706e-07, "loss": 0.2099, "step": 2560 }, { "epoch": 0.8224, "grad_norm": 1.3708525231737212, "learning_rate": 9.340048747166341e-07, "loss": 0.2123, "step": 2570 }, { "epoch": 0.8256, "grad_norm": 1.0907835378818949, "learning_rate": 9.017493058872623e-07, "loss": 0.2124, "step": 2580 }, { "epoch": 0.8288, "grad_norm": 1.1324488490089442, "learning_rate": 8.700052576440166e-07, "loss": 0.2152, "step": 2590 }, { "epoch": 0.832, "grad_norm": 1.1888142582084373, "learning_rate": 8.387766921003427e-07, "loss": 0.2218, "step": 2600 }, { "epoch": 0.8352, "grad_norm": 1.230386423249033, "learning_rate": 8.080675070300303e-07, "loss": 0.4094, "step": 2610 }, { "epoch": 0.8384, "grad_norm": 1.2059240836852692, "learning_rate": 7.77881535380724e-07, "loss": 0.2182, "step": 2620 }, { "epoch": 0.8416, "grad_norm": 1.2730243315669996, "learning_rate": 7.482225447955155e-07, "loss": 0.2242, "step": 2630 }, { "epoch": 0.8448, "grad_norm": 1.3097925731697693, "learning_rate": 7.190942371426862e-07, "loss": 0.2334, "step": 2640 }, { "epoch": 0.848, "grad_norm": 1.3089263483412454, "learning_rate": 6.905002480536565e-07, "loss": 0.2129, "step": 2650 }, { "epoch": 0.8512, "grad_norm": 1.2487331361219, "learning_rate": 6.624441464692161e-07, "loss": 0.2059, "step": 2660 }, { "epoch": 0.8544, "grad_norm": 1.1900169075542317, "learning_rate": 6.349294341940593e-07, "loss": 0.2119, "step": 2670 }, { "epoch": 0.8576, "grad_norm": 1.4499451387635829, "learning_rate": 6.07959545459717e-07, "loss": 0.2881, "step": 2680 }, { "epoch": 0.8608, "grad_norm": 1.2365542029204284, "learning_rate": 5.815378464959109e-07, "loss": 0.21, "step": 2690 }, { "epoch": 0.864, "grad_norm": 1.169403617807277, "learning_rate": 5.55667635110399e-07, "loss": 0.2131, "step": 2700 }, { "epoch": 0.8672, "grad_norm": 1.1917798187061128, "learning_rate": 5.303521402773665e-07, "loss": 0.2171, "step": 2710 }, { "epoch": 0.8704, "grad_norm": 1.3227237208234792, "learning_rate": 5.055945217344004e-07, "loss": 0.2232, "step": 2720 }, { "epoch": 0.8736, "grad_norm": 1.244493433459287, "learning_rate": 4.81397869588106e-07, "loss": 0.3424, "step": 2730 }, { "epoch": 0.8768, "grad_norm": 1.2918840219931813, "learning_rate": 4.5776520392842473e-07, "loss": 0.2041, "step": 2740 }, { "epoch": 0.88, "grad_norm": 1.161291630108919, "learning_rate": 4.346994744516747e-07, "loss": 0.2062, "step": 2750 }, { "epoch": 0.8832, "grad_norm": 1.188169889946088, "learning_rate": 4.122035600923913e-07, "loss": 0.2138, "step": 2760 }, { "epoch": 0.8864, "grad_norm": 1.2759791115877892, "learning_rate": 3.902802686639967e-07, "loss": 0.2076, "step": 2770 }, { "epoch": 0.8896, "grad_norm": 1.2428248160576683, "learning_rate": 3.6893233650833916e-07, "loss": 0.2231, "step": 2780 }, { "epoch": 0.8928, "grad_norm": 1.2872717018149586, "learning_rate": 3.4816242815416014e-07, "loss": 0.2167, "step": 2790 }, { "epoch": 0.896, "grad_norm": 1.120924586598326, "learning_rate": 3.2797313598452506e-07, "loss": 0.2125, "step": 2800 }, { "epoch": 0.8992, "grad_norm": 1.235367729092687, "learning_rate": 3.0836697991325547e-07, "loss": 0.2211, "step": 2810 }, { "epoch": 0.9024, "grad_norm": 1.1425245188644768, "learning_rate": 2.893464070704055e-07, "loss": 0.2059, "step": 2820 }, { "epoch": 0.9056, "grad_norm": 1.240438287494313, "learning_rate": 2.7091379149682683e-07, "loss": 0.2038, "step": 2830 }, { "epoch": 0.9088, "grad_norm": 1.2119903261308935, "learning_rate": 2.53071433847854e-07, "loss": 0.208, "step": 2840 }, { "epoch": 0.912, "grad_norm": 1.3047012967784055, "learning_rate": 2.3582156110614985e-07, "loss": 0.2065, "step": 2850 }, { "epoch": 0.9152, "grad_norm": 1.2159745001331477, "learning_rate": 2.1916632630374579e-07, "loss": 0.2092, "step": 2860 }, { "epoch": 0.9184, "grad_norm": 1.4115746282687291, "learning_rate": 2.0310780825331056e-07, "loss": 0.3065, "step": 2870 }, { "epoch": 0.9216, "grad_norm": 1.1260909387649622, "learning_rate": 1.876480112886886e-07, "loss": 0.2149, "step": 2880 }, { "epoch": 0.9248, "grad_norm": 1.321400120727542, "learning_rate": 1.7278886501472804e-07, "loss": 0.2151, "step": 2890 }, { "epoch": 0.928, "grad_norm": 1.2811002757998282, "learning_rate": 1.5853222406643555e-07, "loss": 0.2124, "step": 2900 }, { "epoch": 0.9312, "grad_norm": 1.3188098070211447, "learning_rate": 1.4487986787749763e-07, "loss": 0.2103, "step": 2910 }, { "epoch": 0.9344, "grad_norm": 1.1879475265636688, "learning_rate": 1.318335004581761e-07, "loss": 0.2114, "step": 2920 }, { "epoch": 0.9376, "grad_norm": 1.119348294167629, "learning_rate": 1.1939475018262481e-07, "loss": 0.2144, "step": 2930 }, { "epoch": 0.9408, "grad_norm": 1.160073841048408, "learning_rate": 1.0756516958564667e-07, "loss": 0.2119, "step": 2940 }, { "epoch": 0.944, "grad_norm": 1.3050171867050437, "learning_rate": 9.634623516891372e-08, "loss": 0.2077, "step": 2950 }, { "epoch": 0.9472, "grad_norm": 1.102635547442788, "learning_rate": 8.573934721667731e-08, "loss": 0.2107, "step": 2960 }, { "epoch": 0.9504, "grad_norm": 1.1976820144956357, "learning_rate": 7.574582962099508e-08, "loss": 0.3234, "step": 2970 }, { "epoch": 0.9536, "grad_norm": 1.0289812852531994, "learning_rate": 6.636692971648873e-08, "loss": 0.2946, "step": 2980 }, { "epoch": 0.9568, "grad_norm": 1.1192107741554636, "learning_rate": 5.7603818124657984e-08, "loss": 0.2146, "step": 2990 }, { "epoch": 0.96, "grad_norm": 1.319177806968747, "learning_rate": 4.9457588607772497e-08, "loss": 0.2113, "step": 3000 }, { "epoch": 0.9632, "grad_norm": 1.0857028410476908, "learning_rate": 4.192925793235159e-08, "loss": 0.2097, "step": 3010 }, { "epoch": 0.9664, "grad_norm": 1.0588162046549727, "learning_rate": 3.501976574226018e-08, "loss": 0.2138, "step": 3020 }, { "epoch": 0.9696, "grad_norm": 1.3776741815700904, "learning_rate": 2.8729974441426557e-08, "loss": 0.1994, "step": 3030 }, { "epoch": 0.9728, "grad_norm": 1.403580554324849, "learning_rate": 2.3060669086199526e-08, "loss": 0.2166, "step": 3040 }, { "epoch": 0.976, "grad_norm": 1.2568358655842473, "learning_rate": 1.8012557287367394e-08, "loss": 0.2843, "step": 3050 }, { "epoch": 0.9792, "grad_norm": 1.3656578926442577, "learning_rate": 1.3586269121833028e-08, "loss": 0.2138, "step": 3060 }, { "epoch": 0.9824, "grad_norm": 1.178254860310462, "learning_rate": 9.782357053972902e-09, "loss": 0.2169, "step": 3070 }, { "epoch": 0.9856, "grad_norm": 1.2773015970417392, "learning_rate": 6.6012958666827886e-09, "loss": 0.2163, "step": 3080 }, { "epoch": 0.9888, "grad_norm": 1.2949804935819031, "learning_rate": 4.043482602116844e-09, "loss": 0.2152, "step": 3090 }, { "epoch": 0.992, "grad_norm": 1.129218160141717, "learning_rate": 2.1092365121305745e-09, "loss": 0.2566, "step": 3100 }, { "epoch": 0.9952, "grad_norm": 1.3984530229483965, "learning_rate": 7.987990184354921e-10, "loss": 0.3123, "step": 3110 }, { "epoch": 0.9984, "grad_norm": 1.3456687728542012, "learning_rate": 1.1233368246321708e-10, "loss": 0.2285, "step": 3120 } ], "logging_steps": 10, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 68659404668928.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }