| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 3125, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0032, | |
| "grad_norm": 3.097726066927376, | |
| "learning_rate": 2.875399361022364e-07, | |
| "loss": 0.4182, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 2.6698471287095344, | |
| "learning_rate": 6.070287539936103e-07, | |
| "loss": 0.3969, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0096, | |
| "grad_norm": 2.3652285610261403, | |
| "learning_rate": 9.265175718849841e-07, | |
| "loss": 0.3746, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 1.4765456643161126, | |
| "learning_rate": 1.2460063897763578e-06, | |
| "loss": 0.3695, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 2.191519526172484, | |
| "learning_rate": 1.565495207667732e-06, | |
| "loss": 0.3449, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 1.6285826522074962, | |
| "learning_rate": 1.8849840255591056e-06, | |
| "loss": 0.3181, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0224, | |
| "grad_norm": 1.8683549636521075, | |
| "learning_rate": 2.2044728434504793e-06, | |
| "loss": 0.322, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 1.5301019294828755, | |
| "learning_rate": 2.5239616613418532e-06, | |
| "loss": 0.3009, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0288, | |
| "grad_norm": 1.538896137107791, | |
| "learning_rate": 2.8434504792332267e-06, | |
| "loss": 0.293, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 1.6984302868653567, | |
| "learning_rate": 3.162939297124601e-06, | |
| "loss": 0.2999, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0352, | |
| "grad_norm": 1.7119603328381132, | |
| "learning_rate": 3.482428115015975e-06, | |
| "loss": 0.2945, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 1.6069777286559, | |
| "learning_rate": 3.8019169329073485e-06, | |
| "loss": 0.2895, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0416, | |
| "grad_norm": 1.7566779999753588, | |
| "learning_rate": 4.121405750798722e-06, | |
| "loss": 0.2923, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 1.7352248180581427, | |
| "learning_rate": 4.440894568690096e-06, | |
| "loss": 0.3638, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 1.8483046207545708, | |
| "learning_rate": 4.76038338658147e-06, | |
| "loss": 0.2861, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 1.6829534856608535, | |
| "learning_rate": 5.079872204472844e-06, | |
| "loss": 0.5296, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0544, | |
| "grad_norm": 1.7746050883459261, | |
| "learning_rate": 5.399361022364218e-06, | |
| "loss": 0.2934, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 1.7179977616371749, | |
| "learning_rate": 5.718849840255591e-06, | |
| "loss": 0.2713, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0608, | |
| "grad_norm": 1.5706330739618684, | |
| "learning_rate": 6.038338658146965e-06, | |
| "loss": 0.2779, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 1.8232979139526881, | |
| "learning_rate": 6.35782747603834e-06, | |
| "loss": 0.2835, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0672, | |
| "grad_norm": 1.755606637900451, | |
| "learning_rate": 6.677316293929713e-06, | |
| "loss": 0.3767, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 1.7142425655315976, | |
| "learning_rate": 6.996805111821087e-06, | |
| "loss": 0.2864, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.0736, | |
| "grad_norm": 1.6051059574805915, | |
| "learning_rate": 7.316293929712461e-06, | |
| "loss": 0.2767, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 1.49334442261346, | |
| "learning_rate": 7.635782747603835e-06, | |
| "loss": 0.2689, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.6270294243342234, | |
| "learning_rate": 7.955271565495208e-06, | |
| "loss": 0.2857, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 1.4820640434767234, | |
| "learning_rate": 8.274760383386582e-06, | |
| "loss": 0.2838, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.0864, | |
| "grad_norm": 1.4755509754091296, | |
| "learning_rate": 8.594249201277956e-06, | |
| "loss": 0.2829, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 1.5750758252112724, | |
| "learning_rate": 8.91373801916933e-06, | |
| "loss": 0.2716, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.0928, | |
| "grad_norm": 1.8211062452729152, | |
| "learning_rate": 9.233226837060704e-06, | |
| "loss": 0.2985, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 1.470802373475764, | |
| "learning_rate": 9.552715654952077e-06, | |
| "loss": 0.2722, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0992, | |
| "grad_norm": 1.5951743640227587, | |
| "learning_rate": 9.87220447284345e-06, | |
| "loss": 0.2834, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 1.5085331338575516, | |
| "learning_rate": 9.999887666317538e-06, | |
| "loss": 0.2866, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1056, | |
| "grad_norm": 1.5890905665931712, | |
| "learning_rate": 9.999201200981566e-06, | |
| "loss": 0.2774, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 1.4824327210922925, | |
| "learning_rate": 9.997890763487869e-06, | |
| "loss": 0.2865, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 1.4179211935288807, | |
| "learning_rate": 9.995956517397884e-06, | |
| "loss": 0.3445, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 1.6311477640409462, | |
| "learning_rate": 9.993398704133318e-06, | |
| "loss": 0.2961, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1184, | |
| "grad_norm": 1.3493949424853904, | |
| "learning_rate": 9.990217642946028e-06, | |
| "loss": 0.2693, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 1.5074472178009761, | |
| "learning_rate": 9.986413730878168e-06, | |
| "loss": 0.2908, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.1248, | |
| "grad_norm": 1.5552421320809011, | |
| "learning_rate": 9.981987442712634e-06, | |
| "loss": 0.3047, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 1.5085252753416616, | |
| "learning_rate": 9.976939330913801e-06, | |
| "loss": 0.2761, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1312, | |
| "grad_norm": 1.3894985533924997, | |
| "learning_rate": 9.971270025558576e-06, | |
| "loss": 0.2827, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 1.376517568828387, | |
| "learning_rate": 9.96498023425774e-06, | |
| "loss": 0.282, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.1376, | |
| "grad_norm": 1.293372373569424, | |
| "learning_rate": 9.958070742067649e-06, | |
| "loss": 0.2864, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 1.3933504307268871, | |
| "learning_rate": 9.95054241139223e-06, | |
| "loss": 0.2886, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 1.5512528120422318, | |
| "learning_rate": 9.942396181875342e-06, | |
| "loss": 0.2792, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 1.3955106828693524, | |
| "learning_rate": 9.933633070283512e-06, | |
| "loss": 0.2823, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.1504, | |
| "grad_norm": 1.6468424852669425, | |
| "learning_rate": 9.924254170379007e-06, | |
| "loss": 0.2984, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 1.5159031541701018, | |
| "learning_rate": 9.914260652783323e-06, | |
| "loss": 0.271, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.1568, | |
| "grad_norm": 1.5805212725674178, | |
| "learning_rate": 9.903653764831088e-06, | |
| "loss": 0.2957, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.1956547216731315, | |
| "learning_rate": 9.892434830414354e-06, | |
| "loss": 0.2684, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1632, | |
| "grad_norm": 1.4268809698190317, | |
| "learning_rate": 9.880605249817377e-06, | |
| "loss": 0.3657, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 1.4179964964215248, | |
| "learning_rate": 9.868166499541824e-06, | |
| "loss": 0.2751, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.1696, | |
| "grad_norm": 1.2892232060188915, | |
| "learning_rate": 9.855120132122503e-06, | |
| "loss": 0.2729, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 1.261114838277135, | |
| "learning_rate": 9.841467775933566e-06, | |
| "loss": 0.268, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 1.430271043321, | |
| "learning_rate": 9.827211134985273e-06, | |
| "loss": 0.4323, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 1.2954706652401755, | |
| "learning_rate": 9.812351988711312e-06, | |
| "loss": 0.2622, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1824, | |
| "grad_norm": 1.7636694848560834, | |
| "learning_rate": 9.79689219174669e-06, | |
| "loss": 0.3947, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 1.5362869932266252, | |
| "learning_rate": 9.780833673696255e-06, | |
| "loss": 0.261, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.1888, | |
| "grad_norm": 1.2516741612873126, | |
| "learning_rate": 9.76417843889385e-06, | |
| "loss": 0.2632, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 1.330313742879521, | |
| "learning_rate": 9.746928566152148e-06, | |
| "loss": 0.2612, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.1952, | |
| "grad_norm": 1.369062918644532, | |
| "learning_rate": 9.729086208503174e-06, | |
| "loss": 0.2672, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 1.4183586038757179, | |
| "learning_rate": 9.710653592929595e-06, | |
| "loss": 0.2738, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2016, | |
| "grad_norm": 1.52682494954233, | |
| "learning_rate": 9.691633020086745e-06, | |
| "loss": 0.3808, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 1.4376998703858688, | |
| "learning_rate": 9.672026864015476e-06, | |
| "loss": 0.2643, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 1.583565617875829, | |
| "learning_rate": 9.651837571845842e-06, | |
| "loss": 0.2772, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 1.4806395536482786, | |
| "learning_rate": 9.631067663491663e-06, | |
| "loss": 0.3377, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2144, | |
| "grad_norm": 1.2496121580498412, | |
| "learning_rate": 9.609719731336005e-06, | |
| "loss": 0.2752, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 1.3439013535339106, | |
| "learning_rate": 9.587796439907609e-06, | |
| "loss": 0.2685, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.2208, | |
| "grad_norm": 1.4305791278136326, | |
| "learning_rate": 9.565300525548327e-06, | |
| "loss": 0.2754, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 1.4621614330412347, | |
| "learning_rate": 9.542234796071577e-06, | |
| "loss": 0.2627, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2272, | |
| "grad_norm": 1.3498296779237806, | |
| "learning_rate": 9.518602130411894e-06, | |
| "loss": 0.2861, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 1.3530290691335103, | |
| "learning_rate": 9.4944054782656e-06, | |
| "loss": 0.2668, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.2336, | |
| "grad_norm": 1.5377603964582789, | |
| "learning_rate": 9.469647859722634e-06, | |
| "loss": 0.2679, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 1.225090999497597, | |
| "learning_rate": 9.444332364889603e-06, | |
| "loss": 0.2491, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.2724375938152441, | |
| "learning_rate": 9.41846215350409e-06, | |
| "loss": 0.2756, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 1.3649187680293995, | |
| "learning_rate": 9.392040454540284e-06, | |
| "loss": 0.2685, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.2464, | |
| "grad_norm": 1.506689299970985, | |
| "learning_rate": 9.365070565805941e-06, | |
| "loss": 0.2727, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 1.5365021125318565, | |
| "learning_rate": 9.337555853530785e-06, | |
| "loss": 0.2695, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.2528, | |
| "grad_norm": 1.4793118284689457, | |
| "learning_rate": 9.309499751946345e-06, | |
| "loss": 0.2712, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 1.3520775224079904, | |
| "learning_rate": 9.280905762857315e-06, | |
| "loss": 0.2658, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2592, | |
| "grad_norm": 1.0943480269165575, | |
| "learning_rate": 9.251777455204485e-06, | |
| "loss": 0.2699, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 1.4272492824528293, | |
| "learning_rate": 9.222118464619278e-06, | |
| "loss": 0.2552, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.2656, | |
| "grad_norm": 2.210970619104712, | |
| "learning_rate": 9.191932492969972e-06, | |
| "loss": 0.2609, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 1.3860115638657557, | |
| "learning_rate": 9.161223307899659e-06, | |
| "loss": 0.2614, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 1.3097054200264115, | |
| "learning_rate": 9.129994742355985e-06, | |
| "loss": 0.2558, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 1.2863046042758517, | |
| "learning_rate": 9.09825069411274e-06, | |
| "loss": 0.3506, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.2784, | |
| "grad_norm": 1.4154177255509635, | |
| "learning_rate": 9.065995125283367e-06, | |
| "loss": 0.3483, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 1.2614915111427545, | |
| "learning_rate": 9.033232061826428e-06, | |
| "loss": 0.3094, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.2848, | |
| "grad_norm": 1.392215585602565, | |
| "learning_rate": 8.999965593043113e-06, | |
| "loss": 0.2552, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 1.2787168993445066, | |
| "learning_rate": 8.96619987106682e-06, | |
| "loss": 0.2635, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2912, | |
| "grad_norm": 1.1874290348203185, | |
| "learning_rate": 8.931939110344935e-06, | |
| "loss": 0.3925, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 1.2892860612183799, | |
| "learning_rate": 8.897187587112783e-06, | |
| "loss": 0.472, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.2976, | |
| "grad_norm": 1.1180916115878632, | |
| "learning_rate": 8.861949638859908e-06, | |
| "loss": 0.4072, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 1.311746880897253, | |
| "learning_rate": 8.826229663788688e-06, | |
| "loss": 0.2632, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 1.2164327029930522, | |
| "learning_rate": 8.790032120265373e-06, | |
| "loss": 0.2532, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 1.436604570629404, | |
| "learning_rate": 8.753361526263622e-06, | |
| "loss": 0.2591, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3104, | |
| "grad_norm": 1.2626583454725855, | |
| "learning_rate": 8.716222458800591e-06, | |
| "loss": 0.2617, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 1.5056105724631803, | |
| "learning_rate": 8.67861955336566e-06, | |
| "loss": 0.2622, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.3168, | |
| "grad_norm": 1.4940608590847797, | |
| "learning_rate": 8.640557503341843e-06, | |
| "loss": 0.2689, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.4135983918557036, | |
| "learning_rate": 8.602041059420017e-06, | |
| "loss": 0.2599, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3232, | |
| "grad_norm": 1.2654674588600259, | |
| "learning_rate": 8.563075029005924e-06, | |
| "loss": 0.2598, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 1.5394425180175606, | |
| "learning_rate": 8.523664275620185e-06, | |
| "loss": 0.284, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3296, | |
| "grad_norm": 1.2453557042164298, | |
| "learning_rate": 8.483813718291223e-06, | |
| "loss": 0.2475, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 1.3932335036293408, | |
| "learning_rate": 8.443528330941322e-06, | |
| "loss": 0.2733, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 1.3326668118526488, | |
| "learning_rate": 8.402813141765796e-06, | |
| "loss": 0.2706, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 1.4498463244679263, | |
| "learning_rate": 8.361673232605408e-06, | |
| "loss": 0.2599, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.3424, | |
| "grad_norm": 1.5505084458466836, | |
| "learning_rate": 8.320113738312081e-06, | |
| "loss": 0.2582, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 1.3933287663129275, | |
| "learning_rate": 8.27813984610799e-06, | |
| "loss": 0.251, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.3488, | |
| "grad_norm": 1.3411841017130814, | |
| "learning_rate": 8.235756794938123e-06, | |
| "loss": 0.2545, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 1.2762415468326629, | |
| "learning_rate": 8.19296987481639e-06, | |
| "loss": 0.2511, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3552, | |
| "grad_norm": 1.3756624147468053, | |
| "learning_rate": 8.149784426165351e-06, | |
| "loss": 0.2471, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 1.2367975395806792, | |
| "learning_rate": 8.106205839149653e-06, | |
| "loss": 0.2398, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.3616, | |
| "grad_norm": 1.2795711630605267, | |
| "learning_rate": 8.06223955300326e-06, | |
| "loss": 0.25, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 1.181366173079793, | |
| "learning_rate": 8.017891055350563e-06, | |
| "loss": 0.3787, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 1.235985789921985, | |
| "learning_rate": 7.973165881521435e-06, | |
| "loss": 0.2489, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 1.3716107886978977, | |
| "learning_rate": 7.928069613860357e-06, | |
| "loss": 0.2638, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3744, | |
| "grad_norm": 1.3314537160679019, | |
| "learning_rate": 7.882607881029652e-06, | |
| "loss": 0.2666, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 1.47532226068613, | |
| "learning_rate": 7.836786357306943e-06, | |
| "loss": 0.2469, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.3808, | |
| "grad_norm": 1.2910123474743094, | |
| "learning_rate": 7.790610761876936e-06, | |
| "loss": 0.245, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 1.219675861544086, | |
| "learning_rate": 7.744086858117565e-06, | |
| "loss": 0.2532, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3872, | |
| "grad_norm": 1.2663481766467186, | |
| "learning_rate": 7.69722045288066e-06, | |
| "loss": 0.2648, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.3904, | |
| "grad_norm": 1.2601430683107997, | |
| "learning_rate": 7.650017395767149e-06, | |
| "loss": 0.2428, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3936, | |
| "grad_norm": 1.306765832939207, | |
| "learning_rate": 7.602483578396955e-06, | |
| "loss": 0.244, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.3968, | |
| "grad_norm": 1.3723024837409241, | |
| "learning_rate": 7.554624933673638e-06, | |
| "loss": 0.3634, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.5507546450379777, | |
| "learning_rate": 7.5064474350438755e-06, | |
| "loss": 0.2618, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4032, | |
| "grad_norm": 1.366012809594849, | |
| "learning_rate": 7.457957095751896e-06, | |
| "loss": 0.2487, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.4064, | |
| "grad_norm": 1.1873848911132183, | |
| "learning_rate": 7.4091599680889425e-06, | |
| "loss": 0.2553, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 1.4014484490952397, | |
| "learning_rate": 7.3600621426378515e-06, | |
| "loss": 0.2402, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.4128, | |
| "grad_norm": 1.1617991472531028, | |
| "learning_rate": 7.3106697475128655e-06, | |
| "loss": 0.2569, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 1.3751891734766228, | |
| "learning_rate": 7.260988947594759e-06, | |
| "loss": 0.2488, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4192, | |
| "grad_norm": 1.237845283871077, | |
| "learning_rate": 7.211025943761367e-06, | |
| "loss": 0.2454, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4224, | |
| "grad_norm": 1.1628031806651409, | |
| "learning_rate": 7.160786972113627e-06, | |
| "loss": 0.2554, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.4256, | |
| "grad_norm": 1.3404586293984995, | |
| "learning_rate": 7.1102783031972326e-06, | |
| "loss": 0.2551, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.4288, | |
| "grad_norm": 1.4670462801897188, | |
| "learning_rate": 7.059506241219964e-06, | |
| "loss": 0.2502, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 1.0302147200635003, | |
| "learning_rate": 7.008477123264849e-06, | |
| "loss": 0.2295, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4352, | |
| "grad_norm": 1.2824619758646878, | |
| "learning_rate": 6.957197318499187e-06, | |
| "loss": 0.2539, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.4384, | |
| "grad_norm": 1.3608888568790518, | |
| "learning_rate": 6.905673227379606e-06, | |
| "loss": 0.342, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.4416, | |
| "grad_norm": 1.2538554464958942, | |
| "learning_rate": 6.853911280853168e-06, | |
| "loss": 0.2405, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.4448, | |
| "grad_norm": 1.377080653341555, | |
| "learning_rate": 6.801917939554721e-06, | |
| "loss": 0.246, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 1.295752065257539, | |
| "learning_rate": 6.749699693000495e-06, | |
| "loss": 0.238, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4512, | |
| "grad_norm": 1.1623556510041717, | |
| "learning_rate": 6.6972630587781385e-06, | |
| "loss": 0.3522, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.4544, | |
| "grad_norm": 1.0514412139989395, | |
| "learning_rate": 6.6446145817332105e-06, | |
| "loss": 0.2448, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4576, | |
| "grad_norm": 1.3006684707343785, | |
| "learning_rate": 6.591760833152306e-06, | |
| "loss": 0.2388, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 1.3688639995228327, | |
| "learning_rate": 6.538708409942854e-06, | |
| "loss": 0.2484, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 1.182581571604436, | |
| "learning_rate": 6.48546393380973e-06, | |
| "loss": 0.2411, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4672, | |
| "grad_norm": 1.2229123644507025, | |
| "learning_rate": 6.4320340504287825e-06, | |
| "loss": 0.2913, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.4704, | |
| "grad_norm": 1.114177546269806, | |
| "learning_rate": 6.378425428617343e-06, | |
| "loss": 0.5038, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.4736, | |
| "grad_norm": 1.400649143081864, | |
| "learning_rate": 6.324644759501869e-06, | |
| "loss": 0.2514, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.4768, | |
| "grad_norm": 1.2414663469351401, | |
| "learning_rate": 6.270698755682792e-06, | |
| "loss": 0.2434, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.3094558123042377, | |
| "learning_rate": 6.2165941503966995e-06, | |
| "loss": 0.2486, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4832, | |
| "grad_norm": 1.1308739268431762, | |
| "learning_rate": 6.162337696675909e-06, | |
| "loss": 0.224, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.4864, | |
| "grad_norm": 1.2921131535571986, | |
| "learning_rate": 6.107936166505615e-06, | |
| "loss": 0.25, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.4896, | |
| "grad_norm": 1.277856644085611, | |
| "learning_rate": 6.053396349978632e-06, | |
| "loss": 0.2418, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.4928, | |
| "grad_norm": 1.207819320049489, | |
| "learning_rate": 5.998725054447904e-06, | |
| "loss": 0.2429, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 1.3964622577997952, | |
| "learning_rate": 5.943929103676839e-06, | |
| "loss": 0.2339, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4992, | |
| "grad_norm": 1.0444429321557864, | |
| "learning_rate": 5.889015336987614e-06, | |
| "loss": 0.2296, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5024, | |
| "grad_norm": 1.000405925226846, | |
| "learning_rate": 5.833990608407525e-06, | |
| "loss": 0.2227, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.5056, | |
| "grad_norm": 1.243996070580087, | |
| "learning_rate": 5.778861785813508e-06, | |
| "loss": 0.228, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5088, | |
| "grad_norm": 1.4163588613795817, | |
| "learning_rate": 5.723635750074924e-06, | |
| "loss": 0.262, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 1.2497990764938063, | |
| "learning_rate": 5.6683193941947365e-06, | |
| "loss": 0.2354, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5152, | |
| "grad_norm": 1.0918427100928676, | |
| "learning_rate": 5.61291962244916e-06, | |
| "loss": 0.2343, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.5184, | |
| "grad_norm": 1.2313539946902519, | |
| "learning_rate": 5.5574433495259015e-06, | |
| "loss": 0.2224, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5216, | |
| "grad_norm": 1.3110693967381668, | |
| "learning_rate": 5.501897499661123e-06, | |
| "loss": 0.2353, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.5248, | |
| "grad_norm": 1.3211565707814268, | |
| "learning_rate": 5.446289005775185e-06, | |
| "loss": 0.2448, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 1.336496780231579, | |
| "learning_rate": 5.390624808607321e-06, | |
| "loss": 0.2353, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5312, | |
| "grad_norm": 1.4349961842702135, | |
| "learning_rate": 5.334911855849334e-06, | |
| "loss": 0.2344, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5344, | |
| "grad_norm": 1.300276553470609, | |
| "learning_rate": 5.279157101278433e-06, | |
| "loss": 0.2488, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.5376, | |
| "grad_norm": 1.3938747275358123, | |
| "learning_rate": 5.2233675038892815e-06, | |
| "loss": 0.235, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5408, | |
| "grad_norm": 1.0695710070831115, | |
| "learning_rate": 5.1675500270254385e-06, | |
| "loss": 0.2358, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 1.3464266574587438, | |
| "learning_rate": 5.111711637510216e-06, | |
| "loss": 0.262, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5472, | |
| "grad_norm": 1.2054969480183788, | |
| "learning_rate": 5.055859304777127e-06, | |
| "loss": 0.2346, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.5504, | |
| "grad_norm": 1.592945506341502, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2491, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5536, | |
| "grad_norm": 1.0841849724930561, | |
| "learning_rate": 4.944140695222874e-06, | |
| "loss": 0.2373, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.5568, | |
| "grad_norm": 1.1462434966725683, | |
| "learning_rate": 4.888288362489786e-06, | |
| "loss": 0.2245, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.2006687090364805, | |
| "learning_rate": 4.832449972974564e-06, | |
| "loss": 0.2281, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 1.31046822802538, | |
| "learning_rate": 4.776632496110721e-06, | |
| "loss": 0.2343, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5664, | |
| "grad_norm": 1.3130856213140962, | |
| "learning_rate": 4.720842898721569e-06, | |
| "loss": 0.3716, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5696, | |
| "grad_norm": 1.3909345006491758, | |
| "learning_rate": 4.665088144150666e-06, | |
| "loss": 0.2344, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5728, | |
| "grad_norm": 1.103425988971943, | |
| "learning_rate": 4.60937519139268e-06, | |
| "loss": 0.2207, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 1.3110124719599046, | |
| "learning_rate": 4.553710994224816e-06, | |
| "loss": 0.2584, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5792, | |
| "grad_norm": 1.1358436006113255, | |
| "learning_rate": 4.498102500338879e-06, | |
| "loss": 0.2408, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5824, | |
| "grad_norm": 1.1712875527581932, | |
| "learning_rate": 4.442556650474099e-06, | |
| "loss": 0.2338, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5856, | |
| "grad_norm": 1.4200454612891542, | |
| "learning_rate": 4.387080377550843e-06, | |
| "loss": 0.3351, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 1.1792392310210245, | |
| "learning_rate": 4.331680605805264e-06, | |
| "loss": 0.2284, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 1.4269025040790715, | |
| "learning_rate": 4.2763642499250765e-06, | |
| "loss": 0.2831, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 1.2178218832805392, | |
| "learning_rate": 4.221138214186493e-06, | |
| "loss": 0.4656, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5984, | |
| "grad_norm": 1.381880289226324, | |
| "learning_rate": 4.166009391592476e-06, | |
| "loss": 0.2226, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 1.2861868856506182, | |
| "learning_rate": 4.110984663012388e-06, | |
| "loss": 0.2436, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6048, | |
| "grad_norm": 1.460603893087498, | |
| "learning_rate": 4.056070896323163e-06, | |
| "loss": 0.2348, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 1.3411148998390934, | |
| "learning_rate": 4.001274945552098e-06, | |
| "loss": 0.2294, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6112, | |
| "grad_norm": 1.2503467379311224, | |
| "learning_rate": 3.94660365002137e-06, | |
| "loss": 0.2179, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 0.8844502114106245, | |
| "learning_rate": 3.892063833494387e-06, | |
| "loss": 0.2293, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6176, | |
| "grad_norm": 1.323943188387191, | |
| "learning_rate": 3.837662303324093e-06, | |
| "loss": 0.2395, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 1.3150182927928105, | |
| "learning_rate": 3.783405849603302e-06, | |
| "loss": 0.2265, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 1.5251447223837293, | |
| "learning_rate": 3.729301244317208e-06, | |
| "loss": 0.2445, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 1.2518632060745116, | |
| "learning_rate": 3.675355240498133e-06, | |
| "loss": 0.2229, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6304, | |
| "grad_norm": 1.4410331274554835, | |
| "learning_rate": 3.6215745713826585e-06, | |
| "loss": 0.2269, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 1.2681938269526776, | |
| "learning_rate": 3.567965949571219e-06, | |
| "loss": 0.2216, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.6368, | |
| "grad_norm": 1.2119772204720247, | |
| "learning_rate": 3.5145360661902717e-06, | |
| "loss": 0.2029, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.2769167413279376, | |
| "learning_rate": 3.4612915900571493e-06, | |
| "loss": 0.2234, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6432, | |
| "grad_norm": 1.1935986503511073, | |
| "learning_rate": 3.408239166847696e-06, | |
| "loss": 0.2313, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 1.0781337815097756, | |
| "learning_rate": 3.355385418266792e-06, | |
| "loss": 0.2292, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6496, | |
| "grad_norm": 1.2668633413118764, | |
| "learning_rate": 3.3027369412218623e-06, | |
| "loss": 0.2231, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 1.2243691251251998, | |
| "learning_rate": 3.2503003069995057e-06, | |
| "loss": 0.2127, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 1.1678450454685523, | |
| "learning_rate": 3.198082060445281e-06, | |
| "loss": 0.2286, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 1.3654151111307413, | |
| "learning_rate": 3.1460887191468324e-06, | |
| "loss": 0.2262, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6624, | |
| "grad_norm": 1.1453122239206175, | |
| "learning_rate": 3.0943267726203965e-06, | |
| "loss": 0.2251, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 1.4758563467786459, | |
| "learning_rate": 3.042802681500814e-06, | |
| "loss": 0.2293, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.6688, | |
| "grad_norm": 1.1793224562603977, | |
| "learning_rate": 2.991522876735154e-06, | |
| "loss": 0.2205, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 1.2381651466593462, | |
| "learning_rate": 2.9404937587800374e-06, | |
| "loss": 0.2189, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6752, | |
| "grad_norm": 19.66766001236314, | |
| "learning_rate": 2.889721696802768e-06, | |
| "loss": 0.2491, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 1.0721784074518568, | |
| "learning_rate": 2.839213027886373e-06, | |
| "loss": 0.2266, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6816, | |
| "grad_norm": 1.0952730571552138, | |
| "learning_rate": 2.7889740562386357e-06, | |
| "loss": 0.2156, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 1.0572399244670028, | |
| "learning_rate": 2.7390110524052415e-06, | |
| "loss": 0.2355, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 1.0010243056654815, | |
| "learning_rate": 2.6893302524871357e-06, | |
| "loss": 0.2339, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 1.1936155230445462, | |
| "learning_rate": 2.6399378573621493e-06, | |
| "loss": 0.2057, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6944, | |
| "grad_norm": 1.4177935849928514, | |
| "learning_rate": 2.5908400319110588e-06, | |
| "loss": 0.2845, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 1.3791416946344361, | |
| "learning_rate": 2.5420429042481054e-06, | |
| "loss": 0.287, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7008, | |
| "grad_norm": 1.4748810133302899, | |
| "learning_rate": 2.493552564956126e-06, | |
| "loss": 0.2317, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 1.1836491485069627, | |
| "learning_rate": 2.445375066326362e-06, | |
| "loss": 0.2172, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7072, | |
| "grad_norm": 1.2813045848755424, | |
| "learning_rate": 2.3975164216030456e-06, | |
| "loss": 0.2232, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 1.3145112714821583, | |
| "learning_rate": 2.349982604232851e-06, | |
| "loss": 0.2343, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7136, | |
| "grad_norm": 1.1808875801974772, | |
| "learning_rate": 2.3027795471193404e-06, | |
| "loss": 0.2465, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 1.0981462474429815, | |
| "learning_rate": 2.255913141882436e-06, | |
| "loss": 0.3201, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.230734760390504, | |
| "learning_rate": 2.209389238123066e-06, | |
| "loss": 0.2201, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 1.2070256506474353, | |
| "learning_rate": 2.163213642693059e-06, | |
| "loss": 0.2218, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.7264, | |
| "grad_norm": 1.2704909154384314, | |
| "learning_rate": 2.1173921189703523e-06, | |
| "loss": 0.3296, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 1.2731910215448914, | |
| "learning_rate": 2.0719303861396435e-06, | |
| "loss": 0.2509, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7328, | |
| "grad_norm": 1.2952080019044347, | |
| "learning_rate": 2.0268341184785674e-06, | |
| "loss": 0.2296, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 1.1752545729106638, | |
| "learning_rate": 1.982108944649441e-06, | |
| "loss": 0.2502, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7392, | |
| "grad_norm": 1.3810746184648486, | |
| "learning_rate": 1.937760446996741e-06, | |
| "loss": 0.2234, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 1.24815248782901, | |
| "learning_rate": 1.8937941608503484e-06, | |
| "loss": 0.3464, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7456, | |
| "grad_norm": 1.1328500128827523, | |
| "learning_rate": 1.8502155738346488e-06, | |
| "loss": 0.2319, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 1.1922344333046615, | |
| "learning_rate": 1.8070301251836108e-06, | |
| "loss": 0.2231, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 1.2519090435571332, | |
| "learning_rate": 1.764243205061879e-06, | |
| "loss": 0.2213, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 1.2296446772094094, | |
| "learning_rate": 1.721860153892011e-06, | |
| "loss": 0.2334, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.7584, | |
| "grad_norm": 1.1405936822992957, | |
| "learning_rate": 1.6798862616879185e-06, | |
| "loss": 0.2139, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 1.3260125376610297, | |
| "learning_rate": 1.6383267673945925e-06, | |
| "loss": 0.222, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.7648, | |
| "grad_norm": 1.1316011229474554, | |
| "learning_rate": 1.5971868582342047e-06, | |
| "loss": 0.2026, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 1.1138427441279388, | |
| "learning_rate": 1.55647166905868e-06, | |
| "loss": 0.2136, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7712, | |
| "grad_norm": 1.3571505881357013, | |
| "learning_rate": 1.516186281708778e-06, | |
| "loss": 0.2216, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 1.2375006700475994, | |
| "learning_rate": 1.4763357243798154e-06, | |
| "loss": 0.2139, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.7776, | |
| "grad_norm": 1.1212490843977576, | |
| "learning_rate": 1.4369249709940759e-06, | |
| "loss": 0.2067, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 1.3356196332358288, | |
| "learning_rate": 1.3979589405799865e-06, | |
| "loss": 0.207, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 1.28648624602483, | |
| "learning_rate": 1.3594424966581555e-06, | |
| "loss": 0.2186, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 1.1736305743675295, | |
| "learning_rate": 1.321380446634342e-06, | |
| "loss": 0.2116, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7904, | |
| "grad_norm": 1.2598458372212618, | |
| "learning_rate": 1.2837775411994092e-06, | |
| "loss": 0.2228, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 1.0979929469461585, | |
| "learning_rate": 1.246638473736378e-06, | |
| "loss": 0.2067, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7968, | |
| "grad_norm": 1.6135875508628044, | |
| "learning_rate": 1.2099678797346282e-06, | |
| "loss": 0.3298, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.230081395796401, | |
| "learning_rate": 1.1737703362113134e-06, | |
| "loss": 0.2149, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8032, | |
| "grad_norm": 1.191155602222997, | |
| "learning_rate": 1.1380503611400933e-06, | |
| "loss": 0.2114, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 1.0941583722626504, | |
| "learning_rate": 1.1028124128872191e-06, | |
| "loss": 0.2197, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8096, | |
| "grad_norm": 1.3743010322287874, | |
| "learning_rate": 1.068060889655066e-06, | |
| "loss": 0.2201, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 1.367609495826361, | |
| "learning_rate": 1.033800128933179e-06, | |
| "loss": 0.2263, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 1.2841195283902214, | |
| "learning_rate": 1.0000344069568885e-06, | |
| "loss": 0.2108, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 1.262506622679509, | |
| "learning_rate": 9.667679381735706e-07, | |
| "loss": 0.2099, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8224, | |
| "grad_norm": 1.3708525231737212, | |
| "learning_rate": 9.340048747166341e-07, | |
| "loss": 0.2123, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 1.0907835378818949, | |
| "learning_rate": 9.017493058872623e-07, | |
| "loss": 0.2124, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.8288, | |
| "grad_norm": 1.1324488490089442, | |
| "learning_rate": 8.700052576440166e-07, | |
| "loss": 0.2152, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 1.1888142582084373, | |
| "learning_rate": 8.387766921003427e-07, | |
| "loss": 0.2218, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8352, | |
| "grad_norm": 1.230386423249033, | |
| "learning_rate": 8.080675070300303e-07, | |
| "loss": 0.4094, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 1.2059240836852692, | |
| "learning_rate": 7.77881535380724e-07, | |
| "loss": 0.2182, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.8416, | |
| "grad_norm": 1.2730243315669996, | |
| "learning_rate": 7.482225447955155e-07, | |
| "loss": 0.2242, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 1.3097925731697693, | |
| "learning_rate": 7.190942371426862e-07, | |
| "loss": 0.2334, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 1.3089263483412454, | |
| "learning_rate": 6.905002480536565e-07, | |
| "loss": 0.2129, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 1.2487331361219, | |
| "learning_rate": 6.624441464692161e-07, | |
| "loss": 0.2059, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8544, | |
| "grad_norm": 1.1900169075542317, | |
| "learning_rate": 6.349294341940593e-07, | |
| "loss": 0.2119, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 1.4499451387635829, | |
| "learning_rate": 6.07959545459717e-07, | |
| "loss": 0.2881, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.8608, | |
| "grad_norm": 1.2365542029204284, | |
| "learning_rate": 5.815378464959109e-07, | |
| "loss": 0.21, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 1.169403617807277, | |
| "learning_rate": 5.55667635110399e-07, | |
| "loss": 0.2131, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8672, | |
| "grad_norm": 1.1917798187061128, | |
| "learning_rate": 5.303521402773665e-07, | |
| "loss": 0.2171, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 1.3227237208234792, | |
| "learning_rate": 5.055945217344004e-07, | |
| "loss": 0.2232, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.8736, | |
| "grad_norm": 1.244493433459287, | |
| "learning_rate": 4.81397869588106e-07, | |
| "loss": 0.3424, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 1.2918840219931813, | |
| "learning_rate": 4.5776520392842473e-07, | |
| "loss": 0.2041, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.161291630108919, | |
| "learning_rate": 4.346994744516747e-07, | |
| "loss": 0.2062, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 1.188169889946088, | |
| "learning_rate": 4.122035600923913e-07, | |
| "loss": 0.2138, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.8864, | |
| "grad_norm": 1.2759791115877892, | |
| "learning_rate": 3.902802686639967e-07, | |
| "loss": 0.2076, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 1.2428248160576683, | |
| "learning_rate": 3.6893233650833916e-07, | |
| "loss": 0.2231, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.8928, | |
| "grad_norm": 1.2872717018149586, | |
| "learning_rate": 3.4816242815416014e-07, | |
| "loss": 0.2167, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 1.120924586598326, | |
| "learning_rate": 3.2797313598452506e-07, | |
| "loss": 0.2125, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8992, | |
| "grad_norm": 1.235367729092687, | |
| "learning_rate": 3.0836697991325547e-07, | |
| "loss": 0.2211, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 1.1425245188644768, | |
| "learning_rate": 2.893464070704055e-07, | |
| "loss": 0.2059, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9056, | |
| "grad_norm": 1.240438287494313, | |
| "learning_rate": 2.7091379149682683e-07, | |
| "loss": 0.2038, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 1.2119903261308935, | |
| "learning_rate": 2.53071433847854e-07, | |
| "loss": 0.208, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 1.3047012967784055, | |
| "learning_rate": 2.3582156110614985e-07, | |
| "loss": 0.2065, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 1.2159745001331477, | |
| "learning_rate": 2.1916632630374579e-07, | |
| "loss": 0.2092, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9184, | |
| "grad_norm": 1.4115746282687291, | |
| "learning_rate": 2.0310780825331056e-07, | |
| "loss": 0.3065, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 1.1260909387649622, | |
| "learning_rate": 1.876480112886886e-07, | |
| "loss": 0.2149, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9248, | |
| "grad_norm": 1.321400120727542, | |
| "learning_rate": 1.7278886501472804e-07, | |
| "loss": 0.2151, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 1.2811002757998282, | |
| "learning_rate": 1.5853222406643555e-07, | |
| "loss": 0.2124, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9312, | |
| "grad_norm": 1.3188098070211447, | |
| "learning_rate": 1.4487986787749763e-07, | |
| "loss": 0.2103, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 1.1879475265636688, | |
| "learning_rate": 1.318335004581761e-07, | |
| "loss": 0.2114, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.9376, | |
| "grad_norm": 1.119348294167629, | |
| "learning_rate": 1.1939475018262481e-07, | |
| "loss": 0.2144, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 1.160073841048408, | |
| "learning_rate": 1.0756516958564667e-07, | |
| "loss": 0.2119, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 1.3050171867050437, | |
| "learning_rate": 9.634623516891372e-08, | |
| "loss": 0.2077, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 1.102635547442788, | |
| "learning_rate": 8.573934721667731e-08, | |
| "loss": 0.2107, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9504, | |
| "grad_norm": 1.1976820144956357, | |
| "learning_rate": 7.574582962099508e-08, | |
| "loss": 0.3234, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 1.0289812852531994, | |
| "learning_rate": 6.636692971648873e-08, | |
| "loss": 0.2946, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.9568, | |
| "grad_norm": 1.1192107741554636, | |
| "learning_rate": 5.7603818124657984e-08, | |
| "loss": 0.2146, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.319177806968747, | |
| "learning_rate": 4.9457588607772497e-08, | |
| "loss": 0.2113, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9632, | |
| "grad_norm": 1.0857028410476908, | |
| "learning_rate": 4.192925793235159e-08, | |
| "loss": 0.2097, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.9664, | |
| "grad_norm": 1.0588162046549727, | |
| "learning_rate": 3.501976574226018e-08, | |
| "loss": 0.2138, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.9696, | |
| "grad_norm": 1.3776741815700904, | |
| "learning_rate": 2.8729974441426557e-08, | |
| "loss": 0.1994, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "grad_norm": 1.403580554324849, | |
| "learning_rate": 2.3060669086199526e-08, | |
| "loss": 0.2166, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 1.2568358655842473, | |
| "learning_rate": 1.8012557287367394e-08, | |
| "loss": 0.2843, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9792, | |
| "grad_norm": 1.3656578926442577, | |
| "learning_rate": 1.3586269121833028e-08, | |
| "loss": 0.2138, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.9824, | |
| "grad_norm": 1.178254860310462, | |
| "learning_rate": 9.782357053972902e-09, | |
| "loss": 0.2169, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.9856, | |
| "grad_norm": 1.2773015970417392, | |
| "learning_rate": 6.6012958666827886e-09, | |
| "loss": 0.2163, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.9888, | |
| "grad_norm": 1.2949804935819031, | |
| "learning_rate": 4.043482602116844e-09, | |
| "loss": 0.2152, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 1.129218160141717, | |
| "learning_rate": 2.1092365121305745e-09, | |
| "loss": 0.2566, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9952, | |
| "grad_norm": 1.3984530229483965, | |
| "learning_rate": 7.987990184354921e-10, | |
| "loss": 0.3123, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.9984, | |
| "grad_norm": 1.3456687728542012, | |
| "learning_rate": 1.1233368246321708e-10, | |
| "loss": 0.2285, | |
| "step": 3120 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3125, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 68659404668928.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |