{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03161089100564781, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00021073927337098542, "grad_norm": 4.289033889770508, "learning_rate": 4.998946303633145e-05, "loss": 3.8267, "step": 10 }, { "epoch": 0.00042147854674197085, "grad_norm": 4.060084342956543, "learning_rate": 4.997997976902976e-05, "loss": 3.6144, "step": 20 }, { "epoch": 0.0006322178201129563, "grad_norm": 4.173689842224121, "learning_rate": 4.996944280536121e-05, "loss": 3.4868, "step": 30 }, { "epoch": 0.0008429570934839417, "grad_norm": 5.22714376449585, "learning_rate": 4.995890584169266e-05, "loss": 3.4488, "step": 40 }, { "epoch": 0.001053696366854927, "grad_norm": 5.710427284240723, "learning_rate": 4.994836887802411e-05, "loss": 3.3072, "step": 50 }, { "epoch": 0.0012644356402259126, "grad_norm": 5.226071357727051, "learning_rate": 4.993783191435556e-05, "loss": 3.3811, "step": 60 }, { "epoch": 0.0014751749135968979, "grad_norm": 4.471126556396484, "learning_rate": 4.9927294950687014e-05, "loss": 3.3294, "step": 70 }, { "epoch": 0.0016859141869678834, "grad_norm": 3.511232376098633, "learning_rate": 4.991675798701846e-05, "loss": 3.274, "step": 80 }, { "epoch": 0.0018966534603388687, "grad_norm": 3.6000168323516846, "learning_rate": 4.990622102334991e-05, "loss": 3.2927, "step": 90 }, { "epoch": 0.002107392733709854, "grad_norm": 3.6504461765289307, "learning_rate": 4.9895684059681366e-05, "loss": 3.1517, "step": 100 }, { "epoch": 0.0023181320070808395, "grad_norm": 5.8762288093566895, "learning_rate": 4.9885147096012814e-05, "loss": 3.2165, "step": 110 }, { "epoch": 0.002528871280451825, "grad_norm": 3.862133264541626, "learning_rate": 4.987461013234427e-05, "loss": 3.2496, "step": 120 }, { "epoch": 0.0027396105538228105, "grad_norm": 3.1009256839752197, "learning_rate": 4.986407316867572e-05, "loss": 3.1269, "step": 130 }, { "epoch": 0.0029503498271937958, "grad_norm": 4.951446056365967, "learning_rate": 4.9853536205007165e-05, "loss": 3.142, "step": 140 }, { "epoch": 0.003161089100564781, "grad_norm": 4.305763244628906, "learning_rate": 4.9842999241338614e-05, "loss": 3.1518, "step": 150 }, { "epoch": 0.003371828373935767, "grad_norm": 4.36134672164917, "learning_rate": 4.983246227767007e-05, "loss": 3.1285, "step": 160 }, { "epoch": 0.003582567647306752, "grad_norm": 3.9053471088409424, "learning_rate": 4.9821925314001524e-05, "loss": 3.0738, "step": 170 }, { "epoch": 0.0037933069206777374, "grad_norm": 13.242171287536621, "learning_rate": 4.981138835033297e-05, "loss": 3.0425, "step": 180 }, { "epoch": 0.004004046194048723, "grad_norm": 3.8637614250183105, "learning_rate": 4.980085138666442e-05, "loss": 3.007, "step": 190 }, { "epoch": 0.004214785467419708, "grad_norm": 4.488512992858887, "learning_rate": 4.979031442299587e-05, "loss": 3.0578, "step": 200 }, { "epoch": 0.004425524740790694, "grad_norm": 5.628246307373047, "learning_rate": 4.9779777459327324e-05, "loss": 3.0215, "step": 210 }, { "epoch": 0.004636264014161679, "grad_norm": 3.368596315383911, "learning_rate": 4.976924049565878e-05, "loss": 3.096, "step": 220 }, { "epoch": 0.004847003287532664, "grad_norm": 7.204585075378418, "learning_rate": 4.975870353199023e-05, "loss": 2.9822, "step": 230 }, { "epoch": 0.00505774256090365, "grad_norm": 4.260004997253418, "learning_rate": 4.9748166568321675e-05, "loss": 3.0716, "step": 240 }, { "epoch": 0.005268481834274636, "grad_norm": 4.109992504119873, "learning_rate": 4.973762960465312e-05, "loss": 2.9789, "step": 250 }, { "epoch": 0.005479221107645621, "grad_norm": 10.826703071594238, "learning_rate": 4.972709264098457e-05, "loss": 2.9928, "step": 260 }, { "epoch": 0.005689960381016606, "grad_norm": 4.802614212036133, "learning_rate": 4.9716555677316027e-05, "loss": 3.0421, "step": 270 }, { "epoch": 0.0059006996543875916, "grad_norm": 4.688140392303467, "learning_rate": 4.970601871364748e-05, "loss": 2.8983, "step": 280 }, { "epoch": 0.006111438927758577, "grad_norm": 4.237213134765625, "learning_rate": 4.969548174997893e-05, "loss": 3.0775, "step": 290 }, { "epoch": 0.006322178201129562, "grad_norm": 6.8719000816345215, "learning_rate": 4.968494478631038e-05, "loss": 2.9356, "step": 300 }, { "epoch": 0.006532917474500548, "grad_norm": 6.765329360961914, "learning_rate": 4.9674407822641826e-05, "loss": 3.0783, "step": 310 }, { "epoch": 0.006743656747871534, "grad_norm": 4.296706199645996, "learning_rate": 4.9664924555340135e-05, "loss": 2.831, "step": 320 }, { "epoch": 0.006954396021242519, "grad_norm": 5.505979537963867, "learning_rate": 4.965438759167158e-05, "loss": 3.0016, "step": 330 }, { "epoch": 0.007165135294613504, "grad_norm": 15.19791316986084, "learning_rate": 4.964385062800304e-05, "loss": 2.9707, "step": 340 }, { "epoch": 0.0073758745679844894, "grad_norm": 3.5731112957000732, "learning_rate": 4.9633313664334487e-05, "loss": 3.0052, "step": 350 }, { "epoch": 0.007586613841355475, "grad_norm": 4.196691989898682, "learning_rate": 4.962277670066594e-05, "loss": 3.0018, "step": 360 }, { "epoch": 0.00779735311472646, "grad_norm": 3.7209060192108154, "learning_rate": 4.961223973699739e-05, "loss": 2.9956, "step": 370 }, { "epoch": 0.008008092388097446, "grad_norm": 4.67873477935791, "learning_rate": 4.960170277332884e-05, "loss": 3.0256, "step": 380 }, { "epoch": 0.008218831661468431, "grad_norm": 4.471567630767822, "learning_rate": 4.9591165809660286e-05, "loss": 2.9656, "step": 390 }, { "epoch": 0.008429570934839417, "grad_norm": 4.6043195724487305, "learning_rate": 4.958062884599174e-05, "loss": 2.9317, "step": 400 }, { "epoch": 0.008640310208210402, "grad_norm": 5.873717308044434, "learning_rate": 4.9570091882323196e-05, "loss": 3.0048, "step": 410 }, { "epoch": 0.008851049481581387, "grad_norm": 4.8152923583984375, "learning_rate": 4.9559554918654645e-05, "loss": 2.933, "step": 420 }, { "epoch": 0.009061788754952373, "grad_norm": 3.463914632797241, "learning_rate": 4.954901795498609e-05, "loss": 2.9188, "step": 430 }, { "epoch": 0.009272528028323358, "grad_norm": 5.305572986602783, "learning_rate": 4.953848099131754e-05, "loss": 3.1017, "step": 440 }, { "epoch": 0.009483267301694343, "grad_norm": 3.027397394180298, "learning_rate": 4.952794402764899e-05, "loss": 2.9651, "step": 450 }, { "epoch": 0.009694006575065328, "grad_norm": 3.619413375854492, "learning_rate": 4.9517407063980444e-05, "loss": 3.0447, "step": 460 }, { "epoch": 0.009904745848436314, "grad_norm": 3.6718661785125732, "learning_rate": 4.95068701003119e-05, "loss": 2.9183, "step": 470 }, { "epoch": 0.0101154851218073, "grad_norm": 3.527235984802246, "learning_rate": 4.949633313664335e-05, "loss": 2.9889, "step": 480 }, { "epoch": 0.010326224395178286, "grad_norm": 3.9344675540924072, "learning_rate": 4.9485796172974796e-05, "loss": 2.9665, "step": 490 }, { "epoch": 0.010536963668549271, "grad_norm": 6.459798336029053, "learning_rate": 4.9475259209306244e-05, "loss": 2.9079, "step": 500 }, { "epoch": 0.010747702941920257, "grad_norm": 3.535411834716797, "learning_rate": 4.94647222456377e-05, "loss": 3.0064, "step": 510 }, { "epoch": 0.010958442215291242, "grad_norm": 3.441504716873169, "learning_rate": 4.9454185281969154e-05, "loss": 2.8912, "step": 520 }, { "epoch": 0.011169181488662227, "grad_norm": 6.144808292388916, "learning_rate": 4.94436483183006e-05, "loss": 2.9239, "step": 530 }, { "epoch": 0.011379920762033213, "grad_norm": 3.7004387378692627, "learning_rate": 4.943311135463205e-05, "loss": 2.9526, "step": 540 }, { "epoch": 0.011590660035404198, "grad_norm": 6.244487762451172, "learning_rate": 4.94225743909635e-05, "loss": 2.8755, "step": 550 }, { "epoch": 0.011801399308775183, "grad_norm": 3.984456777572632, "learning_rate": 4.9412037427294954e-05, "loss": 2.835, "step": 560 }, { "epoch": 0.012012138582146168, "grad_norm": 4.6722941398620605, "learning_rate": 4.94015004636264e-05, "loss": 2.9199, "step": 570 }, { "epoch": 0.012222877855517154, "grad_norm": 4.149166584014893, "learning_rate": 4.939096349995786e-05, "loss": 3.0706, "step": 580 }, { "epoch": 0.012433617128888139, "grad_norm": 3.566357374191284, "learning_rate": 4.9380426536289305e-05, "loss": 2.9011, "step": 590 }, { "epoch": 0.012644356402259124, "grad_norm": 3.4722342491149902, "learning_rate": 4.9369889572620754e-05, "loss": 2.9345, "step": 600 }, { "epoch": 0.01285509567563011, "grad_norm": 3.4608707427978516, "learning_rate": 4.935935260895221e-05, "loss": 2.8419, "step": 610 }, { "epoch": 0.013065834949001097, "grad_norm": 3.4312050342559814, "learning_rate": 4.934881564528366e-05, "loss": 2.9087, "step": 620 }, { "epoch": 0.013276574222372082, "grad_norm": 6.524892330169678, "learning_rate": 4.9338278681615105e-05, "loss": 2.8818, "step": 630 }, { "epoch": 0.013487313495743067, "grad_norm": 5.508037567138672, "learning_rate": 4.932774171794656e-05, "loss": 2.7335, "step": 640 }, { "epoch": 0.013698052769114052, "grad_norm": 4.094667911529541, "learning_rate": 4.931720475427801e-05, "loss": 2.9197, "step": 650 }, { "epoch": 0.013908792042485038, "grad_norm": 3.6645474433898926, "learning_rate": 4.9306667790609463e-05, "loss": 2.9647, "step": 660 }, { "epoch": 0.014119531315856023, "grad_norm": 3.711179256439209, "learning_rate": 4.929613082694091e-05, "loss": 2.9863, "step": 670 }, { "epoch": 0.014330270589227008, "grad_norm": 3.904421329498291, "learning_rate": 4.928559386327236e-05, "loss": 2.8947, "step": 680 }, { "epoch": 0.014541009862597994, "grad_norm": 3.3760502338409424, "learning_rate": 4.927505689960381e-05, "loss": 2.7375, "step": 690 }, { "epoch": 0.014751749135968979, "grad_norm": 4.068516254425049, "learning_rate": 4.926451993593526e-05, "loss": 2.8502, "step": 700 }, { "epoch": 0.014962488409339964, "grad_norm": 4.1656999588012695, "learning_rate": 4.925398297226672e-05, "loss": 2.8688, "step": 710 }, { "epoch": 0.01517322768271095, "grad_norm": 3.5596961975097656, "learning_rate": 4.9243446008598166e-05, "loss": 2.9094, "step": 720 }, { "epoch": 0.015383966956081935, "grad_norm": 4.3002753257751465, "learning_rate": 4.9232909044929615e-05, "loss": 2.8964, "step": 730 }, { "epoch": 0.01559470622945292, "grad_norm": 5.381309509277344, "learning_rate": 4.922237208126106e-05, "loss": 2.8543, "step": 740 }, { "epoch": 0.015805445502823905, "grad_norm": 5.13466739654541, "learning_rate": 4.921183511759252e-05, "loss": 2.8754, "step": 750 }, { "epoch": 0.016016184776194892, "grad_norm": 4.221550464630127, "learning_rate": 4.9201298153923966e-05, "loss": 2.9192, "step": 760 }, { "epoch": 0.016226924049565876, "grad_norm": 4.988082408905029, "learning_rate": 4.919076119025542e-05, "loss": 2.8325, "step": 770 }, { "epoch": 0.016437663322936863, "grad_norm": 6.187538146972656, "learning_rate": 4.918022422658687e-05, "loss": 2.8819, "step": 780 }, { "epoch": 0.016648402596307846, "grad_norm": 3.920051336288452, "learning_rate": 4.916968726291832e-05, "loss": 2.8123, "step": 790 }, { "epoch": 0.016859141869678834, "grad_norm": 3.4514334201812744, "learning_rate": 4.915915029924977e-05, "loss": 2.8523, "step": 800 }, { "epoch": 0.017069881143049817, "grad_norm": 5.007158279418945, "learning_rate": 4.914861333558122e-05, "loss": 2.8607, "step": 810 }, { "epoch": 0.017280620416420804, "grad_norm": 4.900107383728027, "learning_rate": 4.913807637191267e-05, "loss": 2.7612, "step": 820 }, { "epoch": 0.01749135968979179, "grad_norm": 7.45019006729126, "learning_rate": 4.9127539408244124e-05, "loss": 2.913, "step": 830 }, { "epoch": 0.017702098963162775, "grad_norm": 5.6856770515441895, "learning_rate": 4.911700244457557e-05, "loss": 2.8877, "step": 840 }, { "epoch": 0.01791283823653376, "grad_norm": 4.228877067565918, "learning_rate": 4.910646548090703e-05, "loss": 2.7294, "step": 850 }, { "epoch": 0.018123577509904745, "grad_norm": 3.372474193572998, "learning_rate": 4.9095928517238476e-05, "loss": 2.7951, "step": 860 }, { "epoch": 0.018334316783275732, "grad_norm": 3.3396689891815186, "learning_rate": 4.9085391553569924e-05, "loss": 2.8488, "step": 870 }, { "epoch": 0.018545056056646716, "grad_norm": 3.912947177886963, "learning_rate": 4.907485458990138e-05, "loss": 2.7816, "step": 880 }, { "epoch": 0.018755795330017703, "grad_norm": 3.7946770191192627, "learning_rate": 4.906431762623283e-05, "loss": 2.8841, "step": 890 }, { "epoch": 0.018966534603388686, "grad_norm": 4.324241638183594, "learning_rate": 4.905378066256428e-05, "loss": 2.8666, "step": 900 }, { "epoch": 0.019177273876759673, "grad_norm": 3.172816038131714, "learning_rate": 4.904324369889573e-05, "loss": 2.7794, "step": 910 }, { "epoch": 0.019388013150130657, "grad_norm": 3.8265621662139893, "learning_rate": 4.903270673522718e-05, "loss": 2.8162, "step": 920 }, { "epoch": 0.019598752423501644, "grad_norm": 3.4845385551452637, "learning_rate": 4.902216977155863e-05, "loss": 2.7385, "step": 930 }, { "epoch": 0.019809491696872628, "grad_norm": 4.434839248657227, "learning_rate": 4.901163280789008e-05, "loss": 2.6926, "step": 940 }, { "epoch": 0.020020230970243615, "grad_norm": 6.55767822265625, "learning_rate": 4.900109584422154e-05, "loss": 2.6728, "step": 950 }, { "epoch": 0.0202309702436146, "grad_norm": 3.1376519203186035, "learning_rate": 4.8990558880552985e-05, "loss": 2.8854, "step": 960 }, { "epoch": 0.020441709516985585, "grad_norm": 3.569209337234497, "learning_rate": 4.8980021916884433e-05, "loss": 2.7862, "step": 970 }, { "epoch": 0.020652448790356572, "grad_norm": 16.01164436340332, "learning_rate": 4.896948495321588e-05, "loss": 2.8844, "step": 980 }, { "epoch": 0.020863188063727556, "grad_norm": 3.2311861515045166, "learning_rate": 4.895894798954733e-05, "loss": 2.8773, "step": 990 }, { "epoch": 0.021073927337098543, "grad_norm": 4.047968864440918, "learning_rate": 4.8948411025878785e-05, "loss": 2.6955, "step": 1000 }, { "epoch": 0.021284666610469526, "grad_norm": 3.3213765621185303, "learning_rate": 4.893787406221024e-05, "loss": 2.8308, "step": 1010 }, { "epoch": 0.021495405883840513, "grad_norm": 3.658327102661133, "learning_rate": 4.892733709854169e-05, "loss": 2.8409, "step": 1020 }, { "epoch": 0.021706145157211497, "grad_norm": 3.6058104038238525, "learning_rate": 4.8916800134873136e-05, "loss": 2.8073, "step": 1030 }, { "epoch": 0.021916884430582484, "grad_norm": 3.622807741165161, "learning_rate": 4.8906263171204585e-05, "loss": 2.7389, "step": 1040 }, { "epoch": 0.022127623703953467, "grad_norm": 3.6087045669555664, "learning_rate": 4.889572620753604e-05, "loss": 2.7506, "step": 1050 }, { "epoch": 0.022338362977324455, "grad_norm": 3.1047306060791016, "learning_rate": 4.888518924386749e-05, "loss": 2.77, "step": 1060 }, { "epoch": 0.022549102250695438, "grad_norm": 3.4957761764526367, "learning_rate": 4.887465228019894e-05, "loss": 2.8526, "step": 1070 }, { "epoch": 0.022759841524066425, "grad_norm": 5.486833095550537, "learning_rate": 4.886411531653039e-05, "loss": 2.8144, "step": 1080 }, { "epoch": 0.022970580797437412, "grad_norm": 6.019243240356445, "learning_rate": 4.885357835286184e-05, "loss": 2.8095, "step": 1090 }, { "epoch": 0.023181320070808396, "grad_norm": 5.631433486938477, "learning_rate": 4.8843041389193294e-05, "loss": 2.6904, "step": 1100 }, { "epoch": 0.023392059344179383, "grad_norm": 3.162370204925537, "learning_rate": 4.883250442552474e-05, "loss": 2.6805, "step": 1110 }, { "epoch": 0.023602798617550366, "grad_norm": 3.7855355739593506, "learning_rate": 4.882196746185619e-05, "loss": 2.7785, "step": 1120 }, { "epoch": 0.023813537890921353, "grad_norm": 3.0686984062194824, "learning_rate": 4.8811430498187646e-05, "loss": 2.7163, "step": 1130 }, { "epoch": 0.024024277164292337, "grad_norm": 4.290556907653809, "learning_rate": 4.8800893534519094e-05, "loss": 2.8549, "step": 1140 }, { "epoch": 0.024235016437663324, "grad_norm": 4.224363327026367, "learning_rate": 4.879035657085055e-05, "loss": 2.7858, "step": 1150 }, { "epoch": 0.024445755711034307, "grad_norm": 4.114129066467285, "learning_rate": 4.8779819607182e-05, "loss": 2.7969, "step": 1160 }, { "epoch": 0.024656494984405294, "grad_norm": 4.066317081451416, "learning_rate": 4.8769282643513446e-05, "loss": 2.701, "step": 1170 }, { "epoch": 0.024867234257776278, "grad_norm": 3.0110621452331543, "learning_rate": 4.8758745679844894e-05, "loss": 2.9004, "step": 1180 }, { "epoch": 0.025077973531147265, "grad_norm": 4.144210338592529, "learning_rate": 4.874820871617635e-05, "loss": 2.8651, "step": 1190 }, { "epoch": 0.02528871280451825, "grad_norm": 3.226668119430542, "learning_rate": 4.8737671752507804e-05, "loss": 2.8332, "step": 1200 }, { "epoch": 0.025499452077889236, "grad_norm": 3.7419745922088623, "learning_rate": 4.872713478883925e-05, "loss": 2.8502, "step": 1210 }, { "epoch": 0.02571019135126022, "grad_norm": 6.740761756896973, "learning_rate": 4.87165978251707e-05, "loss": 2.7892, "step": 1220 }, { "epoch": 0.025920930624631206, "grad_norm": 4.075252056121826, "learning_rate": 4.870606086150215e-05, "loss": 2.7037, "step": 1230 }, { "epoch": 0.026131669898002193, "grad_norm": 3.7171854972839355, "learning_rate": 4.8695523897833604e-05, "loss": 2.7885, "step": 1240 }, { "epoch": 0.026342409171373177, "grad_norm": 3.2372097969055176, "learning_rate": 4.868498693416506e-05, "loss": 2.7827, "step": 1250 }, { "epoch": 0.026553148444744164, "grad_norm": 5.390050888061523, "learning_rate": 4.867444997049651e-05, "loss": 2.7929, "step": 1260 }, { "epoch": 0.026763887718115147, "grad_norm": 6.445965766906738, "learning_rate": 4.8663913006827955e-05, "loss": 2.7683, "step": 1270 }, { "epoch": 0.026974626991486134, "grad_norm": 4.038309574127197, "learning_rate": 4.8653376043159403e-05, "loss": 2.7644, "step": 1280 }, { "epoch": 0.027185366264857118, "grad_norm": 3.422018527984619, "learning_rate": 4.864283907949085e-05, "loss": 2.759, "step": 1290 }, { "epoch": 0.027396105538228105, "grad_norm": 3.4551455974578857, "learning_rate": 4.863230211582231e-05, "loss": 2.6919, "step": 1300 }, { "epoch": 0.02760684481159909, "grad_norm": 4.99464225769043, "learning_rate": 4.862176515215376e-05, "loss": 2.8362, "step": 1310 }, { "epoch": 0.027817584084970075, "grad_norm": 3.759676933288574, "learning_rate": 4.861122818848521e-05, "loss": 2.7712, "step": 1320 }, { "epoch": 0.02802832335834106, "grad_norm": 4.082064151763916, "learning_rate": 4.860069122481666e-05, "loss": 2.8224, "step": 1330 }, { "epoch": 0.028239062631712046, "grad_norm": 3.157449960708618, "learning_rate": 4.8590154261148106e-05, "loss": 2.7863, "step": 1340 }, { "epoch": 0.02844980190508303, "grad_norm": 5.148242950439453, "learning_rate": 4.857961729747956e-05, "loss": 2.7383, "step": 1350 }, { "epoch": 0.028660541178454017, "grad_norm": 3.794858694076538, "learning_rate": 4.856908033381101e-05, "loss": 2.7118, "step": 1360 }, { "epoch": 0.028871280451825004, "grad_norm": 3.0035576820373535, "learning_rate": 4.8558543370142465e-05, "loss": 2.7787, "step": 1370 }, { "epoch": 0.029082019725195987, "grad_norm": 4.196252822875977, "learning_rate": 4.854800640647391e-05, "loss": 2.7501, "step": 1380 }, { "epoch": 0.029292758998566974, "grad_norm": 7.164214611053467, "learning_rate": 4.853746944280536e-05, "loss": 2.7885, "step": 1390 }, { "epoch": 0.029503498271937958, "grad_norm": 3.252230167388916, "learning_rate": 4.8526932479136816e-05, "loss": 2.7875, "step": 1400 }, { "epoch": 0.029714237545308945, "grad_norm": 7.037599563598633, "learning_rate": 4.8516395515468265e-05, "loss": 2.7787, "step": 1410 }, { "epoch": 0.02992497681867993, "grad_norm": 3.5410661697387695, "learning_rate": 4.850585855179971e-05, "loss": 2.8644, "step": 1420 }, { "epoch": 0.030135716092050915, "grad_norm": 3.7410356998443604, "learning_rate": 4.849532158813117e-05, "loss": 2.8146, "step": 1430 }, { "epoch": 0.0303464553654219, "grad_norm": 7.049551963806152, "learning_rate": 4.8484784624462616e-05, "loss": 2.8427, "step": 1440 }, { "epoch": 0.030557194638792886, "grad_norm": 3.5179500579833984, "learning_rate": 4.847424766079407e-05, "loss": 2.7396, "step": 1450 }, { "epoch": 0.03076793391216387, "grad_norm": 3.117222785949707, "learning_rate": 4.846371069712552e-05, "loss": 2.6701, "step": 1460 }, { "epoch": 0.030978673185534857, "grad_norm": 2.9434943199157715, "learning_rate": 4.845317373345697e-05, "loss": 2.7827, "step": 1470 }, { "epoch": 0.03118941245890584, "grad_norm": 7.40554666519165, "learning_rate": 4.8442636769788416e-05, "loss": 2.6663, "step": 1480 }, { "epoch": 0.031400151732276824, "grad_norm": 3.6940155029296875, "learning_rate": 4.843209980611987e-05, "loss": 2.7505, "step": 1490 }, { "epoch": 0.03161089100564781, "grad_norm": 3.142557382583618, "learning_rate": 4.8421562842451326e-05, "loss": 2.7381, "step": 1500 } ], "logging_steps": 10, "max_steps": 47452, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3376102996231680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }