{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9952556668423828, "eval_steps": 500, "global_step": 118, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008434370057986295, "grad_norm": 0.08799133449792862, "learning_rate": 4.9999999999999996e-06, "loss": 1.6351, "step": 1 }, { "epoch": 0.01686874011597259, "grad_norm": 0.08821269869804382, "learning_rate": 9.999999999999999e-06, "loss": 1.6405, "step": 2 }, { "epoch": 0.025303110173958882, "grad_norm": 0.028541648760437965, "learning_rate": 1.5e-05, "loss": 1.6264, "step": 3 }, { "epoch": 0.03373748023194518, "grad_norm": 0.016522206366062164, "learning_rate": 1.9999999999999998e-05, "loss": 1.6233, "step": 4 }, { "epoch": 0.04217185028993147, "grad_norm": 0.054906539618968964, "learning_rate": 2.5e-05, "loss": 1.62, "step": 5 }, { "epoch": 0.050606220347917764, "grad_norm": 0.0514790378510952, "learning_rate": 3e-05, "loss": 1.6134, "step": 6 }, { "epoch": 0.05904059040590406, "grad_norm": 0.04156072437763214, "learning_rate": 3.5000000000000004e-05, "loss": 1.6169, "step": 7 }, { "epoch": 0.06747496046389036, "grad_norm": 0.05689298361539841, "learning_rate": 3.9999999999999996e-05, "loss": 1.6143, "step": 8 }, { "epoch": 0.07590933052187665, "grad_norm": 0.041525840759277344, "learning_rate": 4.5e-05, "loss": 1.6104, "step": 9 }, { "epoch": 0.08434370057986294, "grad_norm": 0.031016899272799492, "learning_rate": 5e-05, "loss": 1.6028, "step": 10 }, { "epoch": 0.09277807063784924, "grad_norm": 0.03775344789028168, "learning_rate": 5.5e-05, "loss": 1.5949, "step": 11 }, { "epoch": 0.10121244069583553, "grad_norm": 0.027061201632022858, "learning_rate": 6e-05, "loss": 1.5966, "step": 12 }, { "epoch": 0.10964681075382182, "grad_norm": 0.03555454686284065, "learning_rate": 5.998682509526384e-05, "loss": 1.601, "step": 13 }, { "epoch": 0.11808118081180811, "grad_norm": 0.038648299872875214, "learning_rate": 5.994731195292965e-05, "loss": 1.6015, "step": 14 }, { "epoch": 0.1265155508697944, "grad_norm": 0.03883035108447075, "learning_rate": 5.988149527845651e-05, "loss": 1.5992, "step": 15 }, { "epoch": 0.13494992092778071, "grad_norm": 0.03391977399587631, "learning_rate": 5.978943288040551e-05, "loss": 1.5932, "step": 16 }, { "epoch": 0.143384290985767, "grad_norm": 0.0362255796790123, "learning_rate": 5.967120561966492e-05, "loss": 1.5873, "step": 17 }, { "epoch": 0.1518186610437533, "grad_norm": 0.027403229847550392, "learning_rate": 5.952691733842791e-05, "loss": 1.5845, "step": 18 }, { "epoch": 0.16025303110173958, "grad_norm": 0.02821512520313263, "learning_rate": 5.935669476898512e-05, "loss": 1.5942, "step": 19 }, { "epoch": 0.16868740115972589, "grad_norm": 0.022913869470357895, "learning_rate": 5.9160687422412324e-05, "loss": 1.5976, "step": 20 }, { "epoch": 0.17712177121771217, "grad_norm": 0.02420000359416008, "learning_rate": 5.893906745725076e-05, "loss": 1.5862, "step": 21 }, { "epoch": 0.18555614127569847, "grad_norm": 0.021311871707439423, "learning_rate": 5.8692029528295675e-05, "loss": 1.5877, "step": 22 }, { "epoch": 0.19399051133368478, "grad_norm": 0.024183662608265877, "learning_rate": 5.841979061562574e-05, "loss": 1.584, "step": 23 }, { "epoch": 0.20242488139167106, "grad_norm": 0.02072131633758545, "learning_rate": 5.8122589834023634e-05, "loss": 1.5841, "step": 24 }, { "epoch": 0.21085925144965736, "grad_norm": 0.023273587226867676, "learning_rate": 5.7800688222955e-05, "loss": 1.5845, "step": 25 }, { "epoch": 0.21929362150764364, "grad_norm": 0.0180776659399271, "learning_rate": 5.745436851729055e-05, "loss": 1.594, "step": 26 }, { "epoch": 0.22772799156562995, "grad_norm": 0.018995055928826332, "learning_rate": 5.708393489897231e-05, "loss": 1.5903, "step": 27 }, { "epoch": 0.23616236162361623, "grad_norm": 0.017286648973822594, "learning_rate": 5.668971272984242e-05, "loss": 1.5804, "step": 28 }, { "epoch": 0.24459673168160254, "grad_norm": 0.018625088036060333, "learning_rate": 5.6272048265869104e-05, "loss": 1.5798, "step": 29 }, { "epoch": 0.2530311017395888, "grad_norm": 0.017109202221035957, "learning_rate": 5.583130835302066e-05, "loss": 1.5848, "step": 30 }, { "epoch": 0.2614654717975751, "grad_norm": 0.017000902444124222, "learning_rate": 5.536788010505478e-05, "loss": 1.5751, "step": 31 }, { "epoch": 0.26989984185556143, "grad_norm": 0.018897738307714462, "learning_rate": 5.4882170563506055e-05, "loss": 1.5799, "step": 32 }, { "epoch": 0.2783342119135477, "grad_norm": 0.017153726890683174, "learning_rate": 5.437460634017044e-05, "loss": 1.5758, "step": 33 }, { "epoch": 0.286768581971534, "grad_norm": 0.020006069913506508, "learning_rate": 5.3845633242400604e-05, "loss": 1.5774, "step": 34 }, { "epoch": 0.2952029520295203, "grad_norm": 0.016250574961304665, "learning_rate": 5.329571588154127e-05, "loss": 1.5748, "step": 35 }, { "epoch": 0.3036373220875066, "grad_norm": 0.019675249233841896, "learning_rate": 5.2725337264848605e-05, "loss": 1.5772, "step": 36 }, { "epoch": 0.3120716921454929, "grad_norm": 0.017005721107125282, "learning_rate": 5.213499837125182e-05, "loss": 1.5697, "step": 37 }, { "epoch": 0.32050606220347916, "grad_norm": 0.01664470136165619, "learning_rate": 5.152521771132993e-05, "loss": 1.5761, "step": 38 }, { "epoch": 0.32894043226146547, "grad_norm": 0.01764543540775776, "learning_rate": 5.0896530871889914e-05, "loss": 1.5793, "step": 39 }, { "epoch": 0.33737480231945177, "grad_norm": 0.016753442585468292, "learning_rate": 5.024949004554632e-05, "loss": 1.5658, "step": 40 }, { "epoch": 0.3458091723774381, "grad_norm": 0.019939422607421875, "learning_rate": 4.958466354571565e-05, "loss": 1.5762, "step": 41 }, { "epoch": 0.35424354243542433, "grad_norm": 0.01566561497747898, "learning_rate": 4.890263530745134e-05, "loss": 1.5703, "step": 42 }, { "epoch": 0.36267791249341064, "grad_norm": 0.015579808503389359, "learning_rate": 4.8204004374557806e-05, "loss": 1.577, "step": 43 }, { "epoch": 0.37111228255139694, "grad_norm": 0.016742996871471405, "learning_rate": 4.748938437343416e-05, "loss": 1.5726, "step": 44 }, { "epoch": 0.37954665260938325, "grad_norm": 0.017128925770521164, "learning_rate": 4.675940297410958e-05, "loss": 1.579, "step": 45 }, { "epoch": 0.38798102266736956, "grad_norm": 0.015266829170286655, "learning_rate": 4.601470133894373e-05, "loss": 1.5611, "step": 46 }, { "epoch": 0.3964153927253558, "grad_norm": 0.014922689646482468, "learning_rate": 4.525593355947662e-05, "loss": 1.5725, "step": 47 }, { "epoch": 0.4048497627833421, "grad_norm": 0.01651890017092228, "learning_rate": 4.448376608192235e-05, "loss": 1.5679, "step": 48 }, { "epoch": 0.4132841328413284, "grad_norm": 0.013002808205783367, "learning_rate": 4.3698877121811395e-05, "loss": 1.5712, "step": 49 }, { "epoch": 0.42171850289931473, "grad_norm": 0.013684232719242573, "learning_rate": 4.290195606829562e-05, "loss": 1.5683, "step": 50 }, { "epoch": 0.430152872957301, "grad_norm": 0.01470887940376997, "learning_rate": 4.2093702878639174e-05, "loss": 1.5784, "step": 51 }, { "epoch": 0.4385872430152873, "grad_norm": 0.013774153776466846, "learning_rate": 4.127482746342714e-05, "loss": 1.5648, "step": 52 }, { "epoch": 0.4470216130732736, "grad_norm": 0.01601037010550499, "learning_rate": 4.044604906303197e-05, "loss": 1.5671, "step": 53 }, { "epoch": 0.4554559831312599, "grad_norm": 0.013479109853506088, "learning_rate": 3.960809561588513e-05, "loss": 1.5759, "step": 54 }, { "epoch": 0.46389035318924615, "grad_norm": 0.01525378692895174, "learning_rate": 3.876170311910928e-05, "loss": 1.5672, "step": 55 }, { "epoch": 0.47232472324723246, "grad_norm": 0.013126607052981853, "learning_rate": 3.790761498207203e-05, "loss": 1.5744, "step": 56 }, { "epoch": 0.48075909330521877, "grad_norm": 0.013218970037996769, "learning_rate": 3.704658137342952e-05, "loss": 1.5688, "step": 57 }, { "epoch": 0.48919346336320507, "grad_norm": 0.014142030850052834, "learning_rate": 3.617935856223295e-05, "loss": 1.5742, "step": 58 }, { "epoch": 0.4976278334211914, "grad_norm": 0.013189482502639294, "learning_rate": 3.5306708253677186e-05, "loss": 1.5615, "step": 59 }, { "epoch": 0.5060622034791776, "grad_norm": 0.014055909588932991, "learning_rate": 3.442939692007444e-05, "loss": 1.5456, "step": 60 }, { "epoch": 0.5144965735371639, "grad_norm": 0.011999402195215225, "learning_rate": 3.354819512764097e-05, "loss": 1.5579, "step": 61 }, { "epoch": 0.5229309435951502, "grad_norm": 0.015170286409556866, "learning_rate": 3.2663876859688045e-05, "loss": 1.5606, "step": 62 }, { "epoch": 0.5313653136531366, "grad_norm": 0.013461374677717686, "learning_rate": 3.177721883681143e-05, "loss": 1.5631, "step": 63 }, { "epoch": 0.5397996837111229, "grad_norm": 0.014450161717832088, "learning_rate": 3.0888999834676796e-05, "loss": 1.5606, "step": 64 }, { "epoch": 0.5482340537691092, "grad_norm": 0.014033439569175243, "learning_rate": 3e-05, "loss": 1.5638, "step": 65 }, { "epoch": 0.5566684238270954, "grad_norm": 0.014029957354068756, "learning_rate": 2.9111000165323206e-05, "loss": 1.5656, "step": 66 }, { "epoch": 0.5651027938850817, "grad_norm": 0.016938265413045883, "learning_rate": 2.8222781163188573e-05, "loss": 1.5595, "step": 67 }, { "epoch": 0.573537163943068, "grad_norm": 0.014442404732108116, "learning_rate": 2.7336123140311957e-05, "loss": 1.5627, "step": 68 }, { "epoch": 0.5819715340010543, "grad_norm": 0.015609300695359707, "learning_rate": 2.645180487235903e-05, "loss": 1.5707, "step": 69 }, { "epoch": 0.5904059040590406, "grad_norm": 0.014037694782018661, "learning_rate": 2.557060307992557e-05, "loss": 1.5635, "step": 70 }, { "epoch": 0.5988402741170269, "grad_norm": 0.013035484589636326, "learning_rate": 2.469329174632282e-05, "loss": 1.5635, "step": 71 }, { "epoch": 0.6072746441750132, "grad_norm": 0.013149570673704147, "learning_rate": 2.3820641437767053e-05, "loss": 1.5607, "step": 72 }, { "epoch": 0.6157090142329995, "grad_norm": 0.01272524707019329, "learning_rate": 2.2953418626570494e-05, "loss": 1.5524, "step": 73 }, { "epoch": 0.6241433842909858, "grad_norm": 0.01219966635107994, "learning_rate": 2.209238501792798e-05, "loss": 1.555, "step": 74 }, { "epoch": 0.632577754348972, "grad_norm": 0.01229917537420988, "learning_rate": 2.123829688089073e-05, "loss": 1.5514, "step": 75 }, { "epoch": 0.6410121244069583, "grad_norm": 0.013784164562821388, "learning_rate": 2.0391904384114877e-05, "loss": 1.5614, "step": 76 }, { "epoch": 0.6494464944649446, "grad_norm": 0.010503321886062622, "learning_rate": 1.9553950936968042e-05, "loss": 1.541, "step": 77 }, { "epoch": 0.6578808645229309, "grad_norm": 0.012291346676647663, "learning_rate": 1.8725172536572863e-05, "loss": 1.556, "step": 78 }, { "epoch": 0.6663152345809172, "grad_norm": 0.011516911908984184, "learning_rate": 1.7906297121360838e-05, "loss": 1.5638, "step": 79 }, { "epoch": 0.6747496046389035, "grad_norm": 0.01181780081242323, "learning_rate": 1.7098043931704396e-05, "loss": 1.5508, "step": 80 }, { "epoch": 0.6831839746968899, "grad_norm": 0.010808738879859447, "learning_rate": 1.6301122878188607e-05, "loss": 1.5567, "step": 81 }, { "epoch": 0.6916183447548762, "grad_norm": 0.010649660602211952, "learning_rate": 1.551623391807766e-05, "loss": 1.5484, "step": 82 }, { "epoch": 0.7000527148128625, "grad_norm": 0.010580360889434814, "learning_rate": 1.4744066440523391e-05, "loss": 1.5591, "step": 83 }, { "epoch": 0.7084870848708487, "grad_norm": 0.010917909443378448, "learning_rate": 1.3985298661056292e-05, "loss": 1.569, "step": 84 }, { "epoch": 0.716921454928835, "grad_norm": 0.01177785824984312, "learning_rate": 1.324059702589043e-05, "loss": 1.5631, "step": 85 }, { "epoch": 0.7253558249868213, "grad_norm": 0.009857219643890858, "learning_rate": 1.2510615626565844e-05, "loss": 1.5561, "step": 86 }, { "epoch": 0.7337901950448076, "grad_norm": 0.011106839403510094, "learning_rate": 1.1795995625442208e-05, "loss": 1.5471, "step": 87 }, { "epoch": 0.7422245651027939, "grad_norm": 0.011377968825399876, "learning_rate": 1.109736469254867e-05, "loss": 1.5583, "step": 88 }, { "epoch": 0.7506589351607802, "grad_norm": 0.010118059813976288, "learning_rate": 1.0415336454284356e-05, "loss": 1.5531, "step": 89 }, { "epoch": 0.7590933052187665, "grad_norm": 0.01021275483071804, "learning_rate": 9.75050995445369e-06, "loss": 1.5559, "step": 90 }, { "epoch": 0.7675276752767528, "grad_norm": 0.00994526594877243, "learning_rate": 9.103469128110098e-06, "loss": 1.5527, "step": 91 }, { "epoch": 0.7759620453347391, "grad_norm": 0.01060432381927967, "learning_rate": 8.474782288670058e-06, "loss": 1.5514, "step": 92 }, { "epoch": 0.7843964153927253, "grad_norm": 0.011965557001531124, "learning_rate": 7.86500162874818e-06, "loss": 1.5536, "step": 93 }, { "epoch": 0.7928307854507116, "grad_norm": 0.010221057571470737, "learning_rate": 7.274662735151396e-06, "loss": 1.5541, "step": 94 }, { "epoch": 0.8012651555086979, "grad_norm": 0.01093184296041727, "learning_rate": 6.704284118458731e-06, "loss": 1.5512, "step": 95 }, { "epoch": 0.8096995255666842, "grad_norm": 0.010998157784342766, "learning_rate": 6.154366757599399e-06, "loss": 1.5492, "step": 96 }, { "epoch": 0.8181338956246705, "grad_norm": 0.01003272831439972, "learning_rate": 5.625393659829561e-06, "loss": 1.5472, "step": 97 }, { "epoch": 0.8265682656826568, "grad_norm": 0.010513346642255783, "learning_rate": 5.117829436493947e-06, "loss": 1.551, "step": 98 }, { "epoch": 0.8350026357406432, "grad_norm": 0.01016693189740181, "learning_rate": 4.632119894945215e-06, "loss": 1.5599, "step": 99 }, { "epoch": 0.8434370057986295, "grad_norm": 0.009756877087056637, "learning_rate": 4.1686916469793335e-06, "loss": 1.5552, "step": 100 }, { "epoch": 0.8518713758566157, "grad_norm": 0.010328919626772404, "learning_rate": 3.7279517341308977e-06, "loss": 1.5645, "step": 101 }, { "epoch": 0.860305745914602, "grad_norm": 0.009724525734782219, "learning_rate": 3.3102872701575838e-06, "loss": 1.5466, "step": 102 }, { "epoch": 0.8687401159725883, "grad_norm": 0.009452255442738533, "learning_rate": 2.916065101027694e-06, "loss": 1.555, "step": 103 }, { "epoch": 0.8771744860305746, "grad_norm": 0.009558911435306072, "learning_rate": 2.5456314827094463e-06, "loss": 1.5479, "step": 104 }, { "epoch": 0.8856088560885609, "grad_norm": 0.009129817597568035, "learning_rate": 2.1993117770449987e-06, "loss": 1.545, "step": 105 }, { "epoch": 0.8940432261465472, "grad_norm": 0.00930058490484953, "learning_rate": 1.8774101659763731e-06, "loss": 1.554, "step": 106 }, { "epoch": 0.9024775962045335, "grad_norm": 0.009718949906527996, "learning_rate": 1.5802093843742582e-06, "loss": 1.5467, "step": 107 }, { "epoch": 0.9109119662625198, "grad_norm": 0.009196877479553223, "learning_rate": 1.3079704717043273e-06, "loss": 1.55, "step": 108 }, { "epoch": 0.9193463363205061, "grad_norm": 0.00919976457953453, "learning_rate": 1.060932542749241e-06, "loss": 1.5558, "step": 109 }, { "epoch": 0.9277807063784923, "grad_norm": 0.0089542455971241, "learning_rate": 8.393125775876775e-07, "loss": 1.5563, "step": 110 }, { "epoch": 0.9362150764364786, "grad_norm": 0.009196256287395954, "learning_rate": 6.433052310148791e-07, "loss": 1.5537, "step": 111 }, { "epoch": 0.9446494464944649, "grad_norm": 0.009201628156006336, "learning_rate": 4.730826615720951e-07, "loss": 1.5567, "step": 112 }, { "epoch": 0.9530838165524512, "grad_norm": 0.008883966132998466, "learning_rate": 3.28794380335079e-07, "loss": 1.5549, "step": 113 }, { "epoch": 0.9615181866104375, "grad_norm": 0.009221088141202927, "learning_rate": 2.1056711959449247e-07, "loss": 1.5585, "step": 114 }, { "epoch": 0.9699525566684238, "grad_norm": 0.009092201478779316, "learning_rate": 1.1850472154349313e-07, "loss": 1.5536, "step": 115 }, { "epoch": 0.9783869267264101, "grad_norm": 0.009470025077462196, "learning_rate": 5.268804707035946e-08, "loss": 1.5705, "step": 116 }, { "epoch": 0.9868212967843965, "grad_norm": 0.008715336211025715, "learning_rate": 1.3174904736169557e-08, "loss": 1.5566, "step": 117 }, { "epoch": 0.9952556668423828, "grad_norm": 0.008857190608978271, "learning_rate": 0.0, "loss": 1.5464, "step": 118 }, { "epoch": 0.9952556668423828, "step": 118, "total_flos": 1660937136242688.0, "train_loss": 1.571211524939133, "train_runtime": 47361.2024, "train_samples_per_second": 0.641, "train_steps_per_second": 0.002 } ], "logging_steps": 1, "max_steps": 118, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1660937136242688.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }