{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 108, "global_step": 540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.7017320156097413, "epoch": 0.009267840593141797, "grad_norm": 0.4929618835449219, "learning_rate": 8.000000000000001e-06, "loss": 1.8663, "mean_token_accuracy": 0.6088488936424256, "num_tokens": 10169.0, "step": 5 }, { "entropy": 1.6203628540039063, "epoch": 0.018535681186283594, "grad_norm": 0.5699822306632996, "learning_rate": 1.8e-05, "loss": 1.8139, "mean_token_accuracy": 0.613596773147583, "num_tokens": 17492.0, "step": 10 }, { "entropy": 1.6117817282676696, "epoch": 0.027803521779425393, "grad_norm": 0.5831968784332275, "learning_rate": 1.9849056603773588e-05, "loss": 1.9287, "mean_token_accuracy": 0.6028814196586609, "num_tokens": 23104.0, "step": 15 }, { "entropy": 1.5564811944961547, "epoch": 0.03707136237256719, "grad_norm": 0.6044633388519287, "learning_rate": 1.9660377358490567e-05, "loss": 1.8979, "mean_token_accuracy": 0.6022705733776093, "num_tokens": 28237.0, "step": 20 }, { "entropy": 1.5479681015014648, "epoch": 0.04633920296570899, "grad_norm": 0.5943928956985474, "learning_rate": 1.947169811320755e-05, "loss": 1.8733, "mean_token_accuracy": 0.6153187394142151, "num_tokens": 33029.0, "step": 25 }, { "entropy": 1.646817970275879, "epoch": 0.05560704355885079, "grad_norm": 0.6993905901908875, "learning_rate": 1.928301886792453e-05, "loss": 1.8784, "mean_token_accuracy": 0.6003017485141754, "num_tokens": 37369.0, "step": 30 }, { "entropy": 1.650176739692688, "epoch": 0.06487488415199258, "grad_norm": 0.786679744720459, "learning_rate": 1.909433962264151e-05, "loss": 1.8796, "mean_token_accuracy": 0.6147689402103425, "num_tokens": 41484.0, "step": 35 }, { "entropy": 1.590287184715271, "epoch": 0.07414272474513438, "grad_norm": 0.770859956741333, "learning_rate": 1.8905660377358492e-05, "loss": 1.817, "mean_token_accuracy": 0.6139472305774689, "num_tokens": 45350.0, "step": 40 }, { "entropy": 1.8836397886276246, "epoch": 0.08341056533827618, "grad_norm": 1.3446482419967651, "learning_rate": 1.8716981132075474e-05, "loss": 2.2667, "mean_token_accuracy": 0.5831025898456573, "num_tokens": 47900.0, "step": 45 }, { "entropy": 1.914563238620758, "epoch": 0.09267840593141798, "grad_norm": 2.0144922733306885, "learning_rate": 1.8528301886792453e-05, "loss": 2.3988, "mean_token_accuracy": 0.5866762459278106, "num_tokens": 49421.0, "step": 50 }, { "entropy": 1.753426432609558, "epoch": 0.10194624652455977, "grad_norm": 0.36396369338035583, "learning_rate": 1.8339622641509435e-05, "loss": 1.6739, "mean_token_accuracy": 0.634347426891327, "num_tokens": 59290.0, "step": 55 }, { "entropy": 1.7417057394981383, "epoch": 0.11121408711770157, "grad_norm": 0.557318389415741, "learning_rate": 1.8150943396226417e-05, "loss": 1.643, "mean_token_accuracy": 0.6404906511306763, "num_tokens": 66072.0, "step": 60 }, { "entropy": 1.6922197580337524, "epoch": 0.12048192771084337, "grad_norm": 0.5784375667572021, "learning_rate": 1.79622641509434e-05, "loss": 1.546, "mean_token_accuracy": 0.6431969463825226, "num_tokens": 71590.0, "step": 65 }, { "entropy": 1.7081309676170349, "epoch": 0.12974976830398516, "grad_norm": 0.6285673379898071, "learning_rate": 1.777358490566038e-05, "loss": 1.5738, "mean_token_accuracy": 0.6408641755580902, "num_tokens": 76620.0, "step": 70 }, { "entropy": 1.706821823120117, "epoch": 0.13901760889712697, "grad_norm": 0.6793721914291382, "learning_rate": 1.758490566037736e-05, "loss": 1.5597, "mean_token_accuracy": 0.6496909260749817, "num_tokens": 81367.0, "step": 75 }, { "entropy": 1.729298961162567, "epoch": 0.14828544949026876, "grad_norm": 0.7073204517364502, "learning_rate": 1.7396226415094343e-05, "loss": 1.5936, "mean_token_accuracy": 0.646106606721878, "num_tokens": 85829.0, "step": 80 }, { "entropy": 1.764689528942108, "epoch": 0.15755329008341057, "grad_norm": 0.8073896765708923, "learning_rate": 1.720754716981132e-05, "loss": 1.5737, "mean_token_accuracy": 0.6494780361652375, "num_tokens": 89983.0, "step": 85 }, { "entropy": 1.6753356814384461, "epoch": 0.16682113067655235, "grad_norm": 0.7738911509513855, "learning_rate": 1.7018867924528304e-05, "loss": 1.4815, "mean_token_accuracy": 0.6729483246803284, "num_tokens": 93855.0, "step": 90 }, { "entropy": 1.9390615344047546, "epoch": 0.17608897126969417, "grad_norm": 1.312859058380127, "learning_rate": 1.6830188679245286e-05, "loss": 1.8817, "mean_token_accuracy": 0.6210294425487518, "num_tokens": 96341.0, "step": 95 }, { "entropy": 1.9929674506187438, "epoch": 0.18535681186283595, "grad_norm": 1.6688119173049927, "learning_rate": 1.6641509433962265e-05, "loss": 1.9392, "mean_token_accuracy": 0.6368482947349549, "num_tokens": 97810.0, "step": 100 }, { "entropy": 1.712046504020691, "epoch": 0.19462465245597776, "grad_norm": 0.4342072010040283, "learning_rate": 1.6452830188679247e-05, "loss": 1.5909, "mean_token_accuracy": 0.6414119660854339, "num_tokens": 107589.0, "step": 105 }, { "entropy": 1.7470085620880127, "epoch": 0.20389249304911955, "grad_norm": 0.5741161704063416, "learning_rate": 1.626415094339623e-05, "loss": 1.5711, "mean_token_accuracy": 0.6456838071346283, "num_tokens": 114565.0, "step": 110 }, { "entropy": 1.5347474694252015, "epoch": 0.21316033364226136, "grad_norm": 0.6536217927932739, "learning_rate": 1.607547169811321e-05, "loss": 1.3867, "mean_token_accuracy": 0.6677907109260559, "num_tokens": 120099.0, "step": 115 }, { "entropy": 1.5956430912017823, "epoch": 0.22242817423540315, "grad_norm": 0.6675344705581665, "learning_rate": 1.588679245283019e-05, "loss": 1.4253, "mean_token_accuracy": 0.6632845401763916, "num_tokens": 125112.0, "step": 120 }, { "entropy": 1.5792277336120606, "epoch": 0.23169601482854496, "grad_norm": 0.7069701552391052, "learning_rate": 1.5698113207547172e-05, "loss": 1.3823, "mean_token_accuracy": 0.6681795418262482, "num_tokens": 129723.0, "step": 125 }, { "entropy": 1.5756880044937134, "epoch": 0.24096385542168675, "grad_norm": 0.7337709069252014, "learning_rate": 1.5509433962264154e-05, "loss": 1.3491, "mean_token_accuracy": 0.6784022688865662, "num_tokens": 134105.0, "step": 130 }, { "entropy": 1.6404427766799927, "epoch": 0.25023169601482853, "grad_norm": 0.7882033586502075, "learning_rate": 1.5320754716981133e-05, "loss": 1.4448, "mean_token_accuracy": 0.6615463674068451, "num_tokens": 138249.0, "step": 135 }, { "entropy": 1.5251396298408508, "epoch": 0.2594995366079703, "grad_norm": 0.9358009099960327, "learning_rate": 1.5132075471698115e-05, "loss": 1.3161, "mean_token_accuracy": 0.6832186877727509, "num_tokens": 141935.0, "step": 140 }, { "entropy": 1.6210530638694762, "epoch": 0.26876737720111216, "grad_norm": 1.2580927610397339, "learning_rate": 1.4943396226415094e-05, "loss": 1.5166, "mean_token_accuracy": 0.6789897084236145, "num_tokens": 144364.0, "step": 145 }, { "entropy": 1.5357141017913818, "epoch": 0.27803521779425394, "grad_norm": 1.4969000816345215, "learning_rate": 1.4754716981132076e-05, "loss": 1.5769, "mean_token_accuracy": 0.7000365734100342, "num_tokens": 145928.0, "step": 150 }, { "entropy": 1.4059158325195313, "epoch": 0.2873030583873957, "grad_norm": 0.5834245681762695, "learning_rate": 1.4566037735849057e-05, "loss": 1.3117, "mean_token_accuracy": 0.6917888641357421, "num_tokens": 156168.0, "step": 155 }, { "entropy": 1.4503844499588012, "epoch": 0.2965708989805375, "grad_norm": 0.5249871611595154, "learning_rate": 1.4377358490566037e-05, "loss": 1.3086, "mean_token_accuracy": 0.692787104845047, "num_tokens": 165610.0, "step": 160 }, { "entropy": 1.5506718158721924, "epoch": 0.30583873957367935, "grad_norm": 0.6383576393127441, "learning_rate": 1.418867924528302e-05, "loss": 1.3519, "mean_token_accuracy": 0.6676154375076294, "num_tokens": 171787.0, "step": 165 }, { "entropy": 1.5482367396354675, "epoch": 0.31510658016682114, "grad_norm": 0.8127756714820862, "learning_rate": 1.4e-05, "loss": 1.3004, "mean_token_accuracy": 0.6841661810874939, "num_tokens": 177223.0, "step": 170 }, { "entropy": 1.5198933720588683, "epoch": 0.3243744207599629, "grad_norm": 0.7491681575775146, "learning_rate": 1.3811320754716982e-05, "loss": 1.3046, "mean_token_accuracy": 0.684827846288681, "num_tokens": 182283.0, "step": 175 }, { "entropy": 1.4806129813194275, "epoch": 0.3336422613531047, "grad_norm": 0.7199849486351013, "learning_rate": 1.3622641509433962e-05, "loss": 1.3289, "mean_token_accuracy": 0.6813067197799683, "num_tokens": 187001.0, "step": 180 }, { "entropy": 1.4494765639305114, "epoch": 0.34291010194624655, "grad_norm": 0.842717707157135, "learning_rate": 1.3433962264150943e-05, "loss": 1.2323, "mean_token_accuracy": 0.7035281479358673, "num_tokens": 191394.0, "step": 185 }, { "entropy": 1.4810110807418824, "epoch": 0.35217794253938833, "grad_norm": 0.9301387667655945, "learning_rate": 1.3245283018867925e-05, "loss": 1.2773, "mean_token_accuracy": 0.6910167336463928, "num_tokens": 195399.0, "step": 190 }, { "entropy": 1.5017635345458984, "epoch": 0.3614457831325301, "grad_norm": 0.923757791519165, "learning_rate": 1.3056603773584906e-05, "loss": 1.3787, "mean_token_accuracy": 0.6846955835819244, "num_tokens": 198934.0, "step": 195 }, { "entropy": 1.6470162987709045, "epoch": 0.3707136237256719, "grad_norm": 1.4323452711105347, "learning_rate": 1.2867924528301888e-05, "loss": 1.6467, "mean_token_accuracy": 0.6908825635910034, "num_tokens": 200588.0, "step": 200 }, { "entropy": 1.6742107629776002, "epoch": 0.3799814643188137, "grad_norm": 0.6032175421714783, "learning_rate": 1.2679245283018868e-05, "loss": 1.5514, "mean_token_accuracy": 0.647225683927536, "num_tokens": 210039.0, "step": 205 }, { "entropy": 1.381903338432312, "epoch": 0.38924930491195553, "grad_norm": 0.6865923404693604, "learning_rate": 1.2490566037735849e-05, "loss": 1.1837, "mean_token_accuracy": 0.7007428467273712, "num_tokens": 216062.0, "step": 210 }, { "entropy": 1.5497890710830688, "epoch": 0.3985171455050973, "grad_norm": 0.7516324520111084, "learning_rate": 1.2301886792452831e-05, "loss": 1.3396, "mean_token_accuracy": 0.679390799999237, "num_tokens": 221238.0, "step": 215 }, { "entropy": 1.629994511604309, "epoch": 0.4077849860982391, "grad_norm": 0.8432377576828003, "learning_rate": 1.2113207547169811e-05, "loss": 1.4085, "mean_token_accuracy": 0.6734645128250122, "num_tokens": 226114.0, "step": 220 }, { "entropy": 1.4988724350929261, "epoch": 0.4170528266913809, "grad_norm": 0.7744415998458862, "learning_rate": 1.1924528301886794e-05, "loss": 1.2606, "mean_token_accuracy": 0.690128743648529, "num_tokens": 230722.0, "step": 225 }, { "entropy": 1.6341279029846192, "epoch": 0.4263206672845227, "grad_norm": 0.9293099641799927, "learning_rate": 1.1735849056603774e-05, "loss": 1.3895, "mean_token_accuracy": 0.6756053507328034, "num_tokens": 235030.0, "step": 230 }, { "entropy": 1.4942476391792296, "epoch": 0.4355885078776645, "grad_norm": 1.097900629043579, "learning_rate": 1.1547169811320756e-05, "loss": 1.2779, "mean_token_accuracy": 0.6963155150413514, "num_tokens": 239053.0, "step": 235 }, { "entropy": 1.4344226121902466, "epoch": 0.4448563484708063, "grad_norm": 0.9284445643424988, "learning_rate": 1.1358490566037737e-05, "loss": 1.2624, "mean_token_accuracy": 0.6946760237216949, "num_tokens": 242797.0, "step": 240 }, { "entropy": 1.6279133677482605, "epoch": 0.4541241890639481, "grad_norm": 1.8525234460830688, "learning_rate": 1.1169811320754717e-05, "loss": 1.5141, "mean_token_accuracy": 0.6641450226306915, "num_tokens": 245262.0, "step": 245 }, { "entropy": 1.5635493040084838, "epoch": 0.4633920296570899, "grad_norm": 1.4725935459136963, "learning_rate": 1.09811320754717e-05, "loss": 1.5592, "mean_token_accuracy": 0.6816279590129852, "num_tokens": 246825.0, "step": 250 }, { "entropy": 1.5720198631286622, "epoch": 0.4726598702502317, "grad_norm": 0.9245171546936035, "learning_rate": 1.079245283018868e-05, "loss": 1.4606, "mean_token_accuracy": 0.6601259410381317, "num_tokens": 256931.0, "step": 255 }, { "entropy": 1.5677057027816772, "epoch": 0.4819277108433735, "grad_norm": 0.8322890400886536, "learning_rate": 1.0603773584905662e-05, "loss": 1.4144, "mean_token_accuracy": 0.6694712340831757, "num_tokens": 263360.0, "step": 260 }, { "entropy": 1.59023619890213, "epoch": 0.4911955514365153, "grad_norm": 0.7772918939590454, "learning_rate": 1.0415094339622642e-05, "loss": 1.3723, "mean_token_accuracy": 0.6685677945613862, "num_tokens": 269174.0, "step": 265 }, { "entropy": 1.5507931351661681, "epoch": 0.5004633920296571, "grad_norm": 0.8000075221061707, "learning_rate": 1.0226415094339623e-05, "loss": 1.3126, "mean_token_accuracy": 0.6843527674674987, "num_tokens": 274595.0, "step": 270 }, { "entropy": 1.5491173028945924, "epoch": 0.5097312326227988, "grad_norm": 0.8983348608016968, "learning_rate": 1.0037735849056605e-05, "loss": 1.369, "mean_token_accuracy": 0.6823632538318634, "num_tokens": 279472.0, "step": 275 }, { "entropy": 1.411498475074768, "epoch": 0.5189990732159406, "grad_norm": 0.891360878944397, "learning_rate": 9.849056603773586e-06, "loss": 1.1676, "mean_token_accuracy": 0.7068820059299469, "num_tokens": 283955.0, "step": 280 }, { "entropy": 1.5002652764320374, "epoch": 0.5282669138090825, "grad_norm": 0.9551361799240112, "learning_rate": 9.660377358490568e-06, "loss": 1.2826, "mean_token_accuracy": 0.6885687828063964, "num_tokens": 288166.0, "step": 285 }, { "entropy": 1.4297375559806824, "epoch": 0.5375347544022243, "grad_norm": 1.0993260145187378, "learning_rate": 9.471698113207548e-06, "loss": 1.2081, "mean_token_accuracy": 0.7039083421230317, "num_tokens": 292143.0, "step": 290 }, { "entropy": 1.5504101514816284, "epoch": 0.5468025949953661, "grad_norm": 1.0684620141983032, "learning_rate": 9.283018867924529e-06, "loss": 1.3632, "mean_token_accuracy": 0.6732664227485656, "num_tokens": 295795.0, "step": 295 }, { "entropy": 1.4705226182937623, "epoch": 0.5560704355885079, "grad_norm": 2.1173923015594482, "learning_rate": 9.09433962264151e-06, "loss": 1.4014, "mean_token_accuracy": 0.6959770500659943, "num_tokens": 297656.0, "step": 300 }, { "entropy": 1.4929959535598756, "epoch": 0.5653382761816497, "grad_norm": 1.2770863771438599, "learning_rate": 8.905660377358491e-06, "loss": 1.414, "mean_token_accuracy": 0.6737047851085662, "num_tokens": 307896.0, "step": 305 }, { "entropy": 1.4795246124267578, "epoch": 0.5746061167747915, "grad_norm": 0.8402530550956726, "learning_rate": 8.716981132075473e-06, "loss": 1.3673, "mean_token_accuracy": 0.6673755586147309, "num_tokens": 314996.0, "step": 310 }, { "entropy": 1.5327817797660828, "epoch": 0.5838739573679332, "grad_norm": 0.9122950434684753, "learning_rate": 8.528301886792454e-06, "loss": 1.3569, "mean_token_accuracy": 0.6737642705440521, "num_tokens": 320535.0, "step": 315 }, { "entropy": 1.4931538462638856, "epoch": 0.593141797961075, "grad_norm": 0.8541343808174133, "learning_rate": 8.339622641509434e-06, "loss": 1.2924, "mean_token_accuracy": 0.6892772674560547, "num_tokens": 325543.0, "step": 320 }, { "entropy": 1.4055845737457275, "epoch": 0.6024096385542169, "grad_norm": 1.0335566997528076, "learning_rate": 8.150943396226417e-06, "loss": 1.2071, "mean_token_accuracy": 0.7009121060371399, "num_tokens": 330160.0, "step": 325 }, { "entropy": 1.3517128109931946, "epoch": 0.6116774791473587, "grad_norm": 0.9749907851219177, "learning_rate": 7.962264150943397e-06, "loss": 1.1001, "mean_token_accuracy": 0.7244792997837066, "num_tokens": 334517.0, "step": 330 }, { "entropy": 1.511979877948761, "epoch": 0.6209453197405005, "grad_norm": 1.2967917919158936, "learning_rate": 7.77358490566038e-06, "loss": 1.3045, "mean_token_accuracy": 0.6831347227096558, "num_tokens": 338603.0, "step": 335 }, { "entropy": 1.4577907562255858, "epoch": 0.6302131603336423, "grad_norm": 0.9886659383773804, "learning_rate": 7.58490566037736e-06, "loss": 1.2057, "mean_token_accuracy": 0.7024867594242096, "num_tokens": 342434.0, "step": 340 }, { "entropy": 1.368123424053192, "epoch": 0.6394810009267841, "grad_norm": 1.8563096523284912, "learning_rate": 7.396226415094339e-06, "loss": 1.2631, "mean_token_accuracy": 0.7030794739723205, "num_tokens": 345119.0, "step": 345 }, { "entropy": 1.550855565071106, "epoch": 0.6487488415199258, "grad_norm": 1.7384581565856934, "learning_rate": 7.207547169811321e-06, "loss": 1.5237, "mean_token_accuracy": 0.6707696557044983, "num_tokens": 346702.0, "step": 350 }, { "entropy": 1.4958812713623046, "epoch": 0.6580166821130676, "grad_norm": 0.7086682915687561, "learning_rate": 7.018867924528302e-06, "loss": 1.4053, "mean_token_accuracy": 0.6752688169479371, "num_tokens": 356942.0, "step": 355 }, { "entropy": 1.558197546005249, "epoch": 0.6672845227062094, "grad_norm": 0.7285569310188293, "learning_rate": 6.830188679245283e-06, "loss": 1.4605, "mean_token_accuracy": 0.6629868388175965, "num_tokens": 366408.0, "step": 360 }, { "entropy": 1.6734450340270997, "epoch": 0.6765523632993512, "grad_norm": 0.8609201908111572, "learning_rate": 6.641509433962265e-06, "loss": 1.5822, "mean_token_accuracy": 0.6437154173851013, "num_tokens": 373516.0, "step": 365 }, { "entropy": 1.5121603965759278, "epoch": 0.6858202038924931, "grad_norm": 0.8602127432823181, "learning_rate": 6.452830188679245e-06, "loss": 1.3354, "mean_token_accuracy": 0.6697323322296143, "num_tokens": 379078.0, "step": 370 }, { "entropy": 1.5671154141426087, "epoch": 0.6950880444856349, "grad_norm": 1.0804234743118286, "learning_rate": 6.2641509433962265e-06, "loss": 1.3706, "mean_token_accuracy": 0.6781690716743469, "num_tokens": 384082.0, "step": 375 }, { "entropy": 1.4360106825828551, "epoch": 0.7043558850787767, "grad_norm": 1.0533781051635742, "learning_rate": 6.075471698113208e-06, "loss": 1.2339, "mean_token_accuracy": 0.6931209981441497, "num_tokens": 388701.0, "step": 380 }, { "entropy": 1.3446730136871339, "epoch": 0.7136237256719185, "grad_norm": 1.0057952404022217, "learning_rate": 5.886792452830189e-06, "loss": 1.1451, "mean_token_accuracy": 0.7143323004245759, "num_tokens": 393032.0, "step": 385 }, { "entropy": 1.3606623888015748, "epoch": 0.7228915662650602, "grad_norm": 0.9760032296180725, "learning_rate": 5.6981132075471704e-06, "loss": 1.1359, "mean_token_accuracy": 0.7167974233627319, "num_tokens": 397054.0, "step": 390 }, { "entropy": 1.3662820339202881, "epoch": 0.732159406858202, "grad_norm": 1.250963807106018, "learning_rate": 5.509433962264151e-06, "loss": 1.1525, "mean_token_accuracy": 0.7196339964866638, "num_tokens": 400475.0, "step": 395 }, { "entropy": 1.5162119686603546, "epoch": 0.7414272474513438, "grad_norm": 2.1900084018707275, "learning_rate": 5.320754716981132e-06, "loss": 1.4817, "mean_token_accuracy": 0.6947880864143372, "num_tokens": 402190.0, "step": 400 }, { "entropy": 1.4912107944488526, "epoch": 0.7506950880444856, "grad_norm": 0.6602088809013367, "learning_rate": 5.1320754716981136e-06, "loss": 1.3781, "mean_token_accuracy": 0.6775171160697937, "num_tokens": 412430.0, "step": 405 }, { "entropy": 1.677505886554718, "epoch": 0.7599629286376274, "grad_norm": 0.8567835688591003, "learning_rate": 4.943396226415095e-06, "loss": 1.5629, "mean_token_accuracy": 0.638292646408081, "num_tokens": 419893.0, "step": 410 }, { "entropy": 1.4946231245994568, "epoch": 0.7692307692307693, "grad_norm": 0.9445057511329651, "learning_rate": 4.754716981132076e-06, "loss": 1.3326, "mean_token_accuracy": 0.6779886364936829, "num_tokens": 425619.0, "step": 415 }, { "entropy": 1.3864770650863647, "epoch": 0.7784986098239111, "grad_norm": 0.8252947926521301, "learning_rate": 4.566037735849057e-06, "loss": 1.2169, "mean_token_accuracy": 0.7035700976848602, "num_tokens": 430797.0, "step": 420 }, { "entropy": 1.5413469910621642, "epoch": 0.7877664504170528, "grad_norm": 1.1849970817565918, "learning_rate": 4.377358490566038e-06, "loss": 1.3547, "mean_token_accuracy": 0.6810157537460327, "num_tokens": 435596.0, "step": 425 }, { "entropy": 1.3699937224388123, "epoch": 0.7970342910101946, "grad_norm": 0.877142608165741, "learning_rate": 4.188679245283019e-06, "loss": 1.1797, "mean_token_accuracy": 0.7002040147781372, "num_tokens": 440241.0, "step": 430 }, { "entropy": 1.391974401473999, "epoch": 0.8063021316033364, "grad_norm": 1.0411101579666138, "learning_rate": 4.000000000000001e-06, "loss": 1.1623, "mean_token_accuracy": 0.7032719731330872, "num_tokens": 444603.0, "step": 435 }, { "entropy": 1.5593681573867797, "epoch": 0.8155699721964782, "grad_norm": 1.07487154006958, "learning_rate": 3.8113207547169816e-06, "loss": 1.3711, "mean_token_accuracy": 0.6782773613929749, "num_tokens": 448697.0, "step": 440 }, { "entropy": 1.3416823267936706, "epoch": 0.82483781278962, "grad_norm": 1.0888190269470215, "learning_rate": 3.6226415094339625e-06, "loss": 1.1324, "mean_token_accuracy": 0.7193882942199707, "num_tokens": 452317.0, "step": 445 }, { "entropy": 1.5021097183227539, "epoch": 0.8341056533827618, "grad_norm": 2.1909356117248535, "learning_rate": 3.4339622641509434e-06, "loss": 1.4446, "mean_token_accuracy": 0.6848468244075775, "num_tokens": 453964.0, "step": 450 }, { "entropy": 1.5324198842048644, "epoch": 0.8433734939759037, "grad_norm": 0.7571365833282471, "learning_rate": 3.2452830188679247e-06, "loss": 1.4131, "mean_token_accuracy": 0.6636363625526428, "num_tokens": 464204.0, "step": 455 }, { "entropy": 1.3785831093788148, "epoch": 0.8526413345690455, "grad_norm": 0.7701159119606018, "learning_rate": 3.0566037735849056e-06, "loss": 1.2441, "mean_token_accuracy": 0.6936827838420868, "num_tokens": 473089.0, "step": 460 }, { "entropy": 1.4654639124870301, "epoch": 0.8619091751621872, "grad_norm": 0.9103213548660278, "learning_rate": 2.867924528301887e-06, "loss": 1.3014, "mean_token_accuracy": 0.685904186964035, "num_tokens": 479410.0, "step": 465 }, { "entropy": 1.533838427066803, "epoch": 0.871177015755329, "grad_norm": 0.8901606202125549, "learning_rate": 2.6792452830188682e-06, "loss": 1.3928, "mean_token_accuracy": 0.6758940577507019, "num_tokens": 485110.0, "step": 470 }, { "entropy": 1.3798070430755616, "epoch": 0.8804448563484708, "grad_norm": 1.0601820945739746, "learning_rate": 2.490566037735849e-06, "loss": 1.1907, "mean_token_accuracy": 0.7054125189781189, "num_tokens": 490180.0, "step": 475 }, { "entropy": 1.4131479620933534, "epoch": 0.8897126969416126, "grad_norm": 0.8970419764518738, "learning_rate": 2.3018867924528305e-06, "loss": 1.1863, "mean_token_accuracy": 0.7067974150180817, "num_tokens": 494890.0, "step": 480 }, { "entropy": 1.4630970120429994, "epoch": 0.8989805375347544, "grad_norm": 1.003049373626709, "learning_rate": 2.1132075471698114e-06, "loss": 1.2327, "mean_token_accuracy": 0.6967244625091553, "num_tokens": 499349.0, "step": 485 }, { "entropy": 1.3104987263679504, "epoch": 0.9082483781278962, "grad_norm": 1.0828076601028442, "learning_rate": 1.9245283018867927e-06, "loss": 1.1085, "mean_token_accuracy": 0.7211176335811615, "num_tokens": 503600.0, "step": 490 }, { "entropy": 1.4279333114624024, "epoch": 0.917516218721038, "grad_norm": 1.2222362756729126, "learning_rate": 1.7358490566037736e-06, "loss": 1.2275, "mean_token_accuracy": 0.703647392988205, "num_tokens": 507413.0, "step": 495 }, { "entropy": 1.4541478991508483, "epoch": 0.9267840593141798, "grad_norm": 2.478383779525757, "learning_rate": 1.5471698113207547e-06, "loss": 1.433, "mean_token_accuracy": 0.7055663108825684, "num_tokens": 508911.0, "step": 500 }, { "entropy": 1.504941475391388, "epoch": 0.9360518999073216, "grad_norm": 0.7645187973976135, "learning_rate": 1.358490566037736e-06, "loss": 1.3723, "mean_token_accuracy": 0.6745567083358764, "num_tokens": 518390.0, "step": 505 }, { "entropy": 1.574069583415985, "epoch": 0.9453197405004634, "grad_norm": 0.9938948750495911, "learning_rate": 1.1698113207547171e-06, "loss": 1.4143, "mean_token_accuracy": 0.6633340060710907, "num_tokens": 524489.0, "step": 510 }, { "entropy": 1.3694233775138855, "epoch": 0.9545875810936052, "grad_norm": 0.9658361673355103, "learning_rate": 9.811320754716983e-07, "loss": 1.1829, "mean_token_accuracy": 0.7040388941764831, "num_tokens": 529543.0, "step": 515 }, { "entropy": 1.41559841632843, "epoch": 0.963855421686747, "grad_norm": 0.9936702251434326, "learning_rate": 7.924528301886793e-07, "loss": 1.2234, "mean_token_accuracy": 0.7022507786750793, "num_tokens": 534072.0, "step": 520 }, { "entropy": 1.391177773475647, "epoch": 0.9731232622798888, "grad_norm": 0.959622323513031, "learning_rate": 6.037735849056605e-07, "loss": 1.2196, "mean_token_accuracy": 0.6922858953475952, "num_tokens": 538321.0, "step": 525 }, { "entropy": 1.3745897650718688, "epoch": 0.9823911028730306, "grad_norm": 1.3114018440246582, "learning_rate": 4.1509433962264154e-07, "loss": 1.1473, "mean_token_accuracy": 0.7134360671043396, "num_tokens": 542222.0, "step": 530 }, { "entropy": 1.492677342891693, "epoch": 0.9916589434661723, "grad_norm": 1.9476492404937744, "learning_rate": 2.2641509433962265e-07, "loss": 1.3013, "mean_token_accuracy": 0.6977547407150269, "num_tokens": 545236.0, "step": 535 }, { "entropy": 1.4785194396972656, "epoch": 1.0, "grad_norm": 3.7451815605163574, "learning_rate": 3.773584905660378e-08, "loss": 1.4539, "mean_token_accuracy": 0.69340937005149, "num_tokens": 546641.0, "step": 540 } ], "logging_steps": 5, "max_steps": 540, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4637351756455936e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }