| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 108, | |
| "global_step": 540, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.7017320156097413, | |
| "epoch": 0.009267840593141797, | |
| "grad_norm": 0.4929618835449219, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.8663, | |
| "mean_token_accuracy": 0.6088488936424256, | |
| "num_tokens": 10169.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.6203628540039063, | |
| "epoch": 0.018535681186283594, | |
| "grad_norm": 0.5699822306632996, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.8139, | |
| "mean_token_accuracy": 0.613596773147583, | |
| "num_tokens": 17492.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.6117817282676696, | |
| "epoch": 0.027803521779425393, | |
| "grad_norm": 0.5831968784332275, | |
| "learning_rate": 1.9849056603773588e-05, | |
| "loss": 1.9287, | |
| "mean_token_accuracy": 0.6028814196586609, | |
| "num_tokens": 23104.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 1.5564811944961547, | |
| "epoch": 0.03707136237256719, | |
| "grad_norm": 0.6044633388519287, | |
| "learning_rate": 1.9660377358490567e-05, | |
| "loss": 1.8979, | |
| "mean_token_accuracy": 0.6022705733776093, | |
| "num_tokens": 28237.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.5479681015014648, | |
| "epoch": 0.04633920296570899, | |
| "grad_norm": 0.5943928956985474, | |
| "learning_rate": 1.947169811320755e-05, | |
| "loss": 1.8733, | |
| "mean_token_accuracy": 0.6153187394142151, | |
| "num_tokens": 33029.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 1.646817970275879, | |
| "epoch": 0.05560704355885079, | |
| "grad_norm": 0.6993905901908875, | |
| "learning_rate": 1.928301886792453e-05, | |
| "loss": 1.8784, | |
| "mean_token_accuracy": 0.6003017485141754, | |
| "num_tokens": 37369.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.650176739692688, | |
| "epoch": 0.06487488415199258, | |
| "grad_norm": 0.786679744720459, | |
| "learning_rate": 1.909433962264151e-05, | |
| "loss": 1.8796, | |
| "mean_token_accuracy": 0.6147689402103425, | |
| "num_tokens": 41484.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 1.590287184715271, | |
| "epoch": 0.07414272474513438, | |
| "grad_norm": 0.770859956741333, | |
| "learning_rate": 1.8905660377358492e-05, | |
| "loss": 1.817, | |
| "mean_token_accuracy": 0.6139472305774689, | |
| "num_tokens": 45350.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.8836397886276246, | |
| "epoch": 0.08341056533827618, | |
| "grad_norm": 1.3446482419967651, | |
| "learning_rate": 1.8716981132075474e-05, | |
| "loss": 2.2667, | |
| "mean_token_accuracy": 0.5831025898456573, | |
| "num_tokens": 47900.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 1.914563238620758, | |
| "epoch": 0.09267840593141798, | |
| "grad_norm": 2.0144922733306885, | |
| "learning_rate": 1.8528301886792453e-05, | |
| "loss": 2.3988, | |
| "mean_token_accuracy": 0.5866762459278106, | |
| "num_tokens": 49421.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.753426432609558, | |
| "epoch": 0.10194624652455977, | |
| "grad_norm": 0.36396369338035583, | |
| "learning_rate": 1.8339622641509435e-05, | |
| "loss": 1.6739, | |
| "mean_token_accuracy": 0.634347426891327, | |
| "num_tokens": 59290.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 1.7417057394981383, | |
| "epoch": 0.11121408711770157, | |
| "grad_norm": 0.557318389415741, | |
| "learning_rate": 1.8150943396226417e-05, | |
| "loss": 1.643, | |
| "mean_token_accuracy": 0.6404906511306763, | |
| "num_tokens": 66072.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.6922197580337524, | |
| "epoch": 0.12048192771084337, | |
| "grad_norm": 0.5784375667572021, | |
| "learning_rate": 1.79622641509434e-05, | |
| "loss": 1.546, | |
| "mean_token_accuracy": 0.6431969463825226, | |
| "num_tokens": 71590.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 1.7081309676170349, | |
| "epoch": 0.12974976830398516, | |
| "grad_norm": 0.6285673379898071, | |
| "learning_rate": 1.777358490566038e-05, | |
| "loss": 1.5738, | |
| "mean_token_accuracy": 0.6408641755580902, | |
| "num_tokens": 76620.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.706821823120117, | |
| "epoch": 0.13901760889712697, | |
| "grad_norm": 0.6793721914291382, | |
| "learning_rate": 1.758490566037736e-05, | |
| "loss": 1.5597, | |
| "mean_token_accuracy": 0.6496909260749817, | |
| "num_tokens": 81367.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 1.729298961162567, | |
| "epoch": 0.14828544949026876, | |
| "grad_norm": 0.7073204517364502, | |
| "learning_rate": 1.7396226415094343e-05, | |
| "loss": 1.5936, | |
| "mean_token_accuracy": 0.646106606721878, | |
| "num_tokens": 85829.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.764689528942108, | |
| "epoch": 0.15755329008341057, | |
| "grad_norm": 0.8073896765708923, | |
| "learning_rate": 1.720754716981132e-05, | |
| "loss": 1.5737, | |
| "mean_token_accuracy": 0.6494780361652375, | |
| "num_tokens": 89983.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 1.6753356814384461, | |
| "epoch": 0.16682113067655235, | |
| "grad_norm": 0.7738911509513855, | |
| "learning_rate": 1.7018867924528304e-05, | |
| "loss": 1.4815, | |
| "mean_token_accuracy": 0.6729483246803284, | |
| "num_tokens": 93855.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.9390615344047546, | |
| "epoch": 0.17608897126969417, | |
| "grad_norm": 1.312859058380127, | |
| "learning_rate": 1.6830188679245286e-05, | |
| "loss": 1.8817, | |
| "mean_token_accuracy": 0.6210294425487518, | |
| "num_tokens": 96341.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 1.9929674506187438, | |
| "epoch": 0.18535681186283595, | |
| "grad_norm": 1.6688119173049927, | |
| "learning_rate": 1.6641509433962265e-05, | |
| "loss": 1.9392, | |
| "mean_token_accuracy": 0.6368482947349549, | |
| "num_tokens": 97810.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.712046504020691, | |
| "epoch": 0.19462465245597776, | |
| "grad_norm": 0.4342072010040283, | |
| "learning_rate": 1.6452830188679247e-05, | |
| "loss": 1.5909, | |
| "mean_token_accuracy": 0.6414119660854339, | |
| "num_tokens": 107589.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 1.7470085620880127, | |
| "epoch": 0.20389249304911955, | |
| "grad_norm": 0.5741161704063416, | |
| "learning_rate": 1.626415094339623e-05, | |
| "loss": 1.5711, | |
| "mean_token_accuracy": 0.6456838071346283, | |
| "num_tokens": 114565.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.5347474694252015, | |
| "epoch": 0.21316033364226136, | |
| "grad_norm": 0.6536217927932739, | |
| "learning_rate": 1.607547169811321e-05, | |
| "loss": 1.3867, | |
| "mean_token_accuracy": 0.6677907109260559, | |
| "num_tokens": 120099.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 1.5956430912017823, | |
| "epoch": 0.22242817423540315, | |
| "grad_norm": 0.6675344705581665, | |
| "learning_rate": 1.588679245283019e-05, | |
| "loss": 1.4253, | |
| "mean_token_accuracy": 0.6632845401763916, | |
| "num_tokens": 125112.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.5792277336120606, | |
| "epoch": 0.23169601482854496, | |
| "grad_norm": 0.7069701552391052, | |
| "learning_rate": 1.5698113207547172e-05, | |
| "loss": 1.3823, | |
| "mean_token_accuracy": 0.6681795418262482, | |
| "num_tokens": 129723.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 1.5756880044937134, | |
| "epoch": 0.24096385542168675, | |
| "grad_norm": 0.7337709069252014, | |
| "learning_rate": 1.5509433962264154e-05, | |
| "loss": 1.3491, | |
| "mean_token_accuracy": 0.6784022688865662, | |
| "num_tokens": 134105.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.6404427766799927, | |
| "epoch": 0.25023169601482853, | |
| "grad_norm": 0.7882033586502075, | |
| "learning_rate": 1.5320754716981133e-05, | |
| "loss": 1.4448, | |
| "mean_token_accuracy": 0.6615463674068451, | |
| "num_tokens": 138249.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 1.5251396298408508, | |
| "epoch": 0.2594995366079703, | |
| "grad_norm": 0.9358009099960327, | |
| "learning_rate": 1.5132075471698115e-05, | |
| "loss": 1.3161, | |
| "mean_token_accuracy": 0.6832186877727509, | |
| "num_tokens": 141935.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.6210530638694762, | |
| "epoch": 0.26876737720111216, | |
| "grad_norm": 1.2580927610397339, | |
| "learning_rate": 1.4943396226415094e-05, | |
| "loss": 1.5166, | |
| "mean_token_accuracy": 0.6789897084236145, | |
| "num_tokens": 144364.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 1.5357141017913818, | |
| "epoch": 0.27803521779425394, | |
| "grad_norm": 1.4969000816345215, | |
| "learning_rate": 1.4754716981132076e-05, | |
| "loss": 1.5769, | |
| "mean_token_accuracy": 0.7000365734100342, | |
| "num_tokens": 145928.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.4059158325195313, | |
| "epoch": 0.2873030583873957, | |
| "grad_norm": 0.5834245681762695, | |
| "learning_rate": 1.4566037735849057e-05, | |
| "loss": 1.3117, | |
| "mean_token_accuracy": 0.6917888641357421, | |
| "num_tokens": 156168.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 1.4503844499588012, | |
| "epoch": 0.2965708989805375, | |
| "grad_norm": 0.5249871611595154, | |
| "learning_rate": 1.4377358490566037e-05, | |
| "loss": 1.3086, | |
| "mean_token_accuracy": 0.692787104845047, | |
| "num_tokens": 165610.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.5506718158721924, | |
| "epoch": 0.30583873957367935, | |
| "grad_norm": 0.6383576393127441, | |
| "learning_rate": 1.418867924528302e-05, | |
| "loss": 1.3519, | |
| "mean_token_accuracy": 0.6676154375076294, | |
| "num_tokens": 171787.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 1.5482367396354675, | |
| "epoch": 0.31510658016682114, | |
| "grad_norm": 0.8127756714820862, | |
| "learning_rate": 1.4e-05, | |
| "loss": 1.3004, | |
| "mean_token_accuracy": 0.6841661810874939, | |
| "num_tokens": 177223.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.5198933720588683, | |
| "epoch": 0.3243744207599629, | |
| "grad_norm": 0.7491681575775146, | |
| "learning_rate": 1.3811320754716982e-05, | |
| "loss": 1.3046, | |
| "mean_token_accuracy": 0.684827846288681, | |
| "num_tokens": 182283.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 1.4806129813194275, | |
| "epoch": 0.3336422613531047, | |
| "grad_norm": 0.7199849486351013, | |
| "learning_rate": 1.3622641509433962e-05, | |
| "loss": 1.3289, | |
| "mean_token_accuracy": 0.6813067197799683, | |
| "num_tokens": 187001.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.4494765639305114, | |
| "epoch": 0.34291010194624655, | |
| "grad_norm": 0.842717707157135, | |
| "learning_rate": 1.3433962264150943e-05, | |
| "loss": 1.2323, | |
| "mean_token_accuracy": 0.7035281479358673, | |
| "num_tokens": 191394.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 1.4810110807418824, | |
| "epoch": 0.35217794253938833, | |
| "grad_norm": 0.9301387667655945, | |
| "learning_rate": 1.3245283018867925e-05, | |
| "loss": 1.2773, | |
| "mean_token_accuracy": 0.6910167336463928, | |
| "num_tokens": 195399.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.5017635345458984, | |
| "epoch": 0.3614457831325301, | |
| "grad_norm": 0.923757791519165, | |
| "learning_rate": 1.3056603773584906e-05, | |
| "loss": 1.3787, | |
| "mean_token_accuracy": 0.6846955835819244, | |
| "num_tokens": 198934.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 1.6470162987709045, | |
| "epoch": 0.3707136237256719, | |
| "grad_norm": 1.4323452711105347, | |
| "learning_rate": 1.2867924528301888e-05, | |
| "loss": 1.6467, | |
| "mean_token_accuracy": 0.6908825635910034, | |
| "num_tokens": 200588.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.6742107629776002, | |
| "epoch": 0.3799814643188137, | |
| "grad_norm": 0.6032175421714783, | |
| "learning_rate": 1.2679245283018868e-05, | |
| "loss": 1.5514, | |
| "mean_token_accuracy": 0.647225683927536, | |
| "num_tokens": 210039.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 1.381903338432312, | |
| "epoch": 0.38924930491195553, | |
| "grad_norm": 0.6865923404693604, | |
| "learning_rate": 1.2490566037735849e-05, | |
| "loss": 1.1837, | |
| "mean_token_accuracy": 0.7007428467273712, | |
| "num_tokens": 216062.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.5497890710830688, | |
| "epoch": 0.3985171455050973, | |
| "grad_norm": 0.7516324520111084, | |
| "learning_rate": 1.2301886792452831e-05, | |
| "loss": 1.3396, | |
| "mean_token_accuracy": 0.679390799999237, | |
| "num_tokens": 221238.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 1.629994511604309, | |
| "epoch": 0.4077849860982391, | |
| "grad_norm": 0.8432377576828003, | |
| "learning_rate": 1.2113207547169811e-05, | |
| "loss": 1.4085, | |
| "mean_token_accuracy": 0.6734645128250122, | |
| "num_tokens": 226114.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.4988724350929261, | |
| "epoch": 0.4170528266913809, | |
| "grad_norm": 0.7744415998458862, | |
| "learning_rate": 1.1924528301886794e-05, | |
| "loss": 1.2606, | |
| "mean_token_accuracy": 0.690128743648529, | |
| "num_tokens": 230722.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.6341279029846192, | |
| "epoch": 0.4263206672845227, | |
| "grad_norm": 0.9293099641799927, | |
| "learning_rate": 1.1735849056603774e-05, | |
| "loss": 1.3895, | |
| "mean_token_accuracy": 0.6756053507328034, | |
| "num_tokens": 235030.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.4942476391792296, | |
| "epoch": 0.4355885078776645, | |
| "grad_norm": 1.097900629043579, | |
| "learning_rate": 1.1547169811320756e-05, | |
| "loss": 1.2779, | |
| "mean_token_accuracy": 0.6963155150413514, | |
| "num_tokens": 239053.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.4344226121902466, | |
| "epoch": 0.4448563484708063, | |
| "grad_norm": 0.9284445643424988, | |
| "learning_rate": 1.1358490566037737e-05, | |
| "loss": 1.2624, | |
| "mean_token_accuracy": 0.6946760237216949, | |
| "num_tokens": 242797.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.6279133677482605, | |
| "epoch": 0.4541241890639481, | |
| "grad_norm": 1.8525234460830688, | |
| "learning_rate": 1.1169811320754717e-05, | |
| "loss": 1.5141, | |
| "mean_token_accuracy": 0.6641450226306915, | |
| "num_tokens": 245262.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.5635493040084838, | |
| "epoch": 0.4633920296570899, | |
| "grad_norm": 1.4725935459136963, | |
| "learning_rate": 1.09811320754717e-05, | |
| "loss": 1.5592, | |
| "mean_token_accuracy": 0.6816279590129852, | |
| "num_tokens": 246825.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.5720198631286622, | |
| "epoch": 0.4726598702502317, | |
| "grad_norm": 0.9245171546936035, | |
| "learning_rate": 1.079245283018868e-05, | |
| "loss": 1.4606, | |
| "mean_token_accuracy": 0.6601259410381317, | |
| "num_tokens": 256931.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 1.5677057027816772, | |
| "epoch": 0.4819277108433735, | |
| "grad_norm": 0.8322890400886536, | |
| "learning_rate": 1.0603773584905662e-05, | |
| "loss": 1.4144, | |
| "mean_token_accuracy": 0.6694712340831757, | |
| "num_tokens": 263360.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.59023619890213, | |
| "epoch": 0.4911955514365153, | |
| "grad_norm": 0.7772918939590454, | |
| "learning_rate": 1.0415094339622642e-05, | |
| "loss": 1.3723, | |
| "mean_token_accuracy": 0.6685677945613862, | |
| "num_tokens": 269174.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 1.5507931351661681, | |
| "epoch": 0.5004633920296571, | |
| "grad_norm": 0.8000075221061707, | |
| "learning_rate": 1.0226415094339623e-05, | |
| "loss": 1.3126, | |
| "mean_token_accuracy": 0.6843527674674987, | |
| "num_tokens": 274595.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.5491173028945924, | |
| "epoch": 0.5097312326227988, | |
| "grad_norm": 0.8983348608016968, | |
| "learning_rate": 1.0037735849056605e-05, | |
| "loss": 1.369, | |
| "mean_token_accuracy": 0.6823632538318634, | |
| "num_tokens": 279472.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 1.411498475074768, | |
| "epoch": 0.5189990732159406, | |
| "grad_norm": 0.891360878944397, | |
| "learning_rate": 9.849056603773586e-06, | |
| "loss": 1.1676, | |
| "mean_token_accuracy": 0.7068820059299469, | |
| "num_tokens": 283955.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.5002652764320374, | |
| "epoch": 0.5282669138090825, | |
| "grad_norm": 0.9551361799240112, | |
| "learning_rate": 9.660377358490568e-06, | |
| "loss": 1.2826, | |
| "mean_token_accuracy": 0.6885687828063964, | |
| "num_tokens": 288166.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 1.4297375559806824, | |
| "epoch": 0.5375347544022243, | |
| "grad_norm": 1.0993260145187378, | |
| "learning_rate": 9.471698113207548e-06, | |
| "loss": 1.2081, | |
| "mean_token_accuracy": 0.7039083421230317, | |
| "num_tokens": 292143.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.5504101514816284, | |
| "epoch": 0.5468025949953661, | |
| "grad_norm": 1.0684620141983032, | |
| "learning_rate": 9.283018867924529e-06, | |
| "loss": 1.3632, | |
| "mean_token_accuracy": 0.6732664227485656, | |
| "num_tokens": 295795.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 1.4705226182937623, | |
| "epoch": 0.5560704355885079, | |
| "grad_norm": 2.1173923015594482, | |
| "learning_rate": 9.09433962264151e-06, | |
| "loss": 1.4014, | |
| "mean_token_accuracy": 0.6959770500659943, | |
| "num_tokens": 297656.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.4929959535598756, | |
| "epoch": 0.5653382761816497, | |
| "grad_norm": 1.2770863771438599, | |
| "learning_rate": 8.905660377358491e-06, | |
| "loss": 1.414, | |
| "mean_token_accuracy": 0.6737047851085662, | |
| "num_tokens": 307896.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 1.4795246124267578, | |
| "epoch": 0.5746061167747915, | |
| "grad_norm": 0.8402530550956726, | |
| "learning_rate": 8.716981132075473e-06, | |
| "loss": 1.3673, | |
| "mean_token_accuracy": 0.6673755586147309, | |
| "num_tokens": 314996.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.5327817797660828, | |
| "epoch": 0.5838739573679332, | |
| "grad_norm": 0.9122950434684753, | |
| "learning_rate": 8.528301886792454e-06, | |
| "loss": 1.3569, | |
| "mean_token_accuracy": 0.6737642705440521, | |
| "num_tokens": 320535.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 1.4931538462638856, | |
| "epoch": 0.593141797961075, | |
| "grad_norm": 0.8541343808174133, | |
| "learning_rate": 8.339622641509434e-06, | |
| "loss": 1.2924, | |
| "mean_token_accuracy": 0.6892772674560547, | |
| "num_tokens": 325543.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.4055845737457275, | |
| "epoch": 0.6024096385542169, | |
| "grad_norm": 1.0335566997528076, | |
| "learning_rate": 8.150943396226417e-06, | |
| "loss": 1.2071, | |
| "mean_token_accuracy": 0.7009121060371399, | |
| "num_tokens": 330160.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 1.3517128109931946, | |
| "epoch": 0.6116774791473587, | |
| "grad_norm": 0.9749907851219177, | |
| "learning_rate": 7.962264150943397e-06, | |
| "loss": 1.1001, | |
| "mean_token_accuracy": 0.7244792997837066, | |
| "num_tokens": 334517.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.511979877948761, | |
| "epoch": 0.6209453197405005, | |
| "grad_norm": 1.2967917919158936, | |
| "learning_rate": 7.77358490566038e-06, | |
| "loss": 1.3045, | |
| "mean_token_accuracy": 0.6831347227096558, | |
| "num_tokens": 338603.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 1.4577907562255858, | |
| "epoch": 0.6302131603336423, | |
| "grad_norm": 0.9886659383773804, | |
| "learning_rate": 7.58490566037736e-06, | |
| "loss": 1.2057, | |
| "mean_token_accuracy": 0.7024867594242096, | |
| "num_tokens": 342434.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.368123424053192, | |
| "epoch": 0.6394810009267841, | |
| "grad_norm": 1.8563096523284912, | |
| "learning_rate": 7.396226415094339e-06, | |
| "loss": 1.2631, | |
| "mean_token_accuracy": 0.7030794739723205, | |
| "num_tokens": 345119.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 1.550855565071106, | |
| "epoch": 0.6487488415199258, | |
| "grad_norm": 1.7384581565856934, | |
| "learning_rate": 7.207547169811321e-06, | |
| "loss": 1.5237, | |
| "mean_token_accuracy": 0.6707696557044983, | |
| "num_tokens": 346702.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.4958812713623046, | |
| "epoch": 0.6580166821130676, | |
| "grad_norm": 0.7086682915687561, | |
| "learning_rate": 7.018867924528302e-06, | |
| "loss": 1.4053, | |
| "mean_token_accuracy": 0.6752688169479371, | |
| "num_tokens": 356942.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 1.558197546005249, | |
| "epoch": 0.6672845227062094, | |
| "grad_norm": 0.7285569310188293, | |
| "learning_rate": 6.830188679245283e-06, | |
| "loss": 1.4605, | |
| "mean_token_accuracy": 0.6629868388175965, | |
| "num_tokens": 366408.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.6734450340270997, | |
| "epoch": 0.6765523632993512, | |
| "grad_norm": 0.8609201908111572, | |
| "learning_rate": 6.641509433962265e-06, | |
| "loss": 1.5822, | |
| "mean_token_accuracy": 0.6437154173851013, | |
| "num_tokens": 373516.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 1.5121603965759278, | |
| "epoch": 0.6858202038924931, | |
| "grad_norm": 0.8602127432823181, | |
| "learning_rate": 6.452830188679245e-06, | |
| "loss": 1.3354, | |
| "mean_token_accuracy": 0.6697323322296143, | |
| "num_tokens": 379078.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.5671154141426087, | |
| "epoch": 0.6950880444856349, | |
| "grad_norm": 1.0804234743118286, | |
| "learning_rate": 6.2641509433962265e-06, | |
| "loss": 1.3706, | |
| "mean_token_accuracy": 0.6781690716743469, | |
| "num_tokens": 384082.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 1.4360106825828551, | |
| "epoch": 0.7043558850787767, | |
| "grad_norm": 1.0533781051635742, | |
| "learning_rate": 6.075471698113208e-06, | |
| "loss": 1.2339, | |
| "mean_token_accuracy": 0.6931209981441497, | |
| "num_tokens": 388701.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.3446730136871339, | |
| "epoch": 0.7136237256719185, | |
| "grad_norm": 1.0057952404022217, | |
| "learning_rate": 5.886792452830189e-06, | |
| "loss": 1.1451, | |
| "mean_token_accuracy": 0.7143323004245759, | |
| "num_tokens": 393032.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 1.3606623888015748, | |
| "epoch": 0.7228915662650602, | |
| "grad_norm": 0.9760032296180725, | |
| "learning_rate": 5.6981132075471704e-06, | |
| "loss": 1.1359, | |
| "mean_token_accuracy": 0.7167974233627319, | |
| "num_tokens": 397054.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.3662820339202881, | |
| "epoch": 0.732159406858202, | |
| "grad_norm": 1.250963807106018, | |
| "learning_rate": 5.509433962264151e-06, | |
| "loss": 1.1525, | |
| "mean_token_accuracy": 0.7196339964866638, | |
| "num_tokens": 400475.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 1.5162119686603546, | |
| "epoch": 0.7414272474513438, | |
| "grad_norm": 2.1900084018707275, | |
| "learning_rate": 5.320754716981132e-06, | |
| "loss": 1.4817, | |
| "mean_token_accuracy": 0.6947880864143372, | |
| "num_tokens": 402190.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.4912107944488526, | |
| "epoch": 0.7506950880444856, | |
| "grad_norm": 0.6602088809013367, | |
| "learning_rate": 5.1320754716981136e-06, | |
| "loss": 1.3781, | |
| "mean_token_accuracy": 0.6775171160697937, | |
| "num_tokens": 412430.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 1.677505886554718, | |
| "epoch": 0.7599629286376274, | |
| "grad_norm": 0.8567835688591003, | |
| "learning_rate": 4.943396226415095e-06, | |
| "loss": 1.5629, | |
| "mean_token_accuracy": 0.638292646408081, | |
| "num_tokens": 419893.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.4946231245994568, | |
| "epoch": 0.7692307692307693, | |
| "grad_norm": 0.9445057511329651, | |
| "learning_rate": 4.754716981132076e-06, | |
| "loss": 1.3326, | |
| "mean_token_accuracy": 0.6779886364936829, | |
| "num_tokens": 425619.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 1.3864770650863647, | |
| "epoch": 0.7784986098239111, | |
| "grad_norm": 0.8252947926521301, | |
| "learning_rate": 4.566037735849057e-06, | |
| "loss": 1.2169, | |
| "mean_token_accuracy": 0.7035700976848602, | |
| "num_tokens": 430797.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.5413469910621642, | |
| "epoch": 0.7877664504170528, | |
| "grad_norm": 1.1849970817565918, | |
| "learning_rate": 4.377358490566038e-06, | |
| "loss": 1.3547, | |
| "mean_token_accuracy": 0.6810157537460327, | |
| "num_tokens": 435596.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 1.3699937224388123, | |
| "epoch": 0.7970342910101946, | |
| "grad_norm": 0.877142608165741, | |
| "learning_rate": 4.188679245283019e-06, | |
| "loss": 1.1797, | |
| "mean_token_accuracy": 0.7002040147781372, | |
| "num_tokens": 440241.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.391974401473999, | |
| "epoch": 0.8063021316033364, | |
| "grad_norm": 1.0411101579666138, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 1.1623, | |
| "mean_token_accuracy": 0.7032719731330872, | |
| "num_tokens": 444603.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 1.5593681573867797, | |
| "epoch": 0.8155699721964782, | |
| "grad_norm": 1.07487154006958, | |
| "learning_rate": 3.8113207547169816e-06, | |
| "loss": 1.3711, | |
| "mean_token_accuracy": 0.6782773613929749, | |
| "num_tokens": 448697.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.3416823267936706, | |
| "epoch": 0.82483781278962, | |
| "grad_norm": 1.0888190269470215, | |
| "learning_rate": 3.6226415094339625e-06, | |
| "loss": 1.1324, | |
| "mean_token_accuracy": 0.7193882942199707, | |
| "num_tokens": 452317.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 1.5021097183227539, | |
| "epoch": 0.8341056533827618, | |
| "grad_norm": 2.1909356117248535, | |
| "learning_rate": 3.4339622641509434e-06, | |
| "loss": 1.4446, | |
| "mean_token_accuracy": 0.6848468244075775, | |
| "num_tokens": 453964.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.5324198842048644, | |
| "epoch": 0.8433734939759037, | |
| "grad_norm": 0.7571365833282471, | |
| "learning_rate": 3.2452830188679247e-06, | |
| "loss": 1.4131, | |
| "mean_token_accuracy": 0.6636363625526428, | |
| "num_tokens": 464204.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 1.3785831093788148, | |
| "epoch": 0.8526413345690455, | |
| "grad_norm": 0.7701159119606018, | |
| "learning_rate": 3.0566037735849056e-06, | |
| "loss": 1.2441, | |
| "mean_token_accuracy": 0.6936827838420868, | |
| "num_tokens": 473089.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.4654639124870301, | |
| "epoch": 0.8619091751621872, | |
| "grad_norm": 0.9103213548660278, | |
| "learning_rate": 2.867924528301887e-06, | |
| "loss": 1.3014, | |
| "mean_token_accuracy": 0.685904186964035, | |
| "num_tokens": 479410.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 1.533838427066803, | |
| "epoch": 0.871177015755329, | |
| "grad_norm": 0.8901606202125549, | |
| "learning_rate": 2.6792452830188682e-06, | |
| "loss": 1.3928, | |
| "mean_token_accuracy": 0.6758940577507019, | |
| "num_tokens": 485110.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.3798070430755616, | |
| "epoch": 0.8804448563484708, | |
| "grad_norm": 1.0601820945739746, | |
| "learning_rate": 2.490566037735849e-06, | |
| "loss": 1.1907, | |
| "mean_token_accuracy": 0.7054125189781189, | |
| "num_tokens": 490180.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 1.4131479620933534, | |
| "epoch": 0.8897126969416126, | |
| "grad_norm": 0.8970419764518738, | |
| "learning_rate": 2.3018867924528305e-06, | |
| "loss": 1.1863, | |
| "mean_token_accuracy": 0.7067974150180817, | |
| "num_tokens": 494890.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.4630970120429994, | |
| "epoch": 0.8989805375347544, | |
| "grad_norm": 1.003049373626709, | |
| "learning_rate": 2.1132075471698114e-06, | |
| "loss": 1.2327, | |
| "mean_token_accuracy": 0.6967244625091553, | |
| "num_tokens": 499349.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 1.3104987263679504, | |
| "epoch": 0.9082483781278962, | |
| "grad_norm": 1.0828076601028442, | |
| "learning_rate": 1.9245283018867927e-06, | |
| "loss": 1.1085, | |
| "mean_token_accuracy": 0.7211176335811615, | |
| "num_tokens": 503600.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.4279333114624024, | |
| "epoch": 0.917516218721038, | |
| "grad_norm": 1.2222362756729126, | |
| "learning_rate": 1.7358490566037736e-06, | |
| "loss": 1.2275, | |
| "mean_token_accuracy": 0.703647392988205, | |
| "num_tokens": 507413.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 1.4541478991508483, | |
| "epoch": 0.9267840593141798, | |
| "grad_norm": 2.478383779525757, | |
| "learning_rate": 1.5471698113207547e-06, | |
| "loss": 1.433, | |
| "mean_token_accuracy": 0.7055663108825684, | |
| "num_tokens": 508911.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.504941475391388, | |
| "epoch": 0.9360518999073216, | |
| "grad_norm": 0.7645187973976135, | |
| "learning_rate": 1.358490566037736e-06, | |
| "loss": 1.3723, | |
| "mean_token_accuracy": 0.6745567083358764, | |
| "num_tokens": 518390.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 1.574069583415985, | |
| "epoch": 0.9453197405004634, | |
| "grad_norm": 0.9938948750495911, | |
| "learning_rate": 1.1698113207547171e-06, | |
| "loss": 1.4143, | |
| "mean_token_accuracy": 0.6633340060710907, | |
| "num_tokens": 524489.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.3694233775138855, | |
| "epoch": 0.9545875810936052, | |
| "grad_norm": 0.9658361673355103, | |
| "learning_rate": 9.811320754716983e-07, | |
| "loss": 1.1829, | |
| "mean_token_accuracy": 0.7040388941764831, | |
| "num_tokens": 529543.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 1.41559841632843, | |
| "epoch": 0.963855421686747, | |
| "grad_norm": 0.9936702251434326, | |
| "learning_rate": 7.924528301886793e-07, | |
| "loss": 1.2234, | |
| "mean_token_accuracy": 0.7022507786750793, | |
| "num_tokens": 534072.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.391177773475647, | |
| "epoch": 0.9731232622798888, | |
| "grad_norm": 0.959622323513031, | |
| "learning_rate": 6.037735849056605e-07, | |
| "loss": 1.2196, | |
| "mean_token_accuracy": 0.6922858953475952, | |
| "num_tokens": 538321.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 1.3745897650718688, | |
| "epoch": 0.9823911028730306, | |
| "grad_norm": 1.3114018440246582, | |
| "learning_rate": 4.1509433962264154e-07, | |
| "loss": 1.1473, | |
| "mean_token_accuracy": 0.7134360671043396, | |
| "num_tokens": 542222.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.492677342891693, | |
| "epoch": 0.9916589434661723, | |
| "grad_norm": 1.9476492404937744, | |
| "learning_rate": 2.2641509433962265e-07, | |
| "loss": 1.3013, | |
| "mean_token_accuracy": 0.6977547407150269, | |
| "num_tokens": 545236.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 1.4785194396972656, | |
| "epoch": 1.0, | |
| "grad_norm": 3.7451815605163574, | |
| "learning_rate": 3.773584905660378e-08, | |
| "loss": 1.4539, | |
| "mean_token_accuracy": 0.69340937005149, | |
| "num_tokens": 546641.0, | |
| "step": 540 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 540, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.4637351756455936e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |