product_des / trainer_state.json
nt-van-khanh's picture
Upload folder using huggingface_hub
2fcd150 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 108,
"global_step": 540,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.7017320156097413,
"epoch": 0.009267840593141797,
"grad_norm": 0.4929618835449219,
"learning_rate": 8.000000000000001e-06,
"loss": 1.8663,
"mean_token_accuracy": 0.6088488936424256,
"num_tokens": 10169.0,
"step": 5
},
{
"entropy": 1.6203628540039063,
"epoch": 0.018535681186283594,
"grad_norm": 0.5699822306632996,
"learning_rate": 1.8e-05,
"loss": 1.8139,
"mean_token_accuracy": 0.613596773147583,
"num_tokens": 17492.0,
"step": 10
},
{
"entropy": 1.6117817282676696,
"epoch": 0.027803521779425393,
"grad_norm": 0.5831968784332275,
"learning_rate": 1.9849056603773588e-05,
"loss": 1.9287,
"mean_token_accuracy": 0.6028814196586609,
"num_tokens": 23104.0,
"step": 15
},
{
"entropy": 1.5564811944961547,
"epoch": 0.03707136237256719,
"grad_norm": 0.6044633388519287,
"learning_rate": 1.9660377358490567e-05,
"loss": 1.8979,
"mean_token_accuracy": 0.6022705733776093,
"num_tokens": 28237.0,
"step": 20
},
{
"entropy": 1.5479681015014648,
"epoch": 0.04633920296570899,
"grad_norm": 0.5943928956985474,
"learning_rate": 1.947169811320755e-05,
"loss": 1.8733,
"mean_token_accuracy": 0.6153187394142151,
"num_tokens": 33029.0,
"step": 25
},
{
"entropy": 1.646817970275879,
"epoch": 0.05560704355885079,
"grad_norm": 0.6993905901908875,
"learning_rate": 1.928301886792453e-05,
"loss": 1.8784,
"mean_token_accuracy": 0.6003017485141754,
"num_tokens": 37369.0,
"step": 30
},
{
"entropy": 1.650176739692688,
"epoch": 0.06487488415199258,
"grad_norm": 0.786679744720459,
"learning_rate": 1.909433962264151e-05,
"loss": 1.8796,
"mean_token_accuracy": 0.6147689402103425,
"num_tokens": 41484.0,
"step": 35
},
{
"entropy": 1.590287184715271,
"epoch": 0.07414272474513438,
"grad_norm": 0.770859956741333,
"learning_rate": 1.8905660377358492e-05,
"loss": 1.817,
"mean_token_accuracy": 0.6139472305774689,
"num_tokens": 45350.0,
"step": 40
},
{
"entropy": 1.8836397886276246,
"epoch": 0.08341056533827618,
"grad_norm": 1.3446482419967651,
"learning_rate": 1.8716981132075474e-05,
"loss": 2.2667,
"mean_token_accuracy": 0.5831025898456573,
"num_tokens": 47900.0,
"step": 45
},
{
"entropy": 1.914563238620758,
"epoch": 0.09267840593141798,
"grad_norm": 2.0144922733306885,
"learning_rate": 1.8528301886792453e-05,
"loss": 2.3988,
"mean_token_accuracy": 0.5866762459278106,
"num_tokens": 49421.0,
"step": 50
},
{
"entropy": 1.753426432609558,
"epoch": 0.10194624652455977,
"grad_norm": 0.36396369338035583,
"learning_rate": 1.8339622641509435e-05,
"loss": 1.6739,
"mean_token_accuracy": 0.634347426891327,
"num_tokens": 59290.0,
"step": 55
},
{
"entropy": 1.7417057394981383,
"epoch": 0.11121408711770157,
"grad_norm": 0.557318389415741,
"learning_rate": 1.8150943396226417e-05,
"loss": 1.643,
"mean_token_accuracy": 0.6404906511306763,
"num_tokens": 66072.0,
"step": 60
},
{
"entropy": 1.6922197580337524,
"epoch": 0.12048192771084337,
"grad_norm": 0.5784375667572021,
"learning_rate": 1.79622641509434e-05,
"loss": 1.546,
"mean_token_accuracy": 0.6431969463825226,
"num_tokens": 71590.0,
"step": 65
},
{
"entropy": 1.7081309676170349,
"epoch": 0.12974976830398516,
"grad_norm": 0.6285673379898071,
"learning_rate": 1.777358490566038e-05,
"loss": 1.5738,
"mean_token_accuracy": 0.6408641755580902,
"num_tokens": 76620.0,
"step": 70
},
{
"entropy": 1.706821823120117,
"epoch": 0.13901760889712697,
"grad_norm": 0.6793721914291382,
"learning_rate": 1.758490566037736e-05,
"loss": 1.5597,
"mean_token_accuracy": 0.6496909260749817,
"num_tokens": 81367.0,
"step": 75
},
{
"entropy": 1.729298961162567,
"epoch": 0.14828544949026876,
"grad_norm": 0.7073204517364502,
"learning_rate": 1.7396226415094343e-05,
"loss": 1.5936,
"mean_token_accuracy": 0.646106606721878,
"num_tokens": 85829.0,
"step": 80
},
{
"entropy": 1.764689528942108,
"epoch": 0.15755329008341057,
"grad_norm": 0.8073896765708923,
"learning_rate": 1.720754716981132e-05,
"loss": 1.5737,
"mean_token_accuracy": 0.6494780361652375,
"num_tokens": 89983.0,
"step": 85
},
{
"entropy": 1.6753356814384461,
"epoch": 0.16682113067655235,
"grad_norm": 0.7738911509513855,
"learning_rate": 1.7018867924528304e-05,
"loss": 1.4815,
"mean_token_accuracy": 0.6729483246803284,
"num_tokens": 93855.0,
"step": 90
},
{
"entropy": 1.9390615344047546,
"epoch": 0.17608897126969417,
"grad_norm": 1.312859058380127,
"learning_rate": 1.6830188679245286e-05,
"loss": 1.8817,
"mean_token_accuracy": 0.6210294425487518,
"num_tokens": 96341.0,
"step": 95
},
{
"entropy": 1.9929674506187438,
"epoch": 0.18535681186283595,
"grad_norm": 1.6688119173049927,
"learning_rate": 1.6641509433962265e-05,
"loss": 1.9392,
"mean_token_accuracy": 0.6368482947349549,
"num_tokens": 97810.0,
"step": 100
},
{
"entropy": 1.712046504020691,
"epoch": 0.19462465245597776,
"grad_norm": 0.4342072010040283,
"learning_rate": 1.6452830188679247e-05,
"loss": 1.5909,
"mean_token_accuracy": 0.6414119660854339,
"num_tokens": 107589.0,
"step": 105
},
{
"entropy": 1.7470085620880127,
"epoch": 0.20389249304911955,
"grad_norm": 0.5741161704063416,
"learning_rate": 1.626415094339623e-05,
"loss": 1.5711,
"mean_token_accuracy": 0.6456838071346283,
"num_tokens": 114565.0,
"step": 110
},
{
"entropy": 1.5347474694252015,
"epoch": 0.21316033364226136,
"grad_norm": 0.6536217927932739,
"learning_rate": 1.607547169811321e-05,
"loss": 1.3867,
"mean_token_accuracy": 0.6677907109260559,
"num_tokens": 120099.0,
"step": 115
},
{
"entropy": 1.5956430912017823,
"epoch": 0.22242817423540315,
"grad_norm": 0.6675344705581665,
"learning_rate": 1.588679245283019e-05,
"loss": 1.4253,
"mean_token_accuracy": 0.6632845401763916,
"num_tokens": 125112.0,
"step": 120
},
{
"entropy": 1.5792277336120606,
"epoch": 0.23169601482854496,
"grad_norm": 0.7069701552391052,
"learning_rate": 1.5698113207547172e-05,
"loss": 1.3823,
"mean_token_accuracy": 0.6681795418262482,
"num_tokens": 129723.0,
"step": 125
},
{
"entropy": 1.5756880044937134,
"epoch": 0.24096385542168675,
"grad_norm": 0.7337709069252014,
"learning_rate": 1.5509433962264154e-05,
"loss": 1.3491,
"mean_token_accuracy": 0.6784022688865662,
"num_tokens": 134105.0,
"step": 130
},
{
"entropy": 1.6404427766799927,
"epoch": 0.25023169601482853,
"grad_norm": 0.7882033586502075,
"learning_rate": 1.5320754716981133e-05,
"loss": 1.4448,
"mean_token_accuracy": 0.6615463674068451,
"num_tokens": 138249.0,
"step": 135
},
{
"entropy": 1.5251396298408508,
"epoch": 0.2594995366079703,
"grad_norm": 0.9358009099960327,
"learning_rate": 1.5132075471698115e-05,
"loss": 1.3161,
"mean_token_accuracy": 0.6832186877727509,
"num_tokens": 141935.0,
"step": 140
},
{
"entropy": 1.6210530638694762,
"epoch": 0.26876737720111216,
"grad_norm": 1.2580927610397339,
"learning_rate": 1.4943396226415094e-05,
"loss": 1.5166,
"mean_token_accuracy": 0.6789897084236145,
"num_tokens": 144364.0,
"step": 145
},
{
"entropy": 1.5357141017913818,
"epoch": 0.27803521779425394,
"grad_norm": 1.4969000816345215,
"learning_rate": 1.4754716981132076e-05,
"loss": 1.5769,
"mean_token_accuracy": 0.7000365734100342,
"num_tokens": 145928.0,
"step": 150
},
{
"entropy": 1.4059158325195313,
"epoch": 0.2873030583873957,
"grad_norm": 0.5834245681762695,
"learning_rate": 1.4566037735849057e-05,
"loss": 1.3117,
"mean_token_accuracy": 0.6917888641357421,
"num_tokens": 156168.0,
"step": 155
},
{
"entropy": 1.4503844499588012,
"epoch": 0.2965708989805375,
"grad_norm": 0.5249871611595154,
"learning_rate": 1.4377358490566037e-05,
"loss": 1.3086,
"mean_token_accuracy": 0.692787104845047,
"num_tokens": 165610.0,
"step": 160
},
{
"entropy": 1.5506718158721924,
"epoch": 0.30583873957367935,
"grad_norm": 0.6383576393127441,
"learning_rate": 1.418867924528302e-05,
"loss": 1.3519,
"mean_token_accuracy": 0.6676154375076294,
"num_tokens": 171787.0,
"step": 165
},
{
"entropy": 1.5482367396354675,
"epoch": 0.31510658016682114,
"grad_norm": 0.8127756714820862,
"learning_rate": 1.4e-05,
"loss": 1.3004,
"mean_token_accuracy": 0.6841661810874939,
"num_tokens": 177223.0,
"step": 170
},
{
"entropy": 1.5198933720588683,
"epoch": 0.3243744207599629,
"grad_norm": 0.7491681575775146,
"learning_rate": 1.3811320754716982e-05,
"loss": 1.3046,
"mean_token_accuracy": 0.684827846288681,
"num_tokens": 182283.0,
"step": 175
},
{
"entropy": 1.4806129813194275,
"epoch": 0.3336422613531047,
"grad_norm": 0.7199849486351013,
"learning_rate": 1.3622641509433962e-05,
"loss": 1.3289,
"mean_token_accuracy": 0.6813067197799683,
"num_tokens": 187001.0,
"step": 180
},
{
"entropy": 1.4494765639305114,
"epoch": 0.34291010194624655,
"grad_norm": 0.842717707157135,
"learning_rate": 1.3433962264150943e-05,
"loss": 1.2323,
"mean_token_accuracy": 0.7035281479358673,
"num_tokens": 191394.0,
"step": 185
},
{
"entropy": 1.4810110807418824,
"epoch": 0.35217794253938833,
"grad_norm": 0.9301387667655945,
"learning_rate": 1.3245283018867925e-05,
"loss": 1.2773,
"mean_token_accuracy": 0.6910167336463928,
"num_tokens": 195399.0,
"step": 190
},
{
"entropy": 1.5017635345458984,
"epoch": 0.3614457831325301,
"grad_norm": 0.923757791519165,
"learning_rate": 1.3056603773584906e-05,
"loss": 1.3787,
"mean_token_accuracy": 0.6846955835819244,
"num_tokens": 198934.0,
"step": 195
},
{
"entropy": 1.6470162987709045,
"epoch": 0.3707136237256719,
"grad_norm": 1.4323452711105347,
"learning_rate": 1.2867924528301888e-05,
"loss": 1.6467,
"mean_token_accuracy": 0.6908825635910034,
"num_tokens": 200588.0,
"step": 200
},
{
"entropy": 1.6742107629776002,
"epoch": 0.3799814643188137,
"grad_norm": 0.6032175421714783,
"learning_rate": 1.2679245283018868e-05,
"loss": 1.5514,
"mean_token_accuracy": 0.647225683927536,
"num_tokens": 210039.0,
"step": 205
},
{
"entropy": 1.381903338432312,
"epoch": 0.38924930491195553,
"grad_norm": 0.6865923404693604,
"learning_rate": 1.2490566037735849e-05,
"loss": 1.1837,
"mean_token_accuracy": 0.7007428467273712,
"num_tokens": 216062.0,
"step": 210
},
{
"entropy": 1.5497890710830688,
"epoch": 0.3985171455050973,
"grad_norm": 0.7516324520111084,
"learning_rate": 1.2301886792452831e-05,
"loss": 1.3396,
"mean_token_accuracy": 0.679390799999237,
"num_tokens": 221238.0,
"step": 215
},
{
"entropy": 1.629994511604309,
"epoch": 0.4077849860982391,
"grad_norm": 0.8432377576828003,
"learning_rate": 1.2113207547169811e-05,
"loss": 1.4085,
"mean_token_accuracy": 0.6734645128250122,
"num_tokens": 226114.0,
"step": 220
},
{
"entropy": 1.4988724350929261,
"epoch": 0.4170528266913809,
"grad_norm": 0.7744415998458862,
"learning_rate": 1.1924528301886794e-05,
"loss": 1.2606,
"mean_token_accuracy": 0.690128743648529,
"num_tokens": 230722.0,
"step": 225
},
{
"entropy": 1.6341279029846192,
"epoch": 0.4263206672845227,
"grad_norm": 0.9293099641799927,
"learning_rate": 1.1735849056603774e-05,
"loss": 1.3895,
"mean_token_accuracy": 0.6756053507328034,
"num_tokens": 235030.0,
"step": 230
},
{
"entropy": 1.4942476391792296,
"epoch": 0.4355885078776645,
"grad_norm": 1.097900629043579,
"learning_rate": 1.1547169811320756e-05,
"loss": 1.2779,
"mean_token_accuracy": 0.6963155150413514,
"num_tokens": 239053.0,
"step": 235
},
{
"entropy": 1.4344226121902466,
"epoch": 0.4448563484708063,
"grad_norm": 0.9284445643424988,
"learning_rate": 1.1358490566037737e-05,
"loss": 1.2624,
"mean_token_accuracy": 0.6946760237216949,
"num_tokens": 242797.0,
"step": 240
},
{
"entropy": 1.6279133677482605,
"epoch": 0.4541241890639481,
"grad_norm": 1.8525234460830688,
"learning_rate": 1.1169811320754717e-05,
"loss": 1.5141,
"mean_token_accuracy": 0.6641450226306915,
"num_tokens": 245262.0,
"step": 245
},
{
"entropy": 1.5635493040084838,
"epoch": 0.4633920296570899,
"grad_norm": 1.4725935459136963,
"learning_rate": 1.09811320754717e-05,
"loss": 1.5592,
"mean_token_accuracy": 0.6816279590129852,
"num_tokens": 246825.0,
"step": 250
},
{
"entropy": 1.5720198631286622,
"epoch": 0.4726598702502317,
"grad_norm": 0.9245171546936035,
"learning_rate": 1.079245283018868e-05,
"loss": 1.4606,
"mean_token_accuracy": 0.6601259410381317,
"num_tokens": 256931.0,
"step": 255
},
{
"entropy": 1.5677057027816772,
"epoch": 0.4819277108433735,
"grad_norm": 0.8322890400886536,
"learning_rate": 1.0603773584905662e-05,
"loss": 1.4144,
"mean_token_accuracy": 0.6694712340831757,
"num_tokens": 263360.0,
"step": 260
},
{
"entropy": 1.59023619890213,
"epoch": 0.4911955514365153,
"grad_norm": 0.7772918939590454,
"learning_rate": 1.0415094339622642e-05,
"loss": 1.3723,
"mean_token_accuracy": 0.6685677945613862,
"num_tokens": 269174.0,
"step": 265
},
{
"entropy": 1.5507931351661681,
"epoch": 0.5004633920296571,
"grad_norm": 0.8000075221061707,
"learning_rate": 1.0226415094339623e-05,
"loss": 1.3126,
"mean_token_accuracy": 0.6843527674674987,
"num_tokens": 274595.0,
"step": 270
},
{
"entropy": 1.5491173028945924,
"epoch": 0.5097312326227988,
"grad_norm": 0.8983348608016968,
"learning_rate": 1.0037735849056605e-05,
"loss": 1.369,
"mean_token_accuracy": 0.6823632538318634,
"num_tokens": 279472.0,
"step": 275
},
{
"entropy": 1.411498475074768,
"epoch": 0.5189990732159406,
"grad_norm": 0.891360878944397,
"learning_rate": 9.849056603773586e-06,
"loss": 1.1676,
"mean_token_accuracy": 0.7068820059299469,
"num_tokens": 283955.0,
"step": 280
},
{
"entropy": 1.5002652764320374,
"epoch": 0.5282669138090825,
"grad_norm": 0.9551361799240112,
"learning_rate": 9.660377358490568e-06,
"loss": 1.2826,
"mean_token_accuracy": 0.6885687828063964,
"num_tokens": 288166.0,
"step": 285
},
{
"entropy": 1.4297375559806824,
"epoch": 0.5375347544022243,
"grad_norm": 1.0993260145187378,
"learning_rate": 9.471698113207548e-06,
"loss": 1.2081,
"mean_token_accuracy": 0.7039083421230317,
"num_tokens": 292143.0,
"step": 290
},
{
"entropy": 1.5504101514816284,
"epoch": 0.5468025949953661,
"grad_norm": 1.0684620141983032,
"learning_rate": 9.283018867924529e-06,
"loss": 1.3632,
"mean_token_accuracy": 0.6732664227485656,
"num_tokens": 295795.0,
"step": 295
},
{
"entropy": 1.4705226182937623,
"epoch": 0.5560704355885079,
"grad_norm": 2.1173923015594482,
"learning_rate": 9.09433962264151e-06,
"loss": 1.4014,
"mean_token_accuracy": 0.6959770500659943,
"num_tokens": 297656.0,
"step": 300
},
{
"entropy": 1.4929959535598756,
"epoch": 0.5653382761816497,
"grad_norm": 1.2770863771438599,
"learning_rate": 8.905660377358491e-06,
"loss": 1.414,
"mean_token_accuracy": 0.6737047851085662,
"num_tokens": 307896.0,
"step": 305
},
{
"entropy": 1.4795246124267578,
"epoch": 0.5746061167747915,
"grad_norm": 0.8402530550956726,
"learning_rate": 8.716981132075473e-06,
"loss": 1.3673,
"mean_token_accuracy": 0.6673755586147309,
"num_tokens": 314996.0,
"step": 310
},
{
"entropy": 1.5327817797660828,
"epoch": 0.5838739573679332,
"grad_norm": 0.9122950434684753,
"learning_rate": 8.528301886792454e-06,
"loss": 1.3569,
"mean_token_accuracy": 0.6737642705440521,
"num_tokens": 320535.0,
"step": 315
},
{
"entropy": 1.4931538462638856,
"epoch": 0.593141797961075,
"grad_norm": 0.8541343808174133,
"learning_rate": 8.339622641509434e-06,
"loss": 1.2924,
"mean_token_accuracy": 0.6892772674560547,
"num_tokens": 325543.0,
"step": 320
},
{
"entropy": 1.4055845737457275,
"epoch": 0.6024096385542169,
"grad_norm": 1.0335566997528076,
"learning_rate": 8.150943396226417e-06,
"loss": 1.2071,
"mean_token_accuracy": 0.7009121060371399,
"num_tokens": 330160.0,
"step": 325
},
{
"entropy": 1.3517128109931946,
"epoch": 0.6116774791473587,
"grad_norm": 0.9749907851219177,
"learning_rate": 7.962264150943397e-06,
"loss": 1.1001,
"mean_token_accuracy": 0.7244792997837066,
"num_tokens": 334517.0,
"step": 330
},
{
"entropy": 1.511979877948761,
"epoch": 0.6209453197405005,
"grad_norm": 1.2967917919158936,
"learning_rate": 7.77358490566038e-06,
"loss": 1.3045,
"mean_token_accuracy": 0.6831347227096558,
"num_tokens": 338603.0,
"step": 335
},
{
"entropy": 1.4577907562255858,
"epoch": 0.6302131603336423,
"grad_norm": 0.9886659383773804,
"learning_rate": 7.58490566037736e-06,
"loss": 1.2057,
"mean_token_accuracy": 0.7024867594242096,
"num_tokens": 342434.0,
"step": 340
},
{
"entropy": 1.368123424053192,
"epoch": 0.6394810009267841,
"grad_norm": 1.8563096523284912,
"learning_rate": 7.396226415094339e-06,
"loss": 1.2631,
"mean_token_accuracy": 0.7030794739723205,
"num_tokens": 345119.0,
"step": 345
},
{
"entropy": 1.550855565071106,
"epoch": 0.6487488415199258,
"grad_norm": 1.7384581565856934,
"learning_rate": 7.207547169811321e-06,
"loss": 1.5237,
"mean_token_accuracy": 0.6707696557044983,
"num_tokens": 346702.0,
"step": 350
},
{
"entropy": 1.4958812713623046,
"epoch": 0.6580166821130676,
"grad_norm": 0.7086682915687561,
"learning_rate": 7.018867924528302e-06,
"loss": 1.4053,
"mean_token_accuracy": 0.6752688169479371,
"num_tokens": 356942.0,
"step": 355
},
{
"entropy": 1.558197546005249,
"epoch": 0.6672845227062094,
"grad_norm": 0.7285569310188293,
"learning_rate": 6.830188679245283e-06,
"loss": 1.4605,
"mean_token_accuracy": 0.6629868388175965,
"num_tokens": 366408.0,
"step": 360
},
{
"entropy": 1.6734450340270997,
"epoch": 0.6765523632993512,
"grad_norm": 0.8609201908111572,
"learning_rate": 6.641509433962265e-06,
"loss": 1.5822,
"mean_token_accuracy": 0.6437154173851013,
"num_tokens": 373516.0,
"step": 365
},
{
"entropy": 1.5121603965759278,
"epoch": 0.6858202038924931,
"grad_norm": 0.8602127432823181,
"learning_rate": 6.452830188679245e-06,
"loss": 1.3354,
"mean_token_accuracy": 0.6697323322296143,
"num_tokens": 379078.0,
"step": 370
},
{
"entropy": 1.5671154141426087,
"epoch": 0.6950880444856349,
"grad_norm": 1.0804234743118286,
"learning_rate": 6.2641509433962265e-06,
"loss": 1.3706,
"mean_token_accuracy": 0.6781690716743469,
"num_tokens": 384082.0,
"step": 375
},
{
"entropy": 1.4360106825828551,
"epoch": 0.7043558850787767,
"grad_norm": 1.0533781051635742,
"learning_rate": 6.075471698113208e-06,
"loss": 1.2339,
"mean_token_accuracy": 0.6931209981441497,
"num_tokens": 388701.0,
"step": 380
},
{
"entropy": 1.3446730136871339,
"epoch": 0.7136237256719185,
"grad_norm": 1.0057952404022217,
"learning_rate": 5.886792452830189e-06,
"loss": 1.1451,
"mean_token_accuracy": 0.7143323004245759,
"num_tokens": 393032.0,
"step": 385
},
{
"entropy": 1.3606623888015748,
"epoch": 0.7228915662650602,
"grad_norm": 0.9760032296180725,
"learning_rate": 5.6981132075471704e-06,
"loss": 1.1359,
"mean_token_accuracy": 0.7167974233627319,
"num_tokens": 397054.0,
"step": 390
},
{
"entropy": 1.3662820339202881,
"epoch": 0.732159406858202,
"grad_norm": 1.250963807106018,
"learning_rate": 5.509433962264151e-06,
"loss": 1.1525,
"mean_token_accuracy": 0.7196339964866638,
"num_tokens": 400475.0,
"step": 395
},
{
"entropy": 1.5162119686603546,
"epoch": 0.7414272474513438,
"grad_norm": 2.1900084018707275,
"learning_rate": 5.320754716981132e-06,
"loss": 1.4817,
"mean_token_accuracy": 0.6947880864143372,
"num_tokens": 402190.0,
"step": 400
},
{
"entropy": 1.4912107944488526,
"epoch": 0.7506950880444856,
"grad_norm": 0.6602088809013367,
"learning_rate": 5.1320754716981136e-06,
"loss": 1.3781,
"mean_token_accuracy": 0.6775171160697937,
"num_tokens": 412430.0,
"step": 405
},
{
"entropy": 1.677505886554718,
"epoch": 0.7599629286376274,
"grad_norm": 0.8567835688591003,
"learning_rate": 4.943396226415095e-06,
"loss": 1.5629,
"mean_token_accuracy": 0.638292646408081,
"num_tokens": 419893.0,
"step": 410
},
{
"entropy": 1.4946231245994568,
"epoch": 0.7692307692307693,
"grad_norm": 0.9445057511329651,
"learning_rate": 4.754716981132076e-06,
"loss": 1.3326,
"mean_token_accuracy": 0.6779886364936829,
"num_tokens": 425619.0,
"step": 415
},
{
"entropy": 1.3864770650863647,
"epoch": 0.7784986098239111,
"grad_norm": 0.8252947926521301,
"learning_rate": 4.566037735849057e-06,
"loss": 1.2169,
"mean_token_accuracy": 0.7035700976848602,
"num_tokens": 430797.0,
"step": 420
},
{
"entropy": 1.5413469910621642,
"epoch": 0.7877664504170528,
"grad_norm": 1.1849970817565918,
"learning_rate": 4.377358490566038e-06,
"loss": 1.3547,
"mean_token_accuracy": 0.6810157537460327,
"num_tokens": 435596.0,
"step": 425
},
{
"entropy": 1.3699937224388123,
"epoch": 0.7970342910101946,
"grad_norm": 0.877142608165741,
"learning_rate": 4.188679245283019e-06,
"loss": 1.1797,
"mean_token_accuracy": 0.7002040147781372,
"num_tokens": 440241.0,
"step": 430
},
{
"entropy": 1.391974401473999,
"epoch": 0.8063021316033364,
"grad_norm": 1.0411101579666138,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1623,
"mean_token_accuracy": 0.7032719731330872,
"num_tokens": 444603.0,
"step": 435
},
{
"entropy": 1.5593681573867797,
"epoch": 0.8155699721964782,
"grad_norm": 1.07487154006958,
"learning_rate": 3.8113207547169816e-06,
"loss": 1.3711,
"mean_token_accuracy": 0.6782773613929749,
"num_tokens": 448697.0,
"step": 440
},
{
"entropy": 1.3416823267936706,
"epoch": 0.82483781278962,
"grad_norm": 1.0888190269470215,
"learning_rate": 3.6226415094339625e-06,
"loss": 1.1324,
"mean_token_accuracy": 0.7193882942199707,
"num_tokens": 452317.0,
"step": 445
},
{
"entropy": 1.5021097183227539,
"epoch": 0.8341056533827618,
"grad_norm": 2.1909356117248535,
"learning_rate": 3.4339622641509434e-06,
"loss": 1.4446,
"mean_token_accuracy": 0.6848468244075775,
"num_tokens": 453964.0,
"step": 450
},
{
"entropy": 1.5324198842048644,
"epoch": 0.8433734939759037,
"grad_norm": 0.7571365833282471,
"learning_rate": 3.2452830188679247e-06,
"loss": 1.4131,
"mean_token_accuracy": 0.6636363625526428,
"num_tokens": 464204.0,
"step": 455
},
{
"entropy": 1.3785831093788148,
"epoch": 0.8526413345690455,
"grad_norm": 0.7701159119606018,
"learning_rate": 3.0566037735849056e-06,
"loss": 1.2441,
"mean_token_accuracy": 0.6936827838420868,
"num_tokens": 473089.0,
"step": 460
},
{
"entropy": 1.4654639124870301,
"epoch": 0.8619091751621872,
"grad_norm": 0.9103213548660278,
"learning_rate": 2.867924528301887e-06,
"loss": 1.3014,
"mean_token_accuracy": 0.685904186964035,
"num_tokens": 479410.0,
"step": 465
},
{
"entropy": 1.533838427066803,
"epoch": 0.871177015755329,
"grad_norm": 0.8901606202125549,
"learning_rate": 2.6792452830188682e-06,
"loss": 1.3928,
"mean_token_accuracy": 0.6758940577507019,
"num_tokens": 485110.0,
"step": 470
},
{
"entropy": 1.3798070430755616,
"epoch": 0.8804448563484708,
"grad_norm": 1.0601820945739746,
"learning_rate": 2.490566037735849e-06,
"loss": 1.1907,
"mean_token_accuracy": 0.7054125189781189,
"num_tokens": 490180.0,
"step": 475
},
{
"entropy": 1.4131479620933534,
"epoch": 0.8897126969416126,
"grad_norm": 0.8970419764518738,
"learning_rate": 2.3018867924528305e-06,
"loss": 1.1863,
"mean_token_accuracy": 0.7067974150180817,
"num_tokens": 494890.0,
"step": 480
},
{
"entropy": 1.4630970120429994,
"epoch": 0.8989805375347544,
"grad_norm": 1.003049373626709,
"learning_rate": 2.1132075471698114e-06,
"loss": 1.2327,
"mean_token_accuracy": 0.6967244625091553,
"num_tokens": 499349.0,
"step": 485
},
{
"entropy": 1.3104987263679504,
"epoch": 0.9082483781278962,
"grad_norm": 1.0828076601028442,
"learning_rate": 1.9245283018867927e-06,
"loss": 1.1085,
"mean_token_accuracy": 0.7211176335811615,
"num_tokens": 503600.0,
"step": 490
},
{
"entropy": 1.4279333114624024,
"epoch": 0.917516218721038,
"grad_norm": 1.2222362756729126,
"learning_rate": 1.7358490566037736e-06,
"loss": 1.2275,
"mean_token_accuracy": 0.703647392988205,
"num_tokens": 507413.0,
"step": 495
},
{
"entropy": 1.4541478991508483,
"epoch": 0.9267840593141798,
"grad_norm": 2.478383779525757,
"learning_rate": 1.5471698113207547e-06,
"loss": 1.433,
"mean_token_accuracy": 0.7055663108825684,
"num_tokens": 508911.0,
"step": 500
},
{
"entropy": 1.504941475391388,
"epoch": 0.9360518999073216,
"grad_norm": 0.7645187973976135,
"learning_rate": 1.358490566037736e-06,
"loss": 1.3723,
"mean_token_accuracy": 0.6745567083358764,
"num_tokens": 518390.0,
"step": 505
},
{
"entropy": 1.574069583415985,
"epoch": 0.9453197405004634,
"grad_norm": 0.9938948750495911,
"learning_rate": 1.1698113207547171e-06,
"loss": 1.4143,
"mean_token_accuracy": 0.6633340060710907,
"num_tokens": 524489.0,
"step": 510
},
{
"entropy": 1.3694233775138855,
"epoch": 0.9545875810936052,
"grad_norm": 0.9658361673355103,
"learning_rate": 9.811320754716983e-07,
"loss": 1.1829,
"mean_token_accuracy": 0.7040388941764831,
"num_tokens": 529543.0,
"step": 515
},
{
"entropy": 1.41559841632843,
"epoch": 0.963855421686747,
"grad_norm": 0.9936702251434326,
"learning_rate": 7.924528301886793e-07,
"loss": 1.2234,
"mean_token_accuracy": 0.7022507786750793,
"num_tokens": 534072.0,
"step": 520
},
{
"entropy": 1.391177773475647,
"epoch": 0.9731232622798888,
"grad_norm": 0.959622323513031,
"learning_rate": 6.037735849056605e-07,
"loss": 1.2196,
"mean_token_accuracy": 0.6922858953475952,
"num_tokens": 538321.0,
"step": 525
},
{
"entropy": 1.3745897650718688,
"epoch": 0.9823911028730306,
"grad_norm": 1.3114018440246582,
"learning_rate": 4.1509433962264154e-07,
"loss": 1.1473,
"mean_token_accuracy": 0.7134360671043396,
"num_tokens": 542222.0,
"step": 530
},
{
"entropy": 1.492677342891693,
"epoch": 0.9916589434661723,
"grad_norm": 1.9476492404937744,
"learning_rate": 2.2641509433962265e-07,
"loss": 1.3013,
"mean_token_accuracy": 0.6977547407150269,
"num_tokens": 545236.0,
"step": 535
},
{
"entropy": 1.4785194396972656,
"epoch": 1.0,
"grad_norm": 3.7451815605163574,
"learning_rate": 3.773584905660378e-08,
"loss": 1.4539,
"mean_token_accuracy": 0.69340937005149,
"num_tokens": 546641.0,
"step": 540
}
],
"logging_steps": 5,
"max_steps": 540,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4637351756455936e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}