{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9911894273127753,
"eval_steps": 500,
"global_step": 452,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022026431718061675,
"grad_norm": 1.0663584486206101,
"learning_rate": 9.237540571428572e-06,
"loss": 3.7016,
"step": 5
},
{
"epoch": 0.04405286343612335,
"grad_norm": 0.8088844636114286,
"learning_rate": 2.0784466285714287e-05,
"loss": 3.5322,
"step": 10
},
{
"epoch": 0.06607929515418502,
"grad_norm": 0.7375732667713775,
"learning_rate": 3.2331392000000005e-05,
"loss": 2.8192,
"step": 15
},
{
"epoch": 0.0881057268722467,
"grad_norm": 0.8949585308681106,
"learning_rate": 4.3878317714285716e-05,
"loss": 2.4495,
"step": 20
},
{
"epoch": 0.11013215859030837,
"grad_norm": 0.40288961916928345,
"learning_rate": 5.542524342857144e-05,
"loss": 1.9809,
"step": 25
},
{
"epoch": 0.13215859030837004,
"grad_norm": 1.5689759086423734,
"learning_rate": 6.697216914285716e-05,
"loss": 1.8521,
"step": 30
},
{
"epoch": 0.15418502202643172,
"grad_norm": 0.2643401651561827,
"learning_rate": 7.851909485714286e-05,
"loss": 1.8447,
"step": 35
},
{
"epoch": 0.1762114537444934,
"grad_norm": 0.3828395123315439,
"learning_rate": 8.0822745353115e-05,
"loss": 1.6381,
"step": 40
},
{
"epoch": 0.19823788546255505,
"grad_norm": 1.0422898029576655,
"learning_rate": 8.079945206908332e-05,
"loss": 1.6084,
"step": 45
},
{
"epoch": 0.22026431718061673,
"grad_norm": 6.593814557940585,
"learning_rate": 8.075825549346742e-05,
"loss": 1.6388,
"step": 50
},
{
"epoch": 0.2422907488986784,
"grad_norm": 0.23819531959892334,
"learning_rate": 8.06991799827424e-05,
"loss": 1.6555,
"step": 55
},
{
"epoch": 0.2643171806167401,
"grad_norm": 0.4770300018453914,
"learning_rate": 8.062226046386969e-05,
"loss": 1.5852,
"step": 60
},
{
"epoch": 0.28634361233480177,
"grad_norm": 0.252743363486302,
"learning_rate": 8.052754241364732e-05,
"loss": 1.5304,
"step": 65
},
{
"epoch": 0.30837004405286345,
"grad_norm": 0.32218530888051744,
"learning_rate": 8.041508183182285e-05,
"loss": 1.4884,
"step": 70
},
{
"epoch": 0.3303964757709251,
"grad_norm": 5.427482328857429,
"learning_rate": 8.028494520798486e-05,
"loss": 1.6083,
"step": 75
},
{
"epoch": 0.3524229074889868,
"grad_norm": 2.1000249470749024,
"learning_rate": 8.013720948225267e-05,
"loss": 1.481,
"step": 80
},
{
"epoch": 0.3744493392070485,
"grad_norm": 0.5069089744093597,
"learning_rate": 7.997196199978724e-05,
"loss": 1.5425,
"step": 85
},
{
"epoch": 0.3964757709251101,
"grad_norm": 0.28182829475876264,
"learning_rate": 7.978930045915059e-05,
"loss": 1.4742,
"step": 90
},
{
"epoch": 0.4185022026431718,
"grad_norm": 0.25226708061115644,
"learning_rate": 7.958933285454381e-05,
"loss": 1.5259,
"step": 95
},
{
"epoch": 0.44052863436123346,
"grad_norm": 0.2940763633451413,
"learning_rate": 7.937217741195818e-05,
"loss": 1.4476,
"step": 100
},
{
"epoch": 0.46255506607929514,
"grad_norm": 0.2998708778393728,
"learning_rate": 7.913796251927683e-05,
"loss": 1.4677,
"step": 105
},
{
"epoch": 0.4845814977973568,
"grad_norm": 0.30999571048347585,
"learning_rate": 7.888682665036862e-05,
"loss": 1.4895,
"step": 110
},
{
"epoch": 0.5066079295154186,
"grad_norm": 0.3385710708643517,
"learning_rate": 7.861891828321876e-05,
"loss": 1.4467,
"step": 115
},
{
"epoch": 0.5286343612334802,
"grad_norm": 0.2405218018246251,
"learning_rate": 7.833439581214485e-05,
"loss": 1.463,
"step": 120
},
{
"epoch": 0.5506607929515418,
"grad_norm": 0.3120582655284802,
"learning_rate": 7.803342745415004e-05,
"loss": 1.4433,
"step": 125
},
{
"epoch": 0.5726872246696035,
"grad_norm": 0.24571731344972014,
"learning_rate": 7.771619114946885e-05,
"loss": 1.537,
"step": 130
},
{
"epoch": 0.5947136563876652,
"grad_norm": 0.24744483588882188,
"learning_rate": 7.738287445636435e-05,
"loss": 1.4349,
"step": 135
},
{
"epoch": 0.6167400881057269,
"grad_norm": 0.2340432324202039,
"learning_rate": 7.703367444023876e-05,
"loss": 1.3148,
"step": 140
},
{
"epoch": 0.6387665198237885,
"grad_norm": 0.33180577529696736,
"learning_rate": 7.666879755712349e-05,
"loss": 1.3527,
"step": 145
},
{
"epoch": 0.6607929515418502,
"grad_norm": 0.29748503637539375,
"learning_rate": 7.628845953161691e-05,
"loss": 1.3921,
"step": 150
},
{
"epoch": 0.6828193832599119,
"grad_norm": 0.2568531998827171,
"learning_rate": 7.589288522934248e-05,
"loss": 1.3191,
"step": 155
},
{
"epoch": 0.7048458149779736,
"grad_norm": 0.2420347997963869,
"learning_rate": 7.548230852400232e-05,
"loss": 1.3777,
"step": 160
},
{
"epoch": 0.7268722466960352,
"grad_norm": 0.22010679080838433,
"learning_rate": 7.505697215910518e-05,
"loss": 1.3653,
"step": 165
},
{
"epoch": 0.748898678414097,
"grad_norm": 0.3729582538192735,
"learning_rate": 7.461712760445017e-05,
"loss": 1.3821,
"step": 170
},
{
"epoch": 0.7709251101321586,
"grad_norm": 0.24471717017646333,
"learning_rate": 7.416303490745132e-05,
"loss": 1.3272,
"step": 175
},
{
"epoch": 0.7929515418502202,
"grad_norm": 0.23718932476563045,
"learning_rate": 7.369496253939093e-05,
"loss": 1.4063,
"step": 180
},
{
"epoch": 0.8149779735682819,
"grad_norm": 0.273316183010925,
"learning_rate": 7.321318723669236e-05,
"loss": 1.3922,
"step": 185
},
{
"epoch": 0.8370044052863436,
"grad_norm": 0.2723324432950777,
"learning_rate": 7.271799383730644e-05,
"loss": 1.5102,
"step": 190
},
{
"epoch": 0.8590308370044053,
"grad_norm": 0.2486191758110478,
"learning_rate": 7.220967511230787e-05,
"loss": 1.4539,
"step": 195
},
{
"epoch": 0.8810572687224669,
"grad_norm": 0.20359743027793406,
"learning_rate": 7.168853159280142e-05,
"loss": 1.3669,
"step": 200
},
{
"epoch": 0.9030837004405287,
"grad_norm": 0.3452908425906942,
"learning_rate": 7.115487139224027e-05,
"loss": 1.4474,
"step": 205
},
{
"epoch": 0.9251101321585903,
"grad_norm": 0.24206927604187342,
"learning_rate": 7.060901002426141e-05,
"loss": 1.405,
"step": 210
},
{
"epoch": 0.947136563876652,
"grad_norm": 0.2736785474465845,
"learning_rate": 7.00512702161458e-05,
"loss": 1.3685,
"step": 215
},
{
"epoch": 0.9691629955947136,
"grad_norm": 0.23978655563774517,
"learning_rate": 6.948198171801373e-05,
"loss": 1.4067,
"step": 220
},
{
"epoch": 0.9911894273127754,
"grad_norm": 0.22393386935110163,
"learning_rate": 6.890148110786813e-05,
"loss": 1.3082,
"step": 225
},
{
"epoch": 0.9955947136563876,
"eval_loss": 1.50490403175354,
"eval_runtime": 42.9483,
"eval_samples_per_second": 1.537,
"eval_steps_per_second": 0.21,
"step": 226
},
{
"epoch": 1.013215859030837,
"grad_norm": 0.29391885004093155,
"learning_rate": 6.831011159260084e-05,
"loss": 1.2652,
"step": 230
},
{
"epoch": 1.0352422907488987,
"grad_norm": 0.2262762255488471,
"learning_rate": 6.770822280508006e-05,
"loss": 1.1647,
"step": 235
},
{
"epoch": 1.0572687224669604,
"grad_norm": 0.25236435689633363,
"learning_rate": 6.70961705974383e-05,
"loss": 1.256,
"step": 240
},
{
"epoch": 1.079295154185022,
"grad_norm": 0.30150815984057305,
"learning_rate": 6.647431683068342e-05,
"loss": 1.2419,
"step": 245
},
{
"epoch": 1.1013215859030836,
"grad_norm": 0.2461529615622769,
"learning_rate": 6.58430291607572e-05,
"loss": 1.2819,
"step": 250
},
{
"epoch": 1.1233480176211454,
"grad_norm": 0.2617560590756918,
"learning_rate": 6.520268082116754e-05,
"loss": 1.1769,
"step": 255
},
{
"epoch": 1.145374449339207,
"grad_norm": 0.2675888813814032,
"learning_rate": 6.455365040232338e-05,
"loss": 1.3418,
"step": 260
},
{
"epoch": 1.1674008810572687,
"grad_norm": 0.4478862449004952,
"learning_rate": 6.389632162770219e-05,
"loss": 1.2083,
"step": 265
},
{
"epoch": 1.1894273127753303,
"grad_norm": 0.2820003778060524,
"learning_rate": 6.32310831269828e-05,
"loss": 1.2821,
"step": 270
},
{
"epoch": 1.2114537444933922,
"grad_norm": 0.25852039054845183,
"learning_rate": 6.255832820627763e-05,
"loss": 1.1638,
"step": 275
},
{
"epoch": 1.2334801762114538,
"grad_norm": 0.25522724381792494,
"learning_rate": 6.187845461559982e-05,
"loss": 1.264,
"step": 280
},
{
"epoch": 1.2555066079295154,
"grad_norm": 0.2597245715157646,
"learning_rate": 6.119186431370339e-05,
"loss": 1.2434,
"step": 285
},
{
"epoch": 1.277533039647577,
"grad_norm": 0.2582862000714317,
"learning_rate": 6.049896323043476e-05,
"loss": 1.2385,
"step": 290
},
{
"epoch": 1.2995594713656389,
"grad_norm": 0.2828799977378617,
"learning_rate": 5.9800161026736606e-05,
"loss": 1.1716,
"step": 295
},
{
"epoch": 1.3215859030837005,
"grad_norm": 0.27495857073641394,
"learning_rate": 5.9095870852445795e-05,
"loss": 1.2569,
"step": 300
},
{
"epoch": 1.3436123348017621,
"grad_norm": 0.26885812885440236,
"learning_rate": 5.83865091020286e-05,
"loss": 1.1797,
"step": 305
},
{
"epoch": 1.3656387665198237,
"grad_norm": 0.26990836029271487,
"learning_rate": 5.767249516839748e-05,
"loss": 1.2667,
"step": 310
},
{
"epoch": 1.3876651982378854,
"grad_norm": 0.24942586412100395,
"learning_rate": 5.6954251194955336e-05,
"loss": 1.2285,
"step": 315
},
{
"epoch": 1.4096916299559472,
"grad_norm": 0.2837832327793825,
"learning_rate": 5.6232201826013346e-05,
"loss": 1.2649,
"step": 320
},
{
"epoch": 1.4317180616740088,
"grad_norm": 0.27338241561639043,
"learning_rate": 5.550677395573045e-05,
"loss": 1.247,
"step": 325
},
{
"epoch": 1.4537444933920705,
"grad_norm": 0.26757127591761626,
"learning_rate": 5.477839647572243e-05,
"loss": 1.2455,
"step": 330
},
{
"epoch": 1.475770925110132,
"grad_norm": 0.25393138651995495,
"learning_rate": 5.404750002149023e-05,
"loss": 1.2512,
"step": 335
},
{
"epoch": 1.497797356828194,
"grad_norm": 0.2657447805705433,
"learning_rate": 5.3314516717817126e-05,
"loss": 1.2397,
"step": 340
},
{
"epoch": 1.5198237885462555,
"grad_norm": 0.25474789006371235,
"learning_rate": 5.257987992328549e-05,
"loss": 1.1496,
"step": 345
},
{
"epoch": 1.5418502202643172,
"grad_norm": 0.2751533270870879,
"learning_rate": 5.1844023974064006e-05,
"loss": 1.2603,
"step": 350
},
{
"epoch": 1.5638766519823788,
"grad_norm": 0.26365407252111467,
"learning_rate": 5.110738392711697e-05,
"loss": 1.1364,
"step": 355
},
{
"epoch": 1.5859030837004404,
"grad_norm": 0.2525541732623165,
"learning_rate": 5.037039530298738e-05,
"loss": 1.1592,
"step": 360
},
{
"epoch": 1.607929515418502,
"grad_norm": 0.29719446796341475,
"learning_rate": 4.9633493828306105e-05,
"loss": 1.2675,
"step": 365
},
{
"epoch": 1.6299559471365639,
"grad_norm": 0.2744533935942541,
"learning_rate": 4.889711517817897e-05,
"loss": 1.2058,
"step": 370
},
{
"epoch": 1.6519823788546255,
"grad_norm": 0.2553055879619236,
"learning_rate": 4.8161694718604484e-05,
"loss": 1.2671,
"step": 375
},
{
"epoch": 1.6740088105726874,
"grad_norm": 0.2641167238472348,
"learning_rate": 4.742766724907424e-05,
"loss": 1.2367,
"step": 380
},
{
"epoch": 1.696035242290749,
"grad_norm": 0.29534926894855623,
"learning_rate": 4.6695466745508345e-05,
"loss": 1.2394,
"step": 385
},
{
"epoch": 1.7180616740088106,
"grad_norm": 0.27781303700098053,
"learning_rate": 4.5965526103677575e-05,
"loss": 1.2047,
"step": 390
},
{
"epoch": 1.7400881057268722,
"grad_norm": 0.25175443663281,
"learning_rate": 4.523827688326434e-05,
"loss": 1.2718,
"step": 395
},
{
"epoch": 1.7621145374449338,
"grad_norm": 0.2660850278265443,
"learning_rate": 4.451414905271349e-05,
"loss": 1.2061,
"step": 400
},
{
"epoch": 1.7841409691629955,
"grad_norm": 0.2680968372741624,
"learning_rate": 4.379357073502389e-05,
"loss": 1.2053,
"step": 405
},
{
"epoch": 1.8061674008810573,
"grad_norm": 0.28513938423430346,
"learning_rate": 4.307696795463113e-05,
"loss": 1.1879,
"step": 410
},
{
"epoch": 1.828193832599119,
"grad_norm": 0.31656390494301856,
"learning_rate": 4.236476438553086e-05,
"loss": 1.2365,
"step": 415
},
{
"epoch": 1.8502202643171806,
"grad_norm": 0.2728232724630057,
"learning_rate": 4.165738110079188e-05,
"loss": 1.1303,
"step": 420
},
{
"epoch": 1.8722466960352424,
"grad_norm": 0.2652052861267643,
"learning_rate": 4.095523632360687e-05,
"loss": 1.2028,
"step": 425
},
{
"epoch": 1.894273127753304,
"grad_norm": 0.25228905366706306,
"learning_rate": 4.025874518002815e-05,
"loss": 1.2777,
"step": 430
},
{
"epoch": 1.9162995594713657,
"grad_norm": 0.2923622331840007,
"learning_rate": 3.95683194535344e-05,
"loss": 1.2848,
"step": 435
},
{
"epoch": 1.9383259911894273,
"grad_norm": 0.27847518483753436,
"learning_rate": 3.88843673415737e-05,
"loss": 1.2911,
"step": 440
},
{
"epoch": 1.960352422907489,
"grad_norm": 0.2559544293947722,
"learning_rate": 3.8207293214226725e-05,
"loss": 1.1217,
"step": 445
},
{
"epoch": 1.9823788546255505,
"grad_norm": 0.2741260212367433,
"learning_rate": 3.7537497375132755e-05,
"loss": 1.188,
"step": 450
},
{
"epoch": 1.9911894273127753,
"eval_loss": 1.421474814414978,
"eval_runtime": 42.6046,
"eval_samples_per_second": 1.549,
"eval_steps_per_second": 0.211,
"step": 452
}
],
"logging_steps": 5,
"max_steps": 681,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 468830040096768.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
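
Below is a minimal Python sketch, assuming the JSON above is saved locally as trainer_state.json (the path and the print format are illustrative assumptions, not part of the uploaded file), showing how the log_history entries produced by the Hugging Face Transformers Trainer might be split into training-loss and eval-loss records for a quick summary.

import json

# Assumed local path to the trainer state shown above; adjust as needed.
PATH = "trainer_state.json"

with open(PATH) as f:
    state = json.load(f)

# Training steps carry a "loss" key; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

# Compact view of the training loss curve and learning-rate schedule.
for entry in train_logs:
    print(f"step {entry['step']:>4}  epoch {entry['epoch']:.3f}  "
          f"loss {entry['loss']:.4f}  lr {entry['learning_rate']:.2e}")

# Eval checkpoints: in this run, loss 1.505 at step 226 and 1.421 at step 452.
for entry in eval_logs:
    print(f"eval @ step {entry['step']}: loss {entry['eval_loss']:.4f} "
          f"({entry['eval_runtime']:.1f}s)")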