{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9911894273127753,
  "eval_steps": 500,
  "global_step": 452,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022026431718061675,
      "grad_norm": 1.0663584486206101,
      "learning_rate": 9.237540571428572e-06,
      "loss": 3.7016,
      "step": 5
    },
    {
      "epoch": 0.04405286343612335,
      "grad_norm": 0.8088844636114286,
      "learning_rate": 2.0784466285714287e-05,
      "loss": 3.5322,
      "step": 10
    },
    {
      "epoch": 0.06607929515418502,
      "grad_norm": 0.7375732667713775,
      "learning_rate": 3.2331392000000005e-05,
      "loss": 2.8192,
      "step": 15
    },
    {
      "epoch": 0.0881057268722467,
      "grad_norm": 0.8949585308681106,
      "learning_rate": 4.3878317714285716e-05,
      "loss": 2.4495,
      "step": 20
    },
    {
      "epoch": 0.11013215859030837,
      "grad_norm": 0.40288961916928345,
      "learning_rate": 5.542524342857144e-05,
      "loss": 1.9809,
      "step": 25
    },
    {
      "epoch": 0.13215859030837004,
      "grad_norm": 1.5689759086423734,
      "learning_rate": 6.697216914285716e-05,
      "loss": 1.8521,
      "step": 30
    },
    {
      "epoch": 0.15418502202643172,
      "grad_norm": 0.2643401651561827,
      "learning_rate": 7.851909485714286e-05,
      "loss": 1.8447,
      "step": 35
    },
    {
      "epoch": 0.1762114537444934,
      "grad_norm": 0.3828395123315439,
      "learning_rate": 8.0822745353115e-05,
      "loss": 1.6381,
      "step": 40
    },
    {
      "epoch": 0.19823788546255505,
      "grad_norm": 1.0422898029576655,
      "learning_rate": 8.079945206908332e-05,
      "loss": 1.6084,
      "step": 45
    },
    {
      "epoch": 0.22026431718061673,
      "grad_norm": 6.593814557940585,
      "learning_rate": 8.075825549346742e-05,
      "loss": 1.6388,
      "step": 50
    },
    {
      "epoch": 0.2422907488986784,
      "grad_norm": 0.23819531959892334,
      "learning_rate": 8.06991799827424e-05,
      "loss": 1.6555,
      "step": 55
    },
    {
      "epoch": 0.2643171806167401,
      "grad_norm": 0.4770300018453914,
      "learning_rate": 8.062226046386969e-05,
      "loss": 1.5852,
      "step": 60
    },
    {
      "epoch": 0.28634361233480177,
      "grad_norm": 0.252743363486302,
      "learning_rate": 8.052754241364732e-05,
      "loss": 1.5304,
      "step": 65
    },
    {
      "epoch": 0.30837004405286345,
      "grad_norm": 0.32218530888051744,
      "learning_rate": 8.041508183182285e-05,
      "loss": 1.4884,
      "step": 70
    },
    {
      "epoch": 0.3303964757709251,
      "grad_norm": 5.427482328857429,
      "learning_rate": 8.028494520798486e-05,
      "loss": 1.6083,
      "step": 75
    },
    {
      "epoch": 0.3524229074889868,
      "grad_norm": 2.1000249470749024,
      "learning_rate": 8.013720948225267e-05,
      "loss": 1.481,
      "step": 80
    },
    {
      "epoch": 0.3744493392070485,
      "grad_norm": 0.5069089744093597,
      "learning_rate": 7.997196199978724e-05,
      "loss": 1.5425,
      "step": 85
    },
    {
      "epoch": 0.3964757709251101,
      "grad_norm": 0.28182829475876264,
      "learning_rate": 7.978930045915059e-05,
      "loss": 1.4742,
      "step": 90
    },
    {
      "epoch": 0.4185022026431718,
      "grad_norm": 0.25226708061115644,
      "learning_rate": 7.958933285454381e-05,
      "loss": 1.5259,
      "step": 95
    },
    {
      "epoch": 0.44052863436123346,
      "grad_norm": 0.2940763633451413,
      "learning_rate": 7.937217741195818e-05,
      "loss": 1.4476,
      "step": 100
    },
    {
      "epoch": 0.46255506607929514,
      "grad_norm": 0.2998708778393728,
      "learning_rate": 7.913796251927683e-05,
      "loss": 1.4677,
      "step": 105
    },
    {
      "epoch": 0.4845814977973568,
      "grad_norm": 0.30999571048347585,
      "learning_rate": 7.888682665036862e-05,
      "loss": 1.4895,
      "step": 110
    },
    {
      "epoch": 0.5066079295154186,
      "grad_norm": 0.3385710708643517,
      "learning_rate": 7.861891828321876e-05,
      "loss": 1.4467,
      "step": 115
    },
    {
      "epoch": 0.5286343612334802,
      "grad_norm": 0.2405218018246251,
      "learning_rate": 7.833439581214485e-05,
      "loss": 1.463,
      "step": 120
    },
    {
      "epoch": 0.5506607929515418,
      "grad_norm": 0.3120582655284802,
      "learning_rate": 7.803342745415004e-05,
      "loss": 1.4433,
      "step": 125
    },
    {
      "epoch": 0.5726872246696035,
      "grad_norm": 0.24571731344972014,
      "learning_rate": 7.771619114946885e-05,
      "loss": 1.537,
      "step": 130
    },
    {
      "epoch": 0.5947136563876652,
      "grad_norm": 0.24744483588882188,
      "learning_rate": 7.738287445636435e-05,
      "loss": 1.4349,
      "step": 135
    },
    {
      "epoch": 0.6167400881057269,
      "grad_norm": 0.2340432324202039,
      "learning_rate": 7.703367444023876e-05,
      "loss": 1.3148,
      "step": 140
    },
    {
      "epoch": 0.6387665198237885,
      "grad_norm": 0.33180577529696736,
      "learning_rate": 7.666879755712349e-05,
      "loss": 1.3527,
      "step": 145
    },
    {
      "epoch": 0.6607929515418502,
      "grad_norm": 0.29748503637539375,
      "learning_rate": 7.628845953161691e-05,
      "loss": 1.3921,
      "step": 150
    },
    {
      "epoch": 0.6828193832599119,
      "grad_norm": 0.2568531998827171,
      "learning_rate": 7.589288522934248e-05,
      "loss": 1.3191,
      "step": 155
    },
    {
      "epoch": 0.7048458149779736,
      "grad_norm": 0.2420347997963869,
      "learning_rate": 7.548230852400232e-05,
      "loss": 1.3777,
      "step": 160
    },
    {
      "epoch": 0.7268722466960352,
      "grad_norm": 0.22010679080838433,
      "learning_rate": 7.505697215910518e-05,
      "loss": 1.3653,
      "step": 165
    },
    {
      "epoch": 0.748898678414097,
      "grad_norm": 0.3729582538192735,
      "learning_rate": 7.461712760445017e-05,
      "loss": 1.3821,
      "step": 170
    },
    {
      "epoch": 0.7709251101321586,
      "grad_norm": 0.24471717017646333,
      "learning_rate": 7.416303490745132e-05,
      "loss": 1.3272,
      "step": 175
    },
    {
      "epoch": 0.7929515418502202,
      "grad_norm": 0.23718932476563045,
      "learning_rate": 7.369496253939093e-05,
      "loss": 1.4063,
      "step": 180
    },
    {
      "epoch": 0.8149779735682819,
      "grad_norm": 0.273316183010925,
      "learning_rate": 7.321318723669236e-05,
      "loss": 1.3922,
      "step": 185
    },
    {
      "epoch": 0.8370044052863436,
      "grad_norm": 0.2723324432950777,
      "learning_rate": 7.271799383730644e-05,
      "loss": 1.5102,
      "step": 190
    },
    {
      "epoch": 0.8590308370044053,
      "grad_norm": 0.2486191758110478,
      "learning_rate": 7.220967511230787e-05,
      "loss": 1.4539,
      "step": 195
    },
    {
      "epoch": 0.8810572687224669,
      "grad_norm": 0.20359743027793406,
      "learning_rate": 7.168853159280142e-05,
      "loss": 1.3669,
      "step": 200
    },
    {
      "epoch": 0.9030837004405287,
      "grad_norm": 0.3452908425906942,
      "learning_rate": 7.115487139224027e-05,
      "loss": 1.4474,
      "step": 205
    },
    {
      "epoch": 0.9251101321585903,
      "grad_norm": 0.24206927604187342,
      "learning_rate": 7.060901002426141e-05,
      "loss": 1.405,
      "step": 210
    },
    {
      "epoch": 0.947136563876652,
      "grad_norm": 0.2736785474465845,
      "learning_rate": 7.00512702161458e-05,
      "loss": 1.3685,
      "step": 215
    },
    {
      "epoch": 0.9691629955947136,
      "grad_norm": 0.23978655563774517,
      "learning_rate": 6.948198171801373e-05,
      "loss": 1.4067,
      "step": 220
    },
    {
      "epoch": 0.9911894273127754,
      "grad_norm": 0.22393386935110163,
      "learning_rate": 6.890148110786813e-05,
      "loss": 1.3082,
      "step": 225
    },
    {
      "epoch": 0.9955947136563876,
      "eval_loss": 1.50490403175354,
      "eval_runtime": 42.9483,
      "eval_samples_per_second": 1.537,
      "eval_steps_per_second": 0.21,
      "step": 226
    },
    {
      "epoch": 1.013215859030837,
      "grad_norm": 0.29391885004093155,
      "learning_rate": 6.831011159260084e-05,
      "loss": 1.2652,
      "step": 230
    },
    {
      "epoch": 1.0352422907488987,
      "grad_norm": 0.2262762255488471,
      "learning_rate": 6.770822280508006e-05,
      "loss": 1.1647,
      "step": 235
    },
    {
      "epoch": 1.0572687224669604,
      "grad_norm": 0.25236435689633363,
      "learning_rate": 6.70961705974383e-05,
      "loss": 1.256,
      "step": 240
    },
    {
      "epoch": 1.079295154185022,
      "grad_norm": 0.30150815984057305,
      "learning_rate": 6.647431683068342e-05,
      "loss": 1.2419,
      "step": 245
    },
    {
      "epoch": 1.1013215859030836,
      "grad_norm": 0.2461529615622769,
      "learning_rate": 6.58430291607572e-05,
      "loss": 1.2819,
      "step": 250
    },
    {
      "epoch": 1.1233480176211454,
      "grad_norm": 0.2617560590756918,
      "learning_rate": 6.520268082116754e-05,
      "loss": 1.1769,
      "step": 255
    },
    {
      "epoch": 1.145374449339207,
      "grad_norm": 0.2675888813814032,
      "learning_rate": 6.455365040232338e-05,
      "loss": 1.3418,
      "step": 260
    },
    {
      "epoch": 1.1674008810572687,
      "grad_norm": 0.4478862449004952,
      "learning_rate": 6.389632162770219e-05,
      "loss": 1.2083,
      "step": 265
    },
    {
      "epoch": 1.1894273127753303,
      "grad_norm": 0.2820003778060524,
      "learning_rate": 6.32310831269828e-05,
      "loss": 1.2821,
      "step": 270
    },
    {
      "epoch": 1.2114537444933922,
      "grad_norm": 0.25852039054845183,
      "learning_rate": 6.255832820627763e-05,
      "loss": 1.1638,
      "step": 275
    },
    {
      "epoch": 1.2334801762114538,
      "grad_norm": 0.25522724381792494,
      "learning_rate": 6.187845461559982e-05,
      "loss": 1.264,
      "step": 280
    },
    {
      "epoch": 1.2555066079295154,
      "grad_norm": 0.2597245715157646,
      "learning_rate": 6.119186431370339e-05,
      "loss": 1.2434,
      "step": 285
    },
    {
      "epoch": 1.277533039647577,
      "grad_norm": 0.2582862000714317,
      "learning_rate": 6.049896323043476e-05,
      "loss": 1.2385,
      "step": 290
    },
    {
      "epoch": 1.2995594713656389,
      "grad_norm": 0.2828799977378617,
      "learning_rate": 5.9800161026736606e-05,
      "loss": 1.1716,
      "step": 295
    },
    {
      "epoch": 1.3215859030837005,
      "grad_norm": 0.27495857073641394,
      "learning_rate": 5.9095870852445795e-05,
      "loss": 1.2569,
      "step": 300
    },
    {
      "epoch": 1.3436123348017621,
      "grad_norm": 0.26885812885440236,
      "learning_rate": 5.83865091020286e-05,
      "loss": 1.1797,
      "step": 305
    },
    {
      "epoch": 1.3656387665198237,
      "grad_norm": 0.26990836029271487,
      "learning_rate": 5.767249516839748e-05,
      "loss": 1.2667,
      "step": 310
    },
    {
      "epoch": 1.3876651982378854,
      "grad_norm": 0.24942586412100395,
      "learning_rate": 5.6954251194955336e-05,
      "loss": 1.2285,
      "step": 315
    },
    {
      "epoch": 1.4096916299559472,
      "grad_norm": 0.2837832327793825,
      "learning_rate": 5.6232201826013346e-05,
      "loss": 1.2649,
      "step": 320
    },
    {
      "epoch": 1.4317180616740088,
      "grad_norm": 0.27338241561639043,
      "learning_rate": 5.550677395573045e-05,
      "loss": 1.247,
      "step": 325
    },
    {
      "epoch": 1.4537444933920705,
      "grad_norm": 0.26757127591761626,
      "learning_rate": 5.477839647572243e-05,
      "loss": 1.2455,
      "step": 330
    },
    {
      "epoch": 1.475770925110132,
      "grad_norm": 0.25393138651995495,
      "learning_rate": 5.404750002149023e-05,
      "loss": 1.2512,
      "step": 335
    },
    {
      "epoch": 1.497797356828194,
      "grad_norm": 0.2657447805705433,
      "learning_rate": 5.3314516717817126e-05,
      "loss": 1.2397,
      "step": 340
    },
    {
      "epoch": 1.5198237885462555,
      "grad_norm": 0.25474789006371235,
      "learning_rate": 5.257987992328549e-05,
      "loss": 1.1496,
      "step": 345
    },
    {
      "epoch": 1.5418502202643172,
      "grad_norm": 0.2751533270870879,
      "learning_rate": 5.1844023974064006e-05,
      "loss": 1.2603,
      "step": 350
    },
    {
      "epoch": 1.5638766519823788,
      "grad_norm": 0.26365407252111467,
      "learning_rate": 5.110738392711697e-05,
      "loss": 1.1364,
      "step": 355
    },
    {
      "epoch": 1.5859030837004404,
      "grad_norm": 0.2525541732623165,
      "learning_rate": 5.037039530298738e-05,
      "loss": 1.1592,
      "step": 360
    },
    {
      "epoch": 1.607929515418502,
      "grad_norm": 0.29719446796341475,
      "learning_rate": 4.9633493828306105e-05,
      "loss": 1.2675,
      "step": 365
    },
    {
      "epoch": 1.6299559471365639,
      "grad_norm": 0.2744533935942541,
      "learning_rate": 4.889711517817897e-05,
      "loss": 1.2058,
      "step": 370
    },
    {
      "epoch": 1.6519823788546255,
      "grad_norm": 0.2553055879619236,
      "learning_rate": 4.8161694718604484e-05,
      "loss": 1.2671,
      "step": 375
    },
    {
      "epoch": 1.6740088105726874,
      "grad_norm": 0.2641167238472348,
      "learning_rate": 4.742766724907424e-05,
      "loss": 1.2367,
      "step": 380
    },
    {
      "epoch": 1.696035242290749,
      "grad_norm": 0.29534926894855623,
      "learning_rate": 4.6695466745508345e-05,
      "loss": 1.2394,
      "step": 385
    },
    {
      "epoch": 1.7180616740088106,
      "grad_norm": 0.27781303700098053,
      "learning_rate": 4.5965526103677575e-05,
      "loss": 1.2047,
      "step": 390
    },
    {
      "epoch": 1.7400881057268722,
      "grad_norm": 0.25175443663281,
      "learning_rate": 4.523827688326434e-05,
      "loss": 1.2718,
      "step": 395
    },
    {
      "epoch": 1.7621145374449338,
      "grad_norm": 0.2660850278265443,
      "learning_rate": 4.451414905271349e-05,
      "loss": 1.2061,
      "step": 400
    },
    {
      "epoch": 1.7841409691629955,
      "grad_norm": 0.2680968372741624,
      "learning_rate": 4.379357073502389e-05,
      "loss": 1.2053,
      "step": 405
    },
    {
      "epoch": 1.8061674008810573,
      "grad_norm": 0.28513938423430346,
      "learning_rate": 4.307696795463113e-05,
      "loss": 1.1879,
      "step": 410
    },
    {
      "epoch": 1.828193832599119,
      "grad_norm": 0.31656390494301856,
      "learning_rate": 4.236476438553086e-05,
      "loss": 1.2365,
      "step": 415
    },
    {
      "epoch": 1.8502202643171806,
      "grad_norm": 0.2728232724630057,
      "learning_rate": 4.165738110079188e-05,
      "loss": 1.1303,
      "step": 420
    },
    {
      "epoch": 1.8722466960352424,
      "grad_norm": 0.2652052861267643,
      "learning_rate": 4.095523632360687e-05,
      "loss": 1.2028,
      "step": 425
    },
    {
      "epoch": 1.894273127753304,
      "grad_norm": 0.25228905366706306,
      "learning_rate": 4.025874518002815e-05,
      "loss": 1.2777,
      "step": 430
    },
    {
      "epoch": 1.9162995594713657,
      "grad_norm": 0.2923622331840007,
      "learning_rate": 3.95683194535344e-05,
      "loss": 1.2848,
      "step": 435
    },
    {
      "epoch": 1.9383259911894273,
      "grad_norm": 0.27847518483753436,
      "learning_rate": 3.88843673415737e-05,
      "loss": 1.2911,
      "step": 440
    },
    {
      "epoch": 1.960352422907489,
      "grad_norm": 0.2559544293947722,
      "learning_rate": 3.8207293214226725e-05,
      "loss": 1.1217,
      "step": 445
    },
    {
      "epoch": 1.9823788546255505,
      "grad_norm": 0.2741260212367433,
      "learning_rate": 3.7537497375132755e-05,
      "loss": 1.188,
      "step": 450
    },
    {
      "epoch": 1.9911894273127753,
      "eval_loss": 1.421474814414978,
      "eval_runtime": 42.6046,
      "eval_samples_per_second": 1.549,
      "eval_steps_per_second": 0.211,
      "step": 452
    }
  ],
  "logging_steps": 5,
  "max_steps": 681,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 468830040096768.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}