{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9997052321296978, "eval_steps": 100, "global_step": 7632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013100794235650537, "grad_norm": 65.25566997593643, "learning_rate": 2.6178010471204188e-08, "loss": 0.8756, "step": 5 }, { "epoch": 0.0026201588471301074, "grad_norm": 62.55868359719172, "learning_rate": 5.2356020942408376e-08, "loss": 0.8355, "step": 10 }, { "epoch": 0.003930238270695161, "grad_norm": 72.36291944323244, "learning_rate": 7.853403141361257e-08, "loss": 0.9024, "step": 15 }, { "epoch": 0.005240317694260215, "grad_norm": 78.2927923163889, "learning_rate": 1.0471204188481675e-07, "loss": 0.9112, "step": 20 }, { "epoch": 0.006550397117825268, "grad_norm": 73.20997266140758, "learning_rate": 1.3089005235602092e-07, "loss": 0.9038, "step": 25 }, { "epoch": 0.007860476541390321, "grad_norm": 79.5698279926454, "learning_rate": 1.5706806282722514e-07, "loss": 0.7385, "step": 30 }, { "epoch": 0.009170555964955375, "grad_norm": 62.25008705240314, "learning_rate": 1.8324607329842932e-07, "loss": 0.8842, "step": 35 }, { "epoch": 0.01048063538852043, "grad_norm": 71.93212371078113, "learning_rate": 2.094240837696335e-07, "loss": 0.8456, "step": 40 }, { "epoch": 0.011790714812085483, "grad_norm": 76.81854898396818, "learning_rate": 2.356020942408377e-07, "loss": 0.8637, "step": 45 }, { "epoch": 0.013100794235650536, "grad_norm": 72.77223708616008, "learning_rate": 2.6178010471204185e-07, "loss": 0.8457, "step": 50 }, { "epoch": 0.01441087365921559, "grad_norm": 82.53172515882576, "learning_rate": 2.879581151832461e-07, "loss": 0.809, "step": 55 }, { "epoch": 0.015720953082780643, "grad_norm": 63.36114555923586, "learning_rate": 3.1413612565445027e-07, "loss": 0.7974, "step": 60 }, { "epoch": 0.017031032506345696, "grad_norm": 67.13005941209079, "learning_rate": 3.4031413612565446e-07, "loss": 0.7941, "step": 65 }, { "epoch": 0.01834111192991075, "grad_norm": 62.35455050500593, "learning_rate": 3.6649214659685864e-07, "loss": 0.8121, "step": 70 }, { "epoch": 0.019651191353475806, "grad_norm": 61.13921963104784, "learning_rate": 3.926701570680628e-07, "loss": 0.6837, "step": 75 }, { "epoch": 0.02096127077704086, "grad_norm": 60.734863824940575, "learning_rate": 4.18848167539267e-07, "loss": 0.6569, "step": 80 }, { "epoch": 0.022271350200605913, "grad_norm": 68.43319499092718, "learning_rate": 4.450261780104712e-07, "loss": 0.7585, "step": 85 }, { "epoch": 0.023581429624170966, "grad_norm": 52.09969303256668, "learning_rate": 4.712041884816754e-07, "loss": 0.6591, "step": 90 }, { "epoch": 0.02489150904773602, "grad_norm": 60.01362166751946, "learning_rate": 4.973821989528796e-07, "loss": 0.625, "step": 95 }, { "epoch": 0.026201588471301072, "grad_norm": 58.796731028708756, "learning_rate": 5.235602094240837e-07, "loss": 0.687, "step": 100 }, { "epoch": 0.026201588471301072, "eval_accuracy": 0.5504, "eval_loss": 0.7788666486740112, "eval_runtime": 138.8257, "eval_samples_per_second": 9.004, "eval_steps_per_second": 2.255, "step": 100 }, { "epoch": 0.027511667894866126, "grad_norm": 45.6814258154, "learning_rate": 5.497382198952879e-07, "loss": 0.6697, "step": 105 }, { "epoch": 0.02882174731843118, "grad_norm": 55.768935605768114, "learning_rate": 5.759162303664922e-07, "loss": 0.6187, "step": 110 }, { "epoch": 0.030131826741996232, "grad_norm": 44.123709497680935, "learning_rate": 6.020942408376963e-07, "loss": 0.59, "step": 115 }, { "epoch": 0.031441906165561286, "grad_norm": 40.077067899984414, "learning_rate": 6.282722513089005e-07, "loss": 0.6216, "step": 120 }, { "epoch": 0.03275198558912634, "grad_norm": 39.216463798971496, "learning_rate": 6.544502617801047e-07, "loss": 0.6603, "step": 125 }, { "epoch": 0.03406206501269139, "grad_norm": 34.95260150433843, "learning_rate": 6.806282722513089e-07, "loss": 0.5664, "step": 130 }, { "epoch": 0.03537214443625645, "grad_norm": 37.834491450835614, "learning_rate": 7.06806282722513e-07, "loss": 0.5947, "step": 135 }, { "epoch": 0.0366822238598215, "grad_norm": 32.63280938760141, "learning_rate": 7.329842931937173e-07, "loss": 0.6015, "step": 140 }, { "epoch": 0.037992303283386555, "grad_norm": 37.4913705926391, "learning_rate": 7.591623036649214e-07, "loss": 0.5922, "step": 145 }, { "epoch": 0.03930238270695161, "grad_norm": 26.355681906302205, "learning_rate": 7.853403141361256e-07, "loss": 0.5681, "step": 150 }, { "epoch": 0.04061246213051666, "grad_norm": 31.880949398269596, "learning_rate": 8.115183246073298e-07, "loss": 0.5664, "step": 155 }, { "epoch": 0.04192254155408172, "grad_norm": 20.65782299342999, "learning_rate": 8.37696335078534e-07, "loss": 0.5211, "step": 160 }, { "epoch": 0.04323262097764677, "grad_norm": 22.258499509814012, "learning_rate": 8.638743455497382e-07, "loss": 0.5219, "step": 165 }, { "epoch": 0.044542700401211825, "grad_norm": 21.846834289704923, "learning_rate": 8.900523560209424e-07, "loss": 0.5059, "step": 170 }, { "epoch": 0.045852779824776875, "grad_norm": 23.18302368260191, "learning_rate": 9.162303664921466e-07, "loss": 0.518, "step": 175 }, { "epoch": 0.04716285924834193, "grad_norm": 19.652604647477776, "learning_rate": 9.424083769633508e-07, "loss": 0.5271, "step": 180 }, { "epoch": 0.04847293867190698, "grad_norm": 17.965969880689265, "learning_rate": 9.68586387434555e-07, "loss": 0.5313, "step": 185 }, { "epoch": 0.04978301809547204, "grad_norm": 17.265465595838617, "learning_rate": 9.947643979057591e-07, "loss": 0.4635, "step": 190 }, { "epoch": 0.051093097519037095, "grad_norm": 12.731564683260206, "learning_rate": 1.0209424083769633e-06, "loss": 0.4914, "step": 195 }, { "epoch": 0.052403176942602145, "grad_norm": 14.240010539299435, "learning_rate": 1.0471204188481674e-06, "loss": 0.5163, "step": 200 }, { "epoch": 0.052403176942602145, "eval_accuracy": 0.4488, "eval_loss": 0.773366391658783, "eval_runtime": 139.5762, "eval_samples_per_second": 8.956, "eval_steps_per_second": 2.243, "step": 200 }, { "epoch": 0.0537132563661672, "grad_norm": 19.63370884990648, "learning_rate": 1.0732984293193717e-06, "loss": 0.5581, "step": 205 }, { "epoch": 0.05502333578973225, "grad_norm": 10.618120041300497, "learning_rate": 1.0994764397905759e-06, "loss": 0.444, "step": 210 }, { "epoch": 0.05633341521329731, "grad_norm": 17.44393406285321, "learning_rate": 1.12565445026178e-06, "loss": 0.5224, "step": 215 }, { "epoch": 0.05764349463686236, "grad_norm": 12.189359941143175, "learning_rate": 1.1518324607329843e-06, "loss": 0.4787, "step": 220 }, { "epoch": 0.058953574060427415, "grad_norm": 12.078094446605022, "learning_rate": 1.1780104712041885e-06, "loss": 0.5046, "step": 225 }, { "epoch": 0.060263653483992465, "grad_norm": 12.745981182030365, "learning_rate": 1.2041884816753926e-06, "loss": 0.5234, "step": 230 }, { "epoch": 0.06157373290755752, "grad_norm": 9.685521970587265, "learning_rate": 1.2303664921465967e-06, "loss": 0.4313, "step": 235 }, { "epoch": 0.06288381233112257, "grad_norm": 14.736633585803254, "learning_rate": 1.256544502617801e-06, "loss": 0.4547, "step": 240 }, { "epoch": 0.06419389175468763, "grad_norm": 8.794949376696293, "learning_rate": 1.2827225130890052e-06, "loss": 0.4405, "step": 245 }, { "epoch": 0.06550397117825268, "grad_norm": 11.249829085031893, "learning_rate": 1.3089005235602093e-06, "loss": 0.4307, "step": 250 }, { "epoch": 0.06681405060181773, "grad_norm": 13.390941109258982, "learning_rate": 1.3350785340314135e-06, "loss": 0.4998, "step": 255 }, { "epoch": 0.06812413002538278, "grad_norm": 9.534912328167007, "learning_rate": 1.3612565445026178e-06, "loss": 0.472, "step": 260 }, { "epoch": 0.06943420944894785, "grad_norm": 8.397391075769068, "learning_rate": 1.387434554973822e-06, "loss": 0.4433, "step": 265 }, { "epoch": 0.0707442888725129, "grad_norm": 10.746633363582351, "learning_rate": 1.413612565445026e-06, "loss": 0.4567, "step": 270 }, { "epoch": 0.07205436829607795, "grad_norm": 12.05720859118597, "learning_rate": 1.4397905759162302e-06, "loss": 0.4549, "step": 275 }, { "epoch": 0.073364447719643, "grad_norm": 10.006378930278041, "learning_rate": 1.4659685863874346e-06, "loss": 0.4329, "step": 280 }, { "epoch": 0.07467452714320806, "grad_norm": 9.597335107124772, "learning_rate": 1.4921465968586387e-06, "loss": 0.4084, "step": 285 }, { "epoch": 0.07598460656677311, "grad_norm": 11.481222700352367, "learning_rate": 1.5183246073298428e-06, "loss": 0.4149, "step": 290 }, { "epoch": 0.07729468599033816, "grad_norm": 19.43866947694484, "learning_rate": 1.544502617801047e-06, "loss": 0.4507, "step": 295 }, { "epoch": 0.07860476541390322, "grad_norm": 12.044720730463665, "learning_rate": 1.5706806282722513e-06, "loss": 0.4566, "step": 300 }, { "epoch": 0.07860476541390322, "eval_accuracy": 0.5296, "eval_loss": 0.8311891555786133, "eval_runtime": 139.1725, "eval_samples_per_second": 8.982, "eval_steps_per_second": 2.249, "step": 300 }, { "epoch": 0.07991484483746827, "grad_norm": 11.917516090251366, "learning_rate": 1.5968586387434554e-06, "loss": 0.4058, "step": 305 }, { "epoch": 0.08122492426103332, "grad_norm": 12.158737920925903, "learning_rate": 1.6230366492146596e-06, "loss": 0.3985, "step": 310 }, { "epoch": 0.08253500368459837, "grad_norm": 11.207695168487689, "learning_rate": 1.649214659685864e-06, "loss": 0.3668, "step": 315 }, { "epoch": 0.08384508310816344, "grad_norm": 14.299364939902476, "learning_rate": 1.675392670157068e-06, "loss": 0.4118, "step": 320 }, { "epoch": 0.08515516253172849, "grad_norm": 8.796874162358842, "learning_rate": 1.7015706806282722e-06, "loss": 0.417, "step": 325 }, { "epoch": 0.08646524195529354, "grad_norm": 6.423994778367414, "learning_rate": 1.7277486910994763e-06, "loss": 0.3693, "step": 330 }, { "epoch": 0.08777532137885859, "grad_norm": 14.257916893772826, "learning_rate": 1.7539267015706804e-06, "loss": 0.4209, "step": 335 }, { "epoch": 0.08908540080242365, "grad_norm": 10.247378725750938, "learning_rate": 1.7801047120418848e-06, "loss": 0.4086, "step": 340 }, { "epoch": 0.0903954802259887, "grad_norm": 13.028352515928068, "learning_rate": 1.806282722513089e-06, "loss": 0.4366, "step": 345 }, { "epoch": 0.09170555964955375, "grad_norm": 8.037401497631812, "learning_rate": 1.8324607329842933e-06, "loss": 0.3272, "step": 350 }, { "epoch": 0.09301563907311881, "grad_norm": 6.9081571017698655, "learning_rate": 1.8586387434554974e-06, "loss": 0.3677, "step": 355 }, { "epoch": 0.09432571849668386, "grad_norm": 8.726274241413915, "learning_rate": 1.8848167539267015e-06, "loss": 0.3582, "step": 360 }, { "epoch": 0.09563579792024891, "grad_norm": 8.438818851906133, "learning_rate": 1.9109947643979056e-06, "loss": 0.3923, "step": 365 }, { "epoch": 0.09694587734381396, "grad_norm": 9.975870802252798, "learning_rate": 1.93717277486911e-06, "loss": 0.3755, "step": 370 }, { "epoch": 0.09825595676737903, "grad_norm": 11.07336790257551, "learning_rate": 1.963350785340314e-06, "loss": 0.3931, "step": 375 }, { "epoch": 0.09956603619094408, "grad_norm": 6.87508978730872, "learning_rate": 1.9895287958115183e-06, "loss": 0.3723, "step": 380 }, { "epoch": 0.10087611561450913, "grad_norm": 7.299612626576022, "learning_rate": 1.999999155039932e-06, "loss": 0.3936, "step": 385 }, { "epoch": 0.10218619503807419, "grad_norm": 8.19359622835764, "learning_rate": 1.999993991400246e-06, "loss": 0.3347, "step": 390 }, { "epoch": 0.10349627446163924, "grad_norm": 7.35341462576354, "learning_rate": 1.9999841335673434e-06, "loss": 0.3843, "step": 395 }, { "epoch": 0.10480635388520429, "grad_norm": 6.711354211930067, "learning_rate": 1.999969581587499e-06, "loss": 0.3568, "step": 400 }, { "epoch": 0.10480635388520429, "eval_accuracy": 0.6608, "eval_loss": 0.739296019077301, "eval_runtime": 142.7751, "eval_samples_per_second": 8.755, "eval_steps_per_second": 2.192, "step": 400 }, { "epoch": 0.10611643330876934, "grad_norm": 10.024329644547777, "learning_rate": 1.999950335529023e-06, "loss": 0.377, "step": 405 }, { "epoch": 0.1074265127323344, "grad_norm": 9.78542639003509, "learning_rate": 1.999926395482261e-06, "loss": 0.3109, "step": 410 }, { "epoch": 0.10873659215589945, "grad_norm": 5.4448738851908445, "learning_rate": 1.999897761559593e-06, "loss": 0.3238, "step": 415 }, { "epoch": 0.1100466715794645, "grad_norm": 5.403174975957244, "learning_rate": 1.999864433895432e-06, "loss": 0.317, "step": 420 }, { "epoch": 0.11135675100302955, "grad_norm": 14.489233866558225, "learning_rate": 1.9998264126462264e-06, "loss": 0.3485, "step": 425 }, { "epoch": 0.11266683042659462, "grad_norm": 7.983108029464349, "learning_rate": 1.999783697990456e-06, "loss": 0.3745, "step": 430 }, { "epoch": 0.11397690985015967, "grad_norm": 6.210263065691716, "learning_rate": 1.9997362901286328e-06, "loss": 0.3134, "step": 435 }, { "epoch": 0.11528698927372472, "grad_norm": 5.7095639740154445, "learning_rate": 1.9996841892832997e-06, "loss": 0.2956, "step": 440 }, { "epoch": 0.11659706869728978, "grad_norm": 5.926962115905033, "learning_rate": 1.9996273956990303e-06, "loss": 0.3558, "step": 445 }, { "epoch": 0.11790714812085483, "grad_norm": 7.303119154977969, "learning_rate": 1.999565909642425e-06, "loss": 0.3417, "step": 450 }, { "epoch": 0.11921722754441988, "grad_norm": 10.835920993927127, "learning_rate": 1.9994997314021146e-06, "loss": 0.4127, "step": 455 }, { "epoch": 0.12052730696798493, "grad_norm": 7.096230008529465, "learning_rate": 1.999428861288753e-06, "loss": 0.3652, "step": 460 }, { "epoch": 0.12183738639154999, "grad_norm": 7.856938952483924, "learning_rate": 1.999353299635021e-06, "loss": 0.3319, "step": 465 }, { "epoch": 0.12314746581511504, "grad_norm": 5.163875987837063, "learning_rate": 1.9992730467956218e-06, "loss": 0.3274, "step": 470 }, { "epoch": 0.12445754523868009, "grad_norm": 5.096460744829759, "learning_rate": 1.9991881031472787e-06, "loss": 0.3369, "step": 475 }, { "epoch": 0.12576762466224514, "grad_norm": 4.898833760747336, "learning_rate": 1.9990984690887376e-06, "loss": 0.3342, "step": 480 }, { "epoch": 0.1270777040858102, "grad_norm": 8.546538011577958, "learning_rate": 1.99900414504076e-06, "loss": 0.3675, "step": 485 }, { "epoch": 0.12838778350937527, "grad_norm": 9.545619233480027, "learning_rate": 1.998905131446124e-06, "loss": 0.2993, "step": 490 }, { "epoch": 0.1296978629329403, "grad_norm": 6.0468014618988, "learning_rate": 1.998801428769621e-06, "loss": 0.3337, "step": 495 }, { "epoch": 0.13100794235650537, "grad_norm": 6.525761511275149, "learning_rate": 1.998693037498054e-06, "loss": 0.3504, "step": 500 }, { "epoch": 0.13100794235650537, "eval_accuracy": 0.6496, "eval_loss": 0.7146463394165039, "eval_runtime": 141.9904, "eval_samples_per_second": 8.803, "eval_steps_per_second": 2.204, "step": 500 }, { "epoch": 0.1323180217800704, "grad_norm": 6.303773120304537, "learning_rate": 1.9985799581402366e-06, "loss": 0.3254, "step": 505 }, { "epoch": 0.13362810120363547, "grad_norm": 7.995893571860963, "learning_rate": 1.998462191226988e-06, "loss": 0.3392, "step": 510 }, { "epoch": 0.13493818062720053, "grad_norm": 9.171359972165128, "learning_rate": 1.9983397373111318e-06, "loss": 0.3223, "step": 515 }, { "epoch": 0.13624826005076557, "grad_norm": 5.152252573879665, "learning_rate": 1.9982125969674943e-06, "loss": 0.3214, "step": 520 }, { "epoch": 0.13755833947433063, "grad_norm": 6.810478020843169, "learning_rate": 1.9980807707929e-06, "loss": 0.3643, "step": 525 }, { "epoch": 0.1388684188978957, "grad_norm": 6.555447149076346, "learning_rate": 1.99794425940617e-06, "loss": 0.3173, "step": 530 }, { "epoch": 0.14017849832146073, "grad_norm": 7.460824741390232, "learning_rate": 1.99780306344812e-06, "loss": 0.362, "step": 535 }, { "epoch": 0.1414885777450258, "grad_norm": 4.465161138093627, "learning_rate": 1.997657183581554e-06, "loss": 0.2876, "step": 540 }, { "epoch": 0.14279865716859086, "grad_norm": 6.492929729490839, "learning_rate": 1.997506620491265e-06, "loss": 0.3412, "step": 545 }, { "epoch": 0.1441087365921559, "grad_norm": 6.323199580280994, "learning_rate": 1.9973513748840294e-06, "loss": 0.2913, "step": 550 }, { "epoch": 0.14541881601572096, "grad_norm": 6.860707779728365, "learning_rate": 1.997191447488604e-06, "loss": 0.2841, "step": 555 }, { "epoch": 0.146728895439286, "grad_norm": 7.428003430414849, "learning_rate": 1.9970268390557235e-06, "loss": 0.3296, "step": 560 }, { "epoch": 0.14803897486285106, "grad_norm": 9.809910396455075, "learning_rate": 1.996857550358097e-06, "loss": 0.3316, "step": 565 }, { "epoch": 0.14934905428641612, "grad_norm": 6.120283708913698, "learning_rate": 1.9966835821904022e-06, "loss": 0.3227, "step": 570 }, { "epoch": 0.15065913370998116, "grad_norm": 5.518190183534834, "learning_rate": 1.9965049353692853e-06, "loss": 0.3271, "step": 575 }, { "epoch": 0.15196921313354622, "grad_norm": 4.956536091880624, "learning_rate": 1.996321610733353e-06, "loss": 0.3677, "step": 580 }, { "epoch": 0.15327929255711129, "grad_norm": 6.782512343216961, "learning_rate": 1.9961336091431724e-06, "loss": 0.3538, "step": 585 }, { "epoch": 0.15458937198067632, "grad_norm": 4.957135478968806, "learning_rate": 1.995940931481264e-06, "loss": 0.3716, "step": 590 }, { "epoch": 0.15589945140424138, "grad_norm": 5.89621538406006, "learning_rate": 1.9957435786521003e-06, "loss": 0.3211, "step": 595 }, { "epoch": 0.15720953082780645, "grad_norm": 3.4716470492850133, "learning_rate": 1.9955415515820982e-06, "loss": 0.3335, "step": 600 }, { "epoch": 0.15720953082780645, "eval_accuracy": 0.74, "eval_loss": 0.6648128628730774, "eval_runtime": 143.2818, "eval_samples_per_second": 8.724, "eval_steps_per_second": 2.185, "step": 600 }, { "epoch": 0.15851961025137148, "grad_norm": 4.420746512696032, "learning_rate": 1.9953348512196184e-06, "loss": 0.3074, "step": 605 }, { "epoch": 0.15982968967493655, "grad_norm": 5.278777879896076, "learning_rate": 1.9951234785349572e-06, "loss": 0.3338, "step": 610 }, { "epoch": 0.16113976909850158, "grad_norm": 7.753730093542852, "learning_rate": 1.9949074345203457e-06, "loss": 0.3409, "step": 615 }, { "epoch": 0.16244984852206665, "grad_norm": 4.666850591356625, "learning_rate": 1.9946867201899415e-06, "loss": 0.3368, "step": 620 }, { "epoch": 0.1637599279456317, "grad_norm": 3.574785990989966, "learning_rate": 1.994461336579827e-06, "loss": 0.2872, "step": 625 }, { "epoch": 0.16507000736919675, "grad_norm": 6.370783353900673, "learning_rate": 1.9942312847480032e-06, "loss": 0.3223, "step": 630 }, { "epoch": 0.1663800867927618, "grad_norm": 6.091101766679421, "learning_rate": 1.993996565774384e-06, "loss": 0.3247, "step": 635 }, { "epoch": 0.16769016621632687, "grad_norm": 4.963319059049603, "learning_rate": 1.9937571807607914e-06, "loss": 0.3035, "step": 640 }, { "epoch": 0.1690002456398919, "grad_norm": 5.763955927215438, "learning_rate": 1.993513130830953e-06, "loss": 0.3207, "step": 645 }, { "epoch": 0.17031032506345697, "grad_norm": 5.873434630553368, "learning_rate": 1.9932644171304922e-06, "loss": 0.2886, "step": 650 }, { "epoch": 0.17162040448702204, "grad_norm": 6.2646805543143165, "learning_rate": 1.9930110408269265e-06, "loss": 0.2844, "step": 655 }, { "epoch": 0.17293048391058707, "grad_norm": 6.593113180342127, "learning_rate": 1.992753003109661e-06, "loss": 0.3156, "step": 660 }, { "epoch": 0.17424056333415214, "grad_norm": 7.157645021880429, "learning_rate": 1.9924903051899805e-06, "loss": 0.2825, "step": 665 }, { "epoch": 0.17555064275771717, "grad_norm": 8.060379496602742, "learning_rate": 1.9922229483010486e-06, "loss": 0.2938, "step": 670 }, { "epoch": 0.17686072218128224, "grad_norm": 3.9961852237294413, "learning_rate": 1.9919509336978966e-06, "loss": 0.3503, "step": 675 }, { "epoch": 0.1781708016048473, "grad_norm": 4.475800758356923, "learning_rate": 1.9916742626574224e-06, "loss": 0.3459, "step": 680 }, { "epoch": 0.17948088102841234, "grad_norm": 4.593693404230203, "learning_rate": 1.9913929364783804e-06, "loss": 0.33, "step": 685 }, { "epoch": 0.1807909604519774, "grad_norm": 5.413425332374862, "learning_rate": 1.9911069564813783e-06, "loss": 0.3051, "step": 690 }, { "epoch": 0.18210103987554246, "grad_norm": 8.008275090324018, "learning_rate": 1.9908163240088693e-06, "loss": 0.3699, "step": 695 }, { "epoch": 0.1834111192991075, "grad_norm": 5.528157261942457, "learning_rate": 1.9905210404251465e-06, "loss": 0.2891, "step": 700 }, { "epoch": 0.1834111192991075, "eval_accuracy": 0.7104, "eval_loss": 0.6656551957130432, "eval_runtime": 142.4624, "eval_samples_per_second": 8.774, "eval_steps_per_second": 2.197, "step": 700 }, { "epoch": 0.18472119872267256, "grad_norm": 3.6403780225909403, "learning_rate": 1.9902211071163366e-06, "loss": 0.287, "step": 705 }, { "epoch": 0.18603127814623763, "grad_norm": 6.998780707143166, "learning_rate": 1.989916525490393e-06, "loss": 0.2794, "step": 710 }, { "epoch": 0.18734135756980266, "grad_norm": 4.8809190411029055, "learning_rate": 1.989607296977089e-06, "loss": 0.3102, "step": 715 }, { "epoch": 0.18865143699336773, "grad_norm": 4.433896161785547, "learning_rate": 1.989293423028012e-06, "loss": 0.3142, "step": 720 }, { "epoch": 0.18996151641693276, "grad_norm": 4.968740406829342, "learning_rate": 1.988974905116556e-06, "loss": 0.2907, "step": 725 }, { "epoch": 0.19127159584049783, "grad_norm": 5.0219581203222505, "learning_rate": 1.988651744737914e-06, "loss": 0.3004, "step": 730 }, { "epoch": 0.1925816752640629, "grad_norm": 5.9858192432913215, "learning_rate": 1.9883239434090727e-06, "loss": 0.3099, "step": 735 }, { "epoch": 0.19389175468762793, "grad_norm": 6.069766314309123, "learning_rate": 1.9879915026688042e-06, "loss": 0.3456, "step": 740 }, { "epoch": 0.195201834111193, "grad_norm": 3.8932647987051365, "learning_rate": 1.9876544240776593e-06, "loss": 0.2827, "step": 745 }, { "epoch": 0.19651191353475805, "grad_norm": 4.970870187009217, "learning_rate": 1.987312709217959e-06, "loss": 0.2862, "step": 750 }, { "epoch": 0.1978219929583231, "grad_norm": 7.016260113036865, "learning_rate": 1.9869663596937884e-06, "loss": 0.2776, "step": 755 }, { "epoch": 0.19913207238188815, "grad_norm": 9.66509560172156, "learning_rate": 1.986615377130989e-06, "loss": 0.2772, "step": 760 }, { "epoch": 0.20044215180545322, "grad_norm": 5.77891746002653, "learning_rate": 1.9862597631771508e-06, "loss": 0.353, "step": 765 }, { "epoch": 0.20175223122901825, "grad_norm": 6.520911408063365, "learning_rate": 1.9858995195016044e-06, "loss": 0.3101, "step": 770 }, { "epoch": 0.20306231065258332, "grad_norm": 3.484594315471376, "learning_rate": 1.9855346477954142e-06, "loss": 0.2896, "step": 775 }, { "epoch": 0.20437239007614838, "grad_norm": 4.684870944655888, "learning_rate": 1.9851651497713672e-06, "loss": 0.2596, "step": 780 }, { "epoch": 0.20568246949971342, "grad_norm": 4.808149471025438, "learning_rate": 1.9847910271639697e-06, "loss": 0.3015, "step": 785 }, { "epoch": 0.20699254892327848, "grad_norm": 4.156391998918141, "learning_rate": 1.984412281729436e-06, "loss": 0.2871, "step": 790 }, { "epoch": 0.20830262834684352, "grad_norm": 8.382532262606357, "learning_rate": 1.9840289152456814e-06, "loss": 0.375, "step": 795 }, { "epoch": 0.20961270777040858, "grad_norm": 4.571411739513223, "learning_rate": 1.9836409295123127e-06, "loss": 0.3006, "step": 800 }, { "epoch": 0.20961270777040858, "eval_accuracy": 0.6704, "eval_loss": 0.7644935250282288, "eval_runtime": 138.0522, "eval_samples_per_second": 9.055, "eval_steps_per_second": 2.267, "step": 800 }, { "epoch": 0.21092278719397364, "grad_norm": 2.825880146573424, "learning_rate": 1.983248326350621e-06, "loss": 0.2792, "step": 805 }, { "epoch": 0.21223286661753868, "grad_norm": 4.072431366106783, "learning_rate": 1.982851107603572e-06, "loss": 0.256, "step": 810 }, { "epoch": 0.21354294604110374, "grad_norm": 8.315895632884684, "learning_rate": 1.982449275135799e-06, "loss": 0.288, "step": 815 }, { "epoch": 0.2148530254646688, "grad_norm": 6.617341511261604, "learning_rate": 1.982042830833592e-06, "loss": 0.2574, "step": 820 }, { "epoch": 0.21616310488823384, "grad_norm": 7.4121422793295055, "learning_rate": 1.981631776604892e-06, "loss": 0.3751, "step": 825 }, { "epoch": 0.2174731843117989, "grad_norm": 5.462630509848653, "learning_rate": 1.9812161143792764e-06, "loss": 0.3347, "step": 830 }, { "epoch": 0.21878326373536397, "grad_norm": 4.397947143261118, "learning_rate": 1.9807958461079574e-06, "loss": 0.318, "step": 835 }, { "epoch": 0.220093343158929, "grad_norm": 3.733736827246703, "learning_rate": 1.980370973763767e-06, "loss": 0.2653, "step": 840 }, { "epoch": 0.22140342258249407, "grad_norm": 4.754151304483215, "learning_rate": 1.9799414993411495e-06, "loss": 0.2822, "step": 845 }, { "epoch": 0.2227135020060591, "grad_norm": 5.574460765166766, "learning_rate": 1.979507424856153e-06, "loss": 0.336, "step": 850 }, { "epoch": 0.22402358142962417, "grad_norm": 4.505363713646319, "learning_rate": 1.97906875234642e-06, "loss": 0.2928, "step": 855 }, { "epoch": 0.22533366085318923, "grad_norm": 6.222786448829383, "learning_rate": 1.9786254838711757e-06, "loss": 0.2989, "step": 860 }, { "epoch": 0.22664374027675427, "grad_norm": 6.1340912560899525, "learning_rate": 1.9781776215112204e-06, "loss": 0.2904, "step": 865 }, { "epoch": 0.22795381970031933, "grad_norm": 4.747377998160214, "learning_rate": 1.9777251673689198e-06, "loss": 0.2786, "step": 870 }, { "epoch": 0.2292638991238844, "grad_norm": 7.702780817923823, "learning_rate": 1.9772681235681933e-06, "loss": 0.3207, "step": 875 }, { "epoch": 0.23057397854744943, "grad_norm": 3.7484591701091077, "learning_rate": 1.976806492254506e-06, "loss": 0.272, "step": 880 }, { "epoch": 0.2318840579710145, "grad_norm": 6.027425431373605, "learning_rate": 1.9763402755948574e-06, "loss": 0.2878, "step": 885 }, { "epoch": 0.23319413739457956, "grad_norm": 4.420243905346968, "learning_rate": 1.975869475777772e-06, "loss": 0.3112, "step": 890 }, { "epoch": 0.2345042168181446, "grad_norm": 3.2086505540675474, "learning_rate": 1.9753940950132874e-06, "loss": 0.3328, "step": 895 }, { "epoch": 0.23581429624170966, "grad_norm": 4.92351892053512, "learning_rate": 1.9749141355329473e-06, "loss": 0.3039, "step": 900 }, { "epoch": 0.23581429624170966, "eval_accuracy": 0.7176, "eval_loss": 0.6235886812210083, "eval_runtime": 134.5821, "eval_samples_per_second": 9.288, "eval_steps_per_second": 2.326, "step": 900 }, { "epoch": 0.2371243756652747, "grad_norm": 4.595098138514267, "learning_rate": 1.9744295995897874e-06, "loss": 0.3384, "step": 905 }, { "epoch": 0.23843445508883976, "grad_norm": 5.875594511033142, "learning_rate": 1.9739404894583262e-06, "loss": 0.2493, "step": 910 }, { "epoch": 0.23974453451240482, "grad_norm": 4.943964557160407, "learning_rate": 1.9734468074345555e-06, "loss": 0.3264, "step": 915 }, { "epoch": 0.24105461393596986, "grad_norm": 3.4784751132842167, "learning_rate": 1.9729485558359286e-06, "loss": 0.2736, "step": 920 }, { "epoch": 0.24236469335953492, "grad_norm": 5.689006473597506, "learning_rate": 1.9724457370013474e-06, "loss": 0.2991, "step": 925 }, { "epoch": 0.24367477278309999, "grad_norm": 4.95697152099517, "learning_rate": 1.971938353291156e-06, "loss": 0.3023, "step": 930 }, { "epoch": 0.24498485220666502, "grad_norm": 3.698150841899642, "learning_rate": 1.9714264070871254e-06, "loss": 0.3104, "step": 935 }, { "epoch": 0.24629493163023009, "grad_norm": 3.485873023817148, "learning_rate": 1.970909900792444e-06, "loss": 0.296, "step": 940 }, { "epoch": 0.24760501105379515, "grad_norm": 4.428297690192133, "learning_rate": 1.9703888368317084e-06, "loss": 0.349, "step": 945 }, { "epoch": 0.24891509047736018, "grad_norm": 3.1990684592166447, "learning_rate": 1.969863217650906e-06, "loss": 0.2494, "step": 950 }, { "epoch": 0.25022516990092525, "grad_norm": 5.662300170468204, "learning_rate": 1.9693330457174113e-06, "loss": 0.3193, "step": 955 }, { "epoch": 0.2515352493244903, "grad_norm": 3.389632162628184, "learning_rate": 1.968798323519968e-06, "loss": 0.3378, "step": 960 }, { "epoch": 0.2528453287480554, "grad_norm": 3.776539689852539, "learning_rate": 1.9682590535686804e-06, "loss": 0.2909, "step": 965 }, { "epoch": 0.2541554081716204, "grad_norm": 2.739785839636218, "learning_rate": 1.9677152383950014e-06, "loss": 0.2877, "step": 970 }, { "epoch": 0.25546548759518545, "grad_norm": 4.226352508831814, "learning_rate": 1.9671668805517197e-06, "loss": 0.2917, "step": 975 }, { "epoch": 0.25677556701875054, "grad_norm": 4.092459875987079, "learning_rate": 1.9666139826129482e-06, "loss": 0.3101, "step": 980 }, { "epoch": 0.2580856464423156, "grad_norm": 3.022031693799492, "learning_rate": 1.9660565471741133e-06, "loss": 0.2451, "step": 985 }, { "epoch": 0.2593957258658806, "grad_norm": 5.445606320908549, "learning_rate": 1.965494576851939e-06, "loss": 0.2803, "step": 990 }, { "epoch": 0.26070580528944565, "grad_norm": 6.789604538228405, "learning_rate": 1.9649280742844383e-06, "loss": 0.3155, "step": 995 }, { "epoch": 0.26201588471301074, "grad_norm": 9.717312792399483, "learning_rate": 1.9643570421309013e-06, "loss": 0.354, "step": 1000 }, { "epoch": 0.26201588471301074, "eval_accuracy": 0.7272, "eval_loss": 0.641910970211029, "eval_runtime": 135.3149, "eval_samples_per_second": 9.238, "eval_steps_per_second": 2.313, "step": 1000 }, { "epoch": 0.2633259641365758, "grad_norm": 3.570212334750135, "learning_rate": 1.9637814830718784e-06, "loss": 0.2197, "step": 1005 }, { "epoch": 0.2646360435601408, "grad_norm": 5.497952454503209, "learning_rate": 1.9632013998091708e-06, "loss": 0.2843, "step": 1010 }, { "epoch": 0.2659461229837059, "grad_norm": 4.3351899402575595, "learning_rate": 1.962616795065819e-06, "loss": 0.3116, "step": 1015 }, { "epoch": 0.26725620240727094, "grad_norm": 5.059296767345353, "learning_rate": 1.962027671586086e-06, "loss": 0.2732, "step": 1020 }, { "epoch": 0.268566281830836, "grad_norm": 3.636972491245334, "learning_rate": 1.961434032135448e-06, "loss": 0.3015, "step": 1025 }, { "epoch": 0.26987636125440106, "grad_norm": 2.886855538209043, "learning_rate": 1.9608358795005805e-06, "loss": 0.271, "step": 1030 }, { "epoch": 0.2711864406779661, "grad_norm": 4.518933069509563, "learning_rate": 1.960233216489344e-06, "loss": 0.2875, "step": 1035 }, { "epoch": 0.27249652010153114, "grad_norm": 6.776126720418537, "learning_rate": 1.959626045930773e-06, "loss": 0.3297, "step": 1040 }, { "epoch": 0.27380659952509623, "grad_norm": 3.3329344336767175, "learning_rate": 1.9590143706750595e-06, "loss": 0.3023, "step": 1045 }, { "epoch": 0.27511667894866126, "grad_norm": 8.048171208434075, "learning_rate": 1.958398193593543e-06, "loss": 0.334, "step": 1050 }, { "epoch": 0.2764267583722263, "grad_norm": 3.78319855061, "learning_rate": 1.9577775175786944e-06, "loss": 0.2919, "step": 1055 }, { "epoch": 0.2777368377957914, "grad_norm": 3.3427967602303164, "learning_rate": 1.957152345544106e-06, "loss": 0.3142, "step": 1060 }, { "epoch": 0.2790469172193564, "grad_norm": 5.104886446268955, "learning_rate": 1.9565226804244723e-06, "loss": 0.3025, "step": 1065 }, { "epoch": 0.28035699664292146, "grad_norm": 2.0521842311663114, "learning_rate": 1.9558885251755814e-06, "loss": 0.2591, "step": 1070 }, { "epoch": 0.28166707606648655, "grad_norm": 2.77191138245992, "learning_rate": 1.955249882774298e-06, "loss": 0.3224, "step": 1075 }, { "epoch": 0.2829771554900516, "grad_norm": 5.0369322317805185, "learning_rate": 1.954606756218552e-06, "loss": 0.3104, "step": 1080 }, { "epoch": 0.2842872349136166, "grad_norm": 7.9829960694246624, "learning_rate": 1.9539591485273207e-06, "loss": 0.2774, "step": 1085 }, { "epoch": 0.2855973143371817, "grad_norm": 6.1487449662425835, "learning_rate": 1.953307062740619e-06, "loss": 0.3271, "step": 1090 }, { "epoch": 0.28690739376074675, "grad_norm": 4.902123463001405, "learning_rate": 1.952650501919481e-06, "loss": 0.3087, "step": 1095 }, { "epoch": 0.2882174731843118, "grad_norm": 5.005642536094271, "learning_rate": 1.9519894691459488e-06, "loss": 0.3698, "step": 1100 }, { "epoch": 0.2882174731843118, "eval_accuracy": 0.7104, "eval_loss": 0.7176594734191895, "eval_runtime": 136.7009, "eval_samples_per_second": 9.144, "eval_steps_per_second": 2.29, "step": 1100 }, { "epoch": 0.2895275526078768, "grad_norm": 3.1207339769636837, "learning_rate": 1.951323967523057e-06, "loss": 0.3232, "step": 1105 }, { "epoch": 0.2908376320314419, "grad_norm": 3.1159736941377147, "learning_rate": 1.9506540001748172e-06, "loss": 0.2797, "step": 1110 }, { "epoch": 0.29214771145500695, "grad_norm": 4.293077428446999, "learning_rate": 1.9499795702462047e-06, "loss": 0.3155, "step": 1115 }, { "epoch": 0.293457790878572, "grad_norm": 3.8028097270147794, "learning_rate": 1.949300680903143e-06, "loss": 0.21, "step": 1120 }, { "epoch": 0.2947678703021371, "grad_norm": 4.84671496698013, "learning_rate": 1.948617335332489e-06, "loss": 0.2633, "step": 1125 }, { "epoch": 0.2960779497257021, "grad_norm": 3.960211317187865, "learning_rate": 1.947929536742018e-06, "loss": 0.2954, "step": 1130 }, { "epoch": 0.29738802914926715, "grad_norm": 6.0168036176339985, "learning_rate": 1.947237288360408e-06, "loss": 0.284, "step": 1135 }, { "epoch": 0.29869810857283224, "grad_norm": 3.518422321473526, "learning_rate": 1.946540593437228e-06, "loss": 0.2759, "step": 1140 }, { "epoch": 0.3000081879963973, "grad_norm": 5.514253446140628, "learning_rate": 1.945839455242917e-06, "loss": 0.2304, "step": 1145 }, { "epoch": 0.3013182674199623, "grad_norm": 8.396945191804443, "learning_rate": 1.945133877068773e-06, "loss": 0.3492, "step": 1150 }, { "epoch": 0.3026283468435274, "grad_norm": 2.9344408033676497, "learning_rate": 1.9444238622269366e-06, "loss": 0.2529, "step": 1155 }, { "epoch": 0.30393842626709244, "grad_norm": 4.341884742564996, "learning_rate": 1.9437094140503745e-06, "loss": 0.2763, "step": 1160 }, { "epoch": 0.3052485056906575, "grad_norm": 4.144835095588702, "learning_rate": 1.9429905358928646e-06, "loss": 0.2992, "step": 1165 }, { "epoch": 0.30655858511422257, "grad_norm": 4.571896652829111, "learning_rate": 1.9422672311289797e-06, "loss": 0.2094, "step": 1170 }, { "epoch": 0.3078686645377876, "grad_norm": 6.487182515831676, "learning_rate": 1.9415395031540734e-06, "loss": 0.3184, "step": 1175 }, { "epoch": 0.30917874396135264, "grad_norm": 6.2890432701053784, "learning_rate": 1.9408073553842614e-06, "loss": 0.2885, "step": 1180 }, { "epoch": 0.31048882338491773, "grad_norm": 5.261502639917455, "learning_rate": 1.9400707912564078e-06, "loss": 0.2425, "step": 1185 }, { "epoch": 0.31179890280848277, "grad_norm": 5.4319235826756715, "learning_rate": 1.939329814228107e-06, "loss": 0.3138, "step": 1190 }, { "epoch": 0.3131089822320478, "grad_norm": 4.569463003103509, "learning_rate": 1.93858442777767e-06, "loss": 0.2905, "step": 1195 }, { "epoch": 0.3144190616556129, "grad_norm": 6.51841356160416, "learning_rate": 1.9378346354041057e-06, "loss": 0.2544, "step": 1200 }, { "epoch": 0.3144190616556129, "eval_accuracy": 0.7032, "eval_loss": 0.7195152640342712, "eval_runtime": 139.9321, "eval_samples_per_second": 8.933, "eval_steps_per_second": 2.237, "step": 1200 }, { "epoch": 0.31572914107917793, "grad_norm": 6.07329817723254, "learning_rate": 1.9370804406271053e-06, "loss": 0.3082, "step": 1205 }, { "epoch": 0.31703922050274297, "grad_norm": 5.5739656052051565, "learning_rate": 1.936321846987026e-06, "loss": 0.2982, "step": 1210 }, { "epoch": 0.318349299926308, "grad_norm": 3.19437710777185, "learning_rate": 1.9355588580448743e-06, "loss": 0.2404, "step": 1215 }, { "epoch": 0.3196593793498731, "grad_norm": 4.020281104033066, "learning_rate": 1.9347914773822897e-06, "loss": 0.3113, "step": 1220 }, { "epoch": 0.32096945877343813, "grad_norm": 4.494719920151199, "learning_rate": 1.9340197086015267e-06, "loss": 0.3129, "step": 1225 }, { "epoch": 0.32227953819700317, "grad_norm": 4.154261834382855, "learning_rate": 1.9332435553254386e-06, "loss": 0.3315, "step": 1230 }, { "epoch": 0.32358961762056826, "grad_norm": 4.748202443565547, "learning_rate": 1.932463021197461e-06, "loss": 0.2484, "step": 1235 }, { "epoch": 0.3248996970441333, "grad_norm": 3.4246878831109404, "learning_rate": 1.9316781098815938e-06, "loss": 0.2892, "step": 1240 }, { "epoch": 0.32620977646769833, "grad_norm": 2.680374700010348, "learning_rate": 1.930888825062385e-06, "loss": 0.2731, "step": 1245 }, { "epoch": 0.3275198558912634, "grad_norm": 5.622466682481238, "learning_rate": 1.9300951704449113e-06, "loss": 0.3281, "step": 1250 }, { "epoch": 0.32882993531482846, "grad_norm": 4.573189599074694, "learning_rate": 1.929297149754764e-06, "loss": 0.3044, "step": 1255 }, { "epoch": 0.3301400147383935, "grad_norm": 3.977565315266715, "learning_rate": 1.928494766738029e-06, "loss": 0.3347, "step": 1260 }, { "epoch": 0.3314500941619586, "grad_norm": 3.267696615745348, "learning_rate": 1.927688025161269e-06, "loss": 0.273, "step": 1265 }, { "epoch": 0.3327601735855236, "grad_norm": 3.7685478337164535, "learning_rate": 1.9268769288115083e-06, "loss": 0.308, "step": 1270 }, { "epoch": 0.33407025300908866, "grad_norm": 4.733168908675778, "learning_rate": 1.9260614814962127e-06, "loss": 0.2864, "step": 1275 }, { "epoch": 0.33538033243265375, "grad_norm": 3.7829911630990756, "learning_rate": 1.9252416870432723e-06, "loss": 0.2763, "step": 1280 }, { "epoch": 0.3366904118562188, "grad_norm": 4.817098610717063, "learning_rate": 1.9244175493009836e-06, "loss": 0.2661, "step": 1285 }, { "epoch": 0.3380004912797838, "grad_norm": 5.155494294997122, "learning_rate": 1.9235890721380323e-06, "loss": 0.3272, "step": 1290 }, { "epoch": 0.3393105707033489, "grad_norm": 3.9443526122441295, "learning_rate": 1.9227562594434733e-06, "loss": 0.3294, "step": 1295 }, { "epoch": 0.34062065012691395, "grad_norm": 3.268646227982553, "learning_rate": 1.9219191151267133e-06, "loss": 0.2571, "step": 1300 }, { "epoch": 0.34062065012691395, "eval_accuracy": 0.712, "eval_loss": 0.7290279269218445, "eval_runtime": 139.9623, "eval_samples_per_second": 8.931, "eval_steps_per_second": 2.236, "step": 1300 }, { "epoch": 0.341930729550479, "grad_norm": 4.156705015549692, "learning_rate": 1.9210776431174937e-06, "loss": 0.296, "step": 1305 }, { "epoch": 0.3432408089740441, "grad_norm": 4.1937292159776405, "learning_rate": 1.9202318473658702e-06, "loss": 0.2799, "step": 1310 }, { "epoch": 0.3445508883976091, "grad_norm": 3.5322347963356866, "learning_rate": 1.9193817318421952e-06, "loss": 0.2803, "step": 1315 }, { "epoch": 0.34586096782117415, "grad_norm": 5.317835964163557, "learning_rate": 1.9185273005371e-06, "loss": 0.2849, "step": 1320 }, { "epoch": 0.34717104724473924, "grad_norm": 5.169820633932056, "learning_rate": 1.9176685574614733e-06, "loss": 0.2987, "step": 1325 }, { "epoch": 0.3484811266683043, "grad_norm": 4.982709983606647, "learning_rate": 1.9168055066464457e-06, "loss": 0.2716, "step": 1330 }, { "epoch": 0.3497912060918693, "grad_norm": 4.866013018973415, "learning_rate": 1.9159381521433684e-06, "loss": 0.2766, "step": 1335 }, { "epoch": 0.35110128551543435, "grad_norm": 4.025011433913149, "learning_rate": 1.9150664980237964e-06, "loss": 0.2584, "step": 1340 }, { "epoch": 0.35241136493899944, "grad_norm": 3.8599220124227545, "learning_rate": 1.9141905483794664e-06, "loss": 0.3204, "step": 1345 }, { "epoch": 0.3537214443625645, "grad_norm": 3.79972737879995, "learning_rate": 1.91331030732228e-06, "loss": 0.2836, "step": 1350 }, { "epoch": 0.3550315237861295, "grad_norm": 2.919813006103404, "learning_rate": 1.9124257789842843e-06, "loss": 0.2587, "step": 1355 }, { "epoch": 0.3563416032096946, "grad_norm": 5.170605350757861, "learning_rate": 1.9115369675176504e-06, "loss": 0.3065, "step": 1360 }, { "epoch": 0.35765168263325964, "grad_norm": 6.1588988469192065, "learning_rate": 1.910643877094656e-06, "loss": 0.3447, "step": 1365 }, { "epoch": 0.3589617620568247, "grad_norm": 2.6672804726236303, "learning_rate": 1.9097465119076665e-06, "loss": 0.3036, "step": 1370 }, { "epoch": 0.36027184148038977, "grad_norm": 3.4333435314335983, "learning_rate": 1.908844876169112e-06, "loss": 0.2682, "step": 1375 }, { "epoch": 0.3615819209039548, "grad_norm": 2.6540494040134153, "learning_rate": 1.9079389741114696e-06, "loss": 0.2592, "step": 1380 }, { "epoch": 0.36289200032751984, "grad_norm": 4.249055358619522, "learning_rate": 1.9070288099872452e-06, "loss": 0.2605, "step": 1385 }, { "epoch": 0.36420207975108493, "grad_norm": 4.836139691290449, "learning_rate": 1.9061143880689503e-06, "loss": 0.2977, "step": 1390 }, { "epoch": 0.36551215917464996, "grad_norm": 4.705615420915993, "learning_rate": 1.905195712649084e-06, "loss": 0.3444, "step": 1395 }, { "epoch": 0.366822238598215, "grad_norm": 3.2537510054626515, "learning_rate": 1.9042727880401122e-06, "loss": 0.3558, "step": 1400 }, { "epoch": 0.366822238598215, "eval_accuracy": 0.676, "eval_loss": 0.7812010049819946, "eval_runtime": 137.3509, "eval_samples_per_second": 9.101, "eval_steps_per_second": 2.279, "step": 1400 }, { "epoch": 0.3681323180217801, "grad_norm": 2.7663311029982856, "learning_rate": 1.9033456185744469e-06, "loss": 0.2985, "step": 1405 }, { "epoch": 0.3694423974453451, "grad_norm": 2.6580405529803808, "learning_rate": 1.9024142086044277e-06, "loss": 0.2834, "step": 1410 }, { "epoch": 0.37075247686891016, "grad_norm": 3.6849674325031407, "learning_rate": 1.9014785625022985e-06, "loss": 0.2779, "step": 1415 }, { "epoch": 0.37206255629247525, "grad_norm": 4.020792652442759, "learning_rate": 1.9005386846601893e-06, "loss": 0.2472, "step": 1420 }, { "epoch": 0.3733726357160403, "grad_norm": 4.526317952946907, "learning_rate": 1.8995945794900953e-06, "loss": 0.2786, "step": 1425 }, { "epoch": 0.3746827151396053, "grad_norm": 4.123722032882601, "learning_rate": 1.8986462514238547e-06, "loss": 0.2833, "step": 1430 }, { "epoch": 0.3759927945631704, "grad_norm": 6.744679094753499, "learning_rate": 1.8976937049131298e-06, "loss": 0.3072, "step": 1435 }, { "epoch": 0.37730287398673545, "grad_norm": 4.370200879451724, "learning_rate": 1.8967369444293847e-06, "loss": 0.25, "step": 1440 }, { "epoch": 0.3786129534103005, "grad_norm": 2.6567231964667424, "learning_rate": 1.8957759744638651e-06, "loss": 0.2461, "step": 1445 }, { "epoch": 0.3799230328338655, "grad_norm": 4.993924185098615, "learning_rate": 1.8948107995275761e-06, "loss": 0.2457, "step": 1450 }, { "epoch": 0.3812331122574306, "grad_norm": 5.890536303510147, "learning_rate": 1.8938414241512637e-06, "loss": 0.3337, "step": 1455 }, { "epoch": 0.38254319168099565, "grad_norm": 6.086658941241191, "learning_rate": 1.8928678528853895e-06, "loss": 0.261, "step": 1460 }, { "epoch": 0.3838532711045607, "grad_norm": 2.5029858692820444, "learning_rate": 1.8918900903001136e-06, "loss": 0.2623, "step": 1465 }, { "epoch": 0.3851633505281258, "grad_norm": 3.1670655034154347, "learning_rate": 1.8909081409852692e-06, "loss": 0.3239, "step": 1470 }, { "epoch": 0.3864734299516908, "grad_norm": 6.92258040722732, "learning_rate": 1.8899220095503442e-06, "loss": 0.3251, "step": 1475 }, { "epoch": 0.38778350937525585, "grad_norm": 2.459950138600639, "learning_rate": 1.888931700624458e-06, "loss": 0.2865, "step": 1480 }, { "epoch": 0.38909358879882094, "grad_norm": 4.566006943218545, "learning_rate": 1.8879372188563396e-06, "loss": 0.2919, "step": 1485 }, { "epoch": 0.390403668222386, "grad_norm": 6.1169971997341515, "learning_rate": 1.8869385689143069e-06, "loss": 0.3248, "step": 1490 }, { "epoch": 0.391713747645951, "grad_norm": 4.321962895847691, "learning_rate": 1.885935755486244e-06, "loss": 0.2497, "step": 1495 }, { "epoch": 0.3930238270695161, "grad_norm": 3.786195016300289, "learning_rate": 1.8849287832795785e-06, "loss": 0.2842, "step": 1500 }, { "epoch": 0.3930238270695161, "eval_accuracy": 0.6952, "eval_loss": 0.7388889789581299, "eval_runtime": 140.4298, "eval_samples_per_second": 8.901, "eval_steps_per_second": 2.229, "step": 1500 }, { "epoch": 0.39433390649308114, "grad_norm": 4.144022239569388, "learning_rate": 1.8839176570212619e-06, "loss": 0.2776, "step": 1505 }, { "epoch": 0.3956439859166462, "grad_norm": 3.254154282020282, "learning_rate": 1.882902381457744e-06, "loss": 0.3046, "step": 1510 }, { "epoch": 0.39695406534021127, "grad_norm": 3.7972516111759465, "learning_rate": 1.8818829613549532e-06, "loss": 0.3571, "step": 1515 }, { "epoch": 0.3982641447637763, "grad_norm": 2.7098946167159217, "learning_rate": 1.8808594014982736e-06, "loss": 0.3086, "step": 1520 }, { "epoch": 0.39957422418734134, "grad_norm": 2.395987116328653, "learning_rate": 1.879831706692521e-06, "loss": 0.2955, "step": 1525 }, { "epoch": 0.40088430361090643, "grad_norm": 5.567909535649407, "learning_rate": 1.8787998817619233e-06, "loss": 0.3045, "step": 1530 }, { "epoch": 0.40219438303447147, "grad_norm": 4.680059669737792, "learning_rate": 1.8777639315500945e-06, "loss": 0.2648, "step": 1535 }, { "epoch": 0.4035044624580365, "grad_norm": 8.387574343664538, "learning_rate": 1.876723860920015e-06, "loss": 0.3123, "step": 1540 }, { "epoch": 0.4048145418816016, "grad_norm": 2.023022886159982, "learning_rate": 1.8756796747540057e-06, "loss": 0.2561, "step": 1545 }, { "epoch": 0.40612462130516663, "grad_norm": 4.634489921732028, "learning_rate": 1.8746313779537087e-06, "loss": 0.3115, "step": 1550 }, { "epoch": 0.40743470072873167, "grad_norm": 3.401314650673904, "learning_rate": 1.8735789754400603e-06, "loss": 0.2493, "step": 1555 }, { "epoch": 0.40874478015229676, "grad_norm": 2.586199787021852, "learning_rate": 1.8725224721532715e-06, "loss": 0.2521, "step": 1560 }, { "epoch": 0.4100548595758618, "grad_norm": 4.393631624962878, "learning_rate": 1.8714618730528024e-06, "loss": 0.2817, "step": 1565 }, { "epoch": 0.41136493899942683, "grad_norm": 5.57797489979425, "learning_rate": 1.8703971831173405e-06, "loss": 0.2937, "step": 1570 }, { "epoch": 0.41267501842299187, "grad_norm": 3.4687053444228235, "learning_rate": 1.8693284073447755e-06, "loss": 0.3344, "step": 1575 }, { "epoch": 0.41398509784655696, "grad_norm": 4.141659335286871, "learning_rate": 1.868255550752178e-06, "loss": 0.2546, "step": 1580 }, { "epoch": 0.415295177270122, "grad_norm": 7.531913864092293, "learning_rate": 1.8671786183757741e-06, "loss": 0.2992, "step": 1585 }, { "epoch": 0.41660525669368703, "grad_norm": 5.209397304260616, "learning_rate": 1.866097615270923e-06, "loss": 0.2978, "step": 1590 }, { "epoch": 0.4179153361172521, "grad_norm": 4.9952265985686966, "learning_rate": 1.865012546512092e-06, "loss": 0.2386, "step": 1595 }, { "epoch": 0.41922541554081716, "grad_norm": 4.620451674592193, "learning_rate": 1.863923417192835e-06, "loss": 0.3012, "step": 1600 }, { "epoch": 0.41922541554081716, "eval_accuracy": 0.7088, "eval_loss": 0.7305626273155212, "eval_runtime": 137.487, "eval_samples_per_second": 9.092, "eval_steps_per_second": 2.277, "step": 1600 }, { "epoch": 0.4205354949643822, "grad_norm": 3.8216787722513335, "learning_rate": 1.8628302324257664e-06, "loss": 0.2886, "step": 1605 }, { "epoch": 0.4218455743879473, "grad_norm": 4.302516444293675, "learning_rate": 1.8617329973425364e-06, "loss": 0.2986, "step": 1610 }, { "epoch": 0.4231556538115123, "grad_norm": 2.5052399973675823, "learning_rate": 1.86063171709381e-06, "loss": 0.2977, "step": 1615 }, { "epoch": 0.42446573323507736, "grad_norm": 4.561793077013278, "learning_rate": 1.8595263968492407e-06, "loss": 0.3231, "step": 1620 }, { "epoch": 0.42577581265864245, "grad_norm": 7.8865966163916426, "learning_rate": 1.8584170417974465e-06, "loss": 0.3202, "step": 1625 }, { "epoch": 0.4270858920822075, "grad_norm": 3.674689043482507, "learning_rate": 1.857303657145985e-06, "loss": 0.2683, "step": 1630 }, { "epoch": 0.4283959715057725, "grad_norm": 2.8123441864934, "learning_rate": 1.8561862481213313e-06, "loss": 0.2893, "step": 1635 }, { "epoch": 0.4297060509293376, "grad_norm": 2.5957429423912854, "learning_rate": 1.85506481996885e-06, "loss": 0.3001, "step": 1640 }, { "epoch": 0.43101613035290265, "grad_norm": 6.272609371298048, "learning_rate": 1.8539393779527735e-06, "loss": 0.2944, "step": 1645 }, { "epoch": 0.4323262097764677, "grad_norm": 5.300171596731147, "learning_rate": 1.8528099273561754e-06, "loss": 0.2443, "step": 1650 }, { "epoch": 0.4336362892000328, "grad_norm": 4.309301759126504, "learning_rate": 1.8516764734809475e-06, "loss": 0.2504, "step": 1655 }, { "epoch": 0.4349463686235978, "grad_norm": 2.716951228905198, "learning_rate": 1.8505390216477732e-06, "loss": 0.2625, "step": 1660 }, { "epoch": 0.43625644804716285, "grad_norm": 4.606417589142611, "learning_rate": 1.8493975771961026e-06, "loss": 0.2715, "step": 1665 }, { "epoch": 0.43756652747072794, "grad_norm": 3.7628490993032444, "learning_rate": 1.8482521454841296e-06, "loss": 0.3187, "step": 1670 }, { "epoch": 0.438876606894293, "grad_norm": 3.6806362340314878, "learning_rate": 1.8471027318887632e-06, "loss": 0.2446, "step": 1675 }, { "epoch": 0.440186686317858, "grad_norm": 2.8358474618960554, "learning_rate": 1.8459493418056064e-06, "loss": 0.2803, "step": 1680 }, { "epoch": 0.44149676574142305, "grad_norm": 3.559305323246588, "learning_rate": 1.8447919806489272e-06, "loss": 0.3376, "step": 1685 }, { "epoch": 0.44280684516498814, "grad_norm": 3.18546710084024, "learning_rate": 1.8436306538516348e-06, "loss": 0.2526, "step": 1690 }, { "epoch": 0.4441169245885532, "grad_norm": 2.2770988367831317, "learning_rate": 1.8424653668652548e-06, "loss": 0.2878, "step": 1695 }, { "epoch": 0.4454270040121182, "grad_norm": 2.6311103553113186, "learning_rate": 1.8412961251599021e-06, "loss": 0.323, "step": 1700 }, { "epoch": 0.4454270040121182, "eval_accuracy": 0.7104, "eval_loss": 0.7182445526123047, "eval_runtime": 139.4262, "eval_samples_per_second": 8.965, "eval_steps_per_second": 2.245, "step": 1700 }, { "epoch": 0.4467370834356833, "grad_norm": 2.7960747447704275, "learning_rate": 1.8401229342242564e-06, "loss": 0.3345, "step": 1705 }, { "epoch": 0.44804716285924834, "grad_norm": 2.303028253758041, "learning_rate": 1.8389457995655354e-06, "loss": 0.2837, "step": 1710 }, { "epoch": 0.4493572422828134, "grad_norm": 3.6261923466611763, "learning_rate": 1.8377647267094699e-06, "loss": 0.2656, "step": 1715 }, { "epoch": 0.45066732170637847, "grad_norm": 4.89861955448886, "learning_rate": 1.8365797212002777e-06, "loss": 0.276, "step": 1720 }, { "epoch": 0.4519774011299435, "grad_norm": 6.264789410236653, "learning_rate": 1.8353907886006369e-06, "loss": 0.3056, "step": 1725 }, { "epoch": 0.45328748055350854, "grad_norm": 2.51930771111435, "learning_rate": 1.8341979344916601e-06, "loss": 0.2885, "step": 1730 }, { "epoch": 0.45459755997707363, "grad_norm": 6.288406487126684, "learning_rate": 1.833001164472869e-06, "loss": 0.3229, "step": 1735 }, { "epoch": 0.45590763940063866, "grad_norm": 5.444204531738827, "learning_rate": 1.8318004841621666e-06, "loss": 0.2589, "step": 1740 }, { "epoch": 0.4572177188242037, "grad_norm": 6.474274016450882, "learning_rate": 1.8305958991958126e-06, "loss": 0.2912, "step": 1745 }, { "epoch": 0.4585277982477688, "grad_norm": 6.449122812307309, "learning_rate": 1.8293874152283952e-06, "loss": 0.2992, "step": 1750 }, { "epoch": 0.45983787767133383, "grad_norm": 4.641695980313187, "learning_rate": 1.8281750379328061e-06, "loss": 0.3278, "step": 1755 }, { "epoch": 0.46114795709489886, "grad_norm": 2.681087051955473, "learning_rate": 1.8269587730002125e-06, "loss": 0.255, "step": 1760 }, { "epoch": 0.46245803651846396, "grad_norm": 6.151626591739763, "learning_rate": 1.8257386261400316e-06, "loss": 0.2494, "step": 1765 }, { "epoch": 0.463768115942029, "grad_norm": 4.505233549633642, "learning_rate": 1.8245146030799025e-06, "loss": 0.3442, "step": 1770 }, { "epoch": 0.465078195365594, "grad_norm": 4.205788305228986, "learning_rate": 1.8232867095656608e-06, "loss": 0.3093, "step": 1775 }, { "epoch": 0.4663882747891591, "grad_norm": 3.5089694683317174, "learning_rate": 1.8220549513613104e-06, "loss": 0.2846, "step": 1780 }, { "epoch": 0.46769835421272415, "grad_norm": 4.514848486626711, "learning_rate": 1.820819334248997e-06, "loss": 0.3689, "step": 1785 }, { "epoch": 0.4690084336362892, "grad_norm": 2.320338638195368, "learning_rate": 1.8195798640289807e-06, "loss": 0.2559, "step": 1790 }, { "epoch": 0.4703185130598542, "grad_norm": 4.819290066996436, "learning_rate": 1.8183365465196099e-06, "loss": 0.2729, "step": 1795 }, { "epoch": 0.4716285924834193, "grad_norm": 2.8951610503894734, "learning_rate": 1.8170893875572916e-06, "loss": 0.2502, "step": 1800 }, { "epoch": 0.4716285924834193, "eval_accuracy": 0.7248, "eval_loss": 0.6544848680496216, "eval_runtime": 140.79, "eval_samples_per_second": 8.878, "eval_steps_per_second": 2.223, "step": 1800 }, { "epoch": 0.47293867190698435, "grad_norm": 6.1281331169132045, "learning_rate": 1.8158383929964665e-06, "loss": 0.2792, "step": 1805 }, { "epoch": 0.4742487513305494, "grad_norm": 4.051837237000961, "learning_rate": 1.8145835687095797e-06, "loss": 0.3106, "step": 1810 }, { "epoch": 0.4755588307541145, "grad_norm": 5.166640184431374, "learning_rate": 1.8133249205870547e-06, "loss": 0.3153, "step": 1815 }, { "epoch": 0.4768689101776795, "grad_norm": 4.026299173963558, "learning_rate": 1.8120624545372643e-06, "loss": 0.2343, "step": 1820 }, { "epoch": 0.47817898960124455, "grad_norm": 4.295951177165347, "learning_rate": 1.8107961764865033e-06, "loss": 0.2883, "step": 1825 }, { "epoch": 0.47948906902480964, "grad_norm": 3.4331157371118945, "learning_rate": 1.8095260923789617e-06, "loss": 0.2696, "step": 1830 }, { "epoch": 0.4807991484483747, "grad_norm": 4.813504074638746, "learning_rate": 1.8082522081766953e-06, "loss": 0.3209, "step": 1835 }, { "epoch": 0.4821092278719397, "grad_norm": 3.589632561094004, "learning_rate": 1.8069745298595992e-06, "loss": 0.2516, "step": 1840 }, { "epoch": 0.4834193072955048, "grad_norm": 3.7263852570627143, "learning_rate": 1.805693063425377e-06, "loss": 0.3106, "step": 1845 }, { "epoch": 0.48472938671906984, "grad_norm": 4.05564679184159, "learning_rate": 1.8044078148895174e-06, "loss": 0.2901, "step": 1850 }, { "epoch": 0.4860394661426349, "grad_norm": 3.53180167912472, "learning_rate": 1.8031187902852607e-06, "loss": 0.2981, "step": 1855 }, { "epoch": 0.48734954556619997, "grad_norm": 3.70131644140251, "learning_rate": 1.801825995663574e-06, "loss": 0.266, "step": 1860 }, { "epoch": 0.488659624989765, "grad_norm": 3.9187161597018214, "learning_rate": 1.8005294370931217e-06, "loss": 0.2921, "step": 1865 }, { "epoch": 0.48996970441333004, "grad_norm": 2.365181839837511, "learning_rate": 1.7992291206602366e-06, "loss": 0.292, "step": 1870 }, { "epoch": 0.49127978383689513, "grad_norm": 3.6772154023031014, "learning_rate": 1.797925052468892e-06, "loss": 0.2926, "step": 1875 }, { "epoch": 0.49258986326046017, "grad_norm": 3.169897885563791, "learning_rate": 1.7966172386406728e-06, "loss": 0.3069, "step": 1880 }, { "epoch": 0.4938999426840252, "grad_norm": 3.623768040249494, "learning_rate": 1.7953056853147466e-06, "loss": 0.2728, "step": 1885 }, { "epoch": 0.4952100221075903, "grad_norm": 3.8930495139893884, "learning_rate": 1.7939903986478354e-06, "loss": 0.2497, "step": 1890 }, { "epoch": 0.49652010153115533, "grad_norm": 3.054365703792985, "learning_rate": 1.7926713848141856e-06, "loss": 0.2798, "step": 1895 }, { "epoch": 0.49783018095472037, "grad_norm": 5.479009666538431, "learning_rate": 1.7913486500055402e-06, "loss": 0.3357, "step": 1900 }, { "epoch": 0.49783018095472037, "eval_accuracy": 0.7184, "eval_loss": 0.6975212097167969, "eval_runtime": 134.9313, "eval_samples_per_second": 9.264, "eval_steps_per_second": 2.32, "step": 1900 }, { "epoch": 0.49914026037828546, "grad_norm": 2.433308128925003, "learning_rate": 1.7900222004311098e-06, "loss": 0.28, "step": 1905 }, { "epoch": 0.5004503398018505, "grad_norm": 3.0724802951664896, "learning_rate": 1.788692042317542e-06, "loss": 0.2741, "step": 1910 }, { "epoch": 0.5017604192254156, "grad_norm": 2.9281942193789563, "learning_rate": 1.7873581819088937e-06, "loss": 0.2622, "step": 1915 }, { "epoch": 0.5030704986489806, "grad_norm": 3.2439911646866415, "learning_rate": 1.786020625466601e-06, "loss": 0.2706, "step": 1920 }, { "epoch": 0.5043805780725457, "grad_norm": 3.4446296154345175, "learning_rate": 1.7846793792694497e-06, "loss": 0.2596, "step": 1925 }, { "epoch": 0.5056906574961108, "grad_norm": 3.7556256418902905, "learning_rate": 1.7833344496135467e-06, "loss": 0.3073, "step": 1930 }, { "epoch": 0.5070007369196757, "grad_norm": 4.400597629681425, "learning_rate": 1.7819858428122893e-06, "loss": 0.2764, "step": 1935 }, { "epoch": 0.5083108163432408, "grad_norm": 3.7380324234060143, "learning_rate": 1.7806335651963372e-06, "loss": 0.2906, "step": 1940 }, { "epoch": 0.5096208957668059, "grad_norm": 4.614102737000705, "learning_rate": 1.7792776231135802e-06, "loss": 0.2898, "step": 1945 }, { "epoch": 0.5109309751903709, "grad_norm": 2.7297343017049287, "learning_rate": 1.7779180229291105e-06, "loss": 0.23, "step": 1950 }, { "epoch": 0.512241054613936, "grad_norm": 5.463732251912146, "learning_rate": 1.7765547710251935e-06, "loss": 0.2813, "step": 1955 }, { "epoch": 0.5135511340375011, "grad_norm": 2.478093229460874, "learning_rate": 1.7751878738012346e-06, "loss": 0.2119, "step": 1960 }, { "epoch": 0.5148612134610661, "grad_norm": 5.4823008219173595, "learning_rate": 1.7738173376737522e-06, "loss": 0.2642, "step": 1965 }, { "epoch": 0.5161712928846311, "grad_norm": 3.9944643614596638, "learning_rate": 1.7724431690763462e-06, "loss": 0.2575, "step": 1970 }, { "epoch": 0.5174813723081961, "grad_norm": 5.526636384041946, "learning_rate": 1.7710653744596687e-06, "loss": 0.3462, "step": 1975 }, { "epoch": 0.5187914517317612, "grad_norm": 5.965066611467832, "learning_rate": 1.7696839602913925e-06, "loss": 0.3024, "step": 1980 }, { "epoch": 0.5201015311553263, "grad_norm": 2.9808391808623247, "learning_rate": 1.7682989330561813e-06, "loss": 0.2729, "step": 1985 }, { "epoch": 0.5214116105788913, "grad_norm": 3.6106553367793746, "learning_rate": 1.7669102992556601e-06, "loss": 0.2461, "step": 1990 }, { "epoch": 0.5227216900024564, "grad_norm": 3.2772498647327732, "learning_rate": 1.7655180654083832e-06, "loss": 0.2842, "step": 1995 }, { "epoch": 0.5240317694260215, "grad_norm": 6.885168575642456, "learning_rate": 1.7641222380498044e-06, "loss": 0.3379, "step": 2000 }, { "epoch": 0.5240317694260215, "eval_accuracy": 0.7288, "eval_loss": 0.673081636428833, "eval_runtime": 136.835, "eval_samples_per_second": 9.135, "eval_steps_per_second": 2.287, "step": 2000 }, { "epoch": 0.5253418488495865, "grad_norm": 3.6157723487154128, "learning_rate": 1.7627228237322466e-06, "loss": 0.2985, "step": 2005 }, { "epoch": 0.5266519282731515, "grad_norm": 2.6275278120467105, "learning_rate": 1.7613198290248706e-06, "loss": 0.2281, "step": 2010 }, { "epoch": 0.5279620076967166, "grad_norm": 3.3363602312332175, "learning_rate": 1.7599132605136436e-06, "loss": 0.3043, "step": 2015 }, { "epoch": 0.5292720871202816, "grad_norm": 2.4277559410443175, "learning_rate": 1.7585031248013106e-06, "loss": 0.202, "step": 2020 }, { "epoch": 0.5305821665438467, "grad_norm": 5.072153681054817, "learning_rate": 1.7570894285073599e-06, "loss": 0.2483, "step": 2025 }, { "epoch": 0.5318922459674118, "grad_norm": 5.204973135313382, "learning_rate": 1.7556721782679956e-06, "loss": 0.3329, "step": 2030 }, { "epoch": 0.5332023253909768, "grad_norm": 2.806879239003485, "learning_rate": 1.7542513807361037e-06, "loss": 0.2548, "step": 2035 }, { "epoch": 0.5345124048145419, "grad_norm": 3.0228433116007327, "learning_rate": 1.7528270425812228e-06, "loss": 0.2651, "step": 2040 }, { "epoch": 0.535822484238107, "grad_norm": 2.6869261489897736, "learning_rate": 1.7513991704895112e-06, "loss": 0.2844, "step": 2045 }, { "epoch": 0.537132563661672, "grad_norm": 4.5723366454832215, "learning_rate": 1.7499677711637171e-06, "loss": 0.3071, "step": 2050 }, { "epoch": 0.538442643085237, "grad_norm": 4.179299164940856, "learning_rate": 1.7485328513231453e-06, "loss": 0.2774, "step": 2055 }, { "epoch": 0.5397527225088021, "grad_norm": 3.356708836608762, "learning_rate": 1.7470944177036277e-06, "loss": 0.2927, "step": 2060 }, { "epoch": 0.5410628019323671, "grad_norm": 4.017539637774881, "learning_rate": 1.74565247705749e-06, "loss": 0.3008, "step": 2065 }, { "epoch": 0.5423728813559322, "grad_norm": 3.974226727917273, "learning_rate": 1.744207036153521e-06, "loss": 0.2742, "step": 2070 }, { "epoch": 0.5436829607794973, "grad_norm": 3.160288278087426, "learning_rate": 1.7427581017769404e-06, "loss": 0.3134, "step": 2075 }, { "epoch": 0.5449930402030623, "grad_norm": 4.995568511985773, "learning_rate": 1.741305680729367e-06, "loss": 0.2927, "step": 2080 }, { "epoch": 0.5463031196266274, "grad_norm": 2.5038111120195192, "learning_rate": 1.7398497798287863e-06, "loss": 0.2442, "step": 2085 }, { "epoch": 0.5476131990501925, "grad_norm": 3.4536557863053035, "learning_rate": 1.7383904059095202e-06, "loss": 0.2592, "step": 2090 }, { "epoch": 0.5489232784737574, "grad_norm": 4.410374261613443, "learning_rate": 1.7369275658221926e-06, "loss": 0.3117, "step": 2095 }, { "epoch": 0.5502333578973225, "grad_norm": 2.538812009761459, "learning_rate": 1.735461266433699e-06, "loss": 0.2717, "step": 2100 }, { "epoch": 0.5502333578973225, "eval_accuracy": 0.7424, "eval_loss": 0.671869158744812, "eval_runtime": 135.9633, "eval_samples_per_second": 9.194, "eval_steps_per_second": 2.302, "step": 2100 }, { "epoch": 0.5515434373208876, "grad_norm": 3.627287673105934, "learning_rate": 1.7339915146271732e-06, "loss": 0.269, "step": 2105 }, { "epoch": 0.5528535167444526, "grad_norm": 2.889042058590513, "learning_rate": 1.7325183173019556e-06, "loss": 0.2357, "step": 2110 }, { "epoch": 0.5541635961680177, "grad_norm": 3.5905021176878775, "learning_rate": 1.731041681373561e-06, "loss": 0.2349, "step": 2115 }, { "epoch": 0.5554736755915828, "grad_norm": 4.874519688988458, "learning_rate": 1.729561613773645e-06, "loss": 0.2886, "step": 2120 }, { "epoch": 0.5567837550151478, "grad_norm": 3.281500493007926, "learning_rate": 1.7280781214499727e-06, "loss": 0.282, "step": 2125 }, { "epoch": 0.5580938344387129, "grad_norm": 1.90376998294021, "learning_rate": 1.7265912113663857e-06, "loss": 0.2952, "step": 2130 }, { "epoch": 0.559403913862278, "grad_norm": 4.157189646380671, "learning_rate": 1.7251008905027692e-06, "loss": 0.2913, "step": 2135 }, { "epoch": 0.5607139932858429, "grad_norm": 7.357071844148765, "learning_rate": 1.7236071658550191e-06, "loss": 0.3016, "step": 2140 }, { "epoch": 0.562024072709408, "grad_norm": 3.3548044431409214, "learning_rate": 1.7221100444350099e-06, "loss": 0.2526, "step": 2145 }, { "epoch": 0.5633341521329731, "grad_norm": 2.070755143349665, "learning_rate": 1.7206095332705608e-06, "loss": 0.2859, "step": 2150 }, { "epoch": 0.5646442315565381, "grad_norm": 2.9975997234107123, "learning_rate": 1.7191056394054035e-06, "loss": 0.2739, "step": 2155 }, { "epoch": 0.5659543109801032, "grad_norm": 4.385338795716017, "learning_rate": 1.7175983698991488e-06, "loss": 0.312, "step": 2160 }, { "epoch": 0.5672643904036683, "grad_norm": 3.112159603233555, "learning_rate": 1.7160877318272537e-06, "loss": 0.272, "step": 2165 }, { "epoch": 0.5685744698272333, "grad_norm": 2.353537092467099, "learning_rate": 1.7145737322809876e-06, "loss": 0.2534, "step": 2170 }, { "epoch": 0.5698845492507983, "grad_norm": 2.6947118285924385, "learning_rate": 1.7130563783674e-06, "loss": 0.2702, "step": 2175 }, { "epoch": 0.5711946286743634, "grad_norm": 4.573113056178734, "learning_rate": 1.7115356772092855e-06, "loss": 0.3059, "step": 2180 }, { "epoch": 0.5725047080979284, "grad_norm": 2.465542000370307, "learning_rate": 1.7100116359451523e-06, "loss": 0.2602, "step": 2185 }, { "epoch": 0.5738147875214935, "grad_norm": 3.340565744796501, "learning_rate": 1.7084842617291874e-06, "loss": 0.2824, "step": 2190 }, { "epoch": 0.5751248669450586, "grad_norm": 2.292406930546328, "learning_rate": 1.706953561731224e-06, "loss": 0.2696, "step": 2195 }, { "epoch": 0.5764349463686236, "grad_norm": 3.3346832292412967, "learning_rate": 1.705419543136707e-06, "loss": 0.317, "step": 2200 }, { "epoch": 0.5764349463686236, "eval_accuracy": 0.724, "eval_loss": 0.6991069912910461, "eval_runtime": 140.893, "eval_samples_per_second": 8.872, "eval_steps_per_second": 2.222, "step": 2200 }, { "epoch": 0.5777450257921887, "grad_norm": 2.3310945353240435, "learning_rate": 1.7038822131466583e-06, "loss": 0.2504, "step": 2205 }, { "epoch": 0.5790551052157537, "grad_norm": 2.3747884427134713, "learning_rate": 1.7023415789776463e-06, "loss": 0.298, "step": 2210 }, { "epoch": 0.5803651846393187, "grad_norm": 3.133493758446318, "learning_rate": 1.7007976478617484e-06, "loss": 0.2376, "step": 2215 }, { "epoch": 0.5816752640628838, "grad_norm": 4.12633676209407, "learning_rate": 1.6992504270465193e-06, "loss": 0.2944, "step": 2220 }, { "epoch": 0.5829853434864488, "grad_norm": 3.9130822922585144, "learning_rate": 1.697699923794956e-06, "loss": 0.2993, "step": 2225 }, { "epoch": 0.5842954229100139, "grad_norm": 2.7907750454465203, "learning_rate": 1.696146145385464e-06, "loss": 0.2868, "step": 2230 }, { "epoch": 0.585605502333579, "grad_norm": 3.8858008383873397, "learning_rate": 1.6945890991118236e-06, "loss": 0.3234, "step": 2235 }, { "epoch": 0.586915581757144, "grad_norm": 2.4869248403820308, "learning_rate": 1.6930287922831546e-06, "loss": 0.2584, "step": 2240 }, { "epoch": 0.5882256611807091, "grad_norm": 2.9309269678483525, "learning_rate": 1.6914652322238824e-06, "loss": 0.2303, "step": 2245 }, { "epoch": 0.5895357406042742, "grad_norm": 4.8721568277124625, "learning_rate": 1.6898984262737046e-06, "loss": 0.2216, "step": 2250 }, { "epoch": 0.5908458200278391, "grad_norm": 4.919818554906796, "learning_rate": 1.6883283817875546e-06, "loss": 0.2742, "step": 2255 }, { "epoch": 0.5921558994514042, "grad_norm": 7.35406987379939, "learning_rate": 1.6867551061355696e-06, "loss": 0.2984, "step": 2260 }, { "epoch": 0.5934659788749693, "grad_norm": 2.5259386632956864, "learning_rate": 1.6851786067030535e-06, "loss": 0.2001, "step": 2265 }, { "epoch": 0.5947760582985343, "grad_norm": 4.776608491229436, "learning_rate": 1.6835988908904437e-06, "loss": 0.3169, "step": 2270 }, { "epoch": 0.5960861377220994, "grad_norm": 2.473422082161941, "learning_rate": 1.6820159661132763e-06, "loss": 0.2355, "step": 2275 }, { "epoch": 0.5973962171456645, "grad_norm": 3.0778943971784165, "learning_rate": 1.6804298398021501e-06, "loss": 0.2308, "step": 2280 }, { "epoch": 0.5987062965692295, "grad_norm": 3.5878133464375646, "learning_rate": 1.6788405194026937e-06, "loss": 0.2586, "step": 2285 }, { "epoch": 0.6000163759927946, "grad_norm": 3.9888619741299194, "learning_rate": 1.6772480123755288e-06, "loss": 0.3039, "step": 2290 }, { "epoch": 0.6013264554163597, "grad_norm": 4.413706147317179, "learning_rate": 1.6756523261962361e-06, "loss": 0.3061, "step": 2295 }, { "epoch": 0.6026365348399246, "grad_norm": 6.997366587082385, "learning_rate": 1.6740534683553197e-06, "loss": 0.2696, "step": 2300 }, { "epoch": 0.6026365348399246, "eval_accuracy": 0.7272, "eval_loss": 0.7597007751464844, "eval_runtime": 141.0016, "eval_samples_per_second": 8.865, "eval_steps_per_second": 2.22, "step": 2300 }, { "epoch": 0.6039466142634897, "grad_norm": 5.753280459621215, "learning_rate": 1.6724514463581727e-06, "loss": 0.2935, "step": 2305 }, { "epoch": 0.6052566936870548, "grad_norm": 3.6663692285540996, "learning_rate": 1.6708462677250405e-06, "loss": 0.2493, "step": 2310 }, { "epoch": 0.6065667731106198, "grad_norm": 4.557277515775418, "learning_rate": 1.6692379399909876e-06, "loss": 0.3299, "step": 2315 }, { "epoch": 0.6078768525341849, "grad_norm": 2.960623480418135, "learning_rate": 1.6676264707058599e-06, "loss": 0.3056, "step": 2320 }, { "epoch": 0.60918693195775, "grad_norm": 5.423095170753771, "learning_rate": 1.6660118674342515e-06, "loss": 0.341, "step": 2325 }, { "epoch": 0.610497011381315, "grad_norm": 2.6332110051555113, "learning_rate": 1.6643941377554675e-06, "loss": 0.2743, "step": 2330 }, { "epoch": 0.61180709080488, "grad_norm": 2.2505359636330207, "learning_rate": 1.6627732892634893e-06, "loss": 0.2578, "step": 2335 }, { "epoch": 0.6131171702284451, "grad_norm": 4.154284637955047, "learning_rate": 1.6611493295669386e-06, "loss": 0.3286, "step": 2340 }, { "epoch": 0.6144272496520101, "grad_norm": 4.577727996526487, "learning_rate": 1.6595222662890418e-06, "loss": 0.2868, "step": 2345 }, { "epoch": 0.6157373290755752, "grad_norm": 4.653567093852986, "learning_rate": 1.657892107067594e-06, "loss": 0.2551, "step": 2350 }, { "epoch": 0.6170474084991403, "grad_norm": 4.19440150981268, "learning_rate": 1.6562588595549235e-06, "loss": 0.2847, "step": 2355 }, { "epoch": 0.6183574879227053, "grad_norm": 4.490108240777686, "learning_rate": 1.654622531417856e-06, "loss": 0.319, "step": 2360 }, { "epoch": 0.6196675673462704, "grad_norm": 3.896691949712352, "learning_rate": 1.6529831303376787e-06, "loss": 0.2833, "step": 2365 }, { "epoch": 0.6209776467698355, "grad_norm": 4.158481682638508, "learning_rate": 1.651340664010102e-06, "loss": 0.2759, "step": 2370 }, { "epoch": 0.6222877261934004, "grad_norm": 2.4673087844419337, "learning_rate": 1.6496951401452272e-06, "loss": 0.2068, "step": 2375 }, { "epoch": 0.6235978056169655, "grad_norm": 4.873523893306045, "learning_rate": 1.6480465664675078e-06, "loss": 0.2822, "step": 2380 }, { "epoch": 0.6249078850405306, "grad_norm": 4.513011014916301, "learning_rate": 1.6463949507157131e-06, "loss": 0.311, "step": 2385 }, { "epoch": 0.6262179644640956, "grad_norm": 4.41040904711801, "learning_rate": 1.644740300642894e-06, "loss": 0.2894, "step": 2390 }, { "epoch": 0.6275280438876607, "grad_norm": 7.808507472292027, "learning_rate": 1.6430826240163436e-06, "loss": 0.3345, "step": 2395 }, { "epoch": 0.6288381233112258, "grad_norm": 2.615243925080016, "learning_rate": 1.6414219286175635e-06, "loss": 0.2465, "step": 2400 }, { "epoch": 0.6288381233112258, "eval_accuracy": 0.7408, "eval_loss": 0.7380235195159912, "eval_runtime": 136.2377, "eval_samples_per_second": 9.175, "eval_steps_per_second": 2.297, "step": 2400 }, { "epoch": 0.6301482027347908, "grad_norm": 3.567322311755078, "learning_rate": 1.639758222242225e-06, "loss": 0.2349, "step": 2405 }, { "epoch": 0.6314582821583559, "grad_norm": 5.503722999681643, "learning_rate": 1.638091512700135e-06, "loss": 0.2486, "step": 2410 }, { "epoch": 0.632768361581921, "grad_norm": 5.060199812880774, "learning_rate": 1.6364218078151963e-06, "loss": 0.3254, "step": 2415 }, { "epoch": 0.6340784410054859, "grad_norm": 2.967797600289104, "learning_rate": 1.6347491154253738e-06, "loss": 0.3049, "step": 2420 }, { "epoch": 0.635388520429051, "grad_norm": 3.6123691523694297, "learning_rate": 1.6330734433826562e-06, "loss": 0.3079, "step": 2425 }, { "epoch": 0.636698599852616, "grad_norm": 2.520240254040695, "learning_rate": 1.6313947995530187e-06, "loss": 0.2677, "step": 2430 }, { "epoch": 0.6380086792761811, "grad_norm": 3.700192416490241, "learning_rate": 1.6297131918163874e-06, "loss": 0.2393, "step": 2435 }, { "epoch": 0.6393187586997462, "grad_norm": 3.4585291941554797, "learning_rate": 1.6280286280666011e-06, "loss": 0.253, "step": 2440 }, { "epoch": 0.6406288381233112, "grad_norm": 5.9407205540242884, "learning_rate": 1.6263411162113752e-06, "loss": 0.2991, "step": 2445 }, { "epoch": 0.6419389175468763, "grad_norm": 3.7257997296487546, "learning_rate": 1.624650664172264e-06, "loss": 0.3, "step": 2450 }, { "epoch": 0.6432489969704414, "grad_norm": 7.091164226358698, "learning_rate": 1.6229572798846233e-06, "loss": 0.2964, "step": 2455 }, { "epoch": 0.6445590763940063, "grad_norm": 3.945065293498854, "learning_rate": 1.6212609712975746e-06, "loss": 0.3003, "step": 2460 }, { "epoch": 0.6458691558175714, "grad_norm": 4.547862400421276, "learning_rate": 1.6195617463739657e-06, "loss": 0.312, "step": 2465 }, { "epoch": 0.6471792352411365, "grad_norm": 3.792620720497921, "learning_rate": 1.6178596130903343e-06, "loss": 0.2689, "step": 2470 }, { "epoch": 0.6484893146647015, "grad_norm": 4.549047060259805, "learning_rate": 1.6161545794368712e-06, "loss": 0.3019, "step": 2475 }, { "epoch": 0.6497993940882666, "grad_norm": 2.3008964624889114, "learning_rate": 1.614446653417382e-06, "loss": 0.2427, "step": 2480 }, { "epoch": 0.6511094735118317, "grad_norm": 3.448390528740768, "learning_rate": 1.6127358430492496e-06, "loss": 0.2733, "step": 2485 }, { "epoch": 0.6524195529353967, "grad_norm": 3.115685764931888, "learning_rate": 1.6110221563633966e-06, "loss": 0.2813, "step": 2490 }, { "epoch": 0.6537296323589618, "grad_norm": 2.7637030640180056, "learning_rate": 1.6093056014042476e-06, "loss": 0.316, "step": 2495 }, { "epoch": 0.6550397117825268, "grad_norm": 2.8028721777171763, "learning_rate": 1.6075861862296918e-06, "loss": 0.2465, "step": 2500 }, { "epoch": 0.6550397117825268, "eval_accuracy": 0.7504, "eval_loss": 0.7594350576400757, "eval_runtime": 136.5188, "eval_samples_per_second": 9.156, "eval_steps_per_second": 2.293, "step": 2500 }, { "epoch": 0.6563497912060918, "grad_norm": 2.201406591183069, "learning_rate": 1.6058639189110448e-06, "loss": 0.2579, "step": 2505 }, { "epoch": 0.6576598706296569, "grad_norm": 2.743123705211622, "learning_rate": 1.6041388075330104e-06, "loss": 0.2671, "step": 2510 }, { "epoch": 0.658969950053222, "grad_norm": 3.35926627410109, "learning_rate": 1.6024108601936441e-06, "loss": 0.2722, "step": 2515 }, { "epoch": 0.660280029476787, "grad_norm": 4.177142712614172, "learning_rate": 1.600680085004313e-06, "loss": 0.255, "step": 2520 }, { "epoch": 0.6615901089003521, "grad_norm": 5.607535570859494, "learning_rate": 1.5989464900896584e-06, "loss": 0.2808, "step": 2525 }, { "epoch": 0.6629001883239172, "grad_norm": 4.111155214094161, "learning_rate": 1.5972100835875596e-06, "loss": 0.2749, "step": 2530 }, { "epoch": 0.6642102677474822, "grad_norm": 3.852938068841201, "learning_rate": 1.5954708736490927e-06, "loss": 0.374, "step": 2535 }, { "epoch": 0.6655203471710472, "grad_norm": 4.998114409517782, "learning_rate": 1.5937288684384948e-06, "loss": 0.2988, "step": 2540 }, { "epoch": 0.6668304265946123, "grad_norm": 3.2234837676131036, "learning_rate": 1.5919840761331233e-06, "loss": 0.2926, "step": 2545 }, { "epoch": 0.6681405060181773, "grad_norm": 2.6128145021675135, "learning_rate": 1.59023650492342e-06, "loss": 0.2685, "step": 2550 }, { "epoch": 0.6694505854417424, "grad_norm": 2.746049149593303, "learning_rate": 1.588486163012871e-06, "loss": 0.276, "step": 2555 }, { "epoch": 0.6707606648653075, "grad_norm": 12.230075077506218, "learning_rate": 1.5867330586179692e-06, "loss": 0.3356, "step": 2560 }, { "epoch": 0.6720707442888725, "grad_norm": 3.609150884738277, "learning_rate": 1.5849771999681744e-06, "loss": 0.2876, "step": 2565 }, { "epoch": 0.6733808237124376, "grad_norm": 3.696663482780853, "learning_rate": 1.583218595305876e-06, "loss": 0.2801, "step": 2570 }, { "epoch": 0.6746909031360027, "grad_norm": 2.4314233621566674, "learning_rate": 1.5814572528863537e-06, "loss": 0.246, "step": 2575 }, { "epoch": 0.6760009825595676, "grad_norm": 3.410781191475223, "learning_rate": 1.5796931809777387e-06, "loss": 0.2854, "step": 2580 }, { "epoch": 0.6773110619831327, "grad_norm": 7.965165743041762, "learning_rate": 1.5779263878609752e-06, "loss": 0.3286, "step": 2585 }, { "epoch": 0.6786211414066978, "grad_norm": 2.8880233729881333, "learning_rate": 1.5761568818297814e-06, "loss": 0.3273, "step": 2590 }, { "epoch": 0.6799312208302628, "grad_norm": 4.279782991468597, "learning_rate": 1.5743846711906103e-06, "loss": 0.2907, "step": 2595 }, { "epoch": 0.6812413002538279, "grad_norm": 4.080828216313647, "learning_rate": 1.5726097642626112e-06, "loss": 0.3034, "step": 2600 }, { "epoch": 0.6812413002538279, "eval_accuracy": 0.7576, "eval_loss": 0.6795002818107605, "eval_runtime": 138.009, "eval_samples_per_second": 9.057, "eval_steps_per_second": 2.268, "step": 2600 }, { "epoch": 0.682551379677393, "grad_norm": 3.704947001007602, "learning_rate": 1.5708321693775901e-06, "loss": 0.2779, "step": 2605 }, { "epoch": 0.683861459100958, "grad_norm": 3.7633025557802045, "learning_rate": 1.569051894879971e-06, "loss": 0.2513, "step": 2610 }, { "epoch": 0.6851715385245231, "grad_norm": 4.533908569894921, "learning_rate": 1.5672689491267565e-06, "loss": 0.2519, "step": 2615 }, { "epoch": 0.6864816179480882, "grad_norm": 7.825654578480919, "learning_rate": 1.5654833404874889e-06, "loss": 0.3064, "step": 2620 }, { "epoch": 0.6877916973716531, "grad_norm": 6.701186216511913, "learning_rate": 1.5636950773442107e-06, "loss": 0.2888, "step": 2625 }, { "epoch": 0.6891017767952182, "grad_norm": 5.08064257662279, "learning_rate": 1.5619041680914244e-06, "loss": 0.2841, "step": 2630 }, { "epoch": 0.6904118562187833, "grad_norm": 3.3431125663033403, "learning_rate": 1.560110621136055e-06, "loss": 0.33, "step": 2635 }, { "epoch": 0.6917219356423483, "grad_norm": 3.348150218520968, "learning_rate": 1.5583144448974092e-06, "loss": 0.2425, "step": 2640 }, { "epoch": 0.6930320150659134, "grad_norm": 2.6434575784544485, "learning_rate": 1.556515647807136e-06, "loss": 0.2892, "step": 2645 }, { "epoch": 0.6943420944894785, "grad_norm": 3.4776108132319514, "learning_rate": 1.5547142383091868e-06, "loss": 0.2468, "step": 2650 }, { "epoch": 0.6956521739130435, "grad_norm": 4.831568463736758, "learning_rate": 1.5529102248597772e-06, "loss": 0.2789, "step": 2655 }, { "epoch": 0.6969622533366086, "grad_norm": 2.093319788857653, "learning_rate": 1.5511036159273452e-06, "loss": 0.287, "step": 2660 }, { "epoch": 0.6982723327601735, "grad_norm": 3.5718053321237213, "learning_rate": 1.5492944199925133e-06, "loss": 0.2576, "step": 2665 }, { "epoch": 0.6995824121837386, "grad_norm": 2.6423916949191173, "learning_rate": 1.5474826455480486e-06, "loss": 0.3232, "step": 2670 }, { "epoch": 0.7008924916073037, "grad_norm": 2.4967928837848996, "learning_rate": 1.5456683010988203e-06, "loss": 0.2656, "step": 2675 }, { "epoch": 0.7022025710308687, "grad_norm": 2.280284760261213, "learning_rate": 1.5438513951617637e-06, "loss": 0.223, "step": 2680 }, { "epoch": 0.7035126504544338, "grad_norm": 3.9018364135960497, "learning_rate": 1.5420319362658373e-06, "loss": 0.2352, "step": 2685 }, { "epoch": 0.7048227298779989, "grad_norm": 4.281704624864632, "learning_rate": 1.5402099329519845e-06, "loss": 0.2683, "step": 2690 }, { "epoch": 0.7061328093015639, "grad_norm": 6.450930230248805, "learning_rate": 1.5383853937730916e-06, "loss": 0.2804, "step": 2695 }, { "epoch": 0.707442888725129, "grad_norm": 3.6301375705851835, "learning_rate": 1.53655832729395e-06, "loss": 0.256, "step": 2700 }, { "epoch": 0.707442888725129, "eval_accuracy": 0.7624, "eval_loss": 0.7787925004959106, "eval_runtime": 138.9559, "eval_samples_per_second": 8.996, "eval_steps_per_second": 2.253, "step": 2700 }, { "epoch": 0.708752968148694, "grad_norm": 4.1331580241606725, "learning_rate": 1.534728742091214e-06, "loss": 0.3178, "step": 2705 }, { "epoch": 0.710063047572259, "grad_norm": 4.1609025912552005, "learning_rate": 1.532896646753362e-06, "loss": 0.2764, "step": 2710 }, { "epoch": 0.7113731269958241, "grad_norm": 2.4782882085210884, "learning_rate": 1.5310620498806548e-06, "loss": 0.2497, "step": 2715 }, { "epoch": 0.7126832064193892, "grad_norm": 4.503219440050312, "learning_rate": 1.5292249600850966e-06, "loss": 0.2618, "step": 2720 }, { "epoch": 0.7139932858429542, "grad_norm": 4.86090545111869, "learning_rate": 1.5273853859903935e-06, "loss": 0.2522, "step": 2725 }, { "epoch": 0.7153033652665193, "grad_norm": 4.018354852882808, "learning_rate": 1.525543336231914e-06, "loss": 0.3052, "step": 2730 }, { "epoch": 0.7166134446900844, "grad_norm": 4.797568374404226, "learning_rate": 1.5236988194566469e-06, "loss": 0.3183, "step": 2735 }, { "epoch": 0.7179235241136493, "grad_norm": 4.8386270061207055, "learning_rate": 1.5218518443231628e-06, "loss": 0.2763, "step": 2740 }, { "epoch": 0.7192336035372144, "grad_norm": 4.215400128326543, "learning_rate": 1.5200024195015719e-06, "loss": 0.2661, "step": 2745 }, { "epoch": 0.7205436829607795, "grad_norm": 4.56588429028685, "learning_rate": 1.5181505536734835e-06, "loss": 0.283, "step": 2750 }, { "epoch": 0.7218537623843445, "grad_norm": 6.619608414847504, "learning_rate": 1.5162962555319664e-06, "loss": 0.271, "step": 2755 }, { "epoch": 0.7231638418079096, "grad_norm": 2.4274939604447385, "learning_rate": 1.5144395337815063e-06, "loss": 0.313, "step": 2760 }, { "epoch": 0.7244739212314747, "grad_norm": 5.626984953335138, "learning_rate": 1.5125803971379665e-06, "loss": 0.2866, "step": 2765 }, { "epoch": 0.7257840006550397, "grad_norm": 4.285823933441923, "learning_rate": 1.5107188543285454e-06, "loss": 0.2603, "step": 2770 }, { "epoch": 0.7270940800786048, "grad_norm": 4.38863656110863, "learning_rate": 1.5088549140917381e-06, "loss": 0.3184, "step": 2775 }, { "epoch": 0.7284041595021699, "grad_norm": 2.9328782019117465, "learning_rate": 1.506988585177292e-06, "loss": 0.2389, "step": 2780 }, { "epoch": 0.7297142389257348, "grad_norm": 3.483606480673357, "learning_rate": 1.505119876346168e-06, "loss": 0.276, "step": 2785 }, { "epoch": 0.7310243183492999, "grad_norm": 4.504190498010961, "learning_rate": 1.5032487963705003e-06, "loss": 0.1977, "step": 2790 }, { "epoch": 0.732334397772865, "grad_norm": 4.184926339697806, "learning_rate": 1.5013753540335517e-06, "loss": 0.2972, "step": 2795 }, { "epoch": 0.73364447719643, "grad_norm": 3.8006093754774195, "learning_rate": 1.499499558129676e-06, "loss": 0.2776, "step": 2800 }, { "epoch": 0.73364447719643, "eval_accuracy": 0.7504, "eval_loss": 0.7540197372436523, "eval_runtime": 142.6507, "eval_samples_per_second": 8.763, "eval_steps_per_second": 2.194, "step": 2800 }, { "epoch": 0.7349545566199951, "grad_norm": 3.614639963171112, "learning_rate": 1.497621417464274e-06, "loss": 0.2199, "step": 2805 }, { "epoch": 0.7362646360435602, "grad_norm": 3.4753724939982367, "learning_rate": 1.4957409408537535e-06, "loss": 0.2842, "step": 2810 }, { "epoch": 0.7375747154671252, "grad_norm": 2.829347092202445, "learning_rate": 1.493858137125489e-06, "loss": 0.2054, "step": 2815 }, { "epoch": 0.7388847948906903, "grad_norm": 4.607528640210262, "learning_rate": 1.4919730151177773e-06, "loss": 0.2488, "step": 2820 }, { "epoch": 0.7401948743142553, "grad_norm": 4.424154310853472, "learning_rate": 1.4900855836797995e-06, "loss": 0.3079, "step": 2825 }, { "epoch": 0.7415049537378203, "grad_norm": 2.8286263481877434, "learning_rate": 1.4881958516715757e-06, "loss": 0.267, "step": 2830 }, { "epoch": 0.7428150331613854, "grad_norm": 4.694968243877861, "learning_rate": 1.4863038279639268e-06, "loss": 0.2903, "step": 2835 }, { "epoch": 0.7441251125849505, "grad_norm": 3.672549062096689, "learning_rate": 1.4844095214384309e-06, "loss": 0.2583, "step": 2840 }, { "epoch": 0.7454351920085155, "grad_norm": 3.4410420172535887, "learning_rate": 1.4825129409873822e-06, "loss": 0.3213, "step": 2845 }, { "epoch": 0.7467452714320806, "grad_norm": 3.4101727347068382, "learning_rate": 1.4806140955137495e-06, "loss": 0.2537, "step": 2850 }, { "epoch": 0.7480553508556457, "grad_norm": 4.519383622184218, "learning_rate": 1.4787129939311337e-06, "loss": 0.2929, "step": 2855 }, { "epoch": 0.7493654302792107, "grad_norm": 3.4774712459804404, "learning_rate": 1.4768096451637272e-06, "loss": 0.2682, "step": 2860 }, { "epoch": 0.7506755097027757, "grad_norm": 2.6479749188555575, "learning_rate": 1.4749040581462694e-06, "loss": 0.2519, "step": 2865 }, { "epoch": 0.7519855891263408, "grad_norm": 4.16913561566471, "learning_rate": 1.4729962418240086e-06, "loss": 0.2619, "step": 2870 }, { "epoch": 0.7532956685499058, "grad_norm": 2.0678666370348324, "learning_rate": 1.471086205152657e-06, "loss": 0.319, "step": 2875 }, { "epoch": 0.7546057479734709, "grad_norm": 2.976384517477917, "learning_rate": 1.469173957098349e-06, "loss": 0.3259, "step": 2880 }, { "epoch": 0.755915827397036, "grad_norm": 3.658100772623381, "learning_rate": 1.4672595066376015e-06, "loss": 0.2506, "step": 2885 }, { "epoch": 0.757225906820601, "grad_norm": 6.101815938265203, "learning_rate": 1.4653428627572674e-06, "loss": 0.2655, "step": 2890 }, { "epoch": 0.7585359862441661, "grad_norm": 2.8143348607782337, "learning_rate": 1.4634240344544988e-06, "loss": 0.2684, "step": 2895 }, { "epoch": 0.759846065667731, "grad_norm": 2.105144871048026, "learning_rate": 1.4615030307366998e-06, "loss": 0.2804, "step": 2900 }, { "epoch": 0.759846065667731, "eval_accuracy": 0.748, "eval_loss": 0.7601897716522217, "eval_runtime": 139.2011, "eval_samples_per_second": 8.98, "eval_steps_per_second": 2.249, "step": 2900 }, { "epoch": 0.7611561450912961, "grad_norm": 3.998393869040855, "learning_rate": 1.459579860621488e-06, "loss": 0.2674, "step": 2905 }, { "epoch": 0.7624662245148612, "grad_norm": 2.2268642108185053, "learning_rate": 1.4576545331366488e-06, "loss": 0.2702, "step": 2910 }, { "epoch": 0.7637763039384262, "grad_norm": 6.218849876814229, "learning_rate": 1.4557270573200962e-06, "loss": 0.2864, "step": 2915 }, { "epoch": 0.7650863833619913, "grad_norm": 4.323897051065836, "learning_rate": 1.4537974422198285e-06, "loss": 0.2636, "step": 2920 }, { "epoch": 0.7663964627855564, "grad_norm": 5.750558481033307, "learning_rate": 1.451865696893886e-06, "loss": 0.2319, "step": 2925 }, { "epoch": 0.7677065422091214, "grad_norm": 6.231068587651604, "learning_rate": 1.4499318304103097e-06, "loss": 0.2912, "step": 2930 }, { "epoch": 0.7690166216326865, "grad_norm": 3.8621492036274545, "learning_rate": 1.447995851847096e-06, "loss": 0.2594, "step": 2935 }, { "epoch": 0.7703267010562516, "grad_norm": 4.8680458967049285, "learning_rate": 1.4460577702921577e-06, "loss": 0.2787, "step": 2940 }, { "epoch": 0.7716367804798165, "grad_norm": 2.277649540761437, "learning_rate": 1.4441175948432784e-06, "loss": 0.2722, "step": 2945 }, { "epoch": 0.7729468599033816, "grad_norm": 3.335412545156817, "learning_rate": 1.4421753346080714e-06, "loss": 0.2614, "step": 2950 }, { "epoch": 0.7742569393269467, "grad_norm": 3.194957371530983, "learning_rate": 1.4402309987039365e-06, "loss": 0.3021, "step": 2955 }, { "epoch": 0.7755670187505117, "grad_norm": 4.092383074593162, "learning_rate": 1.4382845962580165e-06, "loss": 0.2532, "step": 2960 }, { "epoch": 0.7768770981740768, "grad_norm": 2.9160857878852595, "learning_rate": 1.436336136407156e-06, "loss": 0.3102, "step": 2965 }, { "epoch": 0.7781871775976419, "grad_norm": 2.6117926566384377, "learning_rate": 1.4343856282978565e-06, "loss": 0.2532, "step": 2970 }, { "epoch": 0.7794972570212069, "grad_norm": 2.514471217178964, "learning_rate": 1.4324330810862354e-06, "loss": 0.2709, "step": 2975 }, { "epoch": 0.780807336444772, "grad_norm": 2.391957064033006, "learning_rate": 1.430478503937981e-06, "loss": 0.2655, "step": 2980 }, { "epoch": 0.782117415868337, "grad_norm": 8.037406108008932, "learning_rate": 1.4285219060283119e-06, "loss": 0.3229, "step": 2985 }, { "epoch": 0.783427495291902, "grad_norm": 2.024818465241261, "learning_rate": 1.4265632965419311e-06, "loss": 0.2476, "step": 2990 }, { "epoch": 0.7847375747154671, "grad_norm": 4.525940151913456, "learning_rate": 1.4246026846729864e-06, "loss": 0.2801, "step": 2995 }, { "epoch": 0.7860476541390322, "grad_norm": 5.616203546892296, "learning_rate": 1.422640079625023e-06, "loss": 0.2893, "step": 3000 }, { "epoch": 0.7860476541390322, "eval_accuracy": 0.7448, "eval_loss": 0.8274851441383362, "eval_runtime": 139.0761, "eval_samples_per_second": 8.988, "eval_steps_per_second": 2.251, "step": 3000 }, { "epoch": 0.7873577335625972, "grad_norm": 3.641490270919579, "learning_rate": 1.420675490610944e-06, "loss": 0.2927, "step": 3005 }, { "epoch": 0.7886678129861623, "grad_norm": 2.8550119558130884, "learning_rate": 1.418708926852965e-06, "loss": 0.2525, "step": 3010 }, { "epoch": 0.7899778924097274, "grad_norm": 3.081716905966947, "learning_rate": 1.4167403975825726e-06, "loss": 0.2494, "step": 3015 }, { "epoch": 0.7912879718332924, "grad_norm": 4.676494817613609, "learning_rate": 1.4147699120404775e-06, "loss": 0.2858, "step": 3020 }, { "epoch": 0.7925980512568574, "grad_norm": 1.9991869906134794, "learning_rate": 1.4127974794765764e-06, "loss": 0.2937, "step": 3025 }, { "epoch": 0.7939081306804225, "grad_norm": 2.6233135206447398, "learning_rate": 1.410823109149904e-06, "loss": 0.2932, "step": 3030 }, { "epoch": 0.7952182101039875, "grad_norm": 2.2451673120874176, "learning_rate": 1.408846810328592e-06, "loss": 0.2594, "step": 3035 }, { "epoch": 0.7965282895275526, "grad_norm": 2.7009560292084336, "learning_rate": 1.4068685922898244e-06, "loss": 0.3115, "step": 3040 }, { "epoch": 0.7978383689511177, "grad_norm": 2.433826115411649, "learning_rate": 1.4048884643197947e-06, "loss": 0.268, "step": 3045 }, { "epoch": 0.7991484483746827, "grad_norm": 3.5050872305744663, "learning_rate": 1.4029064357136626e-06, "loss": 0.266, "step": 3050 }, { "epoch": 0.8004585277982478, "grad_norm": 3.1713770891943462, "learning_rate": 1.4009225157755085e-06, "loss": 0.2807, "step": 3055 }, { "epoch": 0.8017686072218129, "grad_norm": 3.3467606419772697, "learning_rate": 1.3989367138182924e-06, "loss": 0.2641, "step": 3060 }, { "epoch": 0.8030786866453778, "grad_norm": 3.1069787059795946, "learning_rate": 1.396949039163808e-06, "loss": 0.277, "step": 3065 }, { "epoch": 0.8043887660689429, "grad_norm": 4.763920028748174, "learning_rate": 1.3949595011426407e-06, "loss": 0.2625, "step": 3070 }, { "epoch": 0.805698845492508, "grad_norm": 2.8182907657903606, "learning_rate": 1.392968109094122e-06, "loss": 0.2487, "step": 3075 }, { "epoch": 0.807008924916073, "grad_norm": 2.72524792612828, "learning_rate": 1.3909748723662871e-06, "loss": 0.2513, "step": 3080 }, { "epoch": 0.8083190043396381, "grad_norm": 3.3492249191737677, "learning_rate": 1.3889798003158312e-06, "loss": 0.2844, "step": 3085 }, { "epoch": 0.8096290837632032, "grad_norm": 4.142567741582466, "learning_rate": 1.3869829023080636e-06, "loss": 0.2978, "step": 3090 }, { "epoch": 0.8109391631867682, "grad_norm": 4.435078753474168, "learning_rate": 1.384984187716866e-06, "loss": 0.217, "step": 3095 }, { "epoch": 0.8122492426103333, "grad_norm": 6.294316726095601, "learning_rate": 1.3829836659246473e-06, "loss": 0.3141, "step": 3100 }, { "epoch": 0.8122492426103333, "eval_accuracy": 0.7392, "eval_loss": 0.7475783824920654, "eval_runtime": 139.032, "eval_samples_per_second": 8.991, "eval_steps_per_second": 2.251, "step": 3100 }, { "epoch": 0.8135593220338984, "grad_norm": 2.2935780929180707, "learning_rate": 1.3809813463222995e-06, "loss": 0.2432, "step": 3105 }, { "epoch": 0.8148694014574633, "grad_norm": 3.0405694359199367, "learning_rate": 1.3789772383091542e-06, "loss": 0.234, "step": 3110 }, { "epoch": 0.8161794808810284, "grad_norm": 2.6036641756583974, "learning_rate": 1.3769713512929384e-06, "loss": 0.2513, "step": 3115 }, { "epoch": 0.8174895603045935, "grad_norm": 3.0854001671426166, "learning_rate": 1.37496369468973e-06, "loss": 0.3248, "step": 3120 }, { "epoch": 0.8187996397281585, "grad_norm": 2.7741805327089253, "learning_rate": 1.3729542779239133e-06, "loss": 0.2183, "step": 3125 }, { "epoch": 0.8201097191517236, "grad_norm": 3.154383132924369, "learning_rate": 1.370943110428136e-06, "loss": 0.2318, "step": 3130 }, { "epoch": 0.8214197985752886, "grad_norm": 2.728987342059349, "learning_rate": 1.3689302016432628e-06, "loss": 0.2505, "step": 3135 }, { "epoch": 0.8227298779988537, "grad_norm": 4.260828009892338, "learning_rate": 1.3669155610183336e-06, "loss": 0.2859, "step": 3140 }, { "epoch": 0.8240399574224188, "grad_norm": 4.698119734994208, "learning_rate": 1.364899198010518e-06, "loss": 0.3126, "step": 3145 }, { "epoch": 0.8253500368459837, "grad_norm": 2.9266861851773336, "learning_rate": 1.3628811220850703e-06, "loss": 0.2524, "step": 3150 }, { "epoch": 0.8266601162695488, "grad_norm": 3.3660376635311744, "learning_rate": 1.3608613427152854e-06, "loss": 0.26, "step": 3155 }, { "epoch": 0.8279701956931139, "grad_norm": 2.7452509590309027, "learning_rate": 1.358839869382455e-06, "loss": 0.2787, "step": 3160 }, { "epoch": 0.8292802751166789, "grad_norm": 2.2871077739708348, "learning_rate": 1.356816711575823e-06, "loss": 0.2774, "step": 3165 }, { "epoch": 0.830590354540244, "grad_norm": 2.353498321089704, "learning_rate": 1.3547918787925392e-06, "loss": 0.1922, "step": 3170 }, { "epoch": 0.8319004339638091, "grad_norm": 2.8043846357680895, "learning_rate": 1.352765380537618e-06, "loss": 0.2457, "step": 3175 }, { "epoch": 0.8332105133873741, "grad_norm": 6.287288851930004, "learning_rate": 1.3507372263238901e-06, "loss": 0.2882, "step": 3180 }, { "epoch": 0.8345205928109392, "grad_norm": 5.293879458072892, "learning_rate": 1.3487074256719608e-06, "loss": 0.2908, "step": 3185 }, { "epoch": 0.8358306722345042, "grad_norm": 3.70662303230532, "learning_rate": 1.3466759881101637e-06, "loss": 0.2343, "step": 3190 }, { "epoch": 0.8371407516580692, "grad_norm": 5.617247281731303, "learning_rate": 1.344642923174517e-06, "loss": 0.3469, "step": 3195 }, { "epoch": 0.8384508310816343, "grad_norm": 5.831422377330226, "learning_rate": 1.3426082404086772e-06, "loss": 0.3464, "step": 3200 }, { "epoch": 0.8384508310816343, "eval_accuracy": 0.7464, "eval_loss": 0.6823216080665588, "eval_runtime": 137.8591, "eval_samples_per_second": 9.067, "eval_steps_per_second": 2.27, "step": 3200 }, { "epoch": 0.8397609105051994, "grad_norm": 5.229782066538766, "learning_rate": 1.3405719493638959e-06, "loss": 0.2926, "step": 3205 }, { "epoch": 0.8410709899287644, "grad_norm": 1.8257926894676517, "learning_rate": 1.3385340595989738e-06, "loss": 0.2532, "step": 3210 }, { "epoch": 0.8423810693523295, "grad_norm": 2.1256870704370434, "learning_rate": 1.3364945806802173e-06, "loss": 0.2456, "step": 3215 }, { "epoch": 0.8436911487758946, "grad_norm": 2.7507619238311065, "learning_rate": 1.3344535221813915e-06, "loss": 0.2556, "step": 3220 }, { "epoch": 0.8450012281994596, "grad_norm": 3.313724807442175, "learning_rate": 1.3324108936836775e-06, "loss": 0.2604, "step": 3225 }, { "epoch": 0.8463113076230246, "grad_norm": 3.2583479898589385, "learning_rate": 1.330366704775625e-06, "loss": 0.2566, "step": 3230 }, { "epoch": 0.8476213870465897, "grad_norm": 4.463854161721075, "learning_rate": 1.3283209650531098e-06, "loss": 0.3077, "step": 3235 }, { "epoch": 0.8489314664701547, "grad_norm": 2.7758922633618868, "learning_rate": 1.326273684119287e-06, "loss": 0.2555, "step": 3240 }, { "epoch": 0.8502415458937198, "grad_norm": 6.067311625889626, "learning_rate": 1.3242248715845468e-06, "loss": 0.3606, "step": 3245 }, { "epoch": 0.8515516253172849, "grad_norm": 2.7176945371959658, "learning_rate": 1.3221745370664689e-06, "loss": 0.2035, "step": 3250 }, { "epoch": 0.8528617047408499, "grad_norm": 5.014963951139648, "learning_rate": 1.3201226901897773e-06, "loss": 0.3122, "step": 3255 }, { "epoch": 0.854171784164415, "grad_norm": 3.1409689213203262, "learning_rate": 1.318069340586296e-06, "loss": 0.2756, "step": 3260 }, { "epoch": 0.8554818635879801, "grad_norm": 2.6726747362164613, "learning_rate": 1.316014497894902e-06, "loss": 0.2037, "step": 3265 }, { "epoch": 0.856791943011545, "grad_norm": 4.397327066527509, "learning_rate": 1.3139581717614822e-06, "loss": 0.2166, "step": 3270 }, { "epoch": 0.8581020224351101, "grad_norm": 5.515919631566852, "learning_rate": 1.311900371838887e-06, "loss": 0.3015, "step": 3275 }, { "epoch": 0.8594121018586752, "grad_norm": 2.8756311830991206, "learning_rate": 1.3098411077868846e-06, "loss": 0.2597, "step": 3280 }, { "epoch": 0.8607221812822402, "grad_norm": 5.970001237473167, "learning_rate": 1.3077803892721166e-06, "loss": 0.2328, "step": 3285 }, { "epoch": 0.8620322607058053, "grad_norm": 5.921050843170067, "learning_rate": 1.3057182259680517e-06, "loss": 0.235, "step": 3290 }, { "epoch": 0.8633423401293704, "grad_norm": 3.740850134478578, "learning_rate": 1.3036546275549416e-06, "loss": 0.2827, "step": 3295 }, { "epoch": 0.8646524195529354, "grad_norm": 3.5537486719044873, "learning_rate": 1.3015896037197737e-06, "loss": 0.2382, "step": 3300 }, { "epoch": 0.8646524195529354, "eval_accuracy": 0.7336, "eval_loss": 0.791848361492157, "eval_runtime": 137.8386, "eval_samples_per_second": 9.069, "eval_steps_per_second": 2.271, "step": 3300 }, { "epoch": 0.8659624989765005, "grad_norm": 3.7119905199979706, "learning_rate": 1.2995231641562276e-06, "loss": 0.255, "step": 3305 }, { "epoch": 0.8672725784000656, "grad_norm": 3.383107638588926, "learning_rate": 1.2974553185646275e-06, "loss": 0.2459, "step": 3310 }, { "epoch": 0.8685826578236305, "grad_norm": 3.48789577540892, "learning_rate": 1.295386076651899e-06, "loss": 0.2969, "step": 3315 }, { "epoch": 0.8698927372471956, "grad_norm": 3.9640319112857556, "learning_rate": 1.2933154481315219e-06, "loss": 0.2857, "step": 3320 }, { "epoch": 0.8712028166707607, "grad_norm": 3.1582073938250077, "learning_rate": 1.2912434427234841e-06, "loss": 0.254, "step": 3325 }, { "epoch": 0.8725128960943257, "grad_norm": 2.276234259584371, "learning_rate": 1.289170070154239e-06, "loss": 0.2445, "step": 3330 }, { "epoch": 0.8738229755178908, "grad_norm": 5.00541090993625, "learning_rate": 1.2870953401566555e-06, "loss": 0.2843, "step": 3335 }, { "epoch": 0.8751330549414559, "grad_norm": 3.159470503361849, "learning_rate": 1.285019262469976e-06, "loss": 0.2521, "step": 3340 }, { "epoch": 0.8764431343650209, "grad_norm": 5.2597276914314435, "learning_rate": 1.282941846839769e-06, "loss": 0.2499, "step": 3345 }, { "epoch": 0.877753213788586, "grad_norm": 3.595455023325043, "learning_rate": 1.2808631030178834e-06, "loss": 0.2818, "step": 3350 }, { "epoch": 0.8790632932121509, "grad_norm": 3.5145264435052934, "learning_rate": 1.278783040762403e-06, "loss": 0.3035, "step": 3355 }, { "epoch": 0.880373372635716, "grad_norm": 1.9251613140804913, "learning_rate": 1.2767016698376002e-06, "loss": 0.2244, "step": 3360 }, { "epoch": 0.8816834520592811, "grad_norm": 4.237713316911567, "learning_rate": 1.2746190000138915e-06, "loss": 0.2627, "step": 3365 }, { "epoch": 0.8829935314828461, "grad_norm": 2.6561528116215474, "learning_rate": 1.27253504106779e-06, "loss": 0.273, "step": 3370 }, { "epoch": 0.8843036109064112, "grad_norm": 3.109317673581231, "learning_rate": 1.2704498027818603e-06, "loss": 0.2651, "step": 3375 }, { "epoch": 0.8856136903299763, "grad_norm": 2.0153319806341403, "learning_rate": 1.2683632949446726e-06, "loss": 0.2476, "step": 3380 }, { "epoch": 0.8869237697535413, "grad_norm": 5.576097183261757, "learning_rate": 1.266275527350757e-06, "loss": 0.235, "step": 3385 }, { "epoch": 0.8882338491771063, "grad_norm": 3.7365883269489784, "learning_rate": 1.2641865098005564e-06, "loss": 0.2446, "step": 3390 }, { "epoch": 0.8895439286006714, "grad_norm": 2.5559798536789153, "learning_rate": 1.2620962521003824e-06, "loss": 0.2616, "step": 3395 }, { "epoch": 0.8908540080242364, "grad_norm": 3.7982545775172545, "learning_rate": 1.260004764062367e-06, "loss": 0.3298, "step": 3400 }, { "epoch": 0.8908540080242364, "eval_accuracy": 0.7328, "eval_loss": 0.8060081601142883, "eval_runtime": 137.0834, "eval_samples_per_second": 9.119, "eval_steps_per_second": 2.283, "step": 3400 }, { "epoch": 0.8921640874478015, "grad_norm": 3.7011223316651165, "learning_rate": 1.2579120555044183e-06, "loss": 0.2734, "step": 3405 }, { "epoch": 0.8934741668713666, "grad_norm": 3.1941707471283496, "learning_rate": 1.2558181362501733e-06, "loss": 0.2535, "step": 3410 }, { "epoch": 0.8947842462949316, "grad_norm": 2.791095240654008, "learning_rate": 1.2537230161289536e-06, "loss": 0.264, "step": 3415 }, { "epoch": 0.8960943257184967, "grad_norm": 3.6185337902204244, "learning_rate": 1.2516267049757156e-06, "loss": 0.2472, "step": 3420 }, { "epoch": 0.8974044051420618, "grad_norm": 3.85855921456429, "learning_rate": 1.249529212631009e-06, "loss": 0.3052, "step": 3425 }, { "epoch": 0.8987144845656267, "grad_norm": 3.6343729382527146, "learning_rate": 1.247430548940927e-06, "loss": 0.2441, "step": 3430 }, { "epoch": 0.9000245639891918, "grad_norm": 3.2162815125864164, "learning_rate": 1.2453307237570617e-06, "loss": 0.2659, "step": 3435 }, { "epoch": 0.9013346434127569, "grad_norm": 3.1960651267976896, "learning_rate": 1.2432297469364569e-06, "loss": 0.2555, "step": 3440 }, { "epoch": 0.9026447228363219, "grad_norm": 3.523205821552062, "learning_rate": 1.2411276283415638e-06, "loss": 0.2867, "step": 3445 }, { "epoch": 0.903954802259887, "grad_norm": 3.455735398408258, "learning_rate": 1.2390243778401927e-06, "loss": 0.2998, "step": 3450 }, { "epoch": 0.9052648816834521, "grad_norm": 2.6520324497661822, "learning_rate": 1.2369200053054663e-06, "loss": 0.2581, "step": 3455 }, { "epoch": 0.9065749611070171, "grad_norm": 2.2579348470775655, "learning_rate": 1.2348145206157758e-06, "loss": 0.2196, "step": 3460 }, { "epoch": 0.9078850405305822, "grad_norm": 6.681015179121082, "learning_rate": 1.232707933654732e-06, "loss": 0.3075, "step": 3465 }, { "epoch": 0.9091951199541473, "grad_norm": 4.24745491945866, "learning_rate": 1.2306002543111215e-06, "loss": 0.2822, "step": 3470 }, { "epoch": 0.9105051993777122, "grad_norm": 2.769756448881865, "learning_rate": 1.2284914924788568e-06, "loss": 0.2628, "step": 3475 }, { "epoch": 0.9118152788012773, "grad_norm": 2.5369009573995407, "learning_rate": 1.2263816580569333e-06, "loss": 0.2338, "step": 3480 }, { "epoch": 0.9131253582248424, "grad_norm": 3.165436055957326, "learning_rate": 1.224270760949381e-06, "loss": 0.3067, "step": 3485 }, { "epoch": 0.9144354376484074, "grad_norm": 3.351223367009085, "learning_rate": 1.2221588110652183e-06, "loss": 0.3004, "step": 3490 }, { "epoch": 0.9157455170719725, "grad_norm": 3.3328215960308305, "learning_rate": 1.220045818318406e-06, "loss": 0.2857, "step": 3495 }, { "epoch": 0.9170555964955376, "grad_norm": 5.5246139033024635, "learning_rate": 1.2179317926277987e-06, "loss": 0.2715, "step": 3500 }, { "epoch": 0.9170555964955376, "eval_accuracy": 0.7488, "eval_loss": 0.7673412561416626, "eval_runtime": 137.1036, "eval_samples_per_second": 9.117, "eval_steps_per_second": 2.283, "step": 3500 }, { "epoch": 0.9183656759191026, "grad_norm": 2.8554155032311366, "learning_rate": 1.2158167439171026e-06, "loss": 0.2767, "step": 3505 }, { "epoch": 0.9196757553426677, "grad_norm": 3.134856252459499, "learning_rate": 1.2137006821148234e-06, "loss": 0.296, "step": 3510 }, { "epoch": 0.9209858347662327, "grad_norm": 3.993316808863827, "learning_rate": 1.2115836171542243e-06, "loss": 0.3058, "step": 3515 }, { "epoch": 0.9222959141897977, "grad_norm": 3.939068373566409, "learning_rate": 1.2094655589732773e-06, "loss": 0.2605, "step": 3520 }, { "epoch": 0.9236059936133628, "grad_norm": 2.3253660721101825, "learning_rate": 1.2073465175146159e-06, "loss": 0.2342, "step": 3525 }, { "epoch": 0.9249160730369279, "grad_norm": 4.9098246531853675, "learning_rate": 1.2052265027254904e-06, "loss": 0.2824, "step": 3530 }, { "epoch": 0.9262261524604929, "grad_norm": 4.153863783851212, "learning_rate": 1.203105524557719e-06, "loss": 0.2884, "step": 3535 }, { "epoch": 0.927536231884058, "grad_norm": 2.8597337529967786, "learning_rate": 1.2009835929676435e-06, "loss": 0.2527, "step": 3540 }, { "epoch": 0.9288463113076231, "grad_norm": 3.7214300730819603, "learning_rate": 1.19886071791608e-06, "loss": 0.2642, "step": 3545 }, { "epoch": 0.930156390731188, "grad_norm": 2.2141165902052826, "learning_rate": 1.196736909368275e-06, "loss": 0.1855, "step": 3550 }, { "epoch": 0.9314664701547531, "grad_norm": 4.015370663796761, "learning_rate": 1.1946121772938554e-06, "loss": 0.2747, "step": 3555 }, { "epoch": 0.9327765495783182, "grad_norm": 2.7879854190831366, "learning_rate": 1.1924865316667839e-06, "loss": 0.2768, "step": 3560 }, { "epoch": 0.9340866290018832, "grad_norm": 10.049756094566998, "learning_rate": 1.190359982465312e-06, "loss": 0.2436, "step": 3565 }, { "epoch": 0.9353967084254483, "grad_norm": 3.644259827300173, "learning_rate": 1.1882325396719323e-06, "loss": 0.2508, "step": 3570 }, { "epoch": 0.9367067878490134, "grad_norm": 3.612418668345567, "learning_rate": 1.1861042132733328e-06, "loss": 0.269, "step": 3575 }, { "epoch": 0.9380168672725784, "grad_norm": 3.7040505997852735, "learning_rate": 1.1839750132603486e-06, "loss": 0.2481, "step": 3580 }, { "epoch": 0.9393269466961435, "grad_norm": 4.288075933572893, "learning_rate": 1.1818449496279159e-06, "loss": 0.2708, "step": 3585 }, { "epoch": 0.9406370261197085, "grad_norm": 2.940632362279229, "learning_rate": 1.1797140323750249e-06, "loss": 0.2669, "step": 3590 }, { "epoch": 0.9419471055432735, "grad_norm": 3.7558809010488394, "learning_rate": 1.1775822715046736e-06, "loss": 0.2544, "step": 3595 }, { "epoch": 0.9432571849668386, "grad_norm": 3.414999704185975, "learning_rate": 1.175449677023819e-06, "loss": 0.2229, "step": 3600 }, { "epoch": 0.9432571849668386, "eval_accuracy": 0.7704, "eval_loss": 0.7259599566459656, "eval_runtime": 137.2184, "eval_samples_per_second": 9.11, "eval_steps_per_second": 2.281, "step": 3600 }, { "epoch": 0.9445672643904036, "grad_norm": 3.8559190297316244, "learning_rate": 1.173316258943332e-06, "loss": 0.2573, "step": 3605 }, { "epoch": 0.9458773438139687, "grad_norm": 3.862447153928952, "learning_rate": 1.1711820272779497e-06, "loss": 0.2706, "step": 3610 }, { "epoch": 0.9471874232375338, "grad_norm": 3.11275213461627, "learning_rate": 1.1690469920462276e-06, "loss": 0.242, "step": 3615 }, { "epoch": 0.9484975026610988, "grad_norm": 2.7086439962254265, "learning_rate": 1.166911163270494e-06, "loss": 0.2639, "step": 3620 }, { "epoch": 0.9498075820846639, "grad_norm": 2.7066495290470205, "learning_rate": 1.1647745509768025e-06, "loss": 0.2526, "step": 3625 }, { "epoch": 0.951117661508229, "grad_norm": 3.9839085079101735, "learning_rate": 1.1626371651948836e-06, "loss": 0.2045, "step": 3630 }, { "epoch": 0.9524277409317939, "grad_norm": 4.411550498275121, "learning_rate": 1.1604990159580998e-06, "loss": 0.2613, "step": 3635 }, { "epoch": 0.953737820355359, "grad_norm": 3.830942956470282, "learning_rate": 1.1583601133033973e-06, "loss": 0.3089, "step": 3640 }, { "epoch": 0.9550478997789241, "grad_norm": 4.664311559495326, "learning_rate": 1.1562204672712583e-06, "loss": 0.2669, "step": 3645 }, { "epoch": 0.9563579792024891, "grad_norm": 2.3992079061884515, "learning_rate": 1.1540800879056554e-06, "loss": 0.2524, "step": 3650 }, { "epoch": 0.9576680586260542, "grad_norm": 2.8182995787216627, "learning_rate": 1.1519389852540032e-06, "loss": 0.2641, "step": 3655 }, { "epoch": 0.9589781380496193, "grad_norm": 4.0781942860428675, "learning_rate": 1.1497971693671113e-06, "loss": 0.2646, "step": 3660 }, { "epoch": 0.9602882174731843, "grad_norm": 4.213894313755887, "learning_rate": 1.147654650299138e-06, "loss": 0.239, "step": 3665 }, { "epoch": 0.9615982968967494, "grad_norm": 2.8090197504712737, "learning_rate": 1.1455114381075423e-06, "loss": 0.2587, "step": 3670 }, { "epoch": 0.9629083763203145, "grad_norm": 3.858779958604277, "learning_rate": 1.1433675428530366e-06, "loss": 0.2865, "step": 3675 }, { "epoch": 0.9642184557438794, "grad_norm": 3.865527824121575, "learning_rate": 1.14122297459954e-06, "loss": 0.2328, "step": 3680 }, { "epoch": 0.9655285351674445, "grad_norm": 1.6264730559645344, "learning_rate": 1.1390777434141306e-06, "loss": 0.2631, "step": 3685 }, { "epoch": 0.9668386145910096, "grad_norm": 3.1179917555516834, "learning_rate": 1.1369318593669988e-06, "loss": 0.2577, "step": 3690 }, { "epoch": 0.9681486940145746, "grad_norm": 4.097463115035321, "learning_rate": 1.1347853325313993e-06, "loss": 0.2727, "step": 3695 }, { "epoch": 0.9694587734381397, "grad_norm": 2.5547154186995145, "learning_rate": 1.1326381729836045e-06, "loss": 0.225, "step": 3700 }, { "epoch": 0.9694587734381397, "eval_accuracy": 0.7512, "eval_loss": 0.6980738043785095, "eval_runtime": 138.6178, "eval_samples_per_second": 9.018, "eval_steps_per_second": 2.258, "step": 3700 }, { "epoch": 0.9707688528617048, "grad_norm": 2.383269216589878, "learning_rate": 1.1304903908028569e-06, "loss": 0.2568, "step": 3705 }, { "epoch": 0.9720789322852698, "grad_norm": 4.510012606255057, "learning_rate": 1.1283419960713212e-06, "loss": 0.3083, "step": 3710 }, { "epoch": 0.9733890117088349, "grad_norm": 2.280997080205734, "learning_rate": 1.126192998874038e-06, "loss": 0.2523, "step": 3715 }, { "epoch": 0.9746990911323999, "grad_norm": 6.594544711402591, "learning_rate": 1.1240434092988764e-06, "loss": 0.2407, "step": 3720 }, { "epoch": 0.9760091705559649, "grad_norm": 4.135213461207541, "learning_rate": 1.1218932374364855e-06, "loss": 0.2893, "step": 3725 }, { "epoch": 0.97731924997953, "grad_norm": 2.861868089040975, "learning_rate": 1.1197424933802485e-06, "loss": 0.2204, "step": 3730 }, { "epoch": 0.9786293294030951, "grad_norm": 5.266728439797464, "learning_rate": 1.1175911872262332e-06, "loss": 0.3179, "step": 3735 }, { "epoch": 0.9799394088266601, "grad_norm": 3.5669822038480405, "learning_rate": 1.1154393290731483e-06, "loss": 0.2392, "step": 3740 }, { "epoch": 0.9812494882502252, "grad_norm": 3.360906639915107, "learning_rate": 1.1132869290222917e-06, "loss": 0.2802, "step": 3745 }, { "epoch": 0.9825595676737903, "grad_norm": 4.200005976718456, "learning_rate": 1.111133997177506e-06, "loss": 0.3154, "step": 3750 }, { "epoch": 0.9838696470973552, "grad_norm": 2.262745756657975, "learning_rate": 1.1089805436451303e-06, "loss": 0.2222, "step": 3755 }, { "epoch": 0.9851797265209203, "grad_norm": 2.6653368398425723, "learning_rate": 1.1068265785339518e-06, "loss": 0.2718, "step": 3760 }, { "epoch": 0.9864898059444854, "grad_norm": 4.296893472198315, "learning_rate": 1.1046721119551598e-06, "loss": 0.3262, "step": 3765 }, { "epoch": 0.9877998853680504, "grad_norm": 4.189321257109803, "learning_rate": 1.1025171540222977e-06, "loss": 0.2656, "step": 3770 }, { "epoch": 0.9891099647916155, "grad_norm": 4.22478244416407, "learning_rate": 1.1003617148512149e-06, "loss": 0.2863, "step": 3775 }, { "epoch": 0.9904200442151806, "grad_norm": 3.0639361862726995, "learning_rate": 1.0982058045600205e-06, "loss": 0.2578, "step": 3780 }, { "epoch": 0.9917301236387456, "grad_norm": 3.242022108834711, "learning_rate": 1.0960494332690342e-06, "loss": 0.2316, "step": 3785 }, { "epoch": 0.9930402030623107, "grad_norm": 3.563673193533161, "learning_rate": 1.093892611100741e-06, "loss": 0.2838, "step": 3790 }, { "epoch": 0.9943502824858758, "grad_norm": 4.493703507987354, "learning_rate": 1.0917353481797412e-06, "loss": 0.2579, "step": 3795 }, { "epoch": 0.9956603619094407, "grad_norm": 6.8325710898527054, "learning_rate": 1.089577654632705e-06, "loss": 0.2317, "step": 3800 }, { "epoch": 0.9956603619094407, "eval_accuracy": 0.7432, "eval_loss": 0.6873839497566223, "eval_runtime": 147.4303, "eval_samples_per_second": 8.479, "eval_steps_per_second": 2.123, "step": 3800 }, { "epoch": 0.9969704413330058, "grad_norm": 3.8200894747277454, "learning_rate": 1.0874195405883231e-06, "loss": 0.2404, "step": 3805 }, { "epoch": 0.9982805207565709, "grad_norm": 2.60177554342581, "learning_rate": 1.085261016177261e-06, "loss": 0.2528, "step": 3810 }, { "epoch": 0.9995906001801359, "grad_norm": 2.373688391522338, "learning_rate": 1.0831020915321109e-06, "loss": 0.2214, "step": 3815 }, { "epoch": 1.000900679603701, "grad_norm": 2.2734563557794907, "learning_rate": 1.080942776787342e-06, "loss": 0.1921, "step": 3820 }, { "epoch": 1.002210759027266, "grad_norm": 2.3069259940839606, "learning_rate": 1.0787830820792566e-06, "loss": 0.2056, "step": 3825 }, { "epoch": 1.0035208384508312, "grad_norm": 6.478254156842445, "learning_rate": 1.0766230175459394e-06, "loss": 0.1716, "step": 3830 }, { "epoch": 1.0048309178743962, "grad_norm": 2.849832491085133, "learning_rate": 1.0744625933272118e-06, "loss": 0.1632, "step": 3835 }, { "epoch": 1.0061409972979611, "grad_norm": 3.009110354194684, "learning_rate": 1.0723018195645835e-06, "loss": 0.1915, "step": 3840 }, { "epoch": 1.0074510767215263, "grad_norm": 7.183583057327153, "learning_rate": 1.070140706401205e-06, "loss": 0.1776, "step": 3845 }, { "epoch": 1.0087611561450913, "grad_norm": 7.645123151827477, "learning_rate": 1.0679792639818199e-06, "loss": 0.2206, "step": 3850 }, { "epoch": 1.0100712355686563, "grad_norm": 2.0069646269660875, "learning_rate": 1.0658175024527175e-06, "loss": 0.1073, "step": 3855 }, { "epoch": 1.0113813149922215, "grad_norm": 2.0486815490375325, "learning_rate": 1.0636554319616853e-06, "loss": 0.1817, "step": 3860 }, { "epoch": 1.0126913944157865, "grad_norm": 7.2346007294382435, "learning_rate": 1.0614930626579603e-06, "loss": 0.2206, "step": 3865 }, { "epoch": 1.0140014738393515, "grad_norm": 4.437819204972086, "learning_rate": 1.0593304046921838e-06, "loss": 0.1944, "step": 3870 }, { "epoch": 1.0153115532629167, "grad_norm": 4.917690349323733, "learning_rate": 1.0571674682163504e-06, "loss": 0.1716, "step": 3875 }, { "epoch": 1.0166216326864816, "grad_norm": 4.465444787397066, "learning_rate": 1.0550042633837629e-06, "loss": 0.1873, "step": 3880 }, { "epoch": 1.0179317121100466, "grad_norm": 5.034120197481139, "learning_rate": 1.052840800348984e-06, "loss": 0.1971, "step": 3885 }, { "epoch": 1.0192417915336118, "grad_norm": 4.650074157777241, "learning_rate": 1.050677089267788e-06, "loss": 0.1936, "step": 3890 }, { "epoch": 1.0205518709571768, "grad_norm": 2.482941440321424, "learning_rate": 1.0485131402971142e-06, "loss": 0.1653, "step": 3895 }, { "epoch": 1.0218619503807418, "grad_norm": 2.852065540223939, "learning_rate": 1.0463489635950179e-06, "loss": 0.1846, "step": 3900 }, { "epoch": 1.0218619503807418, "eval_accuracy": 0.7488, "eval_loss": 0.8795240521430969, "eval_runtime": 142.5175, "eval_samples_per_second": 8.771, "eval_steps_per_second": 2.196, "step": 3900 }, { "epoch": 1.023172029804307, "grad_norm": 1.8616921778346158, "learning_rate": 1.0441845693206237e-06, "loss": 0.1646, "step": 3905 }, { "epoch": 1.024482109227872, "grad_norm": 2.432644973681949, "learning_rate": 1.0420199676340777e-06, "loss": 0.1653, "step": 3910 }, { "epoch": 1.025792188651437, "grad_norm": 6.272988090712415, "learning_rate": 1.0398551686964993e-06, "loss": 0.181, "step": 3915 }, { "epoch": 1.0271022680750022, "grad_norm": 5.836433358300422, "learning_rate": 1.0376901826699347e-06, "loss": 0.225, "step": 3920 }, { "epoch": 1.0284123474985671, "grad_norm": 4.8783777162924205, "learning_rate": 1.0355250197173066e-06, "loss": 0.193, "step": 3925 }, { "epoch": 1.0297224269221321, "grad_norm": 3.7860774323233537, "learning_rate": 1.0333596900023702e-06, "loss": 0.1351, "step": 3930 }, { "epoch": 1.031032506345697, "grad_norm": 3.3891904081042243, "learning_rate": 1.0311942036896623e-06, "loss": 0.1365, "step": 3935 }, { "epoch": 1.0323425857692623, "grad_norm": 4.255144512646531, "learning_rate": 1.0290285709444556e-06, "loss": 0.1947, "step": 3940 }, { "epoch": 1.0336526651928273, "grad_norm": 3.551773880971244, "learning_rate": 1.0268628019327088e-06, "loss": 0.1691, "step": 3945 }, { "epoch": 1.0349627446163923, "grad_norm": 3.21813090542831, "learning_rate": 1.0246969068210217e-06, "loss": 0.1839, "step": 3950 }, { "epoch": 1.0362728240399575, "grad_norm": 2.166114985115052, "learning_rate": 1.022530895776586e-06, "loss": 0.1386, "step": 3955 }, { "epoch": 1.0375829034635224, "grad_norm": 3.7664288882285573, "learning_rate": 1.0203647789671364e-06, "loss": 0.1829, "step": 3960 }, { "epoch": 1.0388929828870874, "grad_norm": 4.568624206840217, "learning_rate": 1.0181985665609051e-06, "loss": 0.1606, "step": 3965 }, { "epoch": 1.0402030623106526, "grad_norm": 4.12795033279393, "learning_rate": 1.0160322687265728e-06, "loss": 0.2144, "step": 3970 }, { "epoch": 1.0415131417342176, "grad_norm": 6.066279389724659, "learning_rate": 1.013865895633221e-06, "loss": 0.153, "step": 3975 }, { "epoch": 1.0428232211577826, "grad_norm": 5.99335728744553, "learning_rate": 1.0116994574502853e-06, "loss": 0.1776, "step": 3980 }, { "epoch": 1.0441333005813478, "grad_norm": 3.323982778252669, "learning_rate": 1.0095329643475056e-06, "loss": 0.1258, "step": 3985 }, { "epoch": 1.0454433800049128, "grad_norm": 4.0821616859221415, "learning_rate": 1.0073664264948803e-06, "loss": 0.141, "step": 3990 }, { "epoch": 1.0467534594284778, "grad_norm": 4.303578582045898, "learning_rate": 1.005199854062618e-06, "loss": 0.1888, "step": 3995 }, { "epoch": 1.048063538852043, "grad_norm": 6.9723723361771865, "learning_rate": 1.0030332572210896e-06, "loss": 0.1624, "step": 4000 }, { "epoch": 1.048063538852043, "eval_accuracy": 0.748, "eval_loss": 1.1003224849700928, "eval_runtime": 138.6989, "eval_samples_per_second": 9.012, "eval_steps_per_second": 2.257, "step": 4000 }, { "epoch": 1.049373618275608, "grad_norm": 4.415041744009451, "learning_rate": 1.00086664614078e-06, "loss": 0.167, "step": 4005 }, { "epoch": 1.050683697699173, "grad_norm": 1.7135670096102869, "learning_rate": 9.987000309922417e-07, "loss": 0.1711, "step": 4010 }, { "epoch": 1.0519937771227381, "grad_norm": 7.478625018041275, "learning_rate": 9.965334219460455e-07, "loss": 0.1731, "step": 4015 }, { "epoch": 1.053303856546303, "grad_norm": 2.7314624626438206, "learning_rate": 9.943668291727344e-07, "loss": 0.1859, "step": 4020 }, { "epoch": 1.054613935969868, "grad_norm": 5.770554889506936, "learning_rate": 9.922002628427742e-07, "loss": 0.1597, "step": 4025 }, { "epoch": 1.0559240153934333, "grad_norm": 3.3610794660047834, "learning_rate": 9.900337331265077e-07, "loss": 0.187, "step": 4030 }, { "epoch": 1.0572340948169983, "grad_norm": 5.42834738035381, "learning_rate": 9.878672501941045e-07, "loss": 0.1698, "step": 4035 }, { "epoch": 1.0585441742405632, "grad_norm": 2.9976736613034665, "learning_rate": 9.857008242155152e-07, "loss": 0.1254, "step": 4040 }, { "epoch": 1.0598542536641284, "grad_norm": 4.911495425759969, "learning_rate": 9.83534465360423e-07, "loss": 0.136, "step": 4045 }, { "epoch": 1.0611643330876934, "grad_norm": 3.02741302534027, "learning_rate": 9.813681837981966e-07, "loss": 0.1938, "step": 4050 }, { "epoch": 1.0624744125112584, "grad_norm": 11.241222743138435, "learning_rate": 9.792019896978412e-07, "loss": 0.1745, "step": 4055 }, { "epoch": 1.0637844919348236, "grad_norm": 3.613601159548242, "learning_rate": 9.77035893227951e-07, "loss": 0.1792, "step": 4060 }, { "epoch": 1.0650945713583886, "grad_norm": 2.560108792032753, "learning_rate": 9.748699045566625e-07, "loss": 0.173, "step": 4065 }, { "epoch": 1.0664046507819536, "grad_norm": 4.2940503388019495, "learning_rate": 9.727040338516066e-07, "loss": 0.1496, "step": 4070 }, { "epoch": 1.0677147302055188, "grad_norm": 4.675724374175031, "learning_rate": 9.705382912798596e-07, "loss": 0.2138, "step": 4075 }, { "epoch": 1.0690248096290837, "grad_norm": 5.628200856430463, "learning_rate": 9.683726870078971e-07, "loss": 0.2194, "step": 4080 }, { "epoch": 1.0703348890526487, "grad_norm": 5.2076046235205125, "learning_rate": 9.662072312015445e-07, "loss": 0.2401, "step": 4085 }, { "epoch": 1.071644968476214, "grad_norm": 1.9418784608600421, "learning_rate": 9.640419340259311e-07, "loss": 0.1514, "step": 4090 }, { "epoch": 1.072955047899779, "grad_norm": 7.120141596864124, "learning_rate": 9.618768056454415e-07, "loss": 0.157, "step": 4095 }, { "epoch": 1.074265127323344, "grad_norm": 3.705216890309473, "learning_rate": 9.597118562236679e-07, "loss": 0.1456, "step": 4100 }, { "epoch": 1.074265127323344, "eval_accuracy": 0.7608, "eval_loss": 0.9698547124862671, "eval_runtime": 139.0187, "eval_samples_per_second": 8.992, "eval_steps_per_second": 2.251, "step": 4100 }, { "epoch": 1.075575206746909, "grad_norm": 3.085456133975227, "learning_rate": 9.575470959233612e-07, "loss": 0.1856, "step": 4105 }, { "epoch": 1.076885286170474, "grad_norm": 1.823222530695256, "learning_rate": 9.553825349063864e-07, "loss": 0.1667, "step": 4110 }, { "epoch": 1.078195365594039, "grad_norm": 3.7383958453668096, "learning_rate": 9.532181833336721e-07, "loss": 0.1391, "step": 4115 }, { "epoch": 1.0795054450176043, "grad_norm": 3.8448773992307514, "learning_rate": 9.510540513651637e-07, "loss": 0.1542, "step": 4120 }, { "epoch": 1.0808155244411692, "grad_norm": 4.78668688388694, "learning_rate": 9.488901491597761e-07, "loss": 0.1696, "step": 4125 }, { "epoch": 1.0821256038647342, "grad_norm": 3.2055677447348923, "learning_rate": 9.46726486875345e-07, "loss": 0.2188, "step": 4130 }, { "epoch": 1.0834356832882994, "grad_norm": 5.891308978767123, "learning_rate": 9.445630746685806e-07, "loss": 0.1885, "step": 4135 }, { "epoch": 1.0847457627118644, "grad_norm": 4.8421132337125545, "learning_rate": 9.423999226950185e-07, "loss": 0.1609, "step": 4140 }, { "epoch": 1.0860558421354294, "grad_norm": 5.9154276009344136, "learning_rate": 9.402370411089732e-07, "loss": 0.1527, "step": 4145 }, { "epoch": 1.0873659215589946, "grad_norm": 7.1723306345970865, "learning_rate": 9.380744400634903e-07, "loss": 0.1594, "step": 4150 }, { "epoch": 1.0886760009825596, "grad_norm": 3.6603352773642106, "learning_rate": 9.35912129710297e-07, "loss": 0.1706, "step": 4155 }, { "epoch": 1.0899860804061245, "grad_norm": 1.8190671058733112, "learning_rate": 9.337501201997573e-07, "loss": 0.1687, "step": 4160 }, { "epoch": 1.0912961598296897, "grad_norm": 6.103828451150774, "learning_rate": 9.315884216808226e-07, "loss": 0.1543, "step": 4165 }, { "epoch": 1.0926062392532547, "grad_norm": 3.51452822849322, "learning_rate": 9.294270443009847e-07, "loss": 0.168, "step": 4170 }, { "epoch": 1.0939163186768197, "grad_norm": 3.9256660164579267, "learning_rate": 9.27265998206227e-07, "loss": 0.1447, "step": 4175 }, { "epoch": 1.095226398100385, "grad_norm": 6.647259650468985, "learning_rate": 9.251052935409783e-07, "loss": 0.219, "step": 4180 }, { "epoch": 1.09653647752395, "grad_norm": 2.8205270117655767, "learning_rate": 9.229449404480653e-07, "loss": 0.1496, "step": 4185 }, { "epoch": 1.0978465569475149, "grad_norm": 6.128991520594209, "learning_rate": 9.207849490686636e-07, "loss": 0.2047, "step": 4190 }, { "epoch": 1.09915663637108, "grad_norm": 4.140511690244823, "learning_rate": 9.186253295422514e-07, "loss": 0.2245, "step": 4195 }, { "epoch": 1.100466715794645, "grad_norm": 2.0987997572217303, "learning_rate": 9.1646609200656e-07, "loss": 0.1966, "step": 4200 }, { "epoch": 1.100466715794645, "eval_accuracy": 0.7504, "eval_loss": 0.9615470767021179, "eval_runtime": 145.8533, "eval_samples_per_second": 8.57, "eval_steps_per_second": 2.146, "step": 4200 }, { "epoch": 1.10177679521821, "grad_norm": 6.7544463505960195, "learning_rate": 9.14307246597529e-07, "loss": 0.1695, "step": 4205 }, { "epoch": 1.1030868746417752, "grad_norm": 3.881058148850053, "learning_rate": 9.121488034492568e-07, "loss": 0.1736, "step": 4210 }, { "epoch": 1.1043969540653402, "grad_norm": 5.101818081783207, "learning_rate": 9.099907726939533e-07, "loss": 0.2124, "step": 4215 }, { "epoch": 1.1057070334889052, "grad_norm": 4.13924770593388, "learning_rate": 9.078331644618934e-07, "loss": 0.149, "step": 4220 }, { "epoch": 1.1070171129124704, "grad_norm": 4.56496678907466, "learning_rate": 9.056759888813668e-07, "loss": 0.1696, "step": 4225 }, { "epoch": 1.1083271923360354, "grad_norm": 6.020881682990206, "learning_rate": 9.035192560786338e-07, "loss": 0.2085, "step": 4230 }, { "epoch": 1.1096372717596004, "grad_norm": 3.020245719833497, "learning_rate": 9.013629761778757e-07, "loss": 0.1503, "step": 4235 }, { "epoch": 1.1109473511831656, "grad_norm": 5.745920685976253, "learning_rate": 8.99207159301148e-07, "loss": 0.1883, "step": 4240 }, { "epoch": 1.1122574306067305, "grad_norm": 6.338038152438336, "learning_rate": 8.970518155683324e-07, "loss": 0.1612, "step": 4245 }, { "epoch": 1.1135675100302955, "grad_norm": 4.337625038324552, "learning_rate": 8.948969550970894e-07, "loss": 0.1276, "step": 4250 }, { "epoch": 1.1148775894538607, "grad_norm": 5.983331324725646, "learning_rate": 8.927425880028113e-07, "loss": 0.1572, "step": 4255 }, { "epoch": 1.1161876688774257, "grad_norm": 5.471826595967007, "learning_rate": 8.905887243985743e-07, "loss": 0.1733, "step": 4260 }, { "epoch": 1.1174977483009907, "grad_norm": 9.369736563628495, "learning_rate": 8.884353743950915e-07, "loss": 0.1768, "step": 4265 }, { "epoch": 1.118807827724556, "grad_norm": 4.302075004357904, "learning_rate": 8.862825481006637e-07, "loss": 0.1676, "step": 4270 }, { "epoch": 1.1201179071481209, "grad_norm": 3.995934439864761, "learning_rate": 8.841302556211348e-07, "loss": 0.1556, "step": 4275 }, { "epoch": 1.1214279865716859, "grad_norm": 5.71645655203888, "learning_rate": 8.81978507059842e-07, "loss": 0.173, "step": 4280 }, { "epoch": 1.122738065995251, "grad_norm": 3.863778158057957, "learning_rate": 8.798273125175697e-07, "loss": 0.1905, "step": 4285 }, { "epoch": 1.124048145418816, "grad_norm": 8.652156181110675, "learning_rate": 8.776766820925016e-07, "loss": 0.2137, "step": 4290 }, { "epoch": 1.125358224842381, "grad_norm": 4.502859011012491, "learning_rate": 8.755266258801725e-07, "loss": 0.1615, "step": 4295 }, { "epoch": 1.1266683042659462, "grad_norm": 6.956098402947135, "learning_rate": 8.73377153973423e-07, "loss": 0.203, "step": 4300 }, { "epoch": 1.1266683042659462, "eval_accuracy": 0.7576, "eval_loss": 1.0583382844924927, "eval_runtime": 149.7241, "eval_samples_per_second": 8.349, "eval_steps_per_second": 2.091, "step": 4300 }, { "epoch": 1.1279783836895112, "grad_norm": 4.881012007634449, "learning_rate": 8.712282764623495e-07, "loss": 0.1625, "step": 4305 }, { "epoch": 1.1292884631130762, "grad_norm": 2.1915201124191523, "learning_rate": 8.690800034342593e-07, "loss": 0.1598, "step": 4310 }, { "epoch": 1.1305985425366414, "grad_norm": 6.988605240677998, "learning_rate": 8.669323449736223e-07, "loss": 0.1763, "step": 4315 }, { "epoch": 1.1319086219602064, "grad_norm": 8.099073728074591, "learning_rate": 8.647853111620213e-07, "loss": 0.2026, "step": 4320 }, { "epoch": 1.1332187013837713, "grad_norm": 6.4285970895379, "learning_rate": 8.626389120781096e-07, "loss": 0.1622, "step": 4325 }, { "epoch": 1.1345287808073365, "grad_norm": 2.9631902369392145, "learning_rate": 8.604931577975591e-07, "loss": 0.1983, "step": 4330 }, { "epoch": 1.1358388602309015, "grad_norm": 1.8505065261163445, "learning_rate": 8.583480583930162e-07, "loss": 0.1276, "step": 4335 }, { "epoch": 1.1371489396544665, "grad_norm": 3.4127484907896175, "learning_rate": 8.562036239340519e-07, "loss": 0.1559, "step": 4340 }, { "epoch": 1.1384590190780317, "grad_norm": 4.817799074804773, "learning_rate": 8.540598644871166e-07, "loss": 0.2032, "step": 4345 }, { "epoch": 1.1397690985015967, "grad_norm": 3.9208306819766907, "learning_rate": 8.519167901154915e-07, "loss": 0.1249, "step": 4350 }, { "epoch": 1.1410791779251617, "grad_norm": 3.258083851351543, "learning_rate": 8.497744108792429e-07, "loss": 0.167, "step": 4355 }, { "epoch": 1.1423892573487269, "grad_norm": 5.712624458894274, "learning_rate": 8.476327368351731e-07, "loss": 0.1821, "step": 4360 }, { "epoch": 1.1436993367722919, "grad_norm": 5.622711654282518, "learning_rate": 8.454917780367738e-07, "loss": 0.1426, "step": 4365 }, { "epoch": 1.1450094161958568, "grad_norm": 2.9714920706791603, "learning_rate": 8.433515445341798e-07, "loss": 0.1508, "step": 4370 }, { "epoch": 1.146319495619422, "grad_norm": 8.145950090881742, "learning_rate": 8.412120463741213e-07, "loss": 0.1911, "step": 4375 }, { "epoch": 1.147629575042987, "grad_norm": 7.816310979213919, "learning_rate": 8.390732935998762e-07, "loss": 0.1972, "step": 4380 }, { "epoch": 1.148939654466552, "grad_norm": 10.997788887744328, "learning_rate": 8.369352962512241e-07, "loss": 0.2195, "step": 4385 }, { "epoch": 1.1502497338901172, "grad_norm": 2.7493033712936668, "learning_rate": 8.347980643643972e-07, "loss": 0.1853, "step": 4390 }, { "epoch": 1.1515598133136822, "grad_norm": 3.181837621729024, "learning_rate": 8.326616079720356e-07, "loss": 0.1779, "step": 4395 }, { "epoch": 1.1528698927372472, "grad_norm": 3.4603211671345084, "learning_rate": 8.305259371031385e-07, "loss": 0.1975, "step": 4400 }, { "epoch": 1.1528698927372472, "eval_accuracy": 0.756, "eval_loss": 0.9896759986877441, "eval_runtime": 146.8037, "eval_samples_per_second": 8.515, "eval_steps_per_second": 2.132, "step": 4400 }, { "epoch": 1.1541799721608124, "grad_norm": 6.84507170824882, "learning_rate": 8.283910617830185e-07, "loss": 0.2055, "step": 4405 }, { "epoch": 1.1554900515843773, "grad_norm": 2.9623090673688295, "learning_rate": 8.262569920332522e-07, "loss": 0.1344, "step": 4410 }, { "epoch": 1.1568001310079423, "grad_norm": 2.676649590370874, "learning_rate": 8.241237378716357e-07, "loss": 0.1341, "step": 4415 }, { "epoch": 1.1581102104315075, "grad_norm": 3.3462527937569284, "learning_rate": 8.219913093121367e-07, "loss": 0.1479, "step": 4420 }, { "epoch": 1.1594202898550725, "grad_norm": 4.922809803079367, "learning_rate": 8.198597163648466e-07, "loss": 0.1377, "step": 4425 }, { "epoch": 1.1607303692786375, "grad_norm": 7.567987448628956, "learning_rate": 8.177289690359354e-07, "loss": 0.2551, "step": 4430 }, { "epoch": 1.1620404487022027, "grad_norm": 4.269347526317818, "learning_rate": 8.155990773276022e-07, "loss": 0.1511, "step": 4435 }, { "epoch": 1.1633505281257677, "grad_norm": 5.196415940319941, "learning_rate": 8.134700512380304e-07, "loss": 0.2124, "step": 4440 }, { "epoch": 1.1646606075493326, "grad_norm": 4.346263990105768, "learning_rate": 8.113419007613399e-07, "loss": 0.1708, "step": 4445 }, { "epoch": 1.1659706869728979, "grad_norm": 4.993849508082759, "learning_rate": 8.092146358875405e-07, "loss": 0.147, "step": 4450 }, { "epoch": 1.1672807663964628, "grad_norm": 4.5209534974011945, "learning_rate": 8.070882666024847e-07, "loss": 0.1311, "step": 4455 }, { "epoch": 1.1685908458200278, "grad_norm": 9.122486214875625, "learning_rate": 8.049628028878199e-07, "loss": 0.179, "step": 4460 }, { "epoch": 1.169900925243593, "grad_norm": 4.302140098325049, "learning_rate": 8.02838254720944e-07, "loss": 0.1912, "step": 4465 }, { "epoch": 1.171211004667158, "grad_norm": 5.885855438044166, "learning_rate": 8.007146320749565e-07, "loss": 0.209, "step": 4470 }, { "epoch": 1.172521084090723, "grad_norm": 5.113483218057625, "learning_rate": 7.985919449186122e-07, "loss": 0.138, "step": 4475 }, { "epoch": 1.1738311635142882, "grad_norm": 7.107404436549728, "learning_rate": 7.964702032162748e-07, "loss": 0.1443, "step": 4480 }, { "epoch": 1.1751412429378532, "grad_norm": 5.613324141288739, "learning_rate": 7.943494169278694e-07, "loss": 0.1659, "step": 4485 }, { "epoch": 1.1764513223614181, "grad_norm": 7.621228727692981, "learning_rate": 7.922295960088366e-07, "loss": 0.2055, "step": 4490 }, { "epoch": 1.1777614017849831, "grad_norm": 5.157419189737596, "learning_rate": 7.901107504100851e-07, "loss": 0.1951, "step": 4495 }, { "epoch": 1.1790714812085483, "grad_norm": 6.255459111664694, "learning_rate": 7.879928900779455e-07, "loss": 0.1878, "step": 4500 }, { "epoch": 1.1790714812085483, "eval_accuracy": 0.7544, "eval_loss": 1.0830539464950562, "eval_runtime": 147.9333, "eval_samples_per_second": 8.45, "eval_steps_per_second": 2.116, "step": 4500 }, { "epoch": 1.1803815606321133, "grad_norm": 5.071030425233318, "learning_rate": 7.858760249541227e-07, "loss": 0.1376, "step": 4505 }, { "epoch": 1.1816916400556783, "grad_norm": 4.613895284146258, "learning_rate": 7.837601649756507e-07, "loss": 0.1871, "step": 4510 }, { "epoch": 1.1830017194792435, "grad_norm": 5.343062025394728, "learning_rate": 7.816453200748445e-07, "loss": 0.1557, "step": 4515 }, { "epoch": 1.1843117989028085, "grad_norm": 2.3488119081864878, "learning_rate": 7.795315001792545e-07, "loss": 0.1275, "step": 4520 }, { "epoch": 1.1856218783263734, "grad_norm": 7.211896790191278, "learning_rate": 7.774187152116195e-07, "loss": 0.1795, "step": 4525 }, { "epoch": 1.1869319577499386, "grad_norm": 6.629116565849999, "learning_rate": 7.753069750898195e-07, "loss": 0.1694, "step": 4530 }, { "epoch": 1.1882420371735036, "grad_norm": 6.7686458510507945, "learning_rate": 7.731962897268304e-07, "loss": 0.1823, "step": 4535 }, { "epoch": 1.1895521165970686, "grad_norm": 9.499227541023666, "learning_rate": 7.710866690306767e-07, "loss": 0.1973, "step": 4540 }, { "epoch": 1.1908621960206338, "grad_norm": 5.305703998887908, "learning_rate": 7.689781229043852e-07, "loss": 0.1417, "step": 4545 }, { "epoch": 1.1921722754441988, "grad_norm": 6.807389274265624, "learning_rate": 7.668706612459386e-07, "loss": 0.1309, "step": 4550 }, { "epoch": 1.1934823548677638, "grad_norm": 2.37846342555138, "learning_rate": 7.647642939482276e-07, "loss": 0.2224, "step": 4555 }, { "epoch": 1.194792434291329, "grad_norm": 7.282824648401145, "learning_rate": 7.626590308990073e-07, "loss": 0.1746, "step": 4560 }, { "epoch": 1.196102513714894, "grad_norm": 10.05223632714189, "learning_rate": 7.605548819808485e-07, "loss": 0.1777, "step": 4565 }, { "epoch": 1.197412593138459, "grad_norm": 3.5055904042182653, "learning_rate": 7.584518570710923e-07, "loss": 0.182, "step": 4570 }, { "epoch": 1.1987226725620241, "grad_norm": 5.58019032163587, "learning_rate": 7.56349966041803e-07, "loss": 0.1708, "step": 4575 }, { "epoch": 1.2000327519855891, "grad_norm": 4.895119266207443, "learning_rate": 7.542492187597227e-07, "loss": 0.1614, "step": 4580 }, { "epoch": 1.201342831409154, "grad_norm": 6.978583024065426, "learning_rate": 7.52149625086224e-07, "loss": 0.1561, "step": 4585 }, { "epoch": 1.2026529108327193, "grad_norm": 6.610019216750305, "learning_rate": 7.500511948772649e-07, "loss": 0.1557, "step": 4590 }, { "epoch": 1.2039629902562843, "grad_norm": 8.135947467914335, "learning_rate": 7.479539379833417e-07, "loss": 0.1616, "step": 4595 }, { "epoch": 1.2052730696798493, "grad_norm": 3.5915184810255667, "learning_rate": 7.458578642494417e-07, "loss": 0.1177, "step": 4600 }, { "epoch": 1.2052730696798493, "eval_accuracy": 0.7656, "eval_loss": 1.0870707035064697, "eval_runtime": 143.0548, "eval_samples_per_second": 8.738, "eval_steps_per_second": 2.188, "step": 4600 }, { "epoch": 1.2065831491034145, "grad_norm": 8.379654070687748, "learning_rate": 7.437629835149997e-07, "loss": 0.1494, "step": 4605 }, { "epoch": 1.2078932285269794, "grad_norm": 5.7531869850518165, "learning_rate": 7.416693056138496e-07, "loss": 0.15, "step": 4610 }, { "epoch": 1.2092033079505444, "grad_norm": 4.807799748254105, "learning_rate": 7.395768403741793e-07, "loss": 0.1665, "step": 4615 }, { "epoch": 1.2105133873741096, "grad_norm": 5.192570962379375, "learning_rate": 7.37485597618484e-07, "loss": 0.1866, "step": 4620 }, { "epoch": 1.2118234667976746, "grad_norm": 7.151185425257774, "learning_rate": 7.353955871635194e-07, "loss": 0.1781, "step": 4625 }, { "epoch": 1.2131335462212396, "grad_norm": 7.814709553590773, "learning_rate": 7.33306818820258e-07, "loss": 0.1362, "step": 4630 }, { "epoch": 1.2144436256448048, "grad_norm": 4.846484187047676, "learning_rate": 7.312193023938411e-07, "loss": 0.1624, "step": 4635 }, { "epoch": 1.2157537050683698, "grad_norm": 4.026530959170853, "learning_rate": 7.291330476835327e-07, "loss": 0.1428, "step": 4640 }, { "epoch": 1.2170637844919348, "grad_norm": 4.135011637724928, "learning_rate": 7.270480644826749e-07, "loss": 0.1685, "step": 4645 }, { "epoch": 1.2183738639155, "grad_norm": 4.129753177959848, "learning_rate": 7.249643625786396e-07, "loss": 0.1385, "step": 4650 }, { "epoch": 1.219683943339065, "grad_norm": 3.5058969060009972, "learning_rate": 7.228819517527853e-07, "loss": 0.1573, "step": 4655 }, { "epoch": 1.22099402276263, "grad_norm": 5.141143930253066, "learning_rate": 7.208008417804097e-07, "loss": 0.1667, "step": 4660 }, { "epoch": 1.2223041021861951, "grad_norm": 5.713198603618221, "learning_rate": 7.18721042430704e-07, "loss": 0.1665, "step": 4665 }, { "epoch": 1.22361418160976, "grad_norm": 6.881035052397601, "learning_rate": 7.166425634667061e-07, "loss": 0.0995, "step": 4670 }, { "epoch": 1.224924261033325, "grad_norm": 9.101390365069749, "learning_rate": 7.14565414645257e-07, "loss": 0.1738, "step": 4675 }, { "epoch": 1.2262343404568903, "grad_norm": 4.609339453067137, "learning_rate": 7.124896057169532e-07, "loss": 0.1568, "step": 4680 }, { "epoch": 1.2275444198804553, "grad_norm": 4.95075728149686, "learning_rate": 7.104151464261012e-07, "loss": 0.1443, "step": 4685 }, { "epoch": 1.2288544993040202, "grad_norm": 5.718250117967298, "learning_rate": 7.083420465106727e-07, "loss": 0.145, "step": 4690 }, { "epoch": 1.2301645787275854, "grad_norm": 4.025790690698405, "learning_rate": 7.062703157022571e-07, "loss": 0.2297, "step": 4695 }, { "epoch": 1.2314746581511504, "grad_norm": 5.85692684018953, "learning_rate": 7.041999637260179e-07, "loss": 0.1599, "step": 4700 }, { "epoch": 1.2314746581511504, "eval_accuracy": 0.7528, "eval_loss": 1.1270846128463745, "eval_runtime": 144.8062, "eval_samples_per_second": 8.632, "eval_steps_per_second": 2.162, "step": 4700 }, { "epoch": 1.2327847375747154, "grad_norm": 3.639396650071254, "learning_rate": 7.021310003006458e-07, "loss": 0.1767, "step": 4705 }, { "epoch": 1.2340948169982806, "grad_norm": 4.992262473685122, "learning_rate": 7.00063435138313e-07, "loss": 0.1965, "step": 4710 }, { "epoch": 1.2354048964218456, "grad_norm": 4.404778717860058, "learning_rate": 6.979972779446288e-07, "loss": 0.1772, "step": 4715 }, { "epoch": 1.2367149758454106, "grad_norm": 6.471861850114097, "learning_rate": 6.959325384185916e-07, "loss": 0.1849, "step": 4720 }, { "epoch": 1.2380250552689758, "grad_norm": 4.178248349318392, "learning_rate": 6.938692262525463e-07, "loss": 0.1845, "step": 4725 }, { "epoch": 1.2393351346925408, "grad_norm": 3.328304867825855, "learning_rate": 6.918073511321372e-07, "loss": 0.1609, "step": 4730 }, { "epoch": 1.2406452141161057, "grad_norm": 6.825787570815529, "learning_rate": 6.897469227362626e-07, "loss": 0.2165, "step": 4735 }, { "epoch": 1.241955293539671, "grad_norm": 3.6473987849811573, "learning_rate": 6.876879507370296e-07, "loss": 0.1681, "step": 4740 }, { "epoch": 1.243265372963236, "grad_norm": 5.7577805362937395, "learning_rate": 6.856304447997087e-07, "loss": 0.1393, "step": 4745 }, { "epoch": 1.244575452386801, "grad_norm": 2.988983071415241, "learning_rate": 6.835744145826883e-07, "loss": 0.1293, "step": 4750 }, { "epoch": 1.245885531810366, "grad_norm": 4.094951732166031, "learning_rate": 6.815198697374295e-07, "loss": 0.1986, "step": 4755 }, { "epoch": 1.247195611233931, "grad_norm": 5.3643705834327555, "learning_rate": 6.794668199084211e-07, "loss": 0.1561, "step": 4760 }, { "epoch": 1.248505690657496, "grad_norm": 6.488709549888938, "learning_rate": 6.774152747331327e-07, "loss": 0.1506, "step": 4765 }, { "epoch": 1.2498157700810613, "grad_norm": 8.658130020703828, "learning_rate": 6.753652438419724e-07, "loss": 0.1462, "step": 4770 }, { "epoch": 1.2511258495046262, "grad_norm": 3.6729089118316143, "learning_rate": 6.733167368582387e-07, "loss": 0.1754, "step": 4775 }, { "epoch": 1.2524359289281912, "grad_norm": 4.412486038113313, "learning_rate": 6.71269763398077e-07, "loss": 0.1524, "step": 4780 }, { "epoch": 1.2537460083517562, "grad_norm": 7.05308577872481, "learning_rate": 6.692243330704345e-07, "loss": 0.1955, "step": 4785 }, { "epoch": 1.2550560877753214, "grad_norm": 3.707641518167412, "learning_rate": 6.671804554770134e-07, "loss": 0.1519, "step": 4790 }, { "epoch": 1.2563661671988864, "grad_norm": 5.375891710975129, "learning_rate": 6.651381402122279e-07, "loss": 0.175, "step": 4795 }, { "epoch": 1.2576762466224514, "grad_norm": 5.686007779957575, "learning_rate": 6.630973968631582e-07, "loss": 0.1541, "step": 4800 }, { "epoch": 1.2576762466224514, "eval_accuracy": 0.7504, "eval_loss": 1.1022791862487793, "eval_runtime": 147.8534, "eval_samples_per_second": 8.454, "eval_steps_per_second": 2.117, "step": 4800 }, { "epoch": 1.2589863260460166, "grad_norm": 4.67723472749942, "learning_rate": 6.610582350095056e-07, "loss": 0.1378, "step": 4805 }, { "epoch": 1.2602964054695815, "grad_norm": 4.617066533461443, "learning_rate": 6.590206642235469e-07, "loss": 0.1512, "step": 4810 }, { "epoch": 1.2616064848931465, "grad_norm": 6.057922955780923, "learning_rate": 6.569846940700905e-07, "loss": 0.1826, "step": 4815 }, { "epoch": 1.2629165643167117, "grad_norm": 6.250001047253521, "learning_rate": 6.549503341064315e-07, "loss": 0.1458, "step": 4820 }, { "epoch": 1.2642266437402767, "grad_norm": 4.27455152251786, "learning_rate": 6.529175938823059e-07, "loss": 0.1333, "step": 4825 }, { "epoch": 1.2655367231638417, "grad_norm": 4.878557343525659, "learning_rate": 6.508864829398464e-07, "loss": 0.16, "step": 4830 }, { "epoch": 1.266846802587407, "grad_norm": 5.406320166959371, "learning_rate": 6.488570108135375e-07, "loss": 0.1777, "step": 4835 }, { "epoch": 1.2681568820109719, "grad_norm": 2.433179347244675, "learning_rate": 6.468291870301707e-07, "loss": 0.1715, "step": 4840 }, { "epoch": 1.2694669614345369, "grad_norm": 9.875918507487867, "learning_rate": 6.448030211087997e-07, "loss": 0.1599, "step": 4845 }, { "epoch": 1.270777040858102, "grad_norm": 6.6713245625990245, "learning_rate": 6.427785225606961e-07, "loss": 0.1406, "step": 4850 }, { "epoch": 1.272087120281667, "grad_norm": 5.027821637791478, "learning_rate": 6.40755700889305e-07, "loss": 0.187, "step": 4855 }, { "epoch": 1.273397199705232, "grad_norm": 2.7556993105477527, "learning_rate": 6.38734565590198e-07, "loss": 0.159, "step": 4860 }, { "epoch": 1.2747072791287972, "grad_norm": 5.364669134362788, "learning_rate": 6.367151261510324e-07, "loss": 0.2186, "step": 4865 }, { "epoch": 1.2760173585523622, "grad_norm": 3.5785337661276673, "learning_rate": 6.346973920515039e-07, "loss": 0.1364, "step": 4870 }, { "epoch": 1.2773274379759272, "grad_norm": 5.414858492393773, "learning_rate": 6.326813727633034e-07, "loss": 0.1825, "step": 4875 }, { "epoch": 1.2786375173994924, "grad_norm": 2.4129442164086536, "learning_rate": 6.306670777500718e-07, "loss": 0.1197, "step": 4880 }, { "epoch": 1.2799475968230574, "grad_norm": 5.264909047367014, "learning_rate": 6.286545164673555e-07, "loss": 0.2254, "step": 4885 }, { "epoch": 1.2812576762466223, "grad_norm": 2.0451302251453956, "learning_rate": 6.26643698362563e-07, "loss": 0.1347, "step": 4890 }, { "epoch": 1.2825677556701875, "grad_norm": 3.7829205343945733, "learning_rate": 6.246346328749199e-07, "loss": 0.1552, "step": 4895 }, { "epoch": 1.2838778350937525, "grad_norm": 5.594121118174163, "learning_rate": 6.226273294354247e-07, "loss": 0.1621, "step": 4900 }, { "epoch": 1.2838778350937525, "eval_accuracy": 0.7496, "eval_loss": 1.1256372928619385, "eval_runtime": 143.3507, "eval_samples_per_second": 8.72, "eval_steps_per_second": 2.183, "step": 4900 }, { "epoch": 1.2851879145173175, "grad_norm": 5.282433980211254, "learning_rate": 6.206217974668034e-07, "loss": 0.1379, "step": 4905 }, { "epoch": 1.2864979939408827, "grad_norm": 3.277253081058706, "learning_rate": 6.186180463834675e-07, "loss": 0.1338, "step": 4910 }, { "epoch": 1.2878080733644477, "grad_norm": 6.466429706589474, "learning_rate": 6.166160855914683e-07, "loss": 0.1542, "step": 4915 }, { "epoch": 1.2891181527880127, "grad_norm": 4.3752571537021625, "learning_rate": 6.146159244884533e-07, "loss": 0.204, "step": 4920 }, { "epoch": 1.2904282322115779, "grad_norm": 9.612960014494304, "learning_rate": 6.126175724636213e-07, "loss": 0.1666, "step": 4925 }, { "epoch": 1.2917383116351429, "grad_norm": 7.388843981840605, "learning_rate": 6.106210388976792e-07, "loss": 0.1676, "step": 4930 }, { "epoch": 1.2930483910587078, "grad_norm": 5.525130280779438, "learning_rate": 6.086263331627975e-07, "loss": 0.1371, "step": 4935 }, { "epoch": 1.294358470482273, "grad_norm": 15.7136065256586, "learning_rate": 6.066334646225669e-07, "loss": 0.2647, "step": 4940 }, { "epoch": 1.295668549905838, "grad_norm": 4.509787064035042, "learning_rate": 6.046424426319534e-07, "loss": 0.186, "step": 4945 }, { "epoch": 1.296978629329403, "grad_norm": 3.034726602811886, "learning_rate": 6.026532765372556e-07, "loss": 0.1689, "step": 4950 }, { "epoch": 1.2982887087529682, "grad_norm": 9.470147103767804, "learning_rate": 6.006659756760587e-07, "loss": 0.1738, "step": 4955 }, { "epoch": 1.2995987881765332, "grad_norm": 3.723471628172551, "learning_rate": 5.986805493771933e-07, "loss": 0.1699, "step": 4960 }, { "epoch": 1.3009088676000982, "grad_norm": 2.4954433173901998, "learning_rate": 5.966970069606905e-07, "loss": 0.1066, "step": 4965 }, { "epoch": 1.3022189470236634, "grad_norm": 3.8824997751755332, "learning_rate": 5.947153577377372e-07, "loss": 0.1243, "step": 4970 }, { "epoch": 1.3035290264472283, "grad_norm": 7.155179589969273, "learning_rate": 5.927356110106335e-07, "loss": 0.1868, "step": 4975 }, { "epoch": 1.3048391058707933, "grad_norm": 6.426034228727986, "learning_rate": 5.907577760727491e-07, "loss": 0.1749, "step": 4980 }, { "epoch": 1.3061491852943585, "grad_norm": 6.259326643585595, "learning_rate": 5.887818622084792e-07, "loss": 0.1687, "step": 4985 }, { "epoch": 1.3074592647179235, "grad_norm": 7.118570215949986, "learning_rate": 5.86807878693201e-07, "loss": 0.1945, "step": 4990 }, { "epoch": 1.3087693441414885, "grad_norm": 4.470114328159557, "learning_rate": 5.848358347932305e-07, "loss": 0.1279, "step": 4995 }, { "epoch": 1.3100794235650537, "grad_norm": 4.518469800001269, "learning_rate": 5.828657397657775e-07, "loss": 0.1581, "step": 5000 }, { "epoch": 1.3100794235650537, "eval_accuracy": 0.7664, "eval_loss": 1.0690715312957764, "eval_runtime": 138.0186, "eval_samples_per_second": 9.057, "eval_steps_per_second": 2.268, "step": 5000 }, { "epoch": 1.3113895029886187, "grad_norm": 3.118035615186372, "learning_rate": 5.808976028589052e-07, "loss": 0.148, "step": 5005 }, { "epoch": 1.3126995824121837, "grad_norm": 8.339043807140468, "learning_rate": 5.789314333114832e-07, "loss": 0.1599, "step": 5010 }, { "epoch": 1.3140096618357489, "grad_norm": 3.979545884384439, "learning_rate": 5.769672403531476e-07, "loss": 0.1862, "step": 5015 }, { "epoch": 1.3153197412593138, "grad_norm": 5.713823007971235, "learning_rate": 5.750050332042546e-07, "loss": 0.1493, "step": 5020 }, { "epoch": 1.3166298206828788, "grad_norm": 3.975891070180634, "learning_rate": 5.730448210758392e-07, "loss": 0.1615, "step": 5025 }, { "epoch": 1.317939900106444, "grad_norm": 9.895767819873718, "learning_rate": 5.710866131695707e-07, "loss": 0.1817, "step": 5030 }, { "epoch": 1.319249979530009, "grad_norm": 3.9920763618019306, "learning_rate": 5.691304186777112e-07, "loss": 0.1139, "step": 5035 }, { "epoch": 1.320560058953574, "grad_norm": 4.602626327223196, "learning_rate": 5.671762467830701e-07, "loss": 0.1388, "step": 5040 }, { "epoch": 1.3218701383771392, "grad_norm": 3.504924250719407, "learning_rate": 5.652241066589638e-07, "loss": 0.1349, "step": 5045 }, { "epoch": 1.3231802178007042, "grad_norm": 3.9727230916218694, "learning_rate": 5.6327400746917e-07, "loss": 0.1308, "step": 5050 }, { "epoch": 1.3244902972242691, "grad_norm": 5.966820662181034, "learning_rate": 5.613259583678855e-07, "loss": 0.1937, "step": 5055 }, { "epoch": 1.3258003766478343, "grad_norm": 4.982426614770313, "learning_rate": 5.593799684996851e-07, "loss": 0.0966, "step": 5060 }, { "epoch": 1.3271104560713993, "grad_norm": 11.15113796656988, "learning_rate": 5.574360469994755e-07, "loss": 0.1868, "step": 5065 }, { "epoch": 1.3284205354949643, "grad_norm": 4.032648149967243, "learning_rate": 5.55494202992455e-07, "loss": 0.1081, "step": 5070 }, { "epoch": 1.3297306149185295, "grad_norm": 13.254457078365599, "learning_rate": 5.535544455940685e-07, "loss": 0.198, "step": 5075 }, { "epoch": 1.3310406943420945, "grad_norm": 6.857347197358272, "learning_rate": 5.51616783909968e-07, "loss": 0.1458, "step": 5080 }, { "epoch": 1.3323507737656595, "grad_norm": 4.4329913576651565, "learning_rate": 5.496812270359651e-07, "loss": 0.1764, "step": 5085 }, { "epoch": 1.3336608531892247, "grad_norm": 1.9756285446680741, "learning_rate": 5.477477840579941e-07, "loss": 0.1328, "step": 5090 }, { "epoch": 1.3349709326127897, "grad_norm": 16.33866926710285, "learning_rate": 5.458164640520626e-07, "loss": 0.1688, "step": 5095 }, { "epoch": 1.3362810120363546, "grad_norm": 7.260662473486001, "learning_rate": 5.438872760842155e-07, "loss": 0.1475, "step": 5100 }, { "epoch": 1.3362810120363546, "eval_accuracy": 0.7632, "eval_loss": 1.166494607925415, "eval_runtime": 139.3544, "eval_samples_per_second": 8.97, "eval_steps_per_second": 2.246, "step": 5100 }, { "epoch": 1.3375910914599198, "grad_norm": 9.098282960735217, "learning_rate": 5.419602292104877e-07, "loss": 0.2249, "step": 5105 }, { "epoch": 1.3389011708834848, "grad_norm": 14.59920166671128, "learning_rate": 5.400353324768641e-07, "loss": 0.2254, "step": 5110 }, { "epoch": 1.3402112503070498, "grad_norm": 5.661845688109474, "learning_rate": 5.381125949192369e-07, "loss": 0.1491, "step": 5115 }, { "epoch": 1.341521329730615, "grad_norm": 4.343788864841793, "learning_rate": 5.361920255633608e-07, "loss": 0.1416, "step": 5120 }, { "epoch": 1.34283140915418, "grad_norm": 5.455096296340938, "learning_rate": 5.342736334248142e-07, "loss": 0.1591, "step": 5125 }, { "epoch": 1.344141488577745, "grad_norm": 6.62802098246343, "learning_rate": 5.323574275089542e-07, "loss": 0.1631, "step": 5130 }, { "epoch": 1.3454515680013102, "grad_norm": 5.475172079252899, "learning_rate": 5.304434168108768e-07, "loss": 0.1486, "step": 5135 }, { "epoch": 1.3467616474248751, "grad_norm": 3.7823792470345867, "learning_rate": 5.285316103153703e-07, "loss": 0.162, "step": 5140 }, { "epoch": 1.3480717268484401, "grad_norm": 3.950455595949725, "learning_rate": 5.266220169968789e-07, "loss": 0.1386, "step": 5145 }, { "epoch": 1.3493818062720053, "grad_norm": 7.064128257879523, "learning_rate": 5.247146458194558e-07, "loss": 0.1265, "step": 5150 }, { "epoch": 1.3506918856955703, "grad_norm": 3.451315722735635, "learning_rate": 5.228095057367244e-07, "loss": 0.1564, "step": 5155 }, { "epoch": 1.3520019651191353, "grad_norm": 6.376390408082125, "learning_rate": 5.209066056918336e-07, "loss": 0.1408, "step": 5160 }, { "epoch": 1.3533120445427005, "grad_norm": 10.020189583400871, "learning_rate": 5.190059546174173e-07, "loss": 0.1868, "step": 5165 }, { "epoch": 1.3546221239662655, "grad_norm": 10.402422967695797, "learning_rate": 5.171075614355531e-07, "loss": 0.1567, "step": 5170 }, { "epoch": 1.3559322033898304, "grad_norm": 6.325865551750877, "learning_rate": 5.152114350577183e-07, "loss": 0.1524, "step": 5175 }, { "epoch": 1.3572422828133957, "grad_norm": 9.377120804911332, "learning_rate": 5.133175843847507e-07, "loss": 0.2113, "step": 5180 }, { "epoch": 1.3585523622369606, "grad_norm": 5.0552106113831226, "learning_rate": 5.114260183068043e-07, "loss": 0.1793, "step": 5185 }, { "epoch": 1.3598624416605256, "grad_norm": 7.793559280526845, "learning_rate": 5.095367457033091e-07, "loss": 0.2107, "step": 5190 }, { "epoch": 1.3611725210840908, "grad_norm": 5.256871245122319, "learning_rate": 5.076497754429286e-07, "loss": 0.153, "step": 5195 }, { "epoch": 1.3624826005076558, "grad_norm": 7.340368996139302, "learning_rate": 5.0576511638352e-07, "loss": 0.1562, "step": 5200 }, { "epoch": 1.3624826005076558, "eval_accuracy": 0.7648, "eval_loss": 1.0087817907333374, "eval_runtime": 139.3617, "eval_samples_per_second": 8.969, "eval_steps_per_second": 2.246, "step": 5200 }, { "epoch": 1.3637926799312208, "grad_norm": 4.87245584125935, "learning_rate": 5.03882777372089e-07, "loss": 0.1542, "step": 5205 }, { "epoch": 1.365102759354786, "grad_norm": 4.198926991817569, "learning_rate": 5.020027672447531e-07, "loss": 0.1252, "step": 5210 }, { "epoch": 1.366412838778351, "grad_norm": 6.952393202633123, "learning_rate": 5.001250948266953e-07, "loss": 0.1858, "step": 5215 }, { "epoch": 1.367722918201916, "grad_norm": 2.4402795139976936, "learning_rate": 4.982497689321254e-07, "loss": 0.139, "step": 5220 }, { "epoch": 1.3690329976254811, "grad_norm": 3.952799970755654, "learning_rate": 4.963767983642391e-07, "loss": 0.1942, "step": 5225 }, { "epoch": 1.3703430770490461, "grad_norm": 3.1488846024801855, "learning_rate": 4.945061919151748e-07, "loss": 0.1268, "step": 5230 }, { "epoch": 1.371653156472611, "grad_norm": 3.8295556788809533, "learning_rate": 4.926379583659732e-07, "loss": 0.1492, "step": 5235 }, { "epoch": 1.3729632358961763, "grad_norm": 5.807740283147101, "learning_rate": 4.907721064865358e-07, "loss": 0.1764, "step": 5240 }, { "epoch": 1.3742733153197413, "grad_norm": 6.049197583837082, "learning_rate": 4.889086450355853e-07, "loss": 0.1335, "step": 5245 }, { "epoch": 1.3755833947433063, "grad_norm": 5.872325479624525, "learning_rate": 4.870475827606218e-07, "loss": 0.1875, "step": 5250 }, { "epoch": 1.3768934741668715, "grad_norm": 7.484940602815659, "learning_rate": 4.851889283978841e-07, "loss": 0.2242, "step": 5255 }, { "epoch": 1.3782035535904364, "grad_norm": 4.44039134698427, "learning_rate": 4.833326906723071e-07, "loss": 0.1884, "step": 5260 }, { "epoch": 1.3795136330140014, "grad_norm": 2.9457269887282584, "learning_rate": 4.814788782974814e-07, "loss": 0.1575, "step": 5265 }, { "epoch": 1.3808237124375666, "grad_norm": 4.142755818231746, "learning_rate": 4.796274999756134e-07, "loss": 0.1503, "step": 5270 }, { "epoch": 1.3821337918611316, "grad_norm": 4.231068410579535, "learning_rate": 4.777785643974822e-07, "loss": 0.1296, "step": 5275 }, { "epoch": 1.3834438712846966, "grad_norm": 7.276995612922505, "learning_rate": 4.7593208024240196e-07, "loss": 0.1793, "step": 5280 }, { "epoch": 1.3847539507082618, "grad_norm": 2.8458411818811973, "learning_rate": 4.740880561781766e-07, "loss": 0.097, "step": 5285 }, { "epoch": 1.3860640301318268, "grad_norm": 9.313608656005794, "learning_rate": 4.7224650086106444e-07, "loss": 0.1973, "step": 5290 }, { "epoch": 1.3873741095553918, "grad_norm": 8.399153217159018, "learning_rate": 4.7040742293573334e-07, "loss": 0.1789, "step": 5295 }, { "epoch": 1.388684188978957, "grad_norm": 5.871107449455294, "learning_rate": 4.6857083103522277e-07, "loss": 0.1899, "step": 5300 }, { "epoch": 1.388684188978957, "eval_accuracy": 0.7624, "eval_loss": 1.0121264457702637, "eval_runtime": 139.5539, "eval_samples_per_second": 8.957, "eval_steps_per_second": 2.243, "step": 5300 }, { "epoch": 1.389994268402522, "grad_norm": 3.8156581594965684, "learning_rate": 4.667367337809016e-07, "loss": 0.1204, "step": 5305 }, { "epoch": 1.391304347826087, "grad_norm": 4.985992975846861, "learning_rate": 4.6490513978242804e-07, "loss": 0.1319, "step": 5310 }, { "epoch": 1.3926144272496521, "grad_norm": 5.076467294818805, "learning_rate": 4.6307605763771076e-07, "loss": 0.1684, "step": 5315 }, { "epoch": 1.393924506673217, "grad_norm": 5.593144516901242, "learning_rate": 4.6124949593286523e-07, "loss": 0.2016, "step": 5320 }, { "epoch": 1.395234586096782, "grad_norm": 6.386164042680592, "learning_rate": 4.5942546324217803e-07, "loss": 0.1468, "step": 5325 }, { "epoch": 1.3965446655203473, "grad_norm": 6.108862203835904, "learning_rate": 4.576039681280608e-07, "loss": 0.1441, "step": 5330 }, { "epoch": 1.3978547449439123, "grad_norm": 3.9244812319321274, "learning_rate": 4.557850191410161e-07, "loss": 0.1768, "step": 5335 }, { "epoch": 1.3991648243674772, "grad_norm": 4.15439436004101, "learning_rate": 4.5396862481959243e-07, "loss": 0.1338, "step": 5340 }, { "epoch": 1.4004749037910424, "grad_norm": 5.897274197879647, "learning_rate": 4.521547936903477e-07, "loss": 0.1798, "step": 5345 }, { "epoch": 1.4017849832146074, "grad_norm": 6.090071523418443, "learning_rate": 4.5034353426780657e-07, "loss": 0.1729, "step": 5350 }, { "epoch": 1.4030950626381724, "grad_norm": 6.853061331028334, "learning_rate": 4.4853485505442133e-07, "loss": 0.1445, "step": 5355 }, { "epoch": 1.4044051420617376, "grad_norm": 4.835112562982259, "learning_rate": 4.4672876454053354e-07, "loss": 0.1255, "step": 5360 }, { "epoch": 1.4057152214853026, "grad_norm": 3.3603532935805713, "learning_rate": 4.449252712043311e-07, "loss": 0.1178, "step": 5365 }, { "epoch": 1.4070253009088676, "grad_norm": 5.621068344363656, "learning_rate": 4.431243835118124e-07, "loss": 0.1521, "step": 5370 }, { "epoch": 1.4083353803324328, "grad_norm": 6.177956693196569, "learning_rate": 4.4132610991674123e-07, "loss": 0.2011, "step": 5375 }, { "epoch": 1.4096454597559978, "grad_norm": 3.8093768021526095, "learning_rate": 4.3953045886061336e-07, "loss": 0.1414, "step": 5380 }, { "epoch": 1.4109555391795627, "grad_norm": 8.73859339392293, "learning_rate": 4.377374387726116e-07, "loss": 0.2335, "step": 5385 }, { "epoch": 1.412265618603128, "grad_norm": 7.4975778108062965, "learning_rate": 4.359470580695701e-07, "loss": 0.1395, "step": 5390 }, { "epoch": 1.413575698026693, "grad_norm": 8.260550346451351, "learning_rate": 4.341593251559319e-07, "loss": 0.1615, "step": 5395 }, { "epoch": 1.414885777450258, "grad_norm": 3.552748594627843, "learning_rate": 4.323742484237107e-07, "loss": 0.1378, "step": 5400 }, { "epoch": 1.414885777450258, "eval_accuracy": 0.7656, "eval_loss": 1.022687554359436, "eval_runtime": 142.9636, "eval_samples_per_second": 8.743, "eval_steps_per_second": 2.189, "step": 5400 }, { "epoch": 1.416195856873823, "grad_norm": 3.658068549302559, "learning_rate": 4.3059183625245275e-07, "loss": 0.1878, "step": 5405 }, { "epoch": 1.417505936297388, "grad_norm": 12.81739721678878, "learning_rate": 4.288120970091947e-07, "loss": 0.2519, "step": 5410 }, { "epoch": 1.418816015720953, "grad_norm": 6.806589788632898, "learning_rate": 4.270350390484274e-07, "loss": 0.1387, "step": 5415 }, { "epoch": 1.4201260951445183, "grad_norm": 3.558959244970237, "learning_rate": 4.2526067071205394e-07, "loss": 0.1574, "step": 5420 }, { "epoch": 1.4214361745680832, "grad_norm": 4.1991124986912665, "learning_rate": 4.234890003293522e-07, "loss": 0.1533, "step": 5425 }, { "epoch": 1.4227462539916482, "grad_norm": 5.57307236404189, "learning_rate": 4.2172003621693495e-07, "loss": 0.1435, "step": 5430 }, { "epoch": 1.4240563334152134, "grad_norm": 9.98468287385797, "learning_rate": 4.1995378667871206e-07, "loss": 0.1221, "step": 5435 }, { "epoch": 1.4253664128387784, "grad_norm": 4.021122095372401, "learning_rate": 4.1819026000584935e-07, "loss": 0.1356, "step": 5440 }, { "epoch": 1.4266764922623434, "grad_norm": 2.539120750400961, "learning_rate": 4.164294644767321e-07, "loss": 0.1386, "step": 5445 }, { "epoch": 1.4279865716859086, "grad_norm": 4.900068180108013, "learning_rate": 4.1467140835692403e-07, "loss": 0.1509, "step": 5450 }, { "epoch": 1.4292966511094736, "grad_norm": 8.043017914794532, "learning_rate": 4.1291609989912955e-07, "loss": 0.1282, "step": 5455 }, { "epoch": 1.4306067305330386, "grad_norm": 4.392158250344916, "learning_rate": 4.1116354734315596e-07, "loss": 0.1136, "step": 5460 }, { "epoch": 1.4319168099566038, "grad_norm": 4.14190663170065, "learning_rate": 4.0941375891587273e-07, "loss": 0.1398, "step": 5465 }, { "epoch": 1.4332268893801687, "grad_norm": 8.877219117463126, "learning_rate": 4.076667428311739e-07, "loss": 0.1529, "step": 5470 }, { "epoch": 1.4345369688037337, "grad_norm": 10.951781738101637, "learning_rate": 4.059225072899397e-07, "loss": 0.1794, "step": 5475 }, { "epoch": 1.435847048227299, "grad_norm": 3.5884378317825982, "learning_rate": 4.041810604799986e-07, "loss": 0.1287, "step": 5480 }, { "epoch": 1.437157127650864, "grad_norm": 7.646485652411928, "learning_rate": 4.0244241057608675e-07, "loss": 0.1526, "step": 5485 }, { "epoch": 1.4384672070744289, "grad_norm": 4.095820215840846, "learning_rate": 4.0070656573981263e-07, "loss": 0.1629, "step": 5490 }, { "epoch": 1.4397772864979939, "grad_norm": 4.619960063809711, "learning_rate": 3.9897353411961576e-07, "loss": 0.1631, "step": 5495 }, { "epoch": 1.441087365921559, "grad_norm": 7.65793128091754, "learning_rate": 3.9724332385073e-07, "loss": 0.1684, "step": 5500 }, { "epoch": 1.441087365921559, "eval_accuracy": 0.7616, "eval_loss": 1.1524358987808228, "eval_runtime": 140.9725, "eval_samples_per_second": 8.867, "eval_steps_per_second": 2.22, "step": 5500 }, { "epoch": 1.442397445345124, "grad_norm": 9.53033405655631, "learning_rate": 3.955159430551462e-07, "loss": 0.1856, "step": 5505 }, { "epoch": 1.443707524768689, "grad_norm": 6.635703636489727, "learning_rate": 3.937913998415716e-07, "loss": 0.1173, "step": 5510 }, { "epoch": 1.4450176041922542, "grad_norm": 2.8008935096487444, "learning_rate": 3.9206970230539484e-07, "loss": 0.1407, "step": 5515 }, { "epoch": 1.4463276836158192, "grad_norm": 7.567404117719152, "learning_rate": 3.90350858528644e-07, "loss": 0.1339, "step": 5520 }, { "epoch": 1.4476377630393842, "grad_norm": 5.684176798537522, "learning_rate": 3.886348765799535e-07, "loss": 0.1448, "step": 5525 }, { "epoch": 1.4489478424629494, "grad_norm": 7.48394265324762, "learning_rate": 3.8692176451452187e-07, "loss": 0.1873, "step": 5530 }, { "epoch": 1.4502579218865144, "grad_norm": 4.704500306984347, "learning_rate": 3.852115303740775e-07, "loss": 0.1384, "step": 5535 }, { "epoch": 1.4515680013100793, "grad_norm": 7.782133396015602, "learning_rate": 3.8350418218683656e-07, "loss": 0.1678, "step": 5540 }, { "epoch": 1.4528780807336446, "grad_norm": 7.950318860217571, "learning_rate": 3.817997279674707e-07, "loss": 0.1491, "step": 5545 }, { "epoch": 1.4541881601572095, "grad_norm": 4.885181041366676, "learning_rate": 3.800981757170647e-07, "loss": 0.1333, "step": 5550 }, { "epoch": 1.4554982395807745, "grad_norm": 4.5099336389227, "learning_rate": 3.7839953342308195e-07, "loss": 0.1649, "step": 5555 }, { "epoch": 1.4568083190043397, "grad_norm": 1.3392143606297127, "learning_rate": 3.767038090593262e-07, "loss": 0.1196, "step": 5560 }, { "epoch": 1.4581183984279047, "grad_norm": 6.801138336879372, "learning_rate": 3.7501101058590156e-07, "loss": 0.1303, "step": 5565 }, { "epoch": 1.4594284778514697, "grad_norm": 6.231994683303187, "learning_rate": 3.733211459491802e-07, "loss": 0.1275, "step": 5570 }, { "epoch": 1.4607385572750349, "grad_norm": 7.481895592566931, "learning_rate": 3.716342230817598e-07, "loss": 0.1563, "step": 5575 }, { "epoch": 1.4620486366985999, "grad_norm": 5.294171079568994, "learning_rate": 3.6995024990243097e-07, "loss": 0.1615, "step": 5580 }, { "epoch": 1.4633587161221648, "grad_norm": 4.889827410342588, "learning_rate": 3.682692343161361e-07, "loss": 0.1409, "step": 5585 }, { "epoch": 1.46466879554573, "grad_norm": 4.755475192052327, "learning_rate": 3.6659118421393454e-07, "loss": 0.2151, "step": 5590 }, { "epoch": 1.465978874969295, "grad_norm": 8.146609131839343, "learning_rate": 3.6491610747296464e-07, "loss": 0.167, "step": 5595 }, { "epoch": 1.46728895439286, "grad_norm": 11.571129826043965, "learning_rate": 3.632440119564084e-07, "loss": 0.1526, "step": 5600 }, { "epoch": 1.46728895439286, "eval_accuracy": 0.7632, "eval_loss": 1.1522161960601807, "eval_runtime": 140.4528, "eval_samples_per_second": 8.9, "eval_steps_per_second": 2.229, "step": 5600 }, { "epoch": 1.4685990338164252, "grad_norm": 13.388610986617053, "learning_rate": 3.615749055134516e-07, "loss": 0.1434, "step": 5605 }, { "epoch": 1.4699091132399902, "grad_norm": 4.611773903177833, "learning_rate": 3.5990879597925015e-07, "loss": 0.1593, "step": 5610 }, { "epoch": 1.4712191926635552, "grad_norm": 5.209036065178781, "learning_rate": 3.5824569117489087e-07, "loss": 0.1589, "step": 5615 }, { "epoch": 1.4725292720871201, "grad_norm": 5.985898828281977, "learning_rate": 3.565855989073555e-07, "loss": 0.2083, "step": 5620 }, { "epoch": 1.4738393515106853, "grad_norm": 7.4907701541903355, "learning_rate": 3.549285269694855e-07, "loss": 0.2042, "step": 5625 }, { "epoch": 1.4751494309342503, "grad_norm": 8.271025659121081, "learning_rate": 3.53274483139943e-07, "loss": 0.1482, "step": 5630 }, { "epoch": 1.4764595103578153, "grad_norm": 8.918810219484554, "learning_rate": 3.5162347518317614e-07, "loss": 0.155, "step": 5635 }, { "epoch": 1.4777695897813805, "grad_norm": 6.395860091750389, "learning_rate": 3.499755108493814e-07, "loss": 0.1675, "step": 5640 }, { "epoch": 1.4790796692049455, "grad_norm": 4.314058968165375, "learning_rate": 3.483305978744688e-07, "loss": 0.1404, "step": 5645 }, { "epoch": 1.4803897486285105, "grad_norm": 4.359611398643348, "learning_rate": 3.4668874398002367e-07, "loss": 0.1973, "step": 5650 }, { "epoch": 1.4816998280520757, "grad_norm": 4.284018420029098, "learning_rate": 3.450499568732722e-07, "loss": 0.1673, "step": 5655 }, { "epoch": 1.4830099074756407, "grad_norm": 11.418803588749793, "learning_rate": 3.434142442470437e-07, "loss": 0.1604, "step": 5660 }, { "epoch": 1.4843199868992056, "grad_norm": 11.223038201848048, "learning_rate": 3.41781613779735e-07, "loss": 0.1685, "step": 5665 }, { "epoch": 1.4856300663227708, "grad_norm": 2.587612812317313, "learning_rate": 3.401520731352758e-07, "loss": 0.136, "step": 5670 }, { "epoch": 1.4869401457463358, "grad_norm": 3.3606721211871826, "learning_rate": 3.385256299630901e-07, "loss": 0.1451, "step": 5675 }, { "epoch": 1.4882502251699008, "grad_norm": 4.563542445144167, "learning_rate": 3.36902291898063e-07, "loss": 0.1518, "step": 5680 }, { "epoch": 1.489560304593466, "grad_norm": 4.828685381151716, "learning_rate": 3.352820665605016e-07, "loss": 0.1545, "step": 5685 }, { "epoch": 1.490870384017031, "grad_norm": 6.374748369935473, "learning_rate": 3.336649615561035e-07, "loss": 0.1404, "step": 5690 }, { "epoch": 1.492180463440596, "grad_norm": 6.1615180099066285, "learning_rate": 3.320509844759168e-07, "loss": 0.1522, "step": 5695 }, { "epoch": 1.4934905428641612, "grad_norm": 3.212620440024653, "learning_rate": 3.3044014289630827e-07, "loss": 0.1852, "step": 5700 }, { "epoch": 1.4934905428641612, "eval_accuracy": 0.7648, "eval_loss": 1.1015968322753906, "eval_runtime": 139.045, "eval_samples_per_second": 8.99, "eval_steps_per_second": 2.251, "step": 5700 }, { "epoch": 1.4948006222877261, "grad_norm": 9.244280944426595, "learning_rate": 3.288324443789243e-07, "loss": 0.173, "step": 5705 }, { "epoch": 1.4961107017112911, "grad_norm": 7.103864592003495, "learning_rate": 3.272278964706575e-07, "loss": 0.1468, "step": 5710 }, { "epoch": 1.4974207811348563, "grad_norm": 9.814370133917347, "learning_rate": 3.256265067036118e-07, "loss": 0.2144, "step": 5715 }, { "epoch": 1.4987308605584213, "grad_norm": 3.41872463628258, "learning_rate": 3.2402828259506445e-07, "loss": 0.1161, "step": 5720 }, { "epoch": 1.5000409399819863, "grad_norm": 3.982224660417661, "learning_rate": 3.2243323164743453e-07, "loss": 0.1338, "step": 5725 }, { "epoch": 1.5013510194055515, "grad_norm": 3.5236382432537052, "learning_rate": 3.208413613482429e-07, "loss": 0.1216, "step": 5730 }, { "epoch": 1.5026610988291165, "grad_norm": 5.361802539263841, "learning_rate": 3.1925267917008224e-07, "loss": 0.1533, "step": 5735 }, { "epoch": 1.5039711782526815, "grad_norm": 2.839792538080114, "learning_rate": 3.1766719257057785e-07, "loss": 0.1389, "step": 5740 }, { "epoch": 1.5052812576762467, "grad_norm": 8.32225450006049, "learning_rate": 3.160849089923555e-07, "loss": 0.1513, "step": 5745 }, { "epoch": 1.5065913370998116, "grad_norm": 7.675934502719418, "learning_rate": 3.145058358630043e-07, "loss": 0.1482, "step": 5750 }, { "epoch": 1.5079014165233766, "grad_norm": 4.898667455406003, "learning_rate": 3.1292998059504294e-07, "loss": 0.1264, "step": 5755 }, { "epoch": 1.5092114959469418, "grad_norm": 7.366536986513522, "learning_rate": 3.113573505858855e-07, "loss": 0.1777, "step": 5760 }, { "epoch": 1.5105215753705068, "grad_norm": 6.704445471281222, "learning_rate": 3.0978795321780506e-07, "loss": 0.1492, "step": 5765 }, { "epoch": 1.5118316547940718, "grad_norm": 7.187985717646523, "learning_rate": 3.0822179585790063e-07, "loss": 0.1358, "step": 5770 }, { "epoch": 1.513141734217637, "grad_norm": 8.290411615555337, "learning_rate": 3.0665888585806163e-07, "loss": 0.2399, "step": 5775 }, { "epoch": 1.514451813641202, "grad_norm": 4.958261863176042, "learning_rate": 3.050992305549335e-07, "loss": 0.1241, "step": 5780 }, { "epoch": 1.515761893064767, "grad_norm": 5.902391740892916, "learning_rate": 3.035428372698833e-07, "loss": 0.1296, "step": 5785 }, { "epoch": 1.5170719724883321, "grad_norm": 4.285558089259955, "learning_rate": 3.0198971330896637e-07, "loss": 0.183, "step": 5790 }, { "epoch": 1.5183820519118971, "grad_norm": 5.273051514451481, "learning_rate": 3.0043986596289027e-07, "loss": 0.1311, "step": 5795 }, { "epoch": 1.519692131335462, "grad_norm": 6.51925311343804, "learning_rate": 2.988933025069811e-07, "loss": 0.1358, "step": 5800 }, { "epoch": 1.519692131335462, "eval_accuracy": 0.7624, "eval_loss": 1.1159664392471313, "eval_runtime": 141.0917, "eval_samples_per_second": 8.859, "eval_steps_per_second": 2.218, "step": 5800 }, { "epoch": 1.5210022107590273, "grad_norm": 7.383268111667339, "learning_rate": 2.973500302011509e-07, "loss": 0.147, "step": 5805 }, { "epoch": 1.5223122901825923, "grad_norm": 3.8912007914030182, "learning_rate": 2.958100562898609e-07, "loss": 0.1089, "step": 5810 }, { "epoch": 1.5236223696061573, "grad_norm": 8.712734187230408, "learning_rate": 2.9427338800209033e-07, "loss": 0.2046, "step": 5815 }, { "epoch": 1.5249324490297225, "grad_norm": 3.405653489283435, "learning_rate": 2.927400325513001e-07, "loss": 0.1524, "step": 5820 }, { "epoch": 1.5262425284532875, "grad_norm": 9.562161269374949, "learning_rate": 2.912099971354002e-07, "loss": 0.1311, "step": 5825 }, { "epoch": 1.5275526078768524, "grad_norm": 4.4220037412099495, "learning_rate": 2.896832889367151e-07, "loss": 0.1844, "step": 5830 }, { "epoch": 1.5288626873004176, "grad_norm": 8.764506736183701, "learning_rate": 2.8815991512195217e-07, "loss": 0.1857, "step": 5835 }, { "epoch": 1.5301727667239826, "grad_norm": 5.111081703573647, "learning_rate": 2.8663988284216444e-07, "loss": 0.1286, "step": 5840 }, { "epoch": 1.5314828461475476, "grad_norm": 4.293980562734722, "learning_rate": 2.851231992327208e-07, "loss": 0.1363, "step": 5845 }, { "epoch": 1.5327929255711128, "grad_norm": 8.711462310411664, "learning_rate": 2.8360987141326954e-07, "loss": 0.1606, "step": 5850 }, { "epoch": 1.5341030049946778, "grad_norm": 8.990881900478213, "learning_rate": 2.820999064877062e-07, "loss": 0.2247, "step": 5855 }, { "epoch": 1.5354130844182428, "grad_norm": 9.192897104652108, "learning_rate": 2.805933115441412e-07, "loss": 0.1715, "step": 5860 }, { "epoch": 1.536723163841808, "grad_norm": 3.8288642962534567, "learning_rate": 2.790900936548646e-07, "loss": 0.1652, "step": 5865 }, { "epoch": 1.538033243265373, "grad_norm": 12.038766998437053, "learning_rate": 2.775902598763137e-07, "loss": 0.2038, "step": 5870 }, { "epoch": 1.539343322688938, "grad_norm": 4.827284945086436, "learning_rate": 2.7609381724904024e-07, "loss": 0.1901, "step": 5875 }, { "epoch": 1.5406534021125031, "grad_norm": 4.331535469429681, "learning_rate": 2.746007727976779e-07, "loss": 0.1754, "step": 5880 }, { "epoch": 1.541963481536068, "grad_norm": 1.900735020734344, "learning_rate": 2.731111335309072e-07, "loss": 0.1058, "step": 5885 }, { "epoch": 1.543273560959633, "grad_norm": 2.961654173117568, "learning_rate": 2.7162490644142545e-07, "loss": 0.1598, "step": 5890 }, { "epoch": 1.5445836403831983, "grad_norm": 10.176050102064952, "learning_rate": 2.701420985059112e-07, "loss": 0.1775, "step": 5895 }, { "epoch": 1.5458937198067633, "grad_norm": 8.432950736971934, "learning_rate": 2.686627166849931e-07, "loss": 0.1664, "step": 5900 }, { "epoch": 1.5458937198067633, "eval_accuracy": 0.7504, "eval_loss": 1.094858169555664, "eval_runtime": 139.504, "eval_samples_per_second": 8.96, "eval_steps_per_second": 2.244, "step": 5900 }, { "epoch": 1.5472037992303282, "grad_norm": 3.4036278165694545, "learning_rate": 2.671867679232175e-07, "loss": 0.1702, "step": 5905 }, { "epoch": 1.5485138786538934, "grad_norm": 2.46431216575455, "learning_rate": 2.65714259149014e-07, "loss": 0.2071, "step": 5910 }, { "epoch": 1.5498239580774584, "grad_norm": 4.676488316765192, "learning_rate": 2.64245197274666e-07, "loss": 0.1668, "step": 5915 }, { "epoch": 1.5511340375010234, "grad_norm": 2.776658179276032, "learning_rate": 2.6277958919627386e-07, "loss": 0.173, "step": 5920 }, { "epoch": 1.5524441169245886, "grad_norm": 2.9881895057457424, "learning_rate": 2.6131744179372725e-07, "loss": 0.166, "step": 5925 }, { "epoch": 1.5537541963481536, "grad_norm": 5.351569952228676, "learning_rate": 2.5985876193066925e-07, "loss": 0.1378, "step": 5930 }, { "epoch": 1.5550642757717186, "grad_norm": 7.287593146087938, "learning_rate": 2.5840355645446687e-07, "loss": 0.18, "step": 5935 }, { "epoch": 1.5563743551952838, "grad_norm": 5.162933527380095, "learning_rate": 2.5695183219617644e-07, "loss": 0.2142, "step": 5940 }, { "epoch": 1.5576844346188488, "grad_norm": 4.473626526697219, "learning_rate": 2.555035959705127e-07, "loss": 0.1674, "step": 5945 }, { "epoch": 1.5589945140424137, "grad_norm": 5.821815812779585, "learning_rate": 2.540588545758179e-07, "loss": 0.1876, "step": 5950 }, { "epoch": 1.560304593465979, "grad_norm": 4.101891407721645, "learning_rate": 2.5261761479402734e-07, "loss": 0.1597, "step": 5955 }, { "epoch": 1.561614672889544, "grad_norm": 15.605199269628502, "learning_rate": 2.5117988339064053e-07, "loss": 0.1509, "step": 5960 }, { "epoch": 1.562924752313109, "grad_norm": 7.523818440166435, "learning_rate": 2.49745667114686e-07, "loss": 0.1557, "step": 5965 }, { "epoch": 1.564234831736674, "grad_norm": 4.751911403623371, "learning_rate": 2.483149726986934e-07, "loss": 0.1609, "step": 5970 }, { "epoch": 1.565544911160239, "grad_norm": 10.300208770198049, "learning_rate": 2.468878068586583e-07, "loss": 0.2317, "step": 5975 }, { "epoch": 1.566854990583804, "grad_norm": 6.418645648934525, "learning_rate": 2.4546417629401396e-07, "loss": 0.1699, "step": 5980 }, { "epoch": 1.5681650700073693, "grad_norm": 6.411061114094063, "learning_rate": 2.440440876875971e-07, "loss": 0.1921, "step": 5985 }, { "epoch": 1.5694751494309342, "grad_norm": 7.188743967752872, "learning_rate": 2.4262754770561777e-07, "loss": 0.1515, "step": 5990 }, { "epoch": 1.5707852288544992, "grad_norm": 5.309747838189995, "learning_rate": 2.412145629976289e-07, "loss": 0.1801, "step": 5995 }, { "epoch": 1.5720953082780644, "grad_norm": 4.00510825001943, "learning_rate": 2.39805140196493e-07, "loss": 0.1731, "step": 6000 }, { "epoch": 1.5720953082780644, "eval_accuracy": 0.76, "eval_loss": 1.0234113931655884, "eval_runtime": 139.1959, "eval_samples_per_second": 8.98, "eval_steps_per_second": 2.249, "step": 6000 }, { "epoch": 1.5734053877016294, "grad_norm": 9.59492476008665, "learning_rate": 2.3839928591835335e-07, "loss": 0.1686, "step": 6005 }, { "epoch": 1.5747154671251944, "grad_norm": 3.765377816124392, "learning_rate": 2.3699700676260092e-07, "loss": 0.1494, "step": 6010 }, { "epoch": 1.5760255465487596, "grad_norm": 5.338406619519301, "learning_rate": 2.3559830931184455e-07, "loss": 0.1467, "step": 6015 }, { "epoch": 1.5773356259723246, "grad_norm": 6.763713809000965, "learning_rate": 2.3420320013187954e-07, "loss": 0.1898, "step": 6020 }, { "epoch": 1.5786457053958896, "grad_norm": 5.481130008134444, "learning_rate": 2.328116857716579e-07, "loss": 0.1548, "step": 6025 }, { "epoch": 1.5799557848194548, "grad_norm": 3.557311963677506, "learning_rate": 2.3142377276325563e-07, "loss": 0.1443, "step": 6030 }, { "epoch": 1.5812658642430197, "grad_norm": 5.246282698025739, "learning_rate": 2.30039467621844e-07, "loss": 0.1982, "step": 6035 }, { "epoch": 1.5825759436665847, "grad_norm": 7.849841735411486, "learning_rate": 2.286587768456575e-07, "loss": 0.1785, "step": 6040 }, { "epoch": 1.58388602309015, "grad_norm": 3.056662932671019, "learning_rate": 2.272817069159647e-07, "loss": 0.1452, "step": 6045 }, { "epoch": 1.585196102513715, "grad_norm": 7.441075760718876, "learning_rate": 2.2590826429703647e-07, "loss": 0.1358, "step": 6050 }, { "epoch": 1.5865061819372799, "grad_norm": 4.349747044723176, "learning_rate": 2.2453845543611705e-07, "loss": 0.126, "step": 6055 }, { "epoch": 1.587816261360845, "grad_norm": 7.685604617278789, "learning_rate": 2.2317228676339216e-07, "loss": 0.1422, "step": 6060 }, { "epoch": 1.58912634078441, "grad_norm": 5.9349721355409795, "learning_rate": 2.218097646919599e-07, "loss": 0.1861, "step": 6065 }, { "epoch": 1.590436420207975, "grad_norm": 5.891145802679616, "learning_rate": 2.2045089561780107e-07, "loss": 0.1431, "step": 6070 }, { "epoch": 1.5917464996315402, "grad_norm": 2.287356113721854, "learning_rate": 2.1909568591974748e-07, "loss": 0.1288, "step": 6075 }, { "epoch": 1.5930565790551052, "grad_norm": 7.38109989814088, "learning_rate": 2.1774414195945423e-07, "loss": 0.1277, "step": 6080 }, { "epoch": 1.5943666584786702, "grad_norm": 3.893434900862315, "learning_rate": 2.1639627008136697e-07, "loss": 0.1412, "step": 6085 }, { "epoch": 1.5956767379022354, "grad_norm": 3.7517744969526183, "learning_rate": 2.1505207661269554e-07, "loss": 0.131, "step": 6090 }, { "epoch": 1.5969868173258004, "grad_norm": 5.213568999899694, "learning_rate": 2.1371156786338107e-07, "loss": 0.1493, "step": 6095 }, { "epoch": 1.5982968967493654, "grad_norm": 7.30638497105969, "learning_rate": 2.123747501260691e-07, "loss": 0.1427, "step": 6100 }, { "epoch": 1.5982968967493654, "eval_accuracy": 0.7576, "eval_loss": 1.1005467176437378, "eval_runtime": 138.3031, "eval_samples_per_second": 9.038, "eval_steps_per_second": 2.263, "step": 6100 }, { "epoch": 1.5996069761729306, "grad_norm": 4.455588136802878, "learning_rate": 2.1104162967607774e-07, "loss": 0.1667, "step": 6105 }, { "epoch": 1.6009170555964956, "grad_norm": 5.978054630784655, "learning_rate": 2.0971221277136942e-07, "loss": 0.1548, "step": 6110 }, { "epoch": 1.6022271350200605, "grad_norm": 6.935845367832056, "learning_rate": 2.083865056525218e-07, "loss": 0.1465, "step": 6115 }, { "epoch": 1.6035372144436257, "grad_norm": 9.625156258887605, "learning_rate": 2.0706451454269723e-07, "loss": 0.1988, "step": 6120 }, { "epoch": 1.6048472938671907, "grad_norm": 8.66757452370429, "learning_rate": 2.0574624564761557e-07, "loss": 0.1249, "step": 6125 }, { "epoch": 1.6061573732907557, "grad_norm": 6.280401831803158, "learning_rate": 2.0443170515552166e-07, "loss": 0.1398, "step": 6130 }, { "epoch": 1.607467452714321, "grad_norm": 7.078421243621821, "learning_rate": 2.0312089923716058e-07, "loss": 0.1744, "step": 6135 }, { "epoch": 1.6087775321378859, "grad_norm": 3.5683942015597343, "learning_rate": 2.0181383404574493e-07, "loss": 0.1518, "step": 6140 }, { "epoch": 1.6100876115614509, "grad_norm": 5.9348857695988135, "learning_rate": 2.0051051571692866e-07, "loss": 0.15, "step": 6145 }, { "epoch": 1.611397690985016, "grad_norm": 2.6156284263853595, "learning_rate": 1.9921095036877644e-07, "loss": 0.1248, "step": 6150 }, { "epoch": 1.612707770408581, "grad_norm": 5.4712881492494425, "learning_rate": 1.9791514410173538e-07, "loss": 0.1972, "step": 6155 }, { "epoch": 1.614017849832146, "grad_norm": 5.1651116865695785, "learning_rate": 1.966231029986075e-07, "loss": 0.1164, "step": 6160 }, { "epoch": 1.6153279292557112, "grad_norm": 7.989806082396054, "learning_rate": 1.9533483312451959e-07, "loss": 0.2138, "step": 6165 }, { "epoch": 1.6166380086792762, "grad_norm": 5.514479603993972, "learning_rate": 1.9405034052689585e-07, "loss": 0.1346, "step": 6170 }, { "epoch": 1.6179480881028412, "grad_norm": 5.995445272935516, "learning_rate": 1.927696312354289e-07, "loss": 0.1327, "step": 6175 }, { "epoch": 1.6192581675264064, "grad_norm": 7.346676621492845, "learning_rate": 1.9149271126205168e-07, "loss": 0.172, "step": 6180 }, { "epoch": 1.6205682469499714, "grad_norm": 7.02864110812697, "learning_rate": 1.902195866009091e-07, "loss": 0.1684, "step": 6185 }, { "epoch": 1.6218783263735363, "grad_norm": 5.984903488092592, "learning_rate": 1.8895026322833063e-07, "loss": 0.1282, "step": 6190 }, { "epoch": 1.6231884057971016, "grad_norm": 3.5054458379581197, "learning_rate": 1.876847471028009e-07, "loss": 0.1257, "step": 6195 }, { "epoch": 1.6244984852206665, "grad_norm": 4.70529605927654, "learning_rate": 1.8642304416493283e-07, "loss": 0.1267, "step": 6200 }, { "epoch": 1.6244984852206665, "eval_accuracy": 0.7552, "eval_loss": 1.1194959878921509, "eval_runtime": 142.0998, "eval_samples_per_second": 8.797, "eval_steps_per_second": 2.203, "step": 6200 }, { "epoch": 1.6258085646442315, "grad_norm": 7.7424757434182006, "learning_rate": 1.8516516033743956e-07, "loss": 0.1575, "step": 6205 }, { "epoch": 1.6271186440677967, "grad_norm": 5.852572753378986, "learning_rate": 1.8391110152510615e-07, "loss": 0.1466, "step": 6210 }, { "epoch": 1.6284287234913617, "grad_norm": 9.711403077550843, "learning_rate": 1.8266087361476258e-07, "loss": 0.2505, "step": 6215 }, { "epoch": 1.6297388029149267, "grad_norm": 7.233520202930265, "learning_rate": 1.8141448247525527e-07, "loss": 0.1326, "step": 6220 }, { "epoch": 1.6310488823384919, "grad_norm": 7.089498480489309, "learning_rate": 1.8017193395742024e-07, "loss": 0.165, "step": 6225 }, { "epoch": 1.6323589617620569, "grad_norm": 6.065667727984184, "learning_rate": 1.7893323389405524e-07, "loss": 0.1338, "step": 6230 }, { "epoch": 1.6336690411856218, "grad_norm": 4.969225424762747, "learning_rate": 1.776983880998929e-07, "loss": 0.1625, "step": 6235 }, { "epoch": 1.634979120609187, "grad_norm": 6.713374539142975, "learning_rate": 1.7646740237157254e-07, "loss": 0.1703, "step": 6240 }, { "epoch": 1.636289200032752, "grad_norm": 5.594374782950924, "learning_rate": 1.7524028248761401e-07, "loss": 0.1917, "step": 6245 }, { "epoch": 1.637599279456317, "grad_norm": 6.226467459107668, "learning_rate": 1.7401703420838975e-07, "loss": 0.1807, "step": 6250 }, { "epoch": 1.6389093588798822, "grad_norm": 4.7490550372943074, "learning_rate": 1.7279766327609757e-07, "loss": 0.1605, "step": 6255 }, { "epoch": 1.6402194383034472, "grad_norm": 6.40470417405122, "learning_rate": 1.7158217541473518e-07, "loss": 0.1279, "step": 6260 }, { "epoch": 1.6415295177270122, "grad_norm": 9.101155921957748, "learning_rate": 1.7037057633007157e-07, "loss": 0.1125, "step": 6265 }, { "epoch": 1.6428395971505774, "grad_norm": 6.16268053536328, "learning_rate": 1.6916287170962107e-07, "loss": 0.1575, "step": 6270 }, { "epoch": 1.6441496765741423, "grad_norm": 6.511167097266026, "learning_rate": 1.6795906722261644e-07, "loss": 0.1668, "step": 6275 }, { "epoch": 1.6454597559977073, "grad_norm": 5.236685660626339, "learning_rate": 1.6675916851998272e-07, "loss": 0.106, "step": 6280 }, { "epoch": 1.6467698354212725, "grad_norm": 5.638441943629535, "learning_rate": 1.6556318123430978e-07, "loss": 0.1362, "step": 6285 }, { "epoch": 1.6480799148448375, "grad_norm": 8.863766597675237, "learning_rate": 1.6437111097982726e-07, "loss": 0.1769, "step": 6290 }, { "epoch": 1.6493899942684025, "grad_norm": 3.794500769944755, "learning_rate": 1.631829633523767e-07, "loss": 0.1522, "step": 6295 }, { "epoch": 1.6507000736919677, "grad_norm": 3.9716392563806027, "learning_rate": 1.6199874392938574e-07, "loss": 0.1483, "step": 6300 }, { "epoch": 1.6507000736919677, "eval_accuracy": 0.7576, "eval_loss": 1.1004310846328735, "eval_runtime": 141.7328, "eval_samples_per_second": 8.819, "eval_steps_per_second": 2.208, "step": 6300 }, { "epoch": 1.6520101531155327, "grad_norm": 3.655291620709186, "learning_rate": 1.6081845826984307e-07, "loss": 0.1227, "step": 6305 }, { "epoch": 1.6533202325390977, "grad_norm": 6.90496866662, "learning_rate": 1.5964211191427058e-07, "loss": 0.1756, "step": 6310 }, { "epoch": 1.6546303119626629, "grad_norm": 9.388436741639218, "learning_rate": 1.5846971038469915e-07, "loss": 0.1361, "step": 6315 }, { "epoch": 1.6559403913862278, "grad_norm": 6.255702654542568, "learning_rate": 1.573012591846402e-07, "loss": 0.1674, "step": 6320 }, { "epoch": 1.6572504708097928, "grad_norm": 5.3552774475843945, "learning_rate": 1.5613676379906315e-07, "loss": 0.1525, "step": 6325 }, { "epoch": 1.658560550233358, "grad_norm": 2.691722315737717, "learning_rate": 1.5497622969436662e-07, "loss": 0.1796, "step": 6330 }, { "epoch": 1.659870629656923, "grad_norm": 6.025378541300572, "learning_rate": 1.538196623183552e-07, "loss": 0.183, "step": 6335 }, { "epoch": 1.661180709080488, "grad_norm": 4.761564415573431, "learning_rate": 1.5266706710021194e-07, "loss": 0.1312, "step": 6340 }, { "epoch": 1.6624907885040532, "grad_norm": 7.805446740567002, "learning_rate": 1.51518449450474e-07, "loss": 0.1651, "step": 6345 }, { "epoch": 1.6638008679276182, "grad_norm": 5.256694126891557, "learning_rate": 1.5037381476100707e-07, "loss": 0.1294, "step": 6350 }, { "epoch": 1.6651109473511831, "grad_norm": 7.755672196084709, "learning_rate": 1.4923316840497968e-07, "loss": 0.156, "step": 6355 }, { "epoch": 1.6664210267747483, "grad_norm": 4.656789891974052, "learning_rate": 1.480965157368389e-07, "loss": 0.133, "step": 6360 }, { "epoch": 1.667731106198313, "grad_norm": 5.4961595195828705, "learning_rate": 1.4696386209228307e-07, "loss": 0.1812, "step": 6365 }, { "epoch": 1.6690411856218783, "grad_norm": 6.495919732500172, "learning_rate": 1.4583521278824008e-07, "loss": 0.1657, "step": 6370 }, { "epoch": 1.6703512650454435, "grad_norm": 6.812774990594977, "learning_rate": 1.4471057312283906e-07, "loss": 0.1115, "step": 6375 }, { "epoch": 1.6716613444690083, "grad_norm": 3.956134496828771, "learning_rate": 1.4358994837538817e-07, "loss": 0.2423, "step": 6380 }, { "epoch": 1.6729714238925735, "grad_norm": 6.259621872344836, "learning_rate": 1.424733438063479e-07, "loss": 0.1022, "step": 6385 }, { "epoch": 1.6742815033161387, "grad_norm": 6.072009290092665, "learning_rate": 1.4136076465730695e-07, "loss": 0.1832, "step": 6390 }, { "epoch": 1.6755915827397034, "grad_norm": 7.456858413136058, "learning_rate": 1.4025221615095873e-07, "loss": 0.1657, "step": 6395 }, { "epoch": 1.6769016621632686, "grad_norm": 7.260932680002808, "learning_rate": 1.3914770349107495e-07, "loss": 0.1346, "step": 6400 }, { "epoch": 1.6769016621632686, "eval_accuracy": 0.7632, "eval_loss": 1.100335717201233, "eval_runtime": 141.6273, "eval_samples_per_second": 8.826, "eval_steps_per_second": 2.21, "step": 6400 }, { "epoch": 1.6782117415868338, "grad_norm": 3.6061256334408203, "learning_rate": 1.3804723186248313e-07, "loss": 0.1343, "step": 6405 }, { "epoch": 1.6795218210103986, "grad_norm": 8.270495540507543, "learning_rate": 1.369508064310404e-07, "loss": 0.1182, "step": 6410 }, { "epoch": 1.6808319004339638, "grad_norm": 5.629731179384749, "learning_rate": 1.3585843234361049e-07, "loss": 0.1568, "step": 6415 }, { "epoch": 1.682141979857529, "grad_norm": 2.867569435579598, "learning_rate": 1.347701147280391e-07, "loss": 0.1729, "step": 6420 }, { "epoch": 1.6834520592810938, "grad_norm": 6.83897468616409, "learning_rate": 1.3368585869313065e-07, "loss": 0.1874, "step": 6425 }, { "epoch": 1.684762138704659, "grad_norm": 4.3014228257373945, "learning_rate": 1.326056693286226e-07, "loss": 0.1778, "step": 6430 }, { "epoch": 1.6860722181282242, "grad_norm": 10.457994997688326, "learning_rate": 1.31529551705163e-07, "loss": 0.2127, "step": 6435 }, { "epoch": 1.687382297551789, "grad_norm": 5.086460128057394, "learning_rate": 1.3045751087428648e-07, "loss": 0.153, "step": 6440 }, { "epoch": 1.6886923769753541, "grad_norm": 5.2909003289546375, "learning_rate": 1.2938955186838983e-07, "loss": 0.1303, "step": 6445 }, { "epoch": 1.6900024563989193, "grad_norm": 6.3469654482036555, "learning_rate": 1.283256797007094e-07, "loss": 0.1625, "step": 6450 }, { "epoch": 1.691312535822484, "grad_norm": 3.8066454211371687, "learning_rate": 1.2726589936529654e-07, "loss": 0.2029, "step": 6455 }, { "epoch": 1.6926226152460493, "grad_norm": 5.242010338144686, "learning_rate": 1.2621021583699476e-07, "loss": 0.1424, "step": 6460 }, { "epoch": 1.6939326946696145, "grad_norm": 3.25302928925533, "learning_rate": 1.2515863407141603e-07, "loss": 0.1493, "step": 6465 }, { "epoch": 1.6952427740931792, "grad_norm": 5.788528827053253, "learning_rate": 1.2411115900491865e-07, "loss": 0.1396, "step": 6470 }, { "epoch": 1.6965528535167445, "grad_norm": 5.311777991254716, "learning_rate": 1.230677955545819e-07, "loss": 0.1388, "step": 6475 }, { "epoch": 1.6978629329403097, "grad_norm": 9.193485516456121, "learning_rate": 1.2202854861818557e-07, "loss": 0.1502, "step": 6480 }, { "epoch": 1.6991730123638744, "grad_norm": 3.6649259537864043, "learning_rate": 1.2099342307418392e-07, "loss": 0.1834, "step": 6485 }, { "epoch": 1.7004830917874396, "grad_norm": 2.368713152141659, "learning_rate": 1.199624237816862e-07, "loss": 0.1621, "step": 6490 }, { "epoch": 1.7017931712110048, "grad_norm": 10.317778961476517, "learning_rate": 1.1893555558043089e-07, "loss": 0.1625, "step": 6495 }, { "epoch": 1.7031032506345696, "grad_norm": 3.1939718268686623, "learning_rate": 1.1791282329076523e-07, "loss": 0.1682, "step": 6500 }, { "epoch": 1.7031032506345696, "eval_accuracy": 0.7608, "eval_loss": 1.0906686782836914, "eval_runtime": 141.6589, "eval_samples_per_second": 8.824, "eval_steps_per_second": 2.21, "step": 6500 }, { "epoch": 1.7044133300581348, "grad_norm": 7.245253263810927, "learning_rate": 1.1689423171362079e-07, "loss": 0.147, "step": 6505 }, { "epoch": 1.7057234094817, "grad_norm": 3.713535860169349, "learning_rate": 1.1587978563049161e-07, "loss": 0.1361, "step": 6510 }, { "epoch": 1.7070334889052647, "grad_norm": 3.1243926261206547, "learning_rate": 1.1486948980341282e-07, "loss": 0.1104, "step": 6515 }, { "epoch": 1.70834356832883, "grad_norm": 2.6440342898591838, "learning_rate": 1.1386334897493632e-07, "loss": 0.1154, "step": 6520 }, { "epoch": 1.7096536477523951, "grad_norm": 2.7119760374007877, "learning_rate": 1.128613678681104e-07, "loss": 0.1315, "step": 6525 }, { "epoch": 1.71096372717596, "grad_norm": 6.397748692900095, "learning_rate": 1.1186355118645552e-07, "loss": 0.1652, "step": 6530 }, { "epoch": 1.712273806599525, "grad_norm": 4.992532932504358, "learning_rate": 1.1086990361394477e-07, "loss": 0.1224, "step": 6535 }, { "epoch": 1.7135838860230903, "grad_norm": 8.728515211435251, "learning_rate": 1.0988042981497947e-07, "loss": 0.2042, "step": 6540 }, { "epoch": 1.714893965446655, "grad_norm": 13.21150999052598, "learning_rate": 1.0889513443436904e-07, "loss": 0.1576, "step": 6545 }, { "epoch": 1.7162040448702203, "grad_norm": 2.632026076658514, "learning_rate": 1.0791402209730794e-07, "loss": 0.0997, "step": 6550 }, { "epoch": 1.7175141242937855, "grad_norm": 5.720341695762074, "learning_rate": 1.0693709740935463e-07, "loss": 0.155, "step": 6555 }, { "epoch": 1.7188242037173502, "grad_norm": 5.80222557370497, "learning_rate": 1.0596436495641025e-07, "loss": 0.1255, "step": 6560 }, { "epoch": 1.7201342831409154, "grad_norm": 6.100322814196168, "learning_rate": 1.0499582930469597e-07, "loss": 0.1629, "step": 6565 }, { "epoch": 1.7214443625644806, "grad_norm": 4.478475471096975, "learning_rate": 1.0403149500073294e-07, "loss": 0.1398, "step": 6570 }, { "epoch": 1.7227544419880454, "grad_norm": 4.8025162956814835, "learning_rate": 1.0307136657131977e-07, "loss": 0.1035, "step": 6575 }, { "epoch": 1.7240645214116106, "grad_norm": 3.2219463387203233, "learning_rate": 1.0211544852351183e-07, "loss": 0.1807, "step": 6580 }, { "epoch": 1.7253746008351758, "grad_norm": 5.573188128425597, "learning_rate": 1.0116374534459993e-07, "loss": 0.1532, "step": 6585 }, { "epoch": 1.7266846802587406, "grad_norm": 4.185020111907581, "learning_rate": 1.0021626150208984e-07, "loss": 0.1329, "step": 6590 }, { "epoch": 1.7279947596823058, "grad_norm": 3.8823758733463016, "learning_rate": 9.927300144368045e-08, "loss": 0.1349, "step": 6595 }, { "epoch": 1.7293048391058707, "grad_norm": 16.750270685634902, "learning_rate": 9.833396959724306e-08, "loss": 0.1322, "step": 6600 }, { "epoch": 1.7293048391058707, "eval_accuracy": 0.7608, "eval_loss": 1.128875970840454, "eval_runtime": 143.9246, "eval_samples_per_second": 8.685, "eval_steps_per_second": 2.175, "step": 6600 }, { "epoch": 1.7306149185294357, "grad_norm": 5.043906483496135, "learning_rate": 9.739917037080148e-08, "loss": 0.1572, "step": 6605 }, { "epoch": 1.731924997953001, "grad_norm": 6.199552158675514, "learning_rate": 9.646860815250979e-08, "loss": 0.1627, "step": 6610 }, { "epoch": 1.733235077376566, "grad_norm": 2.951863930249291, "learning_rate": 9.554228731063373e-08, "loss": 0.154, "step": 6615 }, { "epoch": 1.7345451568001309, "grad_norm": 5.2157616611104975, "learning_rate": 9.462021219352801e-08, "loss": 0.1631, "step": 6620 }, { "epoch": 1.735855236223696, "grad_norm": 3.3619242290029963, "learning_rate": 9.370238712961742e-08, "loss": 0.2129, "step": 6625 }, { "epoch": 1.737165315647261, "grad_norm": 4.655934383309167, "learning_rate": 9.27888164273759e-08, "loss": 0.1738, "step": 6630 }, { "epoch": 1.738475395070826, "grad_norm": 9.09248052048832, "learning_rate": 9.1879504375307e-08, "loss": 0.21, "step": 6635 }, { "epoch": 1.7397854744943912, "grad_norm": 9.603961489686213, "learning_rate": 9.097445524192248e-08, "loss": 0.1156, "step": 6640 }, { "epoch": 1.7410955539179562, "grad_norm": 6.608421456426681, "learning_rate": 9.007367327572368e-08, "loss": 0.1623, "step": 6645 }, { "epoch": 1.7424056333415212, "grad_norm": 5.838309352454389, "learning_rate": 8.91771627051805e-08, "loss": 0.1661, "step": 6650 }, { "epoch": 1.7437157127650864, "grad_norm": 5.515377990794858, "learning_rate": 8.828492773871177e-08, "loss": 0.1721, "step": 6655 }, { "epoch": 1.7450257921886514, "grad_norm": 5.475478047926709, "learning_rate": 8.739697256466638e-08, "loss": 0.1668, "step": 6660 }, { "epoch": 1.7463358716122164, "grad_norm": 6.660668383621966, "learning_rate": 8.651330135130241e-08, "loss": 0.1841, "step": 6665 }, { "epoch": 1.7476459510357816, "grad_norm": 4.043989017999333, "learning_rate": 8.563391824676814e-08, "loss": 0.1521, "step": 6670 }, { "epoch": 1.7489560304593466, "grad_norm": 3.805298088229625, "learning_rate": 8.475882737908247e-08, "loss": 0.129, "step": 6675 }, { "epoch": 1.7502661098829115, "grad_norm": 3.0665024919957022, "learning_rate": 8.388803285611601e-08, "loss": 0.1577, "step": 6680 }, { "epoch": 1.7515761893064767, "grad_norm": 14.26899142932033, "learning_rate": 8.30215387655706e-08, "loss": 0.1589, "step": 6685 }, { "epoch": 1.7528862687300417, "grad_norm": 6.541801424230482, "learning_rate": 8.215934917496192e-08, "loss": 0.153, "step": 6690 }, { "epoch": 1.7541963481536067, "grad_norm": 5.250779392264523, "learning_rate": 8.130146813159844e-08, "loss": 0.148, "step": 6695 }, { "epoch": 1.755506427577172, "grad_norm": 3.927112284430514, "learning_rate": 8.044789966256382e-08, "loss": 0.1994, "step": 6700 }, { "epoch": 1.755506427577172, "eval_accuracy": 0.76, "eval_loss": 1.106866478919983, "eval_runtime": 141.8989, "eval_samples_per_second": 8.809, "eval_steps_per_second": 2.206, "step": 6700 }, { "epoch": 1.7568165070007369, "grad_norm": 9.767839811176259, "learning_rate": 7.959864777469749e-08, "loss": 0.2056, "step": 6705 }, { "epoch": 1.7581265864243019, "grad_norm": 5.9103243659566, "learning_rate": 7.875371645457574e-08, "loss": 0.1468, "step": 6710 }, { "epoch": 1.759436665847867, "grad_norm": 6.055358982690812, "learning_rate": 7.791310966849362e-08, "loss": 0.1375, "step": 6715 }, { "epoch": 1.760746745271432, "grad_norm": 7.406585709894513, "learning_rate": 7.707683136244503e-08, "loss": 0.1663, "step": 6720 }, { "epoch": 1.762056824694997, "grad_norm": 10.412933831104338, "learning_rate": 7.624488546210584e-08, "loss": 0.1703, "step": 6725 }, { "epoch": 1.7633669041185622, "grad_norm": 11.2079157468613, "learning_rate": 7.5417275872814e-08, "loss": 0.1649, "step": 6730 }, { "epoch": 1.7646769835421272, "grad_norm": 4.487770161857672, "learning_rate": 7.459400647955261e-08, "loss": 0.1109, "step": 6735 }, { "epoch": 1.7659870629656922, "grad_norm": 4.720489203941066, "learning_rate": 7.377508114693021e-08, "loss": 0.2277, "step": 6740 }, { "epoch": 1.7672971423892574, "grad_norm": 5.432786720479122, "learning_rate": 7.296050371916362e-08, "loss": 0.1617, "step": 6745 }, { "epoch": 1.7686072218128224, "grad_norm": 6.287493094401141, "learning_rate": 7.21502780200598e-08, "loss": 0.1686, "step": 6750 }, { "epoch": 1.7699173012363874, "grad_norm": 2.1808008637078897, "learning_rate": 7.134440785299745e-08, "loss": 0.1527, "step": 6755 }, { "epoch": 1.7712273806599526, "grad_norm": 3.224585978413297, "learning_rate": 7.054289700090987e-08, "loss": 0.1003, "step": 6760 }, { "epoch": 1.7725374600835175, "grad_norm": 4.540878351864405, "learning_rate": 6.974574922626598e-08, "loss": 0.146, "step": 6765 }, { "epoch": 1.7738475395070825, "grad_norm": 7.4029328124452345, "learning_rate": 6.895296827105423e-08, "loss": 0.1749, "step": 6770 }, { "epoch": 1.7751576189306477, "grad_norm": 4.084693702284536, "learning_rate": 6.81645578567639e-08, "loss": 0.1532, "step": 6775 }, { "epoch": 1.7764676983542127, "grad_norm": 5.481752305139202, "learning_rate": 6.738052168436814e-08, "loss": 0.1742, "step": 6780 }, { "epoch": 1.7777777777777777, "grad_norm": 6.07861709288181, "learning_rate": 6.660086343430637e-08, "loss": 0.1624, "step": 6785 }, { "epoch": 1.7790878572013429, "grad_norm": 4.676302274517847, "learning_rate": 6.582558676646676e-08, "loss": 0.1583, "step": 6790 }, { "epoch": 1.7803979366249079, "grad_norm": 3.9543347540784666, "learning_rate": 6.505469532017005e-08, "loss": 0.142, "step": 6795 }, { "epoch": 1.7817080160484728, "grad_norm": 7.99552033660062, "learning_rate": 6.428819271415098e-08, "loss": 0.159, "step": 6800 }, { "epoch": 1.7817080160484728, "eval_accuracy": 0.7608, "eval_loss": 1.0944597721099854, "eval_runtime": 142.6449, "eval_samples_per_second": 8.763, "eval_steps_per_second": 2.194, "step": 6800 }, { "epoch": 1.783018095472038, "grad_norm": 5.928421818999669, "learning_rate": 6.35260825465429e-08, "loss": 0.1406, "step": 6805 }, { "epoch": 1.784328174895603, "grad_norm": 5.358450738294621, "learning_rate": 6.276836839485944e-08, "loss": 0.1684, "step": 6810 }, { "epoch": 1.785638254319168, "grad_norm": 7.4098511156954885, "learning_rate": 6.201505381597872e-08, "loss": 0.1258, "step": 6815 }, { "epoch": 1.7869483337427332, "grad_norm": 10.028496011778648, "learning_rate": 6.126614234612593e-08, "loss": 0.1363, "step": 6820 }, { "epoch": 1.7882584131662982, "grad_norm": 8.42691690098271, "learning_rate": 6.05216375008576e-08, "loss": 0.1629, "step": 6825 }, { "epoch": 1.7895684925898632, "grad_norm": 7.8437977453123695, "learning_rate": 5.978154277504432e-08, "loss": 0.1488, "step": 6830 }, { "epoch": 1.7908785720134284, "grad_norm": 9.079568079332288, "learning_rate": 5.904586164285441e-08, "loss": 0.1451, "step": 6835 }, { "epoch": 1.7921886514369934, "grad_norm": 5.503155338633177, "learning_rate": 5.831459755773815e-08, "loss": 0.1478, "step": 6840 }, { "epoch": 1.7934987308605583, "grad_norm": 6.773303376259157, "learning_rate": 5.7587753952411e-08, "loss": 0.1445, "step": 6845 }, { "epoch": 1.7948088102841235, "grad_norm": 4.765236453726558, "learning_rate": 5.686533423883788e-08, "loss": 0.1617, "step": 6850 }, { "epoch": 1.7961188897076885, "grad_norm": 6.230089077098877, "learning_rate": 5.6147341808216894e-08, "loss": 0.1509, "step": 6855 }, { "epoch": 1.7974289691312535, "grad_norm": 4.574445807215422, "learning_rate": 5.543378003096344e-08, "loss": 0.1722, "step": 6860 }, { "epoch": 1.7987390485548187, "grad_norm": 4.348732547956968, "learning_rate": 5.4724652256694205e-08, "loss": 0.1443, "step": 6865 }, { "epoch": 1.8000491279783837, "grad_norm": 7.066725464839033, "learning_rate": 5.401996181421253e-08, "loss": 0.1485, "step": 6870 }, { "epoch": 1.8013592074019487, "grad_norm": 5.328359673700302, "learning_rate": 5.331971201149088e-08, "loss": 0.1419, "step": 6875 }, { "epoch": 1.8026692868255139, "grad_norm": 5.644656887108177, "learning_rate": 5.262390613565737e-08, "loss": 0.1424, "step": 6880 }, { "epoch": 1.8039793662490788, "grad_norm": 5.671327408847072, "learning_rate": 5.193254745297848e-08, "loss": 0.198, "step": 6885 }, { "epoch": 1.8052894456726438, "grad_norm": 4.877568064660579, "learning_rate": 5.124563920884495e-08, "loss": 0.1428, "step": 6890 }, { "epoch": 1.806599525096209, "grad_norm": 6.817395500823747, "learning_rate": 5.056318462775644e-08, "loss": 0.1432, "step": 6895 }, { "epoch": 1.807909604519774, "grad_norm": 5.927934052499454, "learning_rate": 4.988518691330579e-08, "loss": 0.1137, "step": 6900 }, { "epoch": 1.807909604519774, "eval_accuracy": 0.7632, "eval_loss": 1.1021169424057007, "eval_runtime": 141.7931, "eval_samples_per_second": 8.816, "eval_steps_per_second": 2.207, "step": 6900 }, { "epoch": 1.809219683943339, "grad_norm": 4.973694598901904, "learning_rate": 4.9211649248164125e-08, "loss": 0.1506, "step": 6905 }, { "epoch": 1.8105297633669042, "grad_norm": 3.883659080774288, "learning_rate": 4.854257479406654e-08, "loss": 0.141, "step": 6910 }, { "epoch": 1.8118398427904692, "grad_norm": 6.41491315709119, "learning_rate": 4.787796669179689e-08, "loss": 0.158, "step": 6915 }, { "epoch": 1.8131499222140341, "grad_norm": 3.9302834820572814, "learning_rate": 4.721782806117236e-08, "loss": 0.1322, "step": 6920 }, { "epoch": 1.8144600016375994, "grad_norm": 4.289211965868008, "learning_rate": 4.656216200103036e-08, "loss": 0.1337, "step": 6925 }, { "epoch": 1.8157700810611643, "grad_norm": 11.248831743934637, "learning_rate": 4.591097158921198e-08, "loss": 0.1829, "step": 6930 }, { "epoch": 1.8170801604847293, "grad_norm": 9.878450217229018, "learning_rate": 4.526425988254967e-08, "loss": 0.1566, "step": 6935 }, { "epoch": 1.8183902399082945, "grad_norm": 7.865233707641088, "learning_rate": 4.4622029916850935e-08, "loss": 0.1189, "step": 6940 }, { "epoch": 1.8197003193318595, "grad_norm": 9.744045573658111, "learning_rate": 4.3984284706885976e-08, "loss": 0.1497, "step": 6945 }, { "epoch": 1.8210103987554245, "grad_norm": 7.22547788135765, "learning_rate": 4.335102724637163e-08, "loss": 0.2296, "step": 6950 }, { "epoch": 1.8223204781789897, "grad_norm": 11.079040650914395, "learning_rate": 4.2722260507958684e-08, "loss": 0.1922, "step": 6955 }, { "epoch": 1.8236305576025547, "grad_norm": 8.165182518676898, "learning_rate": 4.2097987443217577e-08, "loss": 0.1381, "step": 6960 }, { "epoch": 1.8249406370261196, "grad_norm": 3.739467681211686, "learning_rate": 4.147821098262405e-08, "loss": 0.1294, "step": 6965 }, { "epoch": 1.8262507164496848, "grad_norm": 5.402892377251783, "learning_rate": 4.086293403554641e-08, "loss": 0.1786, "step": 6970 }, { "epoch": 1.8275607958732498, "grad_norm": 4.468753443729968, "learning_rate": 4.0252159490230645e-08, "loss": 0.1654, "step": 6975 }, { "epoch": 1.8288708752968148, "grad_norm": 5.203594506261989, "learning_rate": 3.964589021378772e-08, "loss": 0.1367, "step": 6980 }, { "epoch": 1.83018095472038, "grad_norm": 3.380052180374155, "learning_rate": 3.90441290521798e-08, "loss": 0.1106, "step": 6985 }, { "epoch": 1.831491034143945, "grad_norm": 4.633407767619745, "learning_rate": 3.8446878830207254e-08, "loss": 0.1679, "step": 6990 }, { "epoch": 1.83280111356751, "grad_norm": 5.329985366173506, "learning_rate": 3.785414235149465e-08, "loss": 0.1565, "step": 6995 }, { "epoch": 1.8341111929910752, "grad_norm": 6.556659404147076, "learning_rate": 3.726592239847826e-08, "loss": 0.2095, "step": 7000 }, { "epoch": 1.8341111929910752, "eval_accuracy": 0.7624, "eval_loss": 1.1032047271728516, "eval_runtime": 141.6558, "eval_samples_per_second": 8.824, "eval_steps_per_second": 2.21, "step": 7000 }, { "epoch": 1.8354212724146401, "grad_norm": 4.684757438072978, "learning_rate": 3.668222173239288e-08, "loss": 0.1576, "step": 7005 }, { "epoch": 1.8367313518382051, "grad_norm": 7.039438206694064, "learning_rate": 3.6103043093258625e-08, "loss": 0.1128, "step": 7010 }, { "epoch": 1.8380414312617703, "grad_norm": 12.172322751037852, "learning_rate": 3.552838919986845e-08, "loss": 0.1683, "step": 7015 }, { "epoch": 1.8393515106853353, "grad_norm": 7.127343846862284, "learning_rate": 3.495826274977487e-08, "loss": 0.0943, "step": 7020 }, { "epoch": 1.8406615901089003, "grad_norm": 3.994985416023033, "learning_rate": 3.439266641927752e-08, "loss": 0.1505, "step": 7025 }, { "epoch": 1.8419716695324655, "grad_norm": 10.627564884995905, "learning_rate": 3.383160286341091e-08, "loss": 0.1712, "step": 7030 }, { "epoch": 1.8432817489560305, "grad_norm": 5.604749535399941, "learning_rate": 3.327507471593172e-08, "loss": 0.1205, "step": 7035 }, { "epoch": 1.8445918283795955, "grad_norm": 4.8421269912484215, "learning_rate": 3.272308458930606e-08, "loss": 0.1152, "step": 7040 }, { "epoch": 1.8459019078031607, "grad_norm": 9.074100019849872, "learning_rate": 3.2175635074698005e-08, "loss": 0.2357, "step": 7045 }, { "epoch": 1.8472119872267256, "grad_norm": 7.154307006903744, "learning_rate": 3.1632728741956884e-08, "loss": 0.1552, "step": 7050 }, { "epoch": 1.8485220666502906, "grad_norm": 2.6781924530315306, "learning_rate": 3.1094368139604865e-08, "loss": 0.106, "step": 7055 }, { "epoch": 1.8498321460738558, "grad_norm": 7.370147947886868, "learning_rate": 3.0560555794826196e-08, "loss": 0.1413, "step": 7060 }, { "epoch": 1.8511422254974208, "grad_norm": 3.802825821569026, "learning_rate": 3.003129421345407e-08, "loss": 0.1453, "step": 7065 }, { "epoch": 1.8524523049209858, "grad_norm": 6.1792330095504395, "learning_rate": 2.9506585879959577e-08, "loss": 0.1564, "step": 7070 }, { "epoch": 1.853762384344551, "grad_norm": 5.355565750702989, "learning_rate": 2.8986433257439658e-08, "loss": 0.1967, "step": 7075 }, { "epoch": 1.855072463768116, "grad_norm": 6.360151114951984, "learning_rate": 2.8470838787606034e-08, "loss": 0.0963, "step": 7080 }, { "epoch": 1.856382543191681, "grad_norm": 6.69819610174965, "learning_rate": 2.795980489077332e-08, "loss": 0.1303, "step": 7085 }, { "epoch": 1.8576926226152461, "grad_norm": 4.435876647603045, "learning_rate": 2.7453333965847815e-08, "loss": 0.1269, "step": 7090 }, { "epoch": 1.8590027020388111, "grad_norm": 7.698895216569222, "learning_rate": 2.6951428390316165e-08, "loss": 0.1347, "step": 7095 }, { "epoch": 1.860312781462376, "grad_norm": 3.790916594995889, "learning_rate": 2.6454090520234063e-08, "loss": 0.2099, "step": 7100 }, { "epoch": 1.860312781462376, "eval_accuracy": 0.7632, "eval_loss": 1.1122453212738037, "eval_runtime": 142.3069, "eval_samples_per_second": 8.784, "eval_steps_per_second": 2.199, "step": 7100 }, { "epoch": 1.8616228608859413, "grad_norm": 3.9663209864483724, "learning_rate": 2.596132269021589e-08, "loss": 0.1212, "step": 7105 }, { "epoch": 1.8629329403095063, "grad_norm": 5.657131945956216, "learning_rate": 2.5473127213422762e-08, "loss": 0.1551, "step": 7110 }, { "epoch": 1.8642430197330713, "grad_norm": 5.092795916916417, "learning_rate": 2.4989506381552617e-08, "loss": 0.1736, "step": 7115 }, { "epoch": 1.8655530991566365, "grad_norm": 5.798872866253983, "learning_rate": 2.4510462464828352e-08, "loss": 0.1684, "step": 7120 }, { "epoch": 1.8668631785802015, "grad_norm": 3.20953589455004, "learning_rate": 2.4035997711988387e-08, "loss": 0.1094, "step": 7125 }, { "epoch": 1.8681732580037664, "grad_norm": 3.3680896195026477, "learning_rate": 2.3566114350275223e-08, "loss": 0.1694, "step": 7130 }, { "epoch": 1.8694833374273316, "grad_norm": 5.729982989667835, "learning_rate": 2.3100814585425564e-08, "loss": 0.1564, "step": 7135 }, { "epoch": 1.8707934168508966, "grad_norm": 10.389717669058681, "learning_rate": 2.264010060165944e-08, "loss": 0.1514, "step": 7140 }, { "epoch": 1.8721034962744616, "grad_norm": 5.075854563803782, "learning_rate": 2.2183974561670205e-08, "loss": 0.2024, "step": 7145 }, { "epoch": 1.8734135756980268, "grad_norm": 4.242493204391383, "learning_rate": 2.1732438606614665e-08, "loss": 0.1311, "step": 7150 }, { "epoch": 1.8747236551215918, "grad_norm": 5.4042102288406095, "learning_rate": 2.1285494856102315e-08, "loss": 0.1726, "step": 7155 }, { "epoch": 1.8760337345451568, "grad_norm": 6.071406295353831, "learning_rate": 2.0843145408186547e-08, "loss": 0.1006, "step": 7160 }, { "epoch": 1.877343813968722, "grad_norm": 6.149832756857255, "learning_rate": 2.0405392339353234e-08, "loss": 0.1713, "step": 7165 }, { "epoch": 1.878653893392287, "grad_norm": 9.357766475713992, "learning_rate": 1.9972237704512283e-08, "loss": 0.1644, "step": 7170 }, { "epoch": 1.879963972815852, "grad_norm": 5.253827651758333, "learning_rate": 1.9543683536987434e-08, "loss": 0.111, "step": 7175 }, { "epoch": 1.8812740522394171, "grad_norm": 11.566026718756346, "learning_rate": 1.9119731848506902e-08, "loss": 0.1984, "step": 7180 }, { "epoch": 1.882584131662982, "grad_norm": 8.248572977961965, "learning_rate": 1.8700384629193876e-08, "loss": 0.1202, "step": 7185 }, { "epoch": 1.883894211086547, "grad_norm": 4.350069192366543, "learning_rate": 1.828564384755682e-08, "loss": 0.1471, "step": 7190 }, { "epoch": 1.8852042905101123, "grad_norm": 5.389482671182921, "learning_rate": 1.787551145048094e-08, "loss": 0.1356, "step": 7195 }, { "epoch": 1.8865143699336773, "grad_norm": 9.568845913678533, "learning_rate": 1.7469989363218528e-08, "loss": 0.209, "step": 7200 }, { "epoch": 1.8865143699336773, "eval_accuracy": 0.7616, "eval_loss": 1.1072343587875366, "eval_runtime": 141.2973, "eval_samples_per_second": 8.847, "eval_steps_per_second": 2.215, "step": 7200 }, { "epoch": 1.8878244493572423, "grad_norm": 8.437455929535107, "learning_rate": 1.706907948938008e-08, "loss": 0.1703, "step": 7205 }, { "epoch": 1.8891345287808075, "grad_norm": 9.671218815145942, "learning_rate": 1.6672783710925288e-08, "loss": 0.18, "step": 7210 }, { "epoch": 1.8904446082043724, "grad_norm": 5.889420942486911, "learning_rate": 1.628110388815429e-08, "loss": 0.1196, "step": 7215 }, { "epoch": 1.8917546876279374, "grad_norm": 4.068164701286381, "learning_rate": 1.5894041859698783e-08, "loss": 0.1432, "step": 7220 }, { "epoch": 1.8930647670515026, "grad_norm": 4.827161273788743, "learning_rate": 1.5511599442513677e-08, "loss": 0.1612, "step": 7225 }, { "epoch": 1.8943748464750676, "grad_norm": 2.5102178836850495, "learning_rate": 1.5133778431868583e-08, "loss": 0.1626, "step": 7230 }, { "epoch": 1.8956849258986326, "grad_norm": 7.7934519895192285, "learning_rate": 1.4760580601338669e-08, "loss": 0.2144, "step": 7235 }, { "epoch": 1.8969950053221978, "grad_norm": 8.667381276168085, "learning_rate": 1.439200770279736e-08, "loss": 0.2232, "step": 7240 }, { "epoch": 1.8983050847457628, "grad_norm": 7.482118460513503, "learning_rate": 1.4028061466407449e-08, "loss": 0.1269, "step": 7245 }, { "epoch": 1.8996151641693277, "grad_norm": 5.507929252171665, "learning_rate": 1.3668743600613097e-08, "loss": 0.1869, "step": 7250 }, { "epoch": 1.900925243592893, "grad_norm": 3.497582694746413, "learning_rate": 1.3314055792131961e-08, "loss": 0.1518, "step": 7255 }, { "epoch": 1.902235323016458, "grad_norm": 6.083554156233226, "learning_rate": 1.2963999705947193e-08, "loss": 0.158, "step": 7260 }, { "epoch": 1.903545402440023, "grad_norm": 11.09067780911257, "learning_rate": 1.2618576985299334e-08, "loss": 0.1666, "step": 7265 }, { "epoch": 1.904855481863588, "grad_norm": 4.6729278171579605, "learning_rate": 1.227778925167955e-08, "loss": 0.135, "step": 7270 }, { "epoch": 1.906165561287153, "grad_norm": 5.01367722348655, "learning_rate": 1.1941638104820517e-08, "loss": 0.1376, "step": 7275 }, { "epoch": 1.907475640710718, "grad_norm": 8.20404432458644, "learning_rate": 1.1610125122690328e-08, "loss": 0.2188, "step": 7280 }, { "epoch": 1.9087857201342833, "grad_norm": 9.608276812286391, "learning_rate": 1.1283251861484378e-08, "loss": 0.199, "step": 7285 }, { "epoch": 1.910095799557848, "grad_norm": 4.3189093484564145, "learning_rate": 1.0961019855618037e-08, "loss": 0.1662, "step": 7290 }, { "epoch": 1.9114058789814132, "grad_norm": 6.758336690737442, "learning_rate": 1.0643430617719663e-08, "loss": 0.1357, "step": 7295 }, { "epoch": 1.9127159584049784, "grad_norm": 6.780661902438455, "learning_rate": 1.0330485638623488e-08, "loss": 0.178, "step": 7300 }, { "epoch": 1.9127159584049784, "eval_accuracy": 0.7656, "eval_loss": 1.1024446487426758, "eval_runtime": 141.415, "eval_samples_per_second": 8.839, "eval_steps_per_second": 2.213, "step": 7300 }, { "epoch": 1.9140260378285432, "grad_norm": 6.026966107553702, "learning_rate": 1.0022186387362742e-08, "loss": 0.1445, "step": 7305 }, { "epoch": 1.9153361172521084, "grad_norm": 6.277429872371872, "learning_rate": 9.718534311161985e-09, "loss": 0.1679, "step": 7310 }, { "epoch": 1.9166461966756736, "grad_norm": 4.119214806989816, "learning_rate": 9.419530835431676e-09, "loss": 0.1928, "step": 7315 }, { "epoch": 1.9179562760992384, "grad_norm": 5.6153080280283385, "learning_rate": 9.125177363759951e-09, "loss": 0.1118, "step": 7320 }, { "epoch": 1.9192663555228036, "grad_norm": 4.7666055487436525, "learning_rate": 8.835475277907622e-09, "loss": 0.1643, "step": 7325 }, { "epoch": 1.9205764349463688, "grad_norm": 8.192595195938912, "learning_rate": 8.550425937800088e-09, "loss": 0.1507, "step": 7330 }, { "epoch": 1.9218865143699335, "grad_norm": 6.765512918112135, "learning_rate": 8.270030681522099e-09, "loss": 0.1295, "step": 7335 }, { "epoch": 1.9231965937934987, "grad_norm": 3.56638369560944, "learning_rate": 7.994290825311333e-09, "loss": 0.1031, "step": 7340 }, { "epoch": 1.924506673217064, "grad_norm": 7.699305575968428, "learning_rate": 7.72320766355139e-09, "loss": 0.1526, "step": 7345 }, { "epoch": 1.9258167526406287, "grad_norm": 4.8819269994799654, "learning_rate": 7.45678246876702e-09, "loss": 0.1646, "step": 7350 }, { "epoch": 1.9271268320641939, "grad_norm": 3.9964247606777357, "learning_rate": 7.19501649161669e-09, "loss": 0.1028, "step": 7355 }, { "epoch": 1.928436911487759, "grad_norm": 7.878684128224621, "learning_rate": 6.937910960888138e-09, "loss": 0.1542, "step": 7360 }, { "epoch": 1.9297469909113238, "grad_norm": 9.167984693047696, "learning_rate": 6.685467083491492e-09, "loss": 0.1468, "step": 7365 }, { "epoch": 1.931057070334889, "grad_norm": 2.1053337480076344, "learning_rate": 6.437686044454382e-09, "loss": 0.153, "step": 7370 }, { "epoch": 1.9323671497584543, "grad_norm": 6.447874086726777, "learning_rate": 6.194569006915729e-09, "loss": 0.1358, "step": 7375 }, { "epoch": 1.933677229182019, "grad_norm": 10.70751883403138, "learning_rate": 5.95611711212074e-09, "loss": 0.2028, "step": 7380 }, { "epoch": 1.9349873086055842, "grad_norm": 10.40080019721987, "learning_rate": 5.722331479415476e-09, "loss": 0.1971, "step": 7385 }, { "epoch": 1.9362973880291494, "grad_norm": 6.470624613494294, "learning_rate": 5.4932132062414095e-09, "loss": 0.148, "step": 7390 }, { "epoch": 1.9376074674527142, "grad_norm": 6.406990396350547, "learning_rate": 5.268763368130425e-09, "loss": 0.1788, "step": 7395 }, { "epoch": 1.9389175468762794, "grad_norm": 2.770021538790025, "learning_rate": 5.048983018699826e-09, "loss": 0.1198, "step": 7400 }, { "epoch": 1.9389175468762794, "eval_accuracy": 0.7632, "eval_loss": 1.106671929359436, "eval_runtime": 143.3333, "eval_samples_per_second": 8.721, "eval_steps_per_second": 2.184, "step": 7400 }, { "epoch": 1.9402276262998446, "grad_norm": 7.932600638186708, "learning_rate": 4.8338731896472305e-09, "loss": 0.0954, "step": 7405 }, { "epoch": 1.9415377057234093, "grad_norm": 4.158505987091941, "learning_rate": 4.623434890745792e-09, "loss": 0.1482, "step": 7410 }, { "epoch": 1.9428477851469745, "grad_norm": 4.125686999210389, "learning_rate": 4.417669109839539e-09, "loss": 0.1672, "step": 7415 }, { "epoch": 1.9441578645705397, "grad_norm": 9.401687509625868, "learning_rate": 4.2165768128384905e-09, "loss": 0.2056, "step": 7420 }, { "epoch": 1.9454679439941045, "grad_norm": 6.656706387472742, "learning_rate": 4.020158943714436e-09, "loss": 0.1292, "step": 7425 }, { "epoch": 1.9467780234176697, "grad_norm": 5.612244185690268, "learning_rate": 3.828416424496383e-09, "loss": 0.1141, "step": 7430 }, { "epoch": 1.948088102841235, "grad_norm": 8.832257438742863, "learning_rate": 3.641350155266232e-09, "loss": 0.2152, "step": 7435 }, { "epoch": 1.9493981822647997, "grad_norm": 9.016570066506297, "learning_rate": 3.458961014154327e-09, "loss": 0.1548, "step": 7440 }, { "epoch": 1.9507082616883649, "grad_norm": 5.852774450303297, "learning_rate": 3.2812498573359104e-09, "loss": 0.1769, "step": 7445 }, { "epoch": 1.95201834111193, "grad_norm": 3.8609337077050783, "learning_rate": 3.108217519026235e-09, "loss": 0.1429, "step": 7450 }, { "epoch": 1.9533284205354948, "grad_norm": 3.881198457400227, "learning_rate": 2.9398648114775658e-09, "loss": 0.1024, "step": 7455 }, { "epoch": 1.95463849995906, "grad_norm": 4.8164655468300115, "learning_rate": 2.776192524974741e-09, "loss": 0.138, "step": 7460 }, { "epoch": 1.9559485793826252, "grad_norm": 6.545137926620753, "learning_rate": 2.617201427831728e-09, "loss": 0.1693, "step": 7465 }, { "epoch": 1.95725865880619, "grad_norm": 4.068942827553475, "learning_rate": 2.4628922663879615e-09, "loss": 0.1181, "step": 7470 }, { "epoch": 1.9585687382297552, "grad_norm": 3.941067732119495, "learning_rate": 2.3132657650047905e-09, "loss": 0.1674, "step": 7475 }, { "epoch": 1.9598788176533204, "grad_norm": 4.838837050335417, "learning_rate": 2.168322626062147e-09, "loss": 0.1547, "step": 7480 }, { "epoch": 1.9611888970768852, "grad_norm": 7.190077481755126, "learning_rate": 2.0280635299551043e-09, "loss": 0.1601, "step": 7485 }, { "epoch": 1.9624989765004504, "grad_norm": 6.231947930455576, "learning_rate": 1.8924891350911023e-09, "loss": 0.1571, "step": 7490 }, { "epoch": 1.9638090559240156, "grad_norm": 3.2883780328164156, "learning_rate": 1.7616000778863938e-09, "loss": 0.1346, "step": 7495 }, { "epoch": 1.9651191353475803, "grad_norm": 2.251817701553141, "learning_rate": 1.6353969727629368e-09, "loss": 0.1483, "step": 7500 }, { "epoch": 1.9651191353475803, "eval_accuracy": 0.764, "eval_loss": 1.1052285432815552, "eval_runtime": 140.9937, "eval_samples_per_second": 8.866, "eval_steps_per_second": 2.22, "step": 7500 }, { "epoch": 1.9664292147711455, "grad_norm": 6.416248872950269, "learning_rate": 1.5138804121462844e-09, "loss": 0.1381, "step": 7505 }, { "epoch": 1.9677392941947107, "grad_norm": 6.998254537894883, "learning_rate": 1.3970509664620323e-09, "loss": 0.143, "step": 7510 }, { "epoch": 1.9690493736182755, "grad_norm": 8.413132217200232, "learning_rate": 1.284909184133487e-09, "loss": 0.1645, "step": 7515 }, { "epoch": 1.9703594530418407, "grad_norm": 5.34533852883374, "learning_rate": 1.1774555915787799e-09, "loss": 0.1896, "step": 7520 }, { "epoch": 1.9716695324654057, "grad_norm": 3.1253164448556814, "learning_rate": 1.0746906932092016e-09, "loss": 0.17, "step": 7525 }, { "epoch": 1.9729796118889706, "grad_norm": 6.051727298230588, "learning_rate": 9.7661497142576e-10, "loss": 0.1539, "step": 7530 }, { "epoch": 1.9742896913125358, "grad_norm": 9.970322292501223, "learning_rate": 8.832288866175152e-10, "loss": 0.1438, "step": 7535 }, { "epoch": 1.9755997707361008, "grad_norm": 4.902999720057873, "learning_rate": 7.945328771596926e-10, "loss": 0.1661, "step": 7540 }, { "epoch": 1.9769098501596658, "grad_norm": 6.8168524634333325, "learning_rate": 7.105273594107953e-10, "loss": 0.1571, "step": 7545 }, { "epoch": 1.978219929583231, "grad_norm": 11.517594529415646, "learning_rate": 6.312127277113833e-10, "loss": 0.2014, "step": 7550 }, { "epoch": 1.979530009006796, "grad_norm": 5.3054234456984535, "learning_rate": 5.565893543818534e-10, "loss": 0.1121, "step": 7555 }, { "epoch": 1.980840088430361, "grad_norm": 6.920016441538864, "learning_rate": 4.866575897208846e-10, "loss": 0.1289, "step": 7560 }, { "epoch": 1.9821501678539262, "grad_norm": 5.089129025398095, "learning_rate": 4.2141776200366184e-10, "loss": 0.1832, "step": 7565 }, { "epoch": 1.9834602472774912, "grad_norm": 13.206489418737272, "learning_rate": 3.6087017748043235e-10, "loss": 0.1799, "step": 7570 }, { "epoch": 1.9847703267010561, "grad_norm": 7.895102786478579, "learning_rate": 3.050151203749518e-10, "loss": 0.1761, "step": 7575 }, { "epoch": 1.9860804061246213, "grad_norm": 7.13408976553629, "learning_rate": 2.538528528831518e-10, "loss": 0.1308, "step": 7580 }, { "epoch": 1.9873904855481863, "grad_norm": 5.735776530531999, "learning_rate": 2.0738361517214087e-10, "loss": 0.1536, "step": 7585 }, { "epoch": 1.9887005649717513, "grad_norm": 6.373237190830313, "learning_rate": 1.656076253786498e-10, "loss": 0.1635, "step": 7590 }, { "epoch": 1.9900106443953165, "grad_norm": 6.685342113001509, "learning_rate": 1.2852507960858793e-10, "loss": 0.124, "step": 7595 }, { "epoch": 1.9913207238188815, "grad_norm": 5.437567848160449, "learning_rate": 9.613615193548863e-11, "loss": 0.1298, "step": 7600 }, { "epoch": 1.9913207238188815, "eval_accuracy": 0.76, "eval_loss": 1.105454921722412, "eval_runtime": 140.9417, "eval_samples_per_second": 8.869, "eval_steps_per_second": 2.221, "step": 7600 }, { "epoch": 1.9926308032424465, "grad_norm": 6.920328620350111, "learning_rate": 6.84409944003983e-11, "loss": 0.1196, "step": 7605 }, { "epoch": 1.9939408826660117, "grad_norm": 8.1447085166869, "learning_rate": 4.543973701021109e-11, "loss": 0.1285, "step": 7610 }, { "epoch": 1.9952509620895766, "grad_norm": 3.617450574006755, "learning_rate": 2.7132487738223964e-11, "loss": 0.1182, "step": 7615 }, { "epoch": 1.9965610415131416, "grad_norm": 5.8602211913393445, "learning_rate": 1.3519332522471393e-11, "loss": 0.1203, "step": 7620 }, { "epoch": 1.9978711209367068, "grad_norm": 7.209657643570803, "learning_rate": 4.6003352661694304e-12, "loss": 0.1478, "step": 7625 }, { "epoch": 1.9991812003602718, "grad_norm": 4.426426765312953, "learning_rate": 3.7553783716059993e-13, "loss": 0.1669, "step": 7630 } ], "logging_steps": 5, "max_steps": 7632, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }