{ "best_metric": 0.9987980723381042, "best_model_checkpoint": "./SmolVLM-500M-Base_findit_caption_0.0.4-vqav/checkpoint-14750", "epoch": 2.9998486759142495, "eval_steps": 250, "global_step": 14868, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005044136191677175, "grad_norm": 10.0625, "learning_rate": 5e-05, "loss": 2.5412, "step": 25 }, { "epoch": 0.01008827238335435, "grad_norm": 4.28125, "learning_rate": 0.0001, "loss": 1.8289, "step": 50 }, { "epoch": 0.015132408575031526, "grad_norm": 3.71875, "learning_rate": 9.98312862734512e-05, "loss": 1.5273, "step": 75 }, { "epoch": 0.0201765447667087, "grad_norm": 3.703125, "learning_rate": 9.966257254690242e-05, "loss": 1.4195, "step": 100 }, { "epoch": 0.025220680958385876, "grad_norm": 4.03125, "learning_rate": 9.949385882035362e-05, "loss": 1.372, "step": 125 }, { "epoch": 0.03026481715006305, "grad_norm": 3.59375, "learning_rate": 9.932514509380484e-05, "loss": 1.363, "step": 150 }, { "epoch": 0.03530895334174023, "grad_norm": 3.484375, "learning_rate": 9.915643136725604e-05, "loss": 1.3175, "step": 175 }, { "epoch": 0.0403530895334174, "grad_norm": 3.34375, "learning_rate": 9.898771764070726e-05, "loss": 1.3161, "step": 200 }, { "epoch": 0.04539722572509458, "grad_norm": 3.578125, "learning_rate": 9.881900391415846e-05, "loss": 1.2853, "step": 225 }, { "epoch": 0.05044136191677175, "grad_norm": 3.5625, "learning_rate": 9.865029018760967e-05, "loss": 1.2497, "step": 250 }, { "epoch": 0.05044136191677175, "eval_loss": 1.26058828830719, "eval_runtime": 2714.8164, "eval_samples_per_second": 6.491, "eval_steps_per_second": 0.811, "step": 250 }, { "epoch": 0.05548549810844893, "grad_norm": 3.5, "learning_rate": 9.848157646106087e-05, "loss": 1.2615, "step": 275 }, { "epoch": 0.0605296343001261, "grad_norm": 3.4375, "learning_rate": 9.831286273451209e-05, "loss": 1.2301, "step": 300 }, { "epoch": 0.06557377049180328, "grad_norm": 3.46875, "learning_rate": 9.814414900796329e-05, "loss": 1.2175, "step": 325 }, { "epoch": 0.07061790668348046, "grad_norm": 3.34375, "learning_rate": 9.797543528141449e-05, "loss": 1.2218, "step": 350 }, { "epoch": 0.07566204287515763, "grad_norm": 3.6875, "learning_rate": 9.780672155486571e-05, "loss": 1.2164, "step": 375 }, { "epoch": 0.0807061790668348, "grad_norm": 4.03125, "learning_rate": 9.763800782831691e-05, "loss": 1.2188, "step": 400 }, { "epoch": 0.08575031525851198, "grad_norm": 3.421875, "learning_rate": 9.746929410176812e-05, "loss": 1.2073, "step": 425 }, { "epoch": 0.09079445145018916, "grad_norm": 3.5, "learning_rate": 9.730058037521933e-05, "loss": 1.1983, "step": 450 }, { "epoch": 0.09583858764186633, "grad_norm": 3.578125, "learning_rate": 9.713186664867054e-05, "loss": 1.1863, "step": 475 }, { "epoch": 0.1008827238335435, "grad_norm": 3.8125, "learning_rate": 9.696315292212174e-05, "loss": 1.2273, "step": 500 }, { "epoch": 0.1008827238335435, "eval_loss": 1.1868129968643188, "eval_runtime": 2650.4956, "eval_samples_per_second": 6.649, "eval_steps_per_second": 0.831, "step": 500 }, { "epoch": 0.10592686002522068, "grad_norm": 3.265625, "learning_rate": 9.679443919557296e-05, "loss": 1.202, "step": 525 }, { "epoch": 0.11097099621689786, "grad_norm": 3.359375, "learning_rate": 9.662572546902416e-05, "loss": 1.1873, "step": 550 }, { "epoch": 0.11601513240857503, "grad_norm": 3.5625, "learning_rate": 9.645701174247536e-05, "loss": 1.1843, "step": 575 }, { "epoch": 0.1210592686002522, "grad_norm": 3.5, "learning_rate": 9.628829801592658e-05, "loss": 1.1816, "step": 600 }, { "epoch": 0.12610340479192939, "grad_norm": 3.453125, "learning_rate": 9.611958428937778e-05, "loss": 1.1696, "step": 625 }, { "epoch": 0.13114754098360656, "grad_norm": 3.546875, "learning_rate": 9.595087056282899e-05, "loss": 1.1651, "step": 650 }, { "epoch": 0.13619167717528374, "grad_norm": 3.15625, "learning_rate": 9.57821568362802e-05, "loss": 1.1664, "step": 675 }, { "epoch": 0.14123581336696092, "grad_norm": 3.375, "learning_rate": 9.561344310973141e-05, "loss": 1.1586, "step": 700 }, { "epoch": 0.14627994955863807, "grad_norm": 4.0, "learning_rate": 9.544472938318261e-05, "loss": 1.1667, "step": 725 }, { "epoch": 0.15132408575031525, "grad_norm": 3.234375, "learning_rate": 9.527601565663383e-05, "loss": 1.148, "step": 750 }, { "epoch": 0.15132408575031525, "eval_loss": 1.1463068723678589, "eval_runtime": 3337.4601, "eval_samples_per_second": 5.28, "eval_steps_per_second": 0.66, "step": 750 }, { "epoch": 0.15636822194199243, "grad_norm": 3.265625, "learning_rate": 9.510730193008504e-05, "loss": 1.1414, "step": 775 }, { "epoch": 0.1614123581336696, "grad_norm": 3.375, "learning_rate": 9.493858820353624e-05, "loss": 1.1441, "step": 800 }, { "epoch": 0.1664564943253468, "grad_norm": 3.375, "learning_rate": 9.476987447698746e-05, "loss": 1.1367, "step": 825 }, { "epoch": 0.17150063051702397, "grad_norm": 3.515625, "learning_rate": 9.460116075043866e-05, "loss": 1.1433, "step": 850 }, { "epoch": 0.17654476670870115, "grad_norm": 3.328125, "learning_rate": 9.443244702388988e-05, "loss": 1.1344, "step": 875 }, { "epoch": 0.18158890290037832, "grad_norm": 3.390625, "learning_rate": 9.426373329734108e-05, "loss": 1.127, "step": 900 }, { "epoch": 0.18663303909205547, "grad_norm": 3.359375, "learning_rate": 9.409501957079229e-05, "loss": 1.1404, "step": 925 }, { "epoch": 0.19167717528373265, "grad_norm": 3.453125, "learning_rate": 9.39263058442435e-05, "loss": 1.1436, "step": 950 }, { "epoch": 0.19672131147540983, "grad_norm": 3.46875, "learning_rate": 9.375759211769471e-05, "loss": 1.1172, "step": 975 }, { "epoch": 0.201765447667087, "grad_norm": 3.140625, "learning_rate": 9.358887839114591e-05, "loss": 1.1291, "step": 1000 }, { "epoch": 0.201765447667087, "eval_loss": 1.1213735342025757, "eval_runtime": 5735.0857, "eval_samples_per_second": 3.073, "eval_steps_per_second": 0.384, "step": 1000 }, { "epoch": 0.2068095838587642, "grad_norm": 3.328125, "learning_rate": 9.342016466459713e-05, "loss": 1.1178, "step": 1025 }, { "epoch": 0.21185372005044137, "grad_norm": 3.15625, "learning_rate": 9.325145093804833e-05, "loss": 1.1042, "step": 1050 }, { "epoch": 0.21689785624211855, "grad_norm": 3.515625, "learning_rate": 9.308273721149953e-05, "loss": 1.1326, "step": 1075 }, { "epoch": 0.22194199243379573, "grad_norm": 3.046875, "learning_rate": 9.291402348495074e-05, "loss": 1.1147, "step": 1100 }, { "epoch": 0.22698612862547288, "grad_norm": 3.390625, "learning_rate": 9.274530975840195e-05, "loss": 1.1163, "step": 1125 }, { "epoch": 0.23203026481715006, "grad_norm": 3.171875, "learning_rate": 9.257659603185316e-05, "loss": 1.1017, "step": 1150 }, { "epoch": 0.23707440100882723, "grad_norm": 3.390625, "learning_rate": 9.240788230530436e-05, "loss": 1.1219, "step": 1175 }, { "epoch": 0.2421185372005044, "grad_norm": 3.1875, "learning_rate": 9.223916857875558e-05, "loss": 1.1066, "step": 1200 }, { "epoch": 0.2471626733921816, "grad_norm": 2.890625, "learning_rate": 9.207045485220678e-05, "loss": 1.1103, "step": 1225 }, { "epoch": 0.25220680958385877, "grad_norm": 3.265625, "learning_rate": 9.1901741125658e-05, "loss": 1.0914, "step": 1250 }, { "epoch": 0.25220680958385877, "eval_loss": 1.1013678312301636, "eval_runtime": 2597.4552, "eval_samples_per_second": 6.785, "eval_steps_per_second": 0.848, "step": 1250 }, { "epoch": 0.2572509457755359, "grad_norm": 3.59375, "learning_rate": 9.17330273991092e-05, "loss": 1.0972, "step": 1275 }, { "epoch": 0.26229508196721313, "grad_norm": 3.328125, "learning_rate": 9.156431367256041e-05, "loss": 1.0895, "step": 1300 }, { "epoch": 0.2673392181588903, "grad_norm": 3.265625, "learning_rate": 9.139559994601161e-05, "loss": 1.1197, "step": 1325 }, { "epoch": 0.2723833543505675, "grad_norm": 3.203125, "learning_rate": 9.122688621946282e-05, "loss": 1.1212, "step": 1350 }, { "epoch": 0.27742749054224464, "grad_norm": 3.390625, "learning_rate": 9.105817249291403e-05, "loss": 1.0781, "step": 1375 }, { "epoch": 0.28247162673392184, "grad_norm": 3.953125, "learning_rate": 9.088945876636523e-05, "loss": 1.1108, "step": 1400 }, { "epoch": 0.287515762925599, "grad_norm": 3.328125, "learning_rate": 9.072074503981645e-05, "loss": 1.0993, "step": 1425 }, { "epoch": 0.29255989911727615, "grad_norm": 3.515625, "learning_rate": 9.055203131326765e-05, "loss": 1.0897, "step": 1450 }, { "epoch": 0.29760403530895335, "grad_norm": 3.453125, "learning_rate": 9.038331758671886e-05, "loss": 1.0916, "step": 1475 }, { "epoch": 0.3026481715006305, "grad_norm": 3.125, "learning_rate": 9.021460386017007e-05, "loss": 1.0835, "step": 1500 }, { "epoch": 0.3026481715006305, "eval_loss": 1.0892547369003296, "eval_runtime": 2739.1182, "eval_samples_per_second": 6.434, "eval_steps_per_second": 0.804, "step": 1500 }, { "epoch": 0.3076923076923077, "grad_norm": 3.859375, "learning_rate": 9.004589013362128e-05, "loss": 1.0849, "step": 1525 }, { "epoch": 0.31273644388398486, "grad_norm": 3.5625, "learning_rate": 8.987717640707248e-05, "loss": 1.0843, "step": 1550 }, { "epoch": 0.31778058007566207, "grad_norm": 3.390625, "learning_rate": 8.97084626805237e-05, "loss": 1.0925, "step": 1575 }, { "epoch": 0.3228247162673392, "grad_norm": 3.3125, "learning_rate": 8.95397489539749e-05, "loss": 1.0789, "step": 1600 }, { "epoch": 0.32786885245901637, "grad_norm": 3.359375, "learning_rate": 8.93710352274261e-05, "loss": 1.0714, "step": 1625 }, { "epoch": 0.3329129886506936, "grad_norm": 3.390625, "learning_rate": 8.920232150087732e-05, "loss": 1.0821, "step": 1650 }, { "epoch": 0.3379571248423707, "grad_norm": 3.1875, "learning_rate": 8.903360777432852e-05, "loss": 1.1045, "step": 1675 }, { "epoch": 0.34300126103404793, "grad_norm": 3.40625, "learning_rate": 8.886489404777973e-05, "loss": 1.0963, "step": 1700 }, { "epoch": 0.3480453972257251, "grad_norm": 3.109375, "learning_rate": 8.869618032123093e-05, "loss": 1.0684, "step": 1725 }, { "epoch": 0.3530895334174023, "grad_norm": 3.546875, "learning_rate": 8.852746659468215e-05, "loss": 1.0834, "step": 1750 }, { "epoch": 0.3530895334174023, "eval_loss": 1.0766961574554443, "eval_runtime": 2601.6554, "eval_samples_per_second": 6.774, "eval_steps_per_second": 0.847, "step": 1750 }, { "epoch": 0.35813366960907944, "grad_norm": 3.59375, "learning_rate": 8.835875286813335e-05, "loss": 1.056, "step": 1775 }, { "epoch": 0.36317780580075665, "grad_norm": 3.40625, "learning_rate": 8.819003914158457e-05, "loss": 1.0648, "step": 1800 }, { "epoch": 0.3682219419924338, "grad_norm": 3.625, "learning_rate": 8.802132541503577e-05, "loss": 1.0544, "step": 1825 }, { "epoch": 0.37326607818411095, "grad_norm": 3.484375, "learning_rate": 8.785261168848697e-05, "loss": 1.0513, "step": 1850 }, { "epoch": 0.37831021437578816, "grad_norm": 3.5, "learning_rate": 8.768389796193819e-05, "loss": 1.0781, "step": 1875 }, { "epoch": 0.3833543505674653, "grad_norm": 3.34375, "learning_rate": 8.751518423538939e-05, "loss": 1.0582, "step": 1900 }, { "epoch": 0.3883984867591425, "grad_norm": 3.34375, "learning_rate": 8.73464705088406e-05, "loss": 1.0595, "step": 1925 }, { "epoch": 0.39344262295081966, "grad_norm": 3.296875, "learning_rate": 8.71777567822918e-05, "loss": 1.0591, "step": 1950 }, { "epoch": 0.39848675914249687, "grad_norm": 3.34375, "learning_rate": 8.700904305574302e-05, "loss": 1.064, "step": 1975 }, { "epoch": 0.403530895334174, "grad_norm": 3.25, "learning_rate": 8.684032932919422e-05, "loss": 1.0684, "step": 2000 }, { "epoch": 0.403530895334174, "eval_loss": 1.0674290657043457, "eval_runtime": 2598.42, "eval_samples_per_second": 6.782, "eval_steps_per_second": 0.848, "step": 2000 }, { "epoch": 0.4085750315258512, "grad_norm": 3.421875, "learning_rate": 8.667161560264544e-05, "loss": 1.0696, "step": 2025 }, { "epoch": 0.4136191677175284, "grad_norm": 3.484375, "learning_rate": 8.650290187609664e-05, "loss": 1.062, "step": 2050 }, { "epoch": 0.41866330390920553, "grad_norm": 3.375, "learning_rate": 8.633418814954785e-05, "loss": 1.097, "step": 2075 }, { "epoch": 0.42370744010088274, "grad_norm": 3.71875, "learning_rate": 8.616547442299905e-05, "loss": 1.0464, "step": 2100 }, { "epoch": 0.4287515762925599, "grad_norm": 3.5, "learning_rate": 8.599676069645026e-05, "loss": 1.0775, "step": 2125 }, { "epoch": 0.4337957124842371, "grad_norm": 3.34375, "learning_rate": 8.582804696990147e-05, "loss": 1.0548, "step": 2150 }, { "epoch": 0.43883984867591425, "grad_norm": 3.3125, "learning_rate": 8.565933324335269e-05, "loss": 1.0734, "step": 2175 }, { "epoch": 0.44388398486759145, "grad_norm": 3.328125, "learning_rate": 8.549061951680389e-05, "loss": 1.0716, "step": 2200 }, { "epoch": 0.4489281210592686, "grad_norm": 3.640625, "learning_rate": 8.53219057902551e-05, "loss": 1.055, "step": 2225 }, { "epoch": 0.45397225725094575, "grad_norm": 3.53125, "learning_rate": 8.51531920637063e-05, "loss": 1.0621, "step": 2250 }, { "epoch": 0.45397225725094575, "eval_loss": 1.0594151020050049, "eval_runtime": 2737.066, "eval_samples_per_second": 6.439, "eval_steps_per_second": 0.805, "step": 2250 }, { "epoch": 0.45901639344262296, "grad_norm": 3.203125, "learning_rate": 8.498447833715752e-05, "loss": 1.0418, "step": 2275 }, { "epoch": 0.4640605296343001, "grad_norm": 3.34375, "learning_rate": 8.481576461060872e-05, "loss": 1.042, "step": 2300 }, { "epoch": 0.4691046658259773, "grad_norm": 3.296875, "learning_rate": 8.464705088405994e-05, "loss": 1.0576, "step": 2325 }, { "epoch": 0.47414880201765447, "grad_norm": 3.78125, "learning_rate": 8.447833715751114e-05, "loss": 1.0704, "step": 2350 }, { "epoch": 0.4791929382093317, "grad_norm": 3.359375, "learning_rate": 8.430962343096235e-05, "loss": 1.0524, "step": 2375 }, { "epoch": 0.4842370744010088, "grad_norm": 3.28125, "learning_rate": 8.414090970441356e-05, "loss": 1.0539, "step": 2400 }, { "epoch": 0.489281210592686, "grad_norm": 3.578125, "learning_rate": 8.397219597786477e-05, "loss": 1.0612, "step": 2425 }, { "epoch": 0.4943253467843632, "grad_norm": 3.28125, "learning_rate": 8.380348225131597e-05, "loss": 1.0651, "step": 2450 }, { "epoch": 0.49936948297604034, "grad_norm": 3.734375, "learning_rate": 8.363476852476719e-05, "loss": 1.0346, "step": 2475 }, { "epoch": 0.5044136191677175, "grad_norm": 3.296875, "learning_rate": 8.346605479821839e-05, "loss": 1.0511, "step": 2500 }, { "epoch": 0.5044136191677175, "eval_loss": 1.0523473024368286, "eval_runtime": 2599.9667, "eval_samples_per_second": 6.778, "eval_steps_per_second": 0.847, "step": 2500 }, { "epoch": 0.5094577553593947, "grad_norm": 3.34375, "learning_rate": 8.32973410716696e-05, "loss": 1.0441, "step": 2525 }, { "epoch": 0.5145018915510718, "grad_norm": 3.109375, "learning_rate": 8.31286273451208e-05, "loss": 1.0599, "step": 2550 }, { "epoch": 0.519546027742749, "grad_norm": 3.640625, "learning_rate": 8.295991361857202e-05, "loss": 1.0527, "step": 2575 }, { "epoch": 0.5245901639344263, "grad_norm": 3.15625, "learning_rate": 8.279119989202322e-05, "loss": 1.0384, "step": 2600 }, { "epoch": 0.5296343001261034, "grad_norm": 3.421875, "learning_rate": 8.262248616547442e-05, "loss": 1.0541, "step": 2625 }, { "epoch": 0.5346784363177806, "grad_norm": 3.46875, "learning_rate": 8.245377243892564e-05, "loss": 1.0584, "step": 2650 }, { "epoch": 0.5397225725094578, "grad_norm": 3.1875, "learning_rate": 8.228505871237684e-05, "loss": 1.0651, "step": 2675 }, { "epoch": 0.544766708701135, "grad_norm": 3.21875, "learning_rate": 8.211634498582806e-05, "loss": 1.0584, "step": 2700 }, { "epoch": 0.5498108448928121, "grad_norm": 3.484375, "learning_rate": 8.194763125927926e-05, "loss": 1.0356, "step": 2725 }, { "epoch": 0.5548549810844893, "grad_norm": 3.234375, "learning_rate": 8.177891753273047e-05, "loss": 1.0485, "step": 2750 }, { "epoch": 0.5548549810844893, "eval_loss": 1.0466351509094238, "eval_runtime": 2598.483, "eval_samples_per_second": 6.782, "eval_steps_per_second": 0.848, "step": 2750 }, { "epoch": 0.5598991172761665, "grad_norm": 3.203125, "learning_rate": 8.161020380618167e-05, "loss": 1.0453, "step": 2775 }, { "epoch": 0.5649432534678437, "grad_norm": 4.09375, "learning_rate": 8.144149007963289e-05, "loss": 1.0666, "step": 2800 }, { "epoch": 0.5699873896595208, "grad_norm": 3.5, "learning_rate": 8.127277635308409e-05, "loss": 1.0513, "step": 2825 }, { "epoch": 0.575031525851198, "grad_norm": 3.21875, "learning_rate": 8.11040626265353e-05, "loss": 1.0373, "step": 2850 }, { "epoch": 0.5800756620428752, "grad_norm": 3.25, "learning_rate": 8.093534889998651e-05, "loss": 1.0368, "step": 2875 }, { "epoch": 0.5851197982345523, "grad_norm": 3.328125, "learning_rate": 8.076663517343771e-05, "loss": 1.051, "step": 2900 }, { "epoch": 0.5901639344262295, "grad_norm": 3.40625, "learning_rate": 8.059792144688893e-05, "loss": 1.0537, "step": 2925 }, { "epoch": 0.5952080706179067, "grad_norm": 3.5, "learning_rate": 8.042920772034013e-05, "loss": 1.0573, "step": 2950 }, { "epoch": 0.6002522068095839, "grad_norm": 3.65625, "learning_rate": 8.026049399379134e-05, "loss": 1.0479, "step": 2975 }, { "epoch": 0.605296343001261, "grad_norm": 3.203125, "learning_rate": 8.009178026724254e-05, "loss": 1.0517, "step": 3000 }, { "epoch": 0.605296343001261, "eval_loss": 1.0410985946655273, "eval_runtime": 2602.3541, "eval_samples_per_second": 6.772, "eval_steps_per_second": 0.847, "step": 3000 }, { "epoch": 0.6103404791929382, "grad_norm": 3.703125, "learning_rate": 7.992306654069376e-05, "loss": 1.0418, "step": 3025 }, { "epoch": 0.6153846153846154, "grad_norm": 3.546875, "learning_rate": 7.975435281414496e-05, "loss": 1.0177, "step": 3050 }, { "epoch": 0.6204287515762925, "grad_norm": 3.375, "learning_rate": 7.958563908759618e-05, "loss": 1.043, "step": 3075 }, { "epoch": 0.6254728877679697, "grad_norm": 3.625, "learning_rate": 7.941692536104738e-05, "loss": 1.0393, "step": 3100 }, { "epoch": 0.6305170239596469, "grad_norm": 3.328125, "learning_rate": 7.924821163449858e-05, "loss": 1.063, "step": 3125 }, { "epoch": 0.6355611601513241, "grad_norm": 3.171875, "learning_rate": 7.90794979079498e-05, "loss": 1.0332, "step": 3150 }, { "epoch": 0.6406052963430012, "grad_norm": 3.734375, "learning_rate": 7.8910784181401e-05, "loss": 1.0415, "step": 3175 }, { "epoch": 0.6456494325346784, "grad_norm": 3.21875, "learning_rate": 7.874207045485221e-05, "loss": 1.0348, "step": 3200 }, { "epoch": 0.6506935687263556, "grad_norm": 3.875, "learning_rate": 7.857335672830341e-05, "loss": 1.0513, "step": 3225 }, { "epoch": 0.6557377049180327, "grad_norm": 3.15625, "learning_rate": 7.840464300175463e-05, "loss": 1.0292, "step": 3250 }, { "epoch": 0.6557377049180327, "eval_loss": 1.0367991924285889, "eval_runtime": 2665.705, "eval_samples_per_second": 6.611, "eval_steps_per_second": 0.826, "step": 3250 }, { "epoch": 0.6607818411097099, "grad_norm": 3.5, "learning_rate": 7.823592927520583e-05, "loss": 1.019, "step": 3275 }, { "epoch": 0.6658259773013872, "grad_norm": 3.4375, "learning_rate": 7.806721554865704e-05, "loss": 1.0145, "step": 3300 }, { "epoch": 0.6708701134930644, "grad_norm": 3.59375, "learning_rate": 7.789850182210825e-05, "loss": 1.0241, "step": 3325 }, { "epoch": 0.6759142496847415, "grad_norm": 3.328125, "learning_rate": 7.772978809555946e-05, "loss": 1.014, "step": 3350 }, { "epoch": 0.6809583858764187, "grad_norm": 3.296875, "learning_rate": 7.756107436901066e-05, "loss": 1.0579, "step": 3375 }, { "epoch": 0.6860025220680959, "grad_norm": 3.546875, "learning_rate": 7.739236064246186e-05, "loss": 1.021, "step": 3400 }, { "epoch": 0.691046658259773, "grad_norm": 3.625, "learning_rate": 7.722364691591308e-05, "loss": 1.0294, "step": 3425 }, { "epoch": 0.6960907944514502, "grad_norm": 3.40625, "learning_rate": 7.705493318936428e-05, "loss": 1.0548, "step": 3450 }, { "epoch": 0.7011349306431274, "grad_norm": 3.5, "learning_rate": 7.68862194628155e-05, "loss": 1.0318, "step": 3475 }, { "epoch": 0.7061790668348046, "grad_norm": 3.546875, "learning_rate": 7.67175057362667e-05, "loss": 1.0398, "step": 3500 }, { "epoch": 0.7061790668348046, "eval_loss": 1.0327616930007935, "eval_runtime": 2661.2688, "eval_samples_per_second": 6.622, "eval_steps_per_second": 0.828, "step": 3500 }, { "epoch": 0.7112232030264817, "grad_norm": 3.640625, "learning_rate": 7.654879200971791e-05, "loss": 1.048, "step": 3525 }, { "epoch": 0.7162673392181589, "grad_norm": 3.4375, "learning_rate": 7.638007828316912e-05, "loss": 1.0229, "step": 3550 }, { "epoch": 0.7213114754098361, "grad_norm": 3.4375, "learning_rate": 7.621136455662033e-05, "loss": 1.0286, "step": 3575 }, { "epoch": 0.7263556116015133, "grad_norm": 3.65625, "learning_rate": 7.604265083007153e-05, "loss": 1.0352, "step": 3600 }, { "epoch": 0.7313997477931904, "grad_norm": 3.09375, "learning_rate": 7.587393710352275e-05, "loss": 1.021, "step": 3625 }, { "epoch": 0.7364438839848676, "grad_norm": 3.40625, "learning_rate": 7.570522337697395e-05, "loss": 1.0473, "step": 3650 }, { "epoch": 0.7414880201765448, "grad_norm": 3.609375, "learning_rate": 7.553650965042516e-05, "loss": 1.0262, "step": 3675 }, { "epoch": 0.7465321563682219, "grad_norm": 3.734375, "learning_rate": 7.536779592387637e-05, "loss": 1.0147, "step": 3700 }, { "epoch": 0.7515762925598991, "grad_norm": 3.328125, "learning_rate": 7.519908219732758e-05, "loss": 1.0285, "step": 3725 }, { "epoch": 0.7566204287515763, "grad_norm": 3.65625, "learning_rate": 7.503036847077878e-05, "loss": 1.0199, "step": 3750 }, { "epoch": 0.7566204287515763, "eval_loss": 1.0295381546020508, "eval_runtime": 2633.8879, "eval_samples_per_second": 6.691, "eval_steps_per_second": 0.836, "step": 3750 }, { "epoch": 0.7616645649432535, "grad_norm": 3.1875, "learning_rate": 7.486165474423e-05, "loss": 1.0574, "step": 3775 }, { "epoch": 0.7667087011349306, "grad_norm": 3.453125, "learning_rate": 7.46929410176812e-05, "loss": 1.0569, "step": 3800 }, { "epoch": 0.7717528373266078, "grad_norm": 3.625, "learning_rate": 7.452422729113241e-05, "loss": 1.0223, "step": 3825 }, { "epoch": 0.776796973518285, "grad_norm": 3.4375, "learning_rate": 7.435551356458363e-05, "loss": 1.0172, "step": 3850 }, { "epoch": 0.7818411097099621, "grad_norm": 3.5625, "learning_rate": 7.418679983803483e-05, "loss": 1.0369, "step": 3875 }, { "epoch": 0.7868852459016393, "grad_norm": 3.390625, "learning_rate": 7.401808611148603e-05, "loss": 1.0202, "step": 3900 }, { "epoch": 0.7919293820933165, "grad_norm": 3.359375, "learning_rate": 7.384937238493725e-05, "loss": 1.0563, "step": 3925 }, { "epoch": 0.7969735182849937, "grad_norm": 3.46875, "learning_rate": 7.368065865838845e-05, "loss": 1.0202, "step": 3950 }, { "epoch": 0.8020176544766708, "grad_norm": 3.625, "learning_rate": 7.351194493183967e-05, "loss": 1.0308, "step": 3975 }, { "epoch": 0.807061790668348, "grad_norm": 4.375, "learning_rate": 7.334323120529087e-05, "loss": 1.032, "step": 4000 }, { "epoch": 0.807061790668348, "eval_loss": 1.0260218381881714, "eval_runtime": 2634.9851, "eval_samples_per_second": 6.688, "eval_steps_per_second": 0.836, "step": 4000 }, { "epoch": 0.8121059268600253, "grad_norm": 3.359375, "learning_rate": 7.317451747874208e-05, "loss": 1.0182, "step": 4025 }, { "epoch": 0.8171500630517023, "grad_norm": 3.40625, "learning_rate": 7.300580375219328e-05, "loss": 1.0022, "step": 4050 }, { "epoch": 0.8221941992433796, "grad_norm": 3.484375, "learning_rate": 7.28370900256445e-05, "loss": 1.0151, "step": 4075 }, { "epoch": 0.8272383354350568, "grad_norm": 3.40625, "learning_rate": 7.26683762990957e-05, "loss": 1.0147, "step": 4100 }, { "epoch": 0.832282471626734, "grad_norm": 3.453125, "learning_rate": 7.249966257254692e-05, "loss": 1.0347, "step": 4125 }, { "epoch": 0.8373266078184111, "grad_norm": 3.34375, "learning_rate": 7.233094884599812e-05, "loss": 1.0056, "step": 4150 }, { "epoch": 0.8423707440100883, "grad_norm": 3.96875, "learning_rate": 7.216223511944932e-05, "loss": 1.0435, "step": 4175 }, { "epoch": 0.8474148802017655, "grad_norm": 3.390625, "learning_rate": 7.199352139290053e-05, "loss": 1.0383, "step": 4200 }, { "epoch": 0.8524590163934426, "grad_norm": 3.359375, "learning_rate": 7.182480766635174e-05, "loss": 1.0207, "step": 4225 }, { "epoch": 0.8575031525851198, "grad_norm": 3.40625, "learning_rate": 7.165609393980295e-05, "loss": 1.0345, "step": 4250 }, { "epoch": 0.8575031525851198, "eval_loss": 1.0239174365997314, "eval_runtime": 2640.3041, "eval_samples_per_second": 6.675, "eval_steps_per_second": 0.834, "step": 4250 }, { "epoch": 0.862547288776797, "grad_norm": 3.390625, "learning_rate": 7.148738021325415e-05, "loss": 1.0139, "step": 4275 }, { "epoch": 0.8675914249684742, "grad_norm": 3.5, "learning_rate": 7.131866648670537e-05, "loss": 1.0284, "step": 4300 }, { "epoch": 0.8726355611601513, "grad_norm": 3.390625, "learning_rate": 7.114995276015657e-05, "loss": 1.0265, "step": 4325 }, { "epoch": 0.8776796973518285, "grad_norm": 3.5, "learning_rate": 7.098123903360778e-05, "loss": 1.0122, "step": 4350 }, { "epoch": 0.8827238335435057, "grad_norm": 3.359375, "learning_rate": 7.081252530705899e-05, "loss": 1.0295, "step": 4375 }, { "epoch": 0.8877679697351829, "grad_norm": 3.484375, "learning_rate": 7.064381158051019e-05, "loss": 1.0037, "step": 4400 }, { "epoch": 0.89281210592686, "grad_norm": 3.53125, "learning_rate": 7.04750978539614e-05, "loss": 1.0188, "step": 4425 }, { "epoch": 0.8978562421185372, "grad_norm": 3.265625, "learning_rate": 7.03063841274126e-05, "loss": 1.0306, "step": 4450 }, { "epoch": 0.9029003783102144, "grad_norm": 3.375, "learning_rate": 7.013767040086382e-05, "loss": 1.0231, "step": 4475 }, { "epoch": 0.9079445145018915, "grad_norm": 3.46875, "learning_rate": 6.996895667431502e-05, "loss": 1.0166, "step": 4500 }, { "epoch": 0.9079445145018915, "eval_loss": 1.0209671258926392, "eval_runtime": 2504.9105, "eval_samples_per_second": 7.035, "eval_steps_per_second": 0.879, "step": 4500 }, { "epoch": 0.9129886506935687, "grad_norm": 3.640625, "learning_rate": 6.980024294776624e-05, "loss": 1.0057, "step": 4525 }, { "epoch": 0.9180327868852459, "grad_norm": 3.4375, "learning_rate": 6.963152922121744e-05, "loss": 1.0199, "step": 4550 }, { "epoch": 0.9230769230769231, "grad_norm": 3.40625, "learning_rate": 6.946281549466865e-05, "loss": 1.0126, "step": 4575 }, { "epoch": 0.9281210592686002, "grad_norm": 3.8125, "learning_rate": 6.929410176811986e-05, "loss": 1.0117, "step": 4600 }, { "epoch": 0.9331651954602774, "grad_norm": 3.53125, "learning_rate": 6.912538804157107e-05, "loss": 1.0299, "step": 4625 }, { "epoch": 0.9382093316519546, "grad_norm": 3.59375, "learning_rate": 6.895667431502227e-05, "loss": 0.9995, "step": 4650 }, { "epoch": 0.9432534678436317, "grad_norm": 3.6875, "learning_rate": 6.878796058847347e-05, "loss": 1.0298, "step": 4675 }, { "epoch": 0.9482976040353089, "grad_norm": 3.328125, "learning_rate": 6.861924686192469e-05, "loss": 1.0156, "step": 4700 }, { "epoch": 0.9533417402269861, "grad_norm": 3.578125, "learning_rate": 6.845053313537589e-05, "loss": 0.9942, "step": 4725 }, { "epoch": 0.9583858764186634, "grad_norm": 3.5, "learning_rate": 6.82818194088271e-05, "loss": 1.0058, "step": 4750 }, { "epoch": 0.9583858764186634, "eval_loss": 1.0182453393936157, "eval_runtime": 2488.3536, "eval_samples_per_second": 7.082, "eval_steps_per_second": 0.885, "step": 4750 }, { "epoch": 0.9634300126103404, "grad_norm": 3.28125, "learning_rate": 6.811310568227831e-05, "loss": 1.0126, "step": 4775 }, { "epoch": 0.9684741488020177, "grad_norm": 3.265625, "learning_rate": 6.794439195572952e-05, "loss": 1.0137, "step": 4800 }, { "epoch": 0.9735182849936949, "grad_norm": 3.359375, "learning_rate": 6.777567822918072e-05, "loss": 1.0129, "step": 4825 }, { "epoch": 0.978562421185372, "grad_norm": 3.609375, "learning_rate": 6.760696450263194e-05, "loss": 1.054, "step": 4850 }, { "epoch": 0.9836065573770492, "grad_norm": 3.53125, "learning_rate": 6.743825077608314e-05, "loss": 1.0247, "step": 4875 }, { "epoch": 0.9886506935687264, "grad_norm": 3.484375, "learning_rate": 6.726953704953434e-05, "loss": 1.0272, "step": 4900 }, { "epoch": 0.9936948297604036, "grad_norm": 3.8125, "learning_rate": 6.710082332298556e-05, "loss": 1.0216, "step": 4925 }, { "epoch": 0.9987389659520807, "grad_norm": 3.3125, "learning_rate": 6.693210959643676e-05, "loss": 1.0327, "step": 4950 }, { "epoch": 1.003783102143758, "grad_norm": 4.53125, "learning_rate": 6.676339586988797e-05, "loss": 1.0073, "step": 4975 }, { "epoch": 1.008827238335435, "grad_norm": 3.40625, "learning_rate": 6.659468214333918e-05, "loss": 1.0093, "step": 5000 }, { "epoch": 1.008827238335435, "eval_loss": 1.0161057710647583, "eval_runtime": 2490.4001, "eval_samples_per_second": 7.076, "eval_steps_per_second": 0.885, "step": 5000 }, { "epoch": 1.0138713745271122, "grad_norm": 3.40625, "learning_rate": 6.642596841679039e-05, "loss": 0.9871, "step": 5025 }, { "epoch": 1.0189155107187895, "grad_norm": 3.515625, "learning_rate": 6.625725469024159e-05, "loss": 0.9819, "step": 5050 }, { "epoch": 1.0239596469104666, "grad_norm": 3.484375, "learning_rate": 6.608854096369281e-05, "loss": 1.0082, "step": 5075 }, { "epoch": 1.0290037831021437, "grad_norm": 3.34375, "learning_rate": 6.591982723714401e-05, "loss": 0.9819, "step": 5100 }, { "epoch": 1.034047919293821, "grad_norm": 3.640625, "learning_rate": 6.575111351059523e-05, "loss": 1.0117, "step": 5125 }, { "epoch": 1.039092055485498, "grad_norm": 3.515625, "learning_rate": 6.558239978404643e-05, "loss": 1.0002, "step": 5150 }, { "epoch": 1.0441361916771752, "grad_norm": 4.0625, "learning_rate": 6.541368605749764e-05, "loss": 0.9964, "step": 5175 }, { "epoch": 1.0491803278688525, "grad_norm": 3.5, "learning_rate": 6.524497233094884e-05, "loss": 1.005, "step": 5200 }, { "epoch": 1.0542244640605296, "grad_norm": 3.5, "learning_rate": 6.507625860440006e-05, "loss": 0.9538, "step": 5225 }, { "epoch": 1.0592686002522067, "grad_norm": 3.59375, "learning_rate": 6.490754487785127e-05, "loss": 1.0072, "step": 5250 }, { "epoch": 1.0592686002522067, "eval_loss": 1.0140247344970703, "eval_runtime": 2498.4403, "eval_samples_per_second": 7.054, "eval_steps_per_second": 0.882, "step": 5250 }, { "epoch": 1.064312736443884, "grad_norm": 3.578125, "learning_rate": 6.473883115130248e-05, "loss": 1.0155, "step": 5275 }, { "epoch": 1.0693568726355611, "grad_norm": 3.65625, "learning_rate": 6.457011742475369e-05, "loss": 0.9873, "step": 5300 }, { "epoch": 1.0744010088272384, "grad_norm": 3.890625, "learning_rate": 6.440140369820489e-05, "loss": 0.9941, "step": 5325 }, { "epoch": 1.0794451450189155, "grad_norm": 3.34375, "learning_rate": 6.423268997165611e-05, "loss": 0.9944, "step": 5350 }, { "epoch": 1.0844892812105926, "grad_norm": 3.65625, "learning_rate": 6.406397624510731e-05, "loss": 0.9859, "step": 5375 }, { "epoch": 1.08953341740227, "grad_norm": 3.6875, "learning_rate": 6.389526251855852e-05, "loss": 0.9876, "step": 5400 }, { "epoch": 1.094577553593947, "grad_norm": 3.921875, "learning_rate": 6.372654879200973e-05, "loss": 0.9935, "step": 5425 }, { "epoch": 1.0996216897856241, "grad_norm": 3.484375, "learning_rate": 6.355783506546093e-05, "loss": 1.0001, "step": 5450 }, { "epoch": 1.1046658259773015, "grad_norm": 3.546875, "learning_rate": 6.338912133891214e-05, "loss": 0.9805, "step": 5475 }, { "epoch": 1.1097099621689785, "grad_norm": 3.328125, "learning_rate": 6.322040761236334e-05, "loss": 0.9907, "step": 5500 }, { "epoch": 1.1097099621689785, "eval_loss": 1.0129742622375488, "eval_runtime": 2495.1599, "eval_samples_per_second": 7.063, "eval_steps_per_second": 0.883, "step": 5500 }, { "epoch": 1.1147540983606556, "grad_norm": 3.40625, "learning_rate": 6.305169388581456e-05, "loss": 0.985, "step": 5525 }, { "epoch": 1.119798234552333, "grad_norm": 3.546875, "learning_rate": 6.288298015926576e-05, "loss": 0.9905, "step": 5550 }, { "epoch": 1.12484237074401, "grad_norm": 3.46875, "learning_rate": 6.271426643271698e-05, "loss": 0.9828, "step": 5575 }, { "epoch": 1.1298865069356872, "grad_norm": 3.546875, "learning_rate": 6.254555270616818e-05, "loss": 0.9798, "step": 5600 }, { "epoch": 1.1349306431273645, "grad_norm": 3.40625, "learning_rate": 6.23768389796194e-05, "loss": 1.0083, "step": 5625 }, { "epoch": 1.1399747793190416, "grad_norm": 3.4375, "learning_rate": 6.22081252530706e-05, "loss": 1.0031, "step": 5650 }, { "epoch": 1.1450189155107189, "grad_norm": 3.515625, "learning_rate": 6.20394115265218e-05, "loss": 1.015, "step": 5675 }, { "epoch": 1.150063051702396, "grad_norm": 3.390625, "learning_rate": 6.187069779997301e-05, "loss": 1.0156, "step": 5700 }, { "epoch": 1.155107187894073, "grad_norm": 4.0, "learning_rate": 6.170198407342421e-05, "loss": 0.996, "step": 5725 }, { "epoch": 1.1601513240857504, "grad_norm": 3.8125, "learning_rate": 6.153327034687543e-05, "loss": 0.9816, "step": 5750 }, { "epoch": 1.1601513240857504, "eval_loss": 1.0113532543182373, "eval_runtime": 2559.0525, "eval_samples_per_second": 6.887, "eval_steps_per_second": 0.861, "step": 5750 }, { "epoch": 1.1651954602774275, "grad_norm": 3.578125, "learning_rate": 6.136455662032663e-05, "loss": 0.9989, "step": 5775 }, { "epoch": 1.1702395964691046, "grad_norm": 4.03125, "learning_rate": 6.119584289377785e-05, "loss": 0.988, "step": 5800 }, { "epoch": 1.175283732660782, "grad_norm": 3.578125, "learning_rate": 6.102712916722905e-05, "loss": 1.0024, "step": 5825 }, { "epoch": 1.180327868852459, "grad_norm": 3.640625, "learning_rate": 6.085841544068026e-05, "loss": 1.0147, "step": 5850 }, { "epoch": 1.1853720050441363, "grad_norm": 3.640625, "learning_rate": 6.0689701714131464e-05, "loss": 0.9908, "step": 5875 }, { "epoch": 1.1904161412358134, "grad_norm": 3.546875, "learning_rate": 6.052098798758268e-05, "loss": 0.9932, "step": 5900 }, { "epoch": 1.1954602774274905, "grad_norm": 3.375, "learning_rate": 6.035227426103388e-05, "loss": 1.0117, "step": 5925 }, { "epoch": 1.2005044136191678, "grad_norm": 4.09375, "learning_rate": 6.018356053448508e-05, "loss": 0.9878, "step": 5950 }, { "epoch": 1.205548549810845, "grad_norm": 3.375, "learning_rate": 6.00148468079363e-05, "loss": 0.9903, "step": 5975 }, { "epoch": 1.210592686002522, "grad_norm": 3.59375, "learning_rate": 5.98461330813875e-05, "loss": 0.9709, "step": 6000 }, { "epoch": 1.210592686002522, "eval_loss": 1.0105805397033691, "eval_runtime": 5253.3855, "eval_samples_per_second": 3.355, "eval_steps_per_second": 0.419, "step": 6000 }, { "epoch": 1.2156368221941993, "grad_norm": 3.515625, "learning_rate": 5.9677419354838715e-05, "loss": 1.0037, "step": 6025 }, { "epoch": 1.2206809583858764, "grad_norm": 4.3125, "learning_rate": 5.9508705628289916e-05, "loss": 0.996, "step": 6050 }, { "epoch": 1.2257250945775535, "grad_norm": 4.03125, "learning_rate": 5.933999190174113e-05, "loss": 0.9887, "step": 6075 }, { "epoch": 1.2307692307692308, "grad_norm": 3.296875, "learning_rate": 5.917127817519233e-05, "loss": 0.9932, "step": 6100 }, { "epoch": 1.235813366960908, "grad_norm": 3.640625, "learning_rate": 5.900256444864355e-05, "loss": 0.9932, "step": 6125 }, { "epoch": 1.240857503152585, "grad_norm": 3.6875, "learning_rate": 5.883385072209475e-05, "loss": 0.9999, "step": 6150 }, { "epoch": 1.2459016393442623, "grad_norm": 3.984375, "learning_rate": 5.866513699554596e-05, "loss": 0.9873, "step": 6175 }, { "epoch": 1.2509457755359394, "grad_norm": 3.75, "learning_rate": 5.849642326899717e-05, "loss": 0.9807, "step": 6200 }, { "epoch": 1.2559899117276165, "grad_norm": 3.640625, "learning_rate": 5.8327709542448375e-05, "loss": 0.9999, "step": 6225 }, { "epoch": 1.2610340479192939, "grad_norm": 3.3125, "learning_rate": 5.8158995815899583e-05, "loss": 1.0047, "step": 6250 }, { "epoch": 1.2610340479192939, "eval_loss": 1.0091475248336792, "eval_runtime": 2551.5125, "eval_samples_per_second": 6.907, "eval_steps_per_second": 0.863, "step": 6250 }, { "epoch": 1.266078184110971, "grad_norm": 3.5625, "learning_rate": 5.799028208935079e-05, "loss": 0.9834, "step": 6275 }, { "epoch": 1.271122320302648, "grad_norm": 3.640625, "learning_rate": 5.7821568362802e-05, "loss": 0.9886, "step": 6300 }, { "epoch": 1.2761664564943254, "grad_norm": 3.578125, "learning_rate": 5.765285463625321e-05, "loss": 0.9892, "step": 6325 }, { "epoch": 1.2812105926860025, "grad_norm": 3.5, "learning_rate": 5.748414090970442e-05, "loss": 0.967, "step": 6350 }, { "epoch": 1.2862547288776798, "grad_norm": 3.359375, "learning_rate": 5.7315427183155626e-05, "loss": 0.9905, "step": 6375 }, { "epoch": 1.2912988650693569, "grad_norm": 3.453125, "learning_rate": 5.714671345660684e-05, "loss": 0.9754, "step": 6400 }, { "epoch": 1.296343001261034, "grad_norm": 3.59375, "learning_rate": 5.697799973005804e-05, "loss": 1.0018, "step": 6425 }, { "epoch": 1.3013871374527113, "grad_norm": 3.421875, "learning_rate": 5.6809286003509244e-05, "loss": 0.9895, "step": 6450 }, { "epoch": 1.3064312736443884, "grad_norm": 3.53125, "learning_rate": 5.664057227696046e-05, "loss": 0.974, "step": 6475 }, { "epoch": 1.3114754098360657, "grad_norm": 3.59375, "learning_rate": 5.647185855041166e-05, "loss": 0.9929, "step": 6500 }, { "epoch": 1.3114754098360657, "eval_loss": 1.0080196857452393, "eval_runtime": 5107.0041, "eval_samples_per_second": 3.451, "eval_steps_per_second": 0.431, "step": 6500 }, { "epoch": 1.3165195460277428, "grad_norm": 3.734375, "learning_rate": 5.6303144823862876e-05, "loss": 0.985, "step": 6525 }, { "epoch": 1.3215636822194199, "grad_norm": 3.703125, "learning_rate": 5.613443109731408e-05, "loss": 0.9846, "step": 6550 }, { "epoch": 1.3266078184110972, "grad_norm": 3.46875, "learning_rate": 5.596571737076529e-05, "loss": 0.971, "step": 6575 }, { "epoch": 1.3316519546027743, "grad_norm": 3.71875, "learning_rate": 5.5797003644216495e-05, "loss": 0.9928, "step": 6600 }, { "epoch": 1.3366960907944514, "grad_norm": 3.921875, "learning_rate": 5.562828991766771e-05, "loss": 0.9873, "step": 6625 }, { "epoch": 1.3417402269861287, "grad_norm": 3.546875, "learning_rate": 5.545957619111891e-05, "loss": 0.9874, "step": 6650 }, { "epoch": 1.3467843631778058, "grad_norm": 3.484375, "learning_rate": 5.529086246457011e-05, "loss": 0.9819, "step": 6675 }, { "epoch": 1.351828499369483, "grad_norm": 3.640625, "learning_rate": 5.512214873802133e-05, "loss": 0.9959, "step": 6700 }, { "epoch": 1.3568726355611602, "grad_norm": 3.65625, "learning_rate": 5.495343501147253e-05, "loss": 1.0071, "step": 6725 }, { "epoch": 1.3619167717528373, "grad_norm": 3.609375, "learning_rate": 5.4784721284923745e-05, "loss": 0.9894, "step": 6750 }, { "epoch": 1.3619167717528373, "eval_loss": 1.0074087381362915, "eval_runtime": 2647.0951, "eval_samples_per_second": 6.657, "eval_steps_per_second": 0.832, "step": 6750 }, { "epoch": 1.3669609079445144, "grad_norm": 3.71875, "learning_rate": 5.461600755837495e-05, "loss": 0.989, "step": 6775 }, { "epoch": 1.3720050441361917, "grad_norm": 3.6875, "learning_rate": 5.444729383182616e-05, "loss": 0.9984, "step": 6800 }, { "epoch": 1.3770491803278688, "grad_norm": 3.578125, "learning_rate": 5.4278580105277363e-05, "loss": 1.0006, "step": 6825 }, { "epoch": 1.382093316519546, "grad_norm": 3.703125, "learning_rate": 5.410986637872858e-05, "loss": 0.9854, "step": 6850 }, { "epoch": 1.3871374527112232, "grad_norm": 3.625, "learning_rate": 5.394115265217978e-05, "loss": 0.9992, "step": 6875 }, { "epoch": 1.3921815889029003, "grad_norm": 3.515625, "learning_rate": 5.3772438925630996e-05, "loss": 0.9875, "step": 6900 }, { "epoch": 1.3972257250945774, "grad_norm": 3.65625, "learning_rate": 5.36037251990822e-05, "loss": 0.9766, "step": 6925 }, { "epoch": 1.4022698612862547, "grad_norm": 3.578125, "learning_rate": 5.3435011472533406e-05, "loss": 0.9862, "step": 6950 }, { "epoch": 1.4073139974779318, "grad_norm": 3.65625, "learning_rate": 5.3266297745984614e-05, "loss": 0.9886, "step": 6975 }, { "epoch": 1.4123581336696092, "grad_norm": 3.359375, "learning_rate": 5.309758401943582e-05, "loss": 0.9927, "step": 7000 }, { "epoch": 1.4123581336696092, "eval_loss": 1.0061756372451782, "eval_runtime": 2627.456, "eval_samples_per_second": 6.707, "eval_steps_per_second": 0.838, "step": 7000 }, { "epoch": 1.4174022698612863, "grad_norm": 3.484375, "learning_rate": 5.292887029288703e-05, "loss": 0.9727, "step": 7025 }, { "epoch": 1.4224464060529634, "grad_norm": 3.640625, "learning_rate": 5.276015656633824e-05, "loss": 1.0034, "step": 7050 }, { "epoch": 1.4274905422446407, "grad_norm": 3.609375, "learning_rate": 5.259144283978945e-05, "loss": 0.9734, "step": 7075 }, { "epoch": 1.4325346784363178, "grad_norm": 3.546875, "learning_rate": 5.2422729113240656e-05, "loss": 0.988, "step": 7100 }, { "epoch": 1.437578814627995, "grad_norm": 3.59375, "learning_rate": 5.225401538669187e-05, "loss": 0.9724, "step": 7125 }, { "epoch": 1.4426229508196722, "grad_norm": 3.765625, "learning_rate": 5.208530166014307e-05, "loss": 1.003, "step": 7150 }, { "epoch": 1.4476670870113493, "grad_norm": 3.625, "learning_rate": 5.191658793359429e-05, "loss": 0.9775, "step": 7175 }, { "epoch": 1.4527112232030266, "grad_norm": 3.765625, "learning_rate": 5.174787420704549e-05, "loss": 0.9876, "step": 7200 }, { "epoch": 1.4577553593947037, "grad_norm": 3.671875, "learning_rate": 5.157916048049669e-05, "loss": 0.9954, "step": 7225 }, { "epoch": 1.4627994955863808, "grad_norm": 3.921875, "learning_rate": 5.1410446753947907e-05, "loss": 0.9862, "step": 7250 }, { "epoch": 1.4627994955863808, "eval_loss": 1.005358338356018, "eval_runtime": 2629.6577, "eval_samples_per_second": 6.702, "eval_steps_per_second": 0.838, "step": 7250 }, { "epoch": 1.467843631778058, "grad_norm": 3.546875, "learning_rate": 5.124173302739911e-05, "loss": 0.9929, "step": 7275 }, { "epoch": 1.4728877679697352, "grad_norm": 3.515625, "learning_rate": 5.107301930085032e-05, "loss": 0.9981, "step": 7300 }, { "epoch": 1.4779319041614123, "grad_norm": 3.84375, "learning_rate": 5.0904305574301525e-05, "loss": 0.9872, "step": 7325 }, { "epoch": 1.4829760403530896, "grad_norm": 3.734375, "learning_rate": 5.073559184775274e-05, "loss": 0.99, "step": 7350 }, { "epoch": 1.4880201765447667, "grad_norm": 3.71875, "learning_rate": 5.056687812120394e-05, "loss": 0.9901, "step": 7375 }, { "epoch": 1.4930643127364438, "grad_norm": 3.96875, "learning_rate": 5.039816439465516e-05, "loss": 1.0028, "step": 7400 }, { "epoch": 1.4981084489281211, "grad_norm": 3.71875, "learning_rate": 5.022945066810636e-05, "loss": 0.9854, "step": 7425 }, { "epoch": 1.5031525851197982, "grad_norm": 3.640625, "learning_rate": 5.006073694155756e-05, "loss": 0.9833, "step": 7450 }, { "epoch": 1.5081967213114753, "grad_norm": 3.421875, "learning_rate": 4.9892023215008776e-05, "loss": 0.9745, "step": 7475 }, { "epoch": 1.5132408575031526, "grad_norm": 3.515625, "learning_rate": 4.9723309488459984e-05, "loss": 0.9844, "step": 7500 }, { "epoch": 1.5132408575031526, "eval_loss": 1.004678726196289, "eval_runtime": 2622.6745, "eval_samples_per_second": 6.719, "eval_steps_per_second": 0.84, "step": 7500 }, { "epoch": 1.5182849936948297, "grad_norm": 3.546875, "learning_rate": 4.955459576191119e-05, "loss": 0.9862, "step": 7525 }, { "epoch": 1.5233291298865068, "grad_norm": 3.578125, "learning_rate": 4.93858820353624e-05, "loss": 0.9821, "step": 7550 }, { "epoch": 1.5283732660781841, "grad_norm": 3.734375, "learning_rate": 4.92171683088136e-05, "loss": 0.9959, "step": 7575 }, { "epoch": 1.5334174022698615, "grad_norm": 3.65625, "learning_rate": 4.904845458226481e-05, "loss": 0.996, "step": 7600 }, { "epoch": 1.5384615384615383, "grad_norm": 3.640625, "learning_rate": 4.887974085571602e-05, "loss": 0.9815, "step": 7625 }, { "epoch": 1.5435056746532156, "grad_norm": 3.59375, "learning_rate": 4.871102712916723e-05, "loss": 0.9791, "step": 7650 }, { "epoch": 1.548549810844893, "grad_norm": 3.46875, "learning_rate": 4.8542313402618436e-05, "loss": 0.9842, "step": 7675 }, { "epoch": 1.5535939470365698, "grad_norm": 3.734375, "learning_rate": 4.8373599676069644e-05, "loss": 0.985, "step": 7700 }, { "epoch": 1.5586380832282472, "grad_norm": 3.59375, "learning_rate": 4.820488594952085e-05, "loss": 0.982, "step": 7725 }, { "epoch": 1.5636822194199245, "grad_norm": 3.3125, "learning_rate": 4.803617222297206e-05, "loss": 0.9976, "step": 7750 }, { "epoch": 1.5636822194199245, "eval_loss": 1.0039445161819458, "eval_runtime": 2622.6443, "eval_samples_per_second": 6.72, "eval_steps_per_second": 0.84, "step": 7750 }, { "epoch": 1.5687263556116016, "grad_norm": 3.609375, "learning_rate": 4.786745849642327e-05, "loss": 0.9789, "step": 7775 }, { "epoch": 1.5737704918032787, "grad_norm": 3.5625, "learning_rate": 4.7698744769874485e-05, "loss": 0.9861, "step": 7800 }, { "epoch": 1.578814627994956, "grad_norm": 3.578125, "learning_rate": 4.7530031043325687e-05, "loss": 0.988, "step": 7825 }, { "epoch": 1.583858764186633, "grad_norm": 3.53125, "learning_rate": 4.7361317316776895e-05, "loss": 0.9705, "step": 7850 }, { "epoch": 1.5889029003783102, "grad_norm": 3.65625, "learning_rate": 4.71926035902281e-05, "loss": 0.9678, "step": 7875 }, { "epoch": 1.5939470365699875, "grad_norm": 3.78125, "learning_rate": 4.702388986367931e-05, "loss": 0.983, "step": 7900 }, { "epoch": 1.5989911727616646, "grad_norm": 3.421875, "learning_rate": 4.685517613713052e-05, "loss": 0.9766, "step": 7925 }, { "epoch": 1.6040353089533417, "grad_norm": 3.671875, "learning_rate": 4.668646241058173e-05, "loss": 1.0102, "step": 7950 }, { "epoch": 1.609079445145019, "grad_norm": 4.0, "learning_rate": 4.651774868403294e-05, "loss": 0.9755, "step": 7975 }, { "epoch": 1.614123581336696, "grad_norm": 3.40625, "learning_rate": 4.6349034957484145e-05, "loss": 0.988, "step": 8000 }, { "epoch": 1.614123581336696, "eval_loss": 1.0029233694076538, "eval_runtime": 2632.3169, "eval_samples_per_second": 6.695, "eval_steps_per_second": 0.837, "step": 8000 }, { "epoch": 1.6191677175283732, "grad_norm": 3.3125, "learning_rate": 4.6180321230935354e-05, "loss": 0.9725, "step": 8025 }, { "epoch": 1.6242118537200505, "grad_norm": 3.9375, "learning_rate": 4.601160750438656e-05, "loss": 0.9906, "step": 8050 }, { "epoch": 1.6292559899117276, "grad_norm": 3.765625, "learning_rate": 4.584289377783777e-05, "loss": 0.9636, "step": 8075 }, { "epoch": 1.6343001261034047, "grad_norm": 3.796875, "learning_rate": 4.567418005128897e-05, "loss": 0.9932, "step": 8100 }, { "epoch": 1.639344262295082, "grad_norm": 3.546875, "learning_rate": 4.550546632474018e-05, "loss": 0.9741, "step": 8125 }, { "epoch": 1.644388398486759, "grad_norm": 3.625, "learning_rate": 4.533675259819139e-05, "loss": 0.9639, "step": 8150 }, { "epoch": 1.6494325346784362, "grad_norm": 3.625, "learning_rate": 4.51680388716426e-05, "loss": 0.9839, "step": 8175 }, { "epoch": 1.6544766708701135, "grad_norm": 3.53125, "learning_rate": 4.4999325145093806e-05, "loss": 0.977, "step": 8200 }, { "epoch": 1.6595208070617906, "grad_norm": 4.0625, "learning_rate": 4.4830611418545014e-05, "loss": 0.9848, "step": 8225 }, { "epoch": 1.6645649432534677, "grad_norm": 3.46875, "learning_rate": 4.466189769199622e-05, "loss": 0.9852, "step": 8250 }, { "epoch": 1.6645649432534677, "eval_loss": 1.0022705793380737, "eval_runtime": 2630.6679, "eval_samples_per_second": 6.699, "eval_steps_per_second": 0.837, "step": 8250 }, { "epoch": 1.669609079445145, "grad_norm": 3.8125, "learning_rate": 4.449318396544743e-05, "loss": 0.9993, "step": 8275 }, { "epoch": 1.6746532156368223, "grad_norm": 3.765625, "learning_rate": 4.432447023889864e-05, "loss": 0.9866, "step": 8300 }, { "epoch": 1.6796973518284992, "grad_norm": 3.71875, "learning_rate": 4.415575651234985e-05, "loss": 0.9697, "step": 8325 }, { "epoch": 1.6847414880201765, "grad_norm": 3.609375, "learning_rate": 4.398704278580105e-05, "loss": 0.9738, "step": 8350 }, { "epoch": 1.6897856242118539, "grad_norm": 3.828125, "learning_rate": 4.381832905925226e-05, "loss": 0.9755, "step": 8375 }, { "epoch": 1.694829760403531, "grad_norm": 3.46875, "learning_rate": 4.3649615332703467e-05, "loss": 0.9759, "step": 8400 }, { "epoch": 1.699873896595208, "grad_norm": 3.4375, "learning_rate": 4.3480901606154675e-05, "loss": 0.9921, "step": 8425 }, { "epoch": 1.7049180327868854, "grad_norm": 3.546875, "learning_rate": 4.3312187879605883e-05, "loss": 1.0079, "step": 8450 }, { "epoch": 1.7099621689785625, "grad_norm": 3.609375, "learning_rate": 4.314347415305709e-05, "loss": 0.963, "step": 8475 }, { "epoch": 1.7150063051702396, "grad_norm": 3.65625, "learning_rate": 4.297476042650831e-05, "loss": 0.9687, "step": 8500 }, { "epoch": 1.7150063051702396, "eval_loss": 1.0018465518951416, "eval_runtime": 2638.7528, "eval_samples_per_second": 6.679, "eval_steps_per_second": 0.835, "step": 8500 }, { "epoch": 1.7200504413619169, "grad_norm": 3.765625, "learning_rate": 4.2806046699959515e-05, "loss": 0.9678, "step": 8525 }, { "epoch": 1.725094577553594, "grad_norm": 3.859375, "learning_rate": 4.2637332973410724e-05, "loss": 0.988, "step": 8550 }, { "epoch": 1.730138713745271, "grad_norm": 3.921875, "learning_rate": 4.246861924686193e-05, "loss": 0.9767, "step": 8575 }, { "epoch": 1.7351828499369484, "grad_norm": 3.59375, "learning_rate": 4.2299905520313134e-05, "loss": 0.9997, "step": 8600 }, { "epoch": 1.7402269861286255, "grad_norm": 3.625, "learning_rate": 4.213119179376434e-05, "loss": 0.9696, "step": 8625 }, { "epoch": 1.7452711223203026, "grad_norm": 3.40625, "learning_rate": 4.196247806721555e-05, "loss": 0.9881, "step": 8650 }, { "epoch": 1.7503152585119799, "grad_norm": 3.9375, "learning_rate": 4.179376434066676e-05, "loss": 0.9776, "step": 8675 }, { "epoch": 1.755359394703657, "grad_norm": 3.75, "learning_rate": 4.162505061411797e-05, "loss": 0.9754, "step": 8700 }, { "epoch": 1.760403530895334, "grad_norm": 3.515625, "learning_rate": 4.1456336887569176e-05, "loss": 0.9804, "step": 8725 }, { "epoch": 1.7654476670870114, "grad_norm": 3.5625, "learning_rate": 4.1287623161020384e-05, "loss": 0.9887, "step": 8750 }, { "epoch": 1.7654476670870114, "eval_loss": 1.0013858079910278, "eval_runtime": 2630.3176, "eval_samples_per_second": 6.7, "eval_steps_per_second": 0.838, "step": 8750 }, { "epoch": 1.7704918032786885, "grad_norm": 3.71875, "learning_rate": 4.111890943447159e-05, "loss": 0.9991, "step": 8775 }, { "epoch": 1.7755359394703656, "grad_norm": 3.484375, "learning_rate": 4.09501957079228e-05, "loss": 0.9746, "step": 8800 }, { "epoch": 1.780580075662043, "grad_norm": 3.546875, "learning_rate": 4.078148198137401e-05, "loss": 0.9773, "step": 8825 }, { "epoch": 1.78562421185372, "grad_norm": 3.625, "learning_rate": 4.061276825482521e-05, "loss": 0.9968, "step": 8850 }, { "epoch": 1.790668348045397, "grad_norm": 4.03125, "learning_rate": 4.044405452827642e-05, "loss": 0.9992, "step": 8875 }, { "epoch": 1.7957124842370744, "grad_norm": 3.609375, "learning_rate": 4.027534080172763e-05, "loss": 0.9867, "step": 8900 }, { "epoch": 1.8007566204287517, "grad_norm": 3.65625, "learning_rate": 4.0106627075178837e-05, "loss": 0.9705, "step": 8925 }, { "epoch": 1.8058007566204286, "grad_norm": 3.859375, "learning_rate": 3.9937913348630045e-05, "loss": 0.9975, "step": 8950 }, { "epoch": 1.810844892812106, "grad_norm": 3.703125, "learning_rate": 3.976919962208125e-05, "loss": 0.9825, "step": 8975 }, { "epoch": 1.8158890290037832, "grad_norm": 3.5625, "learning_rate": 3.960048589553246e-05, "loss": 0.9902, "step": 9000 }, { "epoch": 1.8158890290037832, "eval_loss": 1.0009592771530151, "eval_runtime": 2632.3183, "eval_samples_per_second": 6.695, "eval_steps_per_second": 0.837, "step": 9000 }, { "epoch": 1.8209331651954601, "grad_norm": 3.78125, "learning_rate": 3.943177216898367e-05, "loss": 0.9924, "step": 9025 }, { "epoch": 1.8259773013871374, "grad_norm": 3.875, "learning_rate": 3.926305844243488e-05, "loss": 0.9796, "step": 9050 }, { "epoch": 1.8310214375788147, "grad_norm": 3.46875, "learning_rate": 3.909434471588609e-05, "loss": 0.9698, "step": 9075 }, { "epoch": 1.8360655737704918, "grad_norm": 3.96875, "learning_rate": 3.892563098933729e-05, "loss": 0.9986, "step": 9100 }, { "epoch": 1.841109709962169, "grad_norm": 3.4375, "learning_rate": 3.87569172627885e-05, "loss": 0.9999, "step": 9125 }, { "epoch": 1.8461538461538463, "grad_norm": 3.921875, "learning_rate": 3.8588203536239705e-05, "loss": 0.9804, "step": 9150 }, { "epoch": 1.8511979823455234, "grad_norm": 3.625, "learning_rate": 3.8419489809690914e-05, "loss": 0.9845, "step": 9175 }, { "epoch": 1.8562421185372004, "grad_norm": 3.5625, "learning_rate": 3.825077608314213e-05, "loss": 0.9703, "step": 9200 }, { "epoch": 1.8612862547288778, "grad_norm": 3.84375, "learning_rate": 3.808206235659334e-05, "loss": 0.986, "step": 9225 }, { "epoch": 1.8663303909205549, "grad_norm": 3.671875, "learning_rate": 3.7913348630044546e-05, "loss": 0.9995, "step": 9250 }, { "epoch": 1.8663303909205549, "eval_loss": 1.000319004058838, "eval_runtime": 2648.6022, "eval_samples_per_second": 6.654, "eval_steps_per_second": 0.832, "step": 9250 }, { "epoch": 1.871374527112232, "grad_norm": 3.6875, "learning_rate": 3.7744634903495754e-05, "loss": 0.9635, "step": 9275 }, { "epoch": 1.8764186633039093, "grad_norm": 3.484375, "learning_rate": 3.757592117694696e-05, "loss": 0.9921, "step": 9300 }, { "epoch": 1.8814627994955864, "grad_norm": 3.5625, "learning_rate": 3.740720745039817e-05, "loss": 0.9752, "step": 9325 }, { "epoch": 1.8865069356872635, "grad_norm": 3.453125, "learning_rate": 3.723849372384937e-05, "loss": 0.9684, "step": 9350 }, { "epoch": 1.8915510718789408, "grad_norm": 3.734375, "learning_rate": 3.706977999730058e-05, "loss": 0.993, "step": 9375 }, { "epoch": 1.8965952080706179, "grad_norm": 3.65625, "learning_rate": 3.690106627075179e-05, "loss": 0.9916, "step": 9400 }, { "epoch": 1.901639344262295, "grad_norm": 3.765625, "learning_rate": 3.6732352544203e-05, "loss": 0.9861, "step": 9425 }, { "epoch": 1.9066834804539723, "grad_norm": 3.5, "learning_rate": 3.6563638817654206e-05, "loss": 0.9842, "step": 9450 }, { "epoch": 1.9117276166456494, "grad_norm": 3.8125, "learning_rate": 3.6394925091105415e-05, "loss": 0.9629, "step": 9475 }, { "epoch": 1.9167717528373265, "grad_norm": 3.609375, "learning_rate": 3.622621136455662e-05, "loss": 1.0018, "step": 9500 }, { "epoch": 1.9167717528373265, "eval_loss": 0.9999775290489197, "eval_runtime": 2645.8568, "eval_samples_per_second": 6.661, "eval_steps_per_second": 0.833, "step": 9500 }, { "epoch": 1.9218158890290038, "grad_norm": 3.59375, "learning_rate": 3.605749763800783e-05, "loss": 0.9682, "step": 9525 }, { "epoch": 1.9268600252206811, "grad_norm": 3.5, "learning_rate": 3.588878391145904e-05, "loss": 0.9782, "step": 9550 }, { "epoch": 1.931904161412358, "grad_norm": 3.75, "learning_rate": 3.572007018491025e-05, "loss": 0.9804, "step": 9575 }, { "epoch": 1.9369482976040353, "grad_norm": 3.609375, "learning_rate": 3.555135645836146e-05, "loss": 0.9964, "step": 9600 }, { "epoch": 1.9419924337957126, "grad_norm": 3.796875, "learning_rate": 3.538264273181266e-05, "loss": 0.9828, "step": 9625 }, { "epoch": 1.9470365699873895, "grad_norm": 3.484375, "learning_rate": 3.521392900526387e-05, "loss": 0.9878, "step": 9650 }, { "epoch": 1.9520807061790668, "grad_norm": 3.75, "learning_rate": 3.5045215278715075e-05, "loss": 0.9917, "step": 9675 }, { "epoch": 1.9571248423707441, "grad_norm": 3.515625, "learning_rate": 3.4876501552166284e-05, "loss": 0.9629, "step": 9700 }, { "epoch": 1.9621689785624212, "grad_norm": 3.609375, "learning_rate": 3.470778782561749e-05, "loss": 0.9707, "step": 9725 }, { "epoch": 1.9672131147540983, "grad_norm": 3.953125, "learning_rate": 3.45390740990687e-05, "loss": 0.9975, "step": 9750 }, { "epoch": 1.9672131147540983, "eval_loss": 0.9995858073234558, "eval_runtime": 2645.0099, "eval_samples_per_second": 6.663, "eval_steps_per_second": 0.833, "step": 9750 }, { "epoch": 1.9722572509457756, "grad_norm": 4.0625, "learning_rate": 3.437036037251991e-05, "loss": 0.988, "step": 9775 }, { "epoch": 1.9773013871374527, "grad_norm": 3.609375, "learning_rate": 3.420164664597112e-05, "loss": 0.9857, "step": 9800 }, { "epoch": 1.9823455233291298, "grad_norm": 3.640625, "learning_rate": 3.4032932919422326e-05, "loss": 0.9824, "step": 9825 }, { "epoch": 1.9873896595208072, "grad_norm": 3.59375, "learning_rate": 3.3864219192873534e-05, "loss": 0.9632, "step": 9850 }, { "epoch": 1.9924337957124842, "grad_norm": 3.734375, "learning_rate": 3.3695505466324736e-05, "loss": 0.9738, "step": 9875 }, { "epoch": 1.9974779319041613, "grad_norm": 3.609375, "learning_rate": 3.352679173977595e-05, "loss": 0.9696, "step": 9900 }, { "epoch": 2.0025220680958387, "grad_norm": 3.40625, "learning_rate": 3.335807801322716e-05, "loss": 0.9731, "step": 9925 }, { "epoch": 2.007566204287516, "grad_norm": 3.453125, "learning_rate": 3.318936428667837e-05, "loss": 0.9534, "step": 9950 }, { "epoch": 2.012610340479193, "grad_norm": 3.703125, "learning_rate": 3.3020650560129576e-05, "loss": 0.9727, "step": 9975 }, { "epoch": 2.01765447667087, "grad_norm": 3.65625, "learning_rate": 3.2851936833580785e-05, "loss": 0.9782, "step": 10000 }, { "epoch": 2.01765447667087, "eval_loss": 0.9998543858528137, "eval_runtime": 2651.8308, "eval_samples_per_second": 6.646, "eval_steps_per_second": 0.831, "step": 10000 }, { "epoch": 2.0226986128625475, "grad_norm": 3.671875, "learning_rate": 3.268322310703199e-05, "loss": 0.9677, "step": 10025 }, { "epoch": 2.0277427490542244, "grad_norm": 3.71875, "learning_rate": 3.25145093804832e-05, "loss": 0.9658, "step": 10050 }, { "epoch": 2.0327868852459017, "grad_norm": 3.890625, "learning_rate": 3.234579565393441e-05, "loss": 0.9689, "step": 10075 }, { "epoch": 2.037831021437579, "grad_norm": 3.65625, "learning_rate": 3.217708192738562e-05, "loss": 0.96, "step": 10100 }, { "epoch": 2.042875157629256, "grad_norm": 3.546875, "learning_rate": 3.200836820083682e-05, "loss": 0.9806, "step": 10125 }, { "epoch": 2.047919293820933, "grad_norm": 3.9375, "learning_rate": 3.183965447428803e-05, "loss": 0.9668, "step": 10150 }, { "epoch": 2.0529634300126105, "grad_norm": 3.546875, "learning_rate": 3.167094074773924e-05, "loss": 0.998, "step": 10175 }, { "epoch": 2.0580075662042874, "grad_norm": 3.828125, "learning_rate": 3.1502227021190445e-05, "loss": 0.9449, "step": 10200 }, { "epoch": 2.0630517023959647, "grad_norm": 3.828125, "learning_rate": 3.1333513294641654e-05, "loss": 0.9774, "step": 10225 }, { "epoch": 2.068095838587642, "grad_norm": 3.609375, "learning_rate": 3.116479956809286e-05, "loss": 0.9723, "step": 10250 }, { "epoch": 2.068095838587642, "eval_loss": 0.9999814629554749, "eval_runtime": 2659.556, "eval_samples_per_second": 6.626, "eval_steps_per_second": 0.828, "step": 10250 }, { "epoch": 2.073139974779319, "grad_norm": 3.78125, "learning_rate": 3.099608584154407e-05, "loss": 0.9776, "step": 10275 }, { "epoch": 2.078184110970996, "grad_norm": 3.765625, "learning_rate": 3.082737211499528e-05, "loss": 0.9484, "step": 10300 }, { "epoch": 2.0832282471626735, "grad_norm": 3.8125, "learning_rate": 3.065865838844649e-05, "loss": 0.9888, "step": 10325 }, { "epoch": 2.0882723833543504, "grad_norm": 3.984375, "learning_rate": 3.0489944661897696e-05, "loss": 0.9687, "step": 10350 }, { "epoch": 2.0933165195460277, "grad_norm": 3.75, "learning_rate": 3.0321230935348898e-05, "loss": 0.9511, "step": 10375 }, { "epoch": 2.098360655737705, "grad_norm": 3.859375, "learning_rate": 3.0152517208800106e-05, "loss": 0.9533, "step": 10400 }, { "epoch": 2.103404791929382, "grad_norm": 3.734375, "learning_rate": 2.9983803482251314e-05, "loss": 0.963, "step": 10425 }, { "epoch": 2.108448928121059, "grad_norm": 3.734375, "learning_rate": 2.9815089755702523e-05, "loss": 0.9845, "step": 10450 }, { "epoch": 2.1134930643127365, "grad_norm": 3.953125, "learning_rate": 2.9646376029153735e-05, "loss": 0.9578, "step": 10475 }, { "epoch": 2.1185372005044134, "grad_norm": 3.5, "learning_rate": 2.9477662302604943e-05, "loss": 0.9687, "step": 10500 }, { "epoch": 2.1185372005044134, "eval_loss": 0.9998512864112854, "eval_runtime": 2662.2832, "eval_samples_per_second": 6.62, "eval_steps_per_second": 0.827, "step": 10500 }, { "epoch": 2.1235813366960907, "grad_norm": 3.421875, "learning_rate": 2.930894857605615e-05, "loss": 0.9743, "step": 10525 }, { "epoch": 2.128625472887768, "grad_norm": 3.796875, "learning_rate": 2.914023484950736e-05, "loss": 0.9784, "step": 10550 }, { "epoch": 2.133669609079445, "grad_norm": 3.890625, "learning_rate": 2.8971521122958568e-05, "loss": 0.9671, "step": 10575 }, { "epoch": 2.1387137452711222, "grad_norm": 3.53125, "learning_rate": 2.8802807396409777e-05, "loss": 0.9666, "step": 10600 }, { "epoch": 2.1437578814627996, "grad_norm": 3.6875, "learning_rate": 2.8634093669860978e-05, "loss": 0.9684, "step": 10625 }, { "epoch": 2.148802017654477, "grad_norm": 4.15625, "learning_rate": 2.8465379943312187e-05, "loss": 0.96, "step": 10650 }, { "epoch": 2.1538461538461537, "grad_norm": 3.875, "learning_rate": 2.8296666216763395e-05, "loss": 0.9642, "step": 10675 }, { "epoch": 2.158890290037831, "grad_norm": 3.59375, "learning_rate": 2.8127952490214604e-05, "loss": 0.9645, "step": 10700 }, { "epoch": 2.1639344262295084, "grad_norm": 3.625, "learning_rate": 2.7959238763665812e-05, "loss": 0.9493, "step": 10725 }, { "epoch": 2.1689785624211853, "grad_norm": 3.796875, "learning_rate": 2.779052503711702e-05, "loss": 0.9691, "step": 10750 }, { "epoch": 2.1689785624211853, "eval_loss": 0.9996906518936157, "eval_runtime": 2666.2703, "eval_samples_per_second": 6.61, "eval_steps_per_second": 0.826, "step": 10750 }, { "epoch": 2.1740226986128626, "grad_norm": 3.546875, "learning_rate": 2.762181131056823e-05, "loss": 0.9581, "step": 10775 }, { "epoch": 2.17906683480454, "grad_norm": 3.8125, "learning_rate": 2.7453097584019437e-05, "loss": 0.9597, "step": 10800 }, { "epoch": 2.1841109709962168, "grad_norm": 3.546875, "learning_rate": 2.7284383857470646e-05, "loss": 0.9724, "step": 10825 }, { "epoch": 2.189155107187894, "grad_norm": 3.640625, "learning_rate": 2.7115670130921854e-05, "loss": 0.9738, "step": 10850 }, { "epoch": 2.1941992433795714, "grad_norm": 3.8125, "learning_rate": 2.6946956404373062e-05, "loss": 0.9795, "step": 10875 }, { "epoch": 2.1992433795712483, "grad_norm": 3.75, "learning_rate": 2.6778242677824267e-05, "loss": 0.9653, "step": 10900 }, { "epoch": 2.2042875157629256, "grad_norm": 3.65625, "learning_rate": 2.6609528951275476e-05, "loss": 0.9552, "step": 10925 }, { "epoch": 2.209331651954603, "grad_norm": 3.75, "learning_rate": 2.6440815224726684e-05, "loss": 0.9653, "step": 10950 }, { "epoch": 2.2143757881462798, "grad_norm": 3.75, "learning_rate": 2.6272101498177893e-05, "loss": 0.9765, "step": 10975 }, { "epoch": 2.219419924337957, "grad_norm": 3.890625, "learning_rate": 2.61033877716291e-05, "loss": 0.9715, "step": 11000 }, { "epoch": 2.219419924337957, "eval_loss": 0.9996886849403381, "eval_runtime": 2670.7258, "eval_samples_per_second": 6.599, "eval_steps_per_second": 0.825, "step": 11000 }, { "epoch": 2.2244640605296344, "grad_norm": 3.8125, "learning_rate": 2.593467404508031e-05, "loss": 0.9607, "step": 11025 }, { "epoch": 2.2295081967213113, "grad_norm": 3.640625, "learning_rate": 2.5765960318531518e-05, "loss": 0.96, "step": 11050 }, { "epoch": 2.2345523329129886, "grad_norm": 4.03125, "learning_rate": 2.5597246591982726e-05, "loss": 0.9689, "step": 11075 }, { "epoch": 2.239596469104666, "grad_norm": 3.90625, "learning_rate": 2.5428532865433935e-05, "loss": 0.9676, "step": 11100 }, { "epoch": 2.244640605296343, "grad_norm": 3.828125, "learning_rate": 2.5259819138885143e-05, "loss": 0.981, "step": 11125 }, { "epoch": 2.24968474148802, "grad_norm": 4.0, "learning_rate": 2.5091105412336345e-05, "loss": 0.971, "step": 11150 }, { "epoch": 2.2547288776796974, "grad_norm": 3.84375, "learning_rate": 2.4922391685787557e-05, "loss": 0.9739, "step": 11175 }, { "epoch": 2.2597730138713743, "grad_norm": 3.71875, "learning_rate": 2.4753677959238765e-05, "loss": 0.9631, "step": 11200 }, { "epoch": 2.2648171500630516, "grad_norm": 3.484375, "learning_rate": 2.4584964232689973e-05, "loss": 0.994, "step": 11225 }, { "epoch": 2.269861286254729, "grad_norm": 3.53125, "learning_rate": 2.4416250506141182e-05, "loss": 0.9695, "step": 11250 }, { "epoch": 2.269861286254729, "eval_loss": 0.9994797706604004, "eval_runtime": 2662.9245, "eval_samples_per_second": 6.618, "eval_steps_per_second": 0.827, "step": 11250 }, { "epoch": 2.2749054224464063, "grad_norm": 3.53125, "learning_rate": 2.424753677959239e-05, "loss": 0.9694, "step": 11275 }, { "epoch": 2.279949558638083, "grad_norm": 4.46875, "learning_rate": 2.40788230530436e-05, "loss": 0.9819, "step": 11300 }, { "epoch": 2.2849936948297604, "grad_norm": 3.734375, "learning_rate": 2.3910109326494804e-05, "loss": 0.9746, "step": 11325 }, { "epoch": 2.2900378310214378, "grad_norm": 3.578125, "learning_rate": 2.3741395599946012e-05, "loss": 0.9531, "step": 11350 }, { "epoch": 2.2950819672131146, "grad_norm": 3.953125, "learning_rate": 2.357268187339722e-05, "loss": 0.9505, "step": 11375 }, { "epoch": 2.300126103404792, "grad_norm": 3.96875, "learning_rate": 2.340396814684843e-05, "loss": 0.9668, "step": 11400 }, { "epoch": 2.3051702395964693, "grad_norm": 3.75, "learning_rate": 2.3235254420299637e-05, "loss": 0.948, "step": 11425 }, { "epoch": 2.310214375788146, "grad_norm": 3.65625, "learning_rate": 2.3066540693750846e-05, "loss": 0.9613, "step": 11450 }, { "epoch": 2.3152585119798235, "grad_norm": 3.796875, "learning_rate": 2.289782696720205e-05, "loss": 0.9643, "step": 11475 }, { "epoch": 2.320302648171501, "grad_norm": 3.890625, "learning_rate": 2.272911324065326e-05, "loss": 0.9917, "step": 11500 }, { "epoch": 2.320302648171501, "eval_loss": 0.9993996620178223, "eval_runtime": 2610.2907, "eval_samples_per_second": 6.751, "eval_steps_per_second": 0.844, "step": 11500 }, { "epoch": 2.3253467843631777, "grad_norm": 3.703125, "learning_rate": 2.2560399514104468e-05, "loss": 0.968, "step": 11525 }, { "epoch": 2.330390920554855, "grad_norm": 3.546875, "learning_rate": 2.2391685787555676e-05, "loss": 0.9678, "step": 11550 }, { "epoch": 2.3354350567465323, "grad_norm": 3.765625, "learning_rate": 2.2222972061006885e-05, "loss": 0.9777, "step": 11575 }, { "epoch": 2.340479192938209, "grad_norm": 3.78125, "learning_rate": 2.2054258334458093e-05, "loss": 0.9671, "step": 11600 }, { "epoch": 2.3455233291298865, "grad_norm": 3.671875, "learning_rate": 2.18855446079093e-05, "loss": 0.9572, "step": 11625 }, { "epoch": 2.350567465321564, "grad_norm": 4.03125, "learning_rate": 2.171683088136051e-05, "loss": 0.974, "step": 11650 }, { "epoch": 2.3556116015132407, "grad_norm": 3.90625, "learning_rate": 2.1548117154811718e-05, "loss": 0.9619, "step": 11675 }, { "epoch": 2.360655737704918, "grad_norm": 3.84375, "learning_rate": 2.1379403428262927e-05, "loss": 0.9856, "step": 11700 }, { "epoch": 2.3656998738965953, "grad_norm": 3.765625, "learning_rate": 2.121068970171413e-05, "loss": 0.9627, "step": 11725 }, { "epoch": 2.3707440100882726, "grad_norm": 3.75, "learning_rate": 2.104197597516534e-05, "loss": 0.9623, "step": 11750 }, { "epoch": 2.3707440100882726, "eval_loss": 0.99924236536026, "eval_runtime": 2619.3153, "eval_samples_per_second": 6.728, "eval_steps_per_second": 0.841, "step": 11750 }, { "epoch": 2.3757881462799495, "grad_norm": 3.609375, "learning_rate": 2.087326224861655e-05, "loss": 0.9579, "step": 11775 }, { "epoch": 2.380832282471627, "grad_norm": 3.765625, "learning_rate": 2.0704548522067757e-05, "loss": 0.9509, "step": 11800 }, { "epoch": 2.3858764186633037, "grad_norm": 3.546875, "learning_rate": 2.0535834795518965e-05, "loss": 0.9816, "step": 11825 }, { "epoch": 2.390920554854981, "grad_norm": 3.671875, "learning_rate": 2.036712106897017e-05, "loss": 0.9649, "step": 11850 }, { "epoch": 2.3959646910466583, "grad_norm": 3.578125, "learning_rate": 2.019840734242138e-05, "loss": 0.9594, "step": 11875 }, { "epoch": 2.4010088272383356, "grad_norm": 4.09375, "learning_rate": 2.0029693615872587e-05, "loss": 0.9819, "step": 11900 }, { "epoch": 2.4060529634300125, "grad_norm": 3.890625, "learning_rate": 1.9860979889323796e-05, "loss": 0.9498, "step": 11925 }, { "epoch": 2.41109709962169, "grad_norm": 3.75, "learning_rate": 1.9692266162775004e-05, "loss": 0.966, "step": 11950 }, { "epoch": 2.416141235813367, "grad_norm": 3.921875, "learning_rate": 1.9523552436226212e-05, "loss": 0.9826, "step": 11975 }, { "epoch": 2.421185372005044, "grad_norm": 4.46875, "learning_rate": 1.935483870967742e-05, "loss": 0.9677, "step": 12000 }, { "epoch": 2.421185372005044, "eval_loss": 0.9991822242736816, "eval_runtime": 2614.0726, "eval_samples_per_second": 6.742, "eval_steps_per_second": 0.843, "step": 12000 }, { "epoch": 2.4262295081967213, "grad_norm": 3.90625, "learning_rate": 1.918612498312863e-05, "loss": 0.9891, "step": 12025 }, { "epoch": 2.4312736443883987, "grad_norm": 3.578125, "learning_rate": 1.9017411256579838e-05, "loss": 0.9649, "step": 12050 }, { "epoch": 2.4363177805800755, "grad_norm": 3.609375, "learning_rate": 1.8848697530031046e-05, "loss": 0.9643, "step": 12075 }, { "epoch": 2.441361916771753, "grad_norm": 3.65625, "learning_rate": 1.867998380348225e-05, "loss": 0.9474, "step": 12100 }, { "epoch": 2.44640605296343, "grad_norm": 3.703125, "learning_rate": 1.851127007693346e-05, "loss": 0.9692, "step": 12125 }, { "epoch": 2.451450189155107, "grad_norm": 3.640625, "learning_rate": 1.8342556350384668e-05, "loss": 0.9631, "step": 12150 }, { "epoch": 2.4564943253467844, "grad_norm": 3.578125, "learning_rate": 1.8173842623835876e-05, "loss": 0.9686, "step": 12175 }, { "epoch": 2.4615384615384617, "grad_norm": 3.75, "learning_rate": 1.8005128897287085e-05, "loss": 0.9627, "step": 12200 }, { "epoch": 2.4665825977301385, "grad_norm": 3.78125, "learning_rate": 1.783641517073829e-05, "loss": 0.9744, "step": 12225 }, { "epoch": 2.471626733921816, "grad_norm": 3.625, "learning_rate": 1.7667701444189498e-05, "loss": 0.9824, "step": 12250 }, { "epoch": 2.471626733921816, "eval_loss": 0.9991060495376587, "eval_runtime": 2612.0483, "eval_samples_per_second": 6.747, "eval_steps_per_second": 0.843, "step": 12250 }, { "epoch": 2.476670870113493, "grad_norm": 3.609375, "learning_rate": 1.7498987717640707e-05, "loss": 0.947, "step": 12275 }, { "epoch": 2.48171500630517, "grad_norm": 3.84375, "learning_rate": 1.7330273991091915e-05, "loss": 0.9556, "step": 12300 }, { "epoch": 2.4867591424968474, "grad_norm": 3.96875, "learning_rate": 1.7161560264543123e-05, "loss": 0.9742, "step": 12325 }, { "epoch": 2.4918032786885247, "grad_norm": 3.5, "learning_rate": 1.6992846537994332e-05, "loss": 0.9701, "step": 12350 }, { "epoch": 2.496847414880202, "grad_norm": 3.75, "learning_rate": 1.682413281144554e-05, "loss": 0.9804, "step": 12375 }, { "epoch": 2.501891551071879, "grad_norm": 3.828125, "learning_rate": 1.665541908489675e-05, "loss": 0.9534, "step": 12400 }, { "epoch": 2.506935687263556, "grad_norm": 3.8125, "learning_rate": 1.6486705358347957e-05, "loss": 0.9381, "step": 12425 }, { "epoch": 2.511979823455233, "grad_norm": 3.796875, "learning_rate": 1.6317991631799166e-05, "loss": 0.9817, "step": 12450 }, { "epoch": 2.5170239596469104, "grad_norm": 3.640625, "learning_rate": 1.614927790525037e-05, "loss": 0.9721, "step": 12475 }, { "epoch": 2.5220680958385877, "grad_norm": 4.09375, "learning_rate": 1.598056417870158e-05, "loss": 0.9607, "step": 12500 }, { "epoch": 2.5220680958385877, "eval_loss": 0.9990498423576355, "eval_runtime": 2605.9832, "eval_samples_per_second": 6.763, "eval_steps_per_second": 0.845, "step": 12500 }, { "epoch": 2.527112232030265, "grad_norm": 3.890625, "learning_rate": 1.5811850452152787e-05, "loss": 0.9809, "step": 12525 }, { "epoch": 2.532156368221942, "grad_norm": 3.5625, "learning_rate": 1.5643136725603996e-05, "loss": 0.9549, "step": 12550 }, { "epoch": 2.537200504413619, "grad_norm": 3.640625, "learning_rate": 1.5474422999055204e-05, "loss": 0.9616, "step": 12575 }, { "epoch": 2.542244640605296, "grad_norm": 3.78125, "learning_rate": 1.530570927250641e-05, "loss": 0.9478, "step": 12600 }, { "epoch": 2.5472887767969734, "grad_norm": 3.78125, "learning_rate": 1.513699554595762e-05, "loss": 0.9535, "step": 12625 }, { "epoch": 2.5523329129886507, "grad_norm": 3.84375, "learning_rate": 1.4968281819408828e-05, "loss": 0.9858, "step": 12650 }, { "epoch": 2.557377049180328, "grad_norm": 3.78125, "learning_rate": 1.4799568092860036e-05, "loss": 0.9751, "step": 12675 }, { "epoch": 2.562421185372005, "grad_norm": 3.6875, "learning_rate": 1.4630854366311245e-05, "loss": 0.9491, "step": 12700 }, { "epoch": 2.5674653215636822, "grad_norm": 3.875, "learning_rate": 1.4462140639762453e-05, "loss": 0.9482, "step": 12725 }, { "epoch": 2.5725094577553596, "grad_norm": 3.59375, "learning_rate": 1.4293426913213658e-05, "loss": 0.968, "step": 12750 }, { "epoch": 2.5725094577553596, "eval_loss": 0.9989694356918335, "eval_runtime": 2611.5729, "eval_samples_per_second": 6.748, "eval_steps_per_second": 0.844, "step": 12750 }, { "epoch": 2.5775535939470364, "grad_norm": 3.71875, "learning_rate": 1.4124713186664868e-05, "loss": 0.9654, "step": 12775 }, { "epoch": 2.5825977301387137, "grad_norm": 3.625, "learning_rate": 1.3955999460116077e-05, "loss": 0.9703, "step": 12800 }, { "epoch": 2.587641866330391, "grad_norm": 3.984375, "learning_rate": 1.3787285733567285e-05, "loss": 0.9555, "step": 12825 }, { "epoch": 2.592686002522068, "grad_norm": 3.6875, "learning_rate": 1.3618572007018493e-05, "loss": 0.962, "step": 12850 }, { "epoch": 2.5977301387137453, "grad_norm": 3.578125, "learning_rate": 1.3449858280469698e-05, "loss": 0.9609, "step": 12875 }, { "epoch": 2.6027742749054226, "grad_norm": 3.734375, "learning_rate": 1.3281144553920907e-05, "loss": 0.977, "step": 12900 }, { "epoch": 2.6078184110970994, "grad_norm": 3.96875, "learning_rate": 1.3112430827372115e-05, "loss": 0.9553, "step": 12925 }, { "epoch": 2.6128625472887768, "grad_norm": 4.21875, "learning_rate": 1.2943717100823324e-05, "loss": 0.9827, "step": 12950 }, { "epoch": 2.617906683480454, "grad_norm": 3.859375, "learning_rate": 1.2775003374274532e-05, "loss": 0.9635, "step": 12975 }, { "epoch": 2.6229508196721314, "grad_norm": 3.8125, "learning_rate": 1.2606289647725739e-05, "loss": 1.0026, "step": 13000 }, { "epoch": 2.6229508196721314, "eval_loss": 0.9988852739334106, "eval_runtime": 2614.4289, "eval_samples_per_second": 6.741, "eval_steps_per_second": 0.843, "step": 13000 }, { "epoch": 2.6279949558638083, "grad_norm": 3.78125, "learning_rate": 1.2437575921176947e-05, "loss": 0.9774, "step": 13025 }, { "epoch": 2.6330390920554856, "grad_norm": 3.625, "learning_rate": 1.2268862194628156e-05, "loss": 0.9548, "step": 13050 }, { "epoch": 2.6380832282471625, "grad_norm": 4.0625, "learning_rate": 1.2100148468079364e-05, "loss": 0.9546, "step": 13075 }, { "epoch": 2.6431273644388398, "grad_norm": 4.15625, "learning_rate": 1.193143474153057e-05, "loss": 0.9338, "step": 13100 }, { "epoch": 2.648171500630517, "grad_norm": 3.796875, "learning_rate": 1.176272101498178e-05, "loss": 0.9417, "step": 13125 }, { "epoch": 2.6532156368221944, "grad_norm": 3.640625, "learning_rate": 1.1594007288432988e-05, "loss": 0.932, "step": 13150 }, { "epoch": 2.6582597730138713, "grad_norm": 3.609375, "learning_rate": 1.1425293561884196e-05, "loss": 0.9773, "step": 13175 }, { "epoch": 2.6633039092055486, "grad_norm": 3.796875, "learning_rate": 1.1256579835335404e-05, "loss": 0.9483, "step": 13200 }, { "epoch": 2.6683480453972255, "grad_norm": 3.953125, "learning_rate": 1.1087866108786611e-05, "loss": 0.9727, "step": 13225 }, { "epoch": 2.673392181588903, "grad_norm": 3.421875, "learning_rate": 1.091915238223782e-05, "loss": 0.9679, "step": 13250 }, { "epoch": 2.673392181588903, "eval_loss": 0.9988698959350586, "eval_runtime": 2615.3625, "eval_samples_per_second": 6.738, "eval_steps_per_second": 0.842, "step": 13250 }, { "epoch": 2.67843631778058, "grad_norm": 3.625, "learning_rate": 1.0750438655689026e-05, "loss": 0.9599, "step": 13275 }, { "epoch": 2.6834804539722574, "grad_norm": 3.75, "learning_rate": 1.0581724929140235e-05, "loss": 0.9629, "step": 13300 }, { "epoch": 2.6885245901639343, "grad_norm": 3.671875, "learning_rate": 1.0413011202591443e-05, "loss": 0.9805, "step": 13325 }, { "epoch": 2.6935687263556116, "grad_norm": 3.796875, "learning_rate": 1.0244297476042652e-05, "loss": 0.9514, "step": 13350 }, { "epoch": 2.698612862547289, "grad_norm": 3.53125, "learning_rate": 1.007558374949386e-05, "loss": 0.945, "step": 13375 }, { "epoch": 2.703656998738966, "grad_norm": 3.703125, "learning_rate": 9.906870022945067e-06, "loss": 0.95, "step": 13400 }, { "epoch": 2.708701134930643, "grad_norm": 3.59375, "learning_rate": 9.738156296396275e-06, "loss": 0.9509, "step": 13425 }, { "epoch": 2.7137452711223204, "grad_norm": 4.0, "learning_rate": 9.569442569847484e-06, "loss": 0.9699, "step": 13450 }, { "epoch": 2.7187894073139973, "grad_norm": 3.75, "learning_rate": 9.40072884329869e-06, "loss": 0.9621, "step": 13475 }, { "epoch": 2.7238335435056746, "grad_norm": 3.890625, "learning_rate": 9.232015116749899e-06, "loss": 0.9641, "step": 13500 }, { "epoch": 2.7238335435056746, "eval_loss": 0.9989115595817566, "eval_runtime": 2611.0175, "eval_samples_per_second": 6.749, "eval_steps_per_second": 0.844, "step": 13500 }, { "epoch": 2.728877679697352, "grad_norm": 3.9375, "learning_rate": 9.063301390201107e-06, "loss": 0.9603, "step": 13525 }, { "epoch": 2.733921815889029, "grad_norm": 3.578125, "learning_rate": 8.894587663652315e-06, "loss": 0.9712, "step": 13550 }, { "epoch": 2.738965952080706, "grad_norm": 3.90625, "learning_rate": 8.725873937103524e-06, "loss": 0.9738, "step": 13575 }, { "epoch": 2.7440100882723835, "grad_norm": 3.8125, "learning_rate": 8.55716021055473e-06, "loss": 0.96, "step": 13600 }, { "epoch": 2.749054224464061, "grad_norm": 3.84375, "learning_rate": 8.388446484005939e-06, "loss": 0.9771, "step": 13625 }, { "epoch": 2.7540983606557377, "grad_norm": 3.609375, "learning_rate": 8.219732757457146e-06, "loss": 0.9879, "step": 13650 }, { "epoch": 2.759142496847415, "grad_norm": 4.0625, "learning_rate": 8.051019030908354e-06, "loss": 0.9788, "step": 13675 }, { "epoch": 2.764186633039092, "grad_norm": 3.734375, "learning_rate": 7.882305304359564e-06, "loss": 0.9706, "step": 13700 }, { "epoch": 2.769230769230769, "grad_norm": 3.71875, "learning_rate": 7.713591577810771e-06, "loss": 0.9755, "step": 13725 }, { "epoch": 2.7742749054224465, "grad_norm": 3.5, "learning_rate": 7.5448778512619794e-06, "loss": 0.959, "step": 13750 }, { "epoch": 2.7742749054224465, "eval_loss": 0.9988441467285156, "eval_runtime": 2608.3933, "eval_samples_per_second": 6.756, "eval_steps_per_second": 0.845, "step": 13750 }, { "epoch": 2.779319041614124, "grad_norm": 3.640625, "learning_rate": 7.376164124713188e-06, "loss": 0.959, "step": 13775 }, { "epoch": 2.7843631778058007, "grad_norm": 3.65625, "learning_rate": 7.2074503981643946e-06, "loss": 0.9548, "step": 13800 }, { "epoch": 2.789407313997478, "grad_norm": 3.734375, "learning_rate": 7.038736671615603e-06, "loss": 0.9917, "step": 13825 }, { "epoch": 2.794451450189155, "grad_norm": 3.65625, "learning_rate": 6.8700229450668106e-06, "loss": 0.9591, "step": 13850 }, { "epoch": 2.799495586380832, "grad_norm": 3.78125, "learning_rate": 6.701309218518019e-06, "loss": 0.9595, "step": 13875 }, { "epoch": 2.8045397225725095, "grad_norm": 3.578125, "learning_rate": 6.532595491969227e-06, "loss": 0.9575, "step": 13900 }, { "epoch": 2.809583858764187, "grad_norm": 3.90625, "learning_rate": 6.363881765420435e-06, "loss": 0.9574, "step": 13925 }, { "epoch": 2.8146279949558637, "grad_norm": 3.671875, "learning_rate": 6.195168038871643e-06, "loss": 0.9588, "step": 13950 }, { "epoch": 2.819672131147541, "grad_norm": 3.5625, "learning_rate": 6.026454312322851e-06, "loss": 0.9509, "step": 13975 }, { "epoch": 2.8247162673392183, "grad_norm": 3.640625, "learning_rate": 5.8577405857740585e-06, "loss": 0.9717, "step": 14000 }, { "epoch": 2.8247162673392183, "eval_loss": 0.9988219141960144, "eval_runtime": 2609.4192, "eval_samples_per_second": 6.754, "eval_steps_per_second": 0.844, "step": 14000 }, { "epoch": 2.829760403530895, "grad_norm": 3.65625, "learning_rate": 5.689026859225267e-06, "loss": 0.9649, "step": 14025 }, { "epoch": 2.8348045397225725, "grad_norm": 3.859375, "learning_rate": 5.5203131326764745e-06, "loss": 0.9508, "step": 14050 }, { "epoch": 2.83984867591425, "grad_norm": 3.515625, "learning_rate": 5.351599406127683e-06, "loss": 0.9737, "step": 14075 }, { "epoch": 2.8448928121059267, "grad_norm": 3.75, "learning_rate": 5.1828856795788905e-06, "loss": 0.9833, "step": 14100 }, { "epoch": 2.849936948297604, "grad_norm": 3.5625, "learning_rate": 5.014171953030099e-06, "loss": 0.9634, "step": 14125 }, { "epoch": 2.8549810844892813, "grad_norm": 3.640625, "learning_rate": 4.8454582264813065e-06, "loss": 0.9616, "step": 14150 }, { "epoch": 2.860025220680958, "grad_norm": 3.84375, "learning_rate": 4.676744499932514e-06, "loss": 0.951, "step": 14175 }, { "epoch": 2.8650693568726355, "grad_norm": 3.71875, "learning_rate": 4.508030773383723e-06, "loss": 0.9685, "step": 14200 }, { "epoch": 2.870113493064313, "grad_norm": 3.890625, "learning_rate": 4.339317046834931e-06, "loss": 0.9487, "step": 14225 }, { "epoch": 2.87515762925599, "grad_norm": 3.9375, "learning_rate": 4.1706033202861384e-06, "loss": 0.9692, "step": 14250 }, { "epoch": 2.87515762925599, "eval_loss": 0.9988105893135071, "eval_runtime": 2609.1475, "eval_samples_per_second": 6.754, "eval_steps_per_second": 0.844, "step": 14250 }, { "epoch": 2.880201765447667, "grad_norm": 3.78125, "learning_rate": 4.001889593737347e-06, "loss": 0.9656, "step": 14275 }, { "epoch": 2.8852459016393444, "grad_norm": 3.953125, "learning_rate": 3.833175867188555e-06, "loss": 0.9507, "step": 14300 }, { "epoch": 2.8902900378310212, "grad_norm": 3.703125, "learning_rate": 3.664462140639763e-06, "loss": 0.959, "step": 14325 }, { "epoch": 2.8953341740226985, "grad_norm": 3.84375, "learning_rate": 3.495748414090971e-06, "loss": 0.9559, "step": 14350 }, { "epoch": 2.900378310214376, "grad_norm": 4.15625, "learning_rate": 3.3270346875421784e-06, "loss": 0.9501, "step": 14375 }, { "epoch": 2.905422446406053, "grad_norm": 3.765625, "learning_rate": 3.1583209609933864e-06, "loss": 0.9508, "step": 14400 }, { "epoch": 2.91046658259773, "grad_norm": 3.484375, "learning_rate": 2.9896072344445944e-06, "loss": 0.9841, "step": 14425 }, { "epoch": 2.9155107187894074, "grad_norm": 3.84375, "learning_rate": 2.820893507895803e-06, "loss": 0.9874, "step": 14450 }, { "epoch": 2.9205548549810842, "grad_norm": 3.71875, "learning_rate": 2.6521797813470104e-06, "loss": 0.9736, "step": 14475 }, { "epoch": 2.9255989911727616, "grad_norm": 3.921875, "learning_rate": 2.4834660547982188e-06, "loss": 0.9697, "step": 14500 }, { "epoch": 2.9255989911727616, "eval_loss": 0.9988086819648743, "eval_runtime": 2614.4325, "eval_samples_per_second": 6.741, "eval_steps_per_second": 0.843, "step": 14500 }, { "epoch": 2.930643127364439, "grad_norm": 3.78125, "learning_rate": 2.3147523282494263e-06, "loss": 0.9516, "step": 14525 }, { "epoch": 2.935687263556116, "grad_norm": 3.796875, "learning_rate": 2.1460386017006343e-06, "loss": 0.9528, "step": 14550 }, { "epoch": 2.940731399747793, "grad_norm": 4.0, "learning_rate": 1.9773248751518423e-06, "loss": 0.9617, "step": 14575 }, { "epoch": 2.9457755359394704, "grad_norm": 3.921875, "learning_rate": 1.8086111486030503e-06, "loss": 0.9635, "step": 14600 }, { "epoch": 2.9508196721311473, "grad_norm": 3.65625, "learning_rate": 1.6398974220542585e-06, "loss": 0.9463, "step": 14625 }, { "epoch": 2.9558638083228246, "grad_norm": 3.75, "learning_rate": 1.4711836955054663e-06, "loss": 0.9654, "step": 14650 }, { "epoch": 2.960907944514502, "grad_norm": 3.390625, "learning_rate": 1.3024699689566743e-06, "loss": 0.9536, "step": 14675 }, { "epoch": 2.965952080706179, "grad_norm": 3.6875, "learning_rate": 1.1337562424078823e-06, "loss": 0.9584, "step": 14700 }, { "epoch": 2.970996216897856, "grad_norm": 3.703125, "learning_rate": 9.650425158590903e-07, "loss": 0.9672, "step": 14725 }, { "epoch": 2.9760403530895334, "grad_norm": 3.953125, "learning_rate": 7.963287893102984e-07, "loss": 0.9483, "step": 14750 }, { "epoch": 2.9760403530895334, "eval_loss": 0.9987980723381042, "eval_runtime": 2611.1467, "eval_samples_per_second": 6.749, "eval_steps_per_second": 0.844, "step": 14750 }, { "epoch": 2.9810844892812107, "grad_norm": 3.8125, "learning_rate": 6.276150627615063e-07, "loss": 0.967, "step": 14775 }, { "epoch": 2.9861286254728876, "grad_norm": 3.984375, "learning_rate": 4.589013362127143e-07, "loss": 0.9728, "step": 14800 }, { "epoch": 2.991172761664565, "grad_norm": 3.5, "learning_rate": 2.9018760966392225e-07, "loss": 0.949, "step": 14825 }, { "epoch": 2.9962168978562422, "grad_norm": 3.640625, "learning_rate": 1.2147388311513025e-07, "loss": 0.9645, "step": 14850 } ], "logging_steps": 25, "max_steps": 14868, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.103814657600032e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }