{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 79296, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018916464891041162, "grad_norm": 1.2924933433532715, "learning_rate": 0.00029811213680387406, "loss": 7.7116, "step": 500 }, { "epoch": 0.037832929782082324, "grad_norm": 1.2520179748535156, "learning_rate": 0.00029622049031476994, "loss": 7.1543, "step": 1000 }, { "epoch": 0.05674939467312349, "grad_norm": 1.6114544868469238, "learning_rate": 0.0002943288438256658, "loss": 6.9337, "step": 1500 }, { "epoch": 0.07566585956416465, "grad_norm": 2.069924831390381, "learning_rate": 0.0002924371973365617, "loss": 6.7892, "step": 2000 }, { "epoch": 0.09458232445520581, "grad_norm": 1.5514849424362183, "learning_rate": 0.0002905455508474576, "loss": 6.6502, "step": 2500 }, { "epoch": 0.11349878934624698, "grad_norm": 1.2996011972427368, "learning_rate": 0.0002886539043583535, "loss": 6.5405, "step": 3000 }, { "epoch": 0.13241525423728814, "grad_norm": 1.5956826210021973, "learning_rate": 0.00028676225786924937, "loss": 6.4099, "step": 3500 }, { "epoch": 0.1513317191283293, "grad_norm": 1.1812454462051392, "learning_rate": 0.00028487061138014524, "loss": 6.2872, "step": 4000 }, { "epoch": 0.17024818401937045, "grad_norm": 1.3722965717315674, "learning_rate": 0.0002829789648910411, "loss": 6.22, "step": 4500 }, { "epoch": 0.18916464891041163, "grad_norm": 2.514615058898926, "learning_rate": 0.00028108731840193705, "loss": 6.1227, "step": 5000 }, { "epoch": 0.20808111380145278, "grad_norm": 1.5694422721862793, "learning_rate": 0.0002791956719128329, "loss": 6.0, "step": 5500 }, { "epoch": 0.22699757869249396, "grad_norm": 1.2888797521591187, "learning_rate": 0.0002773040254237288, "loss": 5.9409, "step": 6000 }, { "epoch": 0.2459140435835351, "grad_norm": 1.8620376586914062, "learning_rate": 0.00027541237893462467, "loss": 5.8575, "step": 6500 }, { "epoch": 0.2648305084745763, "grad_norm": 1.548992395401001, "learning_rate": 0.00027352073244552055, "loss": 5.8123, "step": 7000 }, { "epoch": 0.28374697336561744, "grad_norm": 1.4120700359344482, "learning_rate": 0.0002716290859564165, "loss": 5.7071, "step": 7500 }, { "epoch": 0.3026634382566586, "grad_norm": 1.1697487831115723, "learning_rate": 0.00026973743946731235, "loss": 5.6656, "step": 8000 }, { "epoch": 0.32157990314769974, "grad_norm": 2.402817487716675, "learning_rate": 0.0002678457929782082, "loss": 5.6182, "step": 8500 }, { "epoch": 0.3404963680387409, "grad_norm": 2.5044350624084473, "learning_rate": 0.0002659541464891041, "loss": 5.5394, "step": 9000 }, { "epoch": 0.3594128329297821, "grad_norm": 1.7818189859390259, "learning_rate": 0.0002640625, "loss": 5.5234, "step": 9500 }, { "epoch": 0.37832929782082325, "grad_norm": 1.1046422719955444, "learning_rate": 0.00026217085351089585, "loss": 5.4713, "step": 10000 }, { "epoch": 0.3972457627118644, "grad_norm": 1.5595403909683228, "learning_rate": 0.0002602792070217918, "loss": 5.4638, "step": 10500 }, { "epoch": 0.41616222760290555, "grad_norm": 1.9899064302444458, "learning_rate": 0.00025838756053268765, "loss": 5.418, "step": 11000 }, { "epoch": 0.4350786924939467, "grad_norm": 1.3187510967254639, "learning_rate": 0.00025649591404358353, "loss": 5.3819, "step": 11500 }, { "epoch": 0.4539951573849879, "grad_norm": 1.804287314414978, "learning_rate": 0.0002546042675544794, "loss": 5.3156, "step": 12000 }, { "epoch": 0.47291162227602906, "grad_norm": 1.1610921621322632, "learning_rate": 0.0002527126210653753, "loss": 5.2946, "step": 12500 }, { "epoch": 0.4918280871670702, "grad_norm": 1.6857260465621948, "learning_rate": 0.00025082097457627115, "loss": 5.2887, "step": 13000 }, { "epoch": 0.5107445520581114, "grad_norm": 1.2952368259429932, "learning_rate": 0.00024892932808716703, "loss": 5.2392, "step": 13500 }, { "epoch": 0.5296610169491526, "grad_norm": 1.7109565734863281, "learning_rate": 0.0002470376815980629, "loss": 5.2078, "step": 14000 }, { "epoch": 0.5485774818401937, "grad_norm": 1.2140947580337524, "learning_rate": 0.00024514603510895883, "loss": 5.1776, "step": 14500 }, { "epoch": 0.5674939467312349, "grad_norm": 1.6038719415664673, "learning_rate": 0.00024325438861985468, "loss": 5.172, "step": 15000 }, { "epoch": 0.586410411622276, "grad_norm": 1.9119188785552979, "learning_rate": 0.00024136274213075058, "loss": 5.1032, "step": 15500 }, { "epoch": 0.6053268765133172, "grad_norm": 1.5576452016830444, "learning_rate": 0.00023947109564164646, "loss": 5.0741, "step": 16000 }, { "epoch": 0.6242433414043583, "grad_norm": 1.7232158184051514, "learning_rate": 0.00023757944915254236, "loss": 5.0445, "step": 16500 }, { "epoch": 0.6431598062953995, "grad_norm": 1.8923276662826538, "learning_rate": 0.00023568780266343823, "loss": 5.0539, "step": 17000 }, { "epoch": 0.6620762711864406, "grad_norm": 1.5271363258361816, "learning_rate": 0.0002337961561743341, "loss": 4.9995, "step": 17500 }, { "epoch": 0.6809927360774818, "grad_norm": 1.5291951894760132, "learning_rate": 0.00023190450968523, "loss": 4.9854, "step": 18000 }, { "epoch": 0.699909200968523, "grad_norm": 1.839331030845642, "learning_rate": 0.00023001286319612588, "loss": 4.9752, "step": 18500 }, { "epoch": 0.7188256658595642, "grad_norm": 1.5823332071304321, "learning_rate": 0.00022812121670702176, "loss": 4.9663, "step": 19000 }, { "epoch": 0.7377421307506054, "grad_norm": 1.5094155073165894, "learning_rate": 0.00022622957021791766, "loss": 4.9367, "step": 19500 }, { "epoch": 0.7566585956416465, "grad_norm": 1.885586142539978, "learning_rate": 0.00022433792372881354, "loss": 4.9085, "step": 20000 }, { "epoch": 0.7755750605326877, "grad_norm": 1.849187970161438, "learning_rate": 0.00022244627723970944, "loss": 4.9009, "step": 20500 }, { "epoch": 0.7944915254237288, "grad_norm": 1.4356294870376587, "learning_rate": 0.0002205546307506053, "loss": 4.8896, "step": 21000 }, { "epoch": 0.81340799031477, "grad_norm": 1.8109959363937378, "learning_rate": 0.0002186629842615012, "loss": 4.8644, "step": 21500 }, { "epoch": 0.8323244552058111, "grad_norm": 1.8115473985671997, "learning_rate": 0.0002167713377723971, "loss": 4.8406, "step": 22000 }, { "epoch": 0.8512409200968523, "grad_norm": 1.0598844289779663, "learning_rate": 0.00021487969128329297, "loss": 4.8253, "step": 22500 }, { "epoch": 0.8701573849878934, "grad_norm": 1.9162489175796509, "learning_rate": 0.00021298804479418884, "loss": 4.8107, "step": 23000 }, { "epoch": 0.8890738498789347, "grad_norm": 1.4632196426391602, "learning_rate": 0.00021109639830508474, "loss": 4.7961, "step": 23500 }, { "epoch": 0.9079903147699758, "grad_norm": 1.5689505338668823, "learning_rate": 0.00020920475181598062, "loss": 4.7696, "step": 24000 }, { "epoch": 0.926906779661017, "grad_norm": 2.6846208572387695, "learning_rate": 0.0002073131053268765, "loss": 4.7789, "step": 24500 }, { "epoch": 0.9458232445520581, "grad_norm": 1.2678192853927612, "learning_rate": 0.0002054214588377724, "loss": 4.7336, "step": 25000 }, { "epoch": 0.9647397094430993, "grad_norm": 1.225905179977417, "learning_rate": 0.00020352981234866827, "loss": 4.6987, "step": 25500 }, { "epoch": 0.9836561743341404, "grad_norm": 1.7740434408187866, "learning_rate": 0.00020163816585956417, "loss": 4.7174, "step": 26000 }, { "epoch": 1.0, "eval_loss": 4.44639253616333, "eval_rouge1": 0.029162381578966425, "eval_rouge2": 0.006098969028215646, "eval_rougeL": 0.029110736990587645, "eval_rougeLsum": 0.02925410138170841, "eval_runtime": 1525.6875, "eval_samples_per_second": 3.85, "eval_steps_per_second": 1.925, "step": 26432 }, { "epoch": 1.0025726392251817, "grad_norm": 1.906161904335022, "learning_rate": 0.00019974651937046005, "loss": 4.6901, "step": 26500 }, { "epoch": 1.0214891041162228, "grad_norm": 1.2957439422607422, "learning_rate": 0.00019785487288135592, "loss": 4.5955, "step": 27000 }, { "epoch": 1.040405569007264, "grad_norm": 1.5361531972885132, "learning_rate": 0.00019596322639225182, "loss": 4.6307, "step": 27500 }, { "epoch": 1.0593220338983051, "grad_norm": 1.5238388776779175, "learning_rate": 0.00019407157990314767, "loss": 4.6137, "step": 28000 }, { "epoch": 1.0782384987893463, "grad_norm": 2.2133569717407227, "learning_rate": 0.00019217993341404354, "loss": 4.5785, "step": 28500 }, { "epoch": 1.0971549636803875, "grad_norm": 1.6678129434585571, "learning_rate": 0.00019028828692493945, "loss": 4.5837, "step": 29000 }, { "epoch": 1.1160714285714286, "grad_norm": 2.0616204738616943, "learning_rate": 0.00018839664043583532, "loss": 4.5654, "step": 29500 }, { "epoch": 1.1349878934624698, "grad_norm": 1.1392605304718018, "learning_rate": 0.0001865049939467312, "loss": 4.5712, "step": 30000 }, { "epoch": 1.153904358353511, "grad_norm": 1.5049901008605957, "learning_rate": 0.0001846133474576271, "loss": 4.5356, "step": 30500 }, { "epoch": 1.172820823244552, "grad_norm": 1.9887439012527466, "learning_rate": 0.00018272170096852297, "loss": 4.5427, "step": 31000 }, { "epoch": 1.1917372881355932, "grad_norm": 1.2317928075790405, "learning_rate": 0.00018083005447941888, "loss": 4.5478, "step": 31500 }, { "epoch": 1.2106537530266344, "grad_norm": 1.7353732585906982, "learning_rate": 0.00017893840799031475, "loss": 4.5317, "step": 32000 }, { "epoch": 1.2295702179176755, "grad_norm": 1.880778431892395, "learning_rate": 0.00017704676150121063, "loss": 4.521, "step": 32500 }, { "epoch": 1.2484866828087167, "grad_norm": 1.8771882057189941, "learning_rate": 0.00017515511501210653, "loss": 4.4815, "step": 33000 }, { "epoch": 1.2674031476997578, "grad_norm": 1.9645997285842896, "learning_rate": 0.0001732634685230024, "loss": 4.4965, "step": 33500 }, { "epoch": 1.286319612590799, "grad_norm": 2.013093948364258, "learning_rate": 0.00017137182203389828, "loss": 4.4653, "step": 34000 }, { "epoch": 1.3052360774818401, "grad_norm": 2.37980055809021, "learning_rate": 0.00016948017554479418, "loss": 4.4756, "step": 34500 }, { "epoch": 1.3241525423728815, "grad_norm": 1.638635277748108, "learning_rate": 0.00016758852905569005, "loss": 4.4445, "step": 35000 }, { "epoch": 1.3430690072639226, "grad_norm": 1.2321521043777466, "learning_rate": 0.00016569688256658593, "loss": 4.4817, "step": 35500 }, { "epoch": 1.3619854721549638, "grad_norm": 1.8688061237335205, "learning_rate": 0.00016380523607748183, "loss": 4.4432, "step": 36000 }, { "epoch": 1.380901937046005, "grad_norm": 1.8400958776474, "learning_rate": 0.0001619135895883777, "loss": 4.4358, "step": 36500 }, { "epoch": 1.399818401937046, "grad_norm": 2.266047716140747, "learning_rate": 0.0001600219430992736, "loss": 4.4424, "step": 37000 }, { "epoch": 1.4187348668280872, "grad_norm": 2.100658416748047, "learning_rate": 0.00015813029661016948, "loss": 4.4106, "step": 37500 }, { "epoch": 1.4376513317191284, "grad_norm": 1.547130823135376, "learning_rate": 0.00015623865012106536, "loss": 4.3929, "step": 38000 }, { "epoch": 1.4565677966101696, "grad_norm": 1.9904333353042603, "learning_rate": 0.00015434700363196126, "loss": 4.4082, "step": 38500 }, { "epoch": 1.4754842615012107, "grad_norm": 1.5838130712509155, "learning_rate": 0.00015245535714285713, "loss": 4.3945, "step": 39000 }, { "epoch": 1.4944007263922519, "grad_norm": 2.325477361679077, "learning_rate": 0.000150563710653753, "loss": 4.3806, "step": 39500 }, { "epoch": 1.513317191283293, "grad_norm": 2.4055263996124268, "learning_rate": 0.00014867206416464888, "loss": 4.3929, "step": 40000 }, { "epoch": 1.5322336561743342, "grad_norm": 1.5647815465927124, "learning_rate": 0.00014678041767554479, "loss": 4.3551, "step": 40500 }, { "epoch": 1.5511501210653753, "grad_norm": 1.8453116416931152, "learning_rate": 0.00014488877118644066, "loss": 4.3654, "step": 41000 }, { "epoch": 1.5700665859564165, "grad_norm": 1.488938808441162, "learning_rate": 0.00014299712469733654, "loss": 4.3329, "step": 41500 }, { "epoch": 1.5889830508474576, "grad_norm": 2.035290479660034, "learning_rate": 0.00014110547820823244, "loss": 4.3577, "step": 42000 }, { "epoch": 1.6078995157384988, "grad_norm": 1.7979990243911743, "learning_rate": 0.0001392138317191283, "loss": 4.3409, "step": 42500 }, { "epoch": 1.62681598062954, "grad_norm": 1.5887001752853394, "learning_rate": 0.0001373221852300242, "loss": 4.3569, "step": 43000 }, { "epoch": 1.645732445520581, "grad_norm": 2.750321388244629, "learning_rate": 0.0001354305387409201, "loss": 4.299, "step": 43500 }, { "epoch": 1.6646489104116222, "grad_norm": 1.6100013256072998, "learning_rate": 0.00013353889225181596, "loss": 4.3352, "step": 44000 }, { "epoch": 1.6835653753026634, "grad_norm": 2.0449681282043457, "learning_rate": 0.00013164724576271187, "loss": 4.3398, "step": 44500 }, { "epoch": 1.7024818401937045, "grad_norm": 1.5766605138778687, "learning_rate": 0.00012975559927360774, "loss": 4.3298, "step": 45000 }, { "epoch": 1.7213983050847457, "grad_norm": 1.8957105875015259, "learning_rate": 0.00012786395278450362, "loss": 4.3114, "step": 45500 }, { "epoch": 1.7403147699757868, "grad_norm": 1.4540507793426514, "learning_rate": 0.00012597230629539952, "loss": 4.3202, "step": 46000 }, { "epoch": 1.759231234866828, "grad_norm": 3.3543920516967773, "learning_rate": 0.0001240806598062954, "loss": 4.2924, "step": 46500 }, { "epoch": 1.7781476997578691, "grad_norm": 1.7161678075790405, "learning_rate": 0.00012218901331719127, "loss": 4.3171, "step": 47000 }, { "epoch": 1.7970641646489103, "grad_norm": 1.3512232303619385, "learning_rate": 0.00012029736682808714, "loss": 4.3074, "step": 47500 }, { "epoch": 1.8159806295399514, "grad_norm": 2.0491695404052734, "learning_rate": 0.00011840572033898303, "loss": 4.2953, "step": 48000 }, { "epoch": 1.8348970944309926, "grad_norm": 1.3966342210769653, "learning_rate": 0.00011651407384987892, "loss": 4.2738, "step": 48500 }, { "epoch": 1.8538135593220337, "grad_norm": 1.5732593536376953, "learning_rate": 0.00011462242736077481, "loss": 4.3132, "step": 49000 }, { "epoch": 1.872730024213075, "grad_norm": 2.0010573863983154, "learning_rate": 0.00011273078087167068, "loss": 4.2993, "step": 49500 }, { "epoch": 1.8916464891041163, "grad_norm": 1.476754903793335, "learning_rate": 0.00011083913438256657, "loss": 4.2867, "step": 50000 }, { "epoch": 1.9105629539951574, "grad_norm": 1.8218967914581299, "learning_rate": 0.00010894748789346246, "loss": 4.2594, "step": 50500 }, { "epoch": 1.9294794188861986, "grad_norm": 2.112929582595825, "learning_rate": 0.00010705584140435835, "loss": 4.2801, "step": 51000 }, { "epoch": 1.9483958837772397, "grad_norm": 1.6175868511199951, "learning_rate": 0.00010516419491525422, "loss": 4.2593, "step": 51500 }, { "epoch": 1.9673123486682809, "grad_norm": 1.6821808815002441, "learning_rate": 0.00010327254842615011, "loss": 4.2319, "step": 52000 }, { "epoch": 1.986228813559322, "grad_norm": 1.5724012851715088, "learning_rate": 0.000101380901937046, "loss": 4.2547, "step": 52500 }, { "epoch": 2.0, "eval_loss": 4.0252814292907715, "eval_rouge1": 0.0322746293108514, "eval_rouge2": 0.006304439772753797, "eval_rougeL": 0.0319470868840228, "eval_rougeLsum": 0.0323918851898741, "eval_runtime": 1535.538, "eval_samples_per_second": 3.825, "eval_steps_per_second": 1.913, "step": 52864 }, { "epoch": 2.0051452784503634, "grad_norm": 1.531037449836731, "learning_rate": 9.948925544794189e-05, "loss": 4.2014, "step": 53000 }, { "epoch": 2.0240617433414045, "grad_norm": 1.795510172843933, "learning_rate": 9.759760895883776e-05, "loss": 4.1718, "step": 53500 }, { "epoch": 2.0429782082324457, "grad_norm": 2.444718360900879, "learning_rate": 9.570596246973365e-05, "loss": 4.1535, "step": 54000 }, { "epoch": 2.061894673123487, "grad_norm": 1.757287859916687, "learning_rate": 9.381431598062954e-05, "loss": 4.1447, "step": 54500 }, { "epoch": 2.080811138014528, "grad_norm": 2.1495566368103027, "learning_rate": 9.19226694915254e-05, "loss": 4.1612, "step": 55000 }, { "epoch": 2.099727602905569, "grad_norm": 1.7492542266845703, "learning_rate": 9.003102300242129e-05, "loss": 4.1351, "step": 55500 }, { "epoch": 2.1186440677966103, "grad_norm": 1.9334352016448975, "learning_rate": 8.813937651331718e-05, "loss": 4.1265, "step": 56000 }, { "epoch": 2.1375605326876514, "grad_norm": 1.4551591873168945, "learning_rate": 8.624773002421307e-05, "loss": 4.1541, "step": 56500 }, { "epoch": 2.1564769975786926, "grad_norm": 2.019604444503784, "learning_rate": 8.435608353510894e-05, "loss": 4.1578, "step": 57000 }, { "epoch": 2.1753934624697338, "grad_norm": 1.2288185358047485, "learning_rate": 8.246443704600483e-05, "loss": 4.1524, "step": 57500 }, { "epoch": 2.194309927360775, "grad_norm": 1.313474416732788, "learning_rate": 8.057279055690072e-05, "loss": 4.134, "step": 58000 }, { "epoch": 2.213226392251816, "grad_norm": 2.2463033199310303, "learning_rate": 7.868114406779661e-05, "loss": 4.1114, "step": 58500 }, { "epoch": 2.232142857142857, "grad_norm": 1.560890555381775, "learning_rate": 7.678949757869248e-05, "loss": 4.128, "step": 59000 }, { "epoch": 2.2510593220338984, "grad_norm": 1.6205390691757202, "learning_rate": 7.489785108958837e-05, "loss": 4.1441, "step": 59500 }, { "epoch": 2.2699757869249395, "grad_norm": 1.6627792119979858, "learning_rate": 7.300620460048426e-05, "loss": 4.1235, "step": 60000 }, { "epoch": 2.2888922518159807, "grad_norm": 1.6804839372634888, "learning_rate": 7.111455811138013e-05, "loss": 4.1176, "step": 60500 }, { "epoch": 2.307808716707022, "grad_norm": 1.7862976789474487, "learning_rate": 6.922291162227602e-05, "loss": 4.1161, "step": 61000 }, { "epoch": 2.326725181598063, "grad_norm": 1.7818354368209839, "learning_rate": 6.733126513317191e-05, "loss": 4.1204, "step": 61500 }, { "epoch": 2.345641646489104, "grad_norm": 1.4335715770721436, "learning_rate": 6.543961864406779e-05, "loss": 4.1003, "step": 62000 }, { "epoch": 2.3645581113801453, "grad_norm": 1.509127140045166, "learning_rate": 6.354797215496367e-05, "loss": 4.097, "step": 62500 }, { "epoch": 2.3834745762711864, "grad_norm": 1.6239145994186401, "learning_rate": 6.165632566585956e-05, "loss": 4.1343, "step": 63000 }, { "epoch": 2.4023910411622276, "grad_norm": 1.8273452520370483, "learning_rate": 5.9764679176755444e-05, "loss": 4.0974, "step": 63500 }, { "epoch": 2.4213075060532687, "grad_norm": 1.709304928779602, "learning_rate": 5.787303268765133e-05, "loss": 4.1166, "step": 64000 }, { "epoch": 2.44022397094431, "grad_norm": 1.4822978973388672, "learning_rate": 5.598138619854721e-05, "loss": 4.0727, "step": 64500 }, { "epoch": 2.459140435835351, "grad_norm": 1.680364966392517, "learning_rate": 5.408973970944309e-05, "loss": 4.0892, "step": 65000 }, { "epoch": 2.478056900726392, "grad_norm": 2.6956686973571777, "learning_rate": 5.219809322033898e-05, "loss": 4.1104, "step": 65500 }, { "epoch": 2.4969733656174333, "grad_norm": 1.3153131008148193, "learning_rate": 5.030644673123486e-05, "loss": 4.1054, "step": 66000 }, { "epoch": 2.5158898305084745, "grad_norm": 1.3412959575653076, "learning_rate": 4.841480024213075e-05, "loss": 4.1015, "step": 66500 }, { "epoch": 2.5348062953995156, "grad_norm": 1.6976650953292847, "learning_rate": 4.652315375302663e-05, "loss": 4.1274, "step": 67000 }, { "epoch": 2.553722760290557, "grad_norm": 1.8603146076202393, "learning_rate": 4.463150726392252e-05, "loss": 4.111, "step": 67500 }, { "epoch": 2.572639225181598, "grad_norm": 2.3264224529266357, "learning_rate": 4.27398607748184e-05, "loss": 4.0838, "step": 68000 }, { "epoch": 2.591555690072639, "grad_norm": 1.8271955251693726, "learning_rate": 4.084821428571428e-05, "loss": 4.1165, "step": 68500 }, { "epoch": 2.6104721549636802, "grad_norm": 2.3216850757598877, "learning_rate": 3.895656779661016e-05, "loss": 4.0805, "step": 69000 }, { "epoch": 2.6293886198547214, "grad_norm": 1.6033034324645996, "learning_rate": 3.706492130750605e-05, "loss": 4.0799, "step": 69500 }, { "epoch": 2.648305084745763, "grad_norm": 1.6763901710510254, "learning_rate": 3.517327481840193e-05, "loss": 4.0538, "step": 70000 }, { "epoch": 2.667221549636804, "grad_norm": 2.459399938583374, "learning_rate": 3.328162832929782e-05, "loss": 4.0777, "step": 70500 }, { "epoch": 2.6861380145278453, "grad_norm": 1.5721230506896973, "learning_rate": 3.13899818401937e-05, "loss": 4.0774, "step": 71000 }, { "epoch": 2.7050544794188864, "grad_norm": 1.7746046781539917, "learning_rate": 2.9498335351089584e-05, "loss": 4.0636, "step": 71500 }, { "epoch": 2.7239709443099276, "grad_norm": 1.4254429340362549, "learning_rate": 2.760668886198547e-05, "loss": 4.0853, "step": 72000 }, { "epoch": 2.7428874092009687, "grad_norm": 1.4355696439743042, "learning_rate": 2.5715042372881354e-05, "loss": 4.0817, "step": 72500 }, { "epoch": 2.76180387409201, "grad_norm": 1.614816427230835, "learning_rate": 2.382339588377724e-05, "loss": 4.0721, "step": 73000 }, { "epoch": 2.780720338983051, "grad_norm": 1.4038259983062744, "learning_rate": 2.193174939467312e-05, "loss": 4.0766, "step": 73500 }, { "epoch": 2.799636803874092, "grad_norm": 1.4863741397857666, "learning_rate": 2.0040102905569006e-05, "loss": 4.0841, "step": 74000 }, { "epoch": 2.8185532687651333, "grad_norm": 1.383074402809143, "learning_rate": 1.814845641646489e-05, "loss": 4.0546, "step": 74500 }, { "epoch": 2.8374697336561745, "grad_norm": 1.1907336711883545, "learning_rate": 1.6256809927360773e-05, "loss": 4.1018, "step": 75000 }, { "epoch": 2.8563861985472156, "grad_norm": 1.6975387334823608, "learning_rate": 1.4365163438256656e-05, "loss": 4.0758, "step": 75500 }, { "epoch": 2.875302663438257, "grad_norm": 1.451759934425354, "learning_rate": 1.2473516949152541e-05, "loss": 4.0655, "step": 76000 }, { "epoch": 2.894219128329298, "grad_norm": 1.7477492094039917, "learning_rate": 1.0581870460048424e-05, "loss": 4.0795, "step": 76500 }, { "epoch": 2.913135593220339, "grad_norm": 1.0805734395980835, "learning_rate": 8.69022397094431e-06, "loss": 4.0839, "step": 77000 }, { "epoch": 2.9320520581113803, "grad_norm": 1.5750969648361206, "learning_rate": 6.7985774818401935e-06, "loss": 4.0445, "step": 77500 }, { "epoch": 2.9509685230024214, "grad_norm": 1.9501795768737793, "learning_rate": 4.906930992736077e-06, "loss": 4.0509, "step": 78000 }, { "epoch": 2.9698849878934626, "grad_norm": 1.9664883613586426, "learning_rate": 3.015284503631961e-06, "loss": 4.062, "step": 78500 }, { "epoch": 2.9888014527845037, "grad_norm": 1.3531116247177124, "learning_rate": 1.1236380145278449e-06, "loss": 4.0699, "step": 79000 } ], "logging_steps": 500, "max_steps": 79296, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4314942284949504.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }