diff --git "a/checkpoint-18312/trainer_state.json" "b/checkpoint-18312/trainer_state.json" deleted file mode 100644--- "a/checkpoint-18312/trainer_state.json" +++ /dev/null @@ -1,4163 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.6629138233750249, - "eval_steps": 500, - "global_step": 18312, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0011222328814234257, - "grad_norm": 37.096622467041016, - "learning_rate": 1.0157273918741808e-06, - "loss": 8.8686, - "step": 31 - }, - { - "epoch": 0.0022444657628468514, - "grad_norm": 13.880346298217773, - "learning_rate": 2.0314547837483616e-06, - "loss": 7.6419, - "step": 62 - }, - { - "epoch": 0.0033666986442702773, - "grad_norm": 16.09684944152832, - "learning_rate": 3.0471821756225426e-06, - "loss": 6.4382, - "step": 93 - }, - { - "epoch": 0.004488931525693703, - "grad_norm": 19.170230865478516, - "learning_rate": 4.062909567496723e-06, - "loss": 5.3399, - "step": 124 - }, - { - "epoch": 0.005611164407117128, - "grad_norm": 24.654130935668945, - "learning_rate": 5.078636959370905e-06, - "loss": 4.7646, - "step": 155 - }, - { - "epoch": 0.006733397288540555, - "grad_norm": 24.712974548339844, - "learning_rate": 6.094364351245085e-06, - "loss": 4.4667, - "step": 186 - }, - { - "epoch": 0.00785563016996398, - "grad_norm": 17.238990783691406, - "learning_rate": 7.110091743119267e-06, - "loss": 4.2168, - "step": 217 - }, - { - "epoch": 0.008977863051387406, - "grad_norm": 20.40213394165039, - "learning_rate": 8.125819134993446e-06, - "loss": 4.0355, - "step": 248 - }, - { - "epoch": 0.010100095932810832, - "grad_norm": 15.052313804626465, - "learning_rate": 9.141546526867629e-06, - "loss": 3.8458, - "step": 279 - }, - { - "epoch": 0.011222328814234257, - "grad_norm": 18.802026748657227, - "learning_rate": 1.015727391874181e-05, - "loss": 3.6688, - "step": 310 - }, - { - "epoch": 0.012344561695657683, - "grad_norm": 16.62171745300293, - "learning_rate": 1.117300131061599e-05, - "loss": 3.52, - "step": 341 - }, - { - "epoch": 0.01346679457708111, - "grad_norm": 16.29236602783203, - "learning_rate": 1.218872870249017e-05, - "loss": 3.402, - "step": 372 - }, - { - "epoch": 0.014589027458504534, - "grad_norm": 11.65068531036377, - "learning_rate": 1.3204456094364351e-05, - "loss": 3.2829, - "step": 403 - }, - { - "epoch": 0.01571126033992796, - "grad_norm": 10.617654800415039, - "learning_rate": 1.4220183486238533e-05, - "loss": 3.2008, - "step": 434 - }, - { - "epoch": 0.016833493221351387, - "grad_norm": 10.611294746398926, - "learning_rate": 1.5235910878112714e-05, - "loss": 3.1249, - "step": 465 - }, - { - "epoch": 0.01795572610277481, - "grad_norm": 9.946114540100098, - "learning_rate": 1.6251638269986893e-05, - "loss": 3.0503, - "step": 496 - }, - { - "epoch": 0.019077958984198236, - "grad_norm": 10.92148494720459, - "learning_rate": 1.7267365661861077e-05, - "loss": 2.9903, - "step": 527 - }, - { - "epoch": 0.020200191865621664, - "grad_norm": 8.329671859741211, - "learning_rate": 1.8283093053735257e-05, - "loss": 2.9261, - "step": 558 - }, - { - "epoch": 0.02132242474704509, - "grad_norm": 7.897571086883545, - "learning_rate": 1.9298820445609438e-05, - "loss": 2.889, - "step": 589 - }, - { - "epoch": 0.022444657628468513, - "grad_norm": 7.548309326171875, - "learning_rate": 2.031454783748362e-05, - "loss": 2.7945, - "step": 620 - }, - { - "epoch": 0.02356689050989194, - "grad_norm": 8.54383659362793, - "learning_rate": 2.13302752293578e-05, - "loss": 2.7538, - "step": 651 - }, - { - "epoch": 0.024689123391315366, - "grad_norm": 7.025435924530029, - "learning_rate": 2.234600262123198e-05, - "loss": 2.7075, - "step": 682 - }, - { - "epoch": 0.02581135627273879, - "grad_norm": 7.59956169128418, - "learning_rate": 2.336173001310616e-05, - "loss": 2.6625, - "step": 713 - }, - { - "epoch": 0.02693358915416222, - "grad_norm": 6.982921123504639, - "learning_rate": 2.437745740498034e-05, - "loss": 2.6248, - "step": 744 - }, - { - "epoch": 0.028055822035585643, - "grad_norm": 6.033556938171387, - "learning_rate": 2.5393184796854525e-05, - "loss": 2.5724, - "step": 775 - }, - { - "epoch": 0.029178054917009068, - "grad_norm": 6.674008846282959, - "learning_rate": 2.6408912188728702e-05, - "loss": 2.5292, - "step": 806 - }, - { - "epoch": 0.030300287798432492, - "grad_norm": 6.499022006988525, - "learning_rate": 2.7424639580602886e-05, - "loss": 2.496, - "step": 837 - }, - { - "epoch": 0.03142252067985592, - "grad_norm": 6.163687229156494, - "learning_rate": 2.8440366972477066e-05, - "loss": 2.4541, - "step": 868 - }, - { - "epoch": 0.032544753561279345, - "grad_norm": 5.20266580581665, - "learning_rate": 2.9456094364351244e-05, - "loss": 2.449, - "step": 899 - }, - { - "epoch": 0.03366698644270277, - "grad_norm": 5.6633830070495605, - "learning_rate": 3.0471821756225428e-05, - "loss": 2.4085, - "step": 930 - }, - { - "epoch": 0.034789219324126194, - "grad_norm": 6.414912700653076, - "learning_rate": 3.148754914809961e-05, - "loss": 2.3791, - "step": 961 - }, - { - "epoch": 0.03591145220554962, - "grad_norm": 4.983119964599609, - "learning_rate": 3.2503276539973785e-05, - "loss": 2.3505, - "step": 992 - }, - { - "epoch": 0.03703368508697305, - "grad_norm": 5.280698299407959, - "learning_rate": 3.351900393184797e-05, - "loss": 2.3191, - "step": 1023 - }, - { - "epoch": 0.03815591796839647, - "grad_norm": 5.565277099609375, - "learning_rate": 3.453473132372215e-05, - "loss": 2.2957, - "step": 1054 - }, - { - "epoch": 0.0392781508498199, - "grad_norm": 5.02451753616333, - "learning_rate": 3.555045871559633e-05, - "loss": 2.2618, - "step": 1085 - }, - { - "epoch": 0.04040038373124333, - "grad_norm": 4.424225807189941, - "learning_rate": 3.6566186107470514e-05, - "loss": 2.2512, - "step": 1116 - }, - { - "epoch": 0.04152261661266675, - "grad_norm": 6.270051002502441, - "learning_rate": 3.7581913499344695e-05, - "loss": 2.2354, - "step": 1147 - }, - { - "epoch": 0.04264484949409018, - "grad_norm": 14.256332397460938, - "learning_rate": 3.8597640891218876e-05, - "loss": 2.2364, - "step": 1178 - }, - { - "epoch": 0.043767082375513605, - "grad_norm": 4.837010383605957, - "learning_rate": 3.9613368283093056e-05, - "loss": 2.2346, - "step": 1209 - }, - { - "epoch": 0.044889315256937026, - "grad_norm": 3.9555633068084717, - "learning_rate": 4.062909567496724e-05, - "loss": 2.2003, - "step": 1240 - }, - { - "epoch": 0.046011548138360454, - "grad_norm": 4.136904716491699, - "learning_rate": 4.164482306684142e-05, - "loss": 2.2056, - "step": 1271 - }, - { - "epoch": 0.04713378101978388, - "grad_norm": 4.25378942489624, - "learning_rate": 4.26605504587156e-05, - "loss": 2.1395, - "step": 1302 - }, - { - "epoch": 0.048256013901207304, - "grad_norm": 3.6108360290527344, - "learning_rate": 4.367627785058978e-05, - "loss": 2.1296, - "step": 1333 - }, - { - "epoch": 0.04937824678263073, - "grad_norm": 3.66212797164917, - "learning_rate": 4.469200524246396e-05, - "loss": 2.1316, - "step": 1364 - }, - { - "epoch": 0.05050047966405416, - "grad_norm": 3.5523183345794678, - "learning_rate": 4.570773263433814e-05, - "loss": 2.1381, - "step": 1395 - }, - { - "epoch": 0.05162271254547758, - "grad_norm": 3.710803747177124, - "learning_rate": 4.672346002621232e-05, - "loss": 2.1296, - "step": 1426 - }, - { - "epoch": 0.05274494542690101, - "grad_norm": 3.346266031265259, - "learning_rate": 4.77391874180865e-05, - "loss": 2.0755, - "step": 1457 - }, - { - "epoch": 0.05386717830832444, - "grad_norm": 3.264901876449585, - "learning_rate": 4.875491480996068e-05, - "loss": 2.0902, - "step": 1488 - }, - { - "epoch": 0.05498941118974786, - "grad_norm": 3.031913995742798, - "learning_rate": 4.977064220183487e-05, - "loss": 2.1002, - "step": 1519 - }, - { - "epoch": 0.056111644071171286, - "grad_norm": 3.3827006816864014, - "learning_rate": 4.9999915451558777e-05, - "loss": 2.111, - "step": 1550 - }, - { - "epoch": 0.057233876952594714, - "grad_norm": 3.5572054386138916, - "learning_rate": 4.999955597496219e-05, - "loss": 2.0809, - "step": 1581 - }, - { - "epoch": 0.058356109834018136, - "grad_norm": 3.2875311374664307, - "learning_rate": 4.9998914381774255e-05, - "loss": 2.0562, - "step": 1612 - }, - { - "epoch": 0.059478342715441564, - "grad_norm": 2.903362274169922, - "learning_rate": 4.999799067923527e-05, - "loss": 2.0598, - "step": 1643 - }, - { - "epoch": 0.060600575596864985, - "grad_norm": 2.980804681777954, - "learning_rate": 4.999678487776908e-05, - "loss": 2.0458, - "step": 1674 - }, - { - "epoch": 0.06172280847828841, - "grad_norm": 2.880610466003418, - "learning_rate": 4.9995296990983006e-05, - "loss": 2.0433, - "step": 1705 - }, - { - "epoch": 0.06284504135971183, - "grad_norm": 2.7269234657287598, - "learning_rate": 4.999352703566763e-05, - "loss": 2.0189, - "step": 1736 - }, - { - "epoch": 0.06396727424113527, - "grad_norm": 2.808084487915039, - "learning_rate": 4.999147503179668e-05, - "loss": 2.0083, - "step": 1767 - }, - { - "epoch": 0.06508950712255869, - "grad_norm": 2.925065279006958, - "learning_rate": 4.998914100252672e-05, - "loss": 2.001, - "step": 1798 - }, - { - "epoch": 0.06621174000398211, - "grad_norm": 2.996300458908081, - "learning_rate": 4.998652497419696e-05, - "loss": 1.9877, - "step": 1829 - }, - { - "epoch": 0.06733397288540555, - "grad_norm": 2.6028084754943848, - "learning_rate": 4.9983626976328927e-05, - "loss": 1.9778, - "step": 1860 - }, - { - "epoch": 0.06845620576682897, - "grad_norm": 2.4577603340148926, - "learning_rate": 4.998044704162613e-05, - "loss": 1.9998, - "step": 1891 - }, - { - "epoch": 0.06957843864825239, - "grad_norm": 2.4269509315490723, - "learning_rate": 4.9976985205973705e-05, - "loss": 1.9813, - "step": 1922 - }, - { - "epoch": 0.07070067152967582, - "grad_norm": 2.6069250106811523, - "learning_rate": 4.997324150843799e-05, - "loss": 1.9781, - "step": 1953 - }, - { - "epoch": 0.07182290441109924, - "grad_norm": 2.5287699699401855, - "learning_rate": 4.99692159912661e-05, - "loss": 1.9684, - "step": 1984 - }, - { - "epoch": 0.07294513729252267, - "grad_norm": 2.6519899368286133, - "learning_rate": 4.996490869988546e-05, - "loss": 1.9821, - "step": 2015 - }, - { - "epoch": 0.0740673701739461, - "grad_norm": 2.525928497314453, - "learning_rate": 4.996031968290326e-05, - "loss": 1.9512, - "step": 2046 - }, - { - "epoch": 0.07518960305536952, - "grad_norm": 2.4517486095428467, - "learning_rate": 4.995544899210594e-05, - "loss": 1.9283, - "step": 2077 - }, - { - "epoch": 0.07631183593679294, - "grad_norm": 2.7807457447052, - "learning_rate": 4.9950296682458583e-05, - "loss": 1.9448, - "step": 2108 - }, - { - "epoch": 0.07743406881821638, - "grad_norm": 2.4739558696746826, - "learning_rate": 4.994486281210429e-05, - "loss": 1.946, - "step": 2139 - }, - { - "epoch": 0.0785563016996398, - "grad_norm": 2.6515214443206787, - "learning_rate": 4.9939147442363566e-05, - "loss": 1.9474, - "step": 2170 - }, - { - "epoch": 0.07967853458106322, - "grad_norm": 2.8361852169036865, - "learning_rate": 4.9933150637733574e-05, - "loss": 1.9463, - "step": 2201 - }, - { - "epoch": 0.08080076746248666, - "grad_norm": 2.332261323928833, - "learning_rate": 4.992687246588743e-05, - "loss": 1.9607, - "step": 2232 - }, - { - "epoch": 0.08192300034391008, - "grad_norm": 2.3486499786376953, - "learning_rate": 4.992031299767347e-05, - "loss": 1.9248, - "step": 2263 - }, - { - "epoch": 0.0830452332253335, - "grad_norm": 3.125208616256714, - "learning_rate": 4.9913472307114386e-05, - "loss": 1.9088, - "step": 2294 - }, - { - "epoch": 0.08416746610675693, - "grad_norm": 2.2809853553771973, - "learning_rate": 4.9906350471406446e-05, - "loss": 1.9199, - "step": 2325 - }, - { - "epoch": 0.08528969898818035, - "grad_norm": 2.567641258239746, - "learning_rate": 4.989894757091861e-05, - "loss": 1.9054, - "step": 2356 - }, - { - "epoch": 0.08641193186960378, - "grad_norm": 2.2755303382873535, - "learning_rate": 4.989126368919158e-05, - "loss": 1.903, - "step": 2387 - }, - { - "epoch": 0.08753416475102721, - "grad_norm": 2.147775888442993, - "learning_rate": 4.988329891293693e-05, - "loss": 1.8993, - "step": 2418 - }, - { - "epoch": 0.08865639763245063, - "grad_norm": 2.2279839515686035, - "learning_rate": 4.987505333203608e-05, - "loss": 1.905, - "step": 2449 - }, - { - "epoch": 0.08977863051387405, - "grad_norm": 2.317538022994995, - "learning_rate": 4.9866527039539276e-05, - "loss": 1.8776, - "step": 2480 - }, - { - "epoch": 0.09090086339529749, - "grad_norm": 2.296868324279785, - "learning_rate": 4.9857720131664594e-05, - "loss": 1.8714, - "step": 2511 - }, - { - "epoch": 0.09202309627672091, - "grad_norm": 2.282538890838623, - "learning_rate": 4.9848632707796773e-05, - "loss": 1.8765, - "step": 2542 - }, - { - "epoch": 0.09314532915814433, - "grad_norm": 2.1396827697753906, - "learning_rate": 4.9839264870486155e-05, - "loss": 1.8827, - "step": 2573 - }, - { - "epoch": 0.09426756203956776, - "grad_norm": 2.1897048950195312, - "learning_rate": 4.9829616725447526e-05, - "loss": 1.8655, - "step": 2604 - }, - { - "epoch": 0.09538979492099119, - "grad_norm": 2.1385130882263184, - "learning_rate": 4.981968838155888e-05, - "loss": 1.8768, - "step": 2635 - }, - { - "epoch": 0.09651202780241461, - "grad_norm": 2.264171600341797, - "learning_rate": 4.980947995086024e-05, - "loss": 1.8734, - "step": 2666 - }, - { - "epoch": 0.09763426068383804, - "grad_norm": 2.089871883392334, - "learning_rate": 4.979899154855234e-05, - "loss": 1.8516, - "step": 2697 - }, - { - "epoch": 0.09875649356526146, - "grad_norm": 2.092179298400879, - "learning_rate": 4.9788223292995386e-05, - "loss": 1.8729, - "step": 2728 - }, - { - "epoch": 0.09987872644668488, - "grad_norm": 2.3216769695281982, - "learning_rate": 4.977717530570768e-05, - "loss": 1.8673, - "step": 2759 - }, - { - "epoch": 0.10100095932810832, - "grad_norm": 2.104457139968872, - "learning_rate": 4.976584771136425e-05, - "loss": 1.8734, - "step": 2790 - }, - { - "epoch": 0.10212319220953174, - "grad_norm": 2.236363649368286, - "learning_rate": 4.975424063779547e-05, - "loss": 1.8316, - "step": 2821 - }, - { - "epoch": 0.10324542509095516, - "grad_norm": 2.264967203140259, - "learning_rate": 4.974235421598557e-05, - "loss": 1.8614, - "step": 2852 - }, - { - "epoch": 0.1043676579723786, - "grad_norm": 2.1815454959869385, - "learning_rate": 4.973018858007122e-05, - "loss": 1.8365, - "step": 2883 - }, - { - "epoch": 0.10548989085380202, - "grad_norm": 2.049677848815918, - "learning_rate": 4.9717743867339963e-05, - "loss": 1.8454, - "step": 2914 - }, - { - "epoch": 0.10661212373522544, - "grad_norm": 1.9844895601272583, - "learning_rate": 4.9705020218228695e-05, - "loss": 1.8419, - "step": 2945 - }, - { - "epoch": 0.10773435661664887, - "grad_norm": 2.052708387374878, - "learning_rate": 4.969201777632205e-05, - "loss": 1.8509, - "step": 2976 - }, - { - "epoch": 0.1088565894980723, - "grad_norm": 2.014535665512085, - "learning_rate": 4.9678736688350846e-05, - "loss": 1.8129, - "step": 3007 - }, - { - "epoch": 0.10997882237949572, - "grad_norm": 1.9768311977386475, - "learning_rate": 4.966517710419033e-05, - "loss": 1.8375, - "step": 3038 - }, - { - "epoch": 0.11110105526091915, - "grad_norm": 2.046293258666992, - "learning_rate": 4.965133917685858e-05, - "loss": 1.8132, - "step": 3069 - }, - { - "epoch": 0.11222328814234257, - "grad_norm": 2.104555368423462, - "learning_rate": 4.9637223062514714e-05, - "loss": 1.8147, - "step": 3100 - }, - { - "epoch": 0.113345521023766, - "grad_norm": 2.04533052444458, - "learning_rate": 4.962282892045718e-05, - "loss": 1.8591, - "step": 3131 - }, - { - "epoch": 0.11446775390518943, - "grad_norm": 1.967282772064209, - "learning_rate": 4.9608156913121904e-05, - "loss": 1.7966, - "step": 3162 - }, - { - "epoch": 0.11558998678661285, - "grad_norm": 2.092106342315674, - "learning_rate": 4.959320720608049e-05, - "loss": 1.8301, - "step": 3193 - }, - { - "epoch": 0.11671221966803627, - "grad_norm": 2.0512046813964844, - "learning_rate": 4.9577979968038354e-05, - "loss": 1.8211, - "step": 3224 - }, - { - "epoch": 0.11783445254945969, - "grad_norm": 1.9260915517807007, - "learning_rate": 4.956247537083282e-05, - "loss": 1.7989, - "step": 3255 - }, - { - "epoch": 0.11895668543088313, - "grad_norm": 2.0938026905059814, - "learning_rate": 4.9546693589431145e-05, - "loss": 1.8336, - "step": 3286 - }, - { - "epoch": 0.12007891831230655, - "grad_norm": 1.9972988367080688, - "learning_rate": 4.9530634801928595e-05, - "loss": 1.8147, - "step": 3317 - }, - { - "epoch": 0.12120115119372997, - "grad_norm": 1.9120224714279175, - "learning_rate": 4.9514299189546395e-05, - "loss": 1.8028, - "step": 3348 - }, - { - "epoch": 0.1223233840751534, - "grad_norm": 1.959033727645874, - "learning_rate": 4.949768693662973e-05, - "loss": 1.8281, - "step": 3379 - }, - { - "epoch": 0.12344561695657683, - "grad_norm": 1.9182357788085938, - "learning_rate": 4.948079823064559e-05, - "loss": 1.8165, - "step": 3410 - }, - { - "epoch": 0.12456784983800025, - "grad_norm": 1.9079999923706055, - "learning_rate": 4.946363326218074e-05, - "loss": 1.7916, - "step": 3441 - }, - { - "epoch": 0.12569008271942367, - "grad_norm": 1.916276216506958, - "learning_rate": 4.9446192224939525e-05, - "loss": 1.8086, - "step": 3472 - }, - { - "epoch": 0.1268123156008471, - "grad_norm": 1.903389811515808, - "learning_rate": 4.942847531574167e-05, - "loss": 1.8116, - "step": 3503 - }, - { - "epoch": 0.12793454848227054, - "grad_norm": 2.064885139465332, - "learning_rate": 4.941048273452008e-05, - "loss": 1.8144, - "step": 3534 - }, - { - "epoch": 0.12905678136369395, - "grad_norm": 2.1314241886138916, - "learning_rate": 4.9392214684318605e-05, - "loss": 1.7943, - "step": 3565 - }, - { - "epoch": 0.13017901424511738, - "grad_norm": 2.0061681270599365, - "learning_rate": 4.93736713712897e-05, - "loss": 1.794, - "step": 3596 - }, - { - "epoch": 0.13130124712654082, - "grad_norm": 1.9408286809921265, - "learning_rate": 4.9354853004692124e-05, - "loss": 1.7882, - "step": 3627 - }, - { - "epoch": 0.13242348000796422, - "grad_norm": 1.8884766101837158, - "learning_rate": 4.93357597968886e-05, - "loss": 1.7846, - "step": 3658 - }, - { - "epoch": 0.13354571288938766, - "grad_norm": 1.9393378496170044, - "learning_rate": 4.931639196334338e-05, - "loss": 1.7923, - "step": 3689 - }, - { - "epoch": 0.1346679457708111, - "grad_norm": 1.8815410137176514, - "learning_rate": 4.9296749722619826e-05, - "loss": 1.7939, - "step": 3720 - }, - { - "epoch": 0.1357901786522345, - "grad_norm": 1.8603038787841797, - "learning_rate": 4.9276833296377966e-05, - "loss": 1.7589, - "step": 3751 - }, - { - "epoch": 0.13691241153365794, - "grad_norm": 1.775247573852539, - "learning_rate": 4.925664290937196e-05, - "loss": 1.7897, - "step": 3782 - }, - { - "epoch": 0.13803464441508137, - "grad_norm": 1.8576780557632446, - "learning_rate": 4.9236178789447576e-05, - "loss": 1.7908, - "step": 3813 - }, - { - "epoch": 0.13915687729650478, - "grad_norm": 1.800264596939087, - "learning_rate": 4.921544116753962e-05, - "loss": 1.7736, - "step": 3844 - }, - { - "epoch": 0.1402791101779282, - "grad_norm": 1.9730401039123535, - "learning_rate": 4.919443027766935e-05, - "loss": 1.7639, - "step": 3875 - }, - { - "epoch": 0.14140134305935165, - "grad_norm": 1.8654968738555908, - "learning_rate": 4.91731463569418e-05, - "loss": 1.7477, - "step": 3906 - }, - { - "epoch": 0.14252357594077505, - "grad_norm": 1.8131386041641235, - "learning_rate": 4.915158964554312e-05, - "loss": 1.7887, - "step": 3937 - }, - { - "epoch": 0.1436458088221985, - "grad_norm": 1.8576264381408691, - "learning_rate": 4.912976038673786e-05, - "loss": 1.7779, - "step": 3968 - }, - { - "epoch": 0.14476804170362192, - "grad_norm": 1.8940199613571167, - "learning_rate": 4.9107658826866254e-05, - "loss": 1.7653, - "step": 3999 - }, - { - "epoch": 0.14589027458504533, - "grad_norm": 1.7727802991867065, - "learning_rate": 4.908528521534139e-05, - "loss": 1.7809, - "step": 4030 - }, - { - "epoch": 0.14701250746646877, - "grad_norm": 1.7416553497314453, - "learning_rate": 4.906263980464644e-05, - "loss": 1.7605, - "step": 4061 - }, - { - "epoch": 0.1481347403478922, - "grad_norm": 1.82987642288208, - "learning_rate": 4.903972285033178e-05, - "loss": 1.7554, - "step": 4092 - }, - { - "epoch": 0.1492569732293156, - "grad_norm": 1.916339635848999, - "learning_rate": 4.901653461101213e-05, - "loss": 1.7872, - "step": 4123 - }, - { - "epoch": 0.15037920611073904, - "grad_norm": 1.8903008699417114, - "learning_rate": 4.8993075348363626e-05, - "loss": 1.782, - "step": 4154 - }, - { - "epoch": 0.15150143899216248, - "grad_norm": 1.9334847927093506, - "learning_rate": 4.896934532712084e-05, - "loss": 1.7565, - "step": 4185 - }, - { - "epoch": 0.1526236718735859, - "grad_norm": 1.7778478860855103, - "learning_rate": 4.8945344815073846e-05, - "loss": 1.7613, - "step": 4216 - }, - { - "epoch": 0.15374590475500932, - "grad_norm": 1.7348295450210571, - "learning_rate": 4.892107408306516e-05, - "loss": 1.7512, - "step": 4247 - }, - { - "epoch": 0.15486813763643276, - "grad_norm": 1.7189710140228271, - "learning_rate": 4.889653340498669e-05, - "loss": 1.741, - "step": 4278 - }, - { - "epoch": 0.15599037051785616, - "grad_norm": 1.8557075262069702, - "learning_rate": 4.8871723057776664e-05, - "loss": 1.7471, - "step": 4309 - }, - { - "epoch": 0.1571126033992796, - "grad_norm": 1.7188880443572998, - "learning_rate": 4.8846643321416476e-05, - "loss": 1.7492, - "step": 4340 - }, - { - "epoch": 0.15823483628070303, - "grad_norm": 1.6712063550949097, - "learning_rate": 4.882129447892753e-05, - "loss": 1.7434, - "step": 4371 - }, - { - "epoch": 0.15935706916212644, - "grad_norm": 1.7652437686920166, - "learning_rate": 4.8795676816368076e-05, - "loss": 1.7422, - "step": 4402 - }, - { - "epoch": 0.16047930204354988, - "grad_norm": 1.7910144329071045, - "learning_rate": 4.876979062282995e-05, - "loss": 1.7635, - "step": 4433 - }, - { - "epoch": 0.1616015349249733, - "grad_norm": 1.9248684644699097, - "learning_rate": 4.8743636190435325e-05, - "loss": 1.7401, - "step": 4464 - }, - { - "epoch": 0.16272376780639672, - "grad_norm": 1.828202486038208, - "learning_rate": 4.871721381433344e-05, - "loss": 1.7419, - "step": 4495 - }, - { - "epoch": 0.16384600068782015, - "grad_norm": 1.7170790433883667, - "learning_rate": 4.869052379269719e-05, - "loss": 1.7562, - "step": 4526 - }, - { - "epoch": 0.1649682335692436, - "grad_norm": 1.753203272819519, - "learning_rate": 4.866356642671985e-05, - "loss": 1.7569, - "step": 4557 - }, - { - "epoch": 0.166090466450667, - "grad_norm": 1.7906442880630493, - "learning_rate": 4.8636342020611634e-05, - "loss": 1.7376, - "step": 4588 - }, - { - "epoch": 0.16721269933209043, - "grad_norm": 1.7113378047943115, - "learning_rate": 4.860885088159626e-05, - "loss": 1.7386, - "step": 4619 - }, - { - "epoch": 0.16833493221351387, - "grad_norm": 1.7997937202453613, - "learning_rate": 4.858109331990751e-05, - "loss": 1.7531, - "step": 4650 - }, - { - "epoch": 0.16945716509493727, - "grad_norm": 1.76421320438385, - "learning_rate": 4.855306964878567e-05, - "loss": 1.7402, - "step": 4681 - }, - { - "epoch": 0.1705793979763607, - "grad_norm": 1.7803616523742676, - "learning_rate": 4.8524780184474084e-05, - "loss": 1.7345, - "step": 4712 - }, - { - "epoch": 0.17170163085778414, - "grad_norm": 1.7763142585754395, - "learning_rate": 4.8496225246215496e-05, - "loss": 1.7469, - "step": 4743 - }, - { - "epoch": 0.17282386373920755, - "grad_norm": 1.728219747543335, - "learning_rate": 4.8467405156248505e-05, - "loss": 1.7182, - "step": 4774 - }, - { - "epoch": 0.17394609662063099, - "grad_norm": 1.7837860584259033, - "learning_rate": 4.843832023980392e-05, - "loss": 1.739, - "step": 4805 - }, - { - "epoch": 0.17506832950205442, - "grad_norm": 1.7005128860473633, - "learning_rate": 4.840897082510106e-05, - "loss": 1.7377, - "step": 4836 - }, - { - "epoch": 0.17619056238347783, - "grad_norm": 1.6570392847061157, - "learning_rate": 4.8379357243344084e-05, - "loss": 1.712, - "step": 4867 - }, - { - "epoch": 0.17731279526490126, - "grad_norm": 1.6575350761413574, - "learning_rate": 4.8349479828718236e-05, - "loss": 1.7147, - "step": 4898 - }, - { - "epoch": 0.1784350281463247, - "grad_norm": 1.8768808841705322, - "learning_rate": 4.8319338918386075e-05, - "loss": 1.7312, - "step": 4929 - }, - { - "epoch": 0.1795572610277481, - "grad_norm": 1.7145389318466187, - "learning_rate": 4.828893485248369e-05, - "loss": 1.7221, - "step": 4960 - }, - { - "epoch": 0.18067949390917154, - "grad_norm": 1.834173560142517, - "learning_rate": 4.825826797411682e-05, - "loss": 1.7322, - "step": 4991 - }, - { - "epoch": 0.18180172679059498, - "grad_norm": 1.7125933170318604, - "learning_rate": 4.822733862935702e-05, - "loss": 1.7156, - "step": 5022 - }, - { - "epoch": 0.18292395967201838, - "grad_norm": 1.7470024824142456, - "learning_rate": 4.819614716723775e-05, - "loss": 1.7176, - "step": 5053 - }, - { - "epoch": 0.18404619255344182, - "grad_norm": 1.7042289972305298, - "learning_rate": 4.8164693939750425e-05, - "loss": 1.7192, - "step": 5084 - }, - { - "epoch": 0.18516842543486525, - "grad_norm": 1.6803418397903442, - "learning_rate": 4.813297930184042e-05, - "loss": 1.7197, - "step": 5115 - }, - { - "epoch": 0.18629065831628866, - "grad_norm": 1.7296956777572632, - "learning_rate": 4.810100361140314e-05, - "loss": 1.72, - "step": 5146 - }, - { - "epoch": 0.1874128911977121, - "grad_norm": 1.6245464086532593, - "learning_rate": 4.8068767229279885e-05, - "loss": 1.7081, - "step": 5177 - }, - { - "epoch": 0.18853512407913553, - "grad_norm": 1.7138885259628296, - "learning_rate": 4.8036270519253854e-05, - "loss": 1.7068, - "step": 5208 - }, - { - "epoch": 0.18965735696055894, - "grad_norm": 1.704185128211975, - "learning_rate": 4.8003513848046e-05, - "loss": 1.7219, - "step": 5239 - }, - { - "epoch": 0.19077958984198237, - "grad_norm": 1.712551236152649, - "learning_rate": 4.79704975853109e-05, - "loss": 1.7118, - "step": 5270 - }, - { - "epoch": 0.1919018227234058, - "grad_norm": 1.7193052768707275, - "learning_rate": 4.793722210363262e-05, - "loss": 1.7195, - "step": 5301 - }, - { - "epoch": 0.19302405560482921, - "grad_norm": 1.5574607849121094, - "learning_rate": 4.7903687778520414e-05, - "loss": 1.7286, - "step": 5332 - }, - { - "epoch": 0.19414628848625265, - "grad_norm": 1.7480719089508057, - "learning_rate": 4.7869894988404593e-05, - "loss": 1.6957, - "step": 5363 - }, - { - "epoch": 0.19526852136767608, - "grad_norm": 1.7487633228302002, - "learning_rate": 4.783584411463221e-05, - "loss": 1.7203, - "step": 5394 - }, - { - "epoch": 0.1963907542490995, - "grad_norm": 1.6720587015151978, - "learning_rate": 4.780153554146274e-05, - "loss": 1.7009, - "step": 5425 - }, - { - "epoch": 0.19751298713052293, - "grad_norm": 1.6622951030731201, - "learning_rate": 4.7766969656063766e-05, - "loss": 1.7049, - "step": 5456 - }, - { - "epoch": 0.19863522001194636, - "grad_norm": 1.656158208847046, - "learning_rate": 4.773214684850662e-05, - "loss": 1.7104, - "step": 5487 - }, - { - "epoch": 0.19975745289336977, - "grad_norm": 1.6559454202651978, - "learning_rate": 4.769706751176193e-05, - "loss": 1.7089, - "step": 5518 - }, - { - "epoch": 0.2008796857747932, - "grad_norm": 1.7262494564056396, - "learning_rate": 4.7661732041695264e-05, - "loss": 1.7143, - "step": 5549 - }, - { - "epoch": 0.20200191865621664, - "grad_norm": 1.6877381801605225, - "learning_rate": 4.762614083706258e-05, - "loss": 1.7134, - "step": 5580 - }, - { - "epoch": 0.20312415153764005, - "grad_norm": 1.5669549703598022, - "learning_rate": 4.759029429950581e-05, - "loss": 1.7213, - "step": 5611 - }, - { - "epoch": 0.20424638441906348, - "grad_norm": 1.7044217586517334, - "learning_rate": 4.7554192833548235e-05, - "loss": 1.7185, - "step": 5642 - }, - { - "epoch": 0.20536861730048692, - "grad_norm": 1.6999757289886475, - "learning_rate": 4.751783684659e-05, - "loss": 1.7163, - "step": 5673 - }, - { - "epoch": 0.20649085018191032, - "grad_norm": 1.6043522357940674, - "learning_rate": 4.748122674890348e-05, - "loss": 1.7031, - "step": 5704 - }, - { - "epoch": 0.20761308306333376, - "grad_norm": 1.7062305212020874, - "learning_rate": 4.7444362953628654e-05, - "loss": 1.7035, - "step": 5735 - }, - { - "epoch": 0.2087353159447572, - "grad_norm": 1.6612005233764648, - "learning_rate": 4.7407245876768424e-05, - "loss": 1.6981, - "step": 5766 - }, - { - "epoch": 0.2098575488261806, - "grad_norm": 1.7277076244354248, - "learning_rate": 4.736987593718397e-05, - "loss": 1.7161, - "step": 5797 - }, - { - "epoch": 0.21097978170760404, - "grad_norm": 1.705458402633667, - "learning_rate": 4.733225355658999e-05, - "loss": 1.6854, - "step": 5828 - }, - { - "epoch": 0.21210201458902747, - "grad_norm": 1.629443883895874, - "learning_rate": 4.7294379159549926e-05, - "loss": 1.7025, - "step": 5859 - }, - { - "epoch": 0.21322424747045088, - "grad_norm": 1.613192081451416, - "learning_rate": 4.725625317347119e-05, - "loss": 1.6992, - "step": 5890 - }, - { - "epoch": 0.2143464803518743, - "grad_norm": 1.6801332235336304, - "learning_rate": 4.7217876028600374e-05, - "loss": 1.6798, - "step": 5921 - }, - { - "epoch": 0.21546871323329775, - "grad_norm": 1.6418830156326294, - "learning_rate": 4.717924815801832e-05, - "loss": 1.6918, - "step": 5952 - }, - { - "epoch": 0.21659094611472116, - "grad_norm": 1.6128371953964233, - "learning_rate": 4.714036999763532e-05, - "loss": 1.706, - "step": 5983 - }, - { - "epoch": 0.2177131789961446, - "grad_norm": 1.71291983127594, - "learning_rate": 4.7101241986186116e-05, - "loss": 1.6861, - "step": 6014 - }, - { - "epoch": 0.21883541187756803, - "grad_norm": 1.5903745889663696, - "learning_rate": 4.7061864565225e-05, - "loss": 1.6886, - "step": 6045 - }, - { - "epoch": 0.21995764475899143, - "grad_norm": 1.71088445186615, - "learning_rate": 4.702223817912081e-05, - "loss": 1.7003, - "step": 6076 - }, - { - "epoch": 0.22107987764041487, - "grad_norm": 1.541530966758728, - "learning_rate": 4.698236327505195e-05, - "loss": 1.6956, - "step": 6107 - }, - { - "epoch": 0.2222021105218383, - "grad_norm": 1.539455533027649, - "learning_rate": 4.694224030300127e-05, - "loss": 1.6833, - "step": 6138 - }, - { - "epoch": 0.2233243434032617, - "grad_norm": 1.688120722770691, - "learning_rate": 4.690186971575107e-05, - "loss": 1.6973, - "step": 6169 - }, - { - "epoch": 0.22444657628468515, - "grad_norm": 1.6934964656829834, - "learning_rate": 4.6861251968877916e-05, - "loss": 1.6979, - "step": 6200 - }, - { - "epoch": 0.22556880916610858, - "grad_norm": 1.6558688879013062, - "learning_rate": 4.68203875207476e-05, - "loss": 1.6925, - "step": 6231 - }, - { - "epoch": 0.226691042047532, - "grad_norm": 1.6245280504226685, - "learning_rate": 4.677927683250983e-05, - "loss": 1.6688, - "step": 6262 - }, - { - "epoch": 0.22781327492895542, - "grad_norm": 1.5808422565460205, - "learning_rate": 4.6737920368093156e-05, - "loss": 1.688, - "step": 6293 - }, - { - "epoch": 0.22893550781037886, - "grad_norm": 1.5224875211715698, - "learning_rate": 4.669631859419965e-05, - "loss": 1.6864, - "step": 6324 - }, - { - "epoch": 0.23005774069180226, - "grad_norm": 1.5904366970062256, - "learning_rate": 4.6654471980299676e-05, - "loss": 1.6893, - "step": 6355 - }, - { - "epoch": 0.2311799735732257, - "grad_norm": 1.6145131587982178, - "learning_rate": 4.661238099862658e-05, - "loss": 1.6818, - "step": 6386 - }, - { - "epoch": 0.23230220645464913, - "grad_norm": 1.6297610998153687, - "learning_rate": 4.657004612417138e-05, - "loss": 1.687, - "step": 6417 - }, - { - "epoch": 0.23342443933607254, - "grad_norm": 1.6199692487716675, - "learning_rate": 4.6527467834677374e-05, - "loss": 1.6945, - "step": 6448 - }, - { - "epoch": 0.23454667221749598, - "grad_norm": 1.5439369678497314, - "learning_rate": 4.648464661063478e-05, - "loss": 1.6926, - "step": 6479 - }, - { - "epoch": 0.23566890509891938, - "grad_norm": 1.6095410585403442, - "learning_rate": 4.6441582935275264e-05, - "loss": 1.689, - "step": 6510 - }, - { - "epoch": 0.23679113798034282, - "grad_norm": 1.4971855878829956, - "learning_rate": 4.6398277294566586e-05, - "loss": 1.6622, - "step": 6541 - }, - { - "epoch": 0.23791337086176625, - "grad_norm": 1.53174889087677, - "learning_rate": 4.6354730177207e-05, - "loss": 1.6785, - "step": 6572 - }, - { - "epoch": 0.23903560374318966, - "grad_norm": 1.4567692279815674, - "learning_rate": 4.6310942074619787e-05, - "loss": 1.6776, - "step": 6603 - }, - { - "epoch": 0.2401578366246131, - "grad_norm": 1.6813284158706665, - "learning_rate": 4.626691348094777e-05, - "loss": 1.6692, - "step": 6634 - }, - { - "epoch": 0.24128006950603653, - "grad_norm": 1.5593857765197754, - "learning_rate": 4.622264489304762e-05, - "loss": 1.6811, - "step": 6665 - }, - { - "epoch": 0.24240230238745994, - "grad_norm": 1.5681389570236206, - "learning_rate": 4.617813681048434e-05, - "loss": 1.689, - "step": 6696 - }, - { - "epoch": 0.24352453526888337, - "grad_norm": 1.6402842998504639, - "learning_rate": 4.61333897355256e-05, - "loss": 1.6621, - "step": 6727 - }, - { - "epoch": 0.2446467681503068, - "grad_norm": 1.642669677734375, - "learning_rate": 4.608840417313604e-05, - "loss": 1.6562, - "step": 6758 - }, - { - "epoch": 0.24576900103173022, - "grad_norm": 1.6442660093307495, - "learning_rate": 4.6043180630971646e-05, - "loss": 1.6721, - "step": 6789 - }, - { - "epoch": 0.24689123391315365, - "grad_norm": 1.5577408075332642, - "learning_rate": 4.599771961937391e-05, - "loss": 1.6837, - "step": 6820 - }, - { - "epoch": 0.2480134667945771, - "grad_norm": 1.8555899858474731, - "learning_rate": 4.5952021651364204e-05, - "loss": 1.6739, - "step": 6851 - }, - { - "epoch": 0.2491356996760005, - "grad_norm": 1.667812466621399, - "learning_rate": 4.590608724263786e-05, - "loss": 1.6704, - "step": 6882 - }, - { - "epoch": 0.25025793255742396, - "grad_norm": 1.6642868518829346, - "learning_rate": 4.585991691155845e-05, - "loss": 1.6784, - "step": 6913 - }, - { - "epoch": 0.25138016543884734, - "grad_norm": 1.6429824829101562, - "learning_rate": 4.581351117915188e-05, - "loss": 1.6729, - "step": 6944 - }, - { - "epoch": 0.25250239832027077, - "grad_norm": 1.6268694400787354, - "learning_rate": 4.5766870569100534e-05, - "loss": 1.6657, - "step": 6975 - }, - { - "epoch": 0.2536246312016942, - "grad_norm": 1.496177315711975, - "learning_rate": 4.571999560773736e-05, - "loss": 1.6611, - "step": 7006 - }, - { - "epoch": 0.25474686408311764, - "grad_norm": 1.7032805681228638, - "learning_rate": 4.5672886824039915e-05, - "loss": 1.6816, - "step": 7037 - }, - { - "epoch": 0.2558690969645411, - "grad_norm": 1.791925072669983, - "learning_rate": 4.5625544749624435e-05, - "loss": 1.6689, - "step": 7068 - }, - { - "epoch": 0.2569913298459645, - "grad_norm": 1.5614711046218872, - "learning_rate": 4.5577969918739794e-05, - "loss": 1.6647, - "step": 7099 - }, - { - "epoch": 0.2581135627273879, - "grad_norm": 1.517112135887146, - "learning_rate": 4.5530162868261486e-05, - "loss": 1.6614, - "step": 7130 - }, - { - "epoch": 0.2592357956088113, - "grad_norm": 1.5636824369430542, - "learning_rate": 4.548212413768558e-05, - "loss": 1.6599, - "step": 7161 - }, - { - "epoch": 0.26035802849023476, - "grad_norm": 1.5803399085998535, - "learning_rate": 4.543385426912261e-05, - "loss": 1.6558, - "step": 7192 - }, - { - "epoch": 0.2614802613716582, - "grad_norm": 1.6228526830673218, - "learning_rate": 4.53853538072915e-05, - "loss": 1.6778, - "step": 7223 - }, - { - "epoch": 0.26260249425308163, - "grad_norm": 1.5660549402236938, - "learning_rate": 4.533662329951336e-05, - "loss": 1.6827, - "step": 7254 - }, - { - "epoch": 0.26372472713450507, - "grad_norm": 1.555421233177185, - "learning_rate": 4.528766329570536e-05, - "loss": 1.6755, - "step": 7285 - }, - { - "epoch": 0.26484696001592845, - "grad_norm": 1.603285312652588, - "learning_rate": 4.523847434837447e-05, - "loss": 1.6455, - "step": 7316 - }, - { - "epoch": 0.2659691928973519, - "grad_norm": 1.510772943496704, - "learning_rate": 4.518905701261128e-05, - "loss": 1.6736, - "step": 7347 - }, - { - "epoch": 0.2670914257787753, - "grad_norm": 1.6260360479354858, - "learning_rate": 4.5139411846083715e-05, - "loss": 1.6643, - "step": 7378 - }, - { - "epoch": 0.26821365866019875, - "grad_norm": 3.0237209796905518, - "learning_rate": 4.508953940903073e-05, - "loss": 1.6615, - "step": 7409 - }, - { - "epoch": 0.2693358915416222, - "grad_norm": 1.4725430011749268, - "learning_rate": 4.5039440264255994e-05, - "loss": 1.6582, - "step": 7440 - }, - { - "epoch": 0.2704581244230456, - "grad_norm": 1.5135307312011719, - "learning_rate": 4.498911497712155e-05, - "loss": 1.6754, - "step": 7471 - }, - { - "epoch": 0.271580357304469, - "grad_norm": 1.5741811990737915, - "learning_rate": 4.493856411554142e-05, - "loss": 1.6889, - "step": 7502 - }, - { - "epoch": 0.27270259018589244, - "grad_norm": 1.5469688177108765, - "learning_rate": 4.4887788249975206e-05, - "loss": 1.6542, - "step": 7533 - }, - { - "epoch": 0.27382482306731587, - "grad_norm": 1.4596927165985107, - "learning_rate": 4.4836787953421656e-05, - "loss": 1.6365, - "step": 7564 - }, - { - "epoch": 0.2749470559487393, - "grad_norm": 1.566522479057312, - "learning_rate": 4.478556380141218e-05, - "loss": 1.657, - "step": 7595 - }, - { - "epoch": 0.27606928883016274, - "grad_norm": 1.5141624212265015, - "learning_rate": 4.4734116372004375e-05, - "loss": 1.6695, - "step": 7626 - }, - { - "epoch": 0.2771915217115862, - "grad_norm": 1.4138630628585815, - "learning_rate": 4.4682446245775477e-05, - "loss": 1.6638, - "step": 7657 - }, - { - "epoch": 0.27831375459300955, - "grad_norm": 1.4885402917861938, - "learning_rate": 4.463055400581586e-05, - "loss": 1.6817, - "step": 7688 - }, - { - "epoch": 0.279435987474433, - "grad_norm": 1.645486831665039, - "learning_rate": 4.4578440237722374e-05, - "loss": 1.6392, - "step": 7719 - }, - { - "epoch": 0.2805582203558564, - "grad_norm": 1.5977535247802734, - "learning_rate": 4.452610552959183e-05, - "loss": 1.6557, - "step": 7750 - }, - { - "epoch": 0.28168045323727986, - "grad_norm": 1.6347745656967163, - "learning_rate": 4.447355047201428e-05, - "loss": 1.6573, - "step": 7781 - }, - { - "epoch": 0.2828026861187033, - "grad_norm": 1.5288081169128418, - "learning_rate": 4.4420775658066414e-05, - "loss": 1.638, - "step": 7812 - }, - { - "epoch": 0.28392491900012673, - "grad_norm": 1.4643625020980835, - "learning_rate": 4.436778168330484e-05, - "loss": 1.6402, - "step": 7843 - }, - { - "epoch": 0.2850471518815501, - "grad_norm": 1.568663239479065, - "learning_rate": 4.4314569145759353e-05, - "loss": 1.6565, - "step": 7874 - }, - { - "epoch": 0.28616938476297354, - "grad_norm": 1.476515293121338, - "learning_rate": 4.42611386459262e-05, - "loss": 1.6709, - "step": 7905 - }, - { - "epoch": 0.287291617644397, - "grad_norm": 1.532404899597168, - "learning_rate": 4.420749078676133e-05, - "loss": 1.6333, - "step": 7936 - }, - { - "epoch": 0.2884138505258204, - "grad_norm": 1.5388779640197754, - "learning_rate": 4.4153626173673516e-05, - "loss": 1.6494, - "step": 7967 - }, - { - "epoch": 0.28953608340724385, - "grad_norm": 1.5787324905395508, - "learning_rate": 4.409954541451762e-05, - "loss": 1.6362, - "step": 7998 - }, - { - "epoch": 0.2906583162886673, - "grad_norm": 1.4780092239379883, - "learning_rate": 4.404524911958764e-05, - "loss": 1.643, - "step": 8029 - }, - { - "epoch": 0.29178054917009066, - "grad_norm": 1.5434736013412476, - "learning_rate": 4.399073790160989e-05, - "loss": 1.6472, - "step": 8060 - }, - { - "epoch": 0.2929027820515141, - "grad_norm": 1.4898840188980103, - "learning_rate": 4.393601237573607e-05, - "loss": 1.6483, - "step": 8091 - }, - { - "epoch": 0.29402501493293753, - "grad_norm": 1.5529502630233765, - "learning_rate": 4.388107315953628e-05, - "loss": 1.6291, - "step": 8122 - }, - { - "epoch": 0.29514724781436097, - "grad_norm": 1.4831997156143188, - "learning_rate": 4.382592087299212e-05, - "loss": 1.6518, - "step": 8153 - }, - { - "epoch": 0.2962694806957844, - "grad_norm": 1.4568578004837036, - "learning_rate": 4.377055613848964e-05, - "loss": 1.6465, - "step": 8184 - }, - { - "epoch": 0.29739171357720784, - "grad_norm": 1.4941576719284058, - "learning_rate": 4.3714979580812355e-05, - "loss": 1.634, - "step": 8215 - }, - { - "epoch": 0.2985139464586312, - "grad_norm": 1.5891722440719604, - "learning_rate": 4.365919182713416e-05, - "loss": 1.6422, - "step": 8246 - }, - { - "epoch": 0.29963617934005465, - "grad_norm": 1.5435233116149902, - "learning_rate": 4.360319350701226e-05, - "loss": 1.6446, - "step": 8277 - }, - { - "epoch": 0.3007584122214781, - "grad_norm": 1.4754277467727661, - "learning_rate": 4.3546985252380115e-05, - "loss": 1.655, - "step": 8308 - }, - { - "epoch": 0.3018806451029015, - "grad_norm": 1.5463342666625977, - "learning_rate": 4.349056769754021e-05, - "loss": 1.6407, - "step": 8339 - }, - { - "epoch": 0.30300287798432496, - "grad_norm": 1.4847484827041626, - "learning_rate": 4.3433941479156994e-05, - "loss": 1.65, - "step": 8370 - }, - { - "epoch": 0.3041251108657484, - "grad_norm": 1.475669264793396, - "learning_rate": 4.3377107236249647e-05, - "loss": 1.6398, - "step": 8401 - }, - { - "epoch": 0.3052473437471718, - "grad_norm": 1.558566689491272, - "learning_rate": 4.332006561018488e-05, - "loss": 1.6501, - "step": 8432 - }, - { - "epoch": 0.3063695766285952, - "grad_norm": 1.5497310161590576, - "learning_rate": 4.3262817244669683e-05, - "loss": 1.6371, - "step": 8463 - }, - { - "epoch": 0.30749180951001864, - "grad_norm": 1.464553952217102, - "learning_rate": 4.3205362785744083e-05, - "loss": 1.6766, - "step": 8494 - }, - { - "epoch": 0.3086140423914421, - "grad_norm": 1.5198413133621216, - "learning_rate": 4.314770288177384e-05, - "loss": 1.633, - "step": 8525 - }, - { - "epoch": 0.3097362752728655, - "grad_norm": 1.5493290424346924, - "learning_rate": 4.308983818344313e-05, - "loss": 1.6465, - "step": 8556 - }, - { - "epoch": 0.31085850815428895, - "grad_norm": 1.4413405656814575, - "learning_rate": 4.3031769343747206e-05, - "loss": 1.6463, - "step": 8587 - }, - { - "epoch": 0.31198074103571233, - "grad_norm": 1.508507251739502, - "learning_rate": 4.297349701798505e-05, - "loss": 1.6262, - "step": 8618 - }, - { - "epoch": 0.31310297391713576, - "grad_norm": 1.560054063796997, - "learning_rate": 4.2915021863751916e-05, - "loss": 1.6484, - "step": 8649 - }, - { - "epoch": 0.3142252067985592, - "grad_norm": 1.495651125907898, - "learning_rate": 4.285634454093198e-05, - "loss": 1.6329, - "step": 8680 - }, - { - "epoch": 0.31534743967998263, - "grad_norm": 1.481740117073059, - "learning_rate": 4.279746571169086e-05, - "loss": 1.6274, - "step": 8711 - }, - { - "epoch": 0.31646967256140607, - "grad_norm": 1.53792142868042, - "learning_rate": 4.2738386040468136e-05, - "loss": 1.6252, - "step": 8742 - }, - { - "epoch": 0.31759190544282945, - "grad_norm": 1.4411643743515015, - "learning_rate": 4.2679106193969866e-05, - "loss": 1.6423, - "step": 8773 - }, - { - "epoch": 0.3187141383242529, - "grad_norm": 1.5158967971801758, - "learning_rate": 4.261962684116106e-05, - "loss": 1.6596, - "step": 8804 - }, - { - "epoch": 0.3198363712056763, - "grad_norm": 1.6026604175567627, - "learning_rate": 4.2559948653258145e-05, - "loss": 1.6399, - "step": 8835 - }, - { - "epoch": 0.32095860408709975, - "grad_norm": 1.4422760009765625, - "learning_rate": 4.250007230372134e-05, - "loss": 1.646, - "step": 8866 - }, - { - "epoch": 0.3220808369685232, - "grad_norm": 1.4450057744979858, - "learning_rate": 4.2439998468247126e-05, - "loss": 1.6311, - "step": 8897 - }, - { - "epoch": 0.3232030698499466, - "grad_norm": 1.432768702507019, - "learning_rate": 4.2379727824760566e-05, - "loss": 1.6234, - "step": 8928 - }, - { - "epoch": 0.32432530273137, - "grad_norm": 1.5206103324890137, - "learning_rate": 4.231926105340768e-05, - "loss": 1.6268, - "step": 8959 - }, - { - "epoch": 0.32544753561279344, - "grad_norm": 1.5703397989273071, - "learning_rate": 4.225859883654776e-05, - "loss": 1.6409, - "step": 8990 - }, - { - "epoch": 0.32656976849421687, - "grad_norm": 1.4549362659454346, - "learning_rate": 4.219774185874569e-05, - "loss": 1.6471, - "step": 9021 - }, - { - "epoch": 0.3276920013756403, - "grad_norm": 1.669263243675232, - "learning_rate": 4.213669080676418e-05, - "loss": 1.6355, - "step": 9052 - }, - { - "epoch": 0.32881423425706374, - "grad_norm": 1.4004725217819214, - "learning_rate": 4.2075446369556056e-05, - "loss": 1.6046, - "step": 9083 - }, - { - "epoch": 0.3299364671384872, - "grad_norm": 1.4844101667404175, - "learning_rate": 4.201400923825648e-05, - "loss": 1.6357, - "step": 9114 - }, - { - "epoch": 0.33105870001991056, - "grad_norm": 1.5377836227416992, - "learning_rate": 4.195238010617511e-05, - "loss": 1.6425, - "step": 9145 - }, - { - "epoch": 0.332180932901334, - "grad_norm": 1.4880887269973755, - "learning_rate": 4.1890559668788344e-05, - "loss": 1.6368, - "step": 9176 - }, - { - "epoch": 0.3333031657827574, - "grad_norm": 1.5786559581756592, - "learning_rate": 4.1828548623731405e-05, - "loss": 1.6327, - "step": 9207 - }, - { - "epoch": 0.33442539866418086, - "grad_norm": 1.4619288444519043, - "learning_rate": 4.1766347670790506e-05, - "loss": 1.6431, - "step": 9238 - }, - { - "epoch": 0.3355476315456043, - "grad_norm": 1.4946295022964478, - "learning_rate": 4.170395751189495e-05, - "loss": 1.6265, - "step": 9269 - }, - { - "epoch": 0.33666986442702773, - "grad_norm": 1.4698960781097412, - "learning_rate": 4.164137885110921e-05, - "loss": 1.6356, - "step": 9300 - }, - { - "epoch": 0.3377920973084511, - "grad_norm": 1.4136701822280884, - "learning_rate": 4.157861239462495e-05, - "loss": 1.606, - "step": 9331 - }, - { - "epoch": 0.33891433018987455, - "grad_norm": 1.5250601768493652, - "learning_rate": 4.1515658850753114e-05, - "loss": 1.6266, - "step": 9362 - }, - { - "epoch": 0.340036563071298, - "grad_norm": 1.5827070474624634, - "learning_rate": 4.145251892991588e-05, - "loss": 1.618, - "step": 9393 - }, - { - "epoch": 0.3411587959527214, - "grad_norm": 1.4887738227844238, - "learning_rate": 4.138919334463868e-05, - "loss": 1.6196, - "step": 9424 - }, - { - "epoch": 0.34228102883414485, - "grad_norm": 1.5627696514129639, - "learning_rate": 4.1325682809542124e-05, - "loss": 1.6155, - "step": 9455 - }, - { - "epoch": 0.3434032617155683, - "grad_norm": 1.4552607536315918, - "learning_rate": 4.126198804133398e-05, - "loss": 1.6272, - "step": 9486 - }, - { - "epoch": 0.34452549459699167, - "grad_norm": 1.5104546546936035, - "learning_rate": 4.1198109758801055e-05, - "loss": 1.6245, - "step": 9517 - }, - { - "epoch": 0.3456477274784151, - "grad_norm": 1.4588383436203003, - "learning_rate": 4.113404868280107e-05, - "loss": 1.6285, - "step": 9548 - }, - { - "epoch": 0.34676996035983854, - "grad_norm": 1.40166437625885, - "learning_rate": 4.106980553625457e-05, - "loss": 1.6181, - "step": 9579 - }, - { - "epoch": 0.34789219324126197, - "grad_norm": 1.4949356317520142, - "learning_rate": 4.100538104413674e-05, - "loss": 1.6148, - "step": 9610 - }, - { - "epoch": 0.3490144261226854, - "grad_norm": 1.4863393306732178, - "learning_rate": 4.09407759334692e-05, - "loss": 1.6218, - "step": 9641 - }, - { - "epoch": 0.35013665900410884, - "grad_norm": 1.4831593036651611, - "learning_rate": 4.087599093331186e-05, - "loss": 1.6201, - "step": 9672 - }, - { - "epoch": 0.3512588918855322, - "grad_norm": 1.487328052520752, - "learning_rate": 4.081102677475462e-05, - "loss": 1.6203, - "step": 9703 - }, - { - "epoch": 0.35238112476695566, - "grad_norm": 1.560600996017456, - "learning_rate": 4.0745884190909194e-05, - "loss": 1.6099, - "step": 9734 - }, - { - "epoch": 0.3535033576483791, - "grad_norm": 1.45511794090271, - "learning_rate": 4.0680563916900796e-05, - "loss": 1.6494, - "step": 9765 - }, - { - "epoch": 0.3546255905298025, - "grad_norm": 1.4966280460357666, - "learning_rate": 4.0615066689859815e-05, - "loss": 1.6157, - "step": 9796 - }, - { - "epoch": 0.35574782341122596, - "grad_norm": 1.4888532161712646, - "learning_rate": 4.0549393248913584e-05, - "loss": 1.6203, - "step": 9827 - }, - { - "epoch": 0.3568700562926494, - "grad_norm": 1.5495861768722534, - "learning_rate": 4.048354433517794e-05, - "loss": 1.6131, - "step": 9858 - }, - { - "epoch": 0.3579922891740728, - "grad_norm": 1.4991432428359985, - "learning_rate": 4.0417520691748916e-05, - "loss": 1.6371, - "step": 9889 - }, - { - "epoch": 0.3591145220554962, - "grad_norm": 1.5163663625717163, - "learning_rate": 4.035132306369438e-05, - "loss": 1.5911, - "step": 9920 - }, - { - "epoch": 0.36023675493691965, - "grad_norm": 1.439622402191162, - "learning_rate": 4.028495219804555e-05, - "loss": 1.6218, - "step": 9951 - }, - { - "epoch": 0.3613589878183431, - "grad_norm": 1.4068893194198608, - "learning_rate": 4.021840884378864e-05, - "loss": 1.6284, - "step": 9982 - }, - { - "epoch": 0.3624812206997665, - "grad_norm": 1.4577332735061646, - "learning_rate": 4.015169375185633e-05, - "loss": 1.6104, - "step": 10013 - }, - { - "epoch": 0.36360345358118995, - "grad_norm": 1.448833703994751, - "learning_rate": 4.0084807675119396e-05, - "loss": 1.6299, - "step": 10044 - }, - { - "epoch": 0.36472568646261333, - "grad_norm": 1.440450668334961, - "learning_rate": 4.0017751368378106e-05, - "loss": 1.6255, - "step": 10075 - }, - { - "epoch": 0.36584791934403676, - "grad_norm": 1.3380858898162842, - "learning_rate": 3.995052558835377e-05, - "loss": 1.6162, - "step": 10106 - }, - { - "epoch": 0.3669701522254602, - "grad_norm": 1.4549713134765625, - "learning_rate": 3.988313109368017e-05, - "loss": 1.6181, - "step": 10137 - }, - { - "epoch": 0.36809238510688363, - "grad_norm": 1.4933863878250122, - "learning_rate": 3.981556864489504e-05, - "loss": 1.634, - "step": 10168 - }, - { - "epoch": 0.36921461798830707, - "grad_norm": 1.5157703161239624, - "learning_rate": 3.974783900443142e-05, - "loss": 1.6258, - "step": 10199 - }, - { - "epoch": 0.3703368508697305, - "grad_norm": 1.464006781578064, - "learning_rate": 3.9679942936609095e-05, - "loss": 1.6235, - "step": 10230 - }, - { - "epoch": 0.3714590837511539, - "grad_norm": 1.3768154382705688, - "learning_rate": 3.961188120762596e-05, - "loss": 1.6044, - "step": 10261 - }, - { - "epoch": 0.3725813166325773, - "grad_norm": 1.4427024126052856, - "learning_rate": 3.954365458554938e-05, - "loss": 1.6403, - "step": 10292 - }, - { - "epoch": 0.37370354951400075, - "grad_norm": 1.3831264972686768, - "learning_rate": 3.947526384030751e-05, - "loss": 1.6136, - "step": 10323 - }, - { - "epoch": 0.3748257823954242, - "grad_norm": 1.4275633096694946, - "learning_rate": 3.9406709743680624e-05, - "loss": 1.6167, - "step": 10354 - }, - { - "epoch": 0.3759480152768476, - "grad_norm": 1.4378384351730347, - "learning_rate": 3.9337993069292366e-05, - "loss": 1.6231, - "step": 10385 - }, - { - "epoch": 0.37707024815827106, - "grad_norm": 1.3743884563446045, - "learning_rate": 3.926911459260109e-05, - "loss": 1.6171, - "step": 10416 - }, - { - "epoch": 0.37819248103969444, - "grad_norm": 1.496160864830017, - "learning_rate": 3.920007509089102e-05, - "loss": 1.6234, - "step": 10447 - }, - { - "epoch": 0.3793147139211179, - "grad_norm": 1.4610028266906738, - "learning_rate": 3.913087534326357e-05, - "loss": 1.5963, - "step": 10478 - }, - { - "epoch": 0.3804369468025413, - "grad_norm": 1.483314037322998, - "learning_rate": 3.9061516130628475e-05, - "loss": 1.6021, - "step": 10509 - }, - { - "epoch": 0.38155917968396474, - "grad_norm": 1.4944846630096436, - "learning_rate": 3.8991998235695025e-05, - "loss": 1.5833, - "step": 10540 - }, - { - "epoch": 0.3826814125653882, - "grad_norm": 1.3831861019134521, - "learning_rate": 3.8922322442963224e-05, - "loss": 1.624, - "step": 10571 - }, - { - "epoch": 0.3838036454468116, - "grad_norm": 1.4178634881973267, - "learning_rate": 3.885248953871491e-05, - "loss": 1.6188, - "step": 10602 - }, - { - "epoch": 0.384925878328235, - "grad_norm": 1.4889320135116577, - "learning_rate": 3.8782500311004915e-05, - "loss": 1.608, - "step": 10633 - }, - { - "epoch": 0.38604811120965843, - "grad_norm": 1.3335620164871216, - "learning_rate": 3.871235554965218e-05, - "loss": 1.6182, - "step": 10664 - }, - { - "epoch": 0.38717034409108186, - "grad_norm": 1.4620449542999268, - "learning_rate": 3.864205604623078e-05, - "loss": 1.5848, - "step": 10695 - }, - { - "epoch": 0.3882925769725053, - "grad_norm": 1.3857917785644531, - "learning_rate": 3.857160259406107e-05, - "loss": 1.6048, - "step": 10726 - }, - { - "epoch": 0.38941480985392873, - "grad_norm": 1.4226957559585571, - "learning_rate": 3.8500995988200674e-05, - "loss": 1.6052, - "step": 10757 - }, - { - "epoch": 0.39053704273535217, - "grad_norm": 1.478182077407837, - "learning_rate": 3.843023702543556e-05, - "loss": 1.6268, - "step": 10788 - }, - { - "epoch": 0.39165927561677555, - "grad_norm": 1.431401014328003, - "learning_rate": 3.8359326504270984e-05, - "loss": 1.6176, - "step": 10819 - }, - { - "epoch": 0.392781508498199, - "grad_norm": 1.339880108833313, - "learning_rate": 3.828826522492255e-05, - "loss": 1.5902, - "step": 10850 - }, - { - "epoch": 0.3939037413796224, - "grad_norm": 1.4537174701690674, - "learning_rate": 3.821705398930713e-05, - "loss": 1.6107, - "step": 10881 - }, - { - "epoch": 0.39502597426104585, - "grad_norm": 1.3559256792068481, - "learning_rate": 3.814569360103385e-05, - "loss": 1.5879, - "step": 10912 - }, - { - "epoch": 0.3961482071424693, - "grad_norm": 1.3561891317367554, - "learning_rate": 3.807418486539499e-05, - "loss": 1.6162, - "step": 10943 - }, - { - "epoch": 0.3972704400238927, - "grad_norm": 1.471112847328186, - "learning_rate": 3.80025285893569e-05, - "loss": 1.5968, - "step": 10974 - }, - { - "epoch": 0.3983926729053161, - "grad_norm": 1.3438925743103027, - "learning_rate": 3.793072558155093e-05, - "loss": 1.5876, - "step": 11005 - }, - { - "epoch": 0.39951490578673954, - "grad_norm": 1.4102482795715332, - "learning_rate": 3.785877665226426e-05, - "loss": 1.5886, - "step": 11036 - }, - { - "epoch": 0.400637138668163, - "grad_norm": 1.4435259103775024, - "learning_rate": 3.778668261343079e-05, - "loss": 1.5999, - "step": 11067 - }, - { - "epoch": 0.4017593715495864, - "grad_norm": 1.4556541442871094, - "learning_rate": 3.771444427862192e-05, - "loss": 1.6185, - "step": 11098 - }, - { - "epoch": 0.40288160443100984, - "grad_norm": 1.370553970336914, - "learning_rate": 3.7642062463037465e-05, - "loss": 1.6005, - "step": 11129 - }, - { - "epoch": 0.4040038373124333, - "grad_norm": 1.368855595588684, - "learning_rate": 3.7569537983496373e-05, - "loss": 1.6024, - "step": 11160 - }, - { - "epoch": 0.40512607019385666, - "grad_norm": 1.4200265407562256, - "learning_rate": 3.749687165842753e-05, - "loss": 1.6082, - "step": 11191 - }, - { - "epoch": 0.4062483030752801, - "grad_norm": 1.4704499244689941, - "learning_rate": 3.7424064307860536e-05, - "loss": 1.6227, - "step": 11222 - }, - { - "epoch": 0.40737053595670353, - "grad_norm": 1.3868876695632935, - "learning_rate": 3.735111675341645e-05, - "loss": 1.6008, - "step": 11253 - }, - { - "epoch": 0.40849276883812696, - "grad_norm": 1.473650574684143, - "learning_rate": 3.7278029818298524e-05, - "loss": 1.5825, - "step": 11284 - }, - { - "epoch": 0.4096150017195504, - "grad_norm": 1.412559986114502, - "learning_rate": 3.720480432728287e-05, - "loss": 1.5971, - "step": 11315 - }, - { - "epoch": 0.41073723460097383, - "grad_norm": 1.4288370609283447, - "learning_rate": 3.71314411067092e-05, - "loss": 1.6079, - "step": 11346 - }, - { - "epoch": 0.4118594674823972, - "grad_norm": 1.4781348705291748, - "learning_rate": 3.70579409844715e-05, - "loss": 1.5904, - "step": 11377 - }, - { - "epoch": 0.41298170036382065, - "grad_norm": 1.377030611038208, - "learning_rate": 3.698430479000865e-05, - "loss": 1.5804, - "step": 11408 - }, - { - "epoch": 0.4141039332452441, - "grad_norm": 1.4176589250564575, - "learning_rate": 3.691053335429509e-05, - "loss": 1.6046, - "step": 11439 - }, - { - "epoch": 0.4152261661266675, - "grad_norm": 1.4933243989944458, - "learning_rate": 3.683662750983147e-05, - "loss": 1.6018, - "step": 11470 - }, - { - "epoch": 0.41634839900809095, - "grad_norm": 1.4382365942001343, - "learning_rate": 3.676258809063518e-05, - "loss": 1.5962, - "step": 11501 - }, - { - "epoch": 0.4174706318895144, - "grad_norm": 1.468005657196045, - "learning_rate": 3.6688415932231004e-05, - "loss": 1.6044, - "step": 11532 - }, - { - "epoch": 0.41859286477093777, - "grad_norm": 1.4858007431030273, - "learning_rate": 3.661411187164166e-05, - "loss": 1.5973, - "step": 11563 - }, - { - "epoch": 0.4197150976523612, - "grad_norm": 1.457524061203003, - "learning_rate": 3.65396767473784e-05, - "loss": 1.5872, - "step": 11594 - }, - { - "epoch": 0.42083733053378464, - "grad_norm": 1.4685806035995483, - "learning_rate": 3.6465111399431465e-05, - "loss": 1.6072, - "step": 11625 - }, - { - "epoch": 0.42195956341520807, - "grad_norm": 1.4355812072753906, - "learning_rate": 3.6390416669260674e-05, - "loss": 1.6005, - "step": 11656 - }, - { - "epoch": 0.4230817962966315, - "grad_norm": 1.4105843305587769, - "learning_rate": 3.63155933997859e-05, - "loss": 1.5999, - "step": 11687 - }, - { - "epoch": 0.42420402917805494, - "grad_norm": 1.4515639543533325, - "learning_rate": 3.624064243537758e-05, - "loss": 1.5903, - "step": 11718 - }, - { - "epoch": 0.4253262620594783, - "grad_norm": 1.4507205486297607, - "learning_rate": 3.616556462184716e-05, - "loss": 1.6004, - "step": 11749 - }, - { - "epoch": 0.42644849494090176, - "grad_norm": 1.3846348524093628, - "learning_rate": 3.609036080643755e-05, - "loss": 1.5878, - "step": 11780 - }, - { - "epoch": 0.4275707278223252, - "grad_norm": 1.4062190055847168, - "learning_rate": 3.60150318378136e-05, - "loss": 1.6049, - "step": 11811 - }, - { - "epoch": 0.4286929607037486, - "grad_norm": 1.5231355428695679, - "learning_rate": 3.5939578566052465e-05, - "loss": 1.5972, - "step": 11842 - }, - { - "epoch": 0.42981519358517206, - "grad_norm": 1.4500449895858765, - "learning_rate": 3.586400184263408e-05, - "loss": 1.5918, - "step": 11873 - }, - { - "epoch": 0.4309374264665955, - "grad_norm": 1.415440559387207, - "learning_rate": 3.578830252043148e-05, - "loss": 1.6111, - "step": 11904 - }, - { - "epoch": 0.4320596593480189, - "grad_norm": 1.3857108354568481, - "learning_rate": 3.571248145370125e-05, - "loss": 1.5882, - "step": 11935 - }, - { - "epoch": 0.4331818922294423, - "grad_norm": 1.442830204963684, - "learning_rate": 3.5636539498073794e-05, - "loss": 1.587, - "step": 11966 - }, - { - "epoch": 0.43430412511086575, - "grad_norm": 1.3706488609313965, - "learning_rate": 3.556047751054378e-05, - "loss": 1.5942, - "step": 11997 - }, - { - "epoch": 0.4354263579922892, - "grad_norm": 1.450567364692688, - "learning_rate": 3.548429634946039e-05, - "loss": 1.6011, - "step": 12028 - }, - { - "epoch": 0.4365485908737126, - "grad_norm": 1.4172272682189941, - "learning_rate": 3.540799687451768e-05, - "loss": 1.5726, - "step": 12059 - }, - { - "epoch": 0.43767082375513605, - "grad_norm": 1.4156157970428467, - "learning_rate": 3.533157994674485e-05, - "loss": 1.5848, - "step": 12090 - }, - { - "epoch": 0.43879305663655943, - "grad_norm": 1.3843419551849365, - "learning_rate": 3.5255046428496546e-05, - "loss": 1.5893, - "step": 12121 - }, - { - "epoch": 0.43991528951798287, - "grad_norm": 1.43569815158844, - "learning_rate": 3.517839718344311e-05, - "loss": 1.5922, - "step": 12152 - }, - { - "epoch": 0.4410375223994063, - "grad_norm": 1.4200314283370972, - "learning_rate": 3.510163307656086e-05, - "loss": 1.6047, - "step": 12183 - }, - { - "epoch": 0.44215975528082974, - "grad_norm": 1.4956674575805664, - "learning_rate": 3.5024754974122324e-05, - "loss": 1.5802, - "step": 12214 - }, - { - "epoch": 0.44328198816225317, - "grad_norm": 1.4289231300354004, - "learning_rate": 3.494776374368643e-05, - "loss": 1.6193, - "step": 12245 - }, - { - "epoch": 0.4444042210436766, - "grad_norm": 1.389282464981079, - "learning_rate": 3.4870660254088724e-05, - "loss": 1.5977, - "step": 12276 - }, - { - "epoch": 0.4455264539251, - "grad_norm": 1.4207974672317505, - "learning_rate": 3.479344537543164e-05, - "loss": 1.5789, - "step": 12307 - }, - { - "epoch": 0.4466486868065234, - "grad_norm": 1.355353832244873, - "learning_rate": 3.4716119979074565e-05, - "loss": 1.5889, - "step": 12338 - }, - { - "epoch": 0.44777091968794686, - "grad_norm": 1.3336408138275146, - "learning_rate": 3.463868493762412e-05, - "loss": 1.5865, - "step": 12369 - }, - { - "epoch": 0.4488931525693703, - "grad_norm": 1.5265244245529175, - "learning_rate": 3.456114112492418e-05, - "loss": 1.5993, - "step": 12400 - }, - { - "epoch": 0.4500153854507937, - "grad_norm": 1.4629555940628052, - "learning_rate": 3.4483489416046164e-05, - "loss": 1.5982, - "step": 12431 - }, - { - "epoch": 0.45113761833221716, - "grad_norm": 1.43988835811615, - "learning_rate": 3.440573068727905e-05, - "loss": 1.5816, - "step": 12462 - }, - { - "epoch": 0.45225985121364054, - "grad_norm": 1.4607633352279663, - "learning_rate": 3.4327865816119495e-05, - "loss": 1.571, - "step": 12493 - }, - { - "epoch": 0.453382084095064, - "grad_norm": 1.3664649724960327, - "learning_rate": 3.4249895681262025e-05, - "loss": 1.5736, - "step": 12524 - }, - { - "epoch": 0.4545043169764874, - "grad_norm": 1.436094880104065, - "learning_rate": 3.417182116258899e-05, - "loss": 1.5829, - "step": 12555 - }, - { - "epoch": 0.45562654985791085, - "grad_norm": 1.3681309223175049, - "learning_rate": 3.409364314116074e-05, - "loss": 1.5938, - "step": 12586 - }, - { - "epoch": 0.4567487827393343, - "grad_norm": 1.3929277658462524, - "learning_rate": 3.401536249920559e-05, - "loss": 1.572, - "step": 12617 - }, - { - "epoch": 0.4578710156207577, - "grad_norm": 1.3980777263641357, - "learning_rate": 3.393698012010998e-05, - "loss": 1.5941, - "step": 12648 - }, - { - "epoch": 0.4589932485021811, - "grad_norm": 1.4055850505828857, - "learning_rate": 3.385849688840839e-05, - "loss": 1.5818, - "step": 12679 - }, - { - "epoch": 0.46011548138360453, - "grad_norm": 1.3678046464920044, - "learning_rate": 3.3779913689773414e-05, - "loss": 1.5759, - "step": 12710 - }, - { - "epoch": 0.46123771426502796, - "grad_norm": 1.468201994895935, - "learning_rate": 3.370123141100578e-05, - "loss": 1.5792, - "step": 12741 - }, - { - "epoch": 0.4623599471464514, - "grad_norm": 1.346614122390747, - "learning_rate": 3.3622450940024305e-05, - "loss": 1.5983, - "step": 12772 - }, - { - "epoch": 0.46348218002787483, - "grad_norm": 1.3895704746246338, - "learning_rate": 3.35435731658559e-05, - "loss": 1.5809, - "step": 12803 - }, - { - "epoch": 0.46460441290929827, - "grad_norm": 1.3664804697036743, - "learning_rate": 3.346459897862552e-05, - "loss": 1.5788, - "step": 12834 - }, - { - "epoch": 0.46572664579072165, - "grad_norm": 1.4561264514923096, - "learning_rate": 3.338552926954613e-05, - "loss": 1.5867, - "step": 12865 - }, - { - "epoch": 0.4668488786721451, - "grad_norm": 1.3407316207885742, - "learning_rate": 3.330636493090868e-05, - "loss": 1.5729, - "step": 12896 - }, - { - "epoch": 0.4679711115535685, - "grad_norm": 1.3465179204940796, - "learning_rate": 3.322710685607193e-05, - "loss": 1.5915, - "step": 12927 - }, - { - "epoch": 0.46909334443499195, - "grad_norm": 1.553585171699524, - "learning_rate": 3.314775593945251e-05, - "loss": 1.5875, - "step": 12958 - }, - { - "epoch": 0.4702155773164154, - "grad_norm": 1.3964170217514038, - "learning_rate": 3.3068313076514714e-05, - "loss": 1.5783, - "step": 12989 - }, - { - "epoch": 0.47133781019783877, - "grad_norm": 1.3884953260421753, - "learning_rate": 3.298877916376047e-05, - "loss": 1.5577, - "step": 13020 - }, - { - "epoch": 0.4724600430792622, - "grad_norm": 1.3421337604522705, - "learning_rate": 3.290915509871915e-05, - "loss": 1.5791, - "step": 13051 - }, - { - "epoch": 0.47358227596068564, - "grad_norm": 1.297429084777832, - "learning_rate": 3.282944177993753e-05, - "loss": 1.5699, - "step": 13082 - }, - { - "epoch": 0.4747045088421091, - "grad_norm": 1.3672280311584473, - "learning_rate": 3.274964010696957e-05, - "loss": 1.5711, - "step": 13113 - }, - { - "epoch": 0.4758267417235325, - "grad_norm": 1.4202091693878174, - "learning_rate": 3.266975098036629e-05, - "loss": 1.5679, - "step": 13144 - }, - { - "epoch": 0.47694897460495594, - "grad_norm": 1.383973479270935, - "learning_rate": 3.258977530166562e-05, - "loss": 1.6019, - "step": 13175 - }, - { - "epoch": 0.4780712074863793, - "grad_norm": 1.3134119510650635, - "learning_rate": 3.250971397338227e-05, - "loss": 1.5721, - "step": 13206 - }, - { - "epoch": 0.47919344036780276, - "grad_norm": 1.3229272365570068, - "learning_rate": 3.2429567898997404e-05, - "loss": 1.5812, - "step": 13237 - }, - { - "epoch": 0.4803156732492262, - "grad_norm": 1.2991341352462769, - "learning_rate": 3.234933798294859e-05, - "loss": 1.5793, - "step": 13268 - }, - { - "epoch": 0.48143790613064963, - "grad_norm": 1.384522795677185, - "learning_rate": 3.2269025130619535e-05, - "loss": 1.5592, - "step": 13299 - }, - { - "epoch": 0.48256013901207306, - "grad_norm": 1.3743617534637451, - "learning_rate": 3.218863024832985e-05, - "loss": 1.5785, - "step": 13330 - }, - { - "epoch": 0.4836823718934965, - "grad_norm": 1.4512649774551392, - "learning_rate": 3.2108154243324864e-05, - "loss": 1.5703, - "step": 13361 - }, - { - "epoch": 0.4848046047749199, - "grad_norm": 1.2982932329177856, - "learning_rate": 3.2027598023765345e-05, - "loss": 1.5609, - "step": 13392 - }, - { - "epoch": 0.4859268376563433, - "grad_norm": 1.3747495412826538, - "learning_rate": 3.194696249871729e-05, - "loss": 1.5766, - "step": 13423 - }, - { - "epoch": 0.48704907053776675, - "grad_norm": 1.3155137300491333, - "learning_rate": 3.186624857814164e-05, - "loss": 1.57, - "step": 13454 - }, - { - "epoch": 0.4881713034191902, - "grad_norm": 1.4094924926757812, - "learning_rate": 3.178545717288401e-05, - "loss": 1.5855, - "step": 13485 - }, - { - "epoch": 0.4892935363006136, - "grad_norm": 1.3931294679641724, - "learning_rate": 3.170458919466444e-05, - "loss": 1.5486, - "step": 13516 - }, - { - "epoch": 0.49041576918203705, - "grad_norm": 1.48263418674469, - "learning_rate": 3.1623645556067063e-05, - "loss": 1.5829, - "step": 13547 - }, - { - "epoch": 0.49153800206346043, - "grad_norm": 1.3016873598098755, - "learning_rate": 3.154262717052985e-05, - "loss": 1.5808, - "step": 13578 - }, - { - "epoch": 0.49266023494488387, - "grad_norm": 1.623724102973938, - "learning_rate": 3.146153495233426e-05, - "loss": 1.5582, - "step": 13609 - }, - { - "epoch": 0.4937824678263073, - "grad_norm": 1.3603851795196533, - "learning_rate": 3.1380369816594944e-05, - "loss": 1.5703, - "step": 13640 - }, - { - "epoch": 0.49490470070773074, - "grad_norm": 1.4793063402175903, - "learning_rate": 3.129913267924946e-05, - "loss": 1.5739, - "step": 13671 - }, - { - "epoch": 0.4960269335891542, - "grad_norm": 1.4615710973739624, - "learning_rate": 3.121782445704782e-05, - "loss": 1.5846, - "step": 13702 - }, - { - "epoch": 0.4971491664705776, - "grad_norm": 1.419823408126831, - "learning_rate": 3.11364460675423e-05, - "loss": 1.5702, - "step": 13733 - }, - { - "epoch": 0.498271399352001, - "grad_norm": 1.429337501525879, - "learning_rate": 3.1054998429076934e-05, - "loss": 1.5825, - "step": 13764 - }, - { - "epoch": 0.4993936322334244, - "grad_norm": 1.3171850442886353, - "learning_rate": 3.097348246077728e-05, - "loss": 1.5721, - "step": 13795 - }, - { - "epoch": 0.5005158651148479, - "grad_norm": 1.487111210823059, - "learning_rate": 3.0891899082539924e-05, - "loss": 1.5879, - "step": 13826 - }, - { - "epoch": 0.5016380979962712, - "grad_norm": 1.4311749935150146, - "learning_rate": 3.0810249215022233e-05, - "loss": 1.5843, - "step": 13857 - }, - { - "epoch": 0.5027603308776947, - "grad_norm": 1.468863844871521, - "learning_rate": 3.0728533779631865e-05, - "loss": 1.5884, - "step": 13888 - }, - { - "epoch": 0.5038825637591181, - "grad_norm": 1.3970764875411987, - "learning_rate": 3.064675369851637e-05, - "loss": 1.5769, - "step": 13919 - }, - { - "epoch": 0.5050047966405415, - "grad_norm": 1.3623278141021729, - "learning_rate": 3.056490989455289e-05, - "loss": 1.5706, - "step": 13950 - }, - { - "epoch": 0.506127029521965, - "grad_norm": 1.3077219724655151, - "learning_rate": 3.0483003291337596e-05, - "loss": 1.5761, - "step": 13981 - }, - { - "epoch": 0.5072492624033884, - "grad_norm": 1.3295941352844238, - "learning_rate": 3.040103481317539e-05, - "loss": 1.5776, - "step": 14012 - }, - { - "epoch": 0.5083714952848118, - "grad_norm": 1.3900631666183472, - "learning_rate": 3.03190053850694e-05, - "loss": 1.5777, - "step": 14043 - }, - { - "epoch": 0.5094937281662353, - "grad_norm": 1.3359615802764893, - "learning_rate": 3.0236915932710573e-05, - "loss": 1.5569, - "step": 14074 - }, - { - "epoch": 0.5106159610476587, - "grad_norm": 1.2790296077728271, - "learning_rate": 3.0154767382467232e-05, - "loss": 1.5598, - "step": 14105 - }, - { - "epoch": 0.5117381939290822, - "grad_norm": 1.5767478942871094, - "learning_rate": 3.0072560661374582e-05, - "loss": 1.5483, - "step": 14136 - }, - { - "epoch": 0.5128604268105056, - "grad_norm": 1.343381404876709, - "learning_rate": 2.999029669712431e-05, - "loss": 1.5689, - "step": 14167 - }, - { - "epoch": 0.513982659691929, - "grad_norm": 1.4147651195526123, - "learning_rate": 2.990797641805408e-05, - "loss": 1.5643, - "step": 14198 - }, - { - "epoch": 0.5151048925733523, - "grad_norm": 1.3360931873321533, - "learning_rate": 2.982560075313704e-05, - "loss": 1.5689, - "step": 14229 - }, - { - "epoch": 0.5162271254547758, - "grad_norm": 1.458016037940979, - "learning_rate": 2.9743170631971368e-05, - "loss": 1.5633, - "step": 14260 - }, - { - "epoch": 0.5173493583361992, - "grad_norm": 1.430955171585083, - "learning_rate": 2.9660686984769792e-05, - "loss": 1.5559, - "step": 14291 - }, - { - "epoch": 0.5184715912176227, - "grad_norm": 1.3806464672088623, - "learning_rate": 2.9578150742349047e-05, - "loss": 1.577, - "step": 14322 - }, - { - "epoch": 0.5195938240990461, - "grad_norm": 1.359813928604126, - "learning_rate": 2.949556283611942e-05, - "loss": 1.5485, - "step": 14353 - }, - { - "epoch": 0.5207160569804695, - "grad_norm": 1.4222601652145386, - "learning_rate": 2.9412924198074206e-05, - "loss": 1.575, - "step": 14384 - }, - { - "epoch": 0.521838289861893, - "grad_norm": 1.3186180591583252, - "learning_rate": 2.9330235760779208e-05, - "loss": 1.5744, - "step": 14415 - }, - { - "epoch": 0.5229605227433164, - "grad_norm": 1.3309999704360962, - "learning_rate": 2.9247498457362188e-05, - "loss": 1.5664, - "step": 14446 - }, - { - "epoch": 0.5240827556247398, - "grad_norm": 1.368514060974121, - "learning_rate": 2.9164713221502373e-05, - "loss": 1.56, - "step": 14477 - }, - { - "epoch": 0.5252049885061633, - "grad_norm": 1.3132268190383911, - "learning_rate": 2.9081880987419912e-05, - "loss": 1.563, - "step": 14508 - }, - { - "epoch": 0.5263272213875867, - "grad_norm": 1.431347131729126, - "learning_rate": 2.8999002689865296e-05, - "loss": 1.5612, - "step": 14539 - }, - { - "epoch": 0.5274494542690101, - "grad_norm": 1.303941249847412, - "learning_rate": 2.8916079264108852e-05, - "loss": 1.5601, - "step": 14570 - }, - { - "epoch": 0.5285716871504335, - "grad_norm": 1.4077236652374268, - "learning_rate": 2.883311164593017e-05, - "loss": 1.5516, - "step": 14601 - }, - { - "epoch": 0.5296939200318569, - "grad_norm": 1.3132708072662354, - "learning_rate": 2.875010077160754e-05, - "loss": 1.5538, - "step": 14632 - }, - { - "epoch": 0.5308161529132803, - "grad_norm": 1.2660679817199707, - "learning_rate": 2.866704757790741e-05, - "loss": 1.5652, - "step": 14663 - }, - { - "epoch": 0.5319383857947038, - "grad_norm": 1.4541290998458862, - "learning_rate": 2.858395300207376e-05, - "loss": 1.5602, - "step": 14694 - }, - { - "epoch": 0.5330606186761272, - "grad_norm": 1.3694487810134888, - "learning_rate": 2.8500817981817607e-05, - "loss": 1.5483, - "step": 14725 - }, - { - "epoch": 0.5341828515575506, - "grad_norm": 1.3493553400039673, - "learning_rate": 2.8417643455306336e-05, - "loss": 1.5539, - "step": 14756 - }, - { - "epoch": 0.5353050844389741, - "grad_norm": 1.4280232191085815, - "learning_rate": 2.8334430361153185e-05, - "loss": 1.5672, - "step": 14787 - }, - { - "epoch": 0.5364273173203975, - "grad_norm": 1.3430079221725464, - "learning_rate": 2.8251179638406612e-05, - "loss": 1.5474, - "step": 14818 - }, - { - "epoch": 0.5375495502018209, - "grad_norm": 1.3380746841430664, - "learning_rate": 2.8167892226539704e-05, - "loss": 1.5508, - "step": 14849 - }, - { - "epoch": 0.5386717830832444, - "grad_norm": 1.3501845598220825, - "learning_rate": 2.8084569065439588e-05, - "loss": 1.5656, - "step": 14880 - }, - { - "epoch": 0.5397940159646678, - "grad_norm": 1.3564043045043945, - "learning_rate": 2.8001211095396807e-05, - "loss": 1.5726, - "step": 14911 - }, - { - "epoch": 0.5409162488460912, - "grad_norm": 1.3949267864227295, - "learning_rate": 2.791781925709473e-05, - "loss": 1.5635, - "step": 14942 - }, - { - "epoch": 0.5420384817275146, - "grad_norm": 1.4317481517791748, - "learning_rate": 2.7834394491598908e-05, - "loss": 1.5447, - "step": 14973 - }, - { - "epoch": 0.543160714608938, - "grad_norm": 1.396610140800476, - "learning_rate": 2.7750937740346485e-05, - "loss": 1.557, - "step": 15004 - }, - { - "epoch": 0.5442829474903614, - "grad_norm": 1.369884967803955, - "learning_rate": 2.7667449945135564e-05, - "loss": 1.5672, - "step": 15035 - }, - { - "epoch": 0.5454051803717849, - "grad_norm": 1.4686237573623657, - "learning_rate": 2.7583932048114557e-05, - "loss": 1.572, - "step": 15066 - }, - { - "epoch": 0.5465274132532083, - "grad_norm": 1.524717926979065, - "learning_rate": 2.7500384991771587e-05, - "loss": 1.5537, - "step": 15097 - }, - { - "epoch": 0.5476496461346317, - "grad_norm": 1.3461147546768188, - "learning_rate": 2.7416809718923825e-05, - "loss": 1.5321, - "step": 15128 - }, - { - "epoch": 0.5487718790160552, - "grad_norm": 1.3704477548599243, - "learning_rate": 2.7333207172706864e-05, - "loss": 1.5677, - "step": 15159 - }, - { - "epoch": 0.5498941118974786, - "grad_norm": 1.3601664304733276, - "learning_rate": 2.7249578296564088e-05, - "loss": 1.5577, - "step": 15190 - }, - { - "epoch": 0.551016344778902, - "grad_norm": 1.4055489301681519, - "learning_rate": 2.7165924034235973e-05, - "loss": 1.5453, - "step": 15221 - }, - { - "epoch": 0.5521385776603255, - "grad_norm": 1.3587946891784668, - "learning_rate": 2.708224532974953e-05, - "loss": 1.5401, - "step": 15252 - }, - { - "epoch": 0.5532608105417489, - "grad_norm": 1.3209632635116577, - "learning_rate": 2.6998543127407538e-05, - "loss": 1.5383, - "step": 15283 - }, - { - "epoch": 0.5543830434231724, - "grad_norm": 1.294921636581421, - "learning_rate": 2.6914818371777988e-05, - "loss": 1.5734, - "step": 15314 - }, - { - "epoch": 0.5555052763045957, - "grad_norm": 1.6017462015151978, - "learning_rate": 2.6831072007683373e-05, - "loss": 1.5702, - "step": 15345 - }, - { - "epoch": 0.5566275091860191, - "grad_norm": 1.3644670248031616, - "learning_rate": 2.6747304980190018e-05, - "loss": 1.571, - "step": 15376 - }, - { - "epoch": 0.5577497420674425, - "grad_norm": 1.3694461584091187, - "learning_rate": 2.6663518234597453e-05, - "loss": 1.5398, - "step": 15407 - }, - { - "epoch": 0.558871974948866, - "grad_norm": 1.3380069732666016, - "learning_rate": 2.6579712716427696e-05, - "loss": 1.5628, - "step": 15438 - }, - { - "epoch": 0.5599942078302894, - "grad_norm": 1.322144627571106, - "learning_rate": 2.6495889371414652e-05, - "loss": 1.5682, - "step": 15469 - }, - { - "epoch": 0.5611164407117128, - "grad_norm": 1.3240221738815308, - "learning_rate": 2.6412049145493367e-05, - "loss": 1.5506, - "step": 15500 - }, - { - "epoch": 0.5622386735931363, - "grad_norm": 1.3131070137023926, - "learning_rate": 2.632819298478939e-05, - "loss": 1.5529, - "step": 15531 - }, - { - "epoch": 0.5633609064745597, - "grad_norm": 1.3907220363616943, - "learning_rate": 2.6244321835608105e-05, - "loss": 1.547, - "step": 15562 - }, - { - "epoch": 0.5644831393559832, - "grad_norm": 1.233981966972351, - "learning_rate": 2.6160436644424024e-05, - "loss": 1.5377, - "step": 15593 - }, - { - "epoch": 0.5656053722374066, - "grad_norm": 1.443326711654663, - "learning_rate": 2.6076538357870133e-05, - "loss": 1.5788, - "step": 15624 - }, - { - "epoch": 0.56672760511883, - "grad_norm": 1.4688999652862549, - "learning_rate": 2.5992627922727196e-05, - "loss": 1.5629, - "step": 15655 - }, - { - "epoch": 0.5678498380002535, - "grad_norm": 1.3365731239318848, - "learning_rate": 2.5908706285913066e-05, - "loss": 1.5544, - "step": 15686 - }, - { - "epoch": 0.5689720708816768, - "grad_norm": 1.3793649673461914, - "learning_rate": 2.5824774394472008e-05, - "loss": 1.5317, - "step": 15717 - }, - { - "epoch": 0.5700943037631002, - "grad_norm": 1.417433738708496, - "learning_rate": 2.5740833195563996e-05, - "loss": 1.5506, - "step": 15748 - }, - { - "epoch": 0.5712165366445237, - "grad_norm": 1.346710443496704, - "learning_rate": 2.5656883636454067e-05, - "loss": 1.5462, - "step": 15779 - }, - { - "epoch": 0.5723387695259471, - "grad_norm": 1.4065468311309814, - "learning_rate": 2.557292666450159e-05, - "loss": 1.5464, - "step": 15810 - }, - { - "epoch": 0.5734610024073705, - "grad_norm": 1.3797588348388672, - "learning_rate": 2.5488963227149566e-05, - "loss": 1.565, - "step": 15841 - }, - { - "epoch": 0.574583235288794, - "grad_norm": 1.2842196226119995, - "learning_rate": 2.5404994271913983e-05, - "loss": 1.5489, - "step": 15872 - }, - { - "epoch": 0.5757054681702174, - "grad_norm": 1.368696689605713, - "learning_rate": 2.5321020746373085e-05, - "loss": 1.5358, - "step": 15903 - }, - { - "epoch": 0.5768277010516408, - "grad_norm": 1.3306961059570312, - "learning_rate": 2.52370435981567e-05, - "loss": 1.541, - "step": 15934 - }, - { - "epoch": 0.5779499339330643, - "grad_norm": 1.286727786064148, - "learning_rate": 2.5153063774935533e-05, - "loss": 1.533, - "step": 15965 - }, - { - "epoch": 0.5790721668144877, - "grad_norm": 1.434964656829834, - "learning_rate": 2.506908222441045e-05, - "loss": 1.5404, - "step": 15996 - }, - { - "epoch": 0.5801943996959111, - "grad_norm": 1.3955284357070923, - "learning_rate": 2.498509989430187e-05, - "loss": 1.5532, - "step": 16027 - }, - { - "epoch": 0.5813166325773346, - "grad_norm": 1.3676408529281616, - "learning_rate": 2.4901117732338958e-05, - "loss": 1.5263, - "step": 16058 - }, - { - "epoch": 0.5824388654587579, - "grad_norm": 1.3900113105773926, - "learning_rate": 2.481713668624899e-05, - "loss": 1.5465, - "step": 16089 - }, - { - "epoch": 0.5835610983401813, - "grad_norm": 1.3808554410934448, - "learning_rate": 2.4733157703746663e-05, - "loss": 1.5332, - "step": 16120 - }, - { - "epoch": 0.5846833312216048, - "grad_norm": 1.2974086999893188, - "learning_rate": 2.4649181732523392e-05, - "loss": 1.5562, - "step": 16151 - }, - { - "epoch": 0.5858055641030282, - "grad_norm": 1.4109300374984741, - "learning_rate": 2.4565209720236582e-05, - "loss": 1.5273, - "step": 16182 - }, - { - "epoch": 0.5869277969844516, - "grad_norm": 1.3626701831817627, - "learning_rate": 2.4481242614498975e-05, - "loss": 1.5311, - "step": 16213 - }, - { - "epoch": 0.5880500298658751, - "grad_norm": 1.3017241954803467, - "learning_rate": 2.439728136286796e-05, - "loss": 1.5522, - "step": 16244 - }, - { - "epoch": 0.5891722627472985, - "grad_norm": 1.349171757698059, - "learning_rate": 2.4313326912834852e-05, - "loss": 1.5262, - "step": 16275 - }, - { - "epoch": 0.5902944956287219, - "grad_norm": 1.3548376560211182, - "learning_rate": 2.4229380211814206e-05, - "loss": 1.5455, - "step": 16306 - }, - { - "epoch": 0.5914167285101454, - "grad_norm": 1.412003755569458, - "learning_rate": 2.4145442207133124e-05, - "loss": 1.5634, - "step": 16337 - }, - { - "epoch": 0.5925389613915688, - "grad_norm": 1.3400499820709229, - "learning_rate": 2.406151384602059e-05, - "loss": 1.5398, - "step": 16368 - }, - { - "epoch": 0.5936611942729922, - "grad_norm": 1.3035651445388794, - "learning_rate": 2.3977596075596747e-05, - "loss": 1.5289, - "step": 16399 - }, - { - "epoch": 0.5947834271544157, - "grad_norm": 1.322824478149414, - "learning_rate": 2.3893689842862223e-05, - "loss": 1.5509, - "step": 16430 - }, - { - "epoch": 0.595905660035839, - "grad_norm": 1.3810386657714844, - "learning_rate": 2.3809796094687475e-05, - "loss": 1.5439, - "step": 16461 - }, - { - "epoch": 0.5970278929172624, - "grad_norm": 1.399760127067566, - "learning_rate": 2.372591577780202e-05, - "loss": 1.5459, - "step": 16492 - }, - { - "epoch": 0.5981501257986859, - "grad_norm": 1.3253116607666016, - "learning_rate": 2.3642049838783838e-05, - "loss": 1.5556, - "step": 16523 - }, - { - "epoch": 0.5992723586801093, - "grad_norm": 1.3376234769821167, - "learning_rate": 2.3558199224048666e-05, - "loss": 1.5322, - "step": 16554 - }, - { - "epoch": 0.6003945915615327, - "grad_norm": 1.274533748626709, - "learning_rate": 2.347436487983929e-05, - "loss": 1.5288, - "step": 16585 - }, - { - "epoch": 0.6015168244429562, - "grad_norm": 1.3756400346755981, - "learning_rate": 2.3390547752214888e-05, - "loss": 1.5287, - "step": 16616 - }, - { - "epoch": 0.6026390573243796, - "grad_norm": 1.391845941543579, - "learning_rate": 2.330674878704035e-05, - "loss": 1.5329, - "step": 16647 - }, - { - "epoch": 0.603761290205803, - "grad_norm": 1.414237380027771, - "learning_rate": 2.322296892997561e-05, - "loss": 1.5482, - "step": 16678 - }, - { - "epoch": 0.6048835230872265, - "grad_norm": 1.3953816890716553, - "learning_rate": 2.313920912646497e-05, - "loss": 1.5372, - "step": 16709 - }, - { - "epoch": 0.6060057559686499, - "grad_norm": 1.3669557571411133, - "learning_rate": 2.305547032172643e-05, - "loss": 1.5522, - "step": 16740 - }, - { - "epoch": 0.6071279888500734, - "grad_norm": 1.3847616910934448, - "learning_rate": 2.2971753460741014e-05, - "loss": 1.5314, - "step": 16771 - }, - { - "epoch": 0.6082502217314968, - "grad_norm": 1.2923661470413208, - "learning_rate": 2.288805948824212e-05, - "loss": 1.5434, - "step": 16802 - }, - { - "epoch": 0.6093724546129201, - "grad_norm": 1.3146955966949463, - "learning_rate": 2.2804389348704858e-05, - "loss": 1.5442, - "step": 16833 - }, - { - "epoch": 0.6104946874943435, - "grad_norm": 1.362166166305542, - "learning_rate": 2.2720743986335374e-05, - "loss": 1.546, - "step": 16864 - }, - { - "epoch": 0.611616920375767, - "grad_norm": 1.3853099346160889, - "learning_rate": 2.2637124345060233e-05, - "loss": 1.5385, - "step": 16895 - }, - { - "epoch": 0.6127391532571904, - "grad_norm": 1.3611940145492554, - "learning_rate": 2.2553531368515695e-05, - "loss": 1.5577, - "step": 16926 - }, - { - "epoch": 0.6138613861386139, - "grad_norm": 1.3302477598190308, - "learning_rate": 2.2469966000037144e-05, - "loss": 1.5566, - "step": 16957 - }, - { - "epoch": 0.6149836190200373, - "grad_norm": 1.3969210386276245, - "learning_rate": 2.2386429182648417e-05, - "loss": 1.5459, - "step": 16988 - }, - { - "epoch": 0.6161058519014607, - "grad_norm": 1.3878018856048584, - "learning_rate": 2.230292185905114e-05, - "loss": 1.5295, - "step": 17019 - }, - { - "epoch": 0.6172280847828842, - "grad_norm": 1.3366162776947021, - "learning_rate": 2.2219444971614116e-05, - "loss": 1.5485, - "step": 17050 - }, - { - "epoch": 0.6183503176643076, - "grad_norm": 1.3503491878509521, - "learning_rate": 2.2135999462362655e-05, - "loss": 1.5266, - "step": 17081 - }, - { - "epoch": 0.619472550545731, - "grad_norm": 1.3379223346710205, - "learning_rate": 2.2052586272968003e-05, - "loss": 1.5366, - "step": 17112 - }, - { - "epoch": 0.6205947834271545, - "grad_norm": 1.299849033355713, - "learning_rate": 2.196920634473666e-05, - "loss": 1.5315, - "step": 17143 - }, - { - "epoch": 0.6217170163085779, - "grad_norm": 1.3590292930603027, - "learning_rate": 2.1885860618599787e-05, - "loss": 1.5332, - "step": 17174 - }, - { - "epoch": 0.6228392491900012, - "grad_norm": 1.3150153160095215, - "learning_rate": 2.1802550035102577e-05, - "loss": 1.5197, - "step": 17205 - }, - { - "epoch": 0.6239614820714247, - "grad_norm": 1.3216016292572021, - "learning_rate": 2.171927553439363e-05, - "loss": 1.5344, - "step": 17236 - }, - { - "epoch": 0.6250837149528481, - "grad_norm": 1.3521660566329956, - "learning_rate": 2.1636038056214376e-05, - "loss": 1.5236, - "step": 17267 - }, - { - "epoch": 0.6262059478342715, - "grad_norm": 1.4077104330062866, - "learning_rate": 2.155283853988844e-05, - "loss": 1.5318, - "step": 17298 - }, - { - "epoch": 0.627328180715695, - "grad_norm": 1.4986066818237305, - "learning_rate": 2.146967792431106e-05, - "loss": 1.5466, - "step": 17329 - }, - { - "epoch": 0.6284504135971184, - "grad_norm": 1.2227765321731567, - "learning_rate": 2.138655714793849e-05, - "loss": 1.5345, - "step": 17360 - }, - { - "epoch": 0.6295726464785418, - "grad_norm": 1.3314886093139648, - "learning_rate": 2.1303477148777367e-05, - "loss": 1.5376, - "step": 17391 - }, - { - "epoch": 0.6306948793599653, - "grad_norm": 1.3682267665863037, - "learning_rate": 2.122043886437421e-05, - "loss": 1.5313, - "step": 17422 - }, - { - "epoch": 0.6318171122413887, - "grad_norm": 1.3226497173309326, - "learning_rate": 2.1137443231804765e-05, - "loss": 1.5361, - "step": 17453 - }, - { - "epoch": 0.6329393451228121, - "grad_norm": 1.3603419065475464, - "learning_rate": 2.105449118766347e-05, - "loss": 1.5353, - "step": 17484 - }, - { - "epoch": 0.6340615780042356, - "grad_norm": 1.3611435890197754, - "learning_rate": 2.097158366805287e-05, - "loss": 1.5449, - "step": 17515 - }, - { - "epoch": 0.6351838108856589, - "grad_norm": 1.3318766355514526, - "learning_rate": 2.0888721608573047e-05, - "loss": 1.5194, - "step": 17546 - }, - { - "epoch": 0.6363060437670823, - "grad_norm": 1.3144105672836304, - "learning_rate": 2.0805905944311087e-05, - "loss": 1.5288, - "step": 17577 - }, - { - "epoch": 0.6374282766485058, - "grad_norm": 1.3346774578094482, - "learning_rate": 2.0723137609830497e-05, - "loss": 1.5278, - "step": 17608 - }, - { - "epoch": 0.6385505095299292, - "grad_norm": 1.4217780828475952, - "learning_rate": 2.0640417539160686e-05, - "loss": 1.5467, - "step": 17639 - }, - { - "epoch": 0.6396727424113526, - "grad_norm": 1.3335380554199219, - "learning_rate": 2.0557746665786427e-05, - "loss": 1.5506, - "step": 17670 - }, - { - "epoch": 0.6407949752927761, - "grad_norm": 1.3793307542800903, - "learning_rate": 2.0475125922637256e-05, - "loss": 1.5172, - "step": 17701 - }, - { - "epoch": 0.6419172081741995, - "grad_norm": 1.3435157537460327, - "learning_rate": 2.0392556242077047e-05, - "loss": 1.5137, - "step": 17732 - }, - { - "epoch": 0.6430394410556229, - "grad_norm": 1.3066918849945068, - "learning_rate": 2.031003855589343e-05, - "loss": 1.5184, - "step": 17763 - }, - { - "epoch": 0.6441616739370464, - "grad_norm": 1.4214332103729248, - "learning_rate": 2.022757379528727e-05, - "loss": 1.5239, - "step": 17794 - }, - { - "epoch": 0.6452839068184698, - "grad_norm": 1.3571085929870605, - "learning_rate": 2.0145162890862184e-05, - "loss": 1.5234, - "step": 17825 - }, - { - "epoch": 0.6464061396998932, - "grad_norm": 1.2680344581604004, - "learning_rate": 2.0062806772614022e-05, - "loss": 1.5207, - "step": 17856 - }, - { - "epoch": 0.6475283725813167, - "grad_norm": 1.3365403413772583, - "learning_rate": 1.9980506369920392e-05, - "loss": 1.5457, - "step": 17887 - }, - { - "epoch": 0.64865060546274, - "grad_norm": 1.3576997518539429, - "learning_rate": 1.989826261153015e-05, - "loss": 1.516, - "step": 17918 - }, - { - "epoch": 0.6497728383441634, - "grad_norm": 1.3189170360565186, - "learning_rate": 1.9816076425552923e-05, - "loss": 1.5204, - "step": 17949 - }, - { - "epoch": 0.6508950712255869, - "grad_norm": 1.2855075597763062, - "learning_rate": 1.9733948739448676e-05, - "loss": 1.5131, - "step": 17980 - }, - { - "epoch": 0.6520173041070103, - "grad_norm": 1.3004227876663208, - "learning_rate": 1.9651880480017155e-05, - "loss": 1.5495, - "step": 18011 - }, - { - "epoch": 0.6531395369884337, - "grad_norm": 1.3858931064605713, - "learning_rate": 1.9569872573387516e-05, - "loss": 1.529, - "step": 18042 - }, - { - "epoch": 0.6542617698698572, - "grad_norm": 1.378490686416626, - "learning_rate": 1.9487925945007854e-05, - "loss": 1.5281, - "step": 18073 - }, - { - "epoch": 0.6553840027512806, - "grad_norm": 1.317062258720398, - "learning_rate": 1.9406041519634726e-05, - "loss": 1.5294, - "step": 18104 - }, - { - "epoch": 0.656506235632704, - "grad_norm": 1.313314437866211, - "learning_rate": 1.932422022132275e-05, - "loss": 1.5343, - "step": 18135 - }, - { - "epoch": 0.6576284685141275, - "grad_norm": 1.3339669704437256, - "learning_rate": 1.924246297341414e-05, - "loss": 1.5203, - "step": 18166 - }, - { - "epoch": 0.6587507013955509, - "grad_norm": 1.298256516456604, - "learning_rate": 1.9160770698528338e-05, - "loss": 1.5297, - "step": 18197 - }, - { - "epoch": 0.6598729342769744, - "grad_norm": 1.322373628616333, - "learning_rate": 1.907914431855156e-05, - "loss": 1.5307, - "step": 18228 - }, - { - "epoch": 0.6609951671583978, - "grad_norm": 1.403425931930542, - "learning_rate": 1.8997584754626412e-05, - "loss": 1.5279, - "step": 18259 - }, - { - "epoch": 0.6621174000398211, - "grad_norm": 1.3005762100219727, - "learning_rate": 1.8916092927141486e-05, - "loss": 1.5325, - "step": 18290 - } - ], - "logging_steps": 31, - "max_steps": 30517, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 3052, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.3583670324133626e+19, - "train_batch_size": 8, - "trial_name": null, - "trial_params": null -}