{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11369019422074846, "eval_steps": 10, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011369019422074846, "grad_norm": 44.27219009399414, "learning_rate": 2.6333333333333332e-06, "loss": 0.1206, "step": 1 }, { "epoch": 0.0022738038844149692, "grad_norm": 19.341272354125977, "learning_rate": 4.266666666666667e-06, "loss": 0.1003, "step": 2 }, { "epoch": 0.0034107058266224536, "grad_norm": 16.74460220336914, "learning_rate": 5.9e-06, "loss": 0.09, "step": 3 }, { "epoch": 0.0045476077688299385, "grad_norm": 13.033002853393555, "learning_rate": 7.533333333333334e-06, "loss": 0.0804, "step": 4 }, { "epoch": 0.005684509711037423, "grad_norm": 11.623870849609375, "learning_rate": 9.166666666666668e-06, "loss": 0.0779, "step": 5 }, { "epoch": 0.006821411653244907, "grad_norm": 53.65879821777344, "learning_rate": 1.0800000000000002e-05, "loss": 0.0691, "step": 6 }, { "epoch": 0.007958313595452392, "grad_norm": 16.973114013671875, "learning_rate": 1.2433333333333335e-05, "loss": 0.0718, "step": 7 }, { "epoch": 0.009095215537659877, "grad_norm": 9.403450012207031, "learning_rate": 1.4066666666666669e-05, "loss": 0.0511, "step": 8 }, { "epoch": 0.010232117479867362, "grad_norm": 7.661226749420166, "learning_rate": 1.5700000000000002e-05, "loss": 0.0475, "step": 9 }, { "epoch": 0.011369019422074847, "grad_norm": 4.5566935539245605, "learning_rate": 1.7333333333333336e-05, "loss": 0.0436, "step": 10 }, { "epoch": 0.012505921364282331, "grad_norm": 16.773433685302734, "learning_rate": 1.896666666666667e-05, "loss": 0.0377, "step": 11 }, { "epoch": 0.013642823306489815, "grad_norm": 11.166831970214844, "learning_rate": 2.0600000000000003e-05, "loss": 0.0498, "step": 12 }, { "epoch": 0.0147797252486973, "grad_norm": 7.220335006713867, "learning_rate": 2.2233333333333336e-05, "loss": 0.0481, "step": 13 }, { "epoch": 0.015916627190904784, "grad_norm": 3.7963950634002686, "learning_rate": 2.386666666666667e-05, "loss": 0.0355, "step": 14 }, { "epoch": 0.01705352913311227, "grad_norm": 2.3258769512176514, "learning_rate": 2.5500000000000003e-05, "loss": 0.0312, "step": 15 }, { "epoch": 0.018190431075319754, "grad_norm": 10.604592323303223, "learning_rate": 2.7133333333333337e-05, "loss": 0.0413, "step": 16 }, { "epoch": 0.01932733301752724, "grad_norm": 6.096976280212402, "learning_rate": 2.876666666666667e-05, "loss": 0.0393, "step": 17 }, { "epoch": 0.020464234959734724, "grad_norm": 8.784629821777344, "learning_rate": 3.0400000000000004e-05, "loss": 0.0437, "step": 18 }, { "epoch": 0.02160113690194221, "grad_norm": 2.440566062927246, "learning_rate": 3.203333333333333e-05, "loss": 0.0371, "step": 19 }, { "epoch": 0.022738038844149693, "grad_norm": 2.531832456588745, "learning_rate": 3.366666666666667e-05, "loss": 0.039, "step": 20 }, { "epoch": 0.023874940786357178, "grad_norm": 6.957711696624756, "learning_rate": 3.53e-05, "loss": 0.0395, "step": 21 }, { "epoch": 0.025011842728564663, "grad_norm": 3.283383846282959, "learning_rate": 3.6933333333333334e-05, "loss": 0.0411, "step": 22 }, { "epoch": 0.026148744670772144, "grad_norm": 4.330756187438965, "learning_rate": 3.856666666666667e-05, "loss": 0.0424, "step": 23 }, { "epoch": 0.02728564661297963, "grad_norm": 6.183596134185791, "learning_rate": 4.02e-05, "loss": 0.0374, "step": 24 }, { "epoch": 0.028422548555187114, "grad_norm": 2.192777633666992, "learning_rate": 4.183333333333334e-05, "loss": 0.0351, "step": 25 }, { "epoch": 0.0295594504973946, "grad_norm": 1.8996012210845947, "learning_rate": 4.346666666666667e-05, "loss": 0.0343, "step": 26 }, { "epoch": 0.030696352439602084, "grad_norm": 2.9002740383148193, "learning_rate": 4.5100000000000005e-05, "loss": 0.0344, "step": 27 }, { "epoch": 0.03183325438180957, "grad_norm": 2.6055006980895996, "learning_rate": 4.6733333333333335e-05, "loss": 0.0412, "step": 28 }, { "epoch": 0.03297015632401706, "grad_norm": 2.2943973541259766, "learning_rate": 4.836666666666667e-05, "loss": 0.0384, "step": 29 }, { "epoch": 0.03410705826622454, "grad_norm": 2.2072465419769287, "learning_rate": 5e-05, "loss": 0.0376, "step": 30 }, { "epoch": 0.03524396020843202, "grad_norm": 1.7547041177749634, "learning_rate": 4.999834154609218e-05, "loss": 0.0345, "step": 31 }, { "epoch": 0.03638086215063951, "grad_norm": 1.8482444286346436, "learning_rate": 4.999336640889681e-05, "loss": 0.03, "step": 32 }, { "epoch": 0.03751776409284699, "grad_norm": 1.3557249307632446, "learning_rate": 4.998507526196785e-05, "loss": 0.0325, "step": 33 }, { "epoch": 0.03865466603505448, "grad_norm": 1.5277765989303589, "learning_rate": 4.997346922779386e-05, "loss": 0.0286, "step": 34 }, { "epoch": 0.03979156797726196, "grad_norm": 1.3418374061584473, "learning_rate": 4.9958549877646073e-05, "loss": 0.027, "step": 35 }, { "epoch": 0.04092846991946945, "grad_norm": 1.3661500215530396, "learning_rate": 4.994031923136569e-05, "loss": 0.032, "step": 36 }, { "epoch": 0.04206537186167693, "grad_norm": 2.6273722648620605, "learning_rate": 4.99187797570904e-05, "loss": 0.0408, "step": 37 }, { "epoch": 0.04320227380388442, "grad_norm": 2.770503044128418, "learning_rate": 4.9893934370920207e-05, "loss": 0.0442, "step": 38 }, { "epoch": 0.0443391757460919, "grad_norm": 1.315748929977417, "learning_rate": 4.98657864365227e-05, "loss": 0.0284, "step": 39 }, { "epoch": 0.045476077688299386, "grad_norm": 1.6384832859039307, "learning_rate": 4.9834339764677606e-05, "loss": 0.0327, "step": 40 }, { "epoch": 0.04661297963050687, "grad_norm": 1.925856590270996, "learning_rate": 4.979959861276091e-05, "loss": 0.0368, "step": 41 }, { "epoch": 0.047749881572714356, "grad_norm": 1.6436312198638916, "learning_rate": 4.976156768416848e-05, "loss": 0.0343, "step": 42 }, { "epoch": 0.04888678351492184, "grad_norm": 2.1728603839874268, "learning_rate": 4.9720252127679233e-05, "loss": 0.0412, "step": 43 }, { "epoch": 0.050023685457129326, "grad_norm": 2.440638303756714, "learning_rate": 4.96756575367582e-05, "loss": 0.0464, "step": 44 }, { "epoch": 0.05116058739933681, "grad_norm": 1.398763656616211, "learning_rate": 4.96277899487991e-05, "loss": 0.0295, "step": 45 }, { "epoch": 0.05229748934154429, "grad_norm": 1.3202557563781738, "learning_rate": 4.957665584430713e-05, "loss": 0.0275, "step": 46 }, { "epoch": 0.05343439128375178, "grad_norm": 1.2263014316558838, "learning_rate": 4.9522262146021495e-05, "loss": 0.0328, "step": 47 }, { "epoch": 0.05457129322595926, "grad_norm": 1.163794994354248, "learning_rate": 4.946461621797824e-05, "loss": 0.0251, "step": 48 }, { "epoch": 0.055708195168166746, "grad_norm": 1.9840651750564575, "learning_rate": 4.940372586451325e-05, "loss": 0.036, "step": 49 }, { "epoch": 0.05684509711037423, "grad_norm": 1.7282077074050903, "learning_rate": 4.9339599329205686e-05, "loss": 0.0333, "step": 50 }, { "epoch": 0.057981999052581716, "grad_norm": 1.5609796047210693, "learning_rate": 4.927224529376191e-05, "loss": 0.0353, "step": 51 }, { "epoch": 0.0591189009947892, "grad_norm": 1.7375032901763916, "learning_rate": 4.920167287684016e-05, "loss": 0.0299, "step": 52 }, { "epoch": 0.060255802936996686, "grad_norm": 2.5660016536712646, "learning_rate": 4.912789163281601e-05, "loss": 0.0348, "step": 53 }, { "epoch": 0.06139270487920417, "grad_norm": 1.1275742053985596, "learning_rate": 4.905091155048882e-05, "loss": 0.0279, "step": 54 }, { "epoch": 0.06252960682141165, "grad_norm": 2.46233868598938, "learning_rate": 4.897074305172948e-05, "loss": 0.0384, "step": 55 }, { "epoch": 0.06366650876361914, "grad_norm": 1.4302200078964233, "learning_rate": 4.8887396990069434e-05, "loss": 0.0291, "step": 56 }, { "epoch": 0.06480341070582663, "grad_norm": 1.8360644578933716, "learning_rate": 4.8800884649231264e-05, "loss": 0.0355, "step": 57 }, { "epoch": 0.06594031264803411, "grad_norm": 2.182518720626831, "learning_rate": 4.871121774160107e-05, "loss": 0.0428, "step": 58 }, { "epoch": 0.06707721459024159, "grad_norm": 3.9822723865509033, "learning_rate": 4.8618408406642795e-05, "loss": 0.0336, "step": 59 }, { "epoch": 0.06821411653244908, "grad_norm": 1.4780116081237793, "learning_rate": 4.852246920925476e-05, "loss": 0.035, "step": 60 }, { "epoch": 0.06935101847465656, "grad_norm": 1.0790804624557495, "learning_rate": 4.842341313806852e-05, "loss": 0.0279, "step": 61 }, { "epoch": 0.07048792041686404, "grad_norm": 1.4283066987991333, "learning_rate": 4.832125360369049e-05, "loss": 0.032, "step": 62 }, { "epoch": 0.07162482235907153, "grad_norm": 1.0618592500686646, "learning_rate": 4.82160044368863e-05, "loss": 0.0275, "step": 63 }, { "epoch": 0.07276172430127902, "grad_norm": 1.6941381692886353, "learning_rate": 4.810767988670834e-05, "loss": 0.038, "step": 64 }, { "epoch": 0.0738986262434865, "grad_norm": 1.1400556564331055, "learning_rate": 4.799629461856672e-05, "loss": 0.0246, "step": 65 }, { "epoch": 0.07503552818569398, "grad_norm": 1.0262879133224487, "learning_rate": 4.788186371224372e-05, "loss": 0.0274, "step": 66 }, { "epoch": 0.07617243012790147, "grad_norm": 1.6120742559432983, "learning_rate": 4.776440265985233e-05, "loss": 0.0382, "step": 67 }, { "epoch": 0.07730933207010895, "grad_norm": 1.3485040664672852, "learning_rate": 4.764392736373876e-05, "loss": 0.0339, "step": 68 }, { "epoch": 0.07844623401231644, "grad_norm": 0.9511722922325134, "learning_rate": 4.7520454134329594e-05, "loss": 0.024, "step": 69 }, { "epoch": 0.07958313595452392, "grad_norm": 1.5677837133407593, "learning_rate": 4.73939996879236e-05, "loss": 0.0387, "step": 70 }, { "epoch": 0.0807200378967314, "grad_norm": 2.56783390045166, "learning_rate": 4.72645811444286e-05, "loss": 0.0357, "step": 71 }, { "epoch": 0.0818569398389389, "grad_norm": 1.1185239553451538, "learning_rate": 4.7132216025043714e-05, "loss": 0.0254, "step": 72 }, { "epoch": 0.08299384178114638, "grad_norm": 1.1929640769958496, "learning_rate": 4.699692224988726e-05, "loss": 0.028, "step": 73 }, { "epoch": 0.08413074372335386, "grad_norm": 1.2500364780426025, "learning_rate": 4.685871813557068e-05, "loss": 0.0288, "step": 74 }, { "epoch": 0.08526764566556135, "grad_norm": 2.6409554481506348, "learning_rate": 4.671762239271875e-05, "loss": 0.0318, "step": 75 }, { "epoch": 0.08640454760776883, "grad_norm": 1.37834632396698, "learning_rate": 4.6573654123436456e-05, "loss": 0.032, "step": 76 }, { "epoch": 0.08754144954997631, "grad_norm": 1.6866464614868164, "learning_rate": 4.642683281872288e-05, "loss": 0.036, "step": 77 }, { "epoch": 0.0886783514921838, "grad_norm": 1.586599588394165, "learning_rate": 4.6277178355832434e-05, "loss": 0.0315, "step": 78 }, { "epoch": 0.08981525343439128, "grad_norm": 1.7033288478851318, "learning_rate": 4.6124710995583807e-05, "loss": 0.0401, "step": 79 }, { "epoch": 0.09095215537659877, "grad_norm": 1.076962947845459, "learning_rate": 4.5969451379616945e-05, "loss": 0.0279, "step": 80 }, { "epoch": 0.09208905731880625, "grad_norm": 1.5642842054367065, "learning_rate": 4.581142052759852e-05, "loss": 0.0335, "step": 81 }, { "epoch": 0.09322595926101374, "grad_norm": 1.076861023902893, "learning_rate": 4.565063983437623e-05, "loss": 0.0273, "step": 82 }, { "epoch": 0.09436286120322122, "grad_norm": 1.4380812644958496, "learning_rate": 4.548713106708222e-05, "loss": 0.035, "step": 83 }, { "epoch": 0.09549976314542871, "grad_norm": 1.1404873132705688, "learning_rate": 4.532091636218621e-05, "loss": 0.0265, "step": 84 }, { "epoch": 0.09663666508763619, "grad_norm": 1.5228550434112549, "learning_rate": 4.5152018222498574e-05, "loss": 0.0357, "step": 85 }, { "epoch": 0.09777356702984367, "grad_norm": 1.139212727546692, "learning_rate": 4.498045951412377e-05, "loss": 0.0275, "step": 86 }, { "epoch": 0.09891046897205116, "grad_norm": 1.1634677648544312, "learning_rate": 4.480626346336469e-05, "loss": 0.0278, "step": 87 }, { "epoch": 0.10004737091425865, "grad_norm": 0.8284496665000916, "learning_rate": 4.462945365357815e-05, "loss": 0.0219, "step": 88 }, { "epoch": 0.10118427285646613, "grad_norm": 1.2063112258911133, "learning_rate": 4.4450054021982115e-05, "loss": 0.0325, "step": 89 }, { "epoch": 0.10232117479867361, "grad_norm": 1.024289608001709, "learning_rate": 4.426808885641496e-05, "loss": 0.0261, "step": 90 }, { "epoch": 0.1034580767408811, "grad_norm": 1.0136009454727173, "learning_rate": 4.408358279204729e-05, "loss": 0.0262, "step": 91 }, { "epoch": 0.10459497868308858, "grad_norm": 1.2363288402557373, "learning_rate": 4.389656080804674e-05, "loss": 0.0254, "step": 92 }, { "epoch": 0.10573188062529607, "grad_norm": 0.9798862338066101, "learning_rate": 4.370704822419616e-05, "loss": 0.0239, "step": 93 }, { "epoch": 0.10686878256750355, "grad_norm": 1.3530546426773071, "learning_rate": 4.3515070697465805e-05, "loss": 0.0282, "step": 94 }, { "epoch": 0.10800568450971104, "grad_norm": 2.1118574142456055, "learning_rate": 4.33206542185397e-05, "loss": 0.0391, "step": 95 }, { "epoch": 0.10914258645191852, "grad_norm": 1.6785742044448853, "learning_rate": 4.3123825108296954e-05, "loss": 0.0326, "step": 96 }, { "epoch": 0.110279488394126, "grad_norm": 1.7932261228561401, "learning_rate": 4.292461001424836e-05, "loss": 0.0337, "step": 97 }, { "epoch": 0.11141639033633349, "grad_norm": 2.1050026416778564, "learning_rate": 4.272303590692872e-05, "loss": 0.0332, "step": 98 }, { "epoch": 0.11255329227854098, "grad_norm": 1.169748306274414, "learning_rate": 4.251913007624543e-05, "loss": 0.0273, "step": 99 }, { "epoch": 0.11369019422074846, "grad_norm": 1.7639108896255493, "learning_rate": 4.231292012778398e-05, "loss": 0.0367, "step": 100 } ], "logging_steps": 1, "max_steps": 26400, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }