| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.11369019422074846, | |
| "eval_steps": 10, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0011369019422074846, | |
| "grad_norm": 44.27219009399414, | |
| "learning_rate": 2.6333333333333332e-06, | |
| "loss": 0.1206, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0022738038844149692, | |
| "grad_norm": 19.341272354125977, | |
| "learning_rate": 4.266666666666667e-06, | |
| "loss": 0.1003, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0034107058266224536, | |
| "grad_norm": 16.74460220336914, | |
| "learning_rate": 5.9e-06, | |
| "loss": 0.09, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0045476077688299385, | |
| "grad_norm": 13.033002853393555, | |
| "learning_rate": 7.533333333333334e-06, | |
| "loss": 0.0804, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.005684509711037423, | |
| "grad_norm": 11.623870849609375, | |
| "learning_rate": 9.166666666666668e-06, | |
| "loss": 0.0779, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.006821411653244907, | |
| "grad_norm": 53.65879821777344, | |
| "learning_rate": 1.0800000000000002e-05, | |
| "loss": 0.0691, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.007958313595452392, | |
| "grad_norm": 16.973114013671875, | |
| "learning_rate": 1.2433333333333335e-05, | |
| "loss": 0.0718, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.009095215537659877, | |
| "grad_norm": 9.403450012207031, | |
| "learning_rate": 1.4066666666666669e-05, | |
| "loss": 0.0511, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.010232117479867362, | |
| "grad_norm": 7.661226749420166, | |
| "learning_rate": 1.5700000000000002e-05, | |
| "loss": 0.0475, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.011369019422074847, | |
| "grad_norm": 4.5566935539245605, | |
| "learning_rate": 1.7333333333333336e-05, | |
| "loss": 0.0436, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.012505921364282331, | |
| "grad_norm": 16.773433685302734, | |
| "learning_rate": 1.896666666666667e-05, | |
| "loss": 0.0377, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.013642823306489815, | |
| "grad_norm": 11.166831970214844, | |
| "learning_rate": 2.0600000000000003e-05, | |
| "loss": 0.0498, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0147797252486973, | |
| "grad_norm": 7.220335006713867, | |
| "learning_rate": 2.2233333333333336e-05, | |
| "loss": 0.0481, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.015916627190904784, | |
| "grad_norm": 3.7963950634002686, | |
| "learning_rate": 2.386666666666667e-05, | |
| "loss": 0.0355, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.01705352913311227, | |
| "grad_norm": 2.3258769512176514, | |
| "learning_rate": 2.5500000000000003e-05, | |
| "loss": 0.0312, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.018190431075319754, | |
| "grad_norm": 10.604592323303223, | |
| "learning_rate": 2.7133333333333337e-05, | |
| "loss": 0.0413, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.01932733301752724, | |
| "grad_norm": 6.096976280212402, | |
| "learning_rate": 2.876666666666667e-05, | |
| "loss": 0.0393, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.020464234959734724, | |
| "grad_norm": 8.784629821777344, | |
| "learning_rate": 3.0400000000000004e-05, | |
| "loss": 0.0437, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.02160113690194221, | |
| "grad_norm": 2.440566062927246, | |
| "learning_rate": 3.203333333333333e-05, | |
| "loss": 0.0371, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.022738038844149693, | |
| "grad_norm": 2.531832456588745, | |
| "learning_rate": 3.366666666666667e-05, | |
| "loss": 0.039, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.023874940786357178, | |
| "grad_norm": 6.957711696624756, | |
| "learning_rate": 3.53e-05, | |
| "loss": 0.0395, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.025011842728564663, | |
| "grad_norm": 3.283383846282959, | |
| "learning_rate": 3.6933333333333334e-05, | |
| "loss": 0.0411, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.026148744670772144, | |
| "grad_norm": 4.330756187438965, | |
| "learning_rate": 3.856666666666667e-05, | |
| "loss": 0.0424, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.02728564661297963, | |
| "grad_norm": 6.183596134185791, | |
| "learning_rate": 4.02e-05, | |
| "loss": 0.0374, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.028422548555187114, | |
| "grad_norm": 2.192777633666992, | |
| "learning_rate": 4.183333333333334e-05, | |
| "loss": 0.0351, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0295594504973946, | |
| "grad_norm": 1.8996012210845947, | |
| "learning_rate": 4.346666666666667e-05, | |
| "loss": 0.0343, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.030696352439602084, | |
| "grad_norm": 2.9002740383148193, | |
| "learning_rate": 4.5100000000000005e-05, | |
| "loss": 0.0344, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.03183325438180957, | |
| "grad_norm": 2.6055006980895996, | |
| "learning_rate": 4.6733333333333335e-05, | |
| "loss": 0.0412, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.03297015632401706, | |
| "grad_norm": 2.2943973541259766, | |
| "learning_rate": 4.836666666666667e-05, | |
| "loss": 0.0384, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.03410705826622454, | |
| "grad_norm": 2.2072465419769287, | |
| "learning_rate": 5e-05, | |
| "loss": 0.0376, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03524396020843202, | |
| "grad_norm": 1.7547041177749634, | |
| "learning_rate": 4.999834154609218e-05, | |
| "loss": 0.0345, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.03638086215063951, | |
| "grad_norm": 1.8482444286346436, | |
| "learning_rate": 4.999336640889681e-05, | |
| "loss": 0.03, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.03751776409284699, | |
| "grad_norm": 1.3557249307632446, | |
| "learning_rate": 4.998507526196785e-05, | |
| "loss": 0.0325, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.03865466603505448, | |
| "grad_norm": 1.5277765989303589, | |
| "learning_rate": 4.997346922779386e-05, | |
| "loss": 0.0286, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.03979156797726196, | |
| "grad_norm": 1.3418374061584473, | |
| "learning_rate": 4.9958549877646073e-05, | |
| "loss": 0.027, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.04092846991946945, | |
| "grad_norm": 1.3661500215530396, | |
| "learning_rate": 4.994031923136569e-05, | |
| "loss": 0.032, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.04206537186167693, | |
| "grad_norm": 2.6273722648620605, | |
| "learning_rate": 4.99187797570904e-05, | |
| "loss": 0.0408, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.04320227380388442, | |
| "grad_norm": 2.770503044128418, | |
| "learning_rate": 4.9893934370920207e-05, | |
| "loss": 0.0442, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.0443391757460919, | |
| "grad_norm": 1.315748929977417, | |
| "learning_rate": 4.98657864365227e-05, | |
| "loss": 0.0284, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.045476077688299386, | |
| "grad_norm": 1.6384832859039307, | |
| "learning_rate": 4.9834339764677606e-05, | |
| "loss": 0.0327, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04661297963050687, | |
| "grad_norm": 1.925856590270996, | |
| "learning_rate": 4.979959861276091e-05, | |
| "loss": 0.0368, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.047749881572714356, | |
| "grad_norm": 1.6436312198638916, | |
| "learning_rate": 4.976156768416848e-05, | |
| "loss": 0.0343, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.04888678351492184, | |
| "grad_norm": 2.1728603839874268, | |
| "learning_rate": 4.9720252127679233e-05, | |
| "loss": 0.0412, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.050023685457129326, | |
| "grad_norm": 2.440638303756714, | |
| "learning_rate": 4.96756575367582e-05, | |
| "loss": 0.0464, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.05116058739933681, | |
| "grad_norm": 1.398763656616211, | |
| "learning_rate": 4.96277899487991e-05, | |
| "loss": 0.0295, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.05229748934154429, | |
| "grad_norm": 1.3202557563781738, | |
| "learning_rate": 4.957665584430713e-05, | |
| "loss": 0.0275, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.05343439128375178, | |
| "grad_norm": 1.2263014316558838, | |
| "learning_rate": 4.9522262146021495e-05, | |
| "loss": 0.0328, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.05457129322595926, | |
| "grad_norm": 1.163794994354248, | |
| "learning_rate": 4.946461621797824e-05, | |
| "loss": 0.0251, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.055708195168166746, | |
| "grad_norm": 1.9840651750564575, | |
| "learning_rate": 4.940372586451325e-05, | |
| "loss": 0.036, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.05684509711037423, | |
| "grad_norm": 1.7282077074050903, | |
| "learning_rate": 4.9339599329205686e-05, | |
| "loss": 0.0333, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.057981999052581716, | |
| "grad_norm": 1.5609796047210693, | |
| "learning_rate": 4.927224529376191e-05, | |
| "loss": 0.0353, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.0591189009947892, | |
| "grad_norm": 1.7375032901763916, | |
| "learning_rate": 4.920167287684016e-05, | |
| "loss": 0.0299, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.060255802936996686, | |
| "grad_norm": 2.5660016536712646, | |
| "learning_rate": 4.912789163281601e-05, | |
| "loss": 0.0348, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.06139270487920417, | |
| "grad_norm": 1.1275742053985596, | |
| "learning_rate": 4.905091155048882e-05, | |
| "loss": 0.0279, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.06252960682141165, | |
| "grad_norm": 2.46233868598938, | |
| "learning_rate": 4.897074305172948e-05, | |
| "loss": 0.0384, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.06366650876361914, | |
| "grad_norm": 1.4302200078964233, | |
| "learning_rate": 4.8887396990069434e-05, | |
| "loss": 0.0291, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.06480341070582663, | |
| "grad_norm": 1.8360644578933716, | |
| "learning_rate": 4.8800884649231264e-05, | |
| "loss": 0.0355, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.06594031264803411, | |
| "grad_norm": 2.182518720626831, | |
| "learning_rate": 4.871121774160107e-05, | |
| "loss": 0.0428, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.06707721459024159, | |
| "grad_norm": 3.9822723865509033, | |
| "learning_rate": 4.8618408406642795e-05, | |
| "loss": 0.0336, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.06821411653244908, | |
| "grad_norm": 1.4780116081237793, | |
| "learning_rate": 4.852246920925476e-05, | |
| "loss": 0.035, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06935101847465656, | |
| "grad_norm": 1.0790804624557495, | |
| "learning_rate": 4.842341313806852e-05, | |
| "loss": 0.0279, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.07048792041686404, | |
| "grad_norm": 1.4283066987991333, | |
| "learning_rate": 4.832125360369049e-05, | |
| "loss": 0.032, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.07162482235907153, | |
| "grad_norm": 1.0618592500686646, | |
| "learning_rate": 4.82160044368863e-05, | |
| "loss": 0.0275, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.07276172430127902, | |
| "grad_norm": 1.6941381692886353, | |
| "learning_rate": 4.810767988670834e-05, | |
| "loss": 0.038, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.0738986262434865, | |
| "grad_norm": 1.1400556564331055, | |
| "learning_rate": 4.799629461856672e-05, | |
| "loss": 0.0246, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.07503552818569398, | |
| "grad_norm": 1.0262879133224487, | |
| "learning_rate": 4.788186371224372e-05, | |
| "loss": 0.0274, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.07617243012790147, | |
| "grad_norm": 1.6120742559432983, | |
| "learning_rate": 4.776440265985233e-05, | |
| "loss": 0.0382, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.07730933207010895, | |
| "grad_norm": 1.3485040664672852, | |
| "learning_rate": 4.764392736373876e-05, | |
| "loss": 0.0339, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.07844623401231644, | |
| "grad_norm": 0.9511722922325134, | |
| "learning_rate": 4.7520454134329594e-05, | |
| "loss": 0.024, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.07958313595452392, | |
| "grad_norm": 1.5677837133407593, | |
| "learning_rate": 4.73939996879236e-05, | |
| "loss": 0.0387, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0807200378967314, | |
| "grad_norm": 2.56783390045166, | |
| "learning_rate": 4.72645811444286e-05, | |
| "loss": 0.0357, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.0818569398389389, | |
| "grad_norm": 1.1185239553451538, | |
| "learning_rate": 4.7132216025043714e-05, | |
| "loss": 0.0254, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.08299384178114638, | |
| "grad_norm": 1.1929640769958496, | |
| "learning_rate": 4.699692224988726e-05, | |
| "loss": 0.028, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.08413074372335386, | |
| "grad_norm": 1.2500364780426025, | |
| "learning_rate": 4.685871813557068e-05, | |
| "loss": 0.0288, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.08526764566556135, | |
| "grad_norm": 2.6409554481506348, | |
| "learning_rate": 4.671762239271875e-05, | |
| "loss": 0.0318, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.08640454760776883, | |
| "grad_norm": 1.37834632396698, | |
| "learning_rate": 4.6573654123436456e-05, | |
| "loss": 0.032, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.08754144954997631, | |
| "grad_norm": 1.6866464614868164, | |
| "learning_rate": 4.642683281872288e-05, | |
| "loss": 0.036, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.0886783514921838, | |
| "grad_norm": 1.586599588394165, | |
| "learning_rate": 4.6277178355832434e-05, | |
| "loss": 0.0315, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.08981525343439128, | |
| "grad_norm": 1.7033288478851318, | |
| "learning_rate": 4.6124710995583807e-05, | |
| "loss": 0.0401, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.09095215537659877, | |
| "grad_norm": 1.076962947845459, | |
| "learning_rate": 4.5969451379616945e-05, | |
| "loss": 0.0279, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.09208905731880625, | |
| "grad_norm": 1.5642842054367065, | |
| "learning_rate": 4.581142052759852e-05, | |
| "loss": 0.0335, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.09322595926101374, | |
| "grad_norm": 1.076861023902893, | |
| "learning_rate": 4.565063983437623e-05, | |
| "loss": 0.0273, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.09436286120322122, | |
| "grad_norm": 1.4380812644958496, | |
| "learning_rate": 4.548713106708222e-05, | |
| "loss": 0.035, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.09549976314542871, | |
| "grad_norm": 1.1404873132705688, | |
| "learning_rate": 4.532091636218621e-05, | |
| "loss": 0.0265, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.09663666508763619, | |
| "grad_norm": 1.5228550434112549, | |
| "learning_rate": 4.5152018222498574e-05, | |
| "loss": 0.0357, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.09777356702984367, | |
| "grad_norm": 1.139212727546692, | |
| "learning_rate": 4.498045951412377e-05, | |
| "loss": 0.0275, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.09891046897205116, | |
| "grad_norm": 1.1634677648544312, | |
| "learning_rate": 4.480626346336469e-05, | |
| "loss": 0.0278, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.10004737091425865, | |
| "grad_norm": 0.8284496665000916, | |
| "learning_rate": 4.462945365357815e-05, | |
| "loss": 0.0219, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.10118427285646613, | |
| "grad_norm": 1.2063112258911133, | |
| "learning_rate": 4.4450054021982115e-05, | |
| "loss": 0.0325, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.10232117479867361, | |
| "grad_norm": 1.024289608001709, | |
| "learning_rate": 4.426808885641496e-05, | |
| "loss": 0.0261, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1034580767408811, | |
| "grad_norm": 1.0136009454727173, | |
| "learning_rate": 4.408358279204729e-05, | |
| "loss": 0.0262, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.10459497868308858, | |
| "grad_norm": 1.2363288402557373, | |
| "learning_rate": 4.389656080804674e-05, | |
| "loss": 0.0254, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.10573188062529607, | |
| "grad_norm": 0.9798862338066101, | |
| "learning_rate": 4.370704822419616e-05, | |
| "loss": 0.0239, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.10686878256750355, | |
| "grad_norm": 1.3530546426773071, | |
| "learning_rate": 4.3515070697465805e-05, | |
| "loss": 0.0282, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.10800568450971104, | |
| "grad_norm": 2.1118574142456055, | |
| "learning_rate": 4.33206542185397e-05, | |
| "loss": 0.0391, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.10914258645191852, | |
| "grad_norm": 1.6785742044448853, | |
| "learning_rate": 4.3123825108296954e-05, | |
| "loss": 0.0326, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.110279488394126, | |
| "grad_norm": 1.7932261228561401, | |
| "learning_rate": 4.292461001424836e-05, | |
| "loss": 0.0337, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.11141639033633349, | |
| "grad_norm": 2.1050026416778564, | |
| "learning_rate": 4.272303590692872e-05, | |
| "loss": 0.0332, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.11255329227854098, | |
| "grad_norm": 1.169748306274414, | |
| "learning_rate": 4.251913007624543e-05, | |
| "loss": 0.0273, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.11369019422074846, | |
| "grad_norm": 1.7639108896255493, | |
| "learning_rate": 4.231292012778398e-05, | |
| "loss": 0.0367, | |
| "step": 100 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 26400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 30, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |