{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 10,
  "global_step": 261,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011494252873563218,
      "grad_norm": 0.45773613452911377,
      "learning_rate": 0.0,
      "loss": 1.065,
      "step": 1
    },
    {
      "epoch": 0.022988505747126436,
      "grad_norm": 0.4720604717731476,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 1.1025,
      "step": 2
    },
    {
      "epoch": 0.034482758620689655,
      "grad_norm": 0.4526257812976837,
      "learning_rate": 1.4814814814814815e-05,
      "loss": 1.067,
      "step": 3
    },
    {
      "epoch": 0.04597701149425287,
      "grad_norm": 0.44910383224487305,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 1.0694,
      "step": 4
    },
    {
      "epoch": 0.05747126436781609,
      "grad_norm": 0.4383523762226105,
      "learning_rate": 2.962962962962963e-05,
      "loss": 1.0767,
      "step": 5
    },
    {
      "epoch": 0.06896551724137931,
      "grad_norm": 0.4290314018726349,
      "learning_rate": 3.7037037037037037e-05,
      "loss": 1.0414,
      "step": 6
    },
    {
      "epoch": 0.08045977011494253,
      "grad_norm": 0.4098808467388153,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 1.021,
      "step": 7
    },
    {
      "epoch": 0.09195402298850575,
      "grad_norm": 0.4100661277770996,
      "learning_rate": 5.185185185185185e-05,
      "loss": 1.0113,
      "step": 8
    },
    {
      "epoch": 0.10344827586206896,
      "grad_norm": 0.37694185972213745,
      "learning_rate": 5.925925925925926e-05,
      "loss": 0.9566,
      "step": 9
    },
    {
      "epoch": 0.11494252873563218,
      "grad_norm": 0.3378658890724182,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.9539,
      "step": 10
    },
    {
      "epoch": 0.11494252873563218,
      "eval_loss": 0.9960598945617676,
      "eval_runtime": 605.6954,
      "eval_samples_per_second": 16.342,
      "eval_steps_per_second": 0.129,
      "step": 10
    },
    {
      "epoch": 0.12643678160919541,
      "grad_norm": 0.29339659214019775,
      "learning_rate": 7.407407407407407e-05,
      "loss": 0.9163,
      "step": 11
    },
    {
      "epoch": 0.13793103448275862,
      "grad_norm": 0.23120510578155518,
      "learning_rate": 8.148148148148148e-05,
      "loss": 0.8886,
      "step": 12
    },
    {
      "epoch": 0.14942528735632185,
      "grad_norm": 0.18651129305362701,
      "learning_rate": 8.888888888888889e-05,
      "loss": 0.8748,
      "step": 13
    },
    {
      "epoch": 0.16091954022988506,
      "grad_norm": 0.14475475251674652,
      "learning_rate": 9.62962962962963e-05,
      "loss": 0.8235,
      "step": 14
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 0.11758769303560257,
      "learning_rate": 0.0001037037037037037,
      "loss": 0.8539,
      "step": 15
    },
    {
      "epoch": 0.1839080459770115,
      "grad_norm": 0.10646044462919235,
      "learning_rate": 0.00011111111111111112,
      "loss": 0.8363,
      "step": 16
    },
    {
      "epoch": 0.19540229885057472,
      "grad_norm": 0.10539838671684265,
      "learning_rate": 0.00011851851851851852,
      "loss": 0.819,
      "step": 17
    },
    {
      "epoch": 0.20689655172413793,
      "grad_norm": 0.11887332051992416,
      "learning_rate": 0.00012592592592592592,
      "loss": 0.8069,
      "step": 18
    },
    {
      "epoch": 0.21839080459770116,
      "grad_norm": 0.1281956136226654,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.8042,
      "step": 19
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 0.1338774859905243,
      "learning_rate": 0.00014074074074074076,
      "loss": 0.8283,
      "step": 20
    },
    {
      "epoch": 0.22988505747126436,
      "eval_loss": 0.813089907169342,
      "eval_runtime": 601.317,
      "eval_samples_per_second": 16.461,
      "eval_steps_per_second": 0.13,
      "step": 20
    },
    {
      "epoch": 0.2413793103448276,
      "grad_norm": 0.12568038702011108,
      "learning_rate": 0.00014814814814814815,
      "loss": 0.8142,
      "step": 21
    },
    {
      "epoch": 0.25287356321839083,
      "grad_norm": 0.11528006941080093,
      "learning_rate": 0.00015555555555555556,
      "loss": 0.794,
      "step": 22
    },
    {
      "epoch": 0.26436781609195403,
      "grad_norm": 0.10474701225757599,
      "learning_rate": 0.00016296296296296295,
      "loss": 0.8231,
      "step": 23
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.09240291267633438,
      "learning_rate": 0.00017037037037037037,
      "loss": 0.7874,
      "step": 24
    },
    {
      "epoch": 0.28735632183908044,
      "grad_norm": 0.07213829457759857,
      "learning_rate": 0.00017777777777777779,
      "loss": 0.8075,
      "step": 25
    },
    {
      "epoch": 0.2988505747126437,
      "grad_norm": 0.0564056895673275,
      "learning_rate": 0.0001851851851851852,
      "loss": 0.779,
      "step": 26
    },
    {
      "epoch": 0.3103448275862069,
      "grad_norm": 0.04973220080137253,
      "learning_rate": 0.0001925925925925926,
      "loss": 0.7878,
      "step": 27
    },
    {
      "epoch": 0.3218390804597701,
      "grad_norm": 0.04492342844605446,
      "learning_rate": 0.0002,
      "loss": 0.7876,
      "step": 28
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.050321951508522034,
      "learning_rate": 0.00019999098778567212,
      "loss": 0.7686,
      "step": 29
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 0.05602027848362923,
      "learning_rate": 0.00019996395276708856,
      "loss": 0.7525,
      "step": 30
    },
    {
      "epoch": 0.3448275862068966,
      "eval_loss": 0.7719228863716125,
      "eval_runtime": 606.7346,
      "eval_samples_per_second": 16.314,
      "eval_steps_per_second": 0.129,
      "step": 30
    },
    {
      "epoch": 0.3563218390804598,
      "grad_norm": 0.055378761142492294,
      "learning_rate": 0.00019991889981715698,
      "loss": 0.777,
      "step": 31
    },
    {
      "epoch": 0.367816091954023,
      "grad_norm": 0.052891045808792114,
      "learning_rate": 0.00019985583705641418,
      "loss": 0.7797,
      "step": 32
    },
    {
      "epoch": 0.3793103448275862,
      "grad_norm": 0.044340070337057114,
      "learning_rate": 0.00019977477585156252,
      "loss": 0.7744,
      "step": 33
    },
    {
      "epoch": 0.39080459770114945,
      "grad_norm": 0.037772953510284424,
      "learning_rate": 0.00019967573081342103,
      "loss": 0.7631,
      "step": 34
    },
    {
      "epoch": 0.40229885057471265,
      "grad_norm": 0.03553105518221855,
      "learning_rate": 0.0001995587197942919,
      "loss": 0.7512,
      "step": 35
    },
    {
      "epoch": 0.41379310344827586,
      "grad_norm": 0.0346490740776062,
      "learning_rate": 0.0001994237638847428,
      "loss": 0.7567,
      "step": 36
    },
    {
      "epoch": 0.42528735632183906,
      "grad_norm": 0.03471450135111809,
      "learning_rate": 0.0001992708874098054,
      "loss": 0.7723,
      "step": 37
    },
    {
      "epoch": 0.4367816091954023,
      "grad_norm": 0.03868038207292557,
      "learning_rate": 0.00019910011792459087,
      "loss": 0.77,
      "step": 38
    },
    {
      "epoch": 0.4482758620689655,
      "grad_norm": 0.04141271859407425,
      "learning_rate": 0.00019891148620932318,
      "loss": 0.7764,
      "step": 39
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 0.04017995670437813,
      "learning_rate": 0.00019870502626379127,
      "loss": 0.7418,
      "step": 40
    },
    {
      "epoch": 0.45977011494252873,
      "eval_loss": 0.7480100393295288,
      "eval_runtime": 601.4446,
      "eval_samples_per_second": 16.457,
      "eval_steps_per_second": 0.13,
      "step": 40
    },
    {
      "epoch": 0.47126436781609193,
      "grad_norm": 0.04055652394890785,
      "learning_rate": 0.00019848077530122083,
      "loss": 0.7487,
      "step": 41
    },
    {
      "epoch": 0.4827586206896552,
      "grad_norm": 0.03968408331274986,
      "learning_rate": 0.00019823877374156647,
      "loss": 0.7417,
      "step": 42
    },
    {
      "epoch": 0.4942528735632184,
      "grad_norm": 0.03533465415239334,
      "learning_rate": 0.00019797906520422677,
      "loss": 0.7437,
      "step": 43
    },
    {
      "epoch": 0.5057471264367817,
      "grad_norm": 0.03385720029473305,
      "learning_rate": 0.00019770169650018172,
      "loss": 0.7752,
      "step": 44
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 0.030534420162439346,
      "learning_rate": 0.00019740671762355548,
      "loss": 0.7363,
      "step": 45
    },
    {
      "epoch": 0.5287356321839081,
      "grad_norm": 0.02837216667830944,
      "learning_rate": 0.0001970941817426052,
      "loss": 0.74,
      "step": 46
    },
    {
      "epoch": 0.5402298850574713,
      "grad_norm": 0.02879689820110798,
      "learning_rate": 0.00019676414519013781,
      "loss": 0.7246,
      "step": 47
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.02807699516415596,
      "learning_rate": 0.00019641666745335624,
      "loss": 0.7465,
      "step": 48
    },
    {
      "epoch": 0.5632183908045977,
      "grad_norm": 0.029897579923272133,
      "learning_rate": 0.00019605181116313724,
      "loss": 0.721,
      "step": 49
    },
    {
      "epoch": 0.5747126436781609,
      "grad_norm": 0.028458919376134872,
      "learning_rate": 0.00019566964208274254,
      "loss": 0.7291,
      "step": 50
    },
    {
      "epoch": 0.5747126436781609,
      "eval_loss": 0.7319945096969604,
      "eval_runtime": 601.7402,
      "eval_samples_per_second": 16.449,
      "eval_steps_per_second": 0.13,
      "step": 50
    },
    {
      "epoch": 0.5862068965517241,
      "grad_norm": 0.028347671031951904,
      "learning_rate": 0.00019527022909596536,
      "loss": 0.7396,
      "step": 51
    },
    {
      "epoch": 0.5977011494252874,
      "grad_norm": 0.028352508321404457,
      "learning_rate": 0.00019485364419471454,
      "loss": 0.733,
      "step": 52
    },
    {
      "epoch": 0.6091954022988506,
      "grad_norm": 0.025768019258975983,
      "learning_rate": 0.00019441996246603846,
      "loss": 0.7378,
      "step": 53
    },
    {
      "epoch": 0.6206896551724138,
      "grad_norm": 0.0259398240596056,
      "learning_rate": 0.00019396926207859084,
      "loss": 0.7293,
      "step": 54
    },
    {
      "epoch": 0.632183908045977,
      "grad_norm": 0.024649152532219887,
      "learning_rate": 0.0001935016242685415,
      "loss": 0.7287,
      "step": 55
    },
    {
      "epoch": 0.6436781609195402,
      "grad_norm": 0.026843328028917313,
      "learning_rate": 0.00019301713332493386,
      "loss": 0.7354,
      "step": 56
    },
    {
      "epoch": 0.6551724137931034,
      "grad_norm": 0.028928296640515327,
      "learning_rate": 0.00019251587657449236,
      "loss": 0.7376,
      "step": 57
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.027994588017463684,
      "learning_rate": 0.00019199794436588243,
      "loss": 0.7389,
      "step": 58
    },
    {
      "epoch": 0.6781609195402298,
      "grad_norm": 0.027422698214650154,
      "learning_rate": 0.00019146343005342547,
      "loss": 0.7275,
      "step": 59
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 0.02938409335911274,
      "learning_rate": 0.0001909124299802724,
      "loss": 0.7244,
      "step": 60
    },
    {
      "epoch": 0.6896551724137931,
      "eval_loss": 0.7189474701881409,
      "eval_runtime": 605.038,
      "eval_samples_per_second": 16.359,
      "eval_steps_per_second": 0.129,
      "step": 60
    },
    {
      "epoch": 0.7011494252873564,
      "grad_norm": 0.02854277938604355,
      "learning_rate": 0.00019034504346103823,
      "loss": 0.7339,
      "step": 61
    },
    {
      "epoch": 0.7126436781609196,
      "grad_norm": 0.028162814676761627,
      "learning_rate": 0.0001897613727639014,
      "loss": 0.7349,
      "step": 62
    },
    {
      "epoch": 0.7241379310344828,
      "grad_norm": 0.027026742696762085,
      "learning_rate": 0.0001891615230921703,
      "loss": 0.7008,
      "step": 63
    },
    {
      "epoch": 0.735632183908046,
      "grad_norm": 0.02867995575070381,
      "learning_rate": 0.000188545602565321,
      "loss": 0.7404,
      "step": 64
    },
    {
      "epoch": 0.7471264367816092,
      "grad_norm": 0.025570319965481758,
      "learning_rate": 0.00018791372219950948,
      "loss": 0.7203,
      "step": 65
    },
    {
      "epoch": 0.7586206896551724,
      "grad_norm": 0.026673492044210434,
      "learning_rate": 0.00018726599588756145,
      "loss": 0.7188,
      "step": 66
    },
    {
      "epoch": 0.7701149425287356,
      "grad_norm": 0.028060389682650566,
      "learning_rate": 0.00018660254037844388,
      "loss": 0.7129,
      "step": 67
    },
    {
      "epoch": 0.7816091954022989,
      "grad_norm": 0.025640789419412613,
      "learning_rate": 0.0001859234752562217,
      "loss": 0.7081,
      "step": 68
    },
    {
      "epoch": 0.7931034482758621,
      "grad_norm": 0.026264475658535957,
      "learning_rate": 0.00018522892291850335,
      "loss": 0.6971,
      "step": 69
    },
    {
      "epoch": 0.8045977011494253,
      "grad_norm": 0.028900163248181343,
      "learning_rate": 0.0001845190085543795,
      "loss": 0.721,
      "step": 70
    },
    {
      "epoch": 0.8045977011494253,
      "eval_loss": 0.7084506154060364,
      "eval_runtime": 603.3937,
      "eval_samples_per_second": 16.404,
      "eval_steps_per_second": 0.129,
      "step": 70
    },
    {
      "epoch": 0.8160919540229885,
      "grad_norm": 0.02878301776945591,
      "learning_rate": 0.00018379386012185814,
      "loss": 0.7192,
      "step": 71
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.028177903965115547,
      "learning_rate": 0.00018305360832480117,
      "loss": 0.7258,
      "step": 72
    },
    {
      "epoch": 0.8390804597701149,
      "grad_norm": 0.027689015492796898,
      "learning_rate": 0.00018229838658936564,
      "loss": 0.7327,
      "step": 73
    },
    {
      "epoch": 0.8505747126436781,
      "grad_norm": 0.027420515194535255,
      "learning_rate": 0.00018152833103995443,
      "loss": 0.7007,
      "step": 74
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 0.02628767490386963,
      "learning_rate": 0.0001807435804746807,
      "loss": 0.7106,
      "step": 75
    },
    {
      "epoch": 0.8735632183908046,
      "grad_norm": 0.027107784524559975,
      "learning_rate": 0.00017994427634035015,
      "loss": 0.7056,
      "step": 76
    },
    {
      "epoch": 0.8850574712643678,
      "grad_norm": 0.026692209765315056,
      "learning_rate": 0.0001791305627069662,
      "loss": 0.6898,
      "step": 77
    },
    {
      "epoch": 0.896551724137931,
      "grad_norm": 0.02739943191409111,
      "learning_rate": 0.00017830258624176225,
      "loss": 0.71,
      "step": 78
    },
    {
      "epoch": 0.9080459770114943,
      "grad_norm": 0.027159228920936584,
      "learning_rate": 0.00017746049618276545,
      "loss": 0.7104,
      "step": 79
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 0.027961738407611847,
      "learning_rate": 0.0001766044443118978,
      "loss": 0.6964,
      "step": 80
    },
    {
      "epoch": 0.9195402298850575,
      "eval_loss": 0.6998673677444458,
      "eval_runtime": 599.046,
      "eval_samples_per_second": 16.523,
      "eval_steps_per_second": 0.13,
      "step": 80
    },
    {
      "epoch": 0.9310344827586207,
      "grad_norm": 0.028458958491683006,
      "learning_rate": 0.00017573458492761801,
      "loss": 0.7048,
      "step": 81
    },
    {
      "epoch": 0.9425287356321839,
      "grad_norm": 0.0295415036380291,
      "learning_rate": 0.00017485107481711012,
      "loss": 0.7089,
      "step": 82
    },
    {
      "epoch": 0.9540229885057471,
      "grad_norm": 0.027641592547297478,
      "learning_rate": 0.00017395407322802372,
      "loss": 0.7184,
      "step": 83
    },
    {
      "epoch": 0.9655172413793104,
      "grad_norm": 0.029828142374753952,
      "learning_rate": 0.00017304374183977033,
      "loss": 0.6878,
      "step": 84
    },
    {
      "epoch": 0.9770114942528736,
      "grad_norm": 0.029184194281697273,
      "learning_rate": 0.00017212024473438147,
      "loss": 0.7223,
      "step": 85
    },
    {
      "epoch": 0.9885057471264368,
      "grad_norm": 0.02929309941828251,
      "learning_rate": 0.00017118374836693406,
      "loss": 0.6936,
      "step": 86
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.03450490161776543,
      "learning_rate": 0.00017023442153554777,
      "loss": 0.6906,
      "step": 87
    },
    {
      "epoch": 1.0114942528735633,
      "grad_norm": 0.03116275928914547,
      "learning_rate": 0.00016927243535095997,
      "loss": 0.6915,
      "step": 88
    },
    {
      "epoch": 1.0229885057471264,
      "grad_norm": 0.03145065903663635,
      "learning_rate": 0.00016829796320568416,
      "loss": 0.6792,
      "step": 89
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 0.03122427873313427,
      "learning_rate": 0.00016731118074275704,
      "loss": 0.6965,
      "step": 90
    },
    {
      "epoch": 1.0344827586206897,
      "eval_loss": 0.6928127408027649,
      "eval_runtime": 606.6499,
      "eval_samples_per_second": 16.316,
      "eval_steps_per_second": 0.129,
      "step": 90
    },
    {
      "epoch": 1.0459770114942528,
      "grad_norm": 0.030179064720869064,
      "learning_rate": 0.00016631226582407952,
      "loss": 0.6726,
      "step": 91
    },
    {
      "epoch": 1.0574712643678161,
      "grad_norm": 0.029219962656497955,
      "learning_rate": 0.0001653013984983585,
      "loss": 0.6792,
      "step": 92
    },
    {
      "epoch": 1.0689655172413792,
      "grad_norm": 0.02873355709016323,
      "learning_rate": 0.00016427876096865394,
      "loss": 0.6921,
      "step": 93
    },
    {
      "epoch": 1.0804597701149425,
      "grad_norm": 0.029455283656716347,
      "learning_rate": 0.00016324453755953773,
      "loss": 0.6829,
      "step": 94
    },
    {
      "epoch": 1.0919540229885056,
      "grad_norm": 0.030247965827584267,
      "learning_rate": 0.0001621989146838704,
      "loss": 0.6737,
      "step": 95
    },
    {
      "epoch": 1.103448275862069,
      "grad_norm": 0.029829107224941254,
      "learning_rate": 0.00016114208080920123,
      "loss": 0.6852,
      "step": 96
    },
    {
      "epoch": 1.1149425287356323,
      "grad_norm": 0.031125420704483986,
      "learning_rate": 0.0001600742264237979,
      "loss": 0.6863,
      "step": 97
    },
    {
      "epoch": 1.1264367816091954,
      "grad_norm": 0.029601775109767914,
      "learning_rate": 0.00015899554400231232,
      "loss": 0.6785,
      "step": 98
    },
    {
      "epoch": 1.1379310344827587,
      "grad_norm": 0.031090950593352318,
      "learning_rate": 0.0001579062279710879,
      "loss": 0.7001,
      "step": 99
    },
    {
      "epoch": 1.1494252873563218,
      "grad_norm": 0.030084125697612762,
      "learning_rate": 0.00015680647467311557,
      "loss": 0.6856,
      "step": 100
    },
    {
      "epoch": 1.1494252873563218,
      "eval_loss": 0.686887264251709,
      "eval_runtime": 599.4005,
      "eval_samples_per_second": 16.513,
      "eval_steps_per_second": 0.13,
      "step": 100
    },
    {
      "epoch": 1.160919540229885,
      "grad_norm": 0.03157448023557663,
      "learning_rate": 0.00015569648233264394,
      "loss": 0.6836,
      "step": 101
    },
    {
      "epoch": 1.1724137931034484,
      "grad_norm": 0.032123371958732605,
      "learning_rate": 0.00015457645101945046,
      "loss": 0.6827,
      "step": 102
    },
    {
      "epoch": 1.1839080459770115,
      "grad_norm": 0.03208347037434578,
      "learning_rate": 0.0001534465826127801,
      "loss": 0.6823,
      "step": 103
    },
    {
      "epoch": 1.1954022988505748,
      "grad_norm": 0.030994586646556854,
      "learning_rate": 0.00015230708076495775,
      "loss": 0.6729,
      "step": 104
    },
    {
      "epoch": 1.206896551724138,
      "grad_norm": 0.03224639222025871,
      "learning_rate": 0.00015115815086468102,
      "loss": 0.6778,
      "step": 105
    },
    {
      "epoch": 1.2183908045977012,
      "grad_norm": 0.030437005683779716,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.674,
      "step": 106
    },
    {
      "epoch": 1.2298850574712643,
      "grad_norm": 0.030820753425359726,
      "learning_rate": 0.00014883283692099112,
      "loss": 0.6799,
      "step": 107
    },
    {
      "epoch": 1.2413793103448276,
      "grad_norm": 0.03301486000418663,
      "learning_rate": 0.0001476568720021308,
      "loss": 0.6851,
      "step": 108
    },
    {
      "epoch": 1.2528735632183907,
      "grad_norm": 0.033183734863996506,
      "learning_rate": 0.00014647231720437686,
      "loss": 0.7094,
      "step": 109
    },
    {
      "epoch": 1.264367816091954,
      "grad_norm": 0.03154841437935829,
      "learning_rate": 0.00014527938603696376,
      "loss": 0.6812,
      "step": 110
    },
    {
      "epoch": 1.264367816091954,
      "eval_loss": 0.6802834868431091,
      "eval_runtime": 597.3843,
      "eval_samples_per_second": 16.569,
      "eval_steps_per_second": 0.131,
      "step": 110
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 0.03554477170109749,
      "learning_rate": 0.00014407829351891857,
      "loss": 0.679,
      "step": 111
    },
    {
      "epoch": 1.2873563218390804,
      "grad_norm": 0.03362204134464264,
      "learning_rate": 0.00014286925614030542,
      "loss": 0.6722,
      "step": 112
    },
    {
      "epoch": 1.2988505747126438,
      "grad_norm": 0.032853253185749054,
      "learning_rate": 0.00014165249182320402,
      "loss": 0.6879,
      "step": 113
    },
    {
      "epoch": 1.3103448275862069,
      "grad_norm": 0.033823542296886444,
      "learning_rate": 0.0001404282198824305,
      "loss": 0.6627,
      "step": 114
    },
    {
      "epoch": 1.3218390804597702,
      "grad_norm": 0.0326109379529953,
      "learning_rate": 0.00013919666098600753,
      "loss": 0.6751,
      "step": 115
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.03431953117251396,
      "learning_rate": 0.00013795803711538966,
      "loss": 0.6827,
      "step": 116
    },
    {
      "epoch": 1.3448275862068966,
      "grad_norm": 0.03328808769583702,
      "learning_rate": 0.00013671257152545277,
      "loss": 0.6648,
      "step": 117
    },
    {
      "epoch": 1.3563218390804597,
      "grad_norm": 0.035666704177856445,
      "learning_rate": 0.00013546048870425356,
      "loss": 0.6771,
      "step": 118
    },
    {
      "epoch": 1.367816091954023,
      "grad_norm": 0.03441452234983444,
      "learning_rate": 0.00013420201433256689,
      "loss": 0.6623,
      "step": 119
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 0.03363391384482384,
      "learning_rate": 0.00013293737524320797,
      "loss": 0.6734,
      "step": 120
    },
    {
      "epoch": 1.3793103448275863,
      "eval_loss": 0.6736027002334595,
      "eval_runtime": 601.0855,
      "eval_samples_per_second": 16.467,
      "eval_steps_per_second": 0.13,
      "step": 120
    },
    {
      "epoch": 1.3908045977011494,
      "grad_norm": 0.03441128134727478,
      "learning_rate": 0.00013166679938014726,
      "loss": 0.6625,
      "step": 121
    },
    {
      "epoch": 1.4022988505747127,
      "grad_norm": 0.03332269564270973,
      "learning_rate": 0.0001303905157574247,
      "loss": 0.6738,
      "step": 122
    },
    {
      "epoch": 1.4137931034482758,
      "grad_norm": 0.03411813825368881,
      "learning_rate": 0.00012910875441787128,
      "loss": 0.6511,
      "step": 123
    },
    {
      "epoch": 1.4252873563218391,
      "grad_norm": 0.034489769488573074,
      "learning_rate": 0.0001278217463916453,
      "loss": 0.6579,
      "step": 124
    },
    {
      "epoch": 1.4367816091954024,
      "grad_norm": 0.03562283143401146,
      "learning_rate": 0.0001265297236545901,
      "loss": 0.6508,
      "step": 125
    },
    {
      "epoch": 1.4482758620689655,
      "grad_norm": 0.034433409571647644,
      "learning_rate": 0.00012523291908642217,
      "loss": 0.6867,
      "step": 126
    },
    {
      "epoch": 1.4597701149425286,
      "grad_norm": 0.034091442823410034,
      "learning_rate": 0.0001239315664287558,
      "loss": 0.6609,
      "step": 127
    },
    {
      "epoch": 1.471264367816092,
      "grad_norm": 0.034659866243600845,
      "learning_rate": 0.00012262590024297225,
      "loss": 0.6708,
      "step": 128
    },
    {
      "epoch": 1.4827586206896552,
      "grad_norm": 0.03355137258768082,
      "learning_rate": 0.0001213161558679416,
      "loss": 0.6853,
      "step": 129
    },
    {
      "epoch": 1.4942528735632183,
      "grad_norm": 0.03497845306992531,
      "learning_rate": 0.00012000256937760445,
      "loss": 0.658,
      "step": 130
    },
    {
      "epoch": 1.4942528735632183,
      "eval_loss": 0.6697064638137817,
      "eval_runtime": 595.4108,
      "eval_samples_per_second": 16.624,
      "eval_steps_per_second": 0.131,
      "step": 130
    },
    {
      "epoch": 1.5057471264367817,
      "grad_norm": 0.03422487527132034,
      "learning_rate": 0.00011868537753842051,
      "loss": 0.6688,
      "step": 131
    },
    {
      "epoch": 1.5172413793103448,
      "grad_norm": 0.034450776875019073,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.6828,
      "step": 132
    },
    {
      "epoch": 1.528735632183908,
      "grad_norm": 0.034574706107378006,
      "learning_rate": 0.00011604112808577603,
      "loss": 0.6776,
      "step": 133
    },
    {
      "epoch": 1.5402298850574714,
      "grad_norm": 0.03596516326069832,
      "learning_rate": 0.00011471454708317162,
      "loss": 0.6276,
      "step": 134
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 0.035429686307907104,
      "learning_rate": 0.00011338531386752618,
      "loss": 0.6427,
      "step": 135
    },
    {
      "epoch": 1.5632183908045976,
      "grad_norm": 0.033536769449710846,
      "learning_rate": 0.0001120536680255323,
      "loss": 0.652,
      "step": 136
    },
    {
      "epoch": 1.5747126436781609,
      "grad_norm": 0.03417116403579712,
      "learning_rate": 0.00011071984957874479,
      "loss": 0.6725,
      "step": 137
    },
    {
      "epoch": 1.5862068965517242,
      "grad_norm": 0.03480486571788788,
      "learning_rate": 0.00010938409894031794,
      "loss": 0.6479,
      "step": 138
    },
    {
      "epoch": 1.5977011494252875,
      "grad_norm": 0.03562786802649498,
      "learning_rate": 0.00010804665687167262,
      "loss": 0.6681,
      "step": 139
    },
    {
      "epoch": 1.6091954022988506,
      "grad_norm": 0.034858204424381256,
      "learning_rate": 0.00010670776443910024,
      "loss": 0.6703,
      "step": 140
    },
    {
      "epoch": 1.6091954022988506,
      "eval_loss": 0.6661256551742554,
      "eval_runtime": 590.9429,
      "eval_samples_per_second": 16.75,
      "eval_steps_per_second": 0.132,
      "step": 140
    },
    {
      "epoch": 1.6206896551724137,
      "grad_norm": 0.03447253257036209,
      "learning_rate": 0.00010536766297031215,
      "loss": 0.6643,
      "step": 141
    },
    {
      "epoch": 1.632183908045977,
      "grad_norm": 0.03508715331554413,
      "learning_rate": 0.00010402659401094152,
      "loss": 0.6583,
      "step": 142
    },
    {
      "epoch": 1.6436781609195403,
      "grad_norm": 0.0343579463660717,
      "learning_rate": 0.00010268479928100614,
      "loss": 0.6753,
      "step": 143
    },
    {
      "epoch": 1.6551724137931034,
      "grad_norm": 0.0350567027926445,
      "learning_rate": 0.00010134252063133975,
      "loss": 0.6493,
      "step": 144
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.034686826169490814,
      "learning_rate": 0.0001,
      "loss": 0.6515,
      "step": 145
    },
    {
      "epoch": 1.6781609195402298,
      "grad_norm": 0.035702142864465714,
      "learning_rate": 9.865747936866027e-05,
      "loss": 0.6638,
      "step": 146
    },
    {
      "epoch": 1.6896551724137931,
      "grad_norm": 0.034140318632125854,
      "learning_rate": 9.73152007189939e-05,
      "loss": 0.6496,
      "step": 147
    },
    {
      "epoch": 1.7011494252873565,
      "grad_norm": 0.038083408027887344,
      "learning_rate": 9.597340598905852e-05,
      "loss": 0.6893,
      "step": 148
    },
    {
      "epoch": 1.7126436781609196,
      "grad_norm": 0.03810959309339523,
      "learning_rate": 9.463233702968783e-05,
      "loss": 0.6728,
      "step": 149
    },
    {
      "epoch": 1.7241379310344827,
      "grad_norm": 0.033978622406721115,
      "learning_rate": 9.329223556089975e-05,
      "loss": 0.6814,
      "step": 150
    },
    {
      "epoch": 1.7241379310344827,
      "eval_loss": 0.6631415486335754,
      "eval_runtime": 596.1091,
      "eval_samples_per_second": 16.604,
      "eval_steps_per_second": 0.131,
      "step": 150
    },
    {
      "epoch": 1.735632183908046,
      "grad_norm": 0.03391426429152489,
      "learning_rate": 9.195334312832742e-05,
      "loss": 0.6421,
      "step": 151
    },
    {
      "epoch": 1.7471264367816093,
      "grad_norm": 0.03562890738248825,
      "learning_rate": 9.061590105968208e-05,
      "loss": 0.6739,
      "step": 152
    },
    {
      "epoch": 1.7586206896551724,
      "grad_norm": 0.03706149384379387,
      "learning_rate": 8.928015042125523e-05,
      "loss": 0.6579,
      "step": 153
    },
    {
      "epoch": 1.7701149425287355,
      "grad_norm": 0.03591468557715416,
      "learning_rate": 8.79463319744677e-05,
      "loss": 0.645,
      "step": 154
    },
    {
      "epoch": 1.7816091954022988,
      "grad_norm": 0.03639413043856621,
      "learning_rate": 8.661468613247387e-05,
      "loss": 0.6594,
      "step": 155
    },
    {
      "epoch": 1.793103448275862,
      "grad_norm": 0.03598083183169365,
      "learning_rate": 8.528545291682838e-05,
      "loss": 0.6723,
      "step": 156
    },
    {
      "epoch": 1.8045977011494254,
      "grad_norm": 0.03628537803888321,
      "learning_rate": 8.395887191422397e-05,
      "loss": 0.6715,
      "step": 157
    },
    {
      "epoch": 1.8160919540229885,
      "grad_norm": 0.03826047480106354,
      "learning_rate": 8.263518223330697e-05,
      "loss": 0.6767,
      "step": 158
    },
    {
      "epoch": 1.8275862068965516,
      "grad_norm": 0.03769649192690849,
      "learning_rate": 8.131462246157953e-05,
      "loss": 0.6552,
      "step": 159
    },
    {
      "epoch": 1.839080459770115,
      "grad_norm": 0.03599262982606888,
      "learning_rate": 7.999743062239557e-05,
      "loss": 0.6688,
      "step": 160
    },
    {
      "epoch": 1.839080459770115,
      "eval_loss": 0.6607028245925903,
      "eval_runtime": 589.6737,
      "eval_samples_per_second": 16.786,
      "eval_steps_per_second": 0.132,
      "step": 160
    },
    {
      "epoch": 1.8505747126436782,
      "grad_norm": 0.03584510087966919,
      "learning_rate": 7.868384413205842e-05,
      "loss": 0.672,
      "step": 161
    },
    {
      "epoch": 1.8620689655172413,
      "grad_norm": 0.03545341268181801,
      "learning_rate": 7.73740997570278e-05,
      "loss": 0.6629,
      "step": 162
    },
    {
      "epoch": 1.8735632183908046,
      "grad_norm": 0.035858072340488434,
      "learning_rate": 7.606843357124426e-05,
      "loss": 0.6391,
      "step": 163
    },
    {
      "epoch": 1.8850574712643677,
      "grad_norm": 0.035794083029031754,
      "learning_rate": 7.476708091357782e-05,
      "loss": 0.6516,
      "step": 164
    },
    {
      "epoch": 1.896551724137931,
      "grad_norm": 0.03674920275807381,
      "learning_rate": 7.347027634540993e-05,
      "loss": 0.6624,
      "step": 165
    },
    {
      "epoch": 1.9080459770114944,
      "grad_norm": 0.034701887518167496,
      "learning_rate": 7.217825360835473e-05,
      "loss": 0.6574,
      "step": 166
    },
    {
      "epoch": 1.9195402298850575,
      "grad_norm": 0.03606560826301575,
      "learning_rate": 7.089124558212871e-05,
      "loss": 0.6668,
      "step": 167
    },
    {
      "epoch": 1.9310344827586206,
      "grad_norm": 0.03570757061243057,
      "learning_rate": 6.960948424257532e-05,
      "loss": 0.6358,
      "step": 168
    },
    {
      "epoch": 1.9425287356321839,
      "grad_norm": 0.034459397196769714,
      "learning_rate": 6.833320061985277e-05,
      "loss": 0.6294,
      "step": 169
    },
    {
      "epoch": 1.9540229885057472,
      "grad_norm": 0.035828083753585815,
      "learning_rate": 6.706262475679205e-05,
      "loss": 0.6781,
      "step": 170
    },
    {
      "epoch": 1.9540229885057472,
      "eval_loss": 0.6586677432060242,
      "eval_runtime": 590.832,
      "eval_samples_per_second": 16.753,
      "eval_steps_per_second": 0.132,
      "step": 170
    },
    {
      "epoch": 1.9655172413793105,
      "grad_norm": 0.03741836175322533,
      "learning_rate": 6.579798566743314e-05,
      "loss": 0.6679,
      "step": 171
    },
    {
      "epoch": 1.9770114942528736,
      "grad_norm": 0.03573041409254074,
      "learning_rate": 6.453951129574644e-05,
      "loss": 0.6805,
      "step": 172
    },
    {
      "epoch": 1.9885057471264367,
      "grad_norm": 0.03586418181657791,
      "learning_rate": 6.328742847454724e-05,
      "loss": 0.6606,
      "step": 173
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.03636249899864197,
      "learning_rate": 6.204196288461037e-05,
      "loss": 0.6962,
      "step": 174
    },
    {
      "epoch": 2.0114942528735633,
      "grad_norm": 0.04032210260629654,
      "learning_rate": 6.080333901399251e-05,
      "loss": 0.6426,
      "step": 175
    },
    {
      "epoch": 2.0229885057471266,
      "grad_norm": 0.039117176085710526,
      "learning_rate": 5.957178011756952e-05,
      "loss": 0.6418,
      "step": 176
    },
    {
      "epoch": 2.0344827586206895,
      "grad_norm": 0.037345871329307556,
      "learning_rate": 5.834750817679606e-05,
      "loss": 0.6406,
      "step": 177
    },
    {
      "epoch": 2.045977011494253,
      "grad_norm": 0.03830842301249504,
      "learning_rate": 5.713074385969457e-05,
      "loss": 0.6262,
      "step": 178
    },
    {
      "epoch": 2.057471264367816,
      "grad_norm": 0.037265144288539886,
      "learning_rate": 5.59217064810814e-05,
      "loss": 0.6385,
      "step": 179
    },
    {
      "epoch": 2.0689655172413794,
      "grad_norm": 0.03784786909818649,
      "learning_rate": 5.472061396303629e-05,
      "loss": 0.618,
      "step": 180
    },
    {
      "epoch": 2.0689655172413794,
      "eval_loss": 0.6581570506095886,
      "eval_runtime": 593.2384,
      "eval_samples_per_second": 16.685,
      "eval_steps_per_second": 0.131,
      "step": 180
    },
    {
      "epoch": 2.0804597701149423,
      "grad_norm": 0.03711731731891632,
      "learning_rate": 5.3527682795623146e-05,
      "loss": 0.6354,
      "step": 181
    },
    {
      "epoch": 2.0919540229885056,
      "grad_norm": 0.037277545779943466,
      "learning_rate": 5.234312799786921e-05,
      "loss": 0.631,
      "step": 182
    },
    {
      "epoch": 2.103448275862069,
      "grad_norm": 0.040995873510837555,
      "learning_rate": 5.116716307900893e-05,
      "loss": 0.631,
      "step": 183
    },
    {
      "epoch": 2.1149425287356323,
      "grad_norm": 0.038000449538230896,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.6691,
      "step": 184
    },
    {
      "epoch": 2.1264367816091956,
      "grad_norm": 0.03552795201539993,
      "learning_rate": 4.884184913531902e-05,
      "loss": 0.6147,
      "step": 185
    },
    {
      "epoch": 2.1379310344827585,
      "grad_norm": 0.03610774129629135,
      "learning_rate": 4.7692919235042255e-05,
      "loss": 0.6259,
      "step": 186
    },
    {
      "epoch": 2.1494252873563218,
      "grad_norm": 0.038160186260938644,
      "learning_rate": 4.6553417387219886e-05,
      "loss": 0.6375,
      "step": 187
    },
    {
      "epoch": 2.160919540229885,
      "grad_norm": 0.03843645006418228,
      "learning_rate": 4.542354898054953e-05,
      "loss": 0.6329,
      "step": 188
    },
    {
      "epoch": 2.1724137931034484,
      "grad_norm": 0.03745341673493385,
      "learning_rate": 4.430351766735609e-05,
      "loss": 0.6379,
      "step": 189
    },
    {
      "epoch": 2.1839080459770113,
      "grad_norm": 0.03651278465986252,
      "learning_rate": 4.3193525326884435e-05,
      "loss": 0.6575,
      "step": 190
    },
    {
      "epoch": 2.1839080459770113,
      "eval_loss": 0.657158374786377,
      "eval_runtime": 591.1387,
      "eval_samples_per_second": 16.744,
      "eval_steps_per_second": 0.132,
      "step": 190
    },
    {
      "epoch": 2.1954022988505746,
      "grad_norm": 0.03700386360287666,
      "learning_rate": 4.209377202891212e-05,
      "loss": 0.643,
      "step": 191
    },
    {
      "epoch": 2.206896551724138,
      "grad_norm": 0.038101959973573685,
      "learning_rate": 4.100445599768774e-05,
      "loss": 0.6349,
      "step": 192
    },
    {
      "epoch": 2.218390804597701,
      "grad_norm": 0.036965154111385345,
      "learning_rate": 3.99257735762021e-05,
      "loss": 0.6404,
      "step": 193
    },
    {
      "epoch": 2.2298850574712645,
      "grad_norm": 0.03756578266620636,
      "learning_rate": 3.885791919079878e-05,
      "loss": 0.6615,
      "step": 194
    },
    {
      "epoch": 2.2413793103448274,
      "grad_norm": 0.035037923604249954,
      "learning_rate": 3.7801085316129615e-05,
      "loss": 0.6237,
      "step": 195
    },
    {
      "epoch": 2.2528735632183907,
      "grad_norm": 0.03736288473010063,
      "learning_rate": 3.675546244046228e-05,
      "loss": 0.6482,
      "step": 196
    },
    {
      "epoch": 2.264367816091954,
      "grad_norm": 0.03758701682090759,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 0.6101,
      "step": 197
    },
    {
      "epoch": 2.2758620689655173,
      "grad_norm": 0.03676440194249153,
      "learning_rate": 3.469860150164152e-05,
      "loss": 0.6499,
      "step": 198
    },
    {
      "epoch": 2.2873563218390807,
      "grad_norm": 0.03628651425242424,
      "learning_rate": 3.36877341759205e-05,
      "loss": 0.629,
      "step": 199
    },
    {
      "epoch": 2.2988505747126435,
      "grad_norm": 0.03841525688767433,
      "learning_rate": 3.268881925724297e-05,
      "loss": 0.6372,
      "step": 200
    },
    {
      "epoch": 2.2988505747126435,
      "eval_loss": 0.6563421487808228,
      "eval_runtime": 591.4937,
      "eval_samples_per_second": 16.734,
      "eval_steps_per_second": 0.132,
      "step": 200
    },
    {
      "epoch": 2.310344827586207,
      "grad_norm": 0.03878968209028244,
      "learning_rate": 3.170203679431584e-05,
      "loss": 0.6417,
      "step": 201
    },
    {
      "epoch": 2.32183908045977,
      "grad_norm": 0.03762966766953468,
      "learning_rate": 3.072756464904006e-05,
      "loss": 0.6434,
      "step": 202
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.03814293071627617,
      "learning_rate": 2.976557846445225e-05,
      "loss": 0.643,
      "step": 203
    },
    {
      "epoch": 2.344827586206897,
      "grad_norm": 0.03540797904133797,
      "learning_rate": 2.881625163306596e-05,
      "loss": 0.6374,
      "step": 204
    },
    {
      "epoch": 2.3563218390804597,
      "grad_norm": 0.03709061071276665,
      "learning_rate": 2.7879755265618555e-05,
      "loss": 0.6418,
      "step": 205
    },
    {
      "epoch": 2.367816091954023,
      "grad_norm": 0.03803767263889313,
      "learning_rate": 2.6956258160229695e-05,
      "loss": 0.632,
      "step": 206
    },
    {
      "epoch": 2.3793103448275863,
      "grad_norm": 0.03654790297150612,
      "learning_rate": 2.6045926771976303e-05,
      "loss": 0.6538,
      "step": 207
    },
    {
      "epoch": 2.3908045977011496,
      "grad_norm": 0.03603474050760269,
      "learning_rate": 2.514892518288988e-05,
      "loss": 0.6145,
      "step": 208
    },
    {
      "epoch": 2.4022988505747125,
      "grad_norm": 0.036107271909713745,
      "learning_rate": 2.4265415072382016e-05,
      "loss": 0.6222,
      "step": 209
    },
    {
      "epoch": 2.413793103448276,
      "grad_norm": 0.036303840577602386,
      "learning_rate": 2.339555568810221e-05,
      "loss": 0.617,
      "step": 210
    },
    {
      "epoch": 2.413793103448276,
      "eval_loss": 0.6558452248573303,
      "eval_runtime": 595.1508,
      "eval_samples_per_second": 16.631,
      "eval_steps_per_second": 0.131,
      "step": 210
    },
    {
      "epoch": 2.425287356321839,
      "grad_norm": 0.037371959537267685,
      "learning_rate": 2.2539503817234553e-05,
      "loss": 0.654,
      "step": 211
    },
    {
      "epoch": 2.4367816091954024,
      "grad_norm": 0.037925343960523605,
      "learning_rate": 2.1697413758237784e-05,
      "loss": 0.6322,
      "step": 212
    },
    {
      "epoch": 2.4482758620689653,
      "grad_norm": 0.03855719789862633,
      "learning_rate": 2.0869437293033835e-05,
      "loss": 0.6335,
      "step": 213
    },
    {
      "epoch": 2.4597701149425286,
      "grad_norm": 0.037308286875486374,
      "learning_rate": 2.0055723659649904e-05,
      "loss": 0.6184,
      "step": 214
    },
    {
      "epoch": 2.471264367816092,
      "grad_norm": 0.03688681870698929,
      "learning_rate": 1.9256419525319313e-05,
      "loss": 0.627,
      "step": 215
    },
    {
      "epoch": 2.4827586206896552,
      "grad_norm": 0.038670193403959274,
      "learning_rate": 1.8471668960045574e-05,
      "loss": 0.6487,
      "step": 216
    },
    {
      "epoch": 2.4942528735632186,
      "grad_norm": 0.03826979547739029,
      "learning_rate": 1.7701613410634365e-05,
      "loss": 0.6337,
      "step": 217
    },
    {
      "epoch": 2.5057471264367814,
      "grad_norm": 0.03768225386738777,
      "learning_rate": 1.6946391675198836e-05,
      "loss": 0.6356,
      "step": 218
    },
    {
      "epoch": 2.5172413793103448,
      "grad_norm": 0.037600524723529816,
      "learning_rate": 1.620613987814189e-05,
      "loss": 0.6237,
      "step": 219
    },
    {
      "epoch": 2.528735632183908,
      "grad_norm": 0.036940865218639374,
      "learning_rate": 1.5480991445620542e-05,
      "loss": 0.6152,
      "step": 220
    },
    {
      "epoch": 2.528735632183908,
      "eval_loss": 0.655238926410675,
      "eval_runtime": 592.9652,
      "eval_samples_per_second": 16.692,
      "eval_steps_per_second": 0.132,
      "step": 220
    },
    {
      "epoch": 2.5402298850574714,
      "grad_norm": 0.03826668858528137,
      "learning_rate": 1.4771077081496654e-05,
      "loss": 0.6511,
      "step": 221
    },
    {
      "epoch": 2.5517241379310347,
      "grad_norm": 0.0361490473151207,
      "learning_rate": 1.4076524743778319e-05,
      "loss": 0.6311,
      "step": 222
    },
    {
      "epoch": 2.5632183908045976,
      "grad_norm": 0.03727561607956886,
      "learning_rate": 1.339745962155613e-05,
      "loss": 0.6324,
      "step": 223
    },
    {
      "epoch": 2.574712643678161,
      "grad_norm": 0.038558244705200195,
      "learning_rate": 1.2734004112438568e-05,
      "loss": 0.6568,
      "step": 224
    },
    {
      "epoch": 2.586206896551724,
      "grad_norm": 0.03707597777247429,
      "learning_rate": 1.2086277800490554e-05,
      "loss": 0.649,
      "step": 225
    },
    {
      "epoch": 2.5977011494252875,
      "grad_norm": 0.037075500935316086,
      "learning_rate": 1.1454397434679021e-05,
      "loss": 0.6603,
      "step": 226
    },
    {
      "epoch": 2.609195402298851,
      "grad_norm": 0.03692416474223137,
      "learning_rate": 1.083847690782972e-05,
      "loss": 0.6295,
      "step": 227
    },
    {
      "epoch": 2.6206896551724137,
      "grad_norm": 0.03675093874335289,
      "learning_rate": 1.0238627236098619e-05,
      "loss": 0.6209,
      "step": 228
    },
    {
      "epoch": 2.632183908045977,
      "grad_norm": 0.03640436753630638,
      "learning_rate": 9.65495653896179e-06,
      "loss": 0.6339,
      "step": 229
    },
    {
      "epoch": 2.6436781609195403,
      "grad_norm": 0.037452854216098785,
      "learning_rate": 9.08757001972762e-06,
      "loss": 0.6407,
      "step": 230
    },
    {
      "epoch": 2.6436781609195403,
      "eval_loss": 0.6549434065818787,
      "eval_runtime": 592.4603,
      "eval_samples_per_second": 16.707,
      "eval_steps_per_second": 0.132,
      "step": 230
    },
    {
      "epoch": 2.655172413793103,
      "grad_norm": 0.03795592859387398,
      "learning_rate": 8.536569946574546e-06,
      "loss": 0.6534,
      "step": 231
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.03788486495614052,
      "learning_rate": 8.002055634117578e-06,
      "loss": 0.6507,
      "step": 232
    },
    {
      "epoch": 2.67816091954023,
      "grad_norm": 0.03641374036669731,
      "learning_rate": 7.4841234255076495e-06,
      "loss": 0.6282,
      "step": 233
    },
    {
      "epoch": 2.689655172413793,
      "grad_norm": 0.03832123428583145,
      "learning_rate": 6.9828666750661795e-06,
      "loss": 0.6276,
      "step": 234
    },
    {
      "epoch": 2.7011494252873565,
      "grad_norm": 0.03677581995725632,
      "learning_rate": 6.498375731458528e-06,
      "loss": 0.639,
      "step": 235
    },
    {
      "epoch": 2.7126436781609193,
      "grad_norm": 0.03623311221599579,
      "learning_rate": 6.030737921409169e-06,
      "loss": 0.6425,
      "step": 236
    },
    {
      "epoch": 2.7241379310344827,
      "grad_norm": 0.036890316754579544,
      "learning_rate": 5.580037533961546e-06,
      "loss": 0.6438,
      "step": 237
    },
    {
      "epoch": 2.735632183908046,
      "grad_norm": 0.03658117353916168,
      "learning_rate": 5.146355805285452e-06,
      "loss": 0.6275,
      "step": 238
    },
    {
      "epoch": 2.7471264367816093,
      "grad_norm": 0.03849633410573006,
      "learning_rate": 4.729770904034647e-06,
      "loss": 0.6357,
      "step": 239
    },
    {
      "epoch": 2.7586206896551726,
      "grad_norm": 0.03720705956220627,
      "learning_rate": 4.3303579172574885e-06,
      "loss": 0.6498,
      "step": 240
    },
    {
      "epoch": 2.7586206896551726,
      "eval_loss": 0.6547917723655701,
      "eval_runtime": 593.6976,
      "eval_samples_per_second": 16.672,
      "eval_steps_per_second": 0.131,
      "step": 240
    },
    {
      "epoch": 2.7701149425287355,
      "grad_norm": 0.03713015094399452,
      "learning_rate": 3.948188836862776e-06,
      "loss": 0.63,
      "step": 241
    },
    {
      "epoch": 2.781609195402299,
      "grad_norm": 0.03703474998474121,
      "learning_rate": 3.5833325466437694e-06,
      "loss": 0.6267,
      "step": 242
    },
    {
      "epoch": 2.793103448275862,
      "grad_norm": 0.0391690619289875,
      "learning_rate": 3.2358548098621932e-06,
      "loss": 0.6297,
      "step": 243
    },
    {
      "epoch": 2.8045977011494254,
      "grad_norm": 0.03748522326350212,
      "learning_rate": 2.905818257394799e-06,
      "loss": 0.6289,
      "step": 244
    },
    {
      "epoch": 2.8160919540229887,
      "grad_norm": 0.036214690655469894,
      "learning_rate": 2.5932823764445392e-06,
      "loss": 0.6274,
      "step": 245
    },
    {
      "epoch": 2.8275862068965516,
      "grad_norm": 0.03697674348950386,
      "learning_rate": 2.2983034998182997e-06,
      "loss": 0.6205,
      "step": 246
    },
    {
      "epoch": 2.839080459770115,
      "grad_norm": 0.0381477065384388,
      "learning_rate": 2.0209347957732328e-06,
      "loss": 0.6585,
      "step": 247
    },
    {
      "epoch": 2.8505747126436782,
      "grad_norm": 0.03660466521978378,
      "learning_rate": 1.7612262584335237e-06,
      "loss": 0.6215,
      "step": 248
    },
    {
      "epoch": 2.862068965517241,
      "grad_norm": 0.03698161989450455,
      "learning_rate": 1.5192246987791981e-06,
      "loss": 0.6454,
      "step": 249
    },
    {
      "epoch": 2.873563218390805,
      "grad_norm": 0.037139616906642914,
      "learning_rate": 1.2949737362087156e-06,
      "loss": 0.6365,
      "step": 250
    },
    {
      "epoch": 2.873563218390805,
      "eval_loss": 0.6547266840934753,
      "eval_runtime": 593.9462,
      "eval_samples_per_second": 16.665,
      "eval_steps_per_second": 0.131,
      "step": 250
    },
    {
      "epoch": 2.8850574712643677,
      "grad_norm": 0.03719855844974518,
      "learning_rate": 1.0885137906768372e-06,
      "loss": 0.628,
      "step": 251
    },
    {
      "epoch": 2.896551724137931,
      "grad_norm": 0.03709800913929939,
      "learning_rate": 8.998820754091531e-07,
      "loss": 0.6447,
      "step": 252
    },
    {
      "epoch": 2.9080459770114944,
      "grad_norm": 0.03928203135728836,
      "learning_rate": 7.291125901946027e-07,
      "loss": 0.6615,
      "step": 253
    },
    {
      "epoch": 2.9195402298850572,
      "grad_norm": 0.03835693374276161,
      "learning_rate": 5.762361152572115e-07,
      "loss": 0.6382,
      "step": 254
    },
    {
      "epoch": 2.9310344827586206,
      "grad_norm": 0.037969332188367844,
      "learning_rate": 4.412802057081278e-07,
      "loss": 0.6339,
      "step": 255
    },
    {
      "epoch": 2.942528735632184,
      "grad_norm": 0.0369732528924942,
      "learning_rate": 3.2426918657900704e-07,
      "loss": 0.6557,
      "step": 256
    },
    {
      "epoch": 2.954022988505747,
      "grad_norm": 0.03954119607806206,
      "learning_rate": 2.2522414843748618e-07,
      "loss": 0.6419,
      "step": 257
    },
    {
      "epoch": 2.9655172413793105,
      "grad_norm": 0.03722945228219032,
      "learning_rate": 1.4416294358582384e-07,
      "loss": 0.6224,
      "step": 258
    },
    {
      "epoch": 2.9770114942528734,
      "grad_norm": 0.037498198449611664,
      "learning_rate": 8.110018284304133e-08,
      "loss": 0.6472,
      "step": 259
    },
    {
      "epoch": 2.9885057471264367,
      "grad_norm": 0.03706786781549454,
      "learning_rate": 3.60472329114625e-08,
      "loss": 0.6532,
      "step": 260
    },
    {
      "epoch": 2.9885057471264367,
      "eval_loss": 0.6546847820281982,
      "eval_runtime": 591.4156,
      "eval_samples_per_second": 16.736,
      "eval_steps_per_second": 0.132,
      "step": 260
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.0390935055911541,
      "learning_rate": 9.012214327897006e-09,
      "loss": 0.6416,
      "step": 261
    },
    {
      "epoch": 3.0,
      "step": 261,
      "total_flos": 4.878497427033686e+19,
      "train_loss": 0.6974607110023499,
      "train_runtime": 68284.7248,
      "train_samples_per_second": 3.914,
      "train_steps_per_second": 0.004
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 261,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.878497427033686e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}