{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.99968, "eval_steps": 500, "global_step": 1562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00064, "grad_norm": 48.05089569091797, "learning_rate": 0.0, "loss": 1.4121, "mean_token_accuracy": 0.6530859991908073, "num_tokens": 13040.0, "step": 1 }, { "epoch": 0.00128, "grad_norm": 26.102807998657227, "learning_rate": 6.329113924050633e-08, "loss": 1.5231, "mean_token_accuracy": 0.6276459023356438, "num_tokens": 26989.0, "step": 2 }, { "epoch": 0.00192, "grad_norm": 95.80143737792969, "learning_rate": 1.2658227848101266e-07, "loss": 1.4079, "mean_token_accuracy": 0.6467841640114784, "num_tokens": 37679.0, "step": 3 }, { "epoch": 0.00256, "grad_norm": 15.393555641174316, "learning_rate": 1.89873417721519e-07, "loss": 1.5839, "mean_token_accuracy": 0.6144984066486359, "num_tokens": 52503.0, "step": 4 }, { "epoch": 0.0032, "grad_norm": 76.43201446533203, "learning_rate": 2.5316455696202533e-07, "loss": 1.5376, "mean_token_accuracy": 0.614102203398943, "num_tokens": 62165.0, "step": 5 }, { "epoch": 0.00384, "grad_norm": 13.124272346496582, "learning_rate": 3.164556962025317e-07, "loss": 1.4651, "mean_token_accuracy": 0.6233282685279846, "num_tokens": 76003.0, "step": 6 }, { "epoch": 0.00448, "grad_norm": 22.9676570892334, "learning_rate": 3.79746835443038e-07, "loss": 1.5145, "mean_token_accuracy": 0.6122585535049438, "num_tokens": 92265.0, "step": 7 }, { "epoch": 0.00512, "grad_norm": 15.335000038146973, "learning_rate": 4.4303797468354435e-07, "loss": 1.4871, "mean_token_accuracy": 0.6254958733916283, "num_tokens": 106642.0, "step": 8 }, { "epoch": 0.00576, "grad_norm": 13.938623428344727, "learning_rate": 5.063291139240507e-07, "loss": 1.2911, "mean_token_accuracy": 0.6838521659374237, "num_tokens": 118655.0, "step": 9 }, { "epoch": 0.0064, "grad_norm": 12.838367462158203, "learning_rate": 5.69620253164557e-07, "loss": 1.5914, "mean_token_accuracy": 0.6144495904445648, "num_tokens": 133015.0, "step": 10 }, { "epoch": 0.00704, "grad_norm": 28.111896514892578, "learning_rate": 6.329113924050634e-07, "loss": 1.5112, "mean_token_accuracy": 0.6265368536114693, "num_tokens": 147398.0, "step": 11 }, { "epoch": 0.00768, "grad_norm": 86.63944244384766, "learning_rate": 6.962025316455696e-07, "loss": 1.3589, "mean_token_accuracy": 0.6383277997374535, "num_tokens": 158812.0, "step": 12 }, { "epoch": 0.00832, "grad_norm": 168.6136016845703, "learning_rate": 7.59493670886076e-07, "loss": 1.7851, "mean_token_accuracy": 0.5883053466677666, "num_tokens": 170703.0, "step": 13 }, { "epoch": 0.00896, "grad_norm": 217.70408630371094, "learning_rate": 8.227848101265823e-07, "loss": 1.4027, "mean_token_accuracy": 0.6471100524067879, "num_tokens": 184556.0, "step": 14 }, { "epoch": 0.0096, "grad_norm": 90.14910125732422, "learning_rate": 8.860759493670887e-07, "loss": 1.376, "mean_token_accuracy": 0.6501478627324104, "num_tokens": 202523.0, "step": 15 }, { "epoch": 0.01024, "grad_norm": 97.01738739013672, "learning_rate": 9.493670886075951e-07, "loss": 1.1443, "mean_token_accuracy": 0.6894906312227249, "num_tokens": 212189.0, "step": 16 }, { "epoch": 0.01088, "grad_norm": 64.89408874511719, "learning_rate": 1.0126582278481013e-06, "loss": 1.5423, "mean_token_accuracy": 0.6109918430447578, "num_tokens": 226375.0, "step": 17 }, { "epoch": 0.01152, "grad_norm": 82.11033630371094, "learning_rate": 1.0759493670886077e-06, "loss": 1.6578, "mean_token_accuracy": 0.6166552416980267, "num_tokens": 238636.0, "step": 18 }, { "epoch": 0.01216, "grad_norm": 102.00513458251953, "learning_rate": 1.139240506329114e-06, "loss": 1.3764, "mean_token_accuracy": 0.6483487188816071, "num_tokens": 250678.0, "step": 19 }, { "epoch": 0.0128, "grad_norm": 69.1679916381836, "learning_rate": 1.2025316455696204e-06, "loss": 1.5599, "mean_token_accuracy": 0.6092639714479446, "num_tokens": 266795.0, "step": 20 }, { "epoch": 0.01344, "grad_norm": 138.91578674316406, "learning_rate": 1.2658227848101267e-06, "loss": 1.4293, "mean_token_accuracy": 0.6301368102431297, "num_tokens": 280316.0, "step": 21 }, { "epoch": 0.01408, "grad_norm": 84.13861083984375, "learning_rate": 1.3291139240506329e-06, "loss": 1.5571, "mean_token_accuracy": 0.6376139968633652, "num_tokens": 292273.0, "step": 22 }, { "epoch": 0.01472, "grad_norm": 75.20580291748047, "learning_rate": 1.3924050632911392e-06, "loss": 1.7514, "mean_token_accuracy": 0.5854216478765011, "num_tokens": 304665.0, "step": 23 }, { "epoch": 0.01536, "grad_norm": 33.301204681396484, "learning_rate": 1.4556962025316456e-06, "loss": 1.445, "mean_token_accuracy": 0.6291614323854446, "num_tokens": 317582.0, "step": 24 }, { "epoch": 0.016, "grad_norm": 49.913997650146484, "learning_rate": 1.518987341772152e-06, "loss": 1.1659, "mean_token_accuracy": 0.6733755245804787, "num_tokens": 330930.0, "step": 25 }, { "epoch": 0.01664, "grad_norm": 16.423551559448242, "learning_rate": 1.5822784810126585e-06, "loss": 1.4189, "mean_token_accuracy": 0.6274379268288612, "num_tokens": 343897.0, "step": 26 }, { "epoch": 0.01728, "grad_norm": 48.93357467651367, "learning_rate": 1.6455696202531647e-06, "loss": 1.5449, "mean_token_accuracy": 0.6013954728841782, "num_tokens": 358229.0, "step": 27 }, { "epoch": 0.01792, "grad_norm": 41.11186981201172, "learning_rate": 1.708860759493671e-06, "loss": 1.4404, "mean_token_accuracy": 0.6342460885643959, "num_tokens": 369284.0, "step": 28 }, { "epoch": 0.01856, "grad_norm": 32.76636505126953, "learning_rate": 1.7721518987341774e-06, "loss": 1.5705, "mean_token_accuracy": 0.6268726661801338, "num_tokens": 384919.0, "step": 29 }, { "epoch": 0.0192, "grad_norm": 65.5134048461914, "learning_rate": 1.8354430379746838e-06, "loss": 1.4156, "mean_token_accuracy": 0.6435664221644402, "num_tokens": 399789.0, "step": 30 }, { "epoch": 0.01984, "grad_norm": 6.874727725982666, "learning_rate": 1.8987341772151901e-06, "loss": 1.3794, "mean_token_accuracy": 0.6731243506073952, "num_tokens": 410300.0, "step": 31 }, { "epoch": 0.02048, "grad_norm": 30.61371421813965, "learning_rate": 1.9620253164556965e-06, "loss": 1.4516, "mean_token_accuracy": 0.6296539008617401, "num_tokens": 425086.0, "step": 32 }, { "epoch": 0.02112, "grad_norm": 53.762176513671875, "learning_rate": 2.0253164556962026e-06, "loss": 1.3964, "mean_token_accuracy": 0.6357481107115746, "num_tokens": 438978.0, "step": 33 }, { "epoch": 0.02176, "grad_norm": 58.90037536621094, "learning_rate": 2.088607594936709e-06, "loss": 1.4799, "mean_token_accuracy": 0.619662769138813, "num_tokens": 451824.0, "step": 34 }, { "epoch": 0.0224, "grad_norm": 8.633367538452148, "learning_rate": 2.1518987341772153e-06, "loss": 1.4459, "mean_token_accuracy": 0.6490786001086235, "num_tokens": 465569.0, "step": 35 }, { "epoch": 0.02304, "grad_norm": 152.17538452148438, "learning_rate": 2.2151898734177215e-06, "loss": 1.5855, "mean_token_accuracy": 0.5879618301987648, "num_tokens": 476187.0, "step": 36 }, { "epoch": 0.02368, "grad_norm": 63.17410659790039, "learning_rate": 2.278481012658228e-06, "loss": 1.4941, "mean_token_accuracy": 0.6467924416065216, "num_tokens": 490658.0, "step": 37 }, { "epoch": 0.02432, "grad_norm": 40.93563461303711, "learning_rate": 2.341772151898734e-06, "loss": 1.2798, "mean_token_accuracy": 0.6463187485933304, "num_tokens": 500707.0, "step": 38 }, { "epoch": 0.02496, "grad_norm": 47.74807357788086, "learning_rate": 2.4050632911392408e-06, "loss": 1.682, "mean_token_accuracy": 0.5752647258341312, "num_tokens": 514924.0, "step": 39 }, { "epoch": 0.0256, "grad_norm": 8.83420181274414, "learning_rate": 2.4683544303797473e-06, "loss": 1.374, "mean_token_accuracy": 0.6248214021325111, "num_tokens": 529540.0, "step": 40 }, { "epoch": 0.02624, "grad_norm": 73.02564239501953, "learning_rate": 2.5316455696202535e-06, "loss": 1.4803, "mean_token_accuracy": 0.63168865442276, "num_tokens": 542779.0, "step": 41 }, { "epoch": 0.02688, "grad_norm": 58.30765914916992, "learning_rate": 2.5949367088607596e-06, "loss": 1.2061, "mean_token_accuracy": 0.6671500578522682, "num_tokens": 556396.0, "step": 42 }, { "epoch": 0.02752, "grad_norm": 33.67079162597656, "learning_rate": 2.6582278481012658e-06, "loss": 1.522, "mean_token_accuracy": 0.5986876226961613, "num_tokens": 569089.0, "step": 43 }, { "epoch": 0.02816, "grad_norm": 31.859474182128906, "learning_rate": 2.7215189873417724e-06, "loss": 1.3638, "mean_token_accuracy": 0.6643245741724968, "num_tokens": 583380.0, "step": 44 }, { "epoch": 0.0288, "grad_norm": 33.59089660644531, "learning_rate": 2.7848101265822785e-06, "loss": 1.4956, "mean_token_accuracy": 0.6372537463903427, "num_tokens": 596014.0, "step": 45 }, { "epoch": 0.02944, "grad_norm": 25.843647003173828, "learning_rate": 2.848101265822785e-06, "loss": 1.1846, "mean_token_accuracy": 0.6587028503417969, "num_tokens": 608555.0, "step": 46 }, { "epoch": 0.03008, "grad_norm": 5.101419925689697, "learning_rate": 2.9113924050632912e-06, "loss": 1.3947, "mean_token_accuracy": 0.6413091421127319, "num_tokens": 621134.0, "step": 47 }, { "epoch": 0.03072, "grad_norm": 40.20245361328125, "learning_rate": 2.9746835443037974e-06, "loss": 1.345, "mean_token_accuracy": 0.6560010835528374, "num_tokens": 635852.0, "step": 48 }, { "epoch": 0.03136, "grad_norm": 94.18912506103516, "learning_rate": 3.037974683544304e-06, "loss": 1.3185, "mean_token_accuracy": 0.6504691988229752, "num_tokens": 648238.0, "step": 49 }, { "epoch": 0.032, "grad_norm": 23.04238510131836, "learning_rate": 3.10126582278481e-06, "loss": 1.5373, "mean_token_accuracy": 0.6217592805624008, "num_tokens": 662824.0, "step": 50 }, { "epoch": 0.03264, "grad_norm": 56.0700569152832, "learning_rate": 3.164556962025317e-06, "loss": 1.2922, "mean_token_accuracy": 0.6528200656175613, "num_tokens": 675378.0, "step": 51 }, { "epoch": 0.03328, "grad_norm": 14.78956127166748, "learning_rate": 3.2278481012658232e-06, "loss": 1.5514, "mean_token_accuracy": 0.6194410622119904, "num_tokens": 689687.0, "step": 52 }, { "epoch": 0.03392, "grad_norm": 52.21746063232422, "learning_rate": 3.2911392405063294e-06, "loss": 1.5048, "mean_token_accuracy": 0.6074254661798477, "num_tokens": 702196.0, "step": 53 }, { "epoch": 0.03456, "grad_norm": 39.832069396972656, "learning_rate": 3.354430379746836e-06, "loss": 1.2862, "mean_token_accuracy": 0.6434847190976143, "num_tokens": 716114.0, "step": 54 }, { "epoch": 0.0352, "grad_norm": 13.459836959838867, "learning_rate": 3.417721518987342e-06, "loss": 1.6001, "mean_token_accuracy": 0.6024078205227852, "num_tokens": 729970.0, "step": 55 }, { "epoch": 0.03584, "grad_norm": 7.002089023590088, "learning_rate": 3.4810126582278487e-06, "loss": 1.2902, "mean_token_accuracy": 0.6684707179665565, "num_tokens": 742129.0, "step": 56 }, { "epoch": 0.03648, "grad_norm": 5.5110907554626465, "learning_rate": 3.544303797468355e-06, "loss": 1.287, "mean_token_accuracy": 0.6572533845901489, "num_tokens": 756207.0, "step": 57 }, { "epoch": 0.03712, "grad_norm": 19.13697052001953, "learning_rate": 3.607594936708861e-06, "loss": 1.3056, "mean_token_accuracy": 0.6451508551836014, "num_tokens": 768845.0, "step": 58 }, { "epoch": 0.03776, "grad_norm": 15.833284378051758, "learning_rate": 3.6708860759493675e-06, "loss": 1.738, "mean_token_accuracy": 0.5944546982645988, "num_tokens": 780788.0, "step": 59 }, { "epoch": 0.0384, "grad_norm": 20.870206832885742, "learning_rate": 3.7341772151898737e-06, "loss": 1.4071, "mean_token_accuracy": 0.6308668106794357, "num_tokens": 793426.0, "step": 60 }, { "epoch": 0.03904, "grad_norm": 16.34093475341797, "learning_rate": 3.7974683544303802e-06, "loss": 1.5468, "mean_token_accuracy": 0.627624161541462, "num_tokens": 804663.0, "step": 61 }, { "epoch": 0.03968, "grad_norm": 17.309402465820312, "learning_rate": 3.860759493670886e-06, "loss": 1.505, "mean_token_accuracy": 0.6210604161024094, "num_tokens": 818199.0, "step": 62 }, { "epoch": 0.04032, "grad_norm": 4.618063449859619, "learning_rate": 3.924050632911393e-06, "loss": 1.2564, "mean_token_accuracy": 0.664100281894207, "num_tokens": 830056.0, "step": 63 }, { "epoch": 0.04096, "grad_norm": 61.135398864746094, "learning_rate": 3.9873417721518995e-06, "loss": 1.2191, "mean_token_accuracy": 0.6882363706827164, "num_tokens": 844140.0, "step": 64 }, { "epoch": 0.0416, "grad_norm": 58.18097686767578, "learning_rate": 4.050632911392405e-06, "loss": 1.4123, "mean_token_accuracy": 0.6319537982344627, "num_tokens": 856421.0, "step": 65 }, { "epoch": 0.04224, "grad_norm": 4.918764591217041, "learning_rate": 4.113924050632912e-06, "loss": 1.342, "mean_token_accuracy": 0.6437618285417557, "num_tokens": 870219.0, "step": 66 }, { "epoch": 0.04288, "grad_norm": 6.8447585105896, "learning_rate": 4.177215189873418e-06, "loss": 1.2137, "mean_token_accuracy": 0.6827035769820213, "num_tokens": 880524.0, "step": 67 }, { "epoch": 0.04352, "grad_norm": 12.660514831542969, "learning_rate": 4.240506329113924e-06, "loss": 1.4528, "mean_token_accuracy": 0.6331579238176346, "num_tokens": 891958.0, "step": 68 }, { "epoch": 0.04416, "grad_norm": 7.19080924987793, "learning_rate": 4.303797468354431e-06, "loss": 1.4252, "mean_token_accuracy": 0.65432970225811, "num_tokens": 904908.0, "step": 69 }, { "epoch": 0.0448, "grad_norm": 3.9678971767425537, "learning_rate": 4.367088607594937e-06, "loss": 1.2971, "mean_token_accuracy": 0.6341002583503723, "num_tokens": 918681.0, "step": 70 }, { "epoch": 0.04544, "grad_norm": 7.076503276824951, "learning_rate": 4.430379746835443e-06, "loss": 1.2615, "mean_token_accuracy": 0.6656483858823776, "num_tokens": 928006.0, "step": 71 }, { "epoch": 0.04608, "grad_norm": 5.339262962341309, "learning_rate": 4.4936708860759495e-06, "loss": 1.344, "mean_token_accuracy": 0.6466359868645668, "num_tokens": 940426.0, "step": 72 }, { "epoch": 0.04672, "grad_norm": 7.0051703453063965, "learning_rate": 4.556962025316456e-06, "loss": 1.4216, "mean_token_accuracy": 0.630195863544941, "num_tokens": 954324.0, "step": 73 }, { "epoch": 0.04736, "grad_norm": 4.788408279418945, "learning_rate": 4.620253164556963e-06, "loss": 1.4698, "mean_token_accuracy": 0.614908404648304, "num_tokens": 967599.0, "step": 74 }, { "epoch": 0.048, "grad_norm": 5.4968366622924805, "learning_rate": 4.683544303797468e-06, "loss": 1.3819, "mean_token_accuracy": 0.6416184306144714, "num_tokens": 981616.0, "step": 75 }, { "epoch": 0.04864, "grad_norm": 4.646515369415283, "learning_rate": 4.746835443037975e-06, "loss": 1.4479, "mean_token_accuracy": 0.6559240221977234, "num_tokens": 993440.0, "step": 76 }, { "epoch": 0.04928, "grad_norm": 5.726164817810059, "learning_rate": 4.8101265822784815e-06, "loss": 1.3377, "mean_token_accuracy": 0.6394649744033813, "num_tokens": 1005930.0, "step": 77 }, { "epoch": 0.04992, "grad_norm": 7.1859588623046875, "learning_rate": 4.873417721518987e-06, "loss": 1.5267, "mean_token_accuracy": 0.6308169737458229, "num_tokens": 1018113.0, "step": 78 }, { "epoch": 0.05056, "grad_norm": 5.919410705566406, "learning_rate": 4.936708860759495e-06, "loss": 1.5063, "mean_token_accuracy": 0.6113156750798225, "num_tokens": 1030828.0, "step": 79 }, { "epoch": 0.0512, "grad_norm": 4.142421722412109, "learning_rate": 5e-06, "loss": 1.365, "mean_token_accuracy": 0.6484999246895313, "num_tokens": 1045699.0, "step": 80 }, { "epoch": 0.05184, "grad_norm": 5.720826625823975, "learning_rate": 5e-06, "loss": 1.324, "mean_token_accuracy": 0.6485482379794121, "num_tokens": 1060409.0, "step": 81 }, { "epoch": 0.05248, "grad_norm": 4.827797889709473, "learning_rate": 5e-06, "loss": 1.2039, "mean_token_accuracy": 0.6779980957508087, "num_tokens": 1072000.0, "step": 82 }, { "epoch": 0.05312, "grad_norm": 4.712104797363281, "learning_rate": 5e-06, "loss": 1.5722, "mean_token_accuracy": 0.6242717280983925, "num_tokens": 1085195.0, "step": 83 }, { "epoch": 0.05376, "grad_norm": 4.540091514587402, "learning_rate": 5e-06, "loss": 1.5155, "mean_token_accuracy": 0.633654311299324, "num_tokens": 1099571.0, "step": 84 }, { "epoch": 0.0544, "grad_norm": 5.3648905754089355, "learning_rate": 5e-06, "loss": 1.4137, "mean_token_accuracy": 0.6268021315336227, "num_tokens": 1114063.0, "step": 85 }, { "epoch": 0.05504, "grad_norm": 4.212844371795654, "learning_rate": 5e-06, "loss": 1.3986, "mean_token_accuracy": 0.638413742184639, "num_tokens": 1128456.0, "step": 86 }, { "epoch": 0.05568, "grad_norm": 4.77896785736084, "learning_rate": 5e-06, "loss": 1.3913, "mean_token_accuracy": 0.6454877629876137, "num_tokens": 1141977.0, "step": 87 }, { "epoch": 0.05632, "grad_norm": 6.540133953094482, "learning_rate": 5e-06, "loss": 1.1014, "mean_token_accuracy": 0.6967510804533958, "num_tokens": 1154057.0, "step": 88 }, { "epoch": 0.05696, "grad_norm": 3.7844600677490234, "learning_rate": 5e-06, "loss": 1.4211, "mean_token_accuracy": 0.6254619807004929, "num_tokens": 1169061.0, "step": 89 }, { "epoch": 0.0576, "grad_norm": 3.695892810821533, "learning_rate": 5e-06, "loss": 1.2861, "mean_token_accuracy": 0.6654118373990059, "num_tokens": 1182796.0, "step": 90 }, { "epoch": 0.05824, "grad_norm": 4.524760723114014, "learning_rate": 5e-06, "loss": 1.5883, "mean_token_accuracy": 0.6046858802437782, "num_tokens": 1196449.0, "step": 91 }, { "epoch": 0.05888, "grad_norm": 5.951873779296875, "learning_rate": 5e-06, "loss": 1.3191, "mean_token_accuracy": 0.6647541224956512, "num_tokens": 1209472.0, "step": 92 }, { "epoch": 0.05952, "grad_norm": 5.607054233551025, "learning_rate": 5e-06, "loss": 1.5771, "mean_token_accuracy": 0.6196302324533463, "num_tokens": 1222720.0, "step": 93 }, { "epoch": 0.06016, "grad_norm": 4.97398567199707, "learning_rate": 5e-06, "loss": 1.2972, "mean_token_accuracy": 0.6596869081258774, "num_tokens": 1236366.0, "step": 94 }, { "epoch": 0.0608, "grad_norm": 5.066143035888672, "learning_rate": 5e-06, "loss": 1.6685, "mean_token_accuracy": 0.6038380563259125, "num_tokens": 1248615.0, "step": 95 }, { "epoch": 0.06144, "grad_norm": 4.967097282409668, "learning_rate": 5e-06, "loss": 1.3559, "mean_token_accuracy": 0.649577222764492, "num_tokens": 1258892.0, "step": 96 }, { "epoch": 0.06208, "grad_norm": 3.9898176193237305, "learning_rate": 5e-06, "loss": 1.1218, "mean_token_accuracy": 0.6794590428471565, "num_tokens": 1272167.0, "step": 97 }, { "epoch": 0.06272, "grad_norm": 4.856038570404053, "learning_rate": 5e-06, "loss": 1.4458, "mean_token_accuracy": 0.6120704114437103, "num_tokens": 1284435.0, "step": 98 }, { "epoch": 0.06336, "grad_norm": 4.787650108337402, "learning_rate": 5e-06, "loss": 1.1262, "mean_token_accuracy": 0.7047765105962753, "num_tokens": 1295683.0, "step": 99 }, { "epoch": 0.064, "grad_norm": 4.880126953125, "learning_rate": 5e-06, "loss": 1.359, "mean_token_accuracy": 0.6566307917237282, "num_tokens": 1309253.0, "step": 100 }, { "epoch": 0.06464, "grad_norm": 4.704743385314941, "learning_rate": 5e-06, "loss": 1.3073, "mean_token_accuracy": 0.6889312416315079, "num_tokens": 1321838.0, "step": 101 }, { "epoch": 0.06528, "grad_norm": 4.521302700042725, "learning_rate": 5e-06, "loss": 1.199, "mean_token_accuracy": 0.6745252087712288, "num_tokens": 1333677.0, "step": 102 }, { "epoch": 0.06592, "grad_norm": 4.4061689376831055, "learning_rate": 5e-06, "loss": 1.2863, "mean_token_accuracy": 0.6601276621222496, "num_tokens": 1345912.0, "step": 103 }, { "epoch": 0.06656, "grad_norm": 4.12923002243042, "learning_rate": 5e-06, "loss": 1.3052, "mean_token_accuracy": 0.6593477055430412, "num_tokens": 1356535.0, "step": 104 }, { "epoch": 0.0672, "grad_norm": 4.265780448913574, "learning_rate": 5e-06, "loss": 1.5341, "mean_token_accuracy": 0.6348154991865158, "num_tokens": 1368522.0, "step": 105 }, { "epoch": 0.06784, "grad_norm": 4.388949394226074, "learning_rate": 5e-06, "loss": 1.3616, "mean_token_accuracy": 0.6796349138021469, "num_tokens": 1381247.0, "step": 106 }, { "epoch": 0.06848, "grad_norm": 4.523592948913574, "learning_rate": 5e-06, "loss": 1.4017, "mean_token_accuracy": 0.638551875948906, "num_tokens": 1392378.0, "step": 107 }, { "epoch": 0.06912, "grad_norm": 4.722465991973877, "learning_rate": 5e-06, "loss": 1.1751, "mean_token_accuracy": 0.6694681495428085, "num_tokens": 1404081.0, "step": 108 }, { "epoch": 0.06976, "grad_norm": 3.7663962841033936, "learning_rate": 5e-06, "loss": 1.2044, "mean_token_accuracy": 0.6716165691614151, "num_tokens": 1417942.0, "step": 109 }, { "epoch": 0.0704, "grad_norm": 3.8090057373046875, "learning_rate": 5e-06, "loss": 1.4822, "mean_token_accuracy": 0.6303943246603012, "num_tokens": 1429633.0, "step": 110 }, { "epoch": 0.07104, "grad_norm": 4.707150936126709, "learning_rate": 5e-06, "loss": 1.4048, "mean_token_accuracy": 0.661470353603363, "num_tokens": 1439918.0, "step": 111 }, { "epoch": 0.07168, "grad_norm": 4.384817600250244, "learning_rate": 5e-06, "loss": 1.3427, "mean_token_accuracy": 0.6572804600000381, "num_tokens": 1451652.0, "step": 112 }, { "epoch": 0.07232, "grad_norm": 3.9072980880737305, "learning_rate": 5e-06, "loss": 1.3317, "mean_token_accuracy": 0.6496234610676765, "num_tokens": 1466346.0, "step": 113 }, { "epoch": 0.07296, "grad_norm": 3.8167197704315186, "learning_rate": 5e-06, "loss": 1.5108, "mean_token_accuracy": 0.619833417236805, "num_tokens": 1482241.0, "step": 114 }, { "epoch": 0.0736, "grad_norm": 3.857537031173706, "learning_rate": 5e-06, "loss": 1.3917, "mean_token_accuracy": 0.6645993143320084, "num_tokens": 1495196.0, "step": 115 }, { "epoch": 0.07424, "grad_norm": 4.024837970733643, "learning_rate": 5e-06, "loss": 1.3252, "mean_token_accuracy": 0.6474068984389305, "num_tokens": 1508711.0, "step": 116 }, { "epoch": 0.07488, "grad_norm": 3.6451432704925537, "learning_rate": 5e-06, "loss": 1.3883, "mean_token_accuracy": 0.658332034945488, "num_tokens": 1521622.0, "step": 117 }, { "epoch": 0.07552, "grad_norm": 3.7489166259765625, "learning_rate": 5e-06, "loss": 1.4337, "mean_token_accuracy": 0.6373646706342697, "num_tokens": 1536109.0, "step": 118 }, { "epoch": 0.07616, "grad_norm": 4.419317245483398, "learning_rate": 5e-06, "loss": 1.6063, "mean_token_accuracy": 0.5973443016409874, "num_tokens": 1548521.0, "step": 119 }, { "epoch": 0.0768, "grad_norm": 3.8151636123657227, "learning_rate": 5e-06, "loss": 1.2434, "mean_token_accuracy": 0.6510246470570564, "num_tokens": 1562145.0, "step": 120 }, { "epoch": 0.07744, "grad_norm": 4.26577091217041, "learning_rate": 5e-06, "loss": 1.5936, "mean_token_accuracy": 0.6028061434626579, "num_tokens": 1575331.0, "step": 121 }, { "epoch": 0.07808, "grad_norm": 4.482457637786865, "learning_rate": 5e-06, "loss": 1.3789, "mean_token_accuracy": 0.6534934043884277, "num_tokens": 1587643.0, "step": 122 }, { "epoch": 0.07872, "grad_norm": 3.56472110748291, "learning_rate": 5e-06, "loss": 1.4365, "mean_token_accuracy": 0.636934220790863, "num_tokens": 1602307.0, "step": 123 }, { "epoch": 0.07936, "grad_norm": 3.643859386444092, "learning_rate": 5e-06, "loss": 1.3166, "mean_token_accuracy": 0.6632036790251732, "num_tokens": 1616771.0, "step": 124 }, { "epoch": 0.08, "grad_norm": 3.907698154449463, "learning_rate": 5e-06, "loss": 1.347, "mean_token_accuracy": 0.6501271575689316, "num_tokens": 1628788.0, "step": 125 }, { "epoch": 0.08064, "grad_norm": 3.952827215194702, "learning_rate": 5e-06, "loss": 1.5638, "mean_token_accuracy": 0.6289772987365723, "num_tokens": 1643292.0, "step": 126 }, { "epoch": 0.08128, "grad_norm": 3.829796314239502, "learning_rate": 5e-06, "loss": 1.4867, "mean_token_accuracy": 0.6409079134464264, "num_tokens": 1657859.0, "step": 127 }, { "epoch": 0.08192, "grad_norm": 3.4832980632781982, "learning_rate": 5e-06, "loss": 1.1756, "mean_token_accuracy": 0.6971960365772247, "num_tokens": 1672891.0, "step": 128 }, { "epoch": 0.08256, "grad_norm": 4.326021671295166, "learning_rate": 5e-06, "loss": 1.3608, "mean_token_accuracy": 0.663967490196228, "num_tokens": 1685243.0, "step": 129 }, { "epoch": 0.0832, "grad_norm": 3.8590521812438965, "learning_rate": 5e-06, "loss": 1.3535, "mean_token_accuracy": 0.6475187167525291, "num_tokens": 1699220.0, "step": 130 }, { "epoch": 0.08384, "grad_norm": 4.005199432373047, "learning_rate": 5e-06, "loss": 1.5247, "mean_token_accuracy": 0.6049772128462791, "num_tokens": 1711827.0, "step": 131 }, { "epoch": 0.08448, "grad_norm": 5.4232378005981445, "learning_rate": 5e-06, "loss": 1.4393, "mean_token_accuracy": 0.6452240273356438, "num_tokens": 1722307.0, "step": 132 }, { "epoch": 0.08512, "grad_norm": 3.7561964988708496, "learning_rate": 5e-06, "loss": 1.3973, "mean_token_accuracy": 0.643314465880394, "num_tokens": 1735760.0, "step": 133 }, { "epoch": 0.08576, "grad_norm": 4.557453155517578, "learning_rate": 5e-06, "loss": 1.4625, "mean_token_accuracy": 0.6588743627071381, "num_tokens": 1749840.0, "step": 134 }, { "epoch": 0.0864, "grad_norm": 4.375631809234619, "learning_rate": 5e-06, "loss": 1.3369, "mean_token_accuracy": 0.6503657773137093, "num_tokens": 1764508.0, "step": 135 }, { "epoch": 0.08704, "grad_norm": 3.6710991859436035, "learning_rate": 5e-06, "loss": 1.3455, "mean_token_accuracy": 0.6592239439487457, "num_tokens": 1777991.0, "step": 136 }, { "epoch": 0.08768, "grad_norm": 4.055100440979004, "learning_rate": 5e-06, "loss": 1.2004, "mean_token_accuracy": 0.6970224753022194, "num_tokens": 1790908.0, "step": 137 }, { "epoch": 0.08832, "grad_norm": 3.4759104251861572, "learning_rate": 5e-06, "loss": 1.3996, "mean_token_accuracy": 0.6468167528510094, "num_tokens": 1807180.0, "step": 138 }, { "epoch": 0.08896, "grad_norm": 4.201884746551514, "learning_rate": 5e-06, "loss": 1.36, "mean_token_accuracy": 0.645078033208847, "num_tokens": 1819408.0, "step": 139 }, { "epoch": 0.0896, "grad_norm": 4.253586769104004, "learning_rate": 5e-06, "loss": 1.555, "mean_token_accuracy": 0.6383631229400635, "num_tokens": 1832773.0, "step": 140 }, { "epoch": 0.09024, "grad_norm": 3.354541063308716, "learning_rate": 5e-06, "loss": 1.2203, "mean_token_accuracy": 0.6814669519662857, "num_tokens": 1848152.0, "step": 141 }, { "epoch": 0.09088, "grad_norm": 3.436411142349243, "learning_rate": 5e-06, "loss": 1.4883, "mean_token_accuracy": 0.6492328196763992, "num_tokens": 1863701.0, "step": 142 }, { "epoch": 0.09152, "grad_norm": 4.413644790649414, "learning_rate": 5e-06, "loss": 1.0295, "mean_token_accuracy": 0.7075678631663322, "num_tokens": 1872939.0, "step": 143 }, { "epoch": 0.09216, "grad_norm": 5.079326152801514, "learning_rate": 5e-06, "loss": 1.3153, "mean_token_accuracy": 0.6815094351768494, "num_tokens": 1881487.0, "step": 144 }, { "epoch": 0.0928, "grad_norm": 4.065243721008301, "learning_rate": 5e-06, "loss": 1.3596, "mean_token_accuracy": 0.6275613754987717, "num_tokens": 1892857.0, "step": 145 }, { "epoch": 0.09344, "grad_norm": 3.9777028560638428, "learning_rate": 5e-06, "loss": 1.1708, "mean_token_accuracy": 0.7008514180779457, "num_tokens": 1905468.0, "step": 146 }, { "epoch": 0.09408, "grad_norm": 3.9590489864349365, "learning_rate": 5e-06, "loss": 1.3645, "mean_token_accuracy": 0.645123079419136, "num_tokens": 1921339.0, "step": 147 }, { "epoch": 0.09472, "grad_norm": 4.232624053955078, "learning_rate": 5e-06, "loss": 1.4376, "mean_token_accuracy": 0.6167034581303596, "num_tokens": 1933922.0, "step": 148 }, { "epoch": 0.09536, "grad_norm": 4.538359642028809, "learning_rate": 5e-06, "loss": 1.2695, "mean_token_accuracy": 0.6443182751536369, "num_tokens": 1946327.0, "step": 149 }, { "epoch": 0.096, "grad_norm": 3.987658977508545, "learning_rate": 5e-06, "loss": 1.165, "mean_token_accuracy": 0.700744241476059, "num_tokens": 1958145.0, "step": 150 }, { "epoch": 0.09664, "grad_norm": 5.451640605926514, "learning_rate": 5e-06, "loss": 1.1847, "mean_token_accuracy": 0.6844369322061539, "num_tokens": 1968466.0, "step": 151 }, { "epoch": 0.09728, "grad_norm": 3.7554731369018555, "learning_rate": 5e-06, "loss": 1.3363, "mean_token_accuracy": 0.6585431769490242, "num_tokens": 1981386.0, "step": 152 }, { "epoch": 0.09792, "grad_norm": 3.601236581802368, "learning_rate": 5e-06, "loss": 1.3988, "mean_token_accuracy": 0.6534986943006516, "num_tokens": 1995319.0, "step": 153 }, { "epoch": 0.09856, "grad_norm": 3.569467306137085, "learning_rate": 5e-06, "loss": 1.1972, "mean_token_accuracy": 0.6820317879319191, "num_tokens": 2008019.0, "step": 154 }, { "epoch": 0.0992, "grad_norm": 3.896125078201294, "learning_rate": 5e-06, "loss": 1.3651, "mean_token_accuracy": 0.6400049701333046, "num_tokens": 2021300.0, "step": 155 }, { "epoch": 0.09984, "grad_norm": 3.486210584640503, "learning_rate": 5e-06, "loss": 1.3398, "mean_token_accuracy": 0.6543328985571861, "num_tokens": 2033964.0, "step": 156 }, { "epoch": 0.10048, "grad_norm": 3.03397274017334, "learning_rate": 5e-06, "loss": 1.4379, "mean_token_accuracy": 0.6368994787335396, "num_tokens": 2051392.0, "step": 157 }, { "epoch": 0.10112, "grad_norm": 3.8133559226989746, "learning_rate": 5e-06, "loss": 1.4191, "mean_token_accuracy": 0.6660285517573357, "num_tokens": 2063899.0, "step": 158 }, { "epoch": 0.10176, "grad_norm": 2.894871234893799, "learning_rate": 5e-06, "loss": 1.149, "mean_token_accuracy": 0.687338799238205, "num_tokens": 2081505.0, "step": 159 }, { "epoch": 0.1024, "grad_norm": 4.369359016418457, "learning_rate": 5e-06, "loss": 1.3478, "mean_token_accuracy": 0.6496308445930481, "num_tokens": 2092604.0, "step": 160 }, { "epoch": 0.10304, "grad_norm": 4.516582489013672, "learning_rate": 5e-06, "loss": 1.3535, "mean_token_accuracy": 0.6436882838606834, "num_tokens": 2103256.0, "step": 161 }, { "epoch": 0.10368, "grad_norm": 3.317488431930542, "learning_rate": 5e-06, "loss": 1.3131, "mean_token_accuracy": 0.6673252284526825, "num_tokens": 2119060.0, "step": 162 }, { "epoch": 0.10432, "grad_norm": 4.195248603820801, "learning_rate": 5e-06, "loss": 1.5371, "mean_token_accuracy": 0.6269242167472839, "num_tokens": 2131564.0, "step": 163 }, { "epoch": 0.10496, "grad_norm": 4.055263042449951, "learning_rate": 5e-06, "loss": 1.2917, "mean_token_accuracy": 0.672488197684288, "num_tokens": 2144473.0, "step": 164 }, { "epoch": 0.1056, "grad_norm": 3.9197511672973633, "learning_rate": 5e-06, "loss": 1.3064, "mean_token_accuracy": 0.664051964879036, "num_tokens": 2157277.0, "step": 165 }, { "epoch": 0.10624, "grad_norm": 4.073387145996094, "learning_rate": 5e-06, "loss": 1.3085, "mean_token_accuracy": 0.6389222107827663, "num_tokens": 2168765.0, "step": 166 }, { "epoch": 0.10688, "grad_norm": 3.508542060852051, "learning_rate": 5e-06, "loss": 1.2401, "mean_token_accuracy": 0.6622222438454628, "num_tokens": 2182480.0, "step": 167 }, { "epoch": 0.10752, "grad_norm": 5.038687229156494, "learning_rate": 5e-06, "loss": 1.3458, "mean_token_accuracy": 0.6548017933964729, "num_tokens": 2192216.0, "step": 168 }, { "epoch": 0.10816, "grad_norm": 3.743532180786133, "learning_rate": 5e-06, "loss": 1.3079, "mean_token_accuracy": 0.675239585340023, "num_tokens": 2205231.0, "step": 169 }, { "epoch": 0.1088, "grad_norm": 3.9550719261169434, "learning_rate": 5e-06, "loss": 1.4297, "mean_token_accuracy": 0.6310381144285202, "num_tokens": 2219210.0, "step": 170 }, { "epoch": 0.10944, "grad_norm": 3.988621950149536, "learning_rate": 5e-06, "loss": 1.2593, "mean_token_accuracy": 0.6804088428616524, "num_tokens": 2232718.0, "step": 171 }, { "epoch": 0.11008, "grad_norm": 4.214746475219727, "learning_rate": 5e-06, "loss": 1.1987, "mean_token_accuracy": 0.6554268151521683, "num_tokens": 2244509.0, "step": 172 }, { "epoch": 0.11072, "grad_norm": 4.047118186950684, "learning_rate": 5e-06, "loss": 1.3908, "mean_token_accuracy": 0.6510942876338959, "num_tokens": 2256799.0, "step": 173 }, { "epoch": 0.11136, "grad_norm": 4.169956207275391, "learning_rate": 5e-06, "loss": 1.1637, "mean_token_accuracy": 0.6976971700787544, "num_tokens": 2267854.0, "step": 174 }, { "epoch": 0.112, "grad_norm": 4.0025434494018555, "learning_rate": 5e-06, "loss": 1.2681, "mean_token_accuracy": 0.6660499349236488, "num_tokens": 2281207.0, "step": 175 }, { "epoch": 0.11264, "grad_norm": 3.6148102283477783, "learning_rate": 5e-06, "loss": 1.0893, "mean_token_accuracy": 0.6946917325258255, "num_tokens": 2294364.0, "step": 176 }, { "epoch": 0.11328, "grad_norm": 4.246650695800781, "learning_rate": 5e-06, "loss": 1.3055, "mean_token_accuracy": 0.6432985439896584, "num_tokens": 2304580.0, "step": 177 }, { "epoch": 0.11392, "grad_norm": 3.6579151153564453, "learning_rate": 5e-06, "loss": 1.3814, "mean_token_accuracy": 0.6313494071364403, "num_tokens": 2319903.0, "step": 178 }, { "epoch": 0.11456, "grad_norm": 3.988365411758423, "learning_rate": 5e-06, "loss": 1.1713, "mean_token_accuracy": 0.6748137697577477, "num_tokens": 2334193.0, "step": 179 }, { "epoch": 0.1152, "grad_norm": 4.839256286621094, "learning_rate": 5e-06, "loss": 1.6099, "mean_token_accuracy": 0.642399325966835, "num_tokens": 2344137.0, "step": 180 }, { "epoch": 0.11584, "grad_norm": 3.8175253868103027, "learning_rate": 5e-06, "loss": 1.3848, "mean_token_accuracy": 0.6434383615851402, "num_tokens": 2356787.0, "step": 181 }, { "epoch": 0.11648, "grad_norm": 4.244999885559082, "learning_rate": 5e-06, "loss": 1.3926, "mean_token_accuracy": 0.6472559943795204, "num_tokens": 2369124.0, "step": 182 }, { "epoch": 0.11712, "grad_norm": 3.850306749343872, "learning_rate": 5e-06, "loss": 1.266, "mean_token_accuracy": 0.6437094509601593, "num_tokens": 2383314.0, "step": 183 }, { "epoch": 0.11776, "grad_norm": 5.292626857757568, "learning_rate": 5e-06, "loss": 1.7328, "mean_token_accuracy": 0.5816368944942951, "num_tokens": 2392822.0, "step": 184 }, { "epoch": 0.1184, "grad_norm": 4.827669620513916, "learning_rate": 5e-06, "loss": 1.6344, "mean_token_accuracy": 0.6154987290501595, "num_tokens": 2404362.0, "step": 185 }, { "epoch": 0.11904, "grad_norm": 4.1474995613098145, "learning_rate": 5e-06, "loss": 1.3776, "mean_token_accuracy": 0.6394111067056656, "num_tokens": 2415206.0, "step": 186 }, { "epoch": 0.11968, "grad_norm": 4.1867995262146, "learning_rate": 5e-06, "loss": 1.1555, "mean_token_accuracy": 0.6839761063456535, "num_tokens": 2428366.0, "step": 187 }, { "epoch": 0.12032, "grad_norm": 3.8448567390441895, "learning_rate": 5e-06, "loss": 1.3755, "mean_token_accuracy": 0.6562356427311897, "num_tokens": 2440401.0, "step": 188 }, { "epoch": 0.12096, "grad_norm": 3.82326078414917, "learning_rate": 5e-06, "loss": 1.2447, "mean_token_accuracy": 0.651421345770359, "num_tokens": 2453194.0, "step": 189 }, { "epoch": 0.1216, "grad_norm": 3.8324315547943115, "learning_rate": 5e-06, "loss": 1.4677, "mean_token_accuracy": 0.6340256333351135, "num_tokens": 2466328.0, "step": 190 }, { "epoch": 0.12224, "grad_norm": 3.4532899856567383, "learning_rate": 5e-06, "loss": 1.3832, "mean_token_accuracy": 0.6311789453029633, "num_tokens": 2480165.0, "step": 191 }, { "epoch": 0.12288, "grad_norm": 6.352081298828125, "learning_rate": 5e-06, "loss": 1.5605, "mean_token_accuracy": 0.6426242366433144, "num_tokens": 2489563.0, "step": 192 }, { "epoch": 0.12352, "grad_norm": 3.9290707111358643, "learning_rate": 5e-06, "loss": 1.4923, "mean_token_accuracy": 0.621891662478447, "num_tokens": 2503752.0, "step": 193 }, { "epoch": 0.12416, "grad_norm": 3.5599541664123535, "learning_rate": 5e-06, "loss": 1.658, "mean_token_accuracy": 0.5923861265182495, "num_tokens": 2517941.0, "step": 194 }, { "epoch": 0.1248, "grad_norm": 4.907262802124023, "learning_rate": 5e-06, "loss": 1.3243, "mean_token_accuracy": 0.684236004948616, "num_tokens": 2531694.0, "step": 195 }, { "epoch": 0.12544, "grad_norm": 3.895585298538208, "learning_rate": 5e-06, "loss": 1.2594, "mean_token_accuracy": 0.6675282418727875, "num_tokens": 2543318.0, "step": 196 }, { "epoch": 0.12608, "grad_norm": 3.7483768463134766, "learning_rate": 5e-06, "loss": 1.3555, "mean_token_accuracy": 0.6702639237046242, "num_tokens": 2555858.0, "step": 197 }, { "epoch": 0.12672, "grad_norm": 3.980715751647949, "learning_rate": 5e-06, "loss": 1.3247, "mean_token_accuracy": 0.654958538711071, "num_tokens": 2570848.0, "step": 198 }, { "epoch": 0.12736, "grad_norm": 3.402679443359375, "learning_rate": 5e-06, "loss": 1.4151, "mean_token_accuracy": 0.639847457408905, "num_tokens": 2586951.0, "step": 199 }, { "epoch": 0.128, "grad_norm": 3.603440284729004, "learning_rate": 5e-06, "loss": 1.138, "mean_token_accuracy": 0.6972187757492065, "num_tokens": 2600380.0, "step": 200 }, { "epoch": 0.12864, "grad_norm": 4.226911544799805, "learning_rate": 5e-06, "loss": 1.2305, "mean_token_accuracy": 0.6777093335986137, "num_tokens": 2612169.0, "step": 201 }, { "epoch": 0.12928, "grad_norm": 4.133816719055176, "learning_rate": 5e-06, "loss": 1.5127, "mean_token_accuracy": 0.645031102001667, "num_tokens": 2625681.0, "step": 202 }, { "epoch": 0.12992, "grad_norm": 4.464379787445068, "learning_rate": 5e-06, "loss": 1.3419, "mean_token_accuracy": 0.655508816242218, "num_tokens": 2638449.0, "step": 203 }, { "epoch": 0.13056, "grad_norm": 3.691314697265625, "learning_rate": 5e-06, "loss": 1.4329, "mean_token_accuracy": 0.6271785870194435, "num_tokens": 2651404.0, "step": 204 }, { "epoch": 0.1312, "grad_norm": 3.735065460205078, "learning_rate": 5e-06, "loss": 1.259, "mean_token_accuracy": 0.6792504116892815, "num_tokens": 2663577.0, "step": 205 }, { "epoch": 0.13184, "grad_norm": 3.8141613006591797, "learning_rate": 5e-06, "loss": 1.3812, "mean_token_accuracy": 0.6525682806968689, "num_tokens": 2675704.0, "step": 206 }, { "epoch": 0.13248, "grad_norm": 4.096824645996094, "learning_rate": 5e-06, "loss": 1.4175, "mean_token_accuracy": 0.6340369358658791, "num_tokens": 2687284.0, "step": 207 }, { "epoch": 0.13312, "grad_norm": 4.180744171142578, "learning_rate": 5e-06, "loss": 1.37, "mean_token_accuracy": 0.6400524824857712, "num_tokens": 2700343.0, "step": 208 }, { "epoch": 0.13376, "grad_norm": 4.275300979614258, "learning_rate": 5e-06, "loss": 1.3739, "mean_token_accuracy": 0.6336183995008469, "num_tokens": 2713760.0, "step": 209 }, { "epoch": 0.1344, "grad_norm": 3.547708511352539, "learning_rate": 5e-06, "loss": 1.4059, "mean_token_accuracy": 0.6354609504342079, "num_tokens": 2728045.0, "step": 210 }, { "epoch": 0.13504, "grad_norm": 4.222541809082031, "learning_rate": 5e-06, "loss": 1.3929, "mean_token_accuracy": 0.6449902206659317, "num_tokens": 2741308.0, "step": 211 }, { "epoch": 0.13568, "grad_norm": 3.930753707885742, "learning_rate": 5e-06, "loss": 1.5346, "mean_token_accuracy": 0.6030523180961609, "num_tokens": 2754620.0, "step": 212 }, { "epoch": 0.13632, "grad_norm": 3.6813647747039795, "learning_rate": 5e-06, "loss": 1.3267, "mean_token_accuracy": 0.6522213146090508, "num_tokens": 2768180.0, "step": 213 }, { "epoch": 0.13696, "grad_norm": 4.064117431640625, "learning_rate": 5e-06, "loss": 1.3081, "mean_token_accuracy": 0.6697950512170792, "num_tokens": 2781050.0, "step": 214 }, { "epoch": 0.1376, "grad_norm": 3.927386522293091, "learning_rate": 5e-06, "loss": 1.5213, "mean_token_accuracy": 0.6275500729680061, "num_tokens": 2792644.0, "step": 215 }, { "epoch": 0.13824, "grad_norm": 3.762558937072754, "learning_rate": 5e-06, "loss": 1.339, "mean_token_accuracy": 0.6525787115097046, "num_tokens": 2805857.0, "step": 216 }, { "epoch": 0.13888, "grad_norm": 3.3911473751068115, "learning_rate": 5e-06, "loss": 1.3861, "mean_token_accuracy": 0.646359771490097, "num_tokens": 2822403.0, "step": 217 }, { "epoch": 0.13952, "grad_norm": 3.3811612129211426, "learning_rate": 5e-06, "loss": 1.4414, "mean_token_accuracy": 0.6381309777498245, "num_tokens": 2836579.0, "step": 218 }, { "epoch": 0.14016, "grad_norm": 3.9682304859161377, "learning_rate": 5e-06, "loss": 1.4728, "mean_token_accuracy": 0.611208513379097, "num_tokens": 2848833.0, "step": 219 }, { "epoch": 0.1408, "grad_norm": 4.066648483276367, "learning_rate": 5e-06, "loss": 1.3081, "mean_token_accuracy": 0.6451118811964989, "num_tokens": 2859672.0, "step": 220 }, { "epoch": 0.14144, "grad_norm": 3.577544927597046, "learning_rate": 5e-06, "loss": 1.4143, "mean_token_accuracy": 0.6415835171937943, "num_tokens": 2875418.0, "step": 221 }, { "epoch": 0.14208, "grad_norm": 3.8373844623565674, "learning_rate": 5e-06, "loss": 1.1867, "mean_token_accuracy": 0.6628148853778839, "num_tokens": 2886843.0, "step": 222 }, { "epoch": 0.14272, "grad_norm": 3.243741273880005, "learning_rate": 5e-06, "loss": 1.3639, "mean_token_accuracy": 0.6402468308806419, "num_tokens": 2903000.0, "step": 223 }, { "epoch": 0.14336, "grad_norm": 3.6917643547058105, "learning_rate": 5e-06, "loss": 1.4826, "mean_token_accuracy": 0.6137516796588898, "num_tokens": 2916086.0, "step": 224 }, { "epoch": 0.144, "grad_norm": 3.6961069107055664, "learning_rate": 5e-06, "loss": 1.3914, "mean_token_accuracy": 0.6221867948770523, "num_tokens": 2928082.0, "step": 225 }, { "epoch": 0.14464, "grad_norm": 3.3489155769348145, "learning_rate": 5e-06, "loss": 1.2829, "mean_token_accuracy": 0.6520458236336708, "num_tokens": 2941972.0, "step": 226 }, { "epoch": 0.14528, "grad_norm": 3.9291248321533203, "learning_rate": 5e-06, "loss": 1.4375, "mean_token_accuracy": 0.6534432545304298, "num_tokens": 2954326.0, "step": 227 }, { "epoch": 0.14592, "grad_norm": 4.408154487609863, "learning_rate": 5e-06, "loss": 1.37, "mean_token_accuracy": 0.6362905651330948, "num_tokens": 2964773.0, "step": 228 }, { "epoch": 0.14656, "grad_norm": 3.3480911254882812, "learning_rate": 5e-06, "loss": 1.2627, "mean_token_accuracy": 0.664552852511406, "num_tokens": 2979164.0, "step": 229 }, { "epoch": 0.1472, "grad_norm": 3.5520999431610107, "learning_rate": 5e-06, "loss": 1.2172, "mean_token_accuracy": 0.677513062953949, "num_tokens": 2993457.0, "step": 230 }, { "epoch": 0.14784, "grad_norm": 3.3027398586273193, "learning_rate": 5e-06, "loss": 1.4642, "mean_token_accuracy": 0.6177601739764214, "num_tokens": 3007665.0, "step": 231 }, { "epoch": 0.14848, "grad_norm": 3.64074444770813, "learning_rate": 5e-06, "loss": 1.2464, "mean_token_accuracy": 0.662057913839817, "num_tokens": 3020550.0, "step": 232 }, { "epoch": 0.14912, "grad_norm": 3.9199254512786865, "learning_rate": 5e-06, "loss": 1.3942, "mean_token_accuracy": 0.6174388378858566, "num_tokens": 3034384.0, "step": 233 }, { "epoch": 0.14976, "grad_norm": 4.028416633605957, "learning_rate": 5e-06, "loss": 1.2443, "mean_token_accuracy": 0.6706736162304878, "num_tokens": 3046491.0, "step": 234 }, { "epoch": 0.1504, "grad_norm": 3.4330265522003174, "learning_rate": 5e-06, "loss": 1.4317, "mean_token_accuracy": 0.6397150233387947, "num_tokens": 3061547.0, "step": 235 }, { "epoch": 0.15104, "grad_norm": 4.62261438369751, "learning_rate": 5e-06, "loss": 1.4654, "mean_token_accuracy": 0.6216901019215584, "num_tokens": 3073316.0, "step": 236 }, { "epoch": 0.15168, "grad_norm": 3.8148386478424072, "learning_rate": 5e-06, "loss": 1.3572, "mean_token_accuracy": 0.6381306573748589, "num_tokens": 3086074.0, "step": 237 }, { "epoch": 0.15232, "grad_norm": 3.6774654388427734, "learning_rate": 5e-06, "loss": 1.2743, "mean_token_accuracy": 0.6746890023350716, "num_tokens": 3099218.0, "step": 238 }, { "epoch": 0.15296, "grad_norm": 3.8915648460388184, "learning_rate": 5e-06, "loss": 1.3005, "mean_token_accuracy": 0.6652230620384216, "num_tokens": 3113283.0, "step": 239 }, { "epoch": 0.1536, "grad_norm": 3.641663074493408, "learning_rate": 5e-06, "loss": 1.4299, "mean_token_accuracy": 0.6305139660835266, "num_tokens": 3127092.0, "step": 240 }, { "epoch": 0.15424, "grad_norm": 3.9802157878875732, "learning_rate": 5e-06, "loss": 1.3628, "mean_token_accuracy": 0.6499741598963737, "num_tokens": 3137977.0, "step": 241 }, { "epoch": 0.15488, "grad_norm": 3.3519856929779053, "learning_rate": 5e-06, "loss": 1.3649, "mean_token_accuracy": 0.6538999378681183, "num_tokens": 3153296.0, "step": 242 }, { "epoch": 0.15552, "grad_norm": 3.9312145709991455, "learning_rate": 5e-06, "loss": 1.1886, "mean_token_accuracy": 0.687839575111866, "num_tokens": 3165430.0, "step": 243 }, { "epoch": 0.15616, "grad_norm": 3.9684488773345947, "learning_rate": 5e-06, "loss": 1.392, "mean_token_accuracy": 0.629355788230896, "num_tokens": 3176799.0, "step": 244 }, { "epoch": 0.1568, "grad_norm": 3.610091209411621, "learning_rate": 5e-06, "loss": 1.3166, "mean_token_accuracy": 0.6479083597660065, "num_tokens": 3190258.0, "step": 245 }, { "epoch": 0.15744, "grad_norm": 3.921807289123535, "learning_rate": 5e-06, "loss": 1.1064, "mean_token_accuracy": 0.7143979370594025, "num_tokens": 3201789.0, "step": 246 }, { "epoch": 0.15808, "grad_norm": 3.4888627529144287, "learning_rate": 5e-06, "loss": 1.2273, "mean_token_accuracy": 0.6818583980202675, "num_tokens": 3214773.0, "step": 247 }, { "epoch": 0.15872, "grad_norm": 3.9141690731048584, "learning_rate": 5e-06, "loss": 1.2463, "mean_token_accuracy": 0.6758697032928467, "num_tokens": 3226302.0, "step": 248 }, { "epoch": 0.15936, "grad_norm": 3.585526943206787, "learning_rate": 5e-06, "loss": 1.29, "mean_token_accuracy": 0.6522084772586823, "num_tokens": 3239487.0, "step": 249 }, { "epoch": 0.16, "grad_norm": 2.9985756874084473, "learning_rate": 5e-06, "loss": 1.2875, "mean_token_accuracy": 0.665367841720581, "num_tokens": 3254553.0, "step": 250 }, { "epoch": 0.16064, "grad_norm": 4.460598945617676, "learning_rate": 5e-06, "loss": 1.413, "mean_token_accuracy": 0.6606506556272507, "num_tokens": 3266374.0, "step": 251 }, { "epoch": 0.16128, "grad_norm": 3.867008686065674, "learning_rate": 5e-06, "loss": 1.4733, "mean_token_accuracy": 0.6357235088944435, "num_tokens": 3278642.0, "step": 252 }, { "epoch": 0.16192, "grad_norm": 3.6840028762817383, "learning_rate": 5e-06, "loss": 1.3735, "mean_token_accuracy": 0.6500705629587173, "num_tokens": 3292643.0, "step": 253 }, { "epoch": 0.16256, "grad_norm": 3.631727933883667, "learning_rate": 5e-06, "loss": 1.3561, "mean_token_accuracy": 0.6603741720318794, "num_tokens": 3308572.0, "step": 254 }, { "epoch": 0.1632, "grad_norm": 3.8139543533325195, "learning_rate": 5e-06, "loss": 1.4079, "mean_token_accuracy": 0.6566642299294472, "num_tokens": 3321852.0, "step": 255 }, { "epoch": 0.16384, "grad_norm": 4.278744697570801, "learning_rate": 5e-06, "loss": 1.3128, "mean_token_accuracy": 0.6340290307998657, "num_tokens": 3333364.0, "step": 256 }, { "epoch": 0.16448, "grad_norm": 3.855288505554199, "learning_rate": 5e-06, "loss": 1.2726, "mean_token_accuracy": 0.6573414877057076, "num_tokens": 3346153.0, "step": 257 }, { "epoch": 0.16512, "grad_norm": 3.894836187362671, "learning_rate": 5e-06, "loss": 1.5052, "mean_token_accuracy": 0.6395176202058792, "num_tokens": 3357803.0, "step": 258 }, { "epoch": 0.16576, "grad_norm": 3.7376608848571777, "learning_rate": 5e-06, "loss": 1.3856, "mean_token_accuracy": 0.6377875059843063, "num_tokens": 3370640.0, "step": 259 }, { "epoch": 0.1664, "grad_norm": 3.66434907913208, "learning_rate": 5e-06, "loss": 1.2933, "mean_token_accuracy": 0.6526513993740082, "num_tokens": 3384626.0, "step": 260 }, { "epoch": 0.16704, "grad_norm": 4.31889533996582, "learning_rate": 5e-06, "loss": 1.4037, "mean_token_accuracy": 0.6519733518362045, "num_tokens": 3396351.0, "step": 261 }, { "epoch": 0.16768, "grad_norm": 4.194382667541504, "learning_rate": 5e-06, "loss": 1.2248, "mean_token_accuracy": 0.6719919368624687, "num_tokens": 3410809.0, "step": 262 }, { "epoch": 0.16832, "grad_norm": 5.298657417297363, "learning_rate": 5e-06, "loss": 1.2344, "mean_token_accuracy": 0.6541409119963646, "num_tokens": 3421666.0, "step": 263 }, { "epoch": 0.16896, "grad_norm": 3.7578792572021484, "learning_rate": 5e-06, "loss": 1.4221, "mean_token_accuracy": 0.6374265551567078, "num_tokens": 3435240.0, "step": 264 }, { "epoch": 0.1696, "grad_norm": 4.36591100692749, "learning_rate": 5e-06, "loss": 1.3996, "mean_token_accuracy": 0.6582349985837936, "num_tokens": 3447417.0, "step": 265 }, { "epoch": 0.17024, "grad_norm": 4.242166042327881, "learning_rate": 5e-06, "loss": 1.2213, "mean_token_accuracy": 0.6886605694890022, "num_tokens": 3457202.0, "step": 266 }, { "epoch": 0.17088, "grad_norm": 4.421549320220947, "learning_rate": 5e-06, "loss": 1.4154, "mean_token_accuracy": 0.6361653730273247, "num_tokens": 3470888.0, "step": 267 }, { "epoch": 0.17152, "grad_norm": 3.4272501468658447, "learning_rate": 5e-06, "loss": 1.4722, "mean_token_accuracy": 0.617170162498951, "num_tokens": 3483711.0, "step": 268 }, { "epoch": 0.17216, "grad_norm": 4.099259853363037, "learning_rate": 5e-06, "loss": 1.3181, "mean_token_accuracy": 0.6635381802916527, "num_tokens": 3494261.0, "step": 269 }, { "epoch": 0.1728, "grad_norm": 3.460908889770508, "learning_rate": 5e-06, "loss": 1.2027, "mean_token_accuracy": 0.6816031113266945, "num_tokens": 3508416.0, "step": 270 }, { "epoch": 0.17344, "grad_norm": 4.011609077453613, "learning_rate": 5e-06, "loss": 1.2527, "mean_token_accuracy": 0.6691607385873795, "num_tokens": 3521566.0, "step": 271 }, { "epoch": 0.17408, "grad_norm": 4.310615062713623, "learning_rate": 5e-06, "loss": 1.5243, "mean_token_accuracy": 0.606864832341671, "num_tokens": 3532437.0, "step": 272 }, { "epoch": 0.17472, "grad_norm": 3.865201950073242, "learning_rate": 5e-06, "loss": 1.3655, "mean_token_accuracy": 0.6517080217599869, "num_tokens": 3544654.0, "step": 273 }, { "epoch": 0.17536, "grad_norm": 3.779001235961914, "learning_rate": 5e-06, "loss": 1.5361, "mean_token_accuracy": 0.6139826104044914, "num_tokens": 3560143.0, "step": 274 }, { "epoch": 0.176, "grad_norm": 3.909745454788208, "learning_rate": 5e-06, "loss": 1.2, "mean_token_accuracy": 0.6911701187491417, "num_tokens": 3571846.0, "step": 275 }, { "epoch": 0.17664, "grad_norm": 4.487984657287598, "learning_rate": 5e-06, "loss": 1.307, "mean_token_accuracy": 0.6519964337348938, "num_tokens": 3583280.0, "step": 276 }, { "epoch": 0.17728, "grad_norm": 4.58504056930542, "learning_rate": 5e-06, "loss": 1.4673, "mean_token_accuracy": 0.6244921982288361, "num_tokens": 3593797.0, "step": 277 }, { "epoch": 0.17792, "grad_norm": 3.6989223957061768, "learning_rate": 5e-06, "loss": 1.4841, "mean_token_accuracy": 0.6436078920960426, "num_tokens": 3606077.0, "step": 278 }, { "epoch": 0.17856, "grad_norm": 3.5363776683807373, "learning_rate": 5e-06, "loss": 1.3562, "mean_token_accuracy": 0.6404093876481056, "num_tokens": 3619274.0, "step": 279 }, { "epoch": 0.1792, "grad_norm": 3.5803604125976562, "learning_rate": 5e-06, "loss": 1.2417, "mean_token_accuracy": 0.6808914020657539, "num_tokens": 3631353.0, "step": 280 }, { "epoch": 0.17984, "grad_norm": 3.8783459663391113, "learning_rate": 5e-06, "loss": 1.3802, "mean_token_accuracy": 0.6372303292155266, "num_tokens": 3645098.0, "step": 281 }, { "epoch": 0.18048, "grad_norm": 4.057406425476074, "learning_rate": 5e-06, "loss": 1.2089, "mean_token_accuracy": 0.6749606877565384, "num_tokens": 3657936.0, "step": 282 }, { "epoch": 0.18112, "grad_norm": 3.0335772037506104, "learning_rate": 5e-06, "loss": 1.1465, "mean_token_accuracy": 0.6885220557451248, "num_tokens": 3672249.0, "step": 283 }, { "epoch": 0.18176, "grad_norm": 3.654318332672119, "learning_rate": 5e-06, "loss": 1.2322, "mean_token_accuracy": 0.6707694306969643, "num_tokens": 3685850.0, "step": 284 }, { "epoch": 0.1824, "grad_norm": 3.4704298973083496, "learning_rate": 5e-06, "loss": 1.1906, "mean_token_accuracy": 0.6730613932013512, "num_tokens": 3699407.0, "step": 285 }, { "epoch": 0.18304, "grad_norm": 4.028052806854248, "learning_rate": 5e-06, "loss": 1.511, "mean_token_accuracy": 0.6216867938637733, "num_tokens": 3711240.0, "step": 286 }, { "epoch": 0.18368, "grad_norm": 3.9164350032806396, "learning_rate": 5e-06, "loss": 1.2674, "mean_token_accuracy": 0.6724821552634239, "num_tokens": 3723872.0, "step": 287 }, { "epoch": 0.18432, "grad_norm": 4.470592498779297, "learning_rate": 5e-06, "loss": 1.3975, "mean_token_accuracy": 0.6604571491479874, "num_tokens": 3737013.0, "step": 288 }, { "epoch": 0.18496, "grad_norm": 3.5540971755981445, "learning_rate": 5e-06, "loss": 1.1055, "mean_token_accuracy": 0.683054082095623, "num_tokens": 3750893.0, "step": 289 }, { "epoch": 0.1856, "grad_norm": 3.6694583892822266, "learning_rate": 5e-06, "loss": 1.4439, "mean_token_accuracy": 0.6296076104044914, "num_tokens": 3763965.0, "step": 290 }, { "epoch": 0.18624, "grad_norm": 4.5381059646606445, "learning_rate": 5e-06, "loss": 1.4017, "mean_token_accuracy": 0.646364264190197, "num_tokens": 3774483.0, "step": 291 }, { "epoch": 0.18688, "grad_norm": 3.607478141784668, "learning_rate": 5e-06, "loss": 1.5724, "mean_token_accuracy": 0.6356127932667732, "num_tokens": 3791198.0, "step": 292 }, { "epoch": 0.18752, "grad_norm": 3.7672901153564453, "learning_rate": 5e-06, "loss": 1.5793, "mean_token_accuracy": 0.6113990694284439, "num_tokens": 3805077.0, "step": 293 }, { "epoch": 0.18816, "grad_norm": 3.517371892929077, "learning_rate": 5e-06, "loss": 1.3182, "mean_token_accuracy": 0.6544737070798874, "num_tokens": 3819471.0, "step": 294 }, { "epoch": 0.1888, "grad_norm": 3.6588094234466553, "learning_rate": 5e-06, "loss": 1.1415, "mean_token_accuracy": 0.6868576034903526, "num_tokens": 3833299.0, "step": 295 }, { "epoch": 0.18944, "grad_norm": 4.042988300323486, "learning_rate": 5e-06, "loss": 1.331, "mean_token_accuracy": 0.6815094500780106, "num_tokens": 3845749.0, "step": 296 }, { "epoch": 0.19008, "grad_norm": 3.829592227935791, "learning_rate": 5e-06, "loss": 1.5645, "mean_token_accuracy": 0.6153044253587723, "num_tokens": 3858961.0, "step": 297 }, { "epoch": 0.19072, "grad_norm": 4.074889659881592, "learning_rate": 5e-06, "loss": 1.4884, "mean_token_accuracy": 0.6340715438127518, "num_tokens": 3870935.0, "step": 298 }, { "epoch": 0.19136, "grad_norm": 3.7292230129241943, "learning_rate": 5e-06, "loss": 1.461, "mean_token_accuracy": 0.6340260431170464, "num_tokens": 3883149.0, "step": 299 }, { "epoch": 0.192, "grad_norm": 3.7191953659057617, "learning_rate": 5e-06, "loss": 1.2, "mean_token_accuracy": 0.6816589832305908, "num_tokens": 3896395.0, "step": 300 }, { "epoch": 0.19264, "grad_norm": 3.5360212326049805, "learning_rate": 5e-06, "loss": 1.0832, "mean_token_accuracy": 0.6812401190400124, "num_tokens": 3912111.0, "step": 301 }, { "epoch": 0.19328, "grad_norm": 4.3103132247924805, "learning_rate": 5e-06, "loss": 1.1345, "mean_token_accuracy": 0.6864209771156311, "num_tokens": 3922023.0, "step": 302 }, { "epoch": 0.19392, "grad_norm": 4.525723457336426, "learning_rate": 5e-06, "loss": 1.1642, "mean_token_accuracy": 0.6812352165579796, "num_tokens": 3933051.0, "step": 303 }, { "epoch": 0.19456, "grad_norm": 4.2806172370910645, "learning_rate": 5e-06, "loss": 1.4921, "mean_token_accuracy": 0.6231048293411732, "num_tokens": 3943203.0, "step": 304 }, { "epoch": 0.1952, "grad_norm": 3.759788751602173, "learning_rate": 5e-06, "loss": 1.4501, "mean_token_accuracy": 0.6561701893806458, "num_tokens": 3956583.0, "step": 305 }, { "epoch": 0.19584, "grad_norm": 3.7161481380462646, "learning_rate": 5e-06, "loss": 1.4199, "mean_token_accuracy": 0.6432743892073631, "num_tokens": 3968468.0, "step": 306 }, { "epoch": 0.19648, "grad_norm": 3.6811437606811523, "learning_rate": 5e-06, "loss": 1.3131, "mean_token_accuracy": 0.6686923652887344, "num_tokens": 3980727.0, "step": 307 }, { "epoch": 0.19712, "grad_norm": 4.159343242645264, "learning_rate": 5e-06, "loss": 1.4896, "mean_token_accuracy": 0.6509639658033848, "num_tokens": 3993831.0, "step": 308 }, { "epoch": 0.19776, "grad_norm": 3.5082013607025146, "learning_rate": 5e-06, "loss": 1.1129, "mean_token_accuracy": 0.7138783186674118, "num_tokens": 4006704.0, "step": 309 }, { "epoch": 0.1984, "grad_norm": 4.171331882476807, "learning_rate": 5e-06, "loss": 1.2373, "mean_token_accuracy": 0.6580014526844025, "num_tokens": 4023069.0, "step": 310 }, { "epoch": 0.19904, "grad_norm": 3.516143321990967, "learning_rate": 5e-06, "loss": 1.4212, "mean_token_accuracy": 0.6425874978303909, "num_tokens": 4036994.0, "step": 311 }, { "epoch": 0.19968, "grad_norm": 3.506361484527588, "learning_rate": 5e-06, "loss": 1.5113, "mean_token_accuracy": 0.6125459745526314, "num_tokens": 4050240.0, "step": 312 }, { "epoch": 0.20032, "grad_norm": 4.198498725891113, "learning_rate": 5e-06, "loss": 1.4596, "mean_token_accuracy": 0.6291738748550415, "num_tokens": 4061778.0, "step": 313 }, { "epoch": 0.20096, "grad_norm": 3.3201327323913574, "learning_rate": 5e-06, "loss": 1.4545, "mean_token_accuracy": 0.6387949883937836, "num_tokens": 4076918.0, "step": 314 }, { "epoch": 0.2016, "grad_norm": 3.174764394760132, "learning_rate": 5e-06, "loss": 1.2661, "mean_token_accuracy": 0.6788045838475227, "num_tokens": 4091861.0, "step": 315 }, { "epoch": 0.20224, "grad_norm": 3.773123264312744, "learning_rate": 5e-06, "loss": 1.0687, "mean_token_accuracy": 0.7004147991538048, "num_tokens": 4103615.0, "step": 316 }, { "epoch": 0.20288, "grad_norm": 3.759938955307007, "learning_rate": 5e-06, "loss": 1.3967, "mean_token_accuracy": 0.6160966157913208, "num_tokens": 4116084.0, "step": 317 }, { "epoch": 0.20352, "grad_norm": 3.3908169269561768, "learning_rate": 5e-06, "loss": 1.3196, "mean_token_accuracy": 0.6666592955589294, "num_tokens": 4130227.0, "step": 318 }, { "epoch": 0.20416, "grad_norm": 3.709275007247925, "learning_rate": 5e-06, "loss": 1.1854, "mean_token_accuracy": 0.690848097205162, "num_tokens": 4144753.0, "step": 319 }, { "epoch": 0.2048, "grad_norm": 4.040079116821289, "learning_rate": 5e-06, "loss": 1.271, "mean_token_accuracy": 0.6566968783736229, "num_tokens": 4157685.0, "step": 320 }, { "epoch": 0.20544, "grad_norm": 3.6473450660705566, "learning_rate": 5e-06, "loss": 1.2116, "mean_token_accuracy": 0.6666957810521126, "num_tokens": 4171592.0, "step": 321 }, { "epoch": 0.20608, "grad_norm": 4.44047212600708, "learning_rate": 5e-06, "loss": 1.3629, "mean_token_accuracy": 0.6269867643713951, "num_tokens": 4182621.0, "step": 322 }, { "epoch": 0.20672, "grad_norm": 4.875802993774414, "learning_rate": 5e-06, "loss": 1.251, "mean_token_accuracy": 0.671268492937088, "num_tokens": 4191893.0, "step": 323 }, { "epoch": 0.20736, "grad_norm": 3.2327218055725098, "learning_rate": 5e-06, "loss": 1.2432, "mean_token_accuracy": 0.6710969433188438, "num_tokens": 4207608.0, "step": 324 }, { "epoch": 0.208, "grad_norm": 3.433987617492676, "learning_rate": 5e-06, "loss": 1.4811, "mean_token_accuracy": 0.641696572303772, "num_tokens": 4222070.0, "step": 325 }, { "epoch": 0.20864, "grad_norm": 3.3024795055389404, "learning_rate": 5e-06, "loss": 1.4027, "mean_token_accuracy": 0.6242343187332153, "num_tokens": 4237064.0, "step": 326 }, { "epoch": 0.20928, "grad_norm": 3.8479273319244385, "learning_rate": 5e-06, "loss": 1.1806, "mean_token_accuracy": 0.6728235110640526, "num_tokens": 4251385.0, "step": 327 }, { "epoch": 0.20992, "grad_norm": 3.911982774734497, "learning_rate": 5e-06, "loss": 1.3907, "mean_token_accuracy": 0.633483037352562, "num_tokens": 4264013.0, "step": 328 }, { "epoch": 0.21056, "grad_norm": 3.055570125579834, "learning_rate": 5e-06, "loss": 1.5427, "mean_token_accuracy": 0.6328605860471725, "num_tokens": 4280497.0, "step": 329 }, { "epoch": 0.2112, "grad_norm": 3.9111008644104004, "learning_rate": 5e-06, "loss": 1.3318, "mean_token_accuracy": 0.6829836070537567, "num_tokens": 4293657.0, "step": 330 }, { "epoch": 0.21184, "grad_norm": 3.3383522033691406, "learning_rate": 5e-06, "loss": 1.3847, "mean_token_accuracy": 0.6295205429196358, "num_tokens": 4309618.0, "step": 331 }, { "epoch": 0.21248, "grad_norm": 3.3280251026153564, "learning_rate": 5e-06, "loss": 1.5237, "mean_token_accuracy": 0.6586425974965096, "num_tokens": 4326147.0, "step": 332 }, { "epoch": 0.21312, "grad_norm": 4.489631175994873, "learning_rate": 5e-06, "loss": 1.2546, "mean_token_accuracy": 0.653937578201294, "num_tokens": 4336964.0, "step": 333 }, { "epoch": 0.21376, "grad_norm": 3.654022693634033, "learning_rate": 5e-06, "loss": 1.3246, "mean_token_accuracy": 0.6411551535129547, "num_tokens": 4349801.0, "step": 334 }, { "epoch": 0.2144, "grad_norm": 3.9658567905426025, "learning_rate": 5e-06, "loss": 1.2223, "mean_token_accuracy": 0.6976972743868828, "num_tokens": 4362565.0, "step": 335 }, { "epoch": 0.21504, "grad_norm": 4.284513473510742, "learning_rate": 5e-06, "loss": 1.2982, "mean_token_accuracy": 0.6423984244465828, "num_tokens": 4373113.0, "step": 336 }, { "epoch": 0.21568, "grad_norm": 3.3546524047851562, "learning_rate": 5e-06, "loss": 1.5144, "mean_token_accuracy": 0.6321973502635956, "num_tokens": 4388315.0, "step": 337 }, { "epoch": 0.21632, "grad_norm": 3.7386813163757324, "learning_rate": 5e-06, "loss": 1.2948, "mean_token_accuracy": 0.6624687612056732, "num_tokens": 4400428.0, "step": 338 }, { "epoch": 0.21696, "grad_norm": 4.466668128967285, "learning_rate": 5e-06, "loss": 1.7564, "mean_token_accuracy": 0.6115086637437344, "num_tokens": 4412812.0, "step": 339 }, { "epoch": 0.2176, "grad_norm": 3.6271438598632812, "learning_rate": 5e-06, "loss": 1.2008, "mean_token_accuracy": 0.6809025183320045, "num_tokens": 4427547.0, "step": 340 }, { "epoch": 0.21824, "grad_norm": 4.270169258117676, "learning_rate": 5e-06, "loss": 1.4229, "mean_token_accuracy": 0.6368228495121002, "num_tokens": 4440979.0, "step": 341 }, { "epoch": 0.21888, "grad_norm": 4.036962509155273, "learning_rate": 5e-06, "loss": 1.4317, "mean_token_accuracy": 0.6311650201678276, "num_tokens": 4452973.0, "step": 342 }, { "epoch": 0.21952, "grad_norm": 3.645164728164673, "learning_rate": 5e-06, "loss": 1.2559, "mean_token_accuracy": 0.6653162762522697, "num_tokens": 4465907.0, "step": 343 }, { "epoch": 0.22016, "grad_norm": 4.088701248168945, "learning_rate": 5e-06, "loss": 1.2515, "mean_token_accuracy": 0.6554296687245369, "num_tokens": 4477731.0, "step": 344 }, { "epoch": 0.2208, "grad_norm": 3.935673713684082, "learning_rate": 5e-06, "loss": 1.23, "mean_token_accuracy": 0.6872739866375923, "num_tokens": 4490092.0, "step": 345 }, { "epoch": 0.22144, "grad_norm": 3.8297736644744873, "learning_rate": 5e-06, "loss": 1.3338, "mean_token_accuracy": 0.6665596142411232, "num_tokens": 4502310.0, "step": 346 }, { "epoch": 0.22208, "grad_norm": 3.4555552005767822, "learning_rate": 5e-06, "loss": 1.3386, "mean_token_accuracy": 0.645504966378212, "num_tokens": 4517152.0, "step": 347 }, { "epoch": 0.22272, "grad_norm": 3.445380926132202, "learning_rate": 5e-06, "loss": 1.3176, "mean_token_accuracy": 0.656374916434288, "num_tokens": 4531588.0, "step": 348 }, { "epoch": 0.22336, "grad_norm": 3.376492500305176, "learning_rate": 5e-06, "loss": 1.4416, "mean_token_accuracy": 0.6500495374202728, "num_tokens": 4548945.0, "step": 349 }, { "epoch": 0.224, "grad_norm": 3.7682902812957764, "learning_rate": 5e-06, "loss": 1.1904, "mean_token_accuracy": 0.7001358345150948, "num_tokens": 4561085.0, "step": 350 }, { "epoch": 0.22464, "grad_norm": 3.9040138721466064, "learning_rate": 5e-06, "loss": 1.3457, "mean_token_accuracy": 0.6525379121303558, "num_tokens": 4574945.0, "step": 351 }, { "epoch": 0.22528, "grad_norm": 3.5685391426086426, "learning_rate": 5e-06, "loss": 1.3322, "mean_token_accuracy": 0.6565421000123024, "num_tokens": 4588253.0, "step": 352 }, { "epoch": 0.22592, "grad_norm": 3.4802379608154297, "learning_rate": 5e-06, "loss": 1.2408, "mean_token_accuracy": 0.6631387919187546, "num_tokens": 4603347.0, "step": 353 }, { "epoch": 0.22656, "grad_norm": 4.1048126220703125, "learning_rate": 5e-06, "loss": 1.2342, "mean_token_accuracy": 0.7041416242718697, "num_tokens": 4616180.0, "step": 354 }, { "epoch": 0.2272, "grad_norm": 3.617142677307129, "learning_rate": 5e-06, "loss": 1.193, "mean_token_accuracy": 0.68916055560112, "num_tokens": 4628116.0, "step": 355 }, { "epoch": 0.22784, "grad_norm": 3.48990797996521, "learning_rate": 5e-06, "loss": 1.3371, "mean_token_accuracy": 0.6546562537550926, "num_tokens": 4644302.0, "step": 356 }, { "epoch": 0.22848, "grad_norm": 4.8016180992126465, "learning_rate": 5e-06, "loss": 1.3828, "mean_token_accuracy": 0.6490079835057259, "num_tokens": 4654201.0, "step": 357 }, { "epoch": 0.22912, "grad_norm": 3.589632749557495, "learning_rate": 5e-06, "loss": 1.303, "mean_token_accuracy": 0.6622688621282578, "num_tokens": 4666579.0, "step": 358 }, { "epoch": 0.22976, "grad_norm": 3.8532536029815674, "learning_rate": 5e-06, "loss": 1.2905, "mean_token_accuracy": 0.6697241440415382, "num_tokens": 4678614.0, "step": 359 }, { "epoch": 0.2304, "grad_norm": 3.768440008163452, "learning_rate": 5e-06, "loss": 1.3384, "mean_token_accuracy": 0.6274634152650833, "num_tokens": 4690259.0, "step": 360 }, { "epoch": 0.23104, "grad_norm": 4.048650741577148, "learning_rate": 5e-06, "loss": 1.27, "mean_token_accuracy": 0.6553780138492584, "num_tokens": 4702794.0, "step": 361 }, { "epoch": 0.23168, "grad_norm": 3.264341354370117, "learning_rate": 5e-06, "loss": 1.2521, "mean_token_accuracy": 0.7201630547642708, "num_tokens": 4718863.0, "step": 362 }, { "epoch": 0.23232, "grad_norm": 3.293111562728882, "learning_rate": 5e-06, "loss": 1.206, "mean_token_accuracy": 0.691804438829422, "num_tokens": 4731459.0, "step": 363 }, { "epoch": 0.23296, "grad_norm": 3.562152862548828, "learning_rate": 5e-06, "loss": 1.5825, "mean_token_accuracy": 0.6220528446137905, "num_tokens": 4744183.0, "step": 364 }, { "epoch": 0.2336, "grad_norm": 3.858302116394043, "learning_rate": 5e-06, "loss": 1.2556, "mean_token_accuracy": 0.6687511652708054, "num_tokens": 4755339.0, "step": 365 }, { "epoch": 0.23424, "grad_norm": 3.6017565727233887, "learning_rate": 5e-06, "loss": 1.4059, "mean_token_accuracy": 0.6345420032739639, "num_tokens": 4767629.0, "step": 366 }, { "epoch": 0.23488, "grad_norm": 3.706761598587036, "learning_rate": 5e-06, "loss": 1.1984, "mean_token_accuracy": 0.6690258160233498, "num_tokens": 4778905.0, "step": 367 }, { "epoch": 0.23552, "grad_norm": 3.1312525272369385, "learning_rate": 5e-06, "loss": 1.3188, "mean_token_accuracy": 0.6492372825741768, "num_tokens": 4794948.0, "step": 368 }, { "epoch": 0.23616, "grad_norm": 4.282083034515381, "learning_rate": 5e-06, "loss": 1.4944, "mean_token_accuracy": 0.6254525110125542, "num_tokens": 4807887.0, "step": 369 }, { "epoch": 0.2368, "grad_norm": 3.156104564666748, "learning_rate": 5e-06, "loss": 1.4343, "mean_token_accuracy": 0.6384943351149559, "num_tokens": 4823135.0, "step": 370 }, { "epoch": 0.23744, "grad_norm": 3.9901719093322754, "learning_rate": 5e-06, "loss": 1.2035, "mean_token_accuracy": 0.7160904258489609, "num_tokens": 4835841.0, "step": 371 }, { "epoch": 0.23808, "grad_norm": 3.2367820739746094, "learning_rate": 5e-06, "loss": 1.276, "mean_token_accuracy": 0.659798189997673, "num_tokens": 4852490.0, "step": 372 }, { "epoch": 0.23872, "grad_norm": 3.741534948348999, "learning_rate": 5e-06, "loss": 1.3245, "mean_token_accuracy": 0.6480759754776955, "num_tokens": 4864536.0, "step": 373 }, { "epoch": 0.23936, "grad_norm": 5.090270042419434, "learning_rate": 5e-06, "loss": 1.3845, "mean_token_accuracy": 0.6432400941848755, "num_tokens": 4873861.0, "step": 374 }, { "epoch": 0.24, "grad_norm": 3.550171136856079, "learning_rate": 5e-06, "loss": 1.305, "mean_token_accuracy": 0.6453453898429871, "num_tokens": 4888154.0, "step": 375 }, { "epoch": 0.24064, "grad_norm": 4.662119388580322, "learning_rate": 5e-06, "loss": 1.3197, "mean_token_accuracy": 0.6590218544006348, "num_tokens": 4898468.0, "step": 376 }, { "epoch": 0.24128, "grad_norm": 5.356217861175537, "learning_rate": 5e-06, "loss": 1.3352, "mean_token_accuracy": 0.6680933758616447, "num_tokens": 4910094.0, "step": 377 }, { "epoch": 0.24192, "grad_norm": 3.6107497215270996, "learning_rate": 5e-06, "loss": 1.1964, "mean_token_accuracy": 0.6513196639716625, "num_tokens": 4924004.0, "step": 378 }, { "epoch": 0.24256, "grad_norm": 3.8560822010040283, "learning_rate": 5e-06, "loss": 1.4503, "mean_token_accuracy": 0.6257938891649246, "num_tokens": 4937000.0, "step": 379 }, { "epoch": 0.2432, "grad_norm": 3.5278120040893555, "learning_rate": 5e-06, "loss": 1.5268, "mean_token_accuracy": 0.6288462430238724, "num_tokens": 4951330.0, "step": 380 }, { "epoch": 0.24384, "grad_norm": 3.4525208473205566, "learning_rate": 5e-06, "loss": 1.2486, "mean_token_accuracy": 0.6658232286572456, "num_tokens": 4966074.0, "step": 381 }, { "epoch": 0.24448, "grad_norm": 3.9059042930603027, "learning_rate": 5e-06, "loss": 1.3658, "mean_token_accuracy": 0.6580025032162666, "num_tokens": 4977899.0, "step": 382 }, { "epoch": 0.24512, "grad_norm": 3.895254135131836, "learning_rate": 5e-06, "loss": 1.2906, "mean_token_accuracy": 0.659791849553585, "num_tokens": 4990481.0, "step": 383 }, { "epoch": 0.24576, "grad_norm": 3.6709907054901123, "learning_rate": 5e-06, "loss": 1.243, "mean_token_accuracy": 0.6784983575344086, "num_tokens": 5003987.0, "step": 384 }, { "epoch": 0.2464, "grad_norm": 3.8411707878112793, "learning_rate": 5e-06, "loss": 1.3464, "mean_token_accuracy": 0.6375136002898216, "num_tokens": 5015850.0, "step": 385 }, { "epoch": 0.24704, "grad_norm": 4.552581787109375, "learning_rate": 5e-06, "loss": 1.5245, "mean_token_accuracy": 0.6128373965620995, "num_tokens": 5026044.0, "step": 386 }, { "epoch": 0.24768, "grad_norm": 3.8649439811706543, "learning_rate": 5e-06, "loss": 1.4339, "mean_token_accuracy": 0.6444417163729668, "num_tokens": 5039126.0, "step": 387 }, { "epoch": 0.24832, "grad_norm": 4.057676315307617, "learning_rate": 5e-06, "loss": 1.352, "mean_token_accuracy": 0.6415472850203514, "num_tokens": 5051247.0, "step": 388 }, { "epoch": 0.24896, "grad_norm": 4.093824863433838, "learning_rate": 5e-06, "loss": 1.3294, "mean_token_accuracy": 0.6551511734724045, "num_tokens": 5066290.0, "step": 389 }, { "epoch": 0.2496, "grad_norm": 3.478832244873047, "learning_rate": 5e-06, "loss": 1.2387, "mean_token_accuracy": 0.6599762067198753, "num_tokens": 5079904.0, "step": 390 }, { "epoch": 0.25024, "grad_norm": 3.4885847568511963, "learning_rate": 5e-06, "loss": 1.2518, "mean_token_accuracy": 0.6731147542595863, "num_tokens": 5093663.0, "step": 391 }, { "epoch": 0.25088, "grad_norm": 3.4742021560668945, "learning_rate": 5e-06, "loss": 1.1443, "mean_token_accuracy": 0.7067101299762726, "num_tokens": 5108375.0, "step": 392 }, { "epoch": 0.25152, "grad_norm": 3.459711790084839, "learning_rate": 5e-06, "loss": 1.2421, "mean_token_accuracy": 0.677531287074089, "num_tokens": 5121820.0, "step": 393 }, { "epoch": 0.25216, "grad_norm": 3.607994794845581, "learning_rate": 5e-06, "loss": 1.7177, "mean_token_accuracy": 0.5983672738075256, "num_tokens": 5136753.0, "step": 394 }, { "epoch": 0.2528, "grad_norm": 3.9843177795410156, "learning_rate": 5e-06, "loss": 1.2271, "mean_token_accuracy": 0.6913007572293282, "num_tokens": 5148997.0, "step": 395 }, { "epoch": 0.25344, "grad_norm": 3.323129177093506, "learning_rate": 5e-06, "loss": 1.4278, "mean_token_accuracy": 0.6322130486369133, "num_tokens": 5163285.0, "step": 396 }, { "epoch": 0.25408, "grad_norm": 4.542083740234375, "learning_rate": 5e-06, "loss": 1.3214, "mean_token_accuracy": 0.676998108625412, "num_tokens": 5174430.0, "step": 397 }, { "epoch": 0.25472, "grad_norm": 3.523313045501709, "learning_rate": 5e-06, "loss": 1.5198, "mean_token_accuracy": 0.6127360239624977, "num_tokens": 5188411.0, "step": 398 }, { "epoch": 0.25536, "grad_norm": 3.990492820739746, "learning_rate": 5e-06, "loss": 1.4177, "mean_token_accuracy": 0.6671818047761917, "num_tokens": 5199067.0, "step": 399 }, { "epoch": 0.256, "grad_norm": 3.5755157470703125, "learning_rate": 5e-06, "loss": 1.3593, "mean_token_accuracy": 0.6605222076177597, "num_tokens": 5212285.0, "step": 400 }, { "epoch": 0.25664, "grad_norm": 3.8733558654785156, "learning_rate": 5e-06, "loss": 1.2535, "mean_token_accuracy": 0.6705236658453941, "num_tokens": 5224693.0, "step": 401 }, { "epoch": 0.25728, "grad_norm": 3.86195707321167, "learning_rate": 5e-06, "loss": 1.0007, "mean_token_accuracy": 0.7054353207349777, "num_tokens": 5235503.0, "step": 402 }, { "epoch": 0.25792, "grad_norm": 4.819467067718506, "learning_rate": 5e-06, "loss": 1.2544, "mean_token_accuracy": 0.6714291796088219, "num_tokens": 5244676.0, "step": 403 }, { "epoch": 0.25856, "grad_norm": 4.117583274841309, "learning_rate": 5e-06, "loss": 1.4205, "mean_token_accuracy": 0.6371640935540199, "num_tokens": 5259342.0, "step": 404 }, { "epoch": 0.2592, "grad_norm": 3.8214738368988037, "learning_rate": 5e-06, "loss": 1.2089, "mean_token_accuracy": 0.682219110429287, "num_tokens": 5271812.0, "step": 405 }, { "epoch": 0.25984, "grad_norm": 4.264610290527344, "learning_rate": 5e-06, "loss": 1.2525, "mean_token_accuracy": 0.6648172214627266, "num_tokens": 5285329.0, "step": 406 }, { "epoch": 0.26048, "grad_norm": 3.759557008743286, "learning_rate": 5e-06, "loss": 1.2922, "mean_token_accuracy": 0.6575791016221046, "num_tokens": 5298290.0, "step": 407 }, { "epoch": 0.26112, "grad_norm": 5.103738784790039, "learning_rate": 5e-06, "loss": 1.3045, "mean_token_accuracy": 0.6531935781240463, "num_tokens": 5313458.0, "step": 408 }, { "epoch": 0.26176, "grad_norm": 4.379658222198486, "learning_rate": 5e-06, "loss": 1.2592, "mean_token_accuracy": 0.6718562245368958, "num_tokens": 5324820.0, "step": 409 }, { "epoch": 0.2624, "grad_norm": 3.613741636276245, "learning_rate": 5e-06, "loss": 1.3052, "mean_token_accuracy": 0.6661521196365356, "num_tokens": 5340445.0, "step": 410 }, { "epoch": 0.26304, "grad_norm": 3.643263578414917, "learning_rate": 5e-06, "loss": 1.4656, "mean_token_accuracy": 0.6501626446843147, "num_tokens": 5353074.0, "step": 411 }, { "epoch": 0.26368, "grad_norm": 3.359731912612915, "learning_rate": 5e-06, "loss": 1.1761, "mean_token_accuracy": 0.6895303055644035, "num_tokens": 5367294.0, "step": 412 }, { "epoch": 0.26432, "grad_norm": 4.145616054534912, "learning_rate": 5e-06, "loss": 1.3095, "mean_token_accuracy": 0.6614864692091942, "num_tokens": 5378260.0, "step": 413 }, { "epoch": 0.26496, "grad_norm": 4.191911697387695, "learning_rate": 5e-06, "loss": 1.4995, "mean_token_accuracy": 0.6586913987994194, "num_tokens": 5390393.0, "step": 414 }, { "epoch": 0.2656, "grad_norm": 3.9197440147399902, "learning_rate": 5e-06, "loss": 1.2837, "mean_token_accuracy": 0.649936854839325, "num_tokens": 5404355.0, "step": 415 }, { "epoch": 0.26624, "grad_norm": 3.791869640350342, "learning_rate": 5e-06, "loss": 1.3852, "mean_token_accuracy": 0.6447301283478737, "num_tokens": 5418228.0, "step": 416 }, { "epoch": 0.26688, "grad_norm": 3.4961142539978027, "learning_rate": 5e-06, "loss": 1.2148, "mean_token_accuracy": 0.6757354438304901, "num_tokens": 5431981.0, "step": 417 }, { "epoch": 0.26752, "grad_norm": 3.777859687805176, "learning_rate": 5e-06, "loss": 1.2461, "mean_token_accuracy": 0.6744889244437218, "num_tokens": 5446699.0, "step": 418 }, { "epoch": 0.26816, "grad_norm": 4.008702754974365, "learning_rate": 5e-06, "loss": 1.4865, "mean_token_accuracy": 0.628353901207447, "num_tokens": 5459373.0, "step": 419 }, { "epoch": 0.2688, "grad_norm": 3.69231915473938, "learning_rate": 5e-06, "loss": 1.418, "mean_token_accuracy": 0.6555268168449402, "num_tokens": 5473223.0, "step": 420 }, { "epoch": 0.26944, "grad_norm": 3.597212314605713, "learning_rate": 5e-06, "loss": 1.4661, "mean_token_accuracy": 0.6289801895618439, "num_tokens": 5487849.0, "step": 421 }, { "epoch": 0.27008, "grad_norm": 3.8283562660217285, "learning_rate": 5e-06, "loss": 1.2745, "mean_token_accuracy": 0.6649068146944046, "num_tokens": 5499444.0, "step": 422 }, { "epoch": 0.27072, "grad_norm": 3.896993398666382, "learning_rate": 5e-06, "loss": 1.3163, "mean_token_accuracy": 0.6707305237650871, "num_tokens": 5512030.0, "step": 423 }, { "epoch": 0.27136, "grad_norm": 3.609224557876587, "learning_rate": 5e-06, "loss": 1.2482, "mean_token_accuracy": 0.6678152307868004, "num_tokens": 5525475.0, "step": 424 }, { "epoch": 0.272, "grad_norm": 3.715836763381958, "learning_rate": 5e-06, "loss": 1.1806, "mean_token_accuracy": 0.6893536150455475, "num_tokens": 5536706.0, "step": 425 }, { "epoch": 0.27264, "grad_norm": 4.006832599639893, "learning_rate": 5e-06, "loss": 1.3409, "mean_token_accuracy": 0.6921984776854515, "num_tokens": 5551024.0, "step": 426 }, { "epoch": 0.27328, "grad_norm": 3.5625905990600586, "learning_rate": 5e-06, "loss": 1.3058, "mean_token_accuracy": 0.6508674696087837, "num_tokens": 5566008.0, "step": 427 }, { "epoch": 0.27392, "grad_norm": 3.7165002822875977, "learning_rate": 5e-06, "loss": 1.4402, "mean_token_accuracy": 0.6360224187374115, "num_tokens": 5579661.0, "step": 428 }, { "epoch": 0.27456, "grad_norm": 3.702185869216919, "learning_rate": 5e-06, "loss": 1.3791, "mean_token_accuracy": 0.6388497278094292, "num_tokens": 5593091.0, "step": 429 }, { "epoch": 0.2752, "grad_norm": 3.397646188735962, "learning_rate": 5e-06, "loss": 1.4501, "mean_token_accuracy": 0.6443121284246445, "num_tokens": 5607808.0, "step": 430 }, { "epoch": 0.27584, "grad_norm": 4.425196170806885, "learning_rate": 5e-06, "loss": 1.2816, "mean_token_accuracy": 0.646281823515892, "num_tokens": 5619010.0, "step": 431 }, { "epoch": 0.27648, "grad_norm": 3.7968697547912598, "learning_rate": 5e-06, "loss": 1.4615, "mean_token_accuracy": 0.6492092609405518, "num_tokens": 5634182.0, "step": 432 }, { "epoch": 0.27712, "grad_norm": 3.3441648483276367, "learning_rate": 5e-06, "loss": 1.4186, "mean_token_accuracy": 0.6269052773714066, "num_tokens": 5647759.0, "step": 433 }, { "epoch": 0.27776, "grad_norm": 3.4352946281433105, "learning_rate": 5e-06, "loss": 1.3009, "mean_token_accuracy": 0.666948527097702, "num_tokens": 5662089.0, "step": 434 }, { "epoch": 0.2784, "grad_norm": 3.8102269172668457, "learning_rate": 5e-06, "loss": 1.3815, "mean_token_accuracy": 0.6399514004588127, "num_tokens": 5674982.0, "step": 435 }, { "epoch": 0.27904, "grad_norm": 3.747889995574951, "learning_rate": 5e-06, "loss": 1.5238, "mean_token_accuracy": 0.6195821687579155, "num_tokens": 5687944.0, "step": 436 }, { "epoch": 0.27968, "grad_norm": 3.963461399078369, "learning_rate": 5e-06, "loss": 1.2936, "mean_token_accuracy": 0.6649496257305145, "num_tokens": 5699002.0, "step": 437 }, { "epoch": 0.28032, "grad_norm": 3.3493547439575195, "learning_rate": 5e-06, "loss": 1.346, "mean_token_accuracy": 0.6441225036978722, "num_tokens": 5712275.0, "step": 438 }, { "epoch": 0.28096, "grad_norm": 3.779747247695923, "learning_rate": 5e-06, "loss": 1.5187, "mean_token_accuracy": 0.6183040626347065, "num_tokens": 5726089.0, "step": 439 }, { "epoch": 0.2816, "grad_norm": 3.8327977657318115, "learning_rate": 5e-06, "loss": 1.301, "mean_token_accuracy": 0.6831925585865974, "num_tokens": 5737061.0, "step": 440 }, { "epoch": 0.28224, "grad_norm": 3.559340476989746, "learning_rate": 5e-06, "loss": 1.286, "mean_token_accuracy": 0.667030468583107, "num_tokens": 5751187.0, "step": 441 }, { "epoch": 0.28288, "grad_norm": 3.395509719848633, "learning_rate": 5e-06, "loss": 1.1954, "mean_token_accuracy": 0.6770320907235146, "num_tokens": 5765243.0, "step": 442 }, { "epoch": 0.28352, "grad_norm": 4.320680618286133, "learning_rate": 5e-06, "loss": 1.4825, "mean_token_accuracy": 0.6353371068835258, "num_tokens": 5776809.0, "step": 443 }, { "epoch": 0.28416, "grad_norm": 4.229187488555908, "learning_rate": 5e-06, "loss": 1.3178, "mean_token_accuracy": 0.6752159968018532, "num_tokens": 5788234.0, "step": 444 }, { "epoch": 0.2848, "grad_norm": 3.9184088706970215, "learning_rate": 5e-06, "loss": 1.2316, "mean_token_accuracy": 0.6824081540107727, "num_tokens": 5799793.0, "step": 445 }, { "epoch": 0.28544, "grad_norm": 4.083866596221924, "learning_rate": 5e-06, "loss": 1.4558, "mean_token_accuracy": 0.6533151641488075, "num_tokens": 5812228.0, "step": 446 }, { "epoch": 0.28608, "grad_norm": 4.136886119842529, "learning_rate": 5e-06, "loss": 1.2802, "mean_token_accuracy": 0.6800569593906403, "num_tokens": 5822937.0, "step": 447 }, { "epoch": 0.28672, "grad_norm": 3.92091965675354, "learning_rate": 5e-06, "loss": 1.253, "mean_token_accuracy": 0.6916609779000282, "num_tokens": 5835964.0, "step": 448 }, { "epoch": 0.28736, "grad_norm": 3.784158706665039, "learning_rate": 5e-06, "loss": 1.2972, "mean_token_accuracy": 0.674082837998867, "num_tokens": 5847766.0, "step": 449 }, { "epoch": 0.288, "grad_norm": 4.055779933929443, "learning_rate": 5e-06, "loss": 1.5458, "mean_token_accuracy": 0.6409785822033882, "num_tokens": 5860297.0, "step": 450 }, { "epoch": 0.28864, "grad_norm": 4.014561176300049, "learning_rate": 5e-06, "loss": 1.239, "mean_token_accuracy": 0.6951504573225975, "num_tokens": 5871881.0, "step": 451 }, { "epoch": 0.28928, "grad_norm": 3.908066987991333, "learning_rate": 5e-06, "loss": 1.1405, "mean_token_accuracy": 0.6899219900369644, "num_tokens": 5883233.0, "step": 452 }, { "epoch": 0.28992, "grad_norm": 3.5451455116271973, "learning_rate": 5e-06, "loss": 1.4424, "mean_token_accuracy": 0.6282008588314056, "num_tokens": 5897342.0, "step": 453 }, { "epoch": 0.29056, "grad_norm": 3.9957897663116455, "learning_rate": 5e-06, "loss": 1.2495, "mean_token_accuracy": 0.6782330796122551, "num_tokens": 5909429.0, "step": 454 }, { "epoch": 0.2912, "grad_norm": 3.5935301780700684, "learning_rate": 5e-06, "loss": 1.292, "mean_token_accuracy": 0.6655653864145279, "num_tokens": 5923163.0, "step": 455 }, { "epoch": 0.29184, "grad_norm": 3.677741765975952, "learning_rate": 5e-06, "loss": 1.3873, "mean_token_accuracy": 0.6691123694181442, "num_tokens": 5938353.0, "step": 456 }, { "epoch": 0.29248, "grad_norm": 3.7560808658599854, "learning_rate": 5e-06, "loss": 1.2574, "mean_token_accuracy": 0.6840595826506615, "num_tokens": 5950566.0, "step": 457 }, { "epoch": 0.29312, "grad_norm": 4.219088077545166, "learning_rate": 5e-06, "loss": 1.4552, "mean_token_accuracy": 0.6695370376110077, "num_tokens": 5963475.0, "step": 458 }, { "epoch": 0.29376, "grad_norm": 4.02653169631958, "learning_rate": 5e-06, "loss": 1.3478, "mean_token_accuracy": 0.6490977182984352, "num_tokens": 5974934.0, "step": 459 }, { "epoch": 0.2944, "grad_norm": 3.8300678730010986, "learning_rate": 5e-06, "loss": 1.2826, "mean_token_accuracy": 0.64987413585186, "num_tokens": 5987250.0, "step": 460 }, { "epoch": 0.29504, "grad_norm": 3.818307876586914, "learning_rate": 5e-06, "loss": 1.1296, "mean_token_accuracy": 0.6943321749567986, "num_tokens": 5999834.0, "step": 461 }, { "epoch": 0.29568, "grad_norm": 3.6047048568725586, "learning_rate": 5e-06, "loss": 1.168, "mean_token_accuracy": 0.6777333468198776, "num_tokens": 6012454.0, "step": 462 }, { "epoch": 0.29632, "grad_norm": 3.4579696655273438, "learning_rate": 5e-06, "loss": 1.4147, "mean_token_accuracy": 0.6355468481779099, "num_tokens": 6026563.0, "step": 463 }, { "epoch": 0.29696, "grad_norm": 4.736328125, "learning_rate": 5e-06, "loss": 1.3852, "mean_token_accuracy": 0.6531487628817558, "num_tokens": 6037687.0, "step": 464 }, { "epoch": 0.2976, "grad_norm": 5.537712574005127, "learning_rate": 5e-06, "loss": 1.3899, "mean_token_accuracy": 0.6446737200021744, "num_tokens": 6052621.0, "step": 465 }, { "epoch": 0.29824, "grad_norm": 4.118095397949219, "learning_rate": 5e-06, "loss": 1.4534, "mean_token_accuracy": 0.6429826766252518, "num_tokens": 6065725.0, "step": 466 }, { "epoch": 0.29888, "grad_norm": 3.415851354598999, "learning_rate": 5e-06, "loss": 1.1162, "mean_token_accuracy": 0.6864155679941177, "num_tokens": 6080168.0, "step": 467 }, { "epoch": 0.29952, "grad_norm": 3.098151922225952, "learning_rate": 5e-06, "loss": 1.3817, "mean_token_accuracy": 0.6485566720366478, "num_tokens": 6096552.0, "step": 468 }, { "epoch": 0.30016, "grad_norm": 4.419194221496582, "learning_rate": 5e-06, "loss": 1.2934, "mean_token_accuracy": 0.6655605882406235, "num_tokens": 6107311.0, "step": 469 }, { "epoch": 0.3008, "grad_norm": 2.9706687927246094, "learning_rate": 5e-06, "loss": 1.4043, "mean_token_accuracy": 0.6342752501368523, "num_tokens": 6123233.0, "step": 470 }, { "epoch": 0.30144, "grad_norm": 4.0415940284729, "learning_rate": 5e-06, "loss": 1.4607, "mean_token_accuracy": 0.6415122263133526, "num_tokens": 6133347.0, "step": 471 }, { "epoch": 0.30208, "grad_norm": 3.6789848804473877, "learning_rate": 5e-06, "loss": 1.359, "mean_token_accuracy": 0.6488511562347412, "num_tokens": 6147619.0, "step": 472 }, { "epoch": 0.30272, "grad_norm": 3.8090357780456543, "learning_rate": 5e-06, "loss": 1.3368, "mean_token_accuracy": 0.638521321117878, "num_tokens": 6159564.0, "step": 473 }, { "epoch": 0.30336, "grad_norm": 3.4183847904205322, "learning_rate": 5e-06, "loss": 1.428, "mean_token_accuracy": 0.6213861741125584, "num_tokens": 6173025.0, "step": 474 }, { "epoch": 0.304, "grad_norm": 3.822892427444458, "learning_rate": 5e-06, "loss": 1.0865, "mean_token_accuracy": 0.6979233846068382, "num_tokens": 6185966.0, "step": 475 }, { "epoch": 0.30464, "grad_norm": 3.686979293823242, "learning_rate": 5e-06, "loss": 1.2195, "mean_token_accuracy": 0.6794964447617531, "num_tokens": 6198680.0, "step": 476 }, { "epoch": 0.30528, "grad_norm": 3.797368049621582, "learning_rate": 5e-06, "loss": 1.3273, "mean_token_accuracy": 0.6465971991419792, "num_tokens": 6212513.0, "step": 477 }, { "epoch": 0.30592, "grad_norm": 3.9698474407196045, "learning_rate": 5e-06, "loss": 1.3636, "mean_token_accuracy": 0.6398535817861557, "num_tokens": 6224341.0, "step": 478 }, { "epoch": 0.30656, "grad_norm": 3.755352258682251, "learning_rate": 5e-06, "loss": 1.2421, "mean_token_accuracy": 0.6844679713249207, "num_tokens": 6236749.0, "step": 479 }, { "epoch": 0.3072, "grad_norm": 3.6229302883148193, "learning_rate": 5e-06, "loss": 1.1653, "mean_token_accuracy": 0.7136622071266174, "num_tokens": 6250516.0, "step": 480 }, { "epoch": 0.30784, "grad_norm": 4.006715774536133, "learning_rate": 5e-06, "loss": 1.0948, "mean_token_accuracy": 0.6872854009270668, "num_tokens": 6264775.0, "step": 481 }, { "epoch": 0.30848, "grad_norm": 3.036703586578369, "learning_rate": 5e-06, "loss": 1.4079, "mean_token_accuracy": 0.6424620673060417, "num_tokens": 6280727.0, "step": 482 }, { "epoch": 0.30912, "grad_norm": 3.861215114593506, "learning_rate": 5e-06, "loss": 1.5746, "mean_token_accuracy": 0.6214606538414955, "num_tokens": 6294874.0, "step": 483 }, { "epoch": 0.30976, "grad_norm": 3.6067492961883545, "learning_rate": 5e-06, "loss": 1.2666, "mean_token_accuracy": 0.6533055976033211, "num_tokens": 6308075.0, "step": 484 }, { "epoch": 0.3104, "grad_norm": 3.8777058124542236, "learning_rate": 5e-06, "loss": 1.4112, "mean_token_accuracy": 0.6393994837999344, "num_tokens": 6319998.0, "step": 485 }, { "epoch": 0.31104, "grad_norm": 3.640782594680786, "learning_rate": 5e-06, "loss": 1.3312, "mean_token_accuracy": 0.6485870778560638, "num_tokens": 6332589.0, "step": 486 }, { "epoch": 0.31168, "grad_norm": 3.792318344116211, "learning_rate": 5e-06, "loss": 1.4024, "mean_token_accuracy": 0.6588046550750732, "num_tokens": 6345037.0, "step": 487 }, { "epoch": 0.31232, "grad_norm": 3.5393240451812744, "learning_rate": 5e-06, "loss": 1.3419, "mean_token_accuracy": 0.6457289680838585, "num_tokens": 6359457.0, "step": 488 }, { "epoch": 0.31296, "grad_norm": 3.974876642227173, "learning_rate": 5e-06, "loss": 1.1741, "mean_token_accuracy": 0.6610330641269684, "num_tokens": 6370983.0, "step": 489 }, { "epoch": 0.3136, "grad_norm": 3.6941604614257812, "learning_rate": 5e-06, "loss": 1.2241, "mean_token_accuracy": 0.686374232172966, "num_tokens": 6384168.0, "step": 490 }, { "epoch": 0.31424, "grad_norm": 4.212184906005859, "learning_rate": 5e-06, "loss": 1.4216, "mean_token_accuracy": 0.6811521798372269, "num_tokens": 6395881.0, "step": 491 }, { "epoch": 0.31488, "grad_norm": 3.925226926803589, "learning_rate": 5e-06, "loss": 1.3582, "mean_token_accuracy": 0.6342300400137901, "num_tokens": 6409919.0, "step": 492 }, { "epoch": 0.31552, "grad_norm": 3.9599673748016357, "learning_rate": 5e-06, "loss": 1.3918, "mean_token_accuracy": 0.6489474773406982, "num_tokens": 6422097.0, "step": 493 }, { "epoch": 0.31616, "grad_norm": 3.42258358001709, "learning_rate": 5e-06, "loss": 1.1146, "mean_token_accuracy": 0.6913427859544754, "num_tokens": 6435173.0, "step": 494 }, { "epoch": 0.3168, "grad_norm": 4.284220218658447, "learning_rate": 5e-06, "loss": 1.4547, "mean_token_accuracy": 0.6440516263246536, "num_tokens": 6447321.0, "step": 495 }, { "epoch": 0.31744, "grad_norm": 3.7218246459960938, "learning_rate": 5e-06, "loss": 1.3806, "mean_token_accuracy": 0.629929706454277, "num_tokens": 6460270.0, "step": 496 }, { "epoch": 0.31808, "grad_norm": 3.406933546066284, "learning_rate": 5e-06, "loss": 1.1401, "mean_token_accuracy": 0.67889504134655, "num_tokens": 6473352.0, "step": 497 }, { "epoch": 0.31872, "grad_norm": 3.2584404945373535, "learning_rate": 5e-06, "loss": 1.2213, "mean_token_accuracy": 0.6629000529646873, "num_tokens": 6487623.0, "step": 498 }, { "epoch": 0.31936, "grad_norm": 4.134445667266846, "learning_rate": 5e-06, "loss": 1.1397, "mean_token_accuracy": 0.6819293051958084, "num_tokens": 6499986.0, "step": 499 }, { "epoch": 0.32, "grad_norm": 4.104599475860596, "learning_rate": 5e-06, "loss": 1.4537, "mean_token_accuracy": 0.6394720375537872, "num_tokens": 6512724.0, "step": 500 }, { "epoch": 0.32064, "grad_norm": 3.4379241466522217, "learning_rate": 5e-06, "loss": 1.3714, "mean_token_accuracy": 0.6542030349373817, "num_tokens": 6526727.0, "step": 501 }, { "epoch": 0.32128, "grad_norm": 3.4537572860717773, "learning_rate": 5e-06, "loss": 1.3053, "mean_token_accuracy": 0.6661063358187675, "num_tokens": 6542663.0, "step": 502 }, { "epoch": 0.32192, "grad_norm": 3.5106639862060547, "learning_rate": 5e-06, "loss": 1.3694, "mean_token_accuracy": 0.6464217305183411, "num_tokens": 6557847.0, "step": 503 }, { "epoch": 0.32256, "grad_norm": 3.924419641494751, "learning_rate": 5e-06, "loss": 1.2544, "mean_token_accuracy": 0.6603437811136246, "num_tokens": 6570515.0, "step": 504 }, { "epoch": 0.3232, "grad_norm": 3.385101318359375, "learning_rate": 5e-06, "loss": 1.4872, "mean_token_accuracy": 0.6421084851026535, "num_tokens": 6584786.0, "step": 505 }, { "epoch": 0.32384, "grad_norm": 3.378535032272339, "learning_rate": 5e-06, "loss": 1.1475, "mean_token_accuracy": 0.6894903257489204, "num_tokens": 6598436.0, "step": 506 }, { "epoch": 0.32448, "grad_norm": 4.74169397354126, "learning_rate": 5e-06, "loss": 1.4346, "mean_token_accuracy": 0.6547307670116425, "num_tokens": 6610436.0, "step": 507 }, { "epoch": 0.32512, "grad_norm": 3.473893165588379, "learning_rate": 5e-06, "loss": 1.3827, "mean_token_accuracy": 0.6525059714913368, "num_tokens": 6626032.0, "step": 508 }, { "epoch": 0.32576, "grad_norm": 4.2575273513793945, "learning_rate": 5e-06, "loss": 1.4238, "mean_token_accuracy": 0.6410808116197586, "num_tokens": 6637760.0, "step": 509 }, { "epoch": 0.3264, "grad_norm": 3.5705769062042236, "learning_rate": 5e-06, "loss": 1.2437, "mean_token_accuracy": 0.6628687754273415, "num_tokens": 6652912.0, "step": 510 }, { "epoch": 0.32704, "grad_norm": 4.5391011238098145, "learning_rate": 5e-06, "loss": 1.292, "mean_token_accuracy": 0.6497529372572899, "num_tokens": 6664537.0, "step": 511 }, { "epoch": 0.32768, "grad_norm": 4.8541789054870605, "learning_rate": 5e-06, "loss": 1.3102, "mean_token_accuracy": 0.6663089245557785, "num_tokens": 6674380.0, "step": 512 }, { "epoch": 0.32832, "grad_norm": 4.8184332847595215, "learning_rate": 5e-06, "loss": 1.3336, "mean_token_accuracy": 0.6582650914788246, "num_tokens": 6686208.0, "step": 513 }, { "epoch": 0.32896, "grad_norm": 3.9492416381835938, "learning_rate": 5e-06, "loss": 1.1609, "mean_token_accuracy": 0.6893665343523026, "num_tokens": 6698802.0, "step": 514 }, { "epoch": 0.3296, "grad_norm": 3.2947769165039062, "learning_rate": 5e-06, "loss": 1.3108, "mean_token_accuracy": 0.6393668726086617, "num_tokens": 6714353.0, "step": 515 }, { "epoch": 0.33024, "grad_norm": 3.5826685428619385, "learning_rate": 5e-06, "loss": 1.2592, "mean_token_accuracy": 0.663002572953701, "num_tokens": 6728955.0, "step": 516 }, { "epoch": 0.33088, "grad_norm": 6.832690238952637, "learning_rate": 5e-06, "loss": 1.2339, "mean_token_accuracy": 0.6615518927574158, "num_tokens": 6742119.0, "step": 517 }, { "epoch": 0.33152, "grad_norm": 3.935009479522705, "learning_rate": 5e-06, "loss": 1.2767, "mean_token_accuracy": 0.6649063900113106, "num_tokens": 6754185.0, "step": 518 }, { "epoch": 0.33216, "grad_norm": 4.145579814910889, "learning_rate": 5e-06, "loss": 1.5135, "mean_token_accuracy": 0.6258162558078766, "num_tokens": 6765367.0, "step": 519 }, { "epoch": 0.3328, "grad_norm": 3.592618227005005, "learning_rate": 5e-06, "loss": 1.4578, "mean_token_accuracy": 0.623950220644474, "num_tokens": 6778122.0, "step": 520 }, { "epoch": 0.33344, "grad_norm": 5.795764923095703, "learning_rate": 5e-06, "loss": 1.4626, "mean_token_accuracy": 0.6486967876553535, "num_tokens": 6790660.0, "step": 521 }, { "epoch": 0.33408, "grad_norm": 4.278341293334961, "learning_rate": 5e-06, "loss": 1.1897, "mean_token_accuracy": 0.6738953441381454, "num_tokens": 6802594.0, "step": 522 }, { "epoch": 0.33472, "grad_norm": 4.899449825286865, "learning_rate": 5e-06, "loss": 1.3754, "mean_token_accuracy": 0.6378564760088921, "num_tokens": 6818415.0, "step": 523 }, { "epoch": 0.33536, "grad_norm": 4.733186721801758, "learning_rate": 5e-06, "loss": 1.36, "mean_token_accuracy": 0.6564139500260353, "num_tokens": 6831504.0, "step": 524 }, { "epoch": 0.336, "grad_norm": 3.7966043949127197, "learning_rate": 5e-06, "loss": 1.2975, "mean_token_accuracy": 0.6624530181288719, "num_tokens": 6843279.0, "step": 525 }, { "epoch": 0.33664, "grad_norm": 5.124260425567627, "learning_rate": 5e-06, "loss": 1.5377, "mean_token_accuracy": 0.6152323558926582, "num_tokens": 6854556.0, "step": 526 }, { "epoch": 0.33728, "grad_norm": 4.210925579071045, "learning_rate": 5e-06, "loss": 1.3342, "mean_token_accuracy": 0.6440554708242416, "num_tokens": 6867858.0, "step": 527 }, { "epoch": 0.33792, "grad_norm": 3.751556873321533, "learning_rate": 5e-06, "loss": 1.4675, "mean_token_accuracy": 0.6253782510757446, "num_tokens": 6881065.0, "step": 528 }, { "epoch": 0.33856, "grad_norm": 6.117438793182373, "learning_rate": 5e-06, "loss": 1.356, "mean_token_accuracy": 0.6668061912059784, "num_tokens": 6894733.0, "step": 529 }, { "epoch": 0.3392, "grad_norm": 3.5207901000976562, "learning_rate": 5e-06, "loss": 1.2676, "mean_token_accuracy": 0.6576649472117424, "num_tokens": 6907851.0, "step": 530 }, { "epoch": 0.33984, "grad_norm": 3.6760780811309814, "learning_rate": 5e-06, "loss": 1.1176, "mean_token_accuracy": 0.6879062727093697, "num_tokens": 6921679.0, "step": 531 }, { "epoch": 0.34048, "grad_norm": 4.656152725219727, "learning_rate": 5e-06, "loss": 1.42, "mean_token_accuracy": 0.642581582069397, "num_tokens": 6935677.0, "step": 532 }, { "epoch": 0.34112, "grad_norm": 5.187691688537598, "learning_rate": 5e-06, "loss": 1.4074, "mean_token_accuracy": 0.668558657169342, "num_tokens": 6945440.0, "step": 533 }, { "epoch": 0.34176, "grad_norm": 7.5727949142456055, "learning_rate": 5e-06, "loss": 1.4775, "mean_token_accuracy": 0.6168788969516754, "num_tokens": 6959496.0, "step": 534 }, { "epoch": 0.3424, "grad_norm": 3.821122646331787, "learning_rate": 5e-06, "loss": 1.1118, "mean_token_accuracy": 0.6967110335826874, "num_tokens": 6970897.0, "step": 535 }, { "epoch": 0.34304, "grad_norm": 3.28977108001709, "learning_rate": 5e-06, "loss": 1.2668, "mean_token_accuracy": 0.6639266163110733, "num_tokens": 6986271.0, "step": 536 }, { "epoch": 0.34368, "grad_norm": 4.031164646148682, "learning_rate": 5e-06, "loss": 1.2953, "mean_token_accuracy": 0.6541831567883492, "num_tokens": 6998841.0, "step": 537 }, { "epoch": 0.34432, "grad_norm": 5.315206527709961, "learning_rate": 5e-06, "loss": 1.2881, "mean_token_accuracy": 0.6494873613119125, "num_tokens": 7008652.0, "step": 538 }, { "epoch": 0.34496, "grad_norm": 5.740390777587891, "learning_rate": 5e-06, "loss": 1.3778, "mean_token_accuracy": 0.6532981097698212, "num_tokens": 7020915.0, "step": 539 }, { "epoch": 0.3456, "grad_norm": 5.474863529205322, "learning_rate": 5e-06, "loss": 1.1128, "mean_token_accuracy": 0.7052044421434402, "num_tokens": 7032119.0, "step": 540 }, { "epoch": 0.34624, "grad_norm": 4.56429386138916, "learning_rate": 5e-06, "loss": 1.2831, "mean_token_accuracy": 0.6683964505791664, "num_tokens": 7045586.0, "step": 541 }, { "epoch": 0.34688, "grad_norm": 3.815187454223633, "learning_rate": 5e-06, "loss": 1.1035, "mean_token_accuracy": 0.7000684291124344, "num_tokens": 7057244.0, "step": 542 }, { "epoch": 0.34752, "grad_norm": 6.026943683624268, "learning_rate": 5e-06, "loss": 1.2064, "mean_token_accuracy": 0.6807686313986778, "num_tokens": 7068752.0, "step": 543 }, { "epoch": 0.34816, "grad_norm": 4.224482536315918, "learning_rate": 5e-06, "loss": 1.2508, "mean_token_accuracy": 0.6724436059594154, "num_tokens": 7082898.0, "step": 544 }, { "epoch": 0.3488, "grad_norm": 7.96382474899292, "learning_rate": 5e-06, "loss": 1.1555, "mean_token_accuracy": 0.6773002594709396, "num_tokens": 7095886.0, "step": 545 }, { "epoch": 0.34944, "grad_norm": 4.775862693786621, "learning_rate": 5e-06, "loss": 1.2992, "mean_token_accuracy": 0.6667480766773224, "num_tokens": 7107762.0, "step": 546 }, { "epoch": 0.35008, "grad_norm": 3.49412202835083, "learning_rate": 5e-06, "loss": 1.1785, "mean_token_accuracy": 0.7002041935920715, "num_tokens": 7121108.0, "step": 547 }, { "epoch": 0.35072, "grad_norm": 4.250086784362793, "learning_rate": 5e-06, "loss": 1.1921, "mean_token_accuracy": 0.7101981267333031, "num_tokens": 7132883.0, "step": 548 }, { "epoch": 0.35136, "grad_norm": 3.9039688110351562, "learning_rate": 5e-06, "loss": 1.2883, "mean_token_accuracy": 0.6617401614785194, "num_tokens": 7146456.0, "step": 549 }, { "epoch": 0.352, "grad_norm": 3.8325276374816895, "learning_rate": 5e-06, "loss": 1.315, "mean_token_accuracy": 0.6657034084200859, "num_tokens": 7160223.0, "step": 550 }, { "epoch": 0.35264, "grad_norm": 5.472667217254639, "learning_rate": 5e-06, "loss": 1.4434, "mean_token_accuracy": 0.635408416390419, "num_tokens": 7175945.0, "step": 551 }, { "epoch": 0.35328, "grad_norm": 4.009690761566162, "learning_rate": 5e-06, "loss": 1.3101, "mean_token_accuracy": 0.6696026399731636, "num_tokens": 7189467.0, "step": 552 }, { "epoch": 0.35392, "grad_norm": 4.114287853240967, "learning_rate": 5e-06, "loss": 1.2527, "mean_token_accuracy": 0.669425942003727, "num_tokens": 7201319.0, "step": 553 }, { "epoch": 0.35456, "grad_norm": 4.302579402923584, "learning_rate": 5e-06, "loss": 1.3494, "mean_token_accuracy": 0.63911372423172, "num_tokens": 7213327.0, "step": 554 }, { "epoch": 0.3552, "grad_norm": 3.737901210784912, "learning_rate": 5e-06, "loss": 1.3994, "mean_token_accuracy": 0.6371675282716751, "num_tokens": 7226217.0, "step": 555 }, { "epoch": 0.35584, "grad_norm": 3.517141103744507, "learning_rate": 5e-06, "loss": 1.5393, "mean_token_accuracy": 0.615730918943882, "num_tokens": 7240381.0, "step": 556 }, { "epoch": 0.35648, "grad_norm": 4.263305187225342, "learning_rate": 5e-06, "loss": 1.1575, "mean_token_accuracy": 0.6844438910484314, "num_tokens": 7251375.0, "step": 557 }, { "epoch": 0.35712, "grad_norm": 4.197317600250244, "learning_rate": 5e-06, "loss": 1.3062, "mean_token_accuracy": 0.642548106610775, "num_tokens": 7265647.0, "step": 558 }, { "epoch": 0.35776, "grad_norm": 4.2730560302734375, "learning_rate": 5e-06, "loss": 1.2598, "mean_token_accuracy": 0.6705774366855621, "num_tokens": 7277240.0, "step": 559 }, { "epoch": 0.3584, "grad_norm": 5.627854347229004, "learning_rate": 5e-06, "loss": 1.133, "mean_token_accuracy": 0.6944706663489342, "num_tokens": 7293050.0, "step": 560 }, { "epoch": 0.35904, "grad_norm": 5.039371013641357, "learning_rate": 5e-06, "loss": 1.3061, "mean_token_accuracy": 0.652328722178936, "num_tokens": 7305212.0, "step": 561 }, { "epoch": 0.35968, "grad_norm": 4.255235195159912, "learning_rate": 5e-06, "loss": 1.2425, "mean_token_accuracy": 0.6588255614042282, "num_tokens": 7318058.0, "step": 562 }, { "epoch": 0.36032, "grad_norm": 3.5205321311950684, "learning_rate": 5e-06, "loss": 1.3016, "mean_token_accuracy": 0.660710796713829, "num_tokens": 7330484.0, "step": 563 }, { "epoch": 0.36096, "grad_norm": 4.263877868652344, "learning_rate": 5e-06, "loss": 1.2689, "mean_token_accuracy": 0.6572084054350853, "num_tokens": 7342403.0, "step": 564 }, { "epoch": 0.3616, "grad_norm": 3.9740233421325684, "learning_rate": 5e-06, "loss": 1.1917, "mean_token_accuracy": 0.6762436851859093, "num_tokens": 7353974.0, "step": 565 }, { "epoch": 0.36224, "grad_norm": 3.4019787311553955, "learning_rate": 5e-06, "loss": 1.4559, "mean_token_accuracy": 0.6335306763648987, "num_tokens": 7371145.0, "step": 566 }, { "epoch": 0.36288, "grad_norm": 3.6773386001586914, "learning_rate": 5e-06, "loss": 1.3727, "mean_token_accuracy": 0.6477261707186699, "num_tokens": 7385503.0, "step": 567 }, { "epoch": 0.36352, "grad_norm": 3.533553123474121, "learning_rate": 5e-06, "loss": 1.3915, "mean_token_accuracy": 0.6359102874994278, "num_tokens": 7397974.0, "step": 568 }, { "epoch": 0.36416, "grad_norm": 4.083873271942139, "learning_rate": 5e-06, "loss": 1.2959, "mean_token_accuracy": 0.6282073631882668, "num_tokens": 7409071.0, "step": 569 }, { "epoch": 0.3648, "grad_norm": 3.371812582015991, "learning_rate": 5e-06, "loss": 1.3724, "mean_token_accuracy": 0.6561341881752014, "num_tokens": 7425801.0, "step": 570 }, { "epoch": 0.36544, "grad_norm": 5.2290425300598145, "learning_rate": 5e-06, "loss": 1.3808, "mean_token_accuracy": 0.66990677267313, "num_tokens": 7435083.0, "step": 571 }, { "epoch": 0.36608, "grad_norm": 3.8227179050445557, "learning_rate": 5e-06, "loss": 1.2523, "mean_token_accuracy": 0.6640745401382446, "num_tokens": 7445874.0, "step": 572 }, { "epoch": 0.36672, "grad_norm": 3.826213836669922, "learning_rate": 5e-06, "loss": 1.2955, "mean_token_accuracy": 0.6566179618239403, "num_tokens": 7458792.0, "step": 573 }, { "epoch": 0.36736, "grad_norm": 3.166212558746338, "learning_rate": 5e-06, "loss": 1.3854, "mean_token_accuracy": 0.6362887248396873, "num_tokens": 7474364.0, "step": 574 }, { "epoch": 0.368, "grad_norm": 3.225037097930908, "learning_rate": 5e-06, "loss": 1.4097, "mean_token_accuracy": 0.6328474953770638, "num_tokens": 7489627.0, "step": 575 }, { "epoch": 0.36864, "grad_norm": 4.110698699951172, "learning_rate": 5e-06, "loss": 1.2922, "mean_token_accuracy": 0.6546645760536194, "num_tokens": 7501958.0, "step": 576 }, { "epoch": 0.36928, "grad_norm": 3.426607608795166, "learning_rate": 5e-06, "loss": 1.4665, "mean_token_accuracy": 0.6124880164861679, "num_tokens": 7515385.0, "step": 577 }, { "epoch": 0.36992, "grad_norm": 3.6768105030059814, "learning_rate": 5e-06, "loss": 1.2686, "mean_token_accuracy": 0.6734522432088852, "num_tokens": 7528618.0, "step": 578 }, { "epoch": 0.37056, "grad_norm": 3.3351573944091797, "learning_rate": 5e-06, "loss": 1.351, "mean_token_accuracy": 0.6506511121988297, "num_tokens": 7543651.0, "step": 579 }, { "epoch": 0.3712, "grad_norm": 4.15482759475708, "learning_rate": 5e-06, "loss": 1.3044, "mean_token_accuracy": 0.6768280491232872, "num_tokens": 7554851.0, "step": 580 }, { "epoch": 0.37184, "grad_norm": 4.212845802307129, "learning_rate": 5e-06, "loss": 1.3038, "mean_token_accuracy": 0.6363187730312347, "num_tokens": 7567001.0, "step": 581 }, { "epoch": 0.37248, "grad_norm": 4.185598850250244, "learning_rate": 5e-06, "loss": 1.3491, "mean_token_accuracy": 0.6468858942389488, "num_tokens": 7579432.0, "step": 582 }, { "epoch": 0.37312, "grad_norm": 3.4942967891693115, "learning_rate": 5e-06, "loss": 1.364, "mean_token_accuracy": 0.650618351995945, "num_tokens": 7593471.0, "step": 583 }, { "epoch": 0.37376, "grad_norm": 3.4861021041870117, "learning_rate": 5e-06, "loss": 1.1446, "mean_token_accuracy": 0.6844891607761383, "num_tokens": 7608553.0, "step": 584 }, { "epoch": 0.3744, "grad_norm": 3.893850803375244, "learning_rate": 5e-06, "loss": 1.458, "mean_token_accuracy": 0.621130146086216, "num_tokens": 7621367.0, "step": 585 }, { "epoch": 0.37504, "grad_norm": 3.653973340988159, "learning_rate": 5e-06, "loss": 1.3408, "mean_token_accuracy": 0.6439371258020401, "num_tokens": 7634694.0, "step": 586 }, { "epoch": 0.37568, "grad_norm": 3.94148850440979, "learning_rate": 5e-06, "loss": 1.4367, "mean_token_accuracy": 0.6162022799253464, "num_tokens": 7646183.0, "step": 587 }, { "epoch": 0.37632, "grad_norm": 3.2505555152893066, "learning_rate": 5e-06, "loss": 1.4763, "mean_token_accuracy": 0.6298285201191902, "num_tokens": 7661964.0, "step": 588 }, { "epoch": 0.37696, "grad_norm": 3.1683783531188965, "learning_rate": 5e-06, "loss": 1.18, "mean_token_accuracy": 0.6652778312563896, "num_tokens": 7676405.0, "step": 589 }, { "epoch": 0.3776, "grad_norm": 4.221475601196289, "learning_rate": 5e-06, "loss": 1.2668, "mean_token_accuracy": 0.6732780113816261, "num_tokens": 7690168.0, "step": 590 }, { "epoch": 0.37824, "grad_norm": 4.262617111206055, "learning_rate": 5e-06, "loss": 1.2225, "mean_token_accuracy": 0.6608899086713791, "num_tokens": 7700128.0, "step": 591 }, { "epoch": 0.37888, "grad_norm": 3.564286231994629, "learning_rate": 5e-06, "loss": 1.4015, "mean_token_accuracy": 0.6410685330629349, "num_tokens": 7711700.0, "step": 592 }, { "epoch": 0.37952, "grad_norm": 3.5013587474823, "learning_rate": 5e-06, "loss": 1.0532, "mean_token_accuracy": 0.7084442153573036, "num_tokens": 7722670.0, "step": 593 }, { "epoch": 0.38016, "grad_norm": 4.010073661804199, "learning_rate": 5e-06, "loss": 1.2001, "mean_token_accuracy": 0.6808509230613708, "num_tokens": 7734476.0, "step": 594 }, { "epoch": 0.3808, "grad_norm": 3.8407106399536133, "learning_rate": 5e-06, "loss": 1.2772, "mean_token_accuracy": 0.661079652607441, "num_tokens": 7747191.0, "step": 595 }, { "epoch": 0.38144, "grad_norm": 3.9195191860198975, "learning_rate": 5e-06, "loss": 1.2063, "mean_token_accuracy": 0.6758553683757782, "num_tokens": 7757542.0, "step": 596 }, { "epoch": 0.38208, "grad_norm": 5.459002494812012, "learning_rate": 5e-06, "loss": 1.4173, "mean_token_accuracy": 0.6381874680519104, "num_tokens": 7770427.0, "step": 597 }, { "epoch": 0.38272, "grad_norm": 4.335068702697754, "learning_rate": 5e-06, "loss": 1.2628, "mean_token_accuracy": 0.6815316006541252, "num_tokens": 7781771.0, "step": 598 }, { "epoch": 0.38336, "grad_norm": 3.5294859409332275, "learning_rate": 5e-06, "loss": 1.1975, "mean_token_accuracy": 0.6658317893743515, "num_tokens": 7796104.0, "step": 599 }, { "epoch": 0.384, "grad_norm": 5.696824550628662, "learning_rate": 5e-06, "loss": 1.3531, "mean_token_accuracy": 0.6557957530021667, "num_tokens": 7808117.0, "step": 600 }, { "epoch": 0.38464, "grad_norm": 3.5926239490509033, "learning_rate": 5e-06, "loss": 1.276, "mean_token_accuracy": 0.6641542464494705, "num_tokens": 7819726.0, "step": 601 }, { "epoch": 0.38528, "grad_norm": 3.8258309364318848, "learning_rate": 5e-06, "loss": 1.3628, "mean_token_accuracy": 0.6388561427593231, "num_tokens": 7833205.0, "step": 602 }, { "epoch": 0.38592, "grad_norm": 8.0659761428833, "learning_rate": 5e-06, "loss": 1.2999, "mean_token_accuracy": 0.6774822995066643, "num_tokens": 7847836.0, "step": 603 }, { "epoch": 0.38656, "grad_norm": 3.9128899574279785, "learning_rate": 5e-06, "loss": 1.1609, "mean_token_accuracy": 0.6698524802923203, "num_tokens": 7860682.0, "step": 604 }, { "epoch": 0.3872, "grad_norm": 4.181707382202148, "learning_rate": 5e-06, "loss": 1.5005, "mean_token_accuracy": 0.6392693892121315, "num_tokens": 7872637.0, "step": 605 }, { "epoch": 0.38784, "grad_norm": 4.092964172363281, "learning_rate": 5e-06, "loss": 1.4009, "mean_token_accuracy": 0.6333677843213081, "num_tokens": 7884650.0, "step": 606 }, { "epoch": 0.38848, "grad_norm": 4.87518835067749, "learning_rate": 5e-06, "loss": 1.3242, "mean_token_accuracy": 0.6464787498116493, "num_tokens": 7898052.0, "step": 607 }, { "epoch": 0.38912, "grad_norm": 4.693853855133057, "learning_rate": 5e-06, "loss": 1.3407, "mean_token_accuracy": 0.6689095348119736, "num_tokens": 7912216.0, "step": 608 }, { "epoch": 0.38976, "grad_norm": 4.081251621246338, "learning_rate": 5e-06, "loss": 1.3657, "mean_token_accuracy": 0.6710544601082802, "num_tokens": 7924053.0, "step": 609 }, { "epoch": 0.3904, "grad_norm": 25.89602279663086, "learning_rate": 5e-06, "loss": 1.3437, "mean_token_accuracy": 0.6381836906075478, "num_tokens": 7936238.0, "step": 610 }, { "epoch": 0.39104, "grad_norm": 5.2313055992126465, "learning_rate": 5e-06, "loss": 1.341, "mean_token_accuracy": 0.6564094573259354, "num_tokens": 7948038.0, "step": 611 }, { "epoch": 0.39168, "grad_norm": 3.936708927154541, "learning_rate": 5e-06, "loss": 1.3564, "mean_token_accuracy": 0.6562279239296913, "num_tokens": 7960193.0, "step": 612 }, { "epoch": 0.39232, "grad_norm": 5.367516994476318, "learning_rate": 5e-06, "loss": 1.2065, "mean_token_accuracy": 0.6724315732717514, "num_tokens": 7974200.0, "step": 613 }, { "epoch": 0.39296, "grad_norm": 4.628478527069092, "learning_rate": 5e-06, "loss": 1.4374, "mean_token_accuracy": 0.6563806012272835, "num_tokens": 7988501.0, "step": 614 }, { "epoch": 0.3936, "grad_norm": 5.892339706420898, "learning_rate": 5e-06, "loss": 1.2956, "mean_token_accuracy": 0.6271971762180328, "num_tokens": 8000663.0, "step": 615 }, { "epoch": 0.39424, "grad_norm": 7.812566757202148, "learning_rate": 5e-06, "loss": 1.294, "mean_token_accuracy": 0.6547481939196587, "num_tokens": 8014228.0, "step": 616 }, { "epoch": 0.39488, "grad_norm": 3.2736427783966064, "learning_rate": 5e-06, "loss": 1.0853, "mean_token_accuracy": 0.7260592132806778, "num_tokens": 8028431.0, "step": 617 }, { "epoch": 0.39552, "grad_norm": 4.344757080078125, "learning_rate": 5e-06, "loss": 1.1555, "mean_token_accuracy": 0.6958058997988701, "num_tokens": 8038986.0, "step": 618 }, { "epoch": 0.39616, "grad_norm": 4.368517875671387, "learning_rate": 5e-06, "loss": 1.2744, "mean_token_accuracy": 0.6637471318244934, "num_tokens": 8050087.0, "step": 619 }, { "epoch": 0.3968, "grad_norm": 3.57128643989563, "learning_rate": 5e-06, "loss": 1.4689, "mean_token_accuracy": 0.6049885600805283, "num_tokens": 8063113.0, "step": 620 }, { "epoch": 0.39744, "grad_norm": 3.331205368041992, "learning_rate": 5e-06, "loss": 1.4329, "mean_token_accuracy": 0.6517436727881432, "num_tokens": 8079228.0, "step": 621 }, { "epoch": 0.39808, "grad_norm": 4.018087863922119, "learning_rate": 5e-06, "loss": 1.0947, "mean_token_accuracy": 0.696721188724041, "num_tokens": 8092023.0, "step": 622 }, { "epoch": 0.39872, "grad_norm": 3.527395486831665, "learning_rate": 5e-06, "loss": 1.3711, "mean_token_accuracy": 0.6707320511341095, "num_tokens": 8107080.0, "step": 623 }, { "epoch": 0.39936, "grad_norm": 3.9434077739715576, "learning_rate": 5e-06, "loss": 1.4069, "mean_token_accuracy": 0.6350800693035126, "num_tokens": 8120374.0, "step": 624 }, { "epoch": 0.4, "grad_norm": 9.819520950317383, "learning_rate": 5e-06, "loss": 1.3298, "mean_token_accuracy": 0.6648931205272675, "num_tokens": 8131526.0, "step": 625 }, { "epoch": 0.40064, "grad_norm": 3.7477002143859863, "learning_rate": 5e-06, "loss": 1.3742, "mean_token_accuracy": 0.6332258731126785, "num_tokens": 8144842.0, "step": 626 }, { "epoch": 0.40128, "grad_norm": 3.494314432144165, "learning_rate": 5e-06, "loss": 1.4336, "mean_token_accuracy": 0.6268866658210754, "num_tokens": 8157481.0, "step": 627 }, { "epoch": 0.40192, "grad_norm": 4.175013542175293, "learning_rate": 5e-06, "loss": 1.1405, "mean_token_accuracy": 0.6741645857691765, "num_tokens": 8168266.0, "step": 628 }, { "epoch": 0.40256, "grad_norm": 3.4416167736053467, "learning_rate": 5e-06, "loss": 1.3, "mean_token_accuracy": 0.6660801768302917, "num_tokens": 8182677.0, "step": 629 }, { "epoch": 0.4032, "grad_norm": 4.334346771240234, "learning_rate": 5e-06, "loss": 1.2434, "mean_token_accuracy": 0.6613388434052467, "num_tokens": 8195270.0, "step": 630 }, { "epoch": 0.40384, "grad_norm": 3.4856371879577637, "learning_rate": 5e-06, "loss": 1.3815, "mean_token_accuracy": 0.6581474095582962, "num_tokens": 8209764.0, "step": 631 }, { "epoch": 0.40448, "grad_norm": 4.153162002563477, "learning_rate": 5e-06, "loss": 1.3904, "mean_token_accuracy": 0.6714613437652588, "num_tokens": 8220815.0, "step": 632 }, { "epoch": 0.40512, "grad_norm": 4.055039405822754, "learning_rate": 5e-06, "loss": 1.3551, "mean_token_accuracy": 0.6623844504356384, "num_tokens": 8234827.0, "step": 633 }, { "epoch": 0.40576, "grad_norm": 3.558887481689453, "learning_rate": 5e-06, "loss": 1.3641, "mean_token_accuracy": 0.6388072147965431, "num_tokens": 8246889.0, "step": 634 }, { "epoch": 0.4064, "grad_norm": 4.767853736877441, "learning_rate": 5e-06, "loss": 1.6547, "mean_token_accuracy": 0.5957511439919472, "num_tokens": 8259650.0, "step": 635 }, { "epoch": 0.40704, "grad_norm": 3.799283504486084, "learning_rate": 5e-06, "loss": 1.2804, "mean_token_accuracy": 0.676610916852951, "num_tokens": 8271638.0, "step": 636 }, { "epoch": 0.40768, "grad_norm": 3.697746753692627, "learning_rate": 5e-06, "loss": 1.3211, "mean_token_accuracy": 0.6482478119432926, "num_tokens": 8285204.0, "step": 637 }, { "epoch": 0.40832, "grad_norm": 3.6978259086608887, "learning_rate": 5e-06, "loss": 1.2889, "mean_token_accuracy": 0.6479171589016914, "num_tokens": 8297791.0, "step": 638 }, { "epoch": 0.40896, "grad_norm": 4.445859432220459, "learning_rate": 5e-06, "loss": 1.4383, "mean_token_accuracy": 0.6379449293017387, "num_tokens": 8308088.0, "step": 639 }, { "epoch": 0.4096, "grad_norm": 3.462293863296509, "learning_rate": 5e-06, "loss": 1.1934, "mean_token_accuracy": 0.6681175008416176, "num_tokens": 8322994.0, "step": 640 }, { "epoch": 0.41024, "grad_norm": 3.471963405609131, "learning_rate": 5e-06, "loss": 1.4795, "mean_token_accuracy": 0.6364353597164154, "num_tokens": 8336239.0, "step": 641 }, { "epoch": 0.41088, "grad_norm": 4.054087162017822, "learning_rate": 5e-06, "loss": 1.4736, "mean_token_accuracy": 0.6363670602440834, "num_tokens": 8347731.0, "step": 642 }, { "epoch": 0.41152, "grad_norm": 3.717003583908081, "learning_rate": 5e-06, "loss": 1.4482, "mean_token_accuracy": 0.6272126361727715, "num_tokens": 8359999.0, "step": 643 }, { "epoch": 0.41216, "grad_norm": 3.302205801010132, "learning_rate": 5e-06, "loss": 1.0763, "mean_token_accuracy": 0.6774929463863373, "num_tokens": 8373684.0, "step": 644 }, { "epoch": 0.4128, "grad_norm": 3.4035229682922363, "learning_rate": 5e-06, "loss": 1.2946, "mean_token_accuracy": 0.6636142283678055, "num_tokens": 8387313.0, "step": 645 }, { "epoch": 0.41344, "grad_norm": 3.3320178985595703, "learning_rate": 5e-06, "loss": 1.2419, "mean_token_accuracy": 0.6601713374257088, "num_tokens": 8401909.0, "step": 646 }, { "epoch": 0.41408, "grad_norm": 4.073376178741455, "learning_rate": 5e-06, "loss": 1.392, "mean_token_accuracy": 0.6541470885276794, "num_tokens": 8412682.0, "step": 647 }, { "epoch": 0.41472, "grad_norm": 3.4275381565093994, "learning_rate": 5e-06, "loss": 1.2128, "mean_token_accuracy": 0.7010362893342972, "num_tokens": 8425791.0, "step": 648 }, { "epoch": 0.41536, "grad_norm": 3.013326644897461, "learning_rate": 5e-06, "loss": 1.2823, "mean_token_accuracy": 0.6790317669510841, "num_tokens": 8441869.0, "step": 649 }, { "epoch": 0.416, "grad_norm": 3.8601441383361816, "learning_rate": 5e-06, "loss": 1.1898, "mean_token_accuracy": 0.6818736344575882, "num_tokens": 8453324.0, "step": 650 }, { "epoch": 0.41664, "grad_norm": 4.1017537117004395, "learning_rate": 5e-06, "loss": 1.2861, "mean_token_accuracy": 0.6557259410619736, "num_tokens": 8465258.0, "step": 651 }, { "epoch": 0.41728, "grad_norm": 4.002110481262207, "learning_rate": 5e-06, "loss": 1.3043, "mean_token_accuracy": 0.6684800609946251, "num_tokens": 8477574.0, "step": 652 }, { "epoch": 0.41792, "grad_norm": 3.2750160694122314, "learning_rate": 5e-06, "loss": 1.137, "mean_token_accuracy": 0.6889987885951996, "num_tokens": 8493134.0, "step": 653 }, { "epoch": 0.41856, "grad_norm": 4.388451099395752, "learning_rate": 5e-06, "loss": 1.1421, "mean_token_accuracy": 0.6810869425535202, "num_tokens": 8504478.0, "step": 654 }, { "epoch": 0.4192, "grad_norm": 4.337303161621094, "learning_rate": 5e-06, "loss": 1.2395, "mean_token_accuracy": 0.6621519103646278, "num_tokens": 8515776.0, "step": 655 }, { "epoch": 0.41984, "grad_norm": 3.3329954147338867, "learning_rate": 5e-06, "loss": 1.2419, "mean_token_accuracy": 0.6687774360179901, "num_tokens": 8530155.0, "step": 656 }, { "epoch": 0.42048, "grad_norm": 4.358274459838867, "learning_rate": 5e-06, "loss": 1.4287, "mean_token_accuracy": 0.6304730176925659, "num_tokens": 8541253.0, "step": 657 }, { "epoch": 0.42112, "grad_norm": 3.5478384494781494, "learning_rate": 5e-06, "loss": 1.0695, "mean_token_accuracy": 0.7162381857633591, "num_tokens": 8555920.0, "step": 658 }, { "epoch": 0.42176, "grad_norm": 3.6502294540405273, "learning_rate": 5e-06, "loss": 1.5178, "mean_token_accuracy": 0.6263556554913521, "num_tokens": 8568098.0, "step": 659 }, { "epoch": 0.4224, "grad_norm": 3.196720838546753, "learning_rate": 5e-06, "loss": 1.3436, "mean_token_accuracy": 0.6646198481321335, "num_tokens": 8583176.0, "step": 660 }, { "epoch": 0.42304, "grad_norm": 4.109900951385498, "learning_rate": 5e-06, "loss": 1.2928, "mean_token_accuracy": 0.6598446816205978, "num_tokens": 8593933.0, "step": 661 }, { "epoch": 0.42368, "grad_norm": 3.258894205093384, "learning_rate": 5e-06, "loss": 1.1133, "mean_token_accuracy": 0.6876930743455887, "num_tokens": 8607476.0, "step": 662 }, { "epoch": 0.42432, "grad_norm": 3.369394063949585, "learning_rate": 5e-06, "loss": 1.4229, "mean_token_accuracy": 0.6260672360658646, "num_tokens": 8620304.0, "step": 663 }, { "epoch": 0.42496, "grad_norm": 2.924621820449829, "learning_rate": 5e-06, "loss": 1.3395, "mean_token_accuracy": 0.6627652049064636, "num_tokens": 8637128.0, "step": 664 }, { "epoch": 0.4256, "grad_norm": 3.057220458984375, "learning_rate": 5e-06, "loss": 1.3794, "mean_token_accuracy": 0.6343020871281624, "num_tokens": 8653441.0, "step": 665 }, { "epoch": 0.42624, "grad_norm": 3.4192020893096924, "learning_rate": 5e-06, "loss": 1.2575, "mean_token_accuracy": 0.6774614155292511, "num_tokens": 8667588.0, "step": 666 }, { "epoch": 0.42688, "grad_norm": 3.5300302505493164, "learning_rate": 5e-06, "loss": 1.4013, "mean_token_accuracy": 0.6498018577694893, "num_tokens": 8680263.0, "step": 667 }, { "epoch": 0.42752, "grad_norm": 4.497870922088623, "learning_rate": 5e-06, "loss": 1.3489, "mean_token_accuracy": 0.6535830795764923, "num_tokens": 8691240.0, "step": 668 }, { "epoch": 0.42816, "grad_norm": 4.537415504455566, "learning_rate": 5e-06, "loss": 1.1375, "mean_token_accuracy": 0.6975407898426056, "num_tokens": 8703412.0, "step": 669 }, { "epoch": 0.4288, "grad_norm": 4.318458080291748, "learning_rate": 5e-06, "loss": 1.3989, "mean_token_accuracy": 0.6657192297279835, "num_tokens": 8715529.0, "step": 670 }, { "epoch": 0.42944, "grad_norm": 3.754676580429077, "learning_rate": 5e-06, "loss": 1.3016, "mean_token_accuracy": 0.6766445562243462, "num_tokens": 8727543.0, "step": 671 }, { "epoch": 0.43008, "grad_norm": 3.548112630844116, "learning_rate": 5e-06, "loss": 1.2104, "mean_token_accuracy": 0.672496572136879, "num_tokens": 8741690.0, "step": 672 }, { "epoch": 0.43072, "grad_norm": 3.170255422592163, "learning_rate": 5e-06, "loss": 1.2556, "mean_token_accuracy": 0.6616998463869095, "num_tokens": 8756679.0, "step": 673 }, { "epoch": 0.43136, "grad_norm": 4.215174674987793, "learning_rate": 5e-06, "loss": 1.3897, "mean_token_accuracy": 0.6362641379237175, "num_tokens": 8766546.0, "step": 674 }, { "epoch": 0.432, "grad_norm": 3.93945050239563, "learning_rate": 5e-06, "loss": 1.5469, "mean_token_accuracy": 0.6284241452813148, "num_tokens": 8777535.0, "step": 675 }, { "epoch": 0.43264, "grad_norm": 3.380723237991333, "learning_rate": 5e-06, "loss": 1.2613, "mean_token_accuracy": 0.6732500046491623, "num_tokens": 8790959.0, "step": 676 }, { "epoch": 0.43328, "grad_norm": 3.63143253326416, "learning_rate": 5e-06, "loss": 1.3131, "mean_token_accuracy": 0.6863637119531631, "num_tokens": 8804351.0, "step": 677 }, { "epoch": 0.43392, "grad_norm": 3.2990407943725586, "learning_rate": 5e-06, "loss": 1.3531, "mean_token_accuracy": 0.6465996205806732, "num_tokens": 8819041.0, "step": 678 }, { "epoch": 0.43456, "grad_norm": 3.4251043796539307, "learning_rate": 5e-06, "loss": 1.5304, "mean_token_accuracy": 0.6462676748633385, "num_tokens": 8833926.0, "step": 679 }, { "epoch": 0.4352, "grad_norm": 3.7104909420013428, "learning_rate": 5e-06, "loss": 1.0284, "mean_token_accuracy": 0.695975661277771, "num_tokens": 8845597.0, "step": 680 }, { "epoch": 0.43584, "grad_norm": 3.3179309368133545, "learning_rate": 5e-06, "loss": 1.3163, "mean_token_accuracy": 0.6567405387759209, "num_tokens": 8860280.0, "step": 681 }, { "epoch": 0.43648, "grad_norm": 3.607069730758667, "learning_rate": 5e-06, "loss": 1.3554, "mean_token_accuracy": 0.6537708342075348, "num_tokens": 8873793.0, "step": 682 }, { "epoch": 0.43712, "grad_norm": 3.275057554244995, "learning_rate": 5e-06, "loss": 1.3613, "mean_token_accuracy": 0.6499952375888824, "num_tokens": 8886476.0, "step": 683 }, { "epoch": 0.43776, "grad_norm": 3.3160624504089355, "learning_rate": 5e-06, "loss": 1.2898, "mean_token_accuracy": 0.6765732616186142, "num_tokens": 8900749.0, "step": 684 }, { "epoch": 0.4384, "grad_norm": 3.347907543182373, "learning_rate": 5e-06, "loss": 1.2472, "mean_token_accuracy": 0.6879568248987198, "num_tokens": 8913760.0, "step": 685 }, { "epoch": 0.43904, "grad_norm": 3.862211227416992, "learning_rate": 5e-06, "loss": 1.424, "mean_token_accuracy": 0.6521790996193886, "num_tokens": 8926061.0, "step": 686 }, { "epoch": 0.43968, "grad_norm": 3.4736506938934326, "learning_rate": 5e-06, "loss": 1.0292, "mean_token_accuracy": 0.7187496647238731, "num_tokens": 8939507.0, "step": 687 }, { "epoch": 0.44032, "grad_norm": 3.3794503211975098, "learning_rate": 5e-06, "loss": 1.22, "mean_token_accuracy": 0.6592478863894939, "num_tokens": 8952411.0, "step": 688 }, { "epoch": 0.44096, "grad_norm": 3.6152052879333496, "learning_rate": 5e-06, "loss": 1.1974, "mean_token_accuracy": 0.6630196422338486, "num_tokens": 8966507.0, "step": 689 }, { "epoch": 0.4416, "grad_norm": 4.036067485809326, "learning_rate": 5e-06, "loss": 1.3094, "mean_token_accuracy": 0.6584384590387344, "num_tokens": 8978156.0, "step": 690 }, { "epoch": 0.44224, "grad_norm": 3.740229845046997, "learning_rate": 5e-06, "loss": 1.3378, "mean_token_accuracy": 0.6580435633659363, "num_tokens": 8990929.0, "step": 691 }, { "epoch": 0.44288, "grad_norm": 3.417703866958618, "learning_rate": 5e-06, "loss": 1.4495, "mean_token_accuracy": 0.6308450028300285, "num_tokens": 9004923.0, "step": 692 }, { "epoch": 0.44352, "grad_norm": 4.16903829574585, "learning_rate": 5e-06, "loss": 1.4896, "mean_token_accuracy": 0.6304129362106323, "num_tokens": 9018493.0, "step": 693 }, { "epoch": 0.44416, "grad_norm": 3.383941411972046, "learning_rate": 5e-06, "loss": 1.4175, "mean_token_accuracy": 0.6552764996886253, "num_tokens": 9032465.0, "step": 694 }, { "epoch": 0.4448, "grad_norm": 3.398747205734253, "learning_rate": 5e-06, "loss": 1.2548, "mean_token_accuracy": 0.6547529026865959, "num_tokens": 9045706.0, "step": 695 }, { "epoch": 0.44544, "grad_norm": 3.575016975402832, "learning_rate": 5e-06, "loss": 1.3807, "mean_token_accuracy": 0.6451460421085358, "num_tokens": 9059033.0, "step": 696 }, { "epoch": 0.44608, "grad_norm": 3.3936767578125, "learning_rate": 5e-06, "loss": 1.2398, "mean_token_accuracy": 0.6596207022666931, "num_tokens": 9072068.0, "step": 697 }, { "epoch": 0.44672, "grad_norm": 3.675055980682373, "learning_rate": 5e-06, "loss": 1.299, "mean_token_accuracy": 0.6535976231098175, "num_tokens": 9084713.0, "step": 698 }, { "epoch": 0.44736, "grad_norm": 3.564359426498413, "learning_rate": 5e-06, "loss": 1.3035, "mean_token_accuracy": 0.70219536870718, "num_tokens": 9097477.0, "step": 699 }, { "epoch": 0.448, "grad_norm": 3.400031566619873, "learning_rate": 5e-06, "loss": 1.4423, "mean_token_accuracy": 0.6343613564968109, "num_tokens": 9112699.0, "step": 700 }, { "epoch": 0.44864, "grad_norm": 3.9619691371917725, "learning_rate": 5e-06, "loss": 1.3641, "mean_token_accuracy": 0.6419604942202568, "num_tokens": 9124943.0, "step": 701 }, { "epoch": 0.44928, "grad_norm": 3.6950299739837646, "learning_rate": 5e-06, "loss": 1.1939, "mean_token_accuracy": 0.6645899340510368, "num_tokens": 9136459.0, "step": 702 }, { "epoch": 0.44992, "grad_norm": 2.9667203426361084, "learning_rate": 5e-06, "loss": 1.0622, "mean_token_accuracy": 0.7160019502043724, "num_tokens": 9153604.0, "step": 703 }, { "epoch": 0.45056, "grad_norm": 3.338284969329834, "learning_rate": 5e-06, "loss": 1.3143, "mean_token_accuracy": 0.6503826230764389, "num_tokens": 9168475.0, "step": 704 }, { "epoch": 0.4512, "grad_norm": 3.5206825733184814, "learning_rate": 5e-06, "loss": 1.3616, "mean_token_accuracy": 0.6541919782757759, "num_tokens": 9180057.0, "step": 705 }, { "epoch": 0.45184, "grad_norm": 3.8366057872772217, "learning_rate": 5e-06, "loss": 1.0917, "mean_token_accuracy": 0.701392412185669, "num_tokens": 9191539.0, "step": 706 }, { "epoch": 0.45248, "grad_norm": 3.7983529567718506, "learning_rate": 5e-06, "loss": 1.3878, "mean_token_accuracy": 0.6564305797219276, "num_tokens": 9201977.0, "step": 707 }, { "epoch": 0.45312, "grad_norm": 4.199508190155029, "learning_rate": 5e-06, "loss": 1.1504, "mean_token_accuracy": 0.6836559697985649, "num_tokens": 9212342.0, "step": 708 }, { "epoch": 0.45376, "grad_norm": 3.685267686843872, "learning_rate": 5e-06, "loss": 1.3698, "mean_token_accuracy": 0.6535738334059715, "num_tokens": 9225755.0, "step": 709 }, { "epoch": 0.4544, "grad_norm": 3.736710786819458, "learning_rate": 5e-06, "loss": 1.2562, "mean_token_accuracy": 0.6680023595690727, "num_tokens": 9237738.0, "step": 710 }, { "epoch": 0.45504, "grad_norm": 3.5703136920928955, "learning_rate": 5e-06, "loss": 1.5439, "mean_token_accuracy": 0.6260874792933464, "num_tokens": 9250443.0, "step": 711 }, { "epoch": 0.45568, "grad_norm": 3.6314592361450195, "learning_rate": 5e-06, "loss": 1.2796, "mean_token_accuracy": 0.6624791696667671, "num_tokens": 9262486.0, "step": 712 }, { "epoch": 0.45632, "grad_norm": 3.988708019256592, "learning_rate": 5e-06, "loss": 1.3134, "mean_token_accuracy": 0.6787229478359222, "num_tokens": 9274110.0, "step": 713 }, { "epoch": 0.45696, "grad_norm": 4.141347885131836, "learning_rate": 5e-06, "loss": 1.2379, "mean_token_accuracy": 0.6678915992379189, "num_tokens": 9285461.0, "step": 714 }, { "epoch": 0.4576, "grad_norm": 4.030619144439697, "learning_rate": 5e-06, "loss": 1.4524, "mean_token_accuracy": 0.6373696550726891, "num_tokens": 9297793.0, "step": 715 }, { "epoch": 0.45824, "grad_norm": 3.8397583961486816, "learning_rate": 5e-06, "loss": 1.3026, "mean_token_accuracy": 0.6478614434599876, "num_tokens": 9309837.0, "step": 716 }, { "epoch": 0.45888, "grad_norm": 3.211944580078125, "learning_rate": 5e-06, "loss": 1.3505, "mean_token_accuracy": 0.6571612730622292, "num_tokens": 9325986.0, "step": 717 }, { "epoch": 0.45952, "grad_norm": 3.775752305984497, "learning_rate": 5e-06, "loss": 1.4041, "mean_token_accuracy": 0.6476349085569382, "num_tokens": 9339126.0, "step": 718 }, { "epoch": 0.46016, "grad_norm": 3.311610221862793, "learning_rate": 5e-06, "loss": 1.2113, "mean_token_accuracy": 0.6731600984930992, "num_tokens": 9354016.0, "step": 719 }, { "epoch": 0.4608, "grad_norm": 3.6527278423309326, "learning_rate": 5e-06, "loss": 1.4461, "mean_token_accuracy": 0.6173442825675011, "num_tokens": 9366261.0, "step": 720 }, { "epoch": 0.46144, "grad_norm": 3.3843095302581787, "learning_rate": 5e-06, "loss": 1.1579, "mean_token_accuracy": 0.6829282343387604, "num_tokens": 9381276.0, "step": 721 }, { "epoch": 0.46208, "grad_norm": 3.229539394378662, "learning_rate": 5e-06, "loss": 1.2984, "mean_token_accuracy": 0.6582097262144089, "num_tokens": 9395271.0, "step": 722 }, { "epoch": 0.46272, "grad_norm": 3.170426607131958, "learning_rate": 5e-06, "loss": 1.2019, "mean_token_accuracy": 0.6727022156119347, "num_tokens": 9409204.0, "step": 723 }, { "epoch": 0.46336, "grad_norm": 3.8384881019592285, "learning_rate": 5e-06, "loss": 1.3632, "mean_token_accuracy": 0.6532674580812454, "num_tokens": 9423838.0, "step": 724 }, { "epoch": 0.464, "grad_norm": 4.176010608673096, "learning_rate": 5e-06, "loss": 1.3754, "mean_token_accuracy": 0.6710385903716087, "num_tokens": 9434639.0, "step": 725 }, { "epoch": 0.46464, "grad_norm": 3.5365447998046875, "learning_rate": 5e-06, "loss": 1.2791, "mean_token_accuracy": 0.6657568737864494, "num_tokens": 9446770.0, "step": 726 }, { "epoch": 0.46528, "grad_norm": 3.4129528999328613, "learning_rate": 5e-06, "loss": 1.3072, "mean_token_accuracy": 0.6470441669225693, "num_tokens": 9460600.0, "step": 727 }, { "epoch": 0.46592, "grad_norm": 4.013781547546387, "learning_rate": 5e-06, "loss": 1.3892, "mean_token_accuracy": 0.656228207051754, "num_tokens": 9472044.0, "step": 728 }, { "epoch": 0.46656, "grad_norm": 3.449136734008789, "learning_rate": 5e-06, "loss": 1.372, "mean_token_accuracy": 0.6523317843675613, "num_tokens": 9484363.0, "step": 729 }, { "epoch": 0.4672, "grad_norm": 3.7383124828338623, "learning_rate": 5e-06, "loss": 1.3216, "mean_token_accuracy": 0.6544977352023125, "num_tokens": 9496620.0, "step": 730 }, { "epoch": 0.46784, "grad_norm": 3.362048864364624, "learning_rate": 5e-06, "loss": 1.3853, "mean_token_accuracy": 0.6726252436637878, "num_tokens": 9510670.0, "step": 731 }, { "epoch": 0.46848, "grad_norm": 3.314443826675415, "learning_rate": 5e-06, "loss": 1.4104, "mean_token_accuracy": 0.6626652106642723, "num_tokens": 9525327.0, "step": 732 }, { "epoch": 0.46912, "grad_norm": 3.8517005443573, "learning_rate": 5e-06, "loss": 1.3099, "mean_token_accuracy": 0.650071769952774, "num_tokens": 9537583.0, "step": 733 }, { "epoch": 0.46976, "grad_norm": 3.4071006774902344, "learning_rate": 5e-06, "loss": 1.0845, "mean_token_accuracy": 0.6916024461388588, "num_tokens": 9550411.0, "step": 734 }, { "epoch": 0.4704, "grad_norm": 4.703375816345215, "learning_rate": 5e-06, "loss": 1.2984, "mean_token_accuracy": 0.6852922365069389, "num_tokens": 9561144.0, "step": 735 }, { "epoch": 0.47104, "grad_norm": 3.5826289653778076, "learning_rate": 5e-06, "loss": 1.5058, "mean_token_accuracy": 0.6203412935137749, "num_tokens": 9575541.0, "step": 736 }, { "epoch": 0.47168, "grad_norm": 3.2071099281311035, "learning_rate": 5e-06, "loss": 1.2267, "mean_token_accuracy": 0.6681589409708977, "num_tokens": 9591163.0, "step": 737 }, { "epoch": 0.47232, "grad_norm": 3.8028645515441895, "learning_rate": 5e-06, "loss": 1.4041, "mean_token_accuracy": 0.6411704197525978, "num_tokens": 9604337.0, "step": 738 }, { "epoch": 0.47296, "grad_norm": 3.5578410625457764, "learning_rate": 5e-06, "loss": 1.4089, "mean_token_accuracy": 0.6202419102191925, "num_tokens": 9618994.0, "step": 739 }, { "epoch": 0.4736, "grad_norm": 4.015564441680908, "learning_rate": 5e-06, "loss": 1.3252, "mean_token_accuracy": 0.6425078436732292, "num_tokens": 9629590.0, "step": 740 }, { "epoch": 0.47424, "grad_norm": 3.3953940868377686, "learning_rate": 5e-06, "loss": 1.2248, "mean_token_accuracy": 0.6555972173810005, "num_tokens": 9643383.0, "step": 741 }, { "epoch": 0.47488, "grad_norm": 3.509755849838257, "learning_rate": 5e-06, "loss": 1.2274, "mean_token_accuracy": 0.6479950994253159, "num_tokens": 9657137.0, "step": 742 }, { "epoch": 0.47552, "grad_norm": 3.403864622116089, "learning_rate": 5e-06, "loss": 1.3223, "mean_token_accuracy": 0.6538697630167007, "num_tokens": 9670515.0, "step": 743 }, { "epoch": 0.47616, "grad_norm": 3.5815911293029785, "learning_rate": 5e-06, "loss": 1.3843, "mean_token_accuracy": 0.6228384971618652, "num_tokens": 9682689.0, "step": 744 }, { "epoch": 0.4768, "grad_norm": 4.452811241149902, "learning_rate": 5e-06, "loss": 1.3518, "mean_token_accuracy": 0.6771413907408714, "num_tokens": 9693372.0, "step": 745 }, { "epoch": 0.47744, "grad_norm": 4.269803524017334, "learning_rate": 5e-06, "loss": 1.212, "mean_token_accuracy": 0.672097809612751, "num_tokens": 9702991.0, "step": 746 }, { "epoch": 0.47808, "grad_norm": 3.6928703784942627, "learning_rate": 5e-06, "loss": 1.3458, "mean_token_accuracy": 0.6829958707094193, "num_tokens": 9716472.0, "step": 747 }, { "epoch": 0.47872, "grad_norm": 3.9352831840515137, "learning_rate": 5e-06, "loss": 1.4422, "mean_token_accuracy": 0.6457963958382607, "num_tokens": 9730346.0, "step": 748 }, { "epoch": 0.47936, "grad_norm": 4.322943687438965, "learning_rate": 5e-06, "loss": 1.0575, "mean_token_accuracy": 0.6857285089790821, "num_tokens": 9742613.0, "step": 749 }, { "epoch": 0.48, "grad_norm": 3.4020259380340576, "learning_rate": 5e-06, "loss": 1.3059, "mean_token_accuracy": 0.6406174898147583, "num_tokens": 9754833.0, "step": 750 }, { "epoch": 0.48064, "grad_norm": 3.288209915161133, "learning_rate": 5e-06, "loss": 1.3021, "mean_token_accuracy": 0.6589253880083561, "num_tokens": 9769316.0, "step": 751 }, { "epoch": 0.48128, "grad_norm": 3.2498161792755127, "learning_rate": 5e-06, "loss": 1.4946, "mean_token_accuracy": 0.6402696147561073, "num_tokens": 9783768.0, "step": 752 }, { "epoch": 0.48192, "grad_norm": 3.8162779808044434, "learning_rate": 5e-06, "loss": 1.3911, "mean_token_accuracy": 0.6404719427227974, "num_tokens": 9797279.0, "step": 753 }, { "epoch": 0.48256, "grad_norm": 4.253142833709717, "learning_rate": 5e-06, "loss": 1.4797, "mean_token_accuracy": 0.6313204690814018, "num_tokens": 9808629.0, "step": 754 }, { "epoch": 0.4832, "grad_norm": 3.716420888900757, "learning_rate": 5e-06, "loss": 1.3401, "mean_token_accuracy": 0.6325643435120583, "num_tokens": 9821968.0, "step": 755 }, { "epoch": 0.48384, "grad_norm": 3.5335354804992676, "learning_rate": 5e-06, "loss": 1.3272, "mean_token_accuracy": 0.6529423892498016, "num_tokens": 9835554.0, "step": 756 }, { "epoch": 0.48448, "grad_norm": 3.359344005584717, "learning_rate": 5e-06, "loss": 1.3782, "mean_token_accuracy": 0.6145281083881855, "num_tokens": 9849101.0, "step": 757 }, { "epoch": 0.48512, "grad_norm": 3.4545371532440186, "learning_rate": 5e-06, "loss": 1.5942, "mean_token_accuracy": 0.6196755021810532, "num_tokens": 9862033.0, "step": 758 }, { "epoch": 0.48576, "grad_norm": 3.6362133026123047, "learning_rate": 5e-06, "loss": 1.3524, "mean_token_accuracy": 0.6347524076700211, "num_tokens": 9873692.0, "step": 759 }, { "epoch": 0.4864, "grad_norm": 3.699906826019287, "learning_rate": 5e-06, "loss": 1.2751, "mean_token_accuracy": 0.656343087553978, "num_tokens": 9885375.0, "step": 760 }, { "epoch": 0.48704, "grad_norm": 3.8104074001312256, "learning_rate": 5e-06, "loss": 1.4196, "mean_token_accuracy": 0.630670964717865, "num_tokens": 9897179.0, "step": 761 }, { "epoch": 0.48768, "grad_norm": 3.5518436431884766, "learning_rate": 5e-06, "loss": 1.1912, "mean_token_accuracy": 0.6667153909802437, "num_tokens": 9909370.0, "step": 762 }, { "epoch": 0.48832, "grad_norm": 3.4174013137817383, "learning_rate": 5e-06, "loss": 1.4634, "mean_token_accuracy": 0.6383125334978104, "num_tokens": 9924593.0, "step": 763 }, { "epoch": 0.48896, "grad_norm": 3.690223217010498, "learning_rate": 5e-06, "loss": 1.2632, "mean_token_accuracy": 0.6747411042451859, "num_tokens": 9938123.0, "step": 764 }, { "epoch": 0.4896, "grad_norm": 3.189453125, "learning_rate": 5e-06, "loss": 1.2344, "mean_token_accuracy": 0.6770232170820236, "num_tokens": 9952420.0, "step": 765 }, { "epoch": 0.49024, "grad_norm": 4.607802867889404, "learning_rate": 5e-06, "loss": 1.2471, "mean_token_accuracy": 0.6755125150084496, "num_tokens": 9962228.0, "step": 766 }, { "epoch": 0.49088, "grad_norm": 3.5634379386901855, "learning_rate": 5e-06, "loss": 1.1842, "mean_token_accuracy": 0.6926667168736458, "num_tokens": 9974613.0, "step": 767 }, { "epoch": 0.49152, "grad_norm": 3.5588109493255615, "learning_rate": 5e-06, "loss": 1.3507, "mean_token_accuracy": 0.6611402109265327, "num_tokens": 9988413.0, "step": 768 }, { "epoch": 0.49216, "grad_norm": 3.356700897216797, "learning_rate": 5e-06, "loss": 1.6306, "mean_token_accuracy": 0.6335580386221409, "num_tokens": 10003108.0, "step": 769 }, { "epoch": 0.4928, "grad_norm": 4.425334453582764, "learning_rate": 5e-06, "loss": 1.379, "mean_token_accuracy": 0.6304365694522858, "num_tokens": 10015961.0, "step": 770 }, { "epoch": 0.49344, "grad_norm": 3.2346768379211426, "learning_rate": 5e-06, "loss": 1.194, "mean_token_accuracy": 0.6939344108104706, "num_tokens": 10028885.0, "step": 771 }, { "epoch": 0.49408, "grad_norm": 2.969572067260742, "learning_rate": 5e-06, "loss": 1.1707, "mean_token_accuracy": 0.6672687157988548, "num_tokens": 10044177.0, "step": 772 }, { "epoch": 0.49472, "grad_norm": 3.9597513675689697, "learning_rate": 5e-06, "loss": 1.2808, "mean_token_accuracy": 0.6557754501700401, "num_tokens": 10055808.0, "step": 773 }, { "epoch": 0.49536, "grad_norm": 3.450819730758667, "learning_rate": 5e-06, "loss": 1.0655, "mean_token_accuracy": 0.6846578419208527, "num_tokens": 10071283.0, "step": 774 }, { "epoch": 0.496, "grad_norm": 3.999828338623047, "learning_rate": 5e-06, "loss": 1.4067, "mean_token_accuracy": 0.6274128258228302, "num_tokens": 10083352.0, "step": 775 }, { "epoch": 0.49664, "grad_norm": 4.048245429992676, "learning_rate": 5e-06, "loss": 1.2898, "mean_token_accuracy": 0.6655979752540588, "num_tokens": 10095482.0, "step": 776 }, { "epoch": 0.49728, "grad_norm": 3.832430124282837, "learning_rate": 5e-06, "loss": 1.3077, "mean_token_accuracy": 0.6555028632283211, "num_tokens": 10108448.0, "step": 777 }, { "epoch": 0.49792, "grad_norm": 3.215700626373291, "learning_rate": 5e-06, "loss": 1.3706, "mean_token_accuracy": 0.653311513364315, "num_tokens": 10122582.0, "step": 778 }, { "epoch": 0.49856, "grad_norm": 3.9083938598632812, "learning_rate": 5e-06, "loss": 1.4162, "mean_token_accuracy": 0.666377916932106, "num_tokens": 10133857.0, "step": 779 }, { "epoch": 0.4992, "grad_norm": 3.3011085987091064, "learning_rate": 5e-06, "loss": 1.4602, "mean_token_accuracy": 0.6177773475646973, "num_tokens": 10149390.0, "step": 780 }, { "epoch": 0.49984, "grad_norm": 4.202136516571045, "learning_rate": 5e-06, "loss": 1.348, "mean_token_accuracy": 0.6335294619202614, "num_tokens": 10160191.0, "step": 781 }, { "epoch": 0.50048, "grad_norm": 3.8100340366363525, "learning_rate": 5e-06, "loss": 1.286, "mean_token_accuracy": 0.664748452603817, "num_tokens": 10174169.0, "step": 782 }, { "epoch": 0.50112, "grad_norm": 3.2231757640838623, "learning_rate": 5e-06, "loss": 1.5273, "mean_token_accuracy": 0.6323799937963486, "num_tokens": 10189013.0, "step": 783 }, { "epoch": 0.50176, "grad_norm": 3.380337953567505, "learning_rate": 5e-06, "loss": 1.2976, "mean_token_accuracy": 0.6576580554246902, "num_tokens": 10205197.0, "step": 784 }, { "epoch": 0.5024, "grad_norm": 3.5312960147857666, "learning_rate": 5e-06, "loss": 1.4795, "mean_token_accuracy": 0.6108041927218437, "num_tokens": 10218782.0, "step": 785 }, { "epoch": 0.50304, "grad_norm": 3.7805802822113037, "learning_rate": 5e-06, "loss": 1.1567, "mean_token_accuracy": 0.7012158781290054, "num_tokens": 10229754.0, "step": 786 }, { "epoch": 0.50368, "grad_norm": 3.575208902359009, "learning_rate": 5e-06, "loss": 1.2914, "mean_token_accuracy": 0.6698039025068283, "num_tokens": 10243519.0, "step": 787 }, { "epoch": 0.50432, "grad_norm": 4.018414497375488, "learning_rate": 5e-06, "loss": 1.2843, "mean_token_accuracy": 0.6500632241368294, "num_tokens": 10255436.0, "step": 788 }, { "epoch": 0.50496, "grad_norm": 3.3472957611083984, "learning_rate": 5e-06, "loss": 1.409, "mean_token_accuracy": 0.6223405599594116, "num_tokens": 10270218.0, "step": 789 }, { "epoch": 0.5056, "grad_norm": 3.555922031402588, "learning_rate": 5e-06, "loss": 1.4861, "mean_token_accuracy": 0.6319947242736816, "num_tokens": 10283818.0, "step": 790 }, { "epoch": 0.50624, "grad_norm": 3.2534327507019043, "learning_rate": 5e-06, "loss": 1.505, "mean_token_accuracy": 0.6172455549240112, "num_tokens": 10299119.0, "step": 791 }, { "epoch": 0.50688, "grad_norm": 3.78558087348938, "learning_rate": 5e-06, "loss": 1.3765, "mean_token_accuracy": 0.6513939723372459, "num_tokens": 10311103.0, "step": 792 }, { "epoch": 0.50752, "grad_norm": 3.426884412765503, "learning_rate": 5e-06, "loss": 1.3942, "mean_token_accuracy": 0.6268276050686836, "num_tokens": 10324256.0, "step": 793 }, { "epoch": 0.50816, "grad_norm": 3.586442470550537, "learning_rate": 5e-06, "loss": 1.291, "mean_token_accuracy": 0.6560362279415131, "num_tokens": 10336275.0, "step": 794 }, { "epoch": 0.5088, "grad_norm": 3.6246700286865234, "learning_rate": 5e-06, "loss": 1.4371, "mean_token_accuracy": 0.64004335552454, "num_tokens": 10348939.0, "step": 795 }, { "epoch": 0.50944, "grad_norm": 3.8206660747528076, "learning_rate": 5e-06, "loss": 1.3641, "mean_token_accuracy": 0.6557733938097954, "num_tokens": 10360108.0, "step": 796 }, { "epoch": 0.51008, "grad_norm": 4.05738639831543, "learning_rate": 5e-06, "loss": 1.3265, "mean_token_accuracy": 0.6359497681260109, "num_tokens": 10371144.0, "step": 797 }, { "epoch": 0.51072, "grad_norm": 3.9649102687835693, "learning_rate": 5e-06, "loss": 1.4398, "mean_token_accuracy": 0.6443544700741768, "num_tokens": 10381537.0, "step": 798 }, { "epoch": 0.51136, "grad_norm": 3.3141987323760986, "learning_rate": 5e-06, "loss": 1.4374, "mean_token_accuracy": 0.6308365762233734, "num_tokens": 10396539.0, "step": 799 }, { "epoch": 0.512, "grad_norm": 3.1399598121643066, "learning_rate": 5e-06, "loss": 1.3463, "mean_token_accuracy": 0.6553637236356735, "num_tokens": 10412962.0, "step": 800 }, { "epoch": 0.51264, "grad_norm": 3.690521478652954, "learning_rate": 5e-06, "loss": 1.4432, "mean_token_accuracy": 0.6525074169039726, "num_tokens": 10426314.0, "step": 801 }, { "epoch": 0.51328, "grad_norm": 3.5348432064056396, "learning_rate": 5e-06, "loss": 1.5491, "mean_token_accuracy": 0.5989897102117538, "num_tokens": 10439348.0, "step": 802 }, { "epoch": 0.51392, "grad_norm": 3.760218620300293, "learning_rate": 5e-06, "loss": 1.0484, "mean_token_accuracy": 0.6977821663022041, "num_tokens": 10449973.0, "step": 803 }, { "epoch": 0.51456, "grad_norm": 3.7311551570892334, "learning_rate": 5e-06, "loss": 1.4811, "mean_token_accuracy": 0.630589596927166, "num_tokens": 10462019.0, "step": 804 }, { "epoch": 0.5152, "grad_norm": 3.4669084548950195, "learning_rate": 5e-06, "loss": 1.3778, "mean_token_accuracy": 0.6389184445142746, "num_tokens": 10474583.0, "step": 805 }, { "epoch": 0.51584, "grad_norm": 3.2782742977142334, "learning_rate": 5e-06, "loss": 1.2256, "mean_token_accuracy": 0.6817874610424042, "num_tokens": 10488568.0, "step": 806 }, { "epoch": 0.51648, "grad_norm": 4.345005035400391, "learning_rate": 5e-06, "loss": 1.3498, "mean_token_accuracy": 0.6508120521903038, "num_tokens": 10500968.0, "step": 807 }, { "epoch": 0.51712, "grad_norm": 3.8742589950561523, "learning_rate": 5e-06, "loss": 1.4047, "mean_token_accuracy": 0.634697936475277, "num_tokens": 10513586.0, "step": 808 }, { "epoch": 0.51776, "grad_norm": 3.4968934059143066, "learning_rate": 5e-06, "loss": 1.4397, "mean_token_accuracy": 0.6320827975869179, "num_tokens": 10528263.0, "step": 809 }, { "epoch": 0.5184, "grad_norm": 3.89860463142395, "learning_rate": 5e-06, "loss": 1.299, "mean_token_accuracy": 0.6727789714932442, "num_tokens": 10539352.0, "step": 810 }, { "epoch": 0.51904, "grad_norm": 3.1833720207214355, "learning_rate": 5e-06, "loss": 1.2753, "mean_token_accuracy": 0.6482102572917938, "num_tokens": 10553515.0, "step": 811 }, { "epoch": 0.51968, "grad_norm": 3.3082292079925537, "learning_rate": 5e-06, "loss": 1.2998, "mean_token_accuracy": 0.6546992510557175, "num_tokens": 10566663.0, "step": 812 }, { "epoch": 0.52032, "grad_norm": 3.6185340881347656, "learning_rate": 5e-06, "loss": 1.3815, "mean_token_accuracy": 0.6625709310173988, "num_tokens": 10579584.0, "step": 813 }, { "epoch": 0.52096, "grad_norm": 3.55534291267395, "learning_rate": 5e-06, "loss": 1.1128, "mean_token_accuracy": 0.6956586241722107, "num_tokens": 10591791.0, "step": 814 }, { "epoch": 0.5216, "grad_norm": 3.650907516479492, "learning_rate": 5e-06, "loss": 1.4477, "mean_token_accuracy": 0.624105378985405, "num_tokens": 10604975.0, "step": 815 }, { "epoch": 0.52224, "grad_norm": 3.9432995319366455, "learning_rate": 5e-06, "loss": 1.3038, "mean_token_accuracy": 0.6531900316476822, "num_tokens": 10616449.0, "step": 816 }, { "epoch": 0.52288, "grad_norm": 3.8777639865875244, "learning_rate": 5e-06, "loss": 1.47, "mean_token_accuracy": 0.6426081731915474, "num_tokens": 10626994.0, "step": 817 }, { "epoch": 0.52352, "grad_norm": 3.064539909362793, "learning_rate": 5e-06, "loss": 1.2586, "mean_token_accuracy": 0.6581793427467346, "num_tokens": 10642855.0, "step": 818 }, { "epoch": 0.52416, "grad_norm": 3.3149263858795166, "learning_rate": 5e-06, "loss": 1.4482, "mean_token_accuracy": 0.6364821642637253, "num_tokens": 10656056.0, "step": 819 }, { "epoch": 0.5248, "grad_norm": 4.199079990386963, "learning_rate": 5e-06, "loss": 1.4089, "mean_token_accuracy": 0.6289070248603821, "num_tokens": 10668057.0, "step": 820 }, { "epoch": 0.52544, "grad_norm": 3.142550468444824, "learning_rate": 5e-06, "loss": 1.526, "mean_token_accuracy": 0.6158655360341072, "num_tokens": 10682235.0, "step": 821 }, { "epoch": 0.52608, "grad_norm": 3.3503355979919434, "learning_rate": 5e-06, "loss": 1.2846, "mean_token_accuracy": 0.6406090259552002, "num_tokens": 10697588.0, "step": 822 }, { "epoch": 0.52672, "grad_norm": 3.7879579067230225, "learning_rate": 5e-06, "loss": 1.4343, "mean_token_accuracy": 0.6635532379150391, "num_tokens": 10708737.0, "step": 823 }, { "epoch": 0.52736, "grad_norm": 3.9766318798065186, "learning_rate": 5e-06, "loss": 1.352, "mean_token_accuracy": 0.6262383349239826, "num_tokens": 10721121.0, "step": 824 }, { "epoch": 0.528, "grad_norm": 3.3426828384399414, "learning_rate": 5e-06, "loss": 1.2434, "mean_token_accuracy": 0.6687643975019455, "num_tokens": 10734364.0, "step": 825 }, { "epoch": 0.52864, "grad_norm": 3.411301612854004, "learning_rate": 5e-06, "loss": 1.0815, "mean_token_accuracy": 0.695383183658123, "num_tokens": 10747101.0, "step": 826 }, { "epoch": 0.52928, "grad_norm": 4.2775044441223145, "learning_rate": 5e-06, "loss": 1.1268, "mean_token_accuracy": 0.7007181644439697, "num_tokens": 10757435.0, "step": 827 }, { "epoch": 0.52992, "grad_norm": 3.670020341873169, "learning_rate": 5e-06, "loss": 1.2358, "mean_token_accuracy": 0.6553446725010872, "num_tokens": 10768682.0, "step": 828 }, { "epoch": 0.53056, "grad_norm": 3.6720056533813477, "learning_rate": 5e-06, "loss": 1.4772, "mean_token_accuracy": 0.6402060613036156, "num_tokens": 10781720.0, "step": 829 }, { "epoch": 0.5312, "grad_norm": 4.194923400878906, "learning_rate": 5e-06, "loss": 1.1888, "mean_token_accuracy": 0.6686526387929916, "num_tokens": 10794142.0, "step": 830 }, { "epoch": 0.53184, "grad_norm": 3.1744613647460938, "learning_rate": 5e-06, "loss": 1.4092, "mean_token_accuracy": 0.6328324228525162, "num_tokens": 10809011.0, "step": 831 }, { "epoch": 0.53248, "grad_norm": 3.844196319580078, "learning_rate": 5e-06, "loss": 1.0955, "mean_token_accuracy": 0.6974828243255615, "num_tokens": 10819963.0, "step": 832 }, { "epoch": 0.53312, "grad_norm": 3.668311834335327, "learning_rate": 5e-06, "loss": 1.2182, "mean_token_accuracy": 0.6790367737412453, "num_tokens": 10834237.0, "step": 833 }, { "epoch": 0.53376, "grad_norm": 3.610236406326294, "learning_rate": 5e-06, "loss": 1.078, "mean_token_accuracy": 0.7012447938323021, "num_tokens": 10847095.0, "step": 834 }, { "epoch": 0.5344, "grad_norm": 3.7682337760925293, "learning_rate": 5e-06, "loss": 1.1083, "mean_token_accuracy": 0.6979804188013077, "num_tokens": 10859861.0, "step": 835 }, { "epoch": 0.53504, "grad_norm": 3.720351457595825, "learning_rate": 5e-06, "loss": 1.3023, "mean_token_accuracy": 0.6436434611678123, "num_tokens": 10871706.0, "step": 836 }, { "epoch": 0.53568, "grad_norm": 3.608431816101074, "learning_rate": 5e-06, "loss": 1.5091, "mean_token_accuracy": 0.613635927438736, "num_tokens": 10884615.0, "step": 837 }, { "epoch": 0.53632, "grad_norm": 3.321657419204712, "learning_rate": 5e-06, "loss": 1.1323, "mean_token_accuracy": 0.6927084550261497, "num_tokens": 10898152.0, "step": 838 }, { "epoch": 0.53696, "grad_norm": 2.9468841552734375, "learning_rate": 5e-06, "loss": 1.3507, "mean_token_accuracy": 0.6446966454386711, "num_tokens": 10915868.0, "step": 839 }, { "epoch": 0.5376, "grad_norm": 3.565668821334839, "learning_rate": 5e-06, "loss": 1.4081, "mean_token_accuracy": 0.6363908722996712, "num_tokens": 10930562.0, "step": 840 }, { "epoch": 0.53824, "grad_norm": 3.9890897274017334, "learning_rate": 5e-06, "loss": 1.4644, "mean_token_accuracy": 0.6419349610805511, "num_tokens": 10942958.0, "step": 841 }, { "epoch": 0.53888, "grad_norm": 3.5691657066345215, "learning_rate": 5e-06, "loss": 1.461, "mean_token_accuracy": 0.6422456279397011, "num_tokens": 10955566.0, "step": 842 }, { "epoch": 0.53952, "grad_norm": 3.0054261684417725, "learning_rate": 5e-06, "loss": 1.32, "mean_token_accuracy": 0.6669855192303658, "num_tokens": 10971064.0, "step": 843 }, { "epoch": 0.54016, "grad_norm": 2.9434778690338135, "learning_rate": 5e-06, "loss": 1.343, "mean_token_accuracy": 0.6634941324591637, "num_tokens": 10987737.0, "step": 844 }, { "epoch": 0.5408, "grad_norm": 4.207048416137695, "learning_rate": 5e-06, "loss": 1.338, "mean_token_accuracy": 0.6447890773415565, "num_tokens": 10998859.0, "step": 845 }, { "epoch": 0.54144, "grad_norm": 3.3798792362213135, "learning_rate": 5e-06, "loss": 1.5105, "mean_token_accuracy": 0.6433539763092995, "num_tokens": 11013214.0, "step": 846 }, { "epoch": 0.54208, "grad_norm": 3.163572311401367, "learning_rate": 5e-06, "loss": 1.3399, "mean_token_accuracy": 0.6501183435320854, "num_tokens": 11028566.0, "step": 847 }, { "epoch": 0.54272, "grad_norm": 3.5735156536102295, "learning_rate": 5e-06, "loss": 1.6305, "mean_token_accuracy": 0.6012577600777149, "num_tokens": 11043246.0, "step": 848 }, { "epoch": 0.54336, "grad_norm": 4.034946441650391, "learning_rate": 5e-06, "loss": 1.1873, "mean_token_accuracy": 0.6620588451623917, "num_tokens": 11055228.0, "step": 849 }, { "epoch": 0.544, "grad_norm": 3.2156589031219482, "learning_rate": 5e-06, "loss": 1.4072, "mean_token_accuracy": 0.6418510600924492, "num_tokens": 11068267.0, "step": 850 }, { "epoch": 0.54464, "grad_norm": 4.0673723220825195, "learning_rate": 5e-06, "loss": 1.2545, "mean_token_accuracy": 0.6594027280807495, "num_tokens": 11080978.0, "step": 851 }, { "epoch": 0.54528, "grad_norm": 3.5857112407684326, "learning_rate": 5e-06, "loss": 1.4054, "mean_token_accuracy": 0.6293277516961098, "num_tokens": 11096210.0, "step": 852 }, { "epoch": 0.54592, "grad_norm": 3.829974889755249, "learning_rate": 5e-06, "loss": 1.3174, "mean_token_accuracy": 0.6636649072170258, "num_tokens": 11108254.0, "step": 853 }, { "epoch": 0.54656, "grad_norm": 3.5567145347595215, "learning_rate": 5e-06, "loss": 1.2567, "mean_token_accuracy": 0.677757516503334, "num_tokens": 11120465.0, "step": 854 }, { "epoch": 0.5472, "grad_norm": 4.473601341247559, "learning_rate": 5e-06, "loss": 1.2163, "mean_token_accuracy": 0.6928970888257027, "num_tokens": 11131956.0, "step": 855 }, { "epoch": 0.54784, "grad_norm": 3.658292293548584, "learning_rate": 5e-06, "loss": 1.2401, "mean_token_accuracy": 0.6552563831210136, "num_tokens": 11144655.0, "step": 856 }, { "epoch": 0.54848, "grad_norm": 3.061565399169922, "learning_rate": 5e-06, "loss": 1.1552, "mean_token_accuracy": 0.6949977725744247, "num_tokens": 11159566.0, "step": 857 }, { "epoch": 0.54912, "grad_norm": 3.8165862560272217, "learning_rate": 5e-06, "loss": 1.2363, "mean_token_accuracy": 0.6678136140108109, "num_tokens": 11172278.0, "step": 858 }, { "epoch": 0.54976, "grad_norm": 3.937960147857666, "learning_rate": 5e-06, "loss": 1.1055, "mean_token_accuracy": 0.7062254995107651, "num_tokens": 11183273.0, "step": 859 }, { "epoch": 0.5504, "grad_norm": 3.9735426902770996, "learning_rate": 5e-06, "loss": 1.2674, "mean_token_accuracy": 0.6709722802042961, "num_tokens": 11194956.0, "step": 860 }, { "epoch": 0.55104, "grad_norm": 3.741502523422241, "learning_rate": 5e-06, "loss": 1.4785, "mean_token_accuracy": 0.6148476675152779, "num_tokens": 11209741.0, "step": 861 }, { "epoch": 0.55168, "grad_norm": 3.544828176498413, "learning_rate": 5e-06, "loss": 1.3682, "mean_token_accuracy": 0.6354441791772842, "num_tokens": 11222629.0, "step": 862 }, { "epoch": 0.55232, "grad_norm": 3.3560214042663574, "learning_rate": 5e-06, "loss": 1.1048, "mean_token_accuracy": 0.6948365420103073, "num_tokens": 11237834.0, "step": 863 }, { "epoch": 0.55296, "grad_norm": 3.512924909591675, "learning_rate": 5e-06, "loss": 1.181, "mean_token_accuracy": 0.6836326494812965, "num_tokens": 11250638.0, "step": 864 }, { "epoch": 0.5536, "grad_norm": 4.28767728805542, "learning_rate": 5e-06, "loss": 1.475, "mean_token_accuracy": 0.6338236667215824, "num_tokens": 11261887.0, "step": 865 }, { "epoch": 0.55424, "grad_norm": 3.2134881019592285, "learning_rate": 5e-06, "loss": 1.3229, "mean_token_accuracy": 0.6470964848995209, "num_tokens": 11275931.0, "step": 866 }, { "epoch": 0.55488, "grad_norm": 3.689152240753174, "learning_rate": 5e-06, "loss": 1.3717, "mean_token_accuracy": 0.6691553071141243, "num_tokens": 11287528.0, "step": 867 }, { "epoch": 0.55552, "grad_norm": 3.289281129837036, "learning_rate": 5e-06, "loss": 1.2839, "mean_token_accuracy": 0.6697202101349831, "num_tokens": 11300231.0, "step": 868 }, { "epoch": 0.55616, "grad_norm": 3.278754234313965, "learning_rate": 5e-06, "loss": 1.1297, "mean_token_accuracy": 0.6946472376585007, "num_tokens": 11315024.0, "step": 869 }, { "epoch": 0.5568, "grad_norm": 3.2673239707946777, "learning_rate": 5e-06, "loss": 1.4874, "mean_token_accuracy": 0.6627454794943333, "num_tokens": 11329475.0, "step": 870 }, { "epoch": 0.55744, "grad_norm": 3.1076149940490723, "learning_rate": 5e-06, "loss": 1.3683, "mean_token_accuracy": 0.6290438398718834, "num_tokens": 11343973.0, "step": 871 }, { "epoch": 0.55808, "grad_norm": 3.526763439178467, "learning_rate": 5e-06, "loss": 1.3713, "mean_token_accuracy": 0.6168685257434845, "num_tokens": 11356517.0, "step": 872 }, { "epoch": 0.55872, "grad_norm": 3.46929931640625, "learning_rate": 5e-06, "loss": 1.3024, "mean_token_accuracy": 0.6537005454301834, "num_tokens": 11369229.0, "step": 873 }, { "epoch": 0.55936, "grad_norm": 3.599717617034912, "learning_rate": 5e-06, "loss": 1.3816, "mean_token_accuracy": 0.641513504087925, "num_tokens": 11382702.0, "step": 874 }, { "epoch": 0.56, "grad_norm": 3.80094313621521, "learning_rate": 5e-06, "loss": 1.5008, "mean_token_accuracy": 0.6274667903780937, "num_tokens": 11396562.0, "step": 875 }, { "epoch": 0.56064, "grad_norm": 4.2999067306518555, "learning_rate": 5e-06, "loss": 1.2018, "mean_token_accuracy": 0.6762094050645828, "num_tokens": 11406774.0, "step": 876 }, { "epoch": 0.56128, "grad_norm": 3.715298652648926, "learning_rate": 5e-06, "loss": 1.2514, "mean_token_accuracy": 0.6620960757136345, "num_tokens": 11418251.0, "step": 877 }, { "epoch": 0.56192, "grad_norm": 3.0805916786193848, "learning_rate": 5e-06, "loss": 1.1502, "mean_token_accuracy": 0.684480644762516, "num_tokens": 11433197.0, "step": 878 }, { "epoch": 0.56256, "grad_norm": 3.6326444149017334, "learning_rate": 5e-06, "loss": 1.2656, "mean_token_accuracy": 0.660808764398098, "num_tokens": 11446639.0, "step": 879 }, { "epoch": 0.5632, "grad_norm": 12.266148567199707, "learning_rate": 5e-06, "loss": 1.286, "mean_token_accuracy": 0.6693281307816505, "num_tokens": 11458609.0, "step": 880 }, { "epoch": 0.56384, "grad_norm": 3.6536591053009033, "learning_rate": 5e-06, "loss": 1.2049, "mean_token_accuracy": 0.6694196611642838, "num_tokens": 11470645.0, "step": 881 }, { "epoch": 0.56448, "grad_norm": 3.287473201751709, "learning_rate": 5e-06, "loss": 1.3294, "mean_token_accuracy": 0.6692755967378616, "num_tokens": 11484303.0, "step": 882 }, { "epoch": 0.56512, "grad_norm": 3.7565791606903076, "learning_rate": 5e-06, "loss": 1.251, "mean_token_accuracy": 0.6664244830608368, "num_tokens": 11496299.0, "step": 883 }, { "epoch": 0.56576, "grad_norm": 3.544475793838501, "learning_rate": 5e-06, "loss": 1.4526, "mean_token_accuracy": 0.6100342273712158, "num_tokens": 11510676.0, "step": 884 }, { "epoch": 0.5664, "grad_norm": 3.682511568069458, "learning_rate": 5e-06, "loss": 1.4142, "mean_token_accuracy": 0.6500721573829651, "num_tokens": 11523371.0, "step": 885 }, { "epoch": 0.56704, "grad_norm": 3.6271486282348633, "learning_rate": 5e-06, "loss": 1.1237, "mean_token_accuracy": 0.6834971457719803, "num_tokens": 11536061.0, "step": 886 }, { "epoch": 0.56768, "grad_norm": 3.1198318004608154, "learning_rate": 5e-06, "loss": 1.2309, "mean_token_accuracy": 0.658136211335659, "num_tokens": 11550795.0, "step": 887 }, { "epoch": 0.56832, "grad_norm": 3.9022724628448486, "learning_rate": 5e-06, "loss": 1.3044, "mean_token_accuracy": 0.6979828551411629, "num_tokens": 11564562.0, "step": 888 }, { "epoch": 0.56896, "grad_norm": 3.295694351196289, "learning_rate": 5e-06, "loss": 1.4184, "mean_token_accuracy": 0.60136728733778, "num_tokens": 11578577.0, "step": 889 }, { "epoch": 0.5696, "grad_norm": 3.0561180114746094, "learning_rate": 5e-06, "loss": 1.3529, "mean_token_accuracy": 0.6563450619578362, "num_tokens": 11594404.0, "step": 890 }, { "epoch": 0.57024, "grad_norm": 3.44431471824646, "learning_rate": 5e-06, "loss": 1.1605, "mean_token_accuracy": 0.6642890870571136, "num_tokens": 11605723.0, "step": 891 }, { "epoch": 0.57088, "grad_norm": 4.037685871124268, "learning_rate": 5e-06, "loss": 1.2558, "mean_token_accuracy": 0.6648613065481186, "num_tokens": 11619031.0, "step": 892 }, { "epoch": 0.57152, "grad_norm": 3.2583799362182617, "learning_rate": 5e-06, "loss": 1.3105, "mean_token_accuracy": 0.6500160917639732, "num_tokens": 11634316.0, "step": 893 }, { "epoch": 0.57216, "grad_norm": 3.2072439193725586, "learning_rate": 5e-06, "loss": 1.4559, "mean_token_accuracy": 0.6469361782073975, "num_tokens": 11650239.0, "step": 894 }, { "epoch": 0.5728, "grad_norm": 3.4376208782196045, "learning_rate": 5e-06, "loss": 1.3858, "mean_token_accuracy": 0.6572685986757278, "num_tokens": 11662751.0, "step": 895 }, { "epoch": 0.57344, "grad_norm": 3.647529363632202, "learning_rate": 5e-06, "loss": 1.3375, "mean_token_accuracy": 0.6592177748680115, "num_tokens": 11675377.0, "step": 896 }, { "epoch": 0.57408, "grad_norm": 3.332850217819214, "learning_rate": 5e-06, "loss": 1.3343, "mean_token_accuracy": 0.6491860672831535, "num_tokens": 11688675.0, "step": 897 }, { "epoch": 0.57472, "grad_norm": 4.066124439239502, "learning_rate": 5e-06, "loss": 1.3361, "mean_token_accuracy": 0.6541391238570213, "num_tokens": 11700393.0, "step": 898 }, { "epoch": 0.57536, "grad_norm": 3.341097593307495, "learning_rate": 5e-06, "loss": 1.2515, "mean_token_accuracy": 0.6574959680438042, "num_tokens": 11714611.0, "step": 899 }, { "epoch": 0.576, "grad_norm": 3.0946879386901855, "learning_rate": 5e-06, "loss": 1.242, "mean_token_accuracy": 0.6756256222724915, "num_tokens": 11731479.0, "step": 900 }, { "epoch": 0.57664, "grad_norm": 3.3247451782226562, "learning_rate": 5e-06, "loss": 1.4695, "mean_token_accuracy": 0.6269465908408165, "num_tokens": 11748872.0, "step": 901 }, { "epoch": 0.57728, "grad_norm": 3.942417860031128, "learning_rate": 5e-06, "loss": 1.3982, "mean_token_accuracy": 0.6202432103455067, "num_tokens": 11760014.0, "step": 902 }, { "epoch": 0.57792, "grad_norm": 3.633100986480713, "learning_rate": 5e-06, "loss": 1.4247, "mean_token_accuracy": 0.6394501402974129, "num_tokens": 11773867.0, "step": 903 }, { "epoch": 0.57856, "grad_norm": 3.383073568344116, "learning_rate": 5e-06, "loss": 1.1386, "mean_token_accuracy": 0.697243720293045, "num_tokens": 11787283.0, "step": 904 }, { "epoch": 0.5792, "grad_norm": 3.678783416748047, "learning_rate": 5e-06, "loss": 1.3926, "mean_token_accuracy": 0.6641874313354492, "num_tokens": 11799263.0, "step": 905 }, { "epoch": 0.57984, "grad_norm": 3.2661468982696533, "learning_rate": 5e-06, "loss": 1.5136, "mean_token_accuracy": 0.6076219081878662, "num_tokens": 11815606.0, "step": 906 }, { "epoch": 0.58048, "grad_norm": 3.52829909324646, "learning_rate": 5e-06, "loss": 1.2213, "mean_token_accuracy": 0.6809684634208679, "num_tokens": 11829589.0, "step": 907 }, { "epoch": 0.58112, "grad_norm": 3.6113576889038086, "learning_rate": 5e-06, "loss": 1.3111, "mean_token_accuracy": 0.6847885251045227, "num_tokens": 11842023.0, "step": 908 }, { "epoch": 0.58176, "grad_norm": 4.104685306549072, "learning_rate": 5e-06, "loss": 1.4434, "mean_token_accuracy": 0.6598182618618011, "num_tokens": 11852988.0, "step": 909 }, { "epoch": 0.5824, "grad_norm": 3.4313085079193115, "learning_rate": 5e-06, "loss": 1.4167, "mean_token_accuracy": 0.6736102141439915, "num_tokens": 11866706.0, "step": 910 }, { "epoch": 0.58304, "grad_norm": 3.2502808570861816, "learning_rate": 5e-06, "loss": 1.3682, "mean_token_accuracy": 0.6428176760673523, "num_tokens": 11882889.0, "step": 911 }, { "epoch": 0.58368, "grad_norm": 3.662310838699341, "learning_rate": 5e-06, "loss": 1.0834, "mean_token_accuracy": 0.715592160820961, "num_tokens": 11895792.0, "step": 912 }, { "epoch": 0.58432, "grad_norm": 3.0405428409576416, "learning_rate": 5e-06, "loss": 1.2439, "mean_token_accuracy": 0.6653807386755943, "num_tokens": 11911225.0, "step": 913 }, { "epoch": 0.58496, "grad_norm": 3.550328016281128, "learning_rate": 5e-06, "loss": 1.3896, "mean_token_accuracy": 0.6358233094215393, "num_tokens": 11924073.0, "step": 914 }, { "epoch": 0.5856, "grad_norm": 3.2749056816101074, "learning_rate": 5e-06, "loss": 1.3361, "mean_token_accuracy": 0.6581274121999741, "num_tokens": 11938332.0, "step": 915 }, { "epoch": 0.58624, "grad_norm": 3.873444080352783, "learning_rate": 5e-06, "loss": 1.3269, "mean_token_accuracy": 0.6423259451985359, "num_tokens": 11950429.0, "step": 916 }, { "epoch": 0.58688, "grad_norm": 3.691632032394409, "learning_rate": 5e-06, "loss": 1.3576, "mean_token_accuracy": 0.6709922403097153, "num_tokens": 11962979.0, "step": 917 }, { "epoch": 0.58752, "grad_norm": 3.1465516090393066, "learning_rate": 5e-06, "loss": 1.3155, "mean_token_accuracy": 0.6728775128722191, "num_tokens": 11978492.0, "step": 918 }, { "epoch": 0.58816, "grad_norm": 3.738511562347412, "learning_rate": 5e-06, "loss": 1.1487, "mean_token_accuracy": 0.706301674246788, "num_tokens": 11991139.0, "step": 919 }, { "epoch": 0.5888, "grad_norm": 3.288872241973877, "learning_rate": 5e-06, "loss": 1.3725, "mean_token_accuracy": 0.6539236456155777, "num_tokens": 12005815.0, "step": 920 }, { "epoch": 0.58944, "grad_norm": 3.644181966781616, "learning_rate": 5e-06, "loss": 1.2652, "mean_token_accuracy": 0.6836251989006996, "num_tokens": 12017897.0, "step": 921 }, { "epoch": 0.59008, "grad_norm": 3.8078083992004395, "learning_rate": 5e-06, "loss": 1.3801, "mean_token_accuracy": 0.6469420120120049, "num_tokens": 12030359.0, "step": 922 }, { "epoch": 0.59072, "grad_norm": 3.2687323093414307, "learning_rate": 5e-06, "loss": 1.4049, "mean_token_accuracy": 0.6341100111603737, "num_tokens": 12044729.0, "step": 923 }, { "epoch": 0.59136, "grad_norm": 3.4478020668029785, "learning_rate": 5e-06, "loss": 1.4287, "mean_token_accuracy": 0.6452651098370552, "num_tokens": 12058788.0, "step": 924 }, { "epoch": 0.592, "grad_norm": 4.092494010925293, "learning_rate": 5e-06, "loss": 1.1304, "mean_token_accuracy": 0.6950362101197243, "num_tokens": 12069502.0, "step": 925 }, { "epoch": 0.59264, "grad_norm": 4.566901683807373, "learning_rate": 5e-06, "loss": 1.261, "mean_token_accuracy": 0.6621346473693848, "num_tokens": 12080932.0, "step": 926 }, { "epoch": 0.59328, "grad_norm": 3.4059062004089355, "learning_rate": 5e-06, "loss": 1.291, "mean_token_accuracy": 0.6705317497253418, "num_tokens": 12094725.0, "step": 927 }, { "epoch": 0.59392, "grad_norm": 4.018156051635742, "learning_rate": 5e-06, "loss": 1.4457, "mean_token_accuracy": 0.6595090329647064, "num_tokens": 12107082.0, "step": 928 }, { "epoch": 0.59456, "grad_norm": 3.448580741882324, "learning_rate": 5e-06, "loss": 1.2716, "mean_token_accuracy": 0.6856422200798988, "num_tokens": 12121239.0, "step": 929 }, { "epoch": 0.5952, "grad_norm": 3.425841808319092, "learning_rate": 5e-06, "loss": 1.3174, "mean_token_accuracy": 0.6534383073449135, "num_tokens": 12134626.0, "step": 930 }, { "epoch": 0.59584, "grad_norm": 4.416814804077148, "learning_rate": 5e-06, "loss": 1.2661, "mean_token_accuracy": 0.6484142020344734, "num_tokens": 12145951.0, "step": 931 }, { "epoch": 0.59648, "grad_norm": 3.968085765838623, "learning_rate": 5e-06, "loss": 1.4512, "mean_token_accuracy": 0.6346799582242966, "num_tokens": 12157958.0, "step": 932 }, { "epoch": 0.59712, "grad_norm": 3.6708478927612305, "learning_rate": 5e-06, "loss": 1.2783, "mean_token_accuracy": 0.6855080351233482, "num_tokens": 12170548.0, "step": 933 }, { "epoch": 0.59776, "grad_norm": 3.8740973472595215, "learning_rate": 5e-06, "loss": 1.1046, "mean_token_accuracy": 0.704432986676693, "num_tokens": 12185292.0, "step": 934 }, { "epoch": 0.5984, "grad_norm": 3.4846086502075195, "learning_rate": 5e-06, "loss": 1.3035, "mean_token_accuracy": 0.6759809032082558, "num_tokens": 12198081.0, "step": 935 }, { "epoch": 0.59904, "grad_norm": 3.027975082397461, "learning_rate": 5e-06, "loss": 1.2897, "mean_token_accuracy": 0.6573361679911613, "num_tokens": 12214742.0, "step": 936 }, { "epoch": 0.59968, "grad_norm": 3.879801034927368, "learning_rate": 5e-06, "loss": 1.3042, "mean_token_accuracy": 0.641165092587471, "num_tokens": 12225671.0, "step": 937 }, { "epoch": 0.60032, "grad_norm": 3.933652877807617, "learning_rate": 5e-06, "loss": 1.0435, "mean_token_accuracy": 0.7146632373332977, "num_tokens": 12239781.0, "step": 938 }, { "epoch": 0.60096, "grad_norm": 4.3125786781311035, "learning_rate": 5e-06, "loss": 1.2013, "mean_token_accuracy": 0.6595223546028137, "num_tokens": 12250660.0, "step": 939 }, { "epoch": 0.6016, "grad_norm": 3.671967029571533, "learning_rate": 5e-06, "loss": 1.4127, "mean_token_accuracy": 0.6495495587587357, "num_tokens": 12261626.0, "step": 940 }, { "epoch": 0.60224, "grad_norm": 3.524958610534668, "learning_rate": 5e-06, "loss": 1.5488, "mean_token_accuracy": 0.6084389686584473, "num_tokens": 12275245.0, "step": 941 }, { "epoch": 0.60288, "grad_norm": 3.6148650646209717, "learning_rate": 5e-06, "loss": 1.3679, "mean_token_accuracy": 0.6441814675927162, "num_tokens": 12289591.0, "step": 942 }, { "epoch": 0.60352, "grad_norm": 3.531022071838379, "learning_rate": 5e-06, "loss": 1.4175, "mean_token_accuracy": 0.6326302289962769, "num_tokens": 12303061.0, "step": 943 }, { "epoch": 0.60416, "grad_norm": 3.5599935054779053, "learning_rate": 5e-06, "loss": 1.4981, "mean_token_accuracy": 0.6270119249820709, "num_tokens": 12317689.0, "step": 944 }, { "epoch": 0.6048, "grad_norm": 3.125378370285034, "learning_rate": 5e-06, "loss": 1.0466, "mean_token_accuracy": 0.7116389200091362, "num_tokens": 12332378.0, "step": 945 }, { "epoch": 0.60544, "grad_norm": 3.8127193450927734, "learning_rate": 5e-06, "loss": 1.3282, "mean_token_accuracy": 0.6464278548955917, "num_tokens": 12345052.0, "step": 946 }, { "epoch": 0.60608, "grad_norm": 3.636815309524536, "learning_rate": 5e-06, "loss": 1.4728, "mean_token_accuracy": 0.6262509748339653, "num_tokens": 12356621.0, "step": 947 }, { "epoch": 0.60672, "grad_norm": 3.3789074420928955, "learning_rate": 5e-06, "loss": 1.2371, "mean_token_accuracy": 0.6846612468361855, "num_tokens": 12370824.0, "step": 948 }, { "epoch": 0.60736, "grad_norm": 3.5147576332092285, "learning_rate": 5e-06, "loss": 1.2429, "mean_token_accuracy": 0.6510372906923294, "num_tokens": 12382166.0, "step": 949 }, { "epoch": 0.608, "grad_norm": 4.723844528198242, "learning_rate": 5e-06, "loss": 1.3264, "mean_token_accuracy": 0.6632048487663269, "num_tokens": 12394366.0, "step": 950 }, { "epoch": 0.60864, "grad_norm": 3.679612398147583, "learning_rate": 5e-06, "loss": 1.3556, "mean_token_accuracy": 0.6747320145368576, "num_tokens": 12408326.0, "step": 951 }, { "epoch": 0.60928, "grad_norm": 3.3034772872924805, "learning_rate": 5e-06, "loss": 1.5539, "mean_token_accuracy": 0.6177156269550323, "num_tokens": 12422379.0, "step": 952 }, { "epoch": 0.60992, "grad_norm": 7.560748100280762, "learning_rate": 5e-06, "loss": 1.3543, "mean_token_accuracy": 0.6603868454694748, "num_tokens": 12434809.0, "step": 953 }, { "epoch": 0.61056, "grad_norm": 4.265347003936768, "learning_rate": 5e-06, "loss": 1.4282, "mean_token_accuracy": 0.6390318870544434, "num_tokens": 12447614.0, "step": 954 }, { "epoch": 0.6112, "grad_norm": 3.8850181102752686, "learning_rate": 5e-06, "loss": 1.2699, "mean_token_accuracy": 0.6917356178164482, "num_tokens": 12459891.0, "step": 955 }, { "epoch": 0.61184, "grad_norm": 3.479156255722046, "learning_rate": 5e-06, "loss": 1.1499, "mean_token_accuracy": 0.6750801056623459, "num_tokens": 12474836.0, "step": 956 }, { "epoch": 0.61248, "grad_norm": 2.7899651527404785, "learning_rate": 5e-06, "loss": 1.2879, "mean_token_accuracy": 0.6665042042732239, "num_tokens": 12493558.0, "step": 957 }, { "epoch": 0.61312, "grad_norm": 3.6457180976867676, "learning_rate": 5e-06, "loss": 1.2007, "mean_token_accuracy": 0.6936507746577263, "num_tokens": 12506849.0, "step": 958 }, { "epoch": 0.61376, "grad_norm": 3.0956859588623047, "learning_rate": 5e-06, "loss": 1.4628, "mean_token_accuracy": 0.6274904161691666, "num_tokens": 12522171.0, "step": 959 }, { "epoch": 0.6144, "grad_norm": 3.615293264389038, "learning_rate": 5e-06, "loss": 1.2786, "mean_token_accuracy": 0.679816409945488, "num_tokens": 12537702.0, "step": 960 }, { "epoch": 0.61504, "grad_norm": 3.4518120288848877, "learning_rate": 5e-06, "loss": 1.3591, "mean_token_accuracy": 0.6446092203259468, "num_tokens": 12550526.0, "step": 961 }, { "epoch": 0.61568, "grad_norm": 3.4621338844299316, "learning_rate": 5e-06, "loss": 1.3075, "mean_token_accuracy": 0.665081262588501, "num_tokens": 12564227.0, "step": 962 }, { "epoch": 0.61632, "grad_norm": 3.3471479415893555, "learning_rate": 5e-06, "loss": 1.4756, "mean_token_accuracy": 0.6200397908687592, "num_tokens": 12578106.0, "step": 963 }, { "epoch": 0.61696, "grad_norm": 3.874799966812134, "learning_rate": 5e-06, "loss": 1.2777, "mean_token_accuracy": 0.6613158509135246, "num_tokens": 12589750.0, "step": 964 }, { "epoch": 0.6176, "grad_norm": 4.006873607635498, "learning_rate": 5e-06, "loss": 1.408, "mean_token_accuracy": 0.6419458091259003, "num_tokens": 12602450.0, "step": 965 }, { "epoch": 0.61824, "grad_norm": 3.674241542816162, "learning_rate": 5e-06, "loss": 1.2756, "mean_token_accuracy": 0.6584514081478119, "num_tokens": 12613871.0, "step": 966 }, { "epoch": 0.61888, "grad_norm": 3.7405648231506348, "learning_rate": 5e-06, "loss": 1.3301, "mean_token_accuracy": 0.6810361295938492, "num_tokens": 12626220.0, "step": 967 }, { "epoch": 0.61952, "grad_norm": 3.660600185394287, "learning_rate": 5e-06, "loss": 1.2219, "mean_token_accuracy": 0.6716256737709045, "num_tokens": 12636440.0, "step": 968 }, { "epoch": 0.62016, "grad_norm": 3.4270999431610107, "learning_rate": 5e-06, "loss": 1.0976, "mean_token_accuracy": 0.7019147500395775, "num_tokens": 12649545.0, "step": 969 }, { "epoch": 0.6208, "grad_norm": 3.562014102935791, "learning_rate": 5e-06, "loss": 1.2243, "mean_token_accuracy": 0.6942142397165298, "num_tokens": 12660841.0, "step": 970 }, { "epoch": 0.62144, "grad_norm": 4.004054069519043, "learning_rate": 5e-06, "loss": 1.0961, "mean_token_accuracy": 0.6962654888629913, "num_tokens": 12675967.0, "step": 971 }, { "epoch": 0.62208, "grad_norm": 3.749152898788452, "learning_rate": 5e-06, "loss": 1.489, "mean_token_accuracy": 0.6146213822066784, "num_tokens": 12687203.0, "step": 972 }, { "epoch": 0.62272, "grad_norm": 3.2638871669769287, "learning_rate": 5e-06, "loss": 1.1979, "mean_token_accuracy": 0.687263160943985, "num_tokens": 12700697.0, "step": 973 }, { "epoch": 0.62336, "grad_norm": 3.310070037841797, "learning_rate": 5e-06, "loss": 1.4779, "mean_token_accuracy": 0.632742814719677, "num_tokens": 12716668.0, "step": 974 }, { "epoch": 0.624, "grad_norm": 3.3164589405059814, "learning_rate": 5e-06, "loss": 1.301, "mean_token_accuracy": 0.6563373729586601, "num_tokens": 12729912.0, "step": 975 }, { "epoch": 0.62464, "grad_norm": 3.2415506839752197, "learning_rate": 5e-06, "loss": 1.393, "mean_token_accuracy": 0.644082136452198, "num_tokens": 12745520.0, "step": 976 }, { "epoch": 0.62528, "grad_norm": 3.333308458328247, "learning_rate": 5e-06, "loss": 1.1238, "mean_token_accuracy": 0.6863394901156425, "num_tokens": 12759203.0, "step": 977 }, { "epoch": 0.62592, "grad_norm": 4.198854923248291, "learning_rate": 5e-06, "loss": 1.3601, "mean_token_accuracy": 0.6624843999743462, "num_tokens": 12770322.0, "step": 978 }, { "epoch": 0.62656, "grad_norm": 3.849907636642456, "learning_rate": 5e-06, "loss": 1.2947, "mean_token_accuracy": 0.6675618216395378, "num_tokens": 12782951.0, "step": 979 }, { "epoch": 0.6272, "grad_norm": 3.4649503231048584, "learning_rate": 5e-06, "loss": 1.1915, "mean_token_accuracy": 0.6806611344218254, "num_tokens": 12795383.0, "step": 980 }, { "epoch": 0.62784, "grad_norm": 3.63466739654541, "learning_rate": 5e-06, "loss": 1.3124, "mean_token_accuracy": 0.6731822267174721, "num_tokens": 12808692.0, "step": 981 }, { "epoch": 0.62848, "grad_norm": 4.293845176696777, "learning_rate": 5e-06, "loss": 1.1757, "mean_token_accuracy": 0.6780604794621468, "num_tokens": 12821099.0, "step": 982 }, { "epoch": 0.62912, "grad_norm": 3.565584897994995, "learning_rate": 5e-06, "loss": 1.1746, "mean_token_accuracy": 0.6787943094968796, "num_tokens": 12832201.0, "step": 983 }, { "epoch": 0.62976, "grad_norm": 3.517613410949707, "learning_rate": 5e-06, "loss": 1.2167, "mean_token_accuracy": 0.6914558485150337, "num_tokens": 12845465.0, "step": 984 }, { "epoch": 0.6304, "grad_norm": 3.6170578002929688, "learning_rate": 5e-06, "loss": 1.3226, "mean_token_accuracy": 0.6587705016136169, "num_tokens": 12857366.0, "step": 985 }, { "epoch": 0.63104, "grad_norm": 3.504154682159424, "learning_rate": 5e-06, "loss": 1.4641, "mean_token_accuracy": 0.6085046976804733, "num_tokens": 12871695.0, "step": 986 }, { "epoch": 0.63168, "grad_norm": 3.543142557144165, "learning_rate": 5e-06, "loss": 1.1252, "mean_token_accuracy": 0.7007554769515991, "num_tokens": 12884113.0, "step": 987 }, { "epoch": 0.63232, "grad_norm": 3.9888851642608643, "learning_rate": 5e-06, "loss": 1.2741, "mean_token_accuracy": 0.656329832971096, "num_tokens": 12898706.0, "step": 988 }, { "epoch": 0.63296, "grad_norm": 3.472778081893921, "learning_rate": 5e-06, "loss": 1.2431, "mean_token_accuracy": 0.6751798540353775, "num_tokens": 12911380.0, "step": 989 }, { "epoch": 0.6336, "grad_norm": 3.3277764320373535, "learning_rate": 5e-06, "loss": 1.475, "mean_token_accuracy": 0.632897637784481, "num_tokens": 12925697.0, "step": 990 }, { "epoch": 0.63424, "grad_norm": 3.047473669052124, "learning_rate": 5e-06, "loss": 1.2663, "mean_token_accuracy": 0.6683759167790413, "num_tokens": 12939473.0, "step": 991 }, { "epoch": 0.63488, "grad_norm": 3.483201503753662, "learning_rate": 5e-06, "loss": 1.3407, "mean_token_accuracy": 0.6652352660894394, "num_tokens": 12952439.0, "step": 992 }, { "epoch": 0.63552, "grad_norm": 4.43934965133667, "learning_rate": 5e-06, "loss": 1.0979, "mean_token_accuracy": 0.6819510236382484, "num_tokens": 12963232.0, "step": 993 }, { "epoch": 0.63616, "grad_norm": 3.2107748985290527, "learning_rate": 5e-06, "loss": 1.2219, "mean_token_accuracy": 0.6648012548685074, "num_tokens": 12976396.0, "step": 994 }, { "epoch": 0.6368, "grad_norm": 3.8679394721984863, "learning_rate": 5e-06, "loss": 1.3487, "mean_token_accuracy": 0.6491437703371048, "num_tokens": 12989586.0, "step": 995 }, { "epoch": 0.63744, "grad_norm": 3.75811767578125, "learning_rate": 5e-06, "loss": 1.2384, "mean_token_accuracy": 0.6684290617704391, "num_tokens": 13002145.0, "step": 996 }, { "epoch": 0.63808, "grad_norm": 4.223326206207275, "learning_rate": 5e-06, "loss": 1.3218, "mean_token_accuracy": 0.6605047658085823, "num_tokens": 13011853.0, "step": 997 }, { "epoch": 0.63872, "grad_norm": 4.10746955871582, "learning_rate": 5e-06, "loss": 1.2647, "mean_token_accuracy": 0.6529572680592537, "num_tokens": 13022296.0, "step": 998 }, { "epoch": 0.63936, "grad_norm": 3.858157157897949, "learning_rate": 5e-06, "loss": 1.3031, "mean_token_accuracy": 0.6564661860466003, "num_tokens": 13032768.0, "step": 999 }, { "epoch": 0.64, "grad_norm": 3.4283535480499268, "learning_rate": 5e-06, "loss": 1.3122, "mean_token_accuracy": 0.6764922738075256, "num_tokens": 13045249.0, "step": 1000 }, { "epoch": 0.64064, "grad_norm": 3.5663790702819824, "learning_rate": 5e-06, "loss": 1.3038, "mean_token_accuracy": 0.6534328386187553, "num_tokens": 13057087.0, "step": 1001 }, { "epoch": 0.64128, "grad_norm": 4.08723783493042, "learning_rate": 5e-06, "loss": 1.311, "mean_token_accuracy": 0.6497639790177345, "num_tokens": 13069353.0, "step": 1002 }, { "epoch": 0.64192, "grad_norm": 3.1709539890289307, "learning_rate": 5e-06, "loss": 1.4385, "mean_token_accuracy": 0.6491308063268661, "num_tokens": 13084511.0, "step": 1003 }, { "epoch": 0.64256, "grad_norm": 3.7724292278289795, "learning_rate": 5e-06, "loss": 1.2049, "mean_token_accuracy": 0.6716037020087242, "num_tokens": 13095572.0, "step": 1004 }, { "epoch": 0.6432, "grad_norm": 3.4885339736938477, "learning_rate": 5e-06, "loss": 1.1454, "mean_token_accuracy": 0.7053978741168976, "num_tokens": 13108817.0, "step": 1005 }, { "epoch": 0.64384, "grad_norm": 3.718435287475586, "learning_rate": 5e-06, "loss": 1.2653, "mean_token_accuracy": 0.6529537960886955, "num_tokens": 13119845.0, "step": 1006 }, { "epoch": 0.64448, "grad_norm": 3.7939629554748535, "learning_rate": 5e-06, "loss": 1.2563, "mean_token_accuracy": 0.6783741563558578, "num_tokens": 13131590.0, "step": 1007 }, { "epoch": 0.64512, "grad_norm": 3.0090038776397705, "learning_rate": 5e-06, "loss": 1.3032, "mean_token_accuracy": 0.6621121242642403, "num_tokens": 13146614.0, "step": 1008 }, { "epoch": 0.64576, "grad_norm": 3.3267111778259277, "learning_rate": 5e-06, "loss": 1.4166, "mean_token_accuracy": 0.6354531794786453, "num_tokens": 13160682.0, "step": 1009 }, { "epoch": 0.6464, "grad_norm": 3.528743267059326, "learning_rate": 5e-06, "loss": 1.246, "mean_token_accuracy": 0.6703604385256767, "num_tokens": 13172998.0, "step": 1010 }, { "epoch": 0.64704, "grad_norm": 3.2315750122070312, "learning_rate": 5e-06, "loss": 1.2159, "mean_token_accuracy": 0.6840131431818008, "num_tokens": 13187182.0, "step": 1011 }, { "epoch": 0.64768, "grad_norm": 3.885690689086914, "learning_rate": 5e-06, "loss": 1.3193, "mean_token_accuracy": 0.6602044403553009, "num_tokens": 13198449.0, "step": 1012 }, { "epoch": 0.64832, "grad_norm": 4.214417934417725, "learning_rate": 5e-06, "loss": 1.3289, "mean_token_accuracy": 0.638420894742012, "num_tokens": 13208888.0, "step": 1013 }, { "epoch": 0.64896, "grad_norm": 3.303224563598633, "learning_rate": 5e-06, "loss": 1.3866, "mean_token_accuracy": 0.6286184787750244, "num_tokens": 13222042.0, "step": 1014 }, { "epoch": 0.6496, "grad_norm": 3.879709482192993, "learning_rate": 5e-06, "loss": 1.4231, "mean_token_accuracy": 0.6209117695689201, "num_tokens": 13234662.0, "step": 1015 }, { "epoch": 0.65024, "grad_norm": 3.770817995071411, "learning_rate": 5e-06, "loss": 1.3315, "mean_token_accuracy": 0.6555488482117653, "num_tokens": 13245357.0, "step": 1016 }, { "epoch": 0.65088, "grad_norm": 3.627957582473755, "learning_rate": 5e-06, "loss": 1.2457, "mean_token_accuracy": 0.6971615925431252, "num_tokens": 13258154.0, "step": 1017 }, { "epoch": 0.65152, "grad_norm": 3.818009853363037, "learning_rate": 5e-06, "loss": 1.2174, "mean_token_accuracy": 0.6713634058833122, "num_tokens": 13270001.0, "step": 1018 }, { "epoch": 0.65216, "grad_norm": 3.7726924419403076, "learning_rate": 5e-06, "loss": 1.2554, "mean_token_accuracy": 0.6766888722777367, "num_tokens": 13281334.0, "step": 1019 }, { "epoch": 0.6528, "grad_norm": 3.608661413192749, "learning_rate": 5e-06, "loss": 1.4083, "mean_token_accuracy": 0.6521065756678581, "num_tokens": 13294860.0, "step": 1020 }, { "epoch": 0.65344, "grad_norm": 3.7841391563415527, "learning_rate": 5e-06, "loss": 1.4197, "mean_token_accuracy": 0.6439206749200821, "num_tokens": 13308758.0, "step": 1021 }, { "epoch": 0.65408, "grad_norm": 3.836831569671631, "learning_rate": 5e-06, "loss": 1.3922, "mean_token_accuracy": 0.6572518870234489, "num_tokens": 13319883.0, "step": 1022 }, { "epoch": 0.65472, "grad_norm": 3.774944305419922, "learning_rate": 5e-06, "loss": 1.1792, "mean_token_accuracy": 0.6656563580036163, "num_tokens": 13331846.0, "step": 1023 }, { "epoch": 0.65536, "grad_norm": 4.0701751708984375, "learning_rate": 5e-06, "loss": 1.2333, "mean_token_accuracy": 0.6843068152666092, "num_tokens": 13343010.0, "step": 1024 }, { "epoch": 0.656, "grad_norm": 3.7170510292053223, "learning_rate": 5e-06, "loss": 1.1427, "mean_token_accuracy": 0.6941706016659737, "num_tokens": 13356096.0, "step": 1025 }, { "epoch": 0.65664, "grad_norm": 3.4047844409942627, "learning_rate": 5e-06, "loss": 1.2994, "mean_token_accuracy": 0.6502626538276672, "num_tokens": 13370415.0, "step": 1026 }, { "epoch": 0.65728, "grad_norm": 3.013894557952881, "learning_rate": 5e-06, "loss": 1.2778, "mean_token_accuracy": 0.6713566333055496, "num_tokens": 13385621.0, "step": 1027 }, { "epoch": 0.65792, "grad_norm": 3.8273723125457764, "learning_rate": 5e-06, "loss": 1.4213, "mean_token_accuracy": 0.6448807269334793, "num_tokens": 13399913.0, "step": 1028 }, { "epoch": 0.65856, "grad_norm": 4.501821041107178, "learning_rate": 5e-06, "loss": 1.3451, "mean_token_accuracy": 0.6397663801908493, "num_tokens": 13410323.0, "step": 1029 }, { "epoch": 0.6592, "grad_norm": 3.656630516052246, "learning_rate": 5e-06, "loss": 1.3693, "mean_token_accuracy": 0.6572180986404419, "num_tokens": 13421431.0, "step": 1030 }, { "epoch": 0.65984, "grad_norm": 3.761538505554199, "learning_rate": 5e-06, "loss": 1.3701, "mean_token_accuracy": 0.6800885275006294, "num_tokens": 13433250.0, "step": 1031 }, { "epoch": 0.66048, "grad_norm": 3.5799546241760254, "learning_rate": 5e-06, "loss": 1.3473, "mean_token_accuracy": 0.6503010243177414, "num_tokens": 13446010.0, "step": 1032 }, { "epoch": 0.66112, "grad_norm": 3.578547239303589, "learning_rate": 5e-06, "loss": 1.2915, "mean_token_accuracy": 0.7072044536471367, "num_tokens": 13458978.0, "step": 1033 }, { "epoch": 0.66176, "grad_norm": 3.554094076156616, "learning_rate": 5e-06, "loss": 1.1486, "mean_token_accuracy": 0.6704866662621498, "num_tokens": 13471248.0, "step": 1034 }, { "epoch": 0.6624, "grad_norm": 3.5921144485473633, "learning_rate": 5e-06, "loss": 1.3039, "mean_token_accuracy": 0.6752297282218933, "num_tokens": 13483651.0, "step": 1035 }, { "epoch": 0.66304, "grad_norm": 3.580885648727417, "learning_rate": 5e-06, "loss": 1.3638, "mean_token_accuracy": 0.6531356200575829, "num_tokens": 13496365.0, "step": 1036 }, { "epoch": 0.66368, "grad_norm": 3.6400530338287354, "learning_rate": 5e-06, "loss": 1.2824, "mean_token_accuracy": 0.6671290174126625, "num_tokens": 13509196.0, "step": 1037 }, { "epoch": 0.66432, "grad_norm": 3.050649404525757, "learning_rate": 5e-06, "loss": 1.2883, "mean_token_accuracy": 0.6613614112138748, "num_tokens": 13524592.0, "step": 1038 }, { "epoch": 0.66496, "grad_norm": 3.1810715198516846, "learning_rate": 5e-06, "loss": 1.4794, "mean_token_accuracy": 0.633582279086113, "num_tokens": 13539709.0, "step": 1039 }, { "epoch": 0.6656, "grad_norm": 3.488229751586914, "learning_rate": 5e-06, "loss": 1.1359, "mean_token_accuracy": 0.6704655885696411, "num_tokens": 13551656.0, "step": 1040 }, { "epoch": 0.66624, "grad_norm": 3.1657679080963135, "learning_rate": 5e-06, "loss": 1.405, "mean_token_accuracy": 0.6457558646798134, "num_tokens": 13566681.0, "step": 1041 }, { "epoch": 0.66688, "grad_norm": 3.7111074924468994, "learning_rate": 5e-06, "loss": 1.1577, "mean_token_accuracy": 0.6851859763264656, "num_tokens": 13578777.0, "step": 1042 }, { "epoch": 0.66752, "grad_norm": 3.803246021270752, "learning_rate": 5e-06, "loss": 1.7394, "mean_token_accuracy": 0.5742352418601513, "num_tokens": 13591184.0, "step": 1043 }, { "epoch": 0.66816, "grad_norm": 3.44681453704834, "learning_rate": 5e-06, "loss": 1.249, "mean_token_accuracy": 0.6815991401672363, "num_tokens": 13603011.0, "step": 1044 }, { "epoch": 0.6688, "grad_norm": 3.4363629817962646, "learning_rate": 5e-06, "loss": 1.207, "mean_token_accuracy": 0.6451572626829147, "num_tokens": 13617577.0, "step": 1045 }, { "epoch": 0.66944, "grad_norm": 3.9714715480804443, "learning_rate": 5e-06, "loss": 1.316, "mean_token_accuracy": 0.6733849868178368, "num_tokens": 13628510.0, "step": 1046 }, { "epoch": 0.67008, "grad_norm": 3.5095605850219727, "learning_rate": 5e-06, "loss": 1.2265, "mean_token_accuracy": 0.6774598509073257, "num_tokens": 13639965.0, "step": 1047 }, { "epoch": 0.67072, "grad_norm": 3.4731342792510986, "learning_rate": 5e-06, "loss": 1.514, "mean_token_accuracy": 0.6291368082165718, "num_tokens": 13652881.0, "step": 1048 }, { "epoch": 0.67136, "grad_norm": 3.4731788635253906, "learning_rate": 5e-06, "loss": 1.2134, "mean_token_accuracy": 0.7064939141273499, "num_tokens": 13665986.0, "step": 1049 }, { "epoch": 0.672, "grad_norm": 3.885256052017212, "learning_rate": 5e-06, "loss": 1.2582, "mean_token_accuracy": 0.6920784562826157, "num_tokens": 13677555.0, "step": 1050 }, { "epoch": 0.67264, "grad_norm": 3.5971357822418213, "learning_rate": 5e-06, "loss": 1.2803, "mean_token_accuracy": 0.6860932558774948, "num_tokens": 13689439.0, "step": 1051 }, { "epoch": 0.67328, "grad_norm": 3.4999284744262695, "learning_rate": 5e-06, "loss": 1.2694, "mean_token_accuracy": 0.6710788905620575, "num_tokens": 13703306.0, "step": 1052 }, { "epoch": 0.67392, "grad_norm": 3.894716262817383, "learning_rate": 5e-06, "loss": 1.1895, "mean_token_accuracy": 0.694163866341114, "num_tokens": 13715107.0, "step": 1053 }, { "epoch": 0.67456, "grad_norm": 3.8361921310424805, "learning_rate": 5e-06, "loss": 1.3142, "mean_token_accuracy": 0.6533372104167938, "num_tokens": 13726324.0, "step": 1054 }, { "epoch": 0.6752, "grad_norm": 3.5220136642456055, "learning_rate": 5e-06, "loss": 1.5138, "mean_token_accuracy": 0.6085778698325157, "num_tokens": 13739132.0, "step": 1055 }, { "epoch": 0.67584, "grad_norm": 3.4445347785949707, "learning_rate": 5e-06, "loss": 1.3869, "mean_token_accuracy": 0.6502392590045929, "num_tokens": 13751996.0, "step": 1056 }, { "epoch": 0.67648, "grad_norm": 4.514054298400879, "learning_rate": 5e-06, "loss": 1.0082, "mean_token_accuracy": 0.7111445441842079, "num_tokens": 13762912.0, "step": 1057 }, { "epoch": 0.67712, "grad_norm": 3.4511091709136963, "learning_rate": 5e-06, "loss": 1.2244, "mean_token_accuracy": 0.6701223999261856, "num_tokens": 13776094.0, "step": 1058 }, { "epoch": 0.67776, "grad_norm": 3.518554449081421, "learning_rate": 5e-06, "loss": 1.4949, "mean_token_accuracy": 0.6190471276640892, "num_tokens": 13789681.0, "step": 1059 }, { "epoch": 0.6784, "grad_norm": 3.177955150604248, "learning_rate": 5e-06, "loss": 1.4254, "mean_token_accuracy": 0.6302540749311447, "num_tokens": 13802777.0, "step": 1060 }, { "epoch": 0.67904, "grad_norm": 3.7214250564575195, "learning_rate": 5e-06, "loss": 1.3088, "mean_token_accuracy": 0.6556456014513969, "num_tokens": 13815229.0, "step": 1061 }, { "epoch": 0.67968, "grad_norm": 3.726001739501953, "learning_rate": 5e-06, "loss": 1.2057, "mean_token_accuracy": 0.6688976883888245, "num_tokens": 13826932.0, "step": 1062 }, { "epoch": 0.68032, "grad_norm": 3.1761860847473145, "learning_rate": 5e-06, "loss": 1.4054, "mean_token_accuracy": 0.6317546740174294, "num_tokens": 13842144.0, "step": 1063 }, { "epoch": 0.68096, "grad_norm": 4.224031448364258, "learning_rate": 5e-06, "loss": 1.1421, "mean_token_accuracy": 0.6879750266671181, "num_tokens": 13852498.0, "step": 1064 }, { "epoch": 0.6816, "grad_norm": 3.1462998390197754, "learning_rate": 5e-06, "loss": 1.3639, "mean_token_accuracy": 0.6389659121632576, "num_tokens": 13867842.0, "step": 1065 }, { "epoch": 0.68224, "grad_norm": 3.7994680404663086, "learning_rate": 5e-06, "loss": 1.2442, "mean_token_accuracy": 0.6608950793743134, "num_tokens": 13878904.0, "step": 1066 }, { "epoch": 0.68288, "grad_norm": 3.3029258251190186, "learning_rate": 5e-06, "loss": 1.1501, "mean_token_accuracy": 0.6978383362293243, "num_tokens": 13892970.0, "step": 1067 }, { "epoch": 0.68352, "grad_norm": 4.019161224365234, "learning_rate": 5e-06, "loss": 1.2156, "mean_token_accuracy": 0.689938597381115, "num_tokens": 13904207.0, "step": 1068 }, { "epoch": 0.68416, "grad_norm": 3.9899635314941406, "learning_rate": 5e-06, "loss": 1.4744, "mean_token_accuracy": 0.6247128024697304, "num_tokens": 13917327.0, "step": 1069 }, { "epoch": 0.6848, "grad_norm": 5.03689432144165, "learning_rate": 5e-06, "loss": 1.3551, "mean_token_accuracy": 0.6756569370627403, "num_tokens": 13927551.0, "step": 1070 }, { "epoch": 0.68544, "grad_norm": 3.43404221534729, "learning_rate": 5e-06, "loss": 1.1618, "mean_token_accuracy": 0.6679700240492821, "num_tokens": 13939630.0, "step": 1071 }, { "epoch": 0.68608, "grad_norm": 4.027390956878662, "learning_rate": 5e-06, "loss": 1.3842, "mean_token_accuracy": 0.6630447581410408, "num_tokens": 13949082.0, "step": 1072 }, { "epoch": 0.68672, "grad_norm": 3.764420986175537, "learning_rate": 5e-06, "loss": 1.293, "mean_token_accuracy": 0.6628079935908318, "num_tokens": 13962085.0, "step": 1073 }, { "epoch": 0.68736, "grad_norm": 3.617522954940796, "learning_rate": 5e-06, "loss": 1.5355, "mean_token_accuracy": 0.6268560141324997, "num_tokens": 13973510.0, "step": 1074 }, { "epoch": 0.688, "grad_norm": 3.6434836387634277, "learning_rate": 5e-06, "loss": 1.3283, "mean_token_accuracy": 0.6506749242544174, "num_tokens": 13986487.0, "step": 1075 }, { "epoch": 0.68864, "grad_norm": 3.4601213932037354, "learning_rate": 5e-06, "loss": 1.1938, "mean_token_accuracy": 0.6761009320616722, "num_tokens": 13998818.0, "step": 1076 }, { "epoch": 0.68928, "grad_norm": 3.537867307662964, "learning_rate": 5e-06, "loss": 1.2904, "mean_token_accuracy": 0.6788460239768028, "num_tokens": 14011012.0, "step": 1077 }, { "epoch": 0.68992, "grad_norm": 3.204850435256958, "learning_rate": 5e-06, "loss": 1.257, "mean_token_accuracy": 0.650609090924263, "num_tokens": 14026133.0, "step": 1078 }, { "epoch": 0.69056, "grad_norm": 3.1684117317199707, "learning_rate": 5e-06, "loss": 1.3857, "mean_token_accuracy": 0.669170081615448, "num_tokens": 14041463.0, "step": 1079 }, { "epoch": 0.6912, "grad_norm": 2.97310209274292, "learning_rate": 5e-06, "loss": 1.1807, "mean_token_accuracy": 0.678413525223732, "num_tokens": 14055206.0, "step": 1080 }, { "epoch": 0.69184, "grad_norm": 3.415344476699829, "learning_rate": 5e-06, "loss": 1.335, "mean_token_accuracy": 0.6698315292596817, "num_tokens": 14067043.0, "step": 1081 }, { "epoch": 0.69248, "grad_norm": 3.2605786323547363, "learning_rate": 5e-06, "loss": 1.23, "mean_token_accuracy": 0.6607236042618752, "num_tokens": 14081150.0, "step": 1082 }, { "epoch": 0.69312, "grad_norm": 3.5928292274475098, "learning_rate": 5e-06, "loss": 1.3275, "mean_token_accuracy": 0.6353218033909798, "num_tokens": 14093199.0, "step": 1083 }, { "epoch": 0.69376, "grad_norm": 3.6726202964782715, "learning_rate": 5e-06, "loss": 1.2071, "mean_token_accuracy": 0.6714643463492393, "num_tokens": 14106118.0, "step": 1084 }, { "epoch": 0.6944, "grad_norm": 3.3273112773895264, "learning_rate": 5e-06, "loss": 1.2143, "mean_token_accuracy": 0.6702926605939865, "num_tokens": 14119919.0, "step": 1085 }, { "epoch": 0.69504, "grad_norm": 3.3181533813476562, "learning_rate": 5e-06, "loss": 1.525, "mean_token_accuracy": 0.6393495798110962, "num_tokens": 14134965.0, "step": 1086 }, { "epoch": 0.69568, "grad_norm": 3.290024995803833, "learning_rate": 5e-06, "loss": 1.4868, "mean_token_accuracy": 0.641509011387825, "num_tokens": 14150873.0, "step": 1087 }, { "epoch": 0.69632, "grad_norm": 3.179009199142456, "learning_rate": 5e-06, "loss": 1.3236, "mean_token_accuracy": 0.6799350008368492, "num_tokens": 14165423.0, "step": 1088 }, { "epoch": 0.69696, "grad_norm": 4.067260265350342, "learning_rate": 5e-06, "loss": 1.3399, "mean_token_accuracy": 0.6480335146188736, "num_tokens": 14177467.0, "step": 1089 }, { "epoch": 0.6976, "grad_norm": 3.0903289318084717, "learning_rate": 5e-06, "loss": 1.2387, "mean_token_accuracy": 0.6776180788874626, "num_tokens": 14192746.0, "step": 1090 }, { "epoch": 0.69824, "grad_norm": 3.60392165184021, "learning_rate": 5e-06, "loss": 1.4149, "mean_token_accuracy": 0.6424715965986252, "num_tokens": 14205814.0, "step": 1091 }, { "epoch": 0.69888, "grad_norm": 3.857509136199951, "learning_rate": 5e-06, "loss": 1.2077, "mean_token_accuracy": 0.6793344393372536, "num_tokens": 14216791.0, "step": 1092 }, { "epoch": 0.69952, "grad_norm": 3.376009702682495, "learning_rate": 5e-06, "loss": 1.329, "mean_token_accuracy": 0.6632150262594223, "num_tokens": 14231190.0, "step": 1093 }, { "epoch": 0.70016, "grad_norm": 3.522667407989502, "learning_rate": 5e-06, "loss": 1.3746, "mean_token_accuracy": 0.6448647379875183, "num_tokens": 14246266.0, "step": 1094 }, { "epoch": 0.7008, "grad_norm": 3.88810658454895, "learning_rate": 5e-06, "loss": 1.4056, "mean_token_accuracy": 0.6266975551843643, "num_tokens": 14258900.0, "step": 1095 }, { "epoch": 0.70144, "grad_norm": 4.134660243988037, "learning_rate": 5e-06, "loss": 1.4136, "mean_token_accuracy": 0.6337871551513672, "num_tokens": 14270135.0, "step": 1096 }, { "epoch": 0.70208, "grad_norm": 2.7987403869628906, "learning_rate": 5e-06, "loss": 1.2978, "mean_token_accuracy": 0.6591611802577972, "num_tokens": 14287885.0, "step": 1097 }, { "epoch": 0.70272, "grad_norm": 3.6904680728912354, "learning_rate": 5e-06, "loss": 1.3421, "mean_token_accuracy": 0.6668279096484184, "num_tokens": 14299211.0, "step": 1098 }, { "epoch": 0.70336, "grad_norm": 3.754704475402832, "learning_rate": 5e-06, "loss": 1.2728, "mean_token_accuracy": 0.6614086180925369, "num_tokens": 14310772.0, "step": 1099 }, { "epoch": 0.704, "grad_norm": 4.1148529052734375, "learning_rate": 5e-06, "loss": 1.3038, "mean_token_accuracy": 0.6619797796010971, "num_tokens": 14321538.0, "step": 1100 }, { "epoch": 0.70464, "grad_norm": 3.9892449378967285, "learning_rate": 5e-06, "loss": 1.4351, "mean_token_accuracy": 0.6713762879371643, "num_tokens": 14332186.0, "step": 1101 }, { "epoch": 0.70528, "grad_norm": 2.8868937492370605, "learning_rate": 5e-06, "loss": 1.3939, "mean_token_accuracy": 0.6400659307837486, "num_tokens": 14349043.0, "step": 1102 }, { "epoch": 0.70592, "grad_norm": 3.4299302101135254, "learning_rate": 5e-06, "loss": 1.2674, "mean_token_accuracy": 0.6643095165491104, "num_tokens": 14363765.0, "step": 1103 }, { "epoch": 0.70656, "grad_norm": 3.3706107139587402, "learning_rate": 5e-06, "loss": 1.3314, "mean_token_accuracy": 0.6502428278326988, "num_tokens": 14377935.0, "step": 1104 }, { "epoch": 0.7072, "grad_norm": 3.353766441345215, "learning_rate": 5e-06, "loss": 1.4449, "mean_token_accuracy": 0.6395809948444366, "num_tokens": 14391497.0, "step": 1105 }, { "epoch": 0.70784, "grad_norm": 3.7346043586730957, "learning_rate": 5e-06, "loss": 1.2955, "mean_token_accuracy": 0.6504843458533287, "num_tokens": 14403120.0, "step": 1106 }, { "epoch": 0.70848, "grad_norm": 3.9729044437408447, "learning_rate": 5e-06, "loss": 1.2647, "mean_token_accuracy": 0.6647339016199112, "num_tokens": 14415923.0, "step": 1107 }, { "epoch": 0.70912, "grad_norm": 4.029970169067383, "learning_rate": 5e-06, "loss": 1.227, "mean_token_accuracy": 0.6661604270339012, "num_tokens": 14427554.0, "step": 1108 }, { "epoch": 0.70976, "grad_norm": 3.4321465492248535, "learning_rate": 5e-06, "loss": 1.1746, "mean_token_accuracy": 0.6878796294331551, "num_tokens": 14441354.0, "step": 1109 }, { "epoch": 0.7104, "grad_norm": 3.303091287612915, "learning_rate": 5e-06, "loss": 1.3507, "mean_token_accuracy": 0.6451763212680817, "num_tokens": 14455989.0, "step": 1110 }, { "epoch": 0.71104, "grad_norm": 3.9641027450561523, "learning_rate": 5e-06, "loss": 1.287, "mean_token_accuracy": 0.661915197968483, "num_tokens": 14466700.0, "step": 1111 }, { "epoch": 0.71168, "grad_norm": 3.4277381896972656, "learning_rate": 5e-06, "loss": 1.1418, "mean_token_accuracy": 0.6885363236069679, "num_tokens": 14479735.0, "step": 1112 }, { "epoch": 0.71232, "grad_norm": 3.531708240509033, "learning_rate": 5e-06, "loss": 1.1786, "mean_token_accuracy": 0.6798023506999016, "num_tokens": 14492428.0, "step": 1113 }, { "epoch": 0.71296, "grad_norm": 3.962233304977417, "learning_rate": 5e-06, "loss": 1.5488, "mean_token_accuracy": 0.6183573752641678, "num_tokens": 14505050.0, "step": 1114 }, { "epoch": 0.7136, "grad_norm": 3.1472697257995605, "learning_rate": 5e-06, "loss": 1.426, "mean_token_accuracy": 0.6550877764821053, "num_tokens": 14519128.0, "step": 1115 }, { "epoch": 0.71424, "grad_norm": 3.8537216186523438, "learning_rate": 5e-06, "loss": 1.1991, "mean_token_accuracy": 0.682333379983902, "num_tokens": 14530594.0, "step": 1116 }, { "epoch": 0.71488, "grad_norm": 3.527343511581421, "learning_rate": 5e-06, "loss": 1.207, "mean_token_accuracy": 0.6722685918211937, "num_tokens": 14541472.0, "step": 1117 }, { "epoch": 0.71552, "grad_norm": 3.790855646133423, "learning_rate": 5e-06, "loss": 1.2531, "mean_token_accuracy": 0.6708436533808708, "num_tokens": 14552813.0, "step": 1118 }, { "epoch": 0.71616, "grad_norm": 3.553488254547119, "learning_rate": 5e-06, "loss": 1.1016, "mean_token_accuracy": 0.7148231789469719, "num_tokens": 14565447.0, "step": 1119 }, { "epoch": 0.7168, "grad_norm": 3.887118339538574, "learning_rate": 5e-06, "loss": 1.1646, "mean_token_accuracy": 0.6907912865281105, "num_tokens": 14577377.0, "step": 1120 }, { "epoch": 0.71744, "grad_norm": 3.0343868732452393, "learning_rate": 5e-06, "loss": 1.142, "mean_token_accuracy": 0.6866175085306168, "num_tokens": 14591768.0, "step": 1121 }, { "epoch": 0.71808, "grad_norm": 4.561229705810547, "learning_rate": 5e-06, "loss": 1.4004, "mean_token_accuracy": 0.6330222748219967, "num_tokens": 14603193.0, "step": 1122 }, { "epoch": 0.71872, "grad_norm": 3.5638325214385986, "learning_rate": 5e-06, "loss": 1.1526, "mean_token_accuracy": 0.6880695223808289, "num_tokens": 14617007.0, "step": 1123 }, { "epoch": 0.71936, "grad_norm": 3.810415267944336, "learning_rate": 5e-06, "loss": 1.279, "mean_token_accuracy": 0.670023150742054, "num_tokens": 14629048.0, "step": 1124 }, { "epoch": 0.72, "grad_norm": 4.179751396179199, "learning_rate": 5e-06, "loss": 1.0399, "mean_token_accuracy": 0.7168305143713951, "num_tokens": 14639669.0, "step": 1125 }, { "epoch": 0.72064, "grad_norm": 3.539612054824829, "learning_rate": 5e-06, "loss": 1.3306, "mean_token_accuracy": 0.6881062537431717, "num_tokens": 14652435.0, "step": 1126 }, { "epoch": 0.72128, "grad_norm": 3.597693681716919, "learning_rate": 5e-06, "loss": 1.3782, "mean_token_accuracy": 0.6435956582427025, "num_tokens": 14664804.0, "step": 1127 }, { "epoch": 0.72192, "grad_norm": 3.3020715713500977, "learning_rate": 5e-06, "loss": 1.3158, "mean_token_accuracy": 0.6422711089253426, "num_tokens": 14679481.0, "step": 1128 }, { "epoch": 0.72256, "grad_norm": 3.4006054401397705, "learning_rate": 5e-06, "loss": 1.4268, "mean_token_accuracy": 0.6340702697634697, "num_tokens": 14693009.0, "step": 1129 }, { "epoch": 0.7232, "grad_norm": 3.6534066200256348, "learning_rate": 5e-06, "loss": 1.3942, "mean_token_accuracy": 0.6490144804120064, "num_tokens": 14704415.0, "step": 1130 }, { "epoch": 0.72384, "grad_norm": 4.022477149963379, "learning_rate": 5e-06, "loss": 1.4425, "mean_token_accuracy": 0.6391843035817146, "num_tokens": 14718972.0, "step": 1131 }, { "epoch": 0.72448, "grad_norm": 3.717512369155884, "learning_rate": 5e-06, "loss": 1.337, "mean_token_accuracy": 0.6694408059120178, "num_tokens": 14731864.0, "step": 1132 }, { "epoch": 0.72512, "grad_norm": 3.640937566757202, "learning_rate": 5e-06, "loss": 1.2358, "mean_token_accuracy": 0.6641776859760284, "num_tokens": 14743467.0, "step": 1133 }, { "epoch": 0.72576, "grad_norm": 3.5870702266693115, "learning_rate": 5e-06, "loss": 1.2264, "mean_token_accuracy": 0.6706337183713913, "num_tokens": 14755081.0, "step": 1134 }, { "epoch": 0.7264, "grad_norm": 3.6272132396698, "learning_rate": 5e-06, "loss": 1.3878, "mean_token_accuracy": 0.6861530616879463, "num_tokens": 14766730.0, "step": 1135 }, { "epoch": 0.72704, "grad_norm": 3.349130392074585, "learning_rate": 5e-06, "loss": 1.428, "mean_token_accuracy": 0.6495333984494209, "num_tokens": 14780158.0, "step": 1136 }, { "epoch": 0.72768, "grad_norm": 3.8108246326446533, "learning_rate": 5e-06, "loss": 1.2034, "mean_token_accuracy": 0.6748805195093155, "num_tokens": 14793884.0, "step": 1137 }, { "epoch": 0.72832, "grad_norm": 3.4483556747436523, "learning_rate": 5e-06, "loss": 1.3949, "mean_token_accuracy": 0.6391731649637222, "num_tokens": 14806967.0, "step": 1138 }, { "epoch": 0.72896, "grad_norm": 3.3666470050811768, "learning_rate": 5e-06, "loss": 1.2185, "mean_token_accuracy": 0.6691764742136002, "num_tokens": 14820303.0, "step": 1139 }, { "epoch": 0.7296, "grad_norm": 3.32536244392395, "learning_rate": 5e-06, "loss": 1.2849, "mean_token_accuracy": 0.6658271849155426, "num_tokens": 14834025.0, "step": 1140 }, { "epoch": 0.73024, "grad_norm": 3.825983762741089, "learning_rate": 5e-06, "loss": 1.6181, "mean_token_accuracy": 0.602071076631546, "num_tokens": 14847478.0, "step": 1141 }, { "epoch": 0.73088, "grad_norm": 4.397375106811523, "learning_rate": 5e-06, "loss": 1.3697, "mean_token_accuracy": 0.6505918800830841, "num_tokens": 14859179.0, "step": 1142 }, { "epoch": 0.73152, "grad_norm": 4.159323215484619, "learning_rate": 5e-06, "loss": 1.1297, "mean_token_accuracy": 0.7010925114154816, "num_tokens": 14869522.0, "step": 1143 }, { "epoch": 0.73216, "grad_norm": 3.4876530170440674, "learning_rate": 5e-06, "loss": 1.245, "mean_token_accuracy": 0.652951605618, "num_tokens": 14883372.0, "step": 1144 }, { "epoch": 0.7328, "grad_norm": 3.0746846199035645, "learning_rate": 5e-06, "loss": 1.5279, "mean_token_accuracy": 0.6031446754932404, "num_tokens": 14899033.0, "step": 1145 }, { "epoch": 0.73344, "grad_norm": 3.7521297931671143, "learning_rate": 5e-06, "loss": 1.186, "mean_token_accuracy": 0.6688085421919823, "num_tokens": 14913040.0, "step": 1146 }, { "epoch": 0.73408, "grad_norm": 3.9737706184387207, "learning_rate": 5e-06, "loss": 1.314, "mean_token_accuracy": 0.6808184832334518, "num_tokens": 14927324.0, "step": 1147 }, { "epoch": 0.73472, "grad_norm": 3.6961631774902344, "learning_rate": 5e-06, "loss": 1.264, "mean_token_accuracy": 0.6656776443123817, "num_tokens": 14938251.0, "step": 1148 }, { "epoch": 0.73536, "grad_norm": 4.080604553222656, "learning_rate": 5e-06, "loss": 1.4443, "mean_token_accuracy": 0.6355870217084885, "num_tokens": 14948864.0, "step": 1149 }, { "epoch": 0.736, "grad_norm": 3.284268617630005, "learning_rate": 5e-06, "loss": 1.1416, "mean_token_accuracy": 0.6982963308691978, "num_tokens": 14962968.0, "step": 1150 }, { "epoch": 0.73664, "grad_norm": 3.623760223388672, "learning_rate": 5e-06, "loss": 1.206, "mean_token_accuracy": 0.6715554222464561, "num_tokens": 14974492.0, "step": 1151 }, { "epoch": 0.73728, "grad_norm": 3.6222383975982666, "learning_rate": 5e-06, "loss": 1.2002, "mean_token_accuracy": 0.665322557091713, "num_tokens": 14987080.0, "step": 1152 }, { "epoch": 0.73792, "grad_norm": 4.134393692016602, "learning_rate": 5e-06, "loss": 1.3446, "mean_token_accuracy": 0.6732967086136341, "num_tokens": 14997376.0, "step": 1153 }, { "epoch": 0.73856, "grad_norm": 3.1004269123077393, "learning_rate": 5e-06, "loss": 1.1475, "mean_token_accuracy": 0.6673456728458405, "num_tokens": 15011696.0, "step": 1154 }, { "epoch": 0.7392, "grad_norm": 3.437642812728882, "learning_rate": 5e-06, "loss": 1.3756, "mean_token_accuracy": 0.6303885355591774, "num_tokens": 15026552.0, "step": 1155 }, { "epoch": 0.73984, "grad_norm": 8.039863586425781, "learning_rate": 5e-06, "loss": 1.2034, "mean_token_accuracy": 0.7016506418585777, "num_tokens": 15038554.0, "step": 1156 }, { "epoch": 0.74048, "grad_norm": 3.248920440673828, "learning_rate": 5e-06, "loss": 1.2926, "mean_token_accuracy": 0.6631775796413422, "num_tokens": 15054408.0, "step": 1157 }, { "epoch": 0.74112, "grad_norm": 3.959541082382202, "learning_rate": 5e-06, "loss": 1.45, "mean_token_accuracy": 0.6382048651576042, "num_tokens": 15065684.0, "step": 1158 }, { "epoch": 0.74176, "grad_norm": 4.347902297973633, "learning_rate": 5e-06, "loss": 1.0298, "mean_token_accuracy": 0.7234738394618034, "num_tokens": 15077041.0, "step": 1159 }, { "epoch": 0.7424, "grad_norm": 4.224346160888672, "learning_rate": 5e-06, "loss": 1.272, "mean_token_accuracy": 0.6782936900854111, "num_tokens": 15088347.0, "step": 1160 }, { "epoch": 0.74304, "grad_norm": 3.770258903503418, "learning_rate": 5e-06, "loss": 1.3465, "mean_token_accuracy": 0.661977045238018, "num_tokens": 15101460.0, "step": 1161 }, { "epoch": 0.74368, "grad_norm": 3.7153191566467285, "learning_rate": 5e-06, "loss": 1.3216, "mean_token_accuracy": 0.6487752310931683, "num_tokens": 15115486.0, "step": 1162 }, { "epoch": 0.74432, "grad_norm": 4.508492469787598, "learning_rate": 5e-06, "loss": 1.2035, "mean_token_accuracy": 0.6841593757271767, "num_tokens": 15125039.0, "step": 1163 }, { "epoch": 0.74496, "grad_norm": 3.0245108604431152, "learning_rate": 5e-06, "loss": 1.3148, "mean_token_accuracy": 0.6497530564665794, "num_tokens": 15142007.0, "step": 1164 }, { "epoch": 0.7456, "grad_norm": 3.7130560874938965, "learning_rate": 5e-06, "loss": 1.365, "mean_token_accuracy": 0.6439146772027016, "num_tokens": 15154070.0, "step": 1165 }, { "epoch": 0.74624, "grad_norm": 4.014090538024902, "learning_rate": 5e-06, "loss": 1.5601, "mean_token_accuracy": 0.6075103767216206, "num_tokens": 15165816.0, "step": 1166 }, { "epoch": 0.74688, "grad_norm": 3.5442097187042236, "learning_rate": 5e-06, "loss": 1.2896, "mean_token_accuracy": 0.6592826843261719, "num_tokens": 15180213.0, "step": 1167 }, { "epoch": 0.74752, "grad_norm": 3.3585143089294434, "learning_rate": 5e-06, "loss": 1.3492, "mean_token_accuracy": 0.650349847972393, "num_tokens": 15195358.0, "step": 1168 }, { "epoch": 0.74816, "grad_norm": 3.3249661922454834, "learning_rate": 5e-06, "loss": 1.2987, "mean_token_accuracy": 0.6541887670755386, "num_tokens": 15208187.0, "step": 1169 }, { "epoch": 0.7488, "grad_norm": 3.2732949256896973, "learning_rate": 5e-06, "loss": 1.4745, "mean_token_accuracy": 0.6346693634986877, "num_tokens": 15224302.0, "step": 1170 }, { "epoch": 0.74944, "grad_norm": 3.717664957046509, "learning_rate": 5e-06, "loss": 1.2258, "mean_token_accuracy": 0.6592052280902863, "num_tokens": 15235541.0, "step": 1171 }, { "epoch": 0.75008, "grad_norm": 3.3119561672210693, "learning_rate": 5e-06, "loss": 1.2281, "mean_token_accuracy": 0.6707274541258812, "num_tokens": 15249155.0, "step": 1172 }, { "epoch": 0.75072, "grad_norm": 3.4180824756622314, "learning_rate": 5e-06, "loss": 1.2323, "mean_token_accuracy": 0.6759630665183067, "num_tokens": 15262745.0, "step": 1173 }, { "epoch": 0.75136, "grad_norm": 3.351557970046997, "learning_rate": 5e-06, "loss": 1.2131, "mean_token_accuracy": 0.6812815740704536, "num_tokens": 15276413.0, "step": 1174 }, { "epoch": 0.752, "grad_norm": 4.228631973266602, "learning_rate": 5e-06, "loss": 1.1811, "mean_token_accuracy": 0.694750115275383, "num_tokens": 15287419.0, "step": 1175 }, { "epoch": 0.75264, "grad_norm": 3.346228837966919, "learning_rate": 5e-06, "loss": 1.4694, "mean_token_accuracy": 0.6221407428383827, "num_tokens": 15304245.0, "step": 1176 }, { "epoch": 0.75328, "grad_norm": 3.899305582046509, "learning_rate": 5e-06, "loss": 1.3843, "mean_token_accuracy": 0.6404564082622528, "num_tokens": 15315843.0, "step": 1177 }, { "epoch": 0.75392, "grad_norm": 3.3452677726745605, "learning_rate": 5e-06, "loss": 1.4037, "mean_token_accuracy": 0.6525731533765793, "num_tokens": 15330084.0, "step": 1178 }, { "epoch": 0.75456, "grad_norm": 3.4091222286224365, "learning_rate": 5e-06, "loss": 1.4005, "mean_token_accuracy": 0.6623276621103287, "num_tokens": 15342406.0, "step": 1179 }, { "epoch": 0.7552, "grad_norm": 3.5373282432556152, "learning_rate": 5e-06, "loss": 1.1613, "mean_token_accuracy": 0.6908905506134033, "num_tokens": 15355306.0, "step": 1180 }, { "epoch": 0.75584, "grad_norm": 3.9077682495117188, "learning_rate": 5e-06, "loss": 1.2934, "mean_token_accuracy": 0.6559719070792198, "num_tokens": 15365562.0, "step": 1181 }, { "epoch": 0.75648, "grad_norm": 4.251070022583008, "learning_rate": 5e-06, "loss": 1.1809, "mean_token_accuracy": 0.7090832963585854, "num_tokens": 15377910.0, "step": 1182 }, { "epoch": 0.75712, "grad_norm": 3.6916239261627197, "learning_rate": 5e-06, "loss": 1.1891, "mean_token_accuracy": 0.6642716750502586, "num_tokens": 15390158.0, "step": 1183 }, { "epoch": 0.75776, "grad_norm": 3.235966682434082, "learning_rate": 5e-06, "loss": 1.3514, "mean_token_accuracy": 0.654154047369957, "num_tokens": 15405763.0, "step": 1184 }, { "epoch": 0.7584, "grad_norm": 3.0988378524780273, "learning_rate": 5e-06, "loss": 1.3606, "mean_token_accuracy": 0.6405491232872009, "num_tokens": 15421326.0, "step": 1185 }, { "epoch": 0.75904, "grad_norm": 3.5612781047821045, "learning_rate": 5e-06, "loss": 1.3463, "mean_token_accuracy": 0.6501871645450592, "num_tokens": 15434530.0, "step": 1186 }, { "epoch": 0.75968, "grad_norm": 3.6004257202148438, "learning_rate": 5e-06, "loss": 1.346, "mean_token_accuracy": 0.6661404147744179, "num_tokens": 15448462.0, "step": 1187 }, { "epoch": 0.76032, "grad_norm": 4.093327045440674, "learning_rate": 5e-06, "loss": 1.257, "mean_token_accuracy": 0.6833517551422119, "num_tokens": 15460521.0, "step": 1188 }, { "epoch": 0.76096, "grad_norm": 3.7774133682250977, "learning_rate": 5e-06, "loss": 1.4023, "mean_token_accuracy": 0.6524300873279572, "num_tokens": 15472815.0, "step": 1189 }, { "epoch": 0.7616, "grad_norm": 3.2685515880584717, "learning_rate": 5e-06, "loss": 1.1868, "mean_token_accuracy": 0.6791387870907784, "num_tokens": 15488314.0, "step": 1190 }, { "epoch": 0.76224, "grad_norm": 3.4335551261901855, "learning_rate": 5e-06, "loss": 1.4461, "mean_token_accuracy": 0.6327428966760635, "num_tokens": 15502345.0, "step": 1191 }, { "epoch": 0.76288, "grad_norm": 3.3318638801574707, "learning_rate": 5e-06, "loss": 1.4262, "mean_token_accuracy": 0.632860004901886, "num_tokens": 15518148.0, "step": 1192 }, { "epoch": 0.76352, "grad_norm": 3.1482911109924316, "learning_rate": 5e-06, "loss": 1.2723, "mean_token_accuracy": 0.696734681725502, "num_tokens": 15532567.0, "step": 1193 }, { "epoch": 0.76416, "grad_norm": 4.470282554626465, "learning_rate": 5e-06, "loss": 1.4906, "mean_token_accuracy": 0.6341715455055237, "num_tokens": 15544296.0, "step": 1194 }, { "epoch": 0.7648, "grad_norm": 3.548245429992676, "learning_rate": 5e-06, "loss": 1.4285, "mean_token_accuracy": 0.628907784819603, "num_tokens": 15556178.0, "step": 1195 }, { "epoch": 0.76544, "grad_norm": 3.0455758571624756, "learning_rate": 5e-06, "loss": 1.2669, "mean_token_accuracy": 0.6598797962069511, "num_tokens": 15570402.0, "step": 1196 }, { "epoch": 0.76608, "grad_norm": 3.394630193710327, "learning_rate": 5e-06, "loss": 1.3457, "mean_token_accuracy": 0.6525625661015511, "num_tokens": 15583933.0, "step": 1197 }, { "epoch": 0.76672, "grad_norm": 3.572402238845825, "learning_rate": 5e-06, "loss": 1.1829, "mean_token_accuracy": 0.6823357492685318, "num_tokens": 15596838.0, "step": 1198 }, { "epoch": 0.76736, "grad_norm": 4.091769695281982, "learning_rate": 5e-06, "loss": 1.3601, "mean_token_accuracy": 0.6605678722262383, "num_tokens": 15609200.0, "step": 1199 }, { "epoch": 0.768, "grad_norm": 3.402550220489502, "learning_rate": 5e-06, "loss": 1.2345, "mean_token_accuracy": 0.6593906283378601, "num_tokens": 15623053.0, "step": 1200 }, { "epoch": 0.76864, "grad_norm": 3.7215263843536377, "learning_rate": 5e-06, "loss": 1.3201, "mean_token_accuracy": 0.6674540042877197, "num_tokens": 15635540.0, "step": 1201 }, { "epoch": 0.76928, "grad_norm": 3.5162336826324463, "learning_rate": 5e-06, "loss": 1.3041, "mean_token_accuracy": 0.662396639585495, "num_tokens": 15648152.0, "step": 1202 }, { "epoch": 0.76992, "grad_norm": 3.8758740425109863, "learning_rate": 5e-06, "loss": 1.2048, "mean_token_accuracy": 0.6618586331605911, "num_tokens": 15659825.0, "step": 1203 }, { "epoch": 0.77056, "grad_norm": 3.6302740573883057, "learning_rate": 5e-06, "loss": 1.463, "mean_token_accuracy": 0.6243670582771301, "num_tokens": 15675357.0, "step": 1204 }, { "epoch": 0.7712, "grad_norm": 3.250278949737549, "learning_rate": 5e-06, "loss": 1.3677, "mean_token_accuracy": 0.6358913704752922, "num_tokens": 15690114.0, "step": 1205 }, { "epoch": 0.77184, "grad_norm": 3.5102968215942383, "learning_rate": 5e-06, "loss": 1.1182, "mean_token_accuracy": 0.6968755125999451, "num_tokens": 15704497.0, "step": 1206 }, { "epoch": 0.77248, "grad_norm": 3.386099100112915, "learning_rate": 5e-06, "loss": 1.1454, "mean_token_accuracy": 0.6858478710055351, "num_tokens": 15718228.0, "step": 1207 }, { "epoch": 0.77312, "grad_norm": 3.6120481491088867, "learning_rate": 5e-06, "loss": 1.3312, "mean_token_accuracy": 0.669069878757, "num_tokens": 15731116.0, "step": 1208 }, { "epoch": 0.77376, "grad_norm": 3.7133243083953857, "learning_rate": 5e-06, "loss": 1.5032, "mean_token_accuracy": 0.6273391470313072, "num_tokens": 15743678.0, "step": 1209 }, { "epoch": 0.7744, "grad_norm": 3.4095213413238525, "learning_rate": 5e-06, "loss": 1.3583, "mean_token_accuracy": 0.6577341482043266, "num_tokens": 15757250.0, "step": 1210 }, { "epoch": 0.77504, "grad_norm": 4.357828140258789, "learning_rate": 5e-06, "loss": 1.4194, "mean_token_accuracy": 0.6646361202001572, "num_tokens": 15767568.0, "step": 1211 }, { "epoch": 0.77568, "grad_norm": 3.3669044971466064, "learning_rate": 5e-06, "loss": 1.3806, "mean_token_accuracy": 0.6653162240982056, "num_tokens": 15781457.0, "step": 1212 }, { "epoch": 0.77632, "grad_norm": 3.057096004486084, "learning_rate": 5e-06, "loss": 1.2735, "mean_token_accuracy": 0.6667918264865875, "num_tokens": 15796444.0, "step": 1213 }, { "epoch": 0.77696, "grad_norm": 3.549315929412842, "learning_rate": 5e-06, "loss": 1.255, "mean_token_accuracy": 0.668325200676918, "num_tokens": 15807909.0, "step": 1214 }, { "epoch": 0.7776, "grad_norm": 4.293363571166992, "learning_rate": 5e-06, "loss": 1.2011, "mean_token_accuracy": 0.7006052732467651, "num_tokens": 15818410.0, "step": 1215 }, { "epoch": 0.77824, "grad_norm": 3.4453113079071045, "learning_rate": 5e-06, "loss": 1.4502, "mean_token_accuracy": 0.6456183791160583, "num_tokens": 15830410.0, "step": 1216 }, { "epoch": 0.77888, "grad_norm": 3.340660572052002, "learning_rate": 5e-06, "loss": 1.3797, "mean_token_accuracy": 0.6471363380551338, "num_tokens": 15843270.0, "step": 1217 }, { "epoch": 0.77952, "grad_norm": 3.578989267349243, "learning_rate": 5e-06, "loss": 1.3165, "mean_token_accuracy": 0.6591121554374695, "num_tokens": 15856513.0, "step": 1218 }, { "epoch": 0.78016, "grad_norm": 3.311697483062744, "learning_rate": 5e-06, "loss": 1.3268, "mean_token_accuracy": 0.6524678990244865, "num_tokens": 15869453.0, "step": 1219 }, { "epoch": 0.7808, "grad_norm": 3.2292022705078125, "learning_rate": 5e-06, "loss": 1.2939, "mean_token_accuracy": 0.6603180393576622, "num_tokens": 15884284.0, "step": 1220 }, { "epoch": 0.78144, "grad_norm": 3.189804792404175, "learning_rate": 5e-06, "loss": 1.3883, "mean_token_accuracy": 0.6416840329766273, "num_tokens": 15898664.0, "step": 1221 }, { "epoch": 0.78208, "grad_norm": 3.1236817836761475, "learning_rate": 5e-06, "loss": 1.3237, "mean_token_accuracy": 0.6408574059605598, "num_tokens": 15913125.0, "step": 1222 }, { "epoch": 0.78272, "grad_norm": 4.161830902099609, "learning_rate": 5e-06, "loss": 1.4025, "mean_token_accuracy": 0.630896121263504, "num_tokens": 15925782.0, "step": 1223 }, { "epoch": 0.78336, "grad_norm": 3.626995086669922, "learning_rate": 5e-06, "loss": 1.087, "mean_token_accuracy": 0.6901156529784203, "num_tokens": 15936307.0, "step": 1224 }, { "epoch": 0.784, "grad_norm": 3.5811476707458496, "learning_rate": 5e-06, "loss": 1.1955, "mean_token_accuracy": 0.6784915700554848, "num_tokens": 15947469.0, "step": 1225 }, { "epoch": 0.78464, "grad_norm": 3.4900920391082764, "learning_rate": 5e-06, "loss": 1.3206, "mean_token_accuracy": 0.6642494723200798, "num_tokens": 15961802.0, "step": 1226 }, { "epoch": 0.78528, "grad_norm": 3.295171022415161, "learning_rate": 5e-06, "loss": 1.366, "mean_token_accuracy": 0.6388103812932968, "num_tokens": 15973542.0, "step": 1227 }, { "epoch": 0.78592, "grad_norm": 3.179863214492798, "learning_rate": 5e-06, "loss": 1.4789, "mean_token_accuracy": 0.6259790062904358, "num_tokens": 15987918.0, "step": 1228 }, { "epoch": 0.78656, "grad_norm": 3.5669660568237305, "learning_rate": 5e-06, "loss": 1.3716, "mean_token_accuracy": 0.6410401687026024, "num_tokens": 16000309.0, "step": 1229 }, { "epoch": 0.7872, "grad_norm": 3.2992517948150635, "learning_rate": 5e-06, "loss": 1.3798, "mean_token_accuracy": 0.6566397473216057, "num_tokens": 16014417.0, "step": 1230 }, { "epoch": 0.78784, "grad_norm": 3.6735100746154785, "learning_rate": 5e-06, "loss": 1.2311, "mean_token_accuracy": 0.6738264411687851, "num_tokens": 16026169.0, "step": 1231 }, { "epoch": 0.78848, "grad_norm": 4.013977527618408, "learning_rate": 5e-06, "loss": 1.3946, "mean_token_accuracy": 0.6297749131917953, "num_tokens": 16036396.0, "step": 1232 }, { "epoch": 0.78912, "grad_norm": 3.506371259689331, "learning_rate": 5e-06, "loss": 1.3798, "mean_token_accuracy": 0.6475069150328636, "num_tokens": 16049573.0, "step": 1233 }, { "epoch": 0.78976, "grad_norm": 3.0766477584838867, "learning_rate": 5e-06, "loss": 1.4281, "mean_token_accuracy": 0.6643402278423309, "num_tokens": 16064639.0, "step": 1234 }, { "epoch": 0.7904, "grad_norm": 3.5113558769226074, "learning_rate": 5e-06, "loss": 1.1854, "mean_token_accuracy": 0.6849528402090073, "num_tokens": 16078167.0, "step": 1235 }, { "epoch": 0.79104, "grad_norm": 3.223271369934082, "learning_rate": 5e-06, "loss": 1.3133, "mean_token_accuracy": 0.6554304733872414, "num_tokens": 16093693.0, "step": 1236 }, { "epoch": 0.79168, "grad_norm": 3.661078691482544, "learning_rate": 5e-06, "loss": 1.2121, "mean_token_accuracy": 0.7052098885178566, "num_tokens": 16105008.0, "step": 1237 }, { "epoch": 0.79232, "grad_norm": 3.4575560092926025, "learning_rate": 5e-06, "loss": 1.3498, "mean_token_accuracy": 0.6544990688562393, "num_tokens": 16117846.0, "step": 1238 }, { "epoch": 0.79296, "grad_norm": 3.559100866317749, "learning_rate": 5e-06, "loss": 1.4116, "mean_token_accuracy": 0.6313577368855476, "num_tokens": 16130981.0, "step": 1239 }, { "epoch": 0.7936, "grad_norm": 3.2983896732330322, "learning_rate": 5e-06, "loss": 1.3647, "mean_token_accuracy": 0.6640519946813583, "num_tokens": 16144847.0, "step": 1240 }, { "epoch": 0.79424, "grad_norm": 3.622084856033325, "learning_rate": 5e-06, "loss": 1.292, "mean_token_accuracy": 0.6699836328625679, "num_tokens": 16156779.0, "step": 1241 }, { "epoch": 0.79488, "grad_norm": 4.421840190887451, "learning_rate": 5e-06, "loss": 1.2792, "mean_token_accuracy": 0.673715990036726, "num_tokens": 16166792.0, "step": 1242 }, { "epoch": 0.79552, "grad_norm": 3.312913656234741, "learning_rate": 5e-06, "loss": 1.4633, "mean_token_accuracy": 0.6244674026966095, "num_tokens": 16181336.0, "step": 1243 }, { "epoch": 0.79616, "grad_norm": 3.5397815704345703, "learning_rate": 5e-06, "loss": 1.4338, "mean_token_accuracy": 0.6298167407512665, "num_tokens": 16194684.0, "step": 1244 }, { "epoch": 0.7968, "grad_norm": 3.798386335372925, "learning_rate": 5e-06, "loss": 1.3008, "mean_token_accuracy": 0.6577273011207581, "num_tokens": 16206010.0, "step": 1245 }, { "epoch": 0.79744, "grad_norm": 3.379908561706543, "learning_rate": 5e-06, "loss": 1.0665, "mean_token_accuracy": 0.6839143261313438, "num_tokens": 16220119.0, "step": 1246 }, { "epoch": 0.79808, "grad_norm": 3.7385215759277344, "learning_rate": 5e-06, "loss": 1.2292, "mean_token_accuracy": 0.6770885214209557, "num_tokens": 16230676.0, "step": 1247 }, { "epoch": 0.79872, "grad_norm": 3.6756489276885986, "learning_rate": 5e-06, "loss": 1.2106, "mean_token_accuracy": 0.6802457198500633, "num_tokens": 16243393.0, "step": 1248 }, { "epoch": 0.79936, "grad_norm": 3.861645221710205, "learning_rate": 5e-06, "loss": 1.2558, "mean_token_accuracy": 0.6762053146958351, "num_tokens": 16254097.0, "step": 1249 }, { "epoch": 0.8, "grad_norm": 3.3169620037078857, "learning_rate": 5e-06, "loss": 1.2539, "mean_token_accuracy": 0.6607455164194107, "num_tokens": 16268379.0, "step": 1250 }, { "epoch": 0.80064, "grad_norm": 3.2894480228424072, "learning_rate": 5e-06, "loss": 1.3166, "mean_token_accuracy": 0.6549070253968239, "num_tokens": 16283564.0, "step": 1251 }, { "epoch": 0.80128, "grad_norm": 3.8048436641693115, "learning_rate": 5e-06, "loss": 1.424, "mean_token_accuracy": 0.6509182900190353, "num_tokens": 16295386.0, "step": 1252 }, { "epoch": 0.80192, "grad_norm": 3.7577552795410156, "learning_rate": 5e-06, "loss": 1.1979, "mean_token_accuracy": 0.6759278625249863, "num_tokens": 16309119.0, "step": 1253 }, { "epoch": 0.80256, "grad_norm": 3.8013439178466797, "learning_rate": 5e-06, "loss": 1.3405, "mean_token_accuracy": 0.668271005153656, "num_tokens": 16320088.0, "step": 1254 }, { "epoch": 0.8032, "grad_norm": 3.75661039352417, "learning_rate": 5e-06, "loss": 1.2675, "mean_token_accuracy": 0.6828467771410942, "num_tokens": 16332206.0, "step": 1255 }, { "epoch": 0.80384, "grad_norm": 4.377762794494629, "learning_rate": 5e-06, "loss": 1.49, "mean_token_accuracy": 0.6417308822274208, "num_tokens": 16344154.0, "step": 1256 }, { "epoch": 0.80448, "grad_norm": 3.524298906326294, "learning_rate": 5e-06, "loss": 1.2527, "mean_token_accuracy": 0.6651709750294685, "num_tokens": 16357771.0, "step": 1257 }, { "epoch": 0.80512, "grad_norm": 3.6572201251983643, "learning_rate": 5e-06, "loss": 1.3117, "mean_token_accuracy": 0.6568779051303864, "num_tokens": 16369715.0, "step": 1258 }, { "epoch": 0.80576, "grad_norm": 3.557985305786133, "learning_rate": 5e-06, "loss": 1.178, "mean_token_accuracy": 0.688338540494442, "num_tokens": 16381110.0, "step": 1259 }, { "epoch": 0.8064, "grad_norm": 3.9126033782958984, "learning_rate": 5e-06, "loss": 1.5385, "mean_token_accuracy": 0.6207233294844627, "num_tokens": 16393612.0, "step": 1260 }, { "epoch": 0.80704, "grad_norm": 3.5483007431030273, "learning_rate": 5e-06, "loss": 1.1773, "mean_token_accuracy": 0.6948479861021042, "num_tokens": 16406331.0, "step": 1261 }, { "epoch": 0.80768, "grad_norm": 3.6159143447875977, "learning_rate": 5e-06, "loss": 1.3108, "mean_token_accuracy": 0.6626102104783058, "num_tokens": 16418794.0, "step": 1262 }, { "epoch": 0.80832, "grad_norm": 3.201352834701538, "learning_rate": 5e-06, "loss": 1.346, "mean_token_accuracy": 0.6304316557943821, "num_tokens": 16429613.0, "step": 1263 }, { "epoch": 0.80896, "grad_norm": 3.9572861194610596, "learning_rate": 5e-06, "loss": 1.0437, "mean_token_accuracy": 0.7080657631158829, "num_tokens": 16440987.0, "step": 1264 }, { "epoch": 0.8096, "grad_norm": 3.182184934616089, "learning_rate": 5e-06, "loss": 1.356, "mean_token_accuracy": 0.6579003632068634, "num_tokens": 16455570.0, "step": 1265 }, { "epoch": 0.81024, "grad_norm": 3.835308313369751, "learning_rate": 5e-06, "loss": 1.4172, "mean_token_accuracy": 0.6239579617977142, "num_tokens": 16468080.0, "step": 1266 }, { "epoch": 0.81088, "grad_norm": 3.3559696674346924, "learning_rate": 5e-06, "loss": 1.1735, "mean_token_accuracy": 0.6860703229904175, "num_tokens": 16481207.0, "step": 1267 }, { "epoch": 0.81152, "grad_norm": 3.19657039642334, "learning_rate": 5e-06, "loss": 1.3224, "mean_token_accuracy": 0.6673007681965828, "num_tokens": 16495219.0, "step": 1268 }, { "epoch": 0.81216, "grad_norm": 3.2514398097991943, "learning_rate": 5e-06, "loss": 1.4182, "mean_token_accuracy": 0.6229546442627907, "num_tokens": 16511488.0, "step": 1269 }, { "epoch": 0.8128, "grad_norm": 2.9578235149383545, "learning_rate": 5e-06, "loss": 1.0946, "mean_token_accuracy": 0.6994348987936974, "num_tokens": 16527874.0, "step": 1270 }, { "epoch": 0.81344, "grad_norm": 3.202214479446411, "learning_rate": 5e-06, "loss": 1.4316, "mean_token_accuracy": 0.625508576631546, "num_tokens": 16542263.0, "step": 1271 }, { "epoch": 0.81408, "grad_norm": 3.9414408206939697, "learning_rate": 5e-06, "loss": 1.2243, "mean_token_accuracy": 0.6666690483689308, "num_tokens": 16554216.0, "step": 1272 }, { "epoch": 0.81472, "grad_norm": 3.792768955230713, "learning_rate": 5e-06, "loss": 1.2563, "mean_token_accuracy": 0.6469622924923897, "num_tokens": 16566251.0, "step": 1273 }, { "epoch": 0.81536, "grad_norm": 3.4059951305389404, "learning_rate": 5e-06, "loss": 1.3999, "mean_token_accuracy": 0.6422073394060135, "num_tokens": 16579765.0, "step": 1274 }, { "epoch": 0.816, "grad_norm": 4.562513828277588, "learning_rate": 5e-06, "loss": 1.2946, "mean_token_accuracy": 0.6693150997161865, "num_tokens": 16589352.0, "step": 1275 }, { "epoch": 0.81664, "grad_norm": 4.269272327423096, "learning_rate": 5e-06, "loss": 1.2899, "mean_token_accuracy": 0.6878708451986313, "num_tokens": 16598911.0, "step": 1276 }, { "epoch": 0.81728, "grad_norm": 3.5766615867614746, "learning_rate": 5e-06, "loss": 1.3643, "mean_token_accuracy": 0.6596589758992195, "num_tokens": 16612073.0, "step": 1277 }, { "epoch": 0.81792, "grad_norm": 3.2693169116973877, "learning_rate": 5e-06, "loss": 1.2538, "mean_token_accuracy": 0.6582971885800362, "num_tokens": 16626673.0, "step": 1278 }, { "epoch": 0.81856, "grad_norm": 3.7346718311309814, "learning_rate": 5e-06, "loss": 1.4059, "mean_token_accuracy": 0.6605831310153008, "num_tokens": 16640222.0, "step": 1279 }, { "epoch": 0.8192, "grad_norm": 3.571347951889038, "learning_rate": 5e-06, "loss": 1.378, "mean_token_accuracy": 0.6396291702985764, "num_tokens": 16652331.0, "step": 1280 }, { "epoch": 0.81984, "grad_norm": 3.3202948570251465, "learning_rate": 5e-06, "loss": 1.3002, "mean_token_accuracy": 0.656744010746479, "num_tokens": 16664098.0, "step": 1281 }, { "epoch": 0.82048, "grad_norm": 3.2276108264923096, "learning_rate": 5e-06, "loss": 1.1796, "mean_token_accuracy": 0.6782404407858849, "num_tokens": 16678834.0, "step": 1282 }, { "epoch": 0.82112, "grad_norm": 3.5021538734436035, "learning_rate": 5e-06, "loss": 1.1757, "mean_token_accuracy": 0.6985038220882416, "num_tokens": 16692514.0, "step": 1283 }, { "epoch": 0.82176, "grad_norm": 3.8361024856567383, "learning_rate": 5e-06, "loss": 1.3879, "mean_token_accuracy": 0.6511978656053543, "num_tokens": 16705296.0, "step": 1284 }, { "epoch": 0.8224, "grad_norm": 3.3450541496276855, "learning_rate": 5e-06, "loss": 1.4618, "mean_token_accuracy": 0.6236701160669327, "num_tokens": 16719506.0, "step": 1285 }, { "epoch": 0.82304, "grad_norm": 3.344872236251831, "learning_rate": 5e-06, "loss": 1.4341, "mean_token_accuracy": 0.6452220380306244, "num_tokens": 16733788.0, "step": 1286 }, { "epoch": 0.82368, "grad_norm": 3.2765679359436035, "learning_rate": 5e-06, "loss": 1.3945, "mean_token_accuracy": 0.6469878405332565, "num_tokens": 16746762.0, "step": 1287 }, { "epoch": 0.82432, "grad_norm": 3.3606464862823486, "learning_rate": 5e-06, "loss": 1.442, "mean_token_accuracy": 0.6300052553415298, "num_tokens": 16762035.0, "step": 1288 }, { "epoch": 0.82496, "grad_norm": 3.9703168869018555, "learning_rate": 5e-06, "loss": 1.4146, "mean_token_accuracy": 0.6354392319917679, "num_tokens": 16772696.0, "step": 1289 }, { "epoch": 0.8256, "grad_norm": 3.2966363430023193, "learning_rate": 5e-06, "loss": 1.2722, "mean_token_accuracy": 0.665034607052803, "num_tokens": 16787285.0, "step": 1290 }, { "epoch": 0.82624, "grad_norm": 3.6354568004608154, "learning_rate": 5e-06, "loss": 1.2903, "mean_token_accuracy": 0.6690637767314911, "num_tokens": 16799868.0, "step": 1291 }, { "epoch": 0.82688, "grad_norm": 3.9511008262634277, "learning_rate": 5e-06, "loss": 1.3668, "mean_token_accuracy": 0.6623431816697121, "num_tokens": 16811306.0, "step": 1292 }, { "epoch": 0.82752, "grad_norm": 3.4990999698638916, "learning_rate": 5e-06, "loss": 1.2118, "mean_token_accuracy": 0.6844175234436989, "num_tokens": 16824295.0, "step": 1293 }, { "epoch": 0.82816, "grad_norm": 3.638296604156494, "learning_rate": 5e-06, "loss": 1.1873, "mean_token_accuracy": 0.6708608791232109, "num_tokens": 16836129.0, "step": 1294 }, { "epoch": 0.8288, "grad_norm": 3.5374062061309814, "learning_rate": 5e-06, "loss": 1.3716, "mean_token_accuracy": 0.662922739982605, "num_tokens": 16849257.0, "step": 1295 }, { "epoch": 0.82944, "grad_norm": 4.183645725250244, "learning_rate": 5e-06, "loss": 1.2535, "mean_token_accuracy": 0.6610106378793716, "num_tokens": 16860223.0, "step": 1296 }, { "epoch": 0.83008, "grad_norm": 3.551673412322998, "learning_rate": 5e-06, "loss": 1.2743, "mean_token_accuracy": 0.6478192396461964, "num_tokens": 16871987.0, "step": 1297 }, { "epoch": 0.83072, "grad_norm": 3.2299296855926514, "learning_rate": 5e-06, "loss": 1.3783, "mean_token_accuracy": 0.6627595871686935, "num_tokens": 16886179.0, "step": 1298 }, { "epoch": 0.83136, "grad_norm": 3.688389301300049, "learning_rate": 5e-06, "loss": 1.0686, "mean_token_accuracy": 0.7124327570199966, "num_tokens": 16898088.0, "step": 1299 }, { "epoch": 0.832, "grad_norm": 3.371751070022583, "learning_rate": 5e-06, "loss": 1.4761, "mean_token_accuracy": 0.624034658074379, "num_tokens": 16912488.0, "step": 1300 }, { "epoch": 0.83264, "grad_norm": 3.6259591579437256, "learning_rate": 5e-06, "loss": 1.1865, "mean_token_accuracy": 0.6735802069306374, "num_tokens": 16926127.0, "step": 1301 }, { "epoch": 0.83328, "grad_norm": 3.571916103363037, "learning_rate": 5e-06, "loss": 1.5566, "mean_token_accuracy": 0.62827018648386, "num_tokens": 16939816.0, "step": 1302 }, { "epoch": 0.83392, "grad_norm": 3.3074350357055664, "learning_rate": 5e-06, "loss": 1.4043, "mean_token_accuracy": 0.639069065451622, "num_tokens": 16953696.0, "step": 1303 }, { "epoch": 0.83456, "grad_norm": 3.573622941970825, "learning_rate": 5e-06, "loss": 1.3567, "mean_token_accuracy": 0.6488766446709633, "num_tokens": 16965974.0, "step": 1304 }, { "epoch": 0.8352, "grad_norm": 3.201739549636841, "learning_rate": 5e-06, "loss": 1.2488, "mean_token_accuracy": 0.6712930873036385, "num_tokens": 16980031.0, "step": 1305 }, { "epoch": 0.83584, "grad_norm": 3.284263849258423, "learning_rate": 5e-06, "loss": 1.3163, "mean_token_accuracy": 0.6636942848563194, "num_tokens": 16993264.0, "step": 1306 }, { "epoch": 0.83648, "grad_norm": 3.39267897605896, "learning_rate": 5e-06, "loss": 1.2675, "mean_token_accuracy": 0.6717317998409271, "num_tokens": 17005939.0, "step": 1307 }, { "epoch": 0.83712, "grad_norm": 3.601962089538574, "learning_rate": 5e-06, "loss": 1.2444, "mean_token_accuracy": 0.7010955587029457, "num_tokens": 17019858.0, "step": 1308 }, { "epoch": 0.83776, "grad_norm": 4.25007438659668, "learning_rate": 5e-06, "loss": 1.2578, "mean_token_accuracy": 0.6884395852684975, "num_tokens": 17031840.0, "step": 1309 }, { "epoch": 0.8384, "grad_norm": 3.216642379760742, "learning_rate": 5e-06, "loss": 1.07, "mean_token_accuracy": 0.6811397597193718, "num_tokens": 17043624.0, "step": 1310 }, { "epoch": 0.83904, "grad_norm": 4.06812858581543, "learning_rate": 5e-06, "loss": 1.2633, "mean_token_accuracy": 0.6581188440322876, "num_tokens": 17055221.0, "step": 1311 }, { "epoch": 0.83968, "grad_norm": 4.409648418426514, "learning_rate": 5e-06, "loss": 1.4064, "mean_token_accuracy": 0.6504970565438271, "num_tokens": 17065224.0, "step": 1312 }, { "epoch": 0.84032, "grad_norm": 3.070948839187622, "learning_rate": 5e-06, "loss": 1.3761, "mean_token_accuracy": 0.6479237154126167, "num_tokens": 17078405.0, "step": 1313 }, { "epoch": 0.84096, "grad_norm": 3.568082094192505, "learning_rate": 5e-06, "loss": 1.0255, "mean_token_accuracy": 0.6967073529958725, "num_tokens": 17091614.0, "step": 1314 }, { "epoch": 0.8416, "grad_norm": 3.664025068283081, "learning_rate": 5e-06, "loss": 1.4398, "mean_token_accuracy": 0.6788545474410057, "num_tokens": 17104109.0, "step": 1315 }, { "epoch": 0.84224, "grad_norm": 3.4449939727783203, "learning_rate": 5e-06, "loss": 1.203, "mean_token_accuracy": 0.6785411536693573, "num_tokens": 17116517.0, "step": 1316 }, { "epoch": 0.84288, "grad_norm": 3.2764899730682373, "learning_rate": 5e-06, "loss": 1.2928, "mean_token_accuracy": 0.6439727321267128, "num_tokens": 17130912.0, "step": 1317 }, { "epoch": 0.84352, "grad_norm": 3.6440088748931885, "learning_rate": 5e-06, "loss": 1.076, "mean_token_accuracy": 0.7138596475124359, "num_tokens": 17143603.0, "step": 1318 }, { "epoch": 0.84416, "grad_norm": 3.7815802097320557, "learning_rate": 5e-06, "loss": 1.4247, "mean_token_accuracy": 0.6309964135289192, "num_tokens": 17156597.0, "step": 1319 }, { "epoch": 0.8448, "grad_norm": 3.145379066467285, "learning_rate": 5e-06, "loss": 1.0981, "mean_token_accuracy": 0.7020114660263062, "num_tokens": 17170210.0, "step": 1320 }, { "epoch": 0.84544, "grad_norm": 4.029253005981445, "learning_rate": 5e-06, "loss": 1.4513, "mean_token_accuracy": 0.6537614092230797, "num_tokens": 17182328.0, "step": 1321 }, { "epoch": 0.84608, "grad_norm": 3.2656235694885254, "learning_rate": 5e-06, "loss": 1.5357, "mean_token_accuracy": 0.641093410551548, "num_tokens": 17197005.0, "step": 1322 }, { "epoch": 0.84672, "grad_norm": 3.559967041015625, "learning_rate": 5e-06, "loss": 1.0718, "mean_token_accuracy": 0.7045318782329559, "num_tokens": 17208973.0, "step": 1323 }, { "epoch": 0.84736, "grad_norm": 3.366745710372925, "learning_rate": 5e-06, "loss": 1.3679, "mean_token_accuracy": 0.6683920547366142, "num_tokens": 17221909.0, "step": 1324 }, { "epoch": 0.848, "grad_norm": 3.4706954956054688, "learning_rate": 5e-06, "loss": 1.317, "mean_token_accuracy": 0.6562648341059685, "num_tokens": 17234739.0, "step": 1325 }, { "epoch": 0.84864, "grad_norm": 3.4657156467437744, "learning_rate": 5e-06, "loss": 1.4667, "mean_token_accuracy": 0.6328883245587349, "num_tokens": 17249245.0, "step": 1326 }, { "epoch": 0.84928, "grad_norm": 3.4521939754486084, "learning_rate": 5e-06, "loss": 1.4047, "mean_token_accuracy": 0.6429140567779541, "num_tokens": 17263466.0, "step": 1327 }, { "epoch": 0.84992, "grad_norm": 3.3580243587493896, "learning_rate": 5e-06, "loss": 1.4, "mean_token_accuracy": 0.6485451236367226, "num_tokens": 17277966.0, "step": 1328 }, { "epoch": 0.85056, "grad_norm": 3.6181726455688477, "learning_rate": 5e-06, "loss": 1.4906, "mean_token_accuracy": 0.6203976050019264, "num_tokens": 17290080.0, "step": 1329 }, { "epoch": 0.8512, "grad_norm": 3.0654401779174805, "learning_rate": 5e-06, "loss": 1.3708, "mean_token_accuracy": 0.6423755809664726, "num_tokens": 17307462.0, "step": 1330 }, { "epoch": 0.85184, "grad_norm": 3.682450294494629, "learning_rate": 5e-06, "loss": 1.4412, "mean_token_accuracy": 0.6563334167003632, "num_tokens": 17320760.0, "step": 1331 }, { "epoch": 0.85248, "grad_norm": 4.22981071472168, "learning_rate": 5e-06, "loss": 1.1568, "mean_token_accuracy": 0.687714472413063, "num_tokens": 17330799.0, "step": 1332 }, { "epoch": 0.85312, "grad_norm": 3.9495580196380615, "learning_rate": 5e-06, "loss": 1.385, "mean_token_accuracy": 0.643461637198925, "num_tokens": 17340826.0, "step": 1333 }, { "epoch": 0.85376, "grad_norm": 3.5318918228149414, "learning_rate": 5e-06, "loss": 1.2977, "mean_token_accuracy": 0.6616112142801285, "num_tokens": 17353805.0, "step": 1334 }, { "epoch": 0.8544, "grad_norm": 3.967776298522949, "learning_rate": 5e-06, "loss": 1.2952, "mean_token_accuracy": 0.6506235525012016, "num_tokens": 17366394.0, "step": 1335 }, { "epoch": 0.85504, "grad_norm": 3.663810968399048, "learning_rate": 5e-06, "loss": 1.148, "mean_token_accuracy": 0.6877422258257866, "num_tokens": 17377970.0, "step": 1336 }, { "epoch": 0.85568, "grad_norm": 3.229074478149414, "learning_rate": 5e-06, "loss": 1.2273, "mean_token_accuracy": 0.6863315925002098, "num_tokens": 17392020.0, "step": 1337 }, { "epoch": 0.85632, "grad_norm": 3.3477957248687744, "learning_rate": 5e-06, "loss": 1.054, "mean_token_accuracy": 0.7150156199932098, "num_tokens": 17404119.0, "step": 1338 }, { "epoch": 0.85696, "grad_norm": 3.4252710342407227, "learning_rate": 5e-06, "loss": 1.4177, "mean_token_accuracy": 0.6546554416418076, "num_tokens": 17418373.0, "step": 1339 }, { "epoch": 0.8576, "grad_norm": 3.3960907459259033, "learning_rate": 5e-06, "loss": 1.2424, "mean_token_accuracy": 0.6713818609714508, "num_tokens": 17430650.0, "step": 1340 }, { "epoch": 0.85824, "grad_norm": 3.5569021701812744, "learning_rate": 5e-06, "loss": 1.4048, "mean_token_accuracy": 0.6662162095308304, "num_tokens": 17443979.0, "step": 1341 }, { "epoch": 0.85888, "grad_norm": 3.508941650390625, "learning_rate": 5e-06, "loss": 1.552, "mean_token_accuracy": 0.6020488813519478, "num_tokens": 17458181.0, "step": 1342 }, { "epoch": 0.85952, "grad_norm": 3.9543237686157227, "learning_rate": 5e-06, "loss": 1.5179, "mean_token_accuracy": 0.6510177925229073, "num_tokens": 17469923.0, "step": 1343 }, { "epoch": 0.86016, "grad_norm": 4.113687515258789, "learning_rate": 5e-06, "loss": 1.3311, "mean_token_accuracy": 0.652983695268631, "num_tokens": 17483696.0, "step": 1344 }, { "epoch": 0.8608, "grad_norm": 3.756329298019409, "learning_rate": 5e-06, "loss": 1.2371, "mean_token_accuracy": 0.6460439562797546, "num_tokens": 17496524.0, "step": 1345 }, { "epoch": 0.86144, "grad_norm": 3.375931978225708, "learning_rate": 5e-06, "loss": 1.3934, "mean_token_accuracy": 0.6481117159128189, "num_tokens": 17510332.0, "step": 1346 }, { "epoch": 0.86208, "grad_norm": 4.059141635894775, "learning_rate": 5e-06, "loss": 1.0956, "mean_token_accuracy": 0.6906588524580002, "num_tokens": 17520886.0, "step": 1347 }, { "epoch": 0.86272, "grad_norm": 2.9917287826538086, "learning_rate": 5e-06, "loss": 1.2105, "mean_token_accuracy": 0.6634574681520462, "num_tokens": 17536190.0, "step": 1348 }, { "epoch": 0.86336, "grad_norm": 3.9010698795318604, "learning_rate": 5e-06, "loss": 1.2638, "mean_token_accuracy": 0.6710032075643539, "num_tokens": 17548427.0, "step": 1349 }, { "epoch": 0.864, "grad_norm": 3.535780668258667, "learning_rate": 5e-06, "loss": 1.2962, "mean_token_accuracy": 0.6431876122951508, "num_tokens": 17561319.0, "step": 1350 }, { "epoch": 0.86464, "grad_norm": 3.2573955059051514, "learning_rate": 5e-06, "loss": 1.233, "mean_token_accuracy": 0.6907457932829857, "num_tokens": 17573944.0, "step": 1351 }, { "epoch": 0.86528, "grad_norm": 3.478487491607666, "learning_rate": 5e-06, "loss": 1.1014, "mean_token_accuracy": 0.6931867897510529, "num_tokens": 17587912.0, "step": 1352 }, { "epoch": 0.86592, "grad_norm": 3.618330955505371, "learning_rate": 5e-06, "loss": 1.4492, "mean_token_accuracy": 0.6480180844664574, "num_tokens": 17600364.0, "step": 1353 }, { "epoch": 0.86656, "grad_norm": 3.834172248840332, "learning_rate": 5e-06, "loss": 1.4564, "mean_token_accuracy": 0.6302541047334671, "num_tokens": 17614033.0, "step": 1354 }, { "epoch": 0.8672, "grad_norm": 3.973057746887207, "learning_rate": 5e-06, "loss": 1.4296, "mean_token_accuracy": 0.6398394256830215, "num_tokens": 17626618.0, "step": 1355 }, { "epoch": 0.86784, "grad_norm": 3.6730847358703613, "learning_rate": 5e-06, "loss": 1.3343, "mean_token_accuracy": 0.6507444530725479, "num_tokens": 17638206.0, "step": 1356 }, { "epoch": 0.86848, "grad_norm": 3.6375482082366943, "learning_rate": 5e-06, "loss": 1.3747, "mean_token_accuracy": 0.6530400216579437, "num_tokens": 17650041.0, "step": 1357 }, { "epoch": 0.86912, "grad_norm": 3.4408140182495117, "learning_rate": 5e-06, "loss": 1.1361, "mean_token_accuracy": 0.6785493567585945, "num_tokens": 17661144.0, "step": 1358 }, { "epoch": 0.86976, "grad_norm": 3.449578046798706, "learning_rate": 5e-06, "loss": 1.3709, "mean_token_accuracy": 0.6438928842544556, "num_tokens": 17674539.0, "step": 1359 }, { "epoch": 0.8704, "grad_norm": 5.356245994567871, "learning_rate": 5e-06, "loss": 1.4153, "mean_token_accuracy": 0.6556727215647697, "num_tokens": 17685582.0, "step": 1360 }, { "epoch": 0.87104, "grad_norm": 3.2209205627441406, "learning_rate": 5e-06, "loss": 1.3495, "mean_token_accuracy": 0.6475684642791748, "num_tokens": 17700534.0, "step": 1361 }, { "epoch": 0.87168, "grad_norm": 4.095639705657959, "learning_rate": 5e-06, "loss": 1.3171, "mean_token_accuracy": 0.6621948033571243, "num_tokens": 17712854.0, "step": 1362 }, { "epoch": 0.87232, "grad_norm": 4.265082359313965, "learning_rate": 5e-06, "loss": 1.3075, "mean_token_accuracy": 0.6697151511907578, "num_tokens": 17723003.0, "step": 1363 }, { "epoch": 0.87296, "grad_norm": 3.368932008743286, "learning_rate": 5e-06, "loss": 1.4089, "mean_token_accuracy": 0.6478553786873817, "num_tokens": 17736730.0, "step": 1364 }, { "epoch": 0.8736, "grad_norm": 3.5103371143341064, "learning_rate": 5e-06, "loss": 1.1961, "mean_token_accuracy": 0.6800166815519333, "num_tokens": 17749195.0, "step": 1365 }, { "epoch": 0.87424, "grad_norm": 3.6628217697143555, "learning_rate": 5e-06, "loss": 1.2206, "mean_token_accuracy": 0.7043485268950462, "num_tokens": 17761716.0, "step": 1366 }, { "epoch": 0.87488, "grad_norm": 3.283897638320923, "learning_rate": 5e-06, "loss": 1.2755, "mean_token_accuracy": 0.6614806577563286, "num_tokens": 17776711.0, "step": 1367 }, { "epoch": 0.87552, "grad_norm": 4.253682613372803, "learning_rate": 5e-06, "loss": 1.1304, "mean_token_accuracy": 0.7110883370041847, "num_tokens": 17787618.0, "step": 1368 }, { "epoch": 0.87616, "grad_norm": 3.7107419967651367, "learning_rate": 5e-06, "loss": 1.353, "mean_token_accuracy": 0.6722783967852592, "num_tokens": 17798686.0, "step": 1369 }, { "epoch": 0.8768, "grad_norm": 4.0010271072387695, "learning_rate": 5e-06, "loss": 1.2605, "mean_token_accuracy": 0.660023458302021, "num_tokens": 17812008.0, "step": 1370 }, { "epoch": 0.87744, "grad_norm": 3.8963913917541504, "learning_rate": 5e-06, "loss": 1.0612, "mean_token_accuracy": 0.6989761069417, "num_tokens": 17822062.0, "step": 1371 }, { "epoch": 0.87808, "grad_norm": 3.409618854522705, "learning_rate": 5e-06, "loss": 1.5595, "mean_token_accuracy": 0.6186339408159256, "num_tokens": 17836142.0, "step": 1372 }, { "epoch": 0.87872, "grad_norm": 2.955591917037964, "learning_rate": 5e-06, "loss": 1.2973, "mean_token_accuracy": 0.6578470319509506, "num_tokens": 17850508.0, "step": 1373 }, { "epoch": 0.87936, "grad_norm": 3.400749921798706, "learning_rate": 5e-06, "loss": 1.1947, "mean_token_accuracy": 0.6701619401574135, "num_tokens": 17864240.0, "step": 1374 }, { "epoch": 0.88, "grad_norm": 3.2822978496551514, "learning_rate": 5e-06, "loss": 1.46, "mean_token_accuracy": 0.6225104928016663, "num_tokens": 17879022.0, "step": 1375 }, { "epoch": 0.88064, "grad_norm": 3.9761667251586914, "learning_rate": 5e-06, "loss": 1.1623, "mean_token_accuracy": 0.6682100668549538, "num_tokens": 17890289.0, "step": 1376 }, { "epoch": 0.88128, "grad_norm": 3.6653897762298584, "learning_rate": 5e-06, "loss": 1.3524, "mean_token_accuracy": 0.6353943534195423, "num_tokens": 17903080.0, "step": 1377 }, { "epoch": 0.88192, "grad_norm": 4.603322505950928, "learning_rate": 5e-06, "loss": 1.6278, "mean_token_accuracy": 0.6351469904184341, "num_tokens": 17912567.0, "step": 1378 }, { "epoch": 0.88256, "grad_norm": 3.411752700805664, "learning_rate": 5e-06, "loss": 1.2195, "mean_token_accuracy": 0.675203487277031, "num_tokens": 17927030.0, "step": 1379 }, { "epoch": 0.8832, "grad_norm": 4.03117036819458, "learning_rate": 5e-06, "loss": 1.1379, "mean_token_accuracy": 0.6800655201077461, "num_tokens": 17936846.0, "step": 1380 }, { "epoch": 0.88384, "grad_norm": 3.4626095294952393, "learning_rate": 5e-06, "loss": 1.2256, "mean_token_accuracy": 0.6742624565958977, "num_tokens": 17949176.0, "step": 1381 }, { "epoch": 0.88448, "grad_norm": 3.326813220977783, "learning_rate": 5e-06, "loss": 1.2921, "mean_token_accuracy": 0.6827266663312912, "num_tokens": 17962574.0, "step": 1382 }, { "epoch": 0.88512, "grad_norm": 3.539931535720825, "learning_rate": 5e-06, "loss": 1.0815, "mean_token_accuracy": 0.7009731009602547, "num_tokens": 17975312.0, "step": 1383 }, { "epoch": 0.88576, "grad_norm": 3.1076414585113525, "learning_rate": 5e-06, "loss": 1.2004, "mean_token_accuracy": 0.6538084149360657, "num_tokens": 17992044.0, "step": 1384 }, { "epoch": 0.8864, "grad_norm": 3.54392147064209, "learning_rate": 5e-06, "loss": 1.3749, "mean_token_accuracy": 0.6528435945510864, "num_tokens": 18004880.0, "step": 1385 }, { "epoch": 0.88704, "grad_norm": 3.1049365997314453, "learning_rate": 5e-06, "loss": 1.4558, "mean_token_accuracy": 0.6254525259137154, "num_tokens": 18018899.0, "step": 1386 }, { "epoch": 0.88768, "grad_norm": 3.872276782989502, "learning_rate": 5e-06, "loss": 1.3721, "mean_token_accuracy": 0.6540368646383286, "num_tokens": 18031425.0, "step": 1387 }, { "epoch": 0.88832, "grad_norm": 4.218468189239502, "learning_rate": 5e-06, "loss": 1.1603, "mean_token_accuracy": 0.6857082098722458, "num_tokens": 18042604.0, "step": 1388 }, { "epoch": 0.88896, "grad_norm": 3.564180374145508, "learning_rate": 5e-06, "loss": 1.3641, "mean_token_accuracy": 0.6548919975757599, "num_tokens": 18055105.0, "step": 1389 }, { "epoch": 0.8896, "grad_norm": 3.4216361045837402, "learning_rate": 5e-06, "loss": 1.3133, "mean_token_accuracy": 0.6648430451750755, "num_tokens": 18068895.0, "step": 1390 }, { "epoch": 0.89024, "grad_norm": 3.466216564178467, "learning_rate": 5e-06, "loss": 1.426, "mean_token_accuracy": 0.675171747803688, "num_tokens": 18082806.0, "step": 1391 }, { "epoch": 0.89088, "grad_norm": 4.009366512298584, "learning_rate": 5e-06, "loss": 1.151, "mean_token_accuracy": 0.6659594774246216, "num_tokens": 18093287.0, "step": 1392 }, { "epoch": 0.89152, "grad_norm": 3.287865161895752, "learning_rate": 5e-06, "loss": 1.1818, "mean_token_accuracy": 0.697388269007206, "num_tokens": 18107755.0, "step": 1393 }, { "epoch": 0.89216, "grad_norm": 3.865363597869873, "learning_rate": 5e-06, "loss": 1.2494, "mean_token_accuracy": 0.6640971228480339, "num_tokens": 18118665.0, "step": 1394 }, { "epoch": 0.8928, "grad_norm": 3.694581985473633, "learning_rate": 5e-06, "loss": 1.2886, "mean_token_accuracy": 0.6522090062499046, "num_tokens": 18130135.0, "step": 1395 }, { "epoch": 0.89344, "grad_norm": 3.5079498291015625, "learning_rate": 5e-06, "loss": 1.2224, "mean_token_accuracy": 0.6756654903292656, "num_tokens": 18143669.0, "step": 1396 }, { "epoch": 0.89408, "grad_norm": 3.9231410026550293, "learning_rate": 5e-06, "loss": 1.2382, "mean_token_accuracy": 0.6565620601177216, "num_tokens": 18155787.0, "step": 1397 }, { "epoch": 0.89472, "grad_norm": 3.2922706604003906, "learning_rate": 5e-06, "loss": 1.3624, "mean_token_accuracy": 0.657343290746212, "num_tokens": 18169330.0, "step": 1398 }, { "epoch": 0.89536, "grad_norm": 4.219677448272705, "learning_rate": 5e-06, "loss": 1.3696, "mean_token_accuracy": 0.6795709133148193, "num_tokens": 18181111.0, "step": 1399 }, { "epoch": 0.896, "grad_norm": 3.3847157955169678, "learning_rate": 5e-06, "loss": 1.2803, "mean_token_accuracy": 0.6801963672041893, "num_tokens": 18194826.0, "step": 1400 }, { "epoch": 0.89664, "grad_norm": 3.3101882934570312, "learning_rate": 5e-06, "loss": 1.2146, "mean_token_accuracy": 0.6810724586248398, "num_tokens": 18207891.0, "step": 1401 }, { "epoch": 0.89728, "grad_norm": 4.586159706115723, "learning_rate": 5e-06, "loss": 1.234, "mean_token_accuracy": 0.6616763696074486, "num_tokens": 18219068.0, "step": 1402 }, { "epoch": 0.89792, "grad_norm": 2.9213805198669434, "learning_rate": 5e-06, "loss": 1.5959, "mean_token_accuracy": 0.6127992421388626, "num_tokens": 18234945.0, "step": 1403 }, { "epoch": 0.89856, "grad_norm": 3.180678606033325, "learning_rate": 5e-06, "loss": 1.2227, "mean_token_accuracy": 0.680058054625988, "num_tokens": 18249768.0, "step": 1404 }, { "epoch": 0.8992, "grad_norm": 3.4679532051086426, "learning_rate": 5e-06, "loss": 1.2374, "mean_token_accuracy": 0.67696313560009, "num_tokens": 18265924.0, "step": 1405 }, { "epoch": 0.89984, "grad_norm": 3.4234979152679443, "learning_rate": 5e-06, "loss": 1.5505, "mean_token_accuracy": 0.6274235621094704, "num_tokens": 18280819.0, "step": 1406 }, { "epoch": 0.90048, "grad_norm": 4.96069860458374, "learning_rate": 5e-06, "loss": 1.4236, "mean_token_accuracy": 0.607517022639513, "num_tokens": 18291686.0, "step": 1407 }, { "epoch": 0.90112, "grad_norm": 3.1977005004882812, "learning_rate": 5e-06, "loss": 1.3486, "mean_token_accuracy": 0.6483859121799469, "num_tokens": 18304993.0, "step": 1408 }, { "epoch": 0.90176, "grad_norm": 3.5749099254608154, "learning_rate": 5e-06, "loss": 1.2922, "mean_token_accuracy": 0.6452238261699677, "num_tokens": 18319373.0, "step": 1409 }, { "epoch": 0.9024, "grad_norm": 3.388899803161621, "learning_rate": 5e-06, "loss": 1.3281, "mean_token_accuracy": 0.6475742906332016, "num_tokens": 18331998.0, "step": 1410 }, { "epoch": 0.90304, "grad_norm": 3.4031882286071777, "learning_rate": 5e-06, "loss": 1.3355, "mean_token_accuracy": 0.6812806725502014, "num_tokens": 18344632.0, "step": 1411 }, { "epoch": 0.90368, "grad_norm": 3.8880221843719482, "learning_rate": 5e-06, "loss": 1.3898, "mean_token_accuracy": 0.6473888382315636, "num_tokens": 18356350.0, "step": 1412 }, { "epoch": 0.90432, "grad_norm": 3.5985724925994873, "learning_rate": 5e-06, "loss": 1.2345, "mean_token_accuracy": 0.6557316966354847, "num_tokens": 18368400.0, "step": 1413 }, { "epoch": 0.90496, "grad_norm": 3.6234962940216064, "learning_rate": 5e-06, "loss": 1.1942, "mean_token_accuracy": 0.6906508356332779, "num_tokens": 18379118.0, "step": 1414 }, { "epoch": 0.9056, "grad_norm": 3.8934993743896484, "learning_rate": 5e-06, "loss": 1.2382, "mean_token_accuracy": 0.6724176928400993, "num_tokens": 18391595.0, "step": 1415 }, { "epoch": 0.90624, "grad_norm": 3.603591203689575, "learning_rate": 5e-06, "loss": 1.3737, "mean_token_accuracy": 0.6498560681939125, "num_tokens": 18403595.0, "step": 1416 }, { "epoch": 0.90688, "grad_norm": 3.2106738090515137, "learning_rate": 5e-06, "loss": 1.2911, "mean_token_accuracy": 0.6614857837557793, "num_tokens": 18418034.0, "step": 1417 }, { "epoch": 0.90752, "grad_norm": 3.0255284309387207, "learning_rate": 5e-06, "loss": 1.2975, "mean_token_accuracy": 0.653803177177906, "num_tokens": 18434798.0, "step": 1418 }, { "epoch": 0.90816, "grad_norm": 3.696108818054199, "learning_rate": 5e-06, "loss": 1.3184, "mean_token_accuracy": 0.6341993510723114, "num_tokens": 18446612.0, "step": 1419 }, { "epoch": 0.9088, "grad_norm": 4.0753254890441895, "learning_rate": 5e-06, "loss": 1.2244, "mean_token_accuracy": 0.6535854563117027, "num_tokens": 18458141.0, "step": 1420 }, { "epoch": 0.90944, "grad_norm": 3.655604124069214, "learning_rate": 5e-06, "loss": 1.3088, "mean_token_accuracy": 0.6778343543410301, "num_tokens": 18471653.0, "step": 1421 }, { "epoch": 0.91008, "grad_norm": 3.4860193729400635, "learning_rate": 5e-06, "loss": 1.2065, "mean_token_accuracy": 0.6871431916952133, "num_tokens": 18482903.0, "step": 1422 }, { "epoch": 0.91072, "grad_norm": 3.5701212882995605, "learning_rate": 5e-06, "loss": 1.161, "mean_token_accuracy": 0.6876110881567001, "num_tokens": 18495519.0, "step": 1423 }, { "epoch": 0.91136, "grad_norm": 4.311164855957031, "learning_rate": 5e-06, "loss": 1.2691, "mean_token_accuracy": 0.6963246017694473, "num_tokens": 18506391.0, "step": 1424 }, { "epoch": 0.912, "grad_norm": 3.228339672088623, "learning_rate": 5e-06, "loss": 1.2486, "mean_token_accuracy": 0.6647578254342079, "num_tokens": 18521751.0, "step": 1425 }, { "epoch": 0.91264, "grad_norm": 3.649463176727295, "learning_rate": 5e-06, "loss": 1.2265, "mean_token_accuracy": 0.6655023992061615, "num_tokens": 18533605.0, "step": 1426 }, { "epoch": 0.91328, "grad_norm": 3.822047710418701, "learning_rate": 5e-06, "loss": 1.2303, "mean_token_accuracy": 0.6853557825088501, "num_tokens": 18545920.0, "step": 1427 }, { "epoch": 0.91392, "grad_norm": 3.622427463531494, "learning_rate": 5e-06, "loss": 1.3153, "mean_token_accuracy": 0.6682358086109161, "num_tokens": 18558370.0, "step": 1428 }, { "epoch": 0.91456, "grad_norm": 3.013226270675659, "learning_rate": 5e-06, "loss": 1.0413, "mean_token_accuracy": 0.7230858653783798, "num_tokens": 18572388.0, "step": 1429 }, { "epoch": 0.9152, "grad_norm": 2.999063730239868, "learning_rate": 5e-06, "loss": 1.2757, "mean_token_accuracy": 0.658422015607357, "num_tokens": 18587001.0, "step": 1430 }, { "epoch": 0.91584, "grad_norm": 3.246445417404175, "learning_rate": 5e-06, "loss": 1.1428, "mean_token_accuracy": 0.7174563780426979, "num_tokens": 18600196.0, "step": 1431 }, { "epoch": 0.91648, "grad_norm": 3.52728533744812, "learning_rate": 5e-06, "loss": 1.2719, "mean_token_accuracy": 0.6571086049079895, "num_tokens": 18612602.0, "step": 1432 }, { "epoch": 0.91712, "grad_norm": 3.3236947059631348, "learning_rate": 5e-06, "loss": 1.3722, "mean_token_accuracy": 0.6516182944178581, "num_tokens": 18628569.0, "step": 1433 }, { "epoch": 0.91776, "grad_norm": 3.9207522869110107, "learning_rate": 5e-06, "loss": 1.289, "mean_token_accuracy": 0.6646075919270515, "num_tokens": 18639375.0, "step": 1434 }, { "epoch": 0.9184, "grad_norm": 3.3679165840148926, "learning_rate": 5e-06, "loss": 1.3844, "mean_token_accuracy": 0.6545412912964821, "num_tokens": 18652531.0, "step": 1435 }, { "epoch": 0.91904, "grad_norm": 3.58003830909729, "learning_rate": 5e-06, "loss": 1.3116, "mean_token_accuracy": 0.655610017478466, "num_tokens": 18665160.0, "step": 1436 }, { "epoch": 0.91968, "grad_norm": 3.827817916870117, "learning_rate": 5e-06, "loss": 1.1945, "mean_token_accuracy": 0.6569493412971497, "num_tokens": 18676671.0, "step": 1437 }, { "epoch": 0.92032, "grad_norm": 3.6998956203460693, "learning_rate": 5e-06, "loss": 1.5481, "mean_token_accuracy": 0.6249835789203644, "num_tokens": 18690078.0, "step": 1438 }, { "epoch": 0.92096, "grad_norm": 3.2389333248138428, "learning_rate": 5e-06, "loss": 1.2938, "mean_token_accuracy": 0.65943942964077, "num_tokens": 18703678.0, "step": 1439 }, { "epoch": 0.9216, "grad_norm": 2.924175262451172, "learning_rate": 5e-06, "loss": 1.2873, "mean_token_accuracy": 0.6494470685720444, "num_tokens": 18719576.0, "step": 1440 }, { "epoch": 0.92224, "grad_norm": 3.7290942668914795, "learning_rate": 5e-06, "loss": 1.2667, "mean_token_accuracy": 0.6728792116045952, "num_tokens": 18732712.0, "step": 1441 }, { "epoch": 0.92288, "grad_norm": 3.406003952026367, "learning_rate": 5e-06, "loss": 1.1128, "mean_token_accuracy": 0.7027332484722137, "num_tokens": 18745929.0, "step": 1442 }, { "epoch": 0.92352, "grad_norm": 3.9130918979644775, "learning_rate": 5e-06, "loss": 1.1714, "mean_token_accuracy": 0.6731210052967072, "num_tokens": 18755977.0, "step": 1443 }, { "epoch": 0.92416, "grad_norm": 3.678868055343628, "learning_rate": 5e-06, "loss": 1.3613, "mean_token_accuracy": 0.6376957893371582, "num_tokens": 18767848.0, "step": 1444 }, { "epoch": 0.9248, "grad_norm": 3.355009078979492, "learning_rate": 5e-06, "loss": 1.4501, "mean_token_accuracy": 0.6530297324061394, "num_tokens": 18781692.0, "step": 1445 }, { "epoch": 0.92544, "grad_norm": 3.197375774383545, "learning_rate": 5e-06, "loss": 1.4667, "mean_token_accuracy": 0.6258358731865883, "num_tokens": 18796361.0, "step": 1446 }, { "epoch": 0.92608, "grad_norm": 3.364900588989258, "learning_rate": 5e-06, "loss": 1.4204, "mean_token_accuracy": 0.6358629465103149, "num_tokens": 18810771.0, "step": 1447 }, { "epoch": 0.92672, "grad_norm": 3.323707342147827, "learning_rate": 5e-06, "loss": 1.1537, "mean_token_accuracy": 0.700812466442585, "num_tokens": 18824895.0, "step": 1448 }, { "epoch": 0.92736, "grad_norm": 3.5423851013183594, "learning_rate": 5e-06, "loss": 1.1198, "mean_token_accuracy": 0.6927414685487747, "num_tokens": 18838244.0, "step": 1449 }, { "epoch": 0.928, "grad_norm": 3.5557827949523926, "learning_rate": 5e-06, "loss": 1.3942, "mean_token_accuracy": 0.6344395503401756, "num_tokens": 18850747.0, "step": 1450 }, { "epoch": 0.92864, "grad_norm": 3.8772428035736084, "learning_rate": 5e-06, "loss": 1.1849, "mean_token_accuracy": 0.6797264739871025, "num_tokens": 18863209.0, "step": 1451 }, { "epoch": 0.92928, "grad_norm": 3.387641668319702, "learning_rate": 5e-06, "loss": 1.4152, "mean_token_accuracy": 0.6333313882350922, "num_tokens": 18876056.0, "step": 1452 }, { "epoch": 0.92992, "grad_norm": 3.554407835006714, "learning_rate": 5e-06, "loss": 1.1832, "mean_token_accuracy": 0.6640536859631538, "num_tokens": 18890920.0, "step": 1453 }, { "epoch": 0.93056, "grad_norm": 3.302236795425415, "learning_rate": 5e-06, "loss": 1.5489, "mean_token_accuracy": 0.6134847179055214, "num_tokens": 18905793.0, "step": 1454 }, { "epoch": 0.9312, "grad_norm": 3.531574010848999, "learning_rate": 5e-06, "loss": 1.2801, "mean_token_accuracy": 0.6507202833890915, "num_tokens": 18920224.0, "step": 1455 }, { "epoch": 0.93184, "grad_norm": 3.5933139324188232, "learning_rate": 5e-06, "loss": 1.3922, "mean_token_accuracy": 0.6551200449466705, "num_tokens": 18932613.0, "step": 1456 }, { "epoch": 0.93248, "grad_norm": 3.254462480545044, "learning_rate": 5e-06, "loss": 1.3985, "mean_token_accuracy": 0.6505570337176323, "num_tokens": 18946774.0, "step": 1457 }, { "epoch": 0.93312, "grad_norm": 3.2945821285247803, "learning_rate": 5e-06, "loss": 1.5279, "mean_token_accuracy": 0.6084811314940453, "num_tokens": 18961275.0, "step": 1458 }, { "epoch": 0.93376, "grad_norm": 3.2776741981506348, "learning_rate": 5e-06, "loss": 1.3401, "mean_token_accuracy": 0.640129804611206, "num_tokens": 18975529.0, "step": 1459 }, { "epoch": 0.9344, "grad_norm": 3.2493832111358643, "learning_rate": 5e-06, "loss": 1.1077, "mean_token_accuracy": 0.6907928735017776, "num_tokens": 18988267.0, "step": 1460 }, { "epoch": 0.93504, "grad_norm": 3.765650987625122, "learning_rate": 5e-06, "loss": 1.3092, "mean_token_accuracy": 0.6711199656128883, "num_tokens": 19000229.0, "step": 1461 }, { "epoch": 0.93568, "grad_norm": 3.1340558528900146, "learning_rate": 5e-06, "loss": 1.4336, "mean_token_accuracy": 0.6485133692622185, "num_tokens": 19014356.0, "step": 1462 }, { "epoch": 0.93632, "grad_norm": 3.672553300857544, "learning_rate": 5e-06, "loss": 1.1751, "mean_token_accuracy": 0.664104662835598, "num_tokens": 19025717.0, "step": 1463 }, { "epoch": 0.93696, "grad_norm": 3.753906726837158, "learning_rate": 5e-06, "loss": 1.1003, "mean_token_accuracy": 0.6995716020464897, "num_tokens": 19037864.0, "step": 1464 }, { "epoch": 0.9376, "grad_norm": 3.1207399368286133, "learning_rate": 5e-06, "loss": 1.2334, "mean_token_accuracy": 0.6692882552742958, "num_tokens": 19052336.0, "step": 1465 }, { "epoch": 0.93824, "grad_norm": 3.639620065689087, "learning_rate": 5e-06, "loss": 1.396, "mean_token_accuracy": 0.6677844971418381, "num_tokens": 19065183.0, "step": 1466 }, { "epoch": 0.93888, "grad_norm": 3.5665981769561768, "learning_rate": 5e-06, "loss": 1.3489, "mean_token_accuracy": 0.66384107619524, "num_tokens": 19078765.0, "step": 1467 }, { "epoch": 0.93952, "grad_norm": 3.5918264389038086, "learning_rate": 5e-06, "loss": 1.4087, "mean_token_accuracy": 0.6427194476127625, "num_tokens": 19091098.0, "step": 1468 }, { "epoch": 0.94016, "grad_norm": 3.3692591190338135, "learning_rate": 5e-06, "loss": 1.3897, "mean_token_accuracy": 0.6431680992245674, "num_tokens": 19105664.0, "step": 1469 }, { "epoch": 0.9408, "grad_norm": 3.6854288578033447, "learning_rate": 5e-06, "loss": 1.3319, "mean_token_accuracy": 0.6552760303020477, "num_tokens": 19118215.0, "step": 1470 }, { "epoch": 0.94144, "grad_norm": 3.3998701572418213, "learning_rate": 5e-06, "loss": 1.1683, "mean_token_accuracy": 0.675237774848938, "num_tokens": 19130126.0, "step": 1471 }, { "epoch": 0.94208, "grad_norm": 3.5668833255767822, "learning_rate": 5e-06, "loss": 1.4991, "mean_token_accuracy": 0.6222522705793381, "num_tokens": 19142375.0, "step": 1472 }, { "epoch": 0.94272, "grad_norm": 3.275745153427124, "learning_rate": 5e-06, "loss": 1.3953, "mean_token_accuracy": 0.6239468678832054, "num_tokens": 19157943.0, "step": 1473 }, { "epoch": 0.94336, "grad_norm": 4.061445236206055, "learning_rate": 5e-06, "loss": 1.3817, "mean_token_accuracy": 0.6464495584368706, "num_tokens": 19169261.0, "step": 1474 }, { "epoch": 0.944, "grad_norm": 3.1921486854553223, "learning_rate": 5e-06, "loss": 1.284, "mean_token_accuracy": 0.6610319390892982, "num_tokens": 19184566.0, "step": 1475 }, { "epoch": 0.94464, "grad_norm": 3.192448139190674, "learning_rate": 5e-06, "loss": 1.2544, "mean_token_accuracy": 0.670927107334137, "num_tokens": 19199161.0, "step": 1476 }, { "epoch": 0.94528, "grad_norm": 3.534567356109619, "learning_rate": 5e-06, "loss": 1.2898, "mean_token_accuracy": 0.6620035171508789, "num_tokens": 19210216.0, "step": 1477 }, { "epoch": 0.94592, "grad_norm": 3.4070894718170166, "learning_rate": 5e-06, "loss": 1.2067, "mean_token_accuracy": 0.665832906961441, "num_tokens": 19222748.0, "step": 1478 }, { "epoch": 0.94656, "grad_norm": 3.373779058456421, "learning_rate": 5e-06, "loss": 1.3304, "mean_token_accuracy": 0.6508694216609001, "num_tokens": 19236471.0, "step": 1479 }, { "epoch": 0.9472, "grad_norm": 3.518333911895752, "learning_rate": 5e-06, "loss": 1.4454, "mean_token_accuracy": 0.645517073571682, "num_tokens": 19249438.0, "step": 1480 }, { "epoch": 0.94784, "grad_norm": 3.995748519897461, "learning_rate": 5e-06, "loss": 1.4204, "mean_token_accuracy": 0.6810062602162361, "num_tokens": 19262043.0, "step": 1481 }, { "epoch": 0.94848, "grad_norm": 3.0706183910369873, "learning_rate": 5e-06, "loss": 1.0148, "mean_token_accuracy": 0.7076255902647972, "num_tokens": 19277307.0, "step": 1482 }, { "epoch": 0.94912, "grad_norm": 3.0978240966796875, "learning_rate": 5e-06, "loss": 1.3144, "mean_token_accuracy": 0.6533934101462364, "num_tokens": 19292657.0, "step": 1483 }, { "epoch": 0.94976, "grad_norm": 3.988011121749878, "learning_rate": 5e-06, "loss": 1.3691, "mean_token_accuracy": 0.6342190653085709, "num_tokens": 19303123.0, "step": 1484 }, { "epoch": 0.9504, "grad_norm": 3.7990894317626953, "learning_rate": 5e-06, "loss": 1.107, "mean_token_accuracy": 0.7004605457186699, "num_tokens": 19314275.0, "step": 1485 }, { "epoch": 0.95104, "grad_norm": 3.5531113147735596, "learning_rate": 5e-06, "loss": 1.3478, "mean_token_accuracy": 0.6372592151165009, "num_tokens": 19327717.0, "step": 1486 }, { "epoch": 0.95168, "grad_norm": 3.129286050796509, "learning_rate": 5e-06, "loss": 1.5809, "mean_token_accuracy": 0.6213468164205551, "num_tokens": 19341237.0, "step": 1487 }, { "epoch": 0.95232, "grad_norm": 3.394064426422119, "learning_rate": 5e-06, "loss": 1.3591, "mean_token_accuracy": 0.6372789964079857, "num_tokens": 19355577.0, "step": 1488 }, { "epoch": 0.95296, "grad_norm": 3.2110018730163574, "learning_rate": 5e-06, "loss": 1.2399, "mean_token_accuracy": 0.679095022380352, "num_tokens": 19371326.0, "step": 1489 }, { "epoch": 0.9536, "grad_norm": 3.3202333450317383, "learning_rate": 5e-06, "loss": 1.3916, "mean_token_accuracy": 0.6611816883087158, "num_tokens": 19385868.0, "step": 1490 }, { "epoch": 0.95424, "grad_norm": 3.5390098094940186, "learning_rate": 5e-06, "loss": 1.2219, "mean_token_accuracy": 0.6761639937758446, "num_tokens": 19398025.0, "step": 1491 }, { "epoch": 0.95488, "grad_norm": 3.390742778778076, "learning_rate": 5e-06, "loss": 1.5499, "mean_token_accuracy": 0.6107296124100685, "num_tokens": 19412343.0, "step": 1492 }, { "epoch": 0.95552, "grad_norm": 2.821200132369995, "learning_rate": 5e-06, "loss": 1.2155, "mean_token_accuracy": 0.6577084437012672, "num_tokens": 19428748.0, "step": 1493 }, { "epoch": 0.95616, "grad_norm": 3.292036771774292, "learning_rate": 5e-06, "loss": 1.3155, "mean_token_accuracy": 0.644202746450901, "num_tokens": 19440656.0, "step": 1494 }, { "epoch": 0.9568, "grad_norm": 3.416463851928711, "learning_rate": 5e-06, "loss": 1.2269, "mean_token_accuracy": 0.6907675266265869, "num_tokens": 19452544.0, "step": 1495 }, { "epoch": 0.95744, "grad_norm": 3.6329751014709473, "learning_rate": 5e-06, "loss": 1.3323, "mean_token_accuracy": 0.6382646858692169, "num_tokens": 19465315.0, "step": 1496 }, { "epoch": 0.95808, "grad_norm": 3.5367205142974854, "learning_rate": 5e-06, "loss": 1.373, "mean_token_accuracy": 0.6586090922355652, "num_tokens": 19480115.0, "step": 1497 }, { "epoch": 0.95872, "grad_norm": 3.5177509784698486, "learning_rate": 5e-06, "loss": 1.2388, "mean_token_accuracy": 0.6645878851413727, "num_tokens": 19494388.0, "step": 1498 }, { "epoch": 0.95936, "grad_norm": 3.709169626235962, "learning_rate": 5e-06, "loss": 1.3733, "mean_token_accuracy": 0.6607565060257912, "num_tokens": 19505621.0, "step": 1499 }, { "epoch": 0.96, "grad_norm": 3.3196604251861572, "learning_rate": 5e-06, "loss": 1.1325, "mean_token_accuracy": 0.6826166063547134, "num_tokens": 19519830.0, "step": 1500 }, { "epoch": 0.96064, "grad_norm": 4.17763090133667, "learning_rate": 5e-06, "loss": 1.2355, "mean_token_accuracy": 0.6763554587960243, "num_tokens": 19532118.0, "step": 1501 }, { "epoch": 0.96128, "grad_norm": 3.9797887802124023, "learning_rate": 5e-06, "loss": 0.9252, "mean_token_accuracy": 0.7308862134814262, "num_tokens": 19543422.0, "step": 1502 }, { "epoch": 0.96192, "grad_norm": 3.3593435287475586, "learning_rate": 5e-06, "loss": 1.121, "mean_token_accuracy": 0.6892295926809311, "num_tokens": 19555897.0, "step": 1503 }, { "epoch": 0.96256, "grad_norm": 3.6559438705444336, "learning_rate": 5e-06, "loss": 1.1375, "mean_token_accuracy": 0.6769029051065445, "num_tokens": 19567248.0, "step": 1504 }, { "epoch": 0.9632, "grad_norm": 3.6883292198181152, "learning_rate": 5e-06, "loss": 1.3164, "mean_token_accuracy": 0.643324077129364, "num_tokens": 19579310.0, "step": 1505 }, { "epoch": 0.96384, "grad_norm": 3.5200116634368896, "learning_rate": 5e-06, "loss": 1.2694, "mean_token_accuracy": 0.6747664734721184, "num_tokens": 19592537.0, "step": 1506 }, { "epoch": 0.96448, "grad_norm": 3.3167619705200195, "learning_rate": 5e-06, "loss": 1.2958, "mean_token_accuracy": 0.6770147830247879, "num_tokens": 19606932.0, "step": 1507 }, { "epoch": 0.96512, "grad_norm": 2.7224249839782715, "learning_rate": 5e-06, "loss": 1.3125, "mean_token_accuracy": 0.6614532843232155, "num_tokens": 19624296.0, "step": 1508 }, { "epoch": 0.96576, "grad_norm": 3.4137089252471924, "learning_rate": 5e-06, "loss": 1.2778, "mean_token_accuracy": 0.662353903055191, "num_tokens": 19637049.0, "step": 1509 }, { "epoch": 0.9664, "grad_norm": 3.7370848655700684, "learning_rate": 5e-06, "loss": 1.4503, "mean_token_accuracy": 0.639873132109642, "num_tokens": 19649788.0, "step": 1510 }, { "epoch": 0.96704, "grad_norm": 3.4333293437957764, "learning_rate": 5e-06, "loss": 1.4996, "mean_token_accuracy": 0.63913669064641, "num_tokens": 19662956.0, "step": 1511 }, { "epoch": 0.96768, "grad_norm": 3.8436150550842285, "learning_rate": 5e-06, "loss": 1.2372, "mean_token_accuracy": 0.6666671261191368, "num_tokens": 19674701.0, "step": 1512 }, { "epoch": 0.96832, "grad_norm": 3.4364569187164307, "learning_rate": 5e-06, "loss": 1.4256, "mean_token_accuracy": 0.6375450566411018, "num_tokens": 19688356.0, "step": 1513 }, { "epoch": 0.96896, "grad_norm": 3.1849286556243896, "learning_rate": 5e-06, "loss": 1.3019, "mean_token_accuracy": 0.654203861951828, "num_tokens": 19703055.0, "step": 1514 }, { "epoch": 0.9696, "grad_norm": 3.790954828262329, "learning_rate": 5e-06, "loss": 1.1957, "mean_token_accuracy": 0.6805417165160179, "num_tokens": 19715360.0, "step": 1515 }, { "epoch": 0.97024, "grad_norm": 3.696563243865967, "learning_rate": 5e-06, "loss": 1.2499, "mean_token_accuracy": 0.6584246829152107, "num_tokens": 19726044.0, "step": 1516 }, { "epoch": 0.97088, "grad_norm": 4.10850191116333, "learning_rate": 5e-06, "loss": 1.4378, "mean_token_accuracy": 0.6355271711945534, "num_tokens": 19739139.0, "step": 1517 }, { "epoch": 0.97152, "grad_norm": 3.1323556900024414, "learning_rate": 5e-06, "loss": 1.3652, "mean_token_accuracy": 0.6413158774375916, "num_tokens": 19753058.0, "step": 1518 }, { "epoch": 0.97216, "grad_norm": 3.334622859954834, "learning_rate": 5e-06, "loss": 1.2963, "mean_token_accuracy": 0.6517771631479263, "num_tokens": 19765569.0, "step": 1519 }, { "epoch": 0.9728, "grad_norm": 5.364054203033447, "learning_rate": 5e-06, "loss": 1.1438, "mean_token_accuracy": 0.6952068582177162, "num_tokens": 19778058.0, "step": 1520 }, { "epoch": 0.97344, "grad_norm": 3.416874408721924, "learning_rate": 5e-06, "loss": 1.1759, "mean_token_accuracy": 0.6735500246286392, "num_tokens": 19792728.0, "step": 1521 }, { "epoch": 0.97408, "grad_norm": 3.164233922958374, "learning_rate": 5e-06, "loss": 1.1211, "mean_token_accuracy": 0.6952219977974892, "num_tokens": 19807085.0, "step": 1522 }, { "epoch": 0.97472, "grad_norm": 3.73028564453125, "learning_rate": 5e-06, "loss": 1.3345, "mean_token_accuracy": 0.6841987073421478, "num_tokens": 19821681.0, "step": 1523 }, { "epoch": 0.97536, "grad_norm": 3.401895761489868, "learning_rate": 5e-06, "loss": 1.3681, "mean_token_accuracy": 0.6333037242293358, "num_tokens": 19834796.0, "step": 1524 }, { "epoch": 0.976, "grad_norm": 3.8067119121551514, "learning_rate": 5e-06, "loss": 1.0905, "mean_token_accuracy": 0.6978934183716774, "num_tokens": 19846639.0, "step": 1525 }, { "epoch": 0.97664, "grad_norm": 3.070439338684082, "learning_rate": 5e-06, "loss": 1.2461, "mean_token_accuracy": 0.653811477124691, "num_tokens": 19860465.0, "step": 1526 }, { "epoch": 0.97728, "grad_norm": 3.186588764190674, "learning_rate": 5e-06, "loss": 1.1821, "mean_token_accuracy": 0.7026697173714638, "num_tokens": 19876272.0, "step": 1527 }, { "epoch": 0.97792, "grad_norm": 3.122529983520508, "learning_rate": 5e-06, "loss": 1.1799, "mean_token_accuracy": 0.6770785599946976, "num_tokens": 19892221.0, "step": 1528 }, { "epoch": 0.97856, "grad_norm": 3.7920093536376953, "learning_rate": 5e-06, "loss": 1.3852, "mean_token_accuracy": 0.6517146974802017, "num_tokens": 19903219.0, "step": 1529 }, { "epoch": 0.9792, "grad_norm": 3.9800093173980713, "learning_rate": 5e-06, "loss": 1.3666, "mean_token_accuracy": 0.6671213582158089, "num_tokens": 19914283.0, "step": 1530 }, { "epoch": 0.97984, "grad_norm": 4.115480899810791, "learning_rate": 5e-06, "loss": 1.6462, "mean_token_accuracy": 0.6288831681013107, "num_tokens": 19924831.0, "step": 1531 }, { "epoch": 0.98048, "grad_norm": 3.8407366275787354, "learning_rate": 5e-06, "loss": 1.3123, "mean_token_accuracy": 0.646103672683239, "num_tokens": 19935368.0, "step": 1532 }, { "epoch": 0.98112, "grad_norm": 3.036931276321411, "learning_rate": 5e-06, "loss": 1.3947, "mean_token_accuracy": 0.6490734815597534, "num_tokens": 19950888.0, "step": 1533 }, { "epoch": 0.98176, "grad_norm": 3.3416826725006104, "learning_rate": 5e-06, "loss": 1.3709, "mean_token_accuracy": 0.6444736868143082, "num_tokens": 19964717.0, "step": 1534 }, { "epoch": 0.9824, "grad_norm": 3.184088945388794, "learning_rate": 5e-06, "loss": 1.3429, "mean_token_accuracy": 0.673894077539444, "num_tokens": 19977976.0, "step": 1535 }, { "epoch": 0.98304, "grad_norm": 3.382946491241455, "learning_rate": 5e-06, "loss": 1.481, "mean_token_accuracy": 0.6425464749336243, "num_tokens": 19991312.0, "step": 1536 }, { "epoch": 0.98368, "grad_norm": 3.7429699897766113, "learning_rate": 5e-06, "loss": 1.2422, "mean_token_accuracy": 0.6737086698412895, "num_tokens": 20002300.0, "step": 1537 }, { "epoch": 0.98432, "grad_norm": 3.6931872367858887, "learning_rate": 5e-06, "loss": 1.3122, "mean_token_accuracy": 0.6581440344452858, "num_tokens": 20015107.0, "step": 1538 }, { "epoch": 0.98496, "grad_norm": 4.0337300300598145, "learning_rate": 5e-06, "loss": 1.3912, "mean_token_accuracy": 0.6898427382111549, "num_tokens": 20027265.0, "step": 1539 }, { "epoch": 0.9856, "grad_norm": 3.514187812805176, "learning_rate": 5e-06, "loss": 1.0613, "mean_token_accuracy": 0.7012772336602211, "num_tokens": 20038919.0, "step": 1540 }, { "epoch": 0.98624, "grad_norm": 3.5034477710723877, "learning_rate": 5e-06, "loss": 1.4009, "mean_token_accuracy": 0.6428939253091812, "num_tokens": 20052482.0, "step": 1541 }, { "epoch": 0.98688, "grad_norm": 3.3519279956817627, "learning_rate": 5e-06, "loss": 1.4362, "mean_token_accuracy": 0.6396335512399673, "num_tokens": 20067032.0, "step": 1542 }, { "epoch": 0.98752, "grad_norm": 3.7068188190460205, "learning_rate": 5e-06, "loss": 1.2301, "mean_token_accuracy": 0.6891591548919678, "num_tokens": 20079146.0, "step": 1543 }, { "epoch": 0.98816, "grad_norm": 3.6617250442504883, "learning_rate": 5e-06, "loss": 1.211, "mean_token_accuracy": 0.7132939025759697, "num_tokens": 20089620.0, "step": 1544 }, { "epoch": 0.9888, "grad_norm": 3.217038631439209, "learning_rate": 5e-06, "loss": 1.3661, "mean_token_accuracy": 0.6576998308300972, "num_tokens": 20103587.0, "step": 1545 }, { "epoch": 0.98944, "grad_norm": 3.996293783187866, "learning_rate": 5e-06, "loss": 1.2923, "mean_token_accuracy": 0.6637570187449455, "num_tokens": 20115402.0, "step": 1546 }, { "epoch": 0.99008, "grad_norm": 3.543278932571411, "learning_rate": 5e-06, "loss": 1.2429, "mean_token_accuracy": 0.6742196753621101, "num_tokens": 20126222.0, "step": 1547 }, { "epoch": 0.99072, "grad_norm": 3.501190662384033, "learning_rate": 5e-06, "loss": 1.2304, "mean_token_accuracy": 0.6541951596736908, "num_tokens": 20137476.0, "step": 1548 }, { "epoch": 0.99136, "grad_norm": 3.904467821121216, "learning_rate": 5e-06, "loss": 1.2723, "mean_token_accuracy": 0.6750770211219788, "num_tokens": 20149377.0, "step": 1549 }, { "epoch": 0.992, "grad_norm": 3.557426691055298, "learning_rate": 5e-06, "loss": 1.4754, "mean_token_accuracy": 0.6633486226201057, "num_tokens": 20161955.0, "step": 1550 }, { "epoch": 0.99264, "grad_norm": 3.5321543216705322, "learning_rate": 5e-06, "loss": 1.3909, "mean_token_accuracy": 0.6640786305069923, "num_tokens": 20174432.0, "step": 1551 }, { "epoch": 0.99328, "grad_norm": 4.1432929039001465, "learning_rate": 5e-06, "loss": 1.2162, "mean_token_accuracy": 0.6733951196074486, "num_tokens": 20186656.0, "step": 1552 }, { "epoch": 0.99392, "grad_norm": 3.221876859664917, "learning_rate": 5e-06, "loss": 1.2039, "mean_token_accuracy": 0.6700675636529922, "num_tokens": 20200325.0, "step": 1553 }, { "epoch": 0.99456, "grad_norm": 3.4923529624938965, "learning_rate": 5e-06, "loss": 1.2479, "mean_token_accuracy": 0.6704057157039642, "num_tokens": 20211958.0, "step": 1554 }, { "epoch": 0.9952, "grad_norm": 3.4751315116882324, "learning_rate": 5e-06, "loss": 1.2513, "mean_token_accuracy": 0.6954710930585861, "num_tokens": 20224457.0, "step": 1555 }, { "epoch": 0.99584, "grad_norm": 3.4763216972351074, "learning_rate": 5e-06, "loss": 1.1645, "mean_token_accuracy": 0.6789154633879662, "num_tokens": 20236259.0, "step": 1556 }, { "epoch": 0.99648, "grad_norm": 3.582597017288208, "learning_rate": 5e-06, "loss": 1.383, "mean_token_accuracy": 0.6580745279788971, "num_tokens": 20250508.0, "step": 1557 }, { "epoch": 0.99712, "grad_norm": 4.058999061584473, "learning_rate": 5e-06, "loss": 1.3162, "mean_token_accuracy": 0.6591609418392181, "num_tokens": 20262337.0, "step": 1558 }, { "epoch": 0.99776, "grad_norm": 3.842996597290039, "learning_rate": 5e-06, "loss": 1.3768, "mean_token_accuracy": 0.6542828008532524, "num_tokens": 20273865.0, "step": 1559 }, { "epoch": 0.9984, "grad_norm": 3.5340254306793213, "learning_rate": 5e-06, "loss": 1.2762, "mean_token_accuracy": 0.6779467761516571, "num_tokens": 20286723.0, "step": 1560 }, { "epoch": 0.99904, "grad_norm": 3.087484836578369, "learning_rate": 5e-06, "loss": 1.3845, "mean_token_accuracy": 0.6415645852684975, "num_tokens": 20302849.0, "step": 1561 }, { "epoch": 0.99968, "grad_norm": 3.4678475856781006, "learning_rate": 5e-06, "loss": 1.2984, "mean_token_accuracy": 0.6632586568593979, "num_tokens": 20315462.0, "step": 1562 } ], "logging_steps": 1, "max_steps": 1562, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 56623305523200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }