{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 2484, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004830917874396135, "grad_norm": null, "learning_rate": 0.0002, "loss": 77.3398, "step": 2 }, { "epoch": 0.00966183574879227, "grad_norm": 46.88166809082031, "learning_rate": 0.0001998389694041868, "loss": 74.1949, "step": 4 }, { "epoch": 0.014492753623188406, "grad_norm": 57.10332489013672, "learning_rate": 0.0001996779388083736, "loss": 64.1131, "step": 6 }, { "epoch": 0.01932367149758454, "grad_norm": 32.98695373535156, "learning_rate": 0.0001995169082125604, "loss": 50.8557, "step": 8 }, { "epoch": 0.024154589371980676, "grad_norm": 42.64508056640625, "learning_rate": 0.0001993558776167472, "loss": 46.3401, "step": 10 }, { "epoch": 0.028985507246376812, "grad_norm": 23.093393325805664, "learning_rate": 0.00019919484702093397, "loss": 42.264, "step": 12 }, { "epoch": 0.033816425120772944, "grad_norm": 11.703313827514648, "learning_rate": 0.00019903381642512078, "loss": 40.6424, "step": 14 }, { "epoch": 0.03864734299516908, "grad_norm": 12.010417938232422, "learning_rate": 0.00019887278582930758, "loss": 41.0451, "step": 16 }, { "epoch": 0.043478260869565216, "grad_norm": 8.227810859680176, "learning_rate": 0.00019871175523349436, "loss": 40.8654, "step": 18 }, { "epoch": 0.04830917874396135, "grad_norm": 8.531281471252441, "learning_rate": 0.00019855072463768116, "loss": 39.9403, "step": 20 }, { "epoch": 0.05314009661835749, "grad_norm": 13.759291648864746, "learning_rate": 0.00019838969404186796, "loss": 40.4885, "step": 22 }, { "epoch": 0.057971014492753624, "grad_norm": 12.24333381652832, "learning_rate": 0.00019822866344605474, "loss": 38.9464, "step": 24 }, { "epoch": 0.06280193236714976, "grad_norm": 9.252888679504395, "learning_rate": 0.00019806763285024154, "loss": 37.4614, "step": 26 }, { "epoch": 0.06763285024154589, 
"grad_norm": 12.859115600585938, "learning_rate": 0.00019790660225442835, "loss": 40.04, "step": 28 }, { "epoch": 0.07246376811594203, "grad_norm": 8.288698196411133, "learning_rate": 0.00019774557165861512, "loss": 39.2962, "step": 30 }, { "epoch": 0.07729468599033816, "grad_norm": 11.058815956115723, "learning_rate": 0.00019758454106280193, "loss": 38.5443, "step": 32 }, { "epoch": 0.0821256038647343, "grad_norm": 16.540409088134766, "learning_rate": 0.00019742351046698876, "loss": 40.7756, "step": 34 }, { "epoch": 0.08695652173913043, "grad_norm": 7.264046669006348, "learning_rate": 0.00019726247987117553, "loss": 40.0833, "step": 36 }, { "epoch": 0.09178743961352658, "grad_norm": 7.761327743530273, "learning_rate": 0.00019710144927536234, "loss": 36.8212, "step": 38 }, { "epoch": 0.0966183574879227, "grad_norm": 12.5891695022583, "learning_rate": 0.00019694041867954914, "loss": 37.6787, "step": 40 }, { "epoch": 0.10144927536231885, "grad_norm": 8.316587448120117, "learning_rate": 0.00019677938808373592, "loss": 38.399, "step": 42 }, { "epoch": 0.10628019323671498, "grad_norm": 6.445375442504883, "learning_rate": 0.00019661835748792272, "loss": 37.4109, "step": 44 }, { "epoch": 0.1111111111111111, "grad_norm": 10.068939208984375, "learning_rate": 0.00019645732689210952, "loss": 36.8106, "step": 46 }, { "epoch": 0.11594202898550725, "grad_norm": 10.530860900878906, "learning_rate": 0.0001962962962962963, "loss": 37.1642, "step": 48 }, { "epoch": 0.12077294685990338, "grad_norm": 7.902243614196777, "learning_rate": 0.0001961352657004831, "loss": 39.0534, "step": 50 }, { "epoch": 0.12560386473429952, "grad_norm": 7.551494598388672, "learning_rate": 0.0001959742351046699, "loss": 39.5417, "step": 52 }, { "epoch": 0.13043478260869565, "grad_norm": 10.609959602355957, "learning_rate": 0.00019581320450885668, "loss": 38.9163, "step": 54 }, { "epoch": 0.13526570048309178, "grad_norm": 11.915197372436523, "learning_rate": 0.0001956521739130435, "loss": 37.7346, "step": 
56 }, { "epoch": 0.14009661835748793, "grad_norm": 9.105422973632812, "learning_rate": 0.0001954911433172303, "loss": 37.7469, "step": 58 }, { "epoch": 0.14492753623188406, "grad_norm": 10.62623119354248, "learning_rate": 0.00019533011272141707, "loss": 37.0263, "step": 60 }, { "epoch": 0.1497584541062802, "grad_norm": 9.892080307006836, "learning_rate": 0.00019516908212560387, "loss": 39.5135, "step": 62 }, { "epoch": 0.15458937198067632, "grad_norm": 12.9131441116333, "learning_rate": 0.00019500805152979068, "loss": 35.7463, "step": 64 }, { "epoch": 0.15942028985507245, "grad_norm": 9.631657600402832, "learning_rate": 0.00019484702093397745, "loss": 36.228, "step": 66 }, { "epoch": 0.1642512077294686, "grad_norm": 9.47088623046875, "learning_rate": 0.00019468599033816426, "loss": 38.1165, "step": 68 }, { "epoch": 0.16908212560386474, "grad_norm": 10.864086151123047, "learning_rate": 0.00019452495974235106, "loss": 37.2582, "step": 70 }, { "epoch": 0.17391304347826086, "grad_norm": 11.572696685791016, "learning_rate": 0.00019436392914653784, "loss": 36.3281, "step": 72 }, { "epoch": 0.178743961352657, "grad_norm": 8.3622407913208, "learning_rate": 0.00019420289855072464, "loss": 37.2185, "step": 74 }, { "epoch": 0.18357487922705315, "grad_norm": 10.799039840698242, "learning_rate": 0.00019404186795491144, "loss": 37.1632, "step": 76 }, { "epoch": 0.18840579710144928, "grad_norm": 10.033112525939941, "learning_rate": 0.00019388083735909825, "loss": 36.5457, "step": 78 }, { "epoch": 0.1932367149758454, "grad_norm": 14.008647918701172, "learning_rate": 0.00019371980676328502, "loss": 37.3024, "step": 80 }, { "epoch": 0.19806763285024154, "grad_norm": 16.30805778503418, "learning_rate": 0.00019355877616747183, "loss": 33.9588, "step": 82 }, { "epoch": 0.2028985507246377, "grad_norm": 10.949873924255371, "learning_rate": 0.00019339774557165863, "loss": 34.5954, "step": 84 }, { "epoch": 0.20772946859903382, "grad_norm": 13.2377290725708, "learning_rate": 
0.0001932367149758454, "loss": 35.3164, "step": 86 }, { "epoch": 0.21256038647342995, "grad_norm": 16.080217361450195, "learning_rate": 0.0001930756843800322, "loss": 36.9439, "step": 88 }, { "epoch": 0.21739130434782608, "grad_norm": 12.830262184143066, "learning_rate": 0.000192914653784219, "loss": 35.9063, "step": 90 }, { "epoch": 0.2222222222222222, "grad_norm": 12.519986152648926, "learning_rate": 0.0001927536231884058, "loss": 36.0092, "step": 92 }, { "epoch": 0.22705314009661837, "grad_norm": 11.222923278808594, "learning_rate": 0.0001925925925925926, "loss": 35.2874, "step": 94 }, { "epoch": 0.2318840579710145, "grad_norm": 13.27009105682373, "learning_rate": 0.0001924315619967794, "loss": 33.1713, "step": 96 }, { "epoch": 0.23671497584541062, "grad_norm": 10.449563026428223, "learning_rate": 0.0001922705314009662, "loss": 34.836, "step": 98 }, { "epoch": 0.24154589371980675, "grad_norm": 17.162439346313477, "learning_rate": 0.000192109500805153, "loss": 33.4152, "step": 100 }, { "epoch": 0.2463768115942029, "grad_norm": 11.21731185913086, "learning_rate": 0.00019194847020933978, "loss": 33.7839, "step": 102 }, { "epoch": 0.25120772946859904, "grad_norm": 12.32532024383545, "learning_rate": 0.00019178743961352658, "loss": 34.1485, "step": 104 }, { "epoch": 0.2560386473429952, "grad_norm": 10.481746673583984, "learning_rate": 0.0001916264090177134, "loss": 32.3468, "step": 106 }, { "epoch": 0.2608695652173913, "grad_norm": 10.69057846069336, "learning_rate": 0.00019146537842190016, "loss": 33.281, "step": 108 }, { "epoch": 0.26570048309178745, "grad_norm": 14.237508773803711, "learning_rate": 0.00019130434782608697, "loss": 33.0036, "step": 110 }, { "epoch": 0.27053140096618356, "grad_norm": 8.754230499267578, "learning_rate": 0.00019114331723027377, "loss": 34.4538, "step": 112 }, { "epoch": 0.2753623188405797, "grad_norm": 8.595375061035156, "learning_rate": 0.00019098228663446057, "loss": 32.7405, "step": 114 }, { "epoch": 0.28019323671497587, 
"grad_norm": 10.565451622009277, "learning_rate": 0.00019082125603864735, "loss": 32.7687, "step": 116 }, { "epoch": 0.28502415458937197, "grad_norm": 9.513022422790527, "learning_rate": 0.00019066022544283415, "loss": 32.8597, "step": 118 }, { "epoch": 0.2898550724637681, "grad_norm": 12.073749542236328, "learning_rate": 0.00019049919484702096, "loss": 32.9837, "step": 120 }, { "epoch": 0.2946859903381642, "grad_norm": 9.835869789123535, "learning_rate": 0.00019033816425120773, "loss": 33.7884, "step": 122 }, { "epoch": 0.2995169082125604, "grad_norm": 9.995708465576172, "learning_rate": 0.00019017713365539454, "loss": 33.7684, "step": 124 }, { "epoch": 0.30434782608695654, "grad_norm": 9.067010879516602, "learning_rate": 0.00019001610305958134, "loss": 30.8072, "step": 126 }, { "epoch": 0.30917874396135264, "grad_norm": 9.235272407531738, "learning_rate": 0.00018985507246376812, "loss": 32.0888, "step": 128 }, { "epoch": 0.3140096618357488, "grad_norm": 9.046205520629883, "learning_rate": 0.00018969404186795492, "loss": 33.0407, "step": 130 }, { "epoch": 0.3188405797101449, "grad_norm": 9.927671432495117, "learning_rate": 0.00018953301127214172, "loss": 32.0351, "step": 132 }, { "epoch": 0.32367149758454106, "grad_norm": 10.035076141357422, "learning_rate": 0.0001893719806763285, "loss": 32.5972, "step": 134 }, { "epoch": 0.3285024154589372, "grad_norm": 10.489717483520508, "learning_rate": 0.0001892109500805153, "loss": 31.3804, "step": 136 }, { "epoch": 0.3333333333333333, "grad_norm": 13.48115348815918, "learning_rate": 0.0001890499194847021, "loss": 32.5356, "step": 138 }, { "epoch": 0.33816425120772947, "grad_norm": 8.694147109985352, "learning_rate": 0.00018888888888888888, "loss": 32.9306, "step": 140 }, { "epoch": 0.34299516908212563, "grad_norm": 8.273658752441406, "learning_rate": 0.0001887278582930757, "loss": 32.2116, "step": 142 }, { "epoch": 0.34782608695652173, "grad_norm": 10.635282516479492, "learning_rate": 0.0001885668276972625, "loss": 
30.2346, "step": 144 }, { "epoch": 0.3526570048309179, "grad_norm": 9.83012866973877, "learning_rate": 0.00018840579710144927, "loss": 32.6259, "step": 146 }, { "epoch": 0.357487922705314, "grad_norm": 12.415063858032227, "learning_rate": 0.00018824476650563607, "loss": 32.0993, "step": 148 }, { "epoch": 0.36231884057971014, "grad_norm": 11.103983879089355, "learning_rate": 0.00018808373590982287, "loss": 31.7034, "step": 150 }, { "epoch": 0.3671497584541063, "grad_norm": 15.64197826385498, "learning_rate": 0.00018792270531400965, "loss": 31.6066, "step": 152 }, { "epoch": 0.3719806763285024, "grad_norm": 10.493355751037598, "learning_rate": 0.00018776167471819645, "loss": 29.1212, "step": 154 }, { "epoch": 0.37681159420289856, "grad_norm": 9.921483993530273, "learning_rate": 0.00018760064412238326, "loss": 31.0883, "step": 156 }, { "epoch": 0.38164251207729466, "grad_norm": 9.639843940734863, "learning_rate": 0.00018743961352657006, "loss": 29.4677, "step": 158 }, { "epoch": 0.3864734299516908, "grad_norm": 13.891378402709961, "learning_rate": 0.00018727858293075687, "loss": 31.5877, "step": 160 }, { "epoch": 0.391304347826087, "grad_norm": 10.116133689880371, "learning_rate": 0.00018711755233494367, "loss": 29.9048, "step": 162 }, { "epoch": 0.3961352657004831, "grad_norm": 7.1683173179626465, "learning_rate": 0.00018695652173913045, "loss": 30.6905, "step": 164 }, { "epoch": 0.40096618357487923, "grad_norm": 11.81785774230957, "learning_rate": 0.00018679549114331725, "loss": 32.2774, "step": 166 }, { "epoch": 0.4057971014492754, "grad_norm": 7.2079925537109375, "learning_rate": 0.00018663446054750405, "loss": 31.3603, "step": 168 }, { "epoch": 0.4106280193236715, "grad_norm": 10.22714900970459, "learning_rate": 0.00018647342995169083, "loss": 29.7549, "step": 170 }, { "epoch": 0.41545893719806765, "grad_norm": 8.35627269744873, "learning_rate": 0.00018631239935587763, "loss": 31.7175, "step": 172 }, { "epoch": 0.42028985507246375, "grad_norm": 8.98567008972168, 
"learning_rate": 0.00018615136876006444, "loss": 31.4399, "step": 174 }, { "epoch": 0.4251207729468599, "grad_norm": 10.814435958862305, "learning_rate": 0.0001859903381642512, "loss": 30.4269, "step": 176 }, { "epoch": 0.42995169082125606, "grad_norm": 9.445025444030762, "learning_rate": 0.00018582930756843802, "loss": 29.7986, "step": 178 }, { "epoch": 0.43478260869565216, "grad_norm": 11.667193412780762, "learning_rate": 0.00018566827697262482, "loss": 28.7911, "step": 180 }, { "epoch": 0.4396135265700483, "grad_norm": 8.154279708862305, "learning_rate": 0.0001855072463768116, "loss": 32.8533, "step": 182 }, { "epoch": 0.4444444444444444, "grad_norm": 9.40849781036377, "learning_rate": 0.0001853462157809984, "loss": 30.3062, "step": 184 }, { "epoch": 0.4492753623188406, "grad_norm": 9.476842880249023, "learning_rate": 0.0001851851851851852, "loss": 29.9025, "step": 186 }, { "epoch": 0.45410628019323673, "grad_norm": 9.150154113769531, "learning_rate": 0.00018502415458937198, "loss": 31.5757, "step": 188 }, { "epoch": 0.45893719806763283, "grad_norm": 8.072809219360352, "learning_rate": 0.00018486312399355878, "loss": 32.186, "step": 190 }, { "epoch": 0.463768115942029, "grad_norm": 11.970826148986816, "learning_rate": 0.0001847020933977456, "loss": 29.3784, "step": 192 }, { "epoch": 0.46859903381642515, "grad_norm": 11.011039733886719, "learning_rate": 0.00018454106280193236, "loss": 28.1951, "step": 194 }, { "epoch": 0.47342995169082125, "grad_norm": 10.958206176757812, "learning_rate": 0.00018438003220611917, "loss": 31.0484, "step": 196 }, { "epoch": 0.4782608695652174, "grad_norm": 9.812915802001953, "learning_rate": 0.00018421900161030597, "loss": 31.784, "step": 198 }, { "epoch": 0.4830917874396135, "grad_norm": 11.235363960266113, "learning_rate": 0.00018405797101449275, "loss": 31.3269, "step": 200 }, { "epoch": 0.48792270531400966, "grad_norm": 14.048873901367188, "learning_rate": 0.00018389694041867955, "loss": 31.0424, "step": 202 }, { "epoch": 
0.4927536231884058, "grad_norm": 9.81869125366211, "learning_rate": 0.00018373590982286635, "loss": 32.6147, "step": 204 }, { "epoch": 0.4975845410628019, "grad_norm": 8.801289558410645, "learning_rate": 0.00018357487922705313, "loss": 30.4718, "step": 206 }, { "epoch": 0.5024154589371981, "grad_norm": 7.1190385818481445, "learning_rate": 0.00018341384863123993, "loss": 29.2369, "step": 208 }, { "epoch": 0.5072463768115942, "grad_norm": 8.437512397766113, "learning_rate": 0.00018325281803542674, "loss": 30.7868, "step": 210 }, { "epoch": 0.5120772946859904, "grad_norm": 6.539140224456787, "learning_rate": 0.0001830917874396135, "loss": 29.9767, "step": 212 }, { "epoch": 0.5169082125603864, "grad_norm": 9.160558700561523, "learning_rate": 0.00018293075684380032, "loss": 29.9602, "step": 214 }, { "epoch": 0.5217391304347826, "grad_norm": 7.8765177726745605, "learning_rate": 0.00018276972624798712, "loss": 30.8873, "step": 216 }, { "epoch": 0.5265700483091788, "grad_norm": 8.778061866760254, "learning_rate": 0.00018260869565217392, "loss": 30.1888, "step": 218 }, { "epoch": 0.5314009661835749, "grad_norm": 8.268914222717285, "learning_rate": 0.00018244766505636073, "loss": 31.7208, "step": 220 }, { "epoch": 0.5362318840579711, "grad_norm": 8.659038543701172, "learning_rate": 0.00018228663446054753, "loss": 31.3643, "step": 222 }, { "epoch": 0.5410628019323671, "grad_norm": 9.013368606567383, "learning_rate": 0.0001821256038647343, "loss": 30.1504, "step": 224 }, { "epoch": 0.5458937198067633, "grad_norm": 9.309354782104492, "learning_rate": 0.0001819645732689211, "loss": 30.2107, "step": 226 }, { "epoch": 0.5507246376811594, "grad_norm": 7.953092098236084, "learning_rate": 0.00018180354267310791, "loss": 29.3963, "step": 228 }, { "epoch": 0.5555555555555556, "grad_norm": 9.035888671875, "learning_rate": 0.0001816425120772947, "loss": 31.3732, "step": 230 }, { "epoch": 0.5603864734299517, "grad_norm": 10.098958969116211, "learning_rate": 0.0001814814814814815, "loss": 
30.0866, "step": 232 }, { "epoch": 0.5652173913043478, "grad_norm": 9.308027267456055, "learning_rate": 0.0001813204508856683, "loss": 31.168, "step": 234 }, { "epoch": 0.5700483091787439, "grad_norm": 10.684345245361328, "learning_rate": 0.00018115942028985507, "loss": 30.3999, "step": 236 }, { "epoch": 0.5748792270531401, "grad_norm": 8.09032917022705, "learning_rate": 0.00018099838969404188, "loss": 31.6167, "step": 238 }, { "epoch": 0.5797101449275363, "grad_norm": 7.366332530975342, "learning_rate": 0.00018083735909822868, "loss": 30.2593, "step": 240 }, { "epoch": 0.5845410628019324, "grad_norm": 7.711369514465332, "learning_rate": 0.00018067632850241546, "loss": 28.9501, "step": 242 }, { "epoch": 0.5893719806763285, "grad_norm": 7.934360504150391, "learning_rate": 0.00018051529790660226, "loss": 28.5365, "step": 244 }, { "epoch": 0.5942028985507246, "grad_norm": 8.121601104736328, "learning_rate": 0.00018035426731078907, "loss": 29.2618, "step": 246 }, { "epoch": 0.5990338164251208, "grad_norm": 7.918673038482666, "learning_rate": 0.00018019323671497584, "loss": 30.0373, "step": 248 }, { "epoch": 0.6038647342995169, "grad_norm": 11.193553924560547, "learning_rate": 0.00018003220611916265, "loss": 31.3798, "step": 250 }, { "epoch": 0.6086956521739131, "grad_norm": 9.393643379211426, "learning_rate": 0.00017987117552334945, "loss": 31.2223, "step": 252 }, { "epoch": 0.6135265700483091, "grad_norm": 8.13814926147461, "learning_rate": 0.00017971014492753625, "loss": 30.5097, "step": 254 }, { "epoch": 0.6183574879227053, "grad_norm": 8.290206909179688, "learning_rate": 0.00017954911433172303, "loss": 27.856, "step": 256 }, { "epoch": 0.6231884057971014, "grad_norm": 9.917459487915039, "learning_rate": 0.00017938808373590983, "loss": 28.664, "step": 258 }, { "epoch": 0.6280193236714976, "grad_norm": 10.206878662109375, "learning_rate": 0.00017922705314009664, "loss": 31.3406, "step": 260 }, { "epoch": 0.6328502415458938, "grad_norm": 9.776812553405762, 
"learning_rate": 0.0001790660225442834, "loss": 30.5843, "step": 262 }, { "epoch": 0.6376811594202898, "grad_norm": 10.508336067199707, "learning_rate": 0.00017890499194847022, "loss": 30.3617, "step": 264 }, { "epoch": 0.642512077294686, "grad_norm": 9.144083976745605, "learning_rate": 0.00017874396135265702, "loss": 30.0566, "step": 266 }, { "epoch": 0.6473429951690821, "grad_norm": 9.019740104675293, "learning_rate": 0.0001785829307568438, "loss": 30.1384, "step": 268 }, { "epoch": 0.6521739130434783, "grad_norm": 9.140926361083984, "learning_rate": 0.0001784219001610306, "loss": 28.6601, "step": 270 }, { "epoch": 0.6570048309178744, "grad_norm": 9.820598602294922, "learning_rate": 0.0001782608695652174, "loss": 30.1565, "step": 272 }, { "epoch": 0.6618357487922706, "grad_norm": 9.670087814331055, "learning_rate": 0.00017809983896940418, "loss": 30.156, "step": 274 }, { "epoch": 0.6666666666666666, "grad_norm": 8.119627952575684, "learning_rate": 0.00017793880837359098, "loss": 29.5374, "step": 276 }, { "epoch": 0.6714975845410628, "grad_norm": 8.52702522277832, "learning_rate": 0.00017777777777777779, "loss": 30.2013, "step": 278 }, { "epoch": 0.6763285024154589, "grad_norm": 8.241043090820312, "learning_rate": 0.00017761674718196456, "loss": 29.2284, "step": 280 }, { "epoch": 0.6811594202898551, "grad_norm": 9.305002212524414, "learning_rate": 0.0001774557165861514, "loss": 30.8417, "step": 282 }, { "epoch": 0.6859903381642513, "grad_norm": 8.483264923095703, "learning_rate": 0.00017729468599033817, "loss": 28.3121, "step": 284 }, { "epoch": 0.6908212560386473, "grad_norm": 8.674230575561523, "learning_rate": 0.00017713365539452497, "loss": 30.4354, "step": 286 }, { "epoch": 0.6956521739130435, "grad_norm": 8.816984176635742, "learning_rate": 0.00017697262479871178, "loss": 29.0581, "step": 288 }, { "epoch": 0.7004830917874396, "grad_norm": 8.081759452819824, "learning_rate": 0.00017681159420289858, "loss": 31.0066, "step": 290 }, { "epoch": 
0.7053140096618358, "grad_norm": 10.987712860107422, "learning_rate": 0.00017665056360708536, "loss": 28.4036, "step": 292 }, { "epoch": 0.7101449275362319, "grad_norm": 9.358428955078125, "learning_rate": 0.00017648953301127216, "loss": 31.733, "step": 294 }, { "epoch": 0.714975845410628, "grad_norm": 9.714231491088867, "learning_rate": 0.00017632850241545896, "loss": 28.3295, "step": 296 }, { "epoch": 0.7198067632850241, "grad_norm": 10.079188346862793, "learning_rate": 0.00017616747181964574, "loss": 29.7857, "step": 298 }, { "epoch": 0.7246376811594203, "grad_norm": 10.379854202270508, "learning_rate": 0.00017600644122383254, "loss": 28.7091, "step": 300 }, { "epoch": 0.7294685990338164, "grad_norm": 9.6157808303833, "learning_rate": 0.00017584541062801935, "loss": 30.0664, "step": 302 }, { "epoch": 0.7342995169082126, "grad_norm": 9.851590156555176, "learning_rate": 0.00017568438003220612, "loss": 30.5656, "step": 304 }, { "epoch": 0.7391304347826086, "grad_norm": 9.500916481018066, "learning_rate": 0.00017552334943639293, "loss": 28.8709, "step": 306 }, { "epoch": 0.7439613526570048, "grad_norm": 9.999371528625488, "learning_rate": 0.00017536231884057973, "loss": 29.5298, "step": 308 }, { "epoch": 0.748792270531401, "grad_norm": 8.5446195602417, "learning_rate": 0.0001752012882447665, "loss": 29.5134, "step": 310 }, { "epoch": 0.7536231884057971, "grad_norm": 9.369108200073242, "learning_rate": 0.0001750402576489533, "loss": 30.5918, "step": 312 }, { "epoch": 0.7584541062801933, "grad_norm": 9.66053581237793, "learning_rate": 0.00017487922705314011, "loss": 29.5254, "step": 314 }, { "epoch": 0.7632850241545893, "grad_norm": 9.699007034301758, "learning_rate": 0.0001747181964573269, "loss": 29.297, "step": 316 }, { "epoch": 0.7681159420289855, "grad_norm": 6.751578330993652, "learning_rate": 0.0001745571658615137, "loss": 27.781, "step": 318 }, { "epoch": 0.7729468599033816, "grad_norm": 8.00158977508545, "learning_rate": 0.0001743961352657005, "loss": 
28.7844, "step": 320 }, { "epoch": 0.7777777777777778, "grad_norm": 11.203788757324219, "learning_rate": 0.00017423510466988727, "loss": 29.4509, "step": 322 }, { "epoch": 0.782608695652174, "grad_norm": 11.6134033203125, "learning_rate": 0.00017407407407407408, "loss": 28.4847, "step": 324 }, { "epoch": 0.7874396135265701, "grad_norm": 8.184885025024414, "learning_rate": 0.00017391304347826088, "loss": 30.9662, "step": 326 }, { "epoch": 0.7922705314009661, "grad_norm": 7.118366241455078, "learning_rate": 0.00017375201288244766, "loss": 30.3931, "step": 328 }, { "epoch": 0.7971014492753623, "grad_norm": 8.378530502319336, "learning_rate": 0.00017359098228663446, "loss": 28.4438, "step": 330 }, { "epoch": 0.8019323671497585, "grad_norm": 8.539013862609863, "learning_rate": 0.00017342995169082126, "loss": 30.293, "step": 332 }, { "epoch": 0.8067632850241546, "grad_norm": 10.329437255859375, "learning_rate": 0.00017326892109500804, "loss": 28.9439, "step": 334 }, { "epoch": 0.8115942028985508, "grad_norm": 7.267086982727051, "learning_rate": 0.00017310789049919484, "loss": 30.0624, "step": 336 }, { "epoch": 0.8164251207729468, "grad_norm": 10.7781400680542, "learning_rate": 0.00017294685990338165, "loss": 30.338, "step": 338 }, { "epoch": 0.821256038647343, "grad_norm": 11.077190399169922, "learning_rate": 0.00017278582930756842, "loss": 29.5852, "step": 340 }, { "epoch": 0.8260869565217391, "grad_norm": 9.007209777832031, "learning_rate": 0.00017262479871175523, "loss": 29.4697, "step": 342 }, { "epoch": 0.8309178743961353, "grad_norm": 8.706661224365234, "learning_rate": 0.00017246376811594206, "loss": 29.4373, "step": 344 }, { "epoch": 0.8357487922705314, "grad_norm": 8.104077339172363, "learning_rate": 0.00017230273752012884, "loss": 31.1459, "step": 346 }, { "epoch": 0.8405797101449275, "grad_norm": 8.499916076660156, "learning_rate": 0.00017214170692431564, "loss": 29.6039, "step": 348 }, { "epoch": 0.8454106280193237, "grad_norm": 9.886308670043945, 
"learning_rate": 0.00017198067632850244, "loss": 28.4998, "step": 350 }, { "epoch": 0.8502415458937198, "grad_norm": 6.680812835693359, "learning_rate": 0.00017181964573268922, "loss": 29.2374, "step": 352 }, { "epoch": 0.855072463768116, "grad_norm": 7.037901401519775, "learning_rate": 0.00017165861513687602, "loss": 29.3442, "step": 354 }, { "epoch": 0.8599033816425121, "grad_norm": 9.425200462341309, "learning_rate": 0.00017149758454106283, "loss": 28.2594, "step": 356 }, { "epoch": 0.8647342995169082, "grad_norm": 10.08089828491211, "learning_rate": 0.0001713365539452496, "loss": 29.583, "step": 358 }, { "epoch": 0.8695652173913043, "grad_norm": 8.83069133758545, "learning_rate": 0.0001711755233494364, "loss": 28.8106, "step": 360 }, { "epoch": 0.8743961352657005, "grad_norm": 12.723852157592773, "learning_rate": 0.0001710144927536232, "loss": 28.6247, "step": 362 }, { "epoch": 0.8792270531400966, "grad_norm": 7.244641304016113, "learning_rate": 0.00017085346215780999, "loss": 28.2318, "step": 364 }, { "epoch": 0.8840579710144928, "grad_norm": 10.645294189453125, "learning_rate": 0.0001706924315619968, "loss": 30.3984, "step": 366 }, { "epoch": 0.8888888888888888, "grad_norm": 8.675403594970703, "learning_rate": 0.0001705314009661836, "loss": 29.51, "step": 368 }, { "epoch": 0.893719806763285, "grad_norm": 9.324760437011719, "learning_rate": 0.00017037037037037037, "loss": 29.173, "step": 370 }, { "epoch": 0.8985507246376812, "grad_norm": 8.37873363494873, "learning_rate": 0.00017020933977455717, "loss": 28.2663, "step": 372 }, { "epoch": 0.9033816425120773, "grad_norm": 7.841792583465576, "learning_rate": 0.00017004830917874398, "loss": 30.4364, "step": 374 }, { "epoch": 0.9082125603864735, "grad_norm": 9.046091079711914, "learning_rate": 0.00016988727858293075, "loss": 30.9454, "step": 376 }, { "epoch": 0.9130434782608695, "grad_norm": 8.812469482421875, "learning_rate": 0.00016972624798711756, "loss": 26.7434, "step": 378 }, { "epoch": 0.9178743961352657, 
"grad_norm": 6.815216541290283, "learning_rate": 0.00016956521739130436, "loss": 30.5352, "step": 380 }, { "epoch": 0.9227053140096618, "grad_norm": 9.451848983764648, "learning_rate": 0.00016940418679549114, "loss": 29.4241, "step": 382 }, { "epoch": 0.927536231884058, "grad_norm": 8.97130298614502, "learning_rate": 0.00016924315619967794, "loss": 28.7862, "step": 384 }, { "epoch": 0.9323671497584541, "grad_norm": 7.6972975730896, "learning_rate": 0.00016908212560386474, "loss": 29.5439, "step": 386 }, { "epoch": 0.9371980676328503, "grad_norm": 7.955355167388916, "learning_rate": 0.00016892109500805152, "loss": 29.0917, "step": 388 }, { "epoch": 0.9420289855072463, "grad_norm": 9.80173397064209, "learning_rate": 0.00016876006441223832, "loss": 26.3015, "step": 390 }, { "epoch": 0.9468599033816425, "grad_norm": 9.457799911499023, "learning_rate": 0.00016859903381642513, "loss": 27.8037, "step": 392 }, { "epoch": 0.9516908212560387, "grad_norm": 7.2435173988342285, "learning_rate": 0.00016843800322061193, "loss": 28.3055, "step": 394 }, { "epoch": 0.9565217391304348, "grad_norm": 8.652717590332031, "learning_rate": 0.0001682769726247987, "loss": 29.4198, "step": 396 }, { "epoch": 0.961352657004831, "grad_norm": 11.697986602783203, "learning_rate": 0.0001681159420289855, "loss": 29.3374, "step": 398 }, { "epoch": 0.966183574879227, "grad_norm": 9.140453338623047, "learning_rate": 0.00016795491143317231, "loss": 28.7595, "step": 400 }, { "epoch": 0.9710144927536232, "grad_norm": 8.438916206359863, "learning_rate": 0.0001677938808373591, "loss": 27.0845, "step": 402 }, { "epoch": 0.9758454106280193, "grad_norm": 9.950366973876953, "learning_rate": 0.0001676328502415459, "loss": 28.7205, "step": 404 }, { "epoch": 0.9806763285024155, "grad_norm": 7.97797155380249, "learning_rate": 0.0001674718196457327, "loss": 30.1159, "step": 406 }, { "epoch": 0.9855072463768116, "grad_norm": 7.832582950592041, "learning_rate": 0.0001673107890499195, "loss": 29.9818, "step": 408 }, { 
"epoch": 0.9903381642512077, "grad_norm": 9.50314998626709, "learning_rate": 0.0001671497584541063, "loss": 26.6032, "step": 410 }, { "epoch": 0.9951690821256038, "grad_norm": 10.015514373779297, "learning_rate": 0.00016698872785829308, "loss": 30.9247, "step": 412 }, { "epoch": 1.0, "grad_norm": 8.032495498657227, "learning_rate": 0.00016682769726247988, "loss": 29.0615, "step": 414 }, { "epoch": 1.0048309178743962, "grad_norm": 7.304556846618652, "learning_rate": 0.0001666666666666667, "loss": 29.5321, "step": 416 }, { "epoch": 1.0096618357487923, "grad_norm": 7.575999736785889, "learning_rate": 0.00016650563607085346, "loss": 26.892, "step": 418 }, { "epoch": 1.0144927536231885, "grad_norm": 9.361268043518066, "learning_rate": 0.00016634460547504027, "loss": 28.0997, "step": 420 }, { "epoch": 1.0193236714975846, "grad_norm": 8.8648099899292, "learning_rate": 0.00016618357487922707, "loss": 28.1403, "step": 422 }, { "epoch": 1.0241545893719808, "grad_norm": 11.204512596130371, "learning_rate": 0.00016602254428341385, "loss": 29.3477, "step": 424 }, { "epoch": 1.0289855072463767, "grad_norm": 8.673910140991211, "learning_rate": 0.00016586151368760065, "loss": 29.9338, "step": 426 }, { "epoch": 1.0338164251207729, "grad_norm": 10.797616958618164, "learning_rate": 0.00016570048309178746, "loss": 28.3507, "step": 428 }, { "epoch": 1.038647342995169, "grad_norm": 9.084686279296875, "learning_rate": 0.00016553945249597426, "loss": 27.9469, "step": 430 }, { "epoch": 1.0434782608695652, "grad_norm": 9.642114639282227, "learning_rate": 0.00016537842190016104, "loss": 29.2968, "step": 432 }, { "epoch": 1.0483091787439613, "grad_norm": 8.333573341369629, "learning_rate": 0.00016521739130434784, "loss": 27.1096, "step": 434 }, { "epoch": 1.0531400966183575, "grad_norm": 10.562450408935547, "learning_rate": 0.00016505636070853464, "loss": 30.1957, "step": 436 }, { "epoch": 1.0579710144927537, "grad_norm": 7.98309326171875, "learning_rate": 0.00016489533011272142, "loss": 
29.5, "step": 438 }, { "epoch": 1.0628019323671498, "grad_norm": 7.789132595062256, "learning_rate": 0.00016473429951690822, "loss": 29.6741, "step": 440 }, { "epoch": 1.067632850241546, "grad_norm": 8.362640380859375, "learning_rate": 0.00016457326892109503, "loss": 29.4575, "step": 442 }, { "epoch": 1.0724637681159421, "grad_norm": 7.407423973083496, "learning_rate": 0.0001644122383252818, "loss": 26.9575, "step": 444 }, { "epoch": 1.077294685990338, "grad_norm": 8.499112129211426, "learning_rate": 0.0001642512077294686, "loss": 29.4598, "step": 446 }, { "epoch": 1.0821256038647342, "grad_norm": 8.675498008728027, "learning_rate": 0.0001640901771336554, "loss": 26.3951, "step": 448 }, { "epoch": 1.0869565217391304, "grad_norm": 9.390106201171875, "learning_rate": 0.00016392914653784219, "loss": 27.4861, "step": 450 }, { "epoch": 1.0917874396135265, "grad_norm": 8.66092586517334, "learning_rate": 0.000163768115942029, "loss": 27.5384, "step": 452 }, { "epoch": 1.0966183574879227, "grad_norm": 9.866594314575195, "learning_rate": 0.0001636070853462158, "loss": 28.5237, "step": 454 }, { "epoch": 1.1014492753623188, "grad_norm": 8.653681755065918, "learning_rate": 0.00016344605475040257, "loss": 28.2452, "step": 456 }, { "epoch": 1.106280193236715, "grad_norm": 7.964409351348877, "learning_rate": 0.00016328502415458937, "loss": 27.3373, "step": 458 }, { "epoch": 1.1111111111111112, "grad_norm": 6.7314863204956055, "learning_rate": 0.00016312399355877618, "loss": 27.628, "step": 460 }, { "epoch": 1.1159420289855073, "grad_norm": 8.670600891113281, "learning_rate": 0.00016296296296296295, "loss": 28.4199, "step": 462 }, { "epoch": 1.1207729468599035, "grad_norm": 8.304594993591309, "learning_rate": 0.00016280193236714976, "loss": 28.3439, "step": 464 }, { "epoch": 1.1256038647342996, "grad_norm": 8.142372131347656, "learning_rate": 0.00016264090177133656, "loss": 29.8356, "step": 466 }, { "epoch": 1.1304347826086956, "grad_norm": 9.617864608764648, "learning_rate": 
0.00016247987117552336, "loss": 26.8729, "step": 468 }, { "epoch": 1.1352657004830917, "grad_norm": 11.739964485168457, "learning_rate": 0.00016231884057971017, "loss": 27.5099, "step": 470 }, { "epoch": 1.1400966183574879, "grad_norm": 9.0482759475708, "learning_rate": 0.00016215780998389697, "loss": 25.6606, "step": 472 }, { "epoch": 1.144927536231884, "grad_norm": 7.055074214935303, "learning_rate": 0.00016199677938808375, "loss": 27.0075, "step": 474 }, { "epoch": 1.1497584541062802, "grad_norm": 9.319602012634277, "learning_rate": 0.00016183574879227055, "loss": 30.2885, "step": 476 }, { "epoch": 1.1545893719806763, "grad_norm": 9.021683692932129, "learning_rate": 0.00016167471819645735, "loss": 28.8099, "step": 478 }, { "epoch": 1.1594202898550725, "grad_norm": 6.554941177368164, "learning_rate": 0.00016151368760064413, "loss": 28.2995, "step": 480 }, { "epoch": 1.1642512077294687, "grad_norm": 7.542542934417725, "learning_rate": 0.00016135265700483093, "loss": 28.35, "step": 482 }, { "epoch": 1.1690821256038648, "grad_norm": 12.053621292114258, "learning_rate": 0.00016119162640901774, "loss": 30.3838, "step": 484 }, { "epoch": 1.1739130434782608, "grad_norm": 8.615163803100586, "learning_rate": 0.00016103059581320451, "loss": 27.872, "step": 486 }, { "epoch": 1.178743961352657, "grad_norm": 10.88862419128418, "learning_rate": 0.00016086956521739132, "loss": 26.9656, "step": 488 }, { "epoch": 1.183574879227053, "grad_norm": 9.35364818572998, "learning_rate": 0.00016070853462157812, "loss": 27.791, "step": 490 }, { "epoch": 1.1884057971014492, "grad_norm": 8.610274314880371, "learning_rate": 0.0001605475040257649, "loss": 27.1357, "step": 492 }, { "epoch": 1.1932367149758454, "grad_norm": 8.759892463684082, "learning_rate": 0.0001603864734299517, "loss": 27.035, "step": 494 }, { "epoch": 1.1980676328502415, "grad_norm": 10.015132904052734, "learning_rate": 0.0001602254428341385, "loss": 28.5965, "step": 496 }, { "epoch": 1.2028985507246377, "grad_norm": 
9.121025085449219, "learning_rate": 0.00016006441223832528, "loss": 28.5107, "step": 498 }, { "epoch": 1.2077294685990339, "grad_norm": 9.401590347290039, "learning_rate": 0.00015990338164251208, "loss": 28.5304, "step": 500 }, { "epoch": 1.21256038647343, "grad_norm": 8.708001136779785, "learning_rate": 0.0001597423510466989, "loss": 29.9226, "step": 502 }, { "epoch": 1.2173913043478262, "grad_norm": 9.344232559204102, "learning_rate": 0.00015958132045088566, "loss": 27.7489, "step": 504 }, { "epoch": 1.2222222222222223, "grad_norm": 7.874361991882324, "learning_rate": 0.00015942028985507247, "loss": 29.2094, "step": 506 }, { "epoch": 1.2270531400966185, "grad_norm": 9.35866928100586, "learning_rate": 0.00015925925925925927, "loss": 28.5889, "step": 508 }, { "epoch": 1.2318840579710144, "grad_norm": 9.740680694580078, "learning_rate": 0.00015909822866344605, "loss": 29.5747, "step": 510 }, { "epoch": 1.2367149758454106, "grad_norm": 7.713297367095947, "learning_rate": 0.00015893719806763285, "loss": 28.9137, "step": 512 }, { "epoch": 1.2415458937198067, "grad_norm": 8.05880355834961, "learning_rate": 0.00015877616747181965, "loss": 29.4705, "step": 514 }, { "epoch": 1.2463768115942029, "grad_norm": 8.30479621887207, "learning_rate": 0.00015861513687600643, "loss": 29.1907, "step": 516 }, { "epoch": 1.251207729468599, "grad_norm": 10.590409278869629, "learning_rate": 0.00015845410628019323, "loss": 27.9428, "step": 518 }, { "epoch": 1.2560386473429952, "grad_norm": 8.054545402526855, "learning_rate": 0.00015829307568438004, "loss": 28.4886, "step": 520 }, { "epoch": 1.2608695652173914, "grad_norm": 8.148458480834961, "learning_rate": 0.00015813204508856681, "loss": 27.5395, "step": 522 }, { "epoch": 1.2657004830917875, "grad_norm": 8.747846603393555, "learning_rate": 0.00015797101449275362, "loss": 29.784, "step": 524 }, { "epoch": 1.2705314009661834, "grad_norm": 8.56131362915039, "learning_rate": 0.00015780998389694042, "loss": 26.8683, "step": 526 }, { "epoch": 
1.2753623188405796, "grad_norm": 7.3210883140563965, "learning_rate": 0.0001576489533011272, "loss": 28.6462, "step": 528 }, { "epoch": 1.2801932367149758, "grad_norm": 7.494152545928955, "learning_rate": 0.00015748792270531403, "loss": 28.5302, "step": 530 }, { "epoch": 1.285024154589372, "grad_norm": 8.267993927001953, "learning_rate": 0.00015732689210950083, "loss": 28.3822, "step": 532 }, { "epoch": 1.289855072463768, "grad_norm": 9.768172264099121, "learning_rate": 0.0001571658615136876, "loss": 27.6898, "step": 534 }, { "epoch": 1.2946859903381642, "grad_norm": 6.865130424499512, "learning_rate": 0.0001570048309178744, "loss": 28.2198, "step": 536 }, { "epoch": 1.2995169082125604, "grad_norm": 8.628961563110352, "learning_rate": 0.00015684380032206122, "loss": 26.5476, "step": 538 }, { "epoch": 1.3043478260869565, "grad_norm": 9.150886535644531, "learning_rate": 0.000156682769726248, "loss": 29.4819, "step": 540 }, { "epoch": 1.3091787439613527, "grad_norm": 8.535932540893555, "learning_rate": 0.0001565217391304348, "loss": 28.8612, "step": 542 }, { "epoch": 1.3140096618357489, "grad_norm": 8.818495750427246, "learning_rate": 0.0001563607085346216, "loss": 29.2101, "step": 544 }, { "epoch": 1.318840579710145, "grad_norm": 8.080242156982422, "learning_rate": 0.00015619967793880838, "loss": 26.2693, "step": 546 }, { "epoch": 1.3236714975845412, "grad_norm": 7.340477466583252, "learning_rate": 0.00015603864734299518, "loss": 28.7229, "step": 548 }, { "epoch": 1.3285024154589373, "grad_norm": 7.5151047706604, "learning_rate": 0.00015587761674718198, "loss": 28.9171, "step": 550 }, { "epoch": 1.3333333333333333, "grad_norm": 8.710932731628418, "learning_rate": 0.00015571658615136876, "loss": 27.4353, "step": 552 }, { "epoch": 1.3381642512077294, "grad_norm": 8.146522521972656, "learning_rate": 0.00015555555555555556, "loss": 27.0576, "step": 554 }, { "epoch": 1.3429951690821256, "grad_norm": 9.677267074584961, "learning_rate": 0.00015539452495974237, "loss": 
27.569, "step": 556 }, { "epoch": 1.3478260869565217, "grad_norm": 8.272392272949219, "learning_rate": 0.00015523349436392914, "loss": 27.5188, "step": 558 }, { "epoch": 1.3526570048309179, "grad_norm": 9.684012413024902, "learning_rate": 0.00015507246376811595, "loss": 29.1665, "step": 560 }, { "epoch": 1.357487922705314, "grad_norm": 12.55364990234375, "learning_rate": 0.00015491143317230275, "loss": 28.0156, "step": 562 }, { "epoch": 1.3623188405797102, "grad_norm": 8.099139213562012, "learning_rate": 0.00015475040257648953, "loss": 28.706, "step": 564 }, { "epoch": 1.3671497584541064, "grad_norm": 9.807384490966797, "learning_rate": 0.00015458937198067633, "loss": 28.5265, "step": 566 }, { "epoch": 1.3719806763285023, "grad_norm": 9.85666275024414, "learning_rate": 0.00015442834138486313, "loss": 27.7005, "step": 568 }, { "epoch": 1.3768115942028984, "grad_norm": 7.128468990325928, "learning_rate": 0.00015426731078904994, "loss": 27.5816, "step": 570 }, { "epoch": 1.3816425120772946, "grad_norm": 8.653708457946777, "learning_rate": 0.0001541062801932367, "loss": 28.4573, "step": 572 }, { "epoch": 1.3864734299516908, "grad_norm": 7.988314151763916, "learning_rate": 0.00015394524959742352, "loss": 27.9508, "step": 574 }, { "epoch": 1.391304347826087, "grad_norm": 10.148573875427246, "learning_rate": 0.00015378421900161032, "loss": 28.0399, "step": 576 }, { "epoch": 1.396135265700483, "grad_norm": 8.33492660522461, "learning_rate": 0.0001536231884057971, "loss": 29.6252, "step": 578 }, { "epoch": 1.4009661835748792, "grad_norm": 10.362284660339355, "learning_rate": 0.0001534621578099839, "loss": 28.2634, "step": 580 }, { "epoch": 1.4057971014492754, "grad_norm": 11.445610046386719, "learning_rate": 0.0001533011272141707, "loss": 27.94, "step": 582 }, { "epoch": 1.4106280193236715, "grad_norm": 5.8856916427612305, "learning_rate": 0.00015314009661835748, "loss": 28.6919, "step": 584 }, { "epoch": 1.4154589371980677, "grad_norm": 8.040237426757812, "learning_rate": 
0.00015297906602254428, "loss": 27.4428, "step": 586 }, { "epoch": 1.4202898550724639, "grad_norm": 8.459455490112305, "learning_rate": 0.0001528180354267311, "loss": 28.883, "step": 588 }, { "epoch": 1.42512077294686, "grad_norm": 9.862092971801758, "learning_rate": 0.00015265700483091786, "loss": 27.5396, "step": 590 }, { "epoch": 1.4299516908212562, "grad_norm": 9.240866661071777, "learning_rate": 0.0001524959742351047, "loss": 28.5589, "step": 592 }, { "epoch": 1.434782608695652, "grad_norm": 8.943296432495117, "learning_rate": 0.00015233494363929147, "loss": 28.6255, "step": 594 }, { "epoch": 1.4396135265700483, "grad_norm": 9.087813377380371, "learning_rate": 0.00015217391304347827, "loss": 29.508, "step": 596 }, { "epoch": 1.4444444444444444, "grad_norm": 8.143028259277344, "learning_rate": 0.00015201288244766508, "loss": 27.9508, "step": 598 }, { "epoch": 1.4492753623188406, "grad_norm": 8.073821067810059, "learning_rate": 0.00015185185185185185, "loss": 28.4479, "step": 600 }, { "epoch": 1.4541062801932367, "grad_norm": 7.678289413452148, "learning_rate": 0.00015169082125603866, "loss": 28.75, "step": 602 }, { "epoch": 1.458937198067633, "grad_norm": 7.962745189666748, "learning_rate": 0.00015152979066022546, "loss": 28.074, "step": 604 }, { "epoch": 1.463768115942029, "grad_norm": 8.225008010864258, "learning_rate": 0.00015136876006441224, "loss": 28.5051, "step": 606 }, { "epoch": 1.4685990338164252, "grad_norm": 6.815709590911865, "learning_rate": 0.00015120772946859904, "loss": 29.1703, "step": 608 }, { "epoch": 1.4734299516908211, "grad_norm": 7.653327465057373, "learning_rate": 0.00015104669887278585, "loss": 27.2951, "step": 610 }, { "epoch": 1.4782608695652173, "grad_norm": 10.327927589416504, "learning_rate": 0.00015088566827697265, "loss": 28.1144, "step": 612 }, { "epoch": 1.4830917874396135, "grad_norm": 8.612911224365234, "learning_rate": 0.00015072463768115943, "loss": 28.5057, "step": 614 }, { "epoch": 1.4879227053140096, "grad_norm": 
8.190404891967773, "learning_rate": 0.00015056360708534623, "loss": 27.3437, "step": 616 }, { "epoch": 1.4927536231884058, "grad_norm": 7.556375980377197, "learning_rate": 0.00015040257648953303, "loss": 26.3396, "step": 618 }, { "epoch": 1.497584541062802, "grad_norm": 8.995963096618652, "learning_rate": 0.0001502415458937198, "loss": 28.9629, "step": 620 }, { "epoch": 1.502415458937198, "grad_norm": 13.403937339782715, "learning_rate": 0.0001500805152979066, "loss": 28.1381, "step": 622 }, { "epoch": 1.5072463768115942, "grad_norm": 8.48337459564209, "learning_rate": 0.00014991948470209342, "loss": 28.8002, "step": 624 }, { "epoch": 1.5120772946859904, "grad_norm": 7.916252613067627, "learning_rate": 0.0001497584541062802, "loss": 29.5661, "step": 626 }, { "epoch": 1.5169082125603865, "grad_norm": 8.097860336303711, "learning_rate": 0.000149597423510467, "loss": 29.1108, "step": 628 }, { "epoch": 1.5217391304347827, "grad_norm": 7.992598056793213, "learning_rate": 0.0001494363929146538, "loss": 25.8962, "step": 630 }, { "epoch": 1.5265700483091789, "grad_norm": 6.601809501647949, "learning_rate": 0.00014927536231884058, "loss": 27.6647, "step": 632 }, { "epoch": 1.531400966183575, "grad_norm": 10.532876014709473, "learning_rate": 0.00014911433172302738, "loss": 30.0814, "step": 634 }, { "epoch": 1.5362318840579712, "grad_norm": 8.925707817077637, "learning_rate": 0.00014895330112721418, "loss": 28.0325, "step": 636 }, { "epoch": 1.541062801932367, "grad_norm": 6.749852657318115, "learning_rate": 0.00014879227053140096, "loss": 27.5785, "step": 638 }, { "epoch": 1.5458937198067633, "grad_norm": 9.3954439163208, "learning_rate": 0.00014863123993558776, "loss": 29.7051, "step": 640 }, { "epoch": 1.5507246376811594, "grad_norm": 7.524625778198242, "learning_rate": 0.00014847020933977457, "loss": 29.8091, "step": 642 }, { "epoch": 1.5555555555555556, "grad_norm": 8.303244590759277, "learning_rate": 0.00014830917874396134, "loss": 28.2779, "step": 644 }, { "epoch": 
1.5603864734299517, "grad_norm": 8.040205001831055, "learning_rate": 0.00014814814814814815, "loss": 29.3399, "step": 646 }, { "epoch": 1.5652173913043477, "grad_norm": 6.253566265106201, "learning_rate": 0.00014798711755233495, "loss": 29.2561, "step": 648 }, { "epoch": 1.5700483091787438, "grad_norm": 8.045578002929688, "learning_rate": 0.00014782608695652173, "loss": 26.8589, "step": 650 }, { "epoch": 1.57487922705314, "grad_norm": 8.851433753967285, "learning_rate": 0.00014766505636070853, "loss": 28.2428, "step": 652 }, { "epoch": 1.5797101449275361, "grad_norm": 8.400809288024902, "learning_rate": 0.00014750402576489533, "loss": 29.5603, "step": 654 }, { "epoch": 1.5845410628019323, "grad_norm": 7.533260345458984, "learning_rate": 0.00014734299516908214, "loss": 26.993, "step": 656 }, { "epoch": 1.5893719806763285, "grad_norm": 6.993838310241699, "learning_rate": 0.00014718196457326894, "loss": 28.0578, "step": 658 }, { "epoch": 1.5942028985507246, "grad_norm": 9.009007453918457, "learning_rate": 0.00014702093397745574, "loss": 27.0416, "step": 660 }, { "epoch": 1.5990338164251208, "grad_norm": 7.587328910827637, "learning_rate": 0.00014685990338164252, "loss": 27.896, "step": 662 }, { "epoch": 1.603864734299517, "grad_norm": 7.423081398010254, "learning_rate": 0.00014669887278582932, "loss": 26.6892, "step": 664 }, { "epoch": 1.608695652173913, "grad_norm": 8.408404350280762, "learning_rate": 0.00014653784219001613, "loss": 27.5191, "step": 666 }, { "epoch": 1.6135265700483092, "grad_norm": 8.044210433959961, "learning_rate": 0.0001463768115942029, "loss": 30.3552, "step": 668 }, { "epoch": 1.6183574879227054, "grad_norm": 8.7662935256958, "learning_rate": 0.0001462157809983897, "loss": 27.6742, "step": 670 }, { "epoch": 1.6231884057971016, "grad_norm": 7.504002094268799, "learning_rate": 0.0001460547504025765, "loss": 26.1561, "step": 672 }, { "epoch": 1.6280193236714977, "grad_norm": 7.4576826095581055, "learning_rate": 0.0001458937198067633, "loss": 
27.9011, "step": 674 }, { "epoch": 1.6328502415458939, "grad_norm": 7.216124057769775, "learning_rate": 0.0001457326892109501, "loss": 27.3678, "step": 676 }, { "epoch": 1.6376811594202898, "grad_norm": 8.461156845092773, "learning_rate": 0.0001455716586151369, "loss": 28.0337, "step": 678 }, { "epoch": 1.642512077294686, "grad_norm": 9.682413101196289, "learning_rate": 0.00014541062801932367, "loss": 27.1031, "step": 680 }, { "epoch": 1.6473429951690821, "grad_norm": 6.8604817390441895, "learning_rate": 0.00014524959742351047, "loss": 28.9948, "step": 682 }, { "epoch": 1.6521739130434783, "grad_norm": 8.835001945495605, "learning_rate": 0.00014508856682769728, "loss": 28.2096, "step": 684 }, { "epoch": 1.6570048309178744, "grad_norm": 8.947821617126465, "learning_rate": 0.00014492753623188405, "loss": 28.8512, "step": 686 }, { "epoch": 1.6618357487922706, "grad_norm": 7.301581859588623, "learning_rate": 0.00014476650563607086, "loss": 28.8649, "step": 688 }, { "epoch": 1.6666666666666665, "grad_norm": 8.465940475463867, "learning_rate": 0.00014460547504025766, "loss": 28.6396, "step": 690 }, { "epoch": 1.6714975845410627, "grad_norm": 9.281678199768066, "learning_rate": 0.00014444444444444444, "loss": 26.2167, "step": 692 }, { "epoch": 1.6763285024154588, "grad_norm": 8.054730415344238, "learning_rate": 0.00014428341384863124, "loss": 30.5799, "step": 694 }, { "epoch": 1.681159420289855, "grad_norm": 9.177703857421875, "learning_rate": 0.00014412238325281804, "loss": 27.6689, "step": 696 }, { "epoch": 1.6859903381642511, "grad_norm": 7.34149169921875, "learning_rate": 0.00014396135265700482, "loss": 28.1966, "step": 698 }, { "epoch": 1.6908212560386473, "grad_norm": 9.34843921661377, "learning_rate": 0.00014380032206119162, "loss": 28.2126, "step": 700 }, { "epoch": 1.6956521739130435, "grad_norm": 8.255733489990234, "learning_rate": 0.00014363929146537843, "loss": 28.279, "step": 702 }, { "epoch": 1.7004830917874396, "grad_norm": 7.138146877288818, 
"learning_rate": 0.0001434782608695652, "loss": 28.0335, "step": 704 }, { "epoch": 1.7053140096618358, "grad_norm": 7.608633041381836, "learning_rate": 0.000143317230273752, "loss": 27.788, "step": 706 }, { "epoch": 1.710144927536232, "grad_norm": 10.221348762512207, "learning_rate": 0.0001431561996779388, "loss": 26.4698, "step": 708 }, { "epoch": 1.714975845410628, "grad_norm": 7.764200210571289, "learning_rate": 0.0001429951690821256, "loss": 27.5784, "step": 710 }, { "epoch": 1.7198067632850242, "grad_norm": 7.295494079589844, "learning_rate": 0.0001428341384863124, "loss": 25.7921, "step": 712 }, { "epoch": 1.7246376811594204, "grad_norm": 7.534460544586182, "learning_rate": 0.0001426731078904992, "loss": 27.5532, "step": 714 }, { "epoch": 1.7294685990338166, "grad_norm": 7.3485002517700195, "learning_rate": 0.000142512077294686, "loss": 27.3069, "step": 716 }, { "epoch": 1.7342995169082127, "grad_norm": 7.3418049812316895, "learning_rate": 0.0001423510466988728, "loss": 28.4993, "step": 718 }, { "epoch": 1.7391304347826086, "grad_norm": 7.740454196929932, "learning_rate": 0.0001421900161030596, "loss": 27.7353, "step": 720 }, { "epoch": 1.7439613526570048, "grad_norm": 6.945924282073975, "learning_rate": 0.00014202898550724638, "loss": 29.0447, "step": 722 }, { "epoch": 1.748792270531401, "grad_norm": 10.651424407958984, "learning_rate": 0.00014186795491143319, "loss": 28.2643, "step": 724 }, { "epoch": 1.7536231884057971, "grad_norm": 8.329526901245117, "learning_rate": 0.00014170692431562, "loss": 27.9287, "step": 726 }, { "epoch": 1.7584541062801933, "grad_norm": 9.379905700683594, "learning_rate": 0.00014154589371980677, "loss": 29.7388, "step": 728 }, { "epoch": 1.7632850241545892, "grad_norm": 8.386578559875488, "learning_rate": 0.00014138486312399357, "loss": 27.136, "step": 730 }, { "epoch": 1.7681159420289854, "grad_norm": 7.3653388023376465, "learning_rate": 0.00014122383252818037, "loss": 27.7946, "step": 732 }, { "epoch": 1.7729468599033815, 
"grad_norm": 8.317994117736816, "learning_rate": 0.00014106280193236715, "loss": 27.9617, "step": 734 }, { "epoch": 1.7777777777777777, "grad_norm": 9.021920204162598, "learning_rate": 0.00014090177133655395, "loss": 27.6967, "step": 736 }, { "epoch": 1.7826086956521738, "grad_norm": 6.490061283111572, "learning_rate": 0.00014074074074074076, "loss": 29.1245, "step": 738 }, { "epoch": 1.78743961352657, "grad_norm": 8.023162841796875, "learning_rate": 0.00014057971014492753, "loss": 28.1596, "step": 740 }, { "epoch": 1.7922705314009661, "grad_norm": 7.9419169425964355, "learning_rate": 0.00014041867954911434, "loss": 27.8295, "step": 742 }, { "epoch": 1.7971014492753623, "grad_norm": 7.990035057067871, "learning_rate": 0.00014025764895330114, "loss": 26.8629, "step": 744 }, { "epoch": 1.8019323671497585, "grad_norm": 8.936909675598145, "learning_rate": 0.00014009661835748792, "loss": 28.8046, "step": 746 }, { "epoch": 1.8067632850241546, "grad_norm": 8.737541198730469, "learning_rate": 0.00013993558776167472, "loss": 28.1564, "step": 748 }, { "epoch": 1.8115942028985508, "grad_norm": 6.9399518966674805, "learning_rate": 0.00013977455716586152, "loss": 28.085, "step": 750 }, { "epoch": 1.816425120772947, "grad_norm": 7.811395645141602, "learning_rate": 0.00013961352657004833, "loss": 26.8601, "step": 752 }, { "epoch": 1.821256038647343, "grad_norm": 7.366069793701172, "learning_rate": 0.0001394524959742351, "loss": 27.7022, "step": 754 }, { "epoch": 1.8260869565217392, "grad_norm": 7.216097831726074, "learning_rate": 0.0001392914653784219, "loss": 27.8814, "step": 756 }, { "epoch": 1.8309178743961354, "grad_norm": 7.748776912689209, "learning_rate": 0.0001391304347826087, "loss": 26.7868, "step": 758 }, { "epoch": 1.8357487922705316, "grad_norm": 7.118618488311768, "learning_rate": 0.0001389694041867955, "loss": 27.519, "step": 760 }, { "epoch": 1.8405797101449275, "grad_norm": 7.588200092315674, "learning_rate": 0.0001388083735909823, "loss": 27.8801, "step": 762 }, 
{ "epoch": 1.8454106280193237, "grad_norm": 8.082246780395508, "learning_rate": 0.0001386473429951691, "loss": 27.8767, "step": 764 }, { "epoch": 1.8502415458937198, "grad_norm": 8.772019386291504, "learning_rate": 0.00013848631239935587, "loss": 27.0454, "step": 766 }, { "epoch": 1.855072463768116, "grad_norm": 11.820154190063477, "learning_rate": 0.00013832528180354267, "loss": 27.4214, "step": 768 }, { "epoch": 1.8599033816425121, "grad_norm": 7.21035623550415, "learning_rate": 0.00013816425120772948, "loss": 28.2132, "step": 770 }, { "epoch": 1.864734299516908, "grad_norm": 7.833118438720703, "learning_rate": 0.00013800322061191625, "loss": 26.0298, "step": 772 }, { "epoch": 1.8695652173913042, "grad_norm": 9.474292755126953, "learning_rate": 0.00013784219001610306, "loss": 27.4543, "step": 774 }, { "epoch": 1.8743961352657004, "grad_norm": 8.790839195251465, "learning_rate": 0.00013768115942028986, "loss": 26.6007, "step": 776 }, { "epoch": 1.8792270531400965, "grad_norm": 7.7932963371276855, "learning_rate": 0.00013752012882447664, "loss": 26.3272, "step": 778 }, { "epoch": 1.8840579710144927, "grad_norm": 8.080236434936523, "learning_rate": 0.00013735909822866347, "loss": 27.8956, "step": 780 }, { "epoch": 1.8888888888888888, "grad_norm": 8.07216739654541, "learning_rate": 0.00013719806763285024, "loss": 27.893, "step": 782 }, { "epoch": 1.893719806763285, "grad_norm": 12.139753341674805, "learning_rate": 0.00013703703703703705, "loss": 29.3664, "step": 784 }, { "epoch": 1.8985507246376812, "grad_norm": 8.131410598754883, "learning_rate": 0.00013687600644122385, "loss": 26.927, "step": 786 }, { "epoch": 1.9033816425120773, "grad_norm": 7.748467922210693, "learning_rate": 0.00013671497584541066, "loss": 27.8784, "step": 788 }, { "epoch": 1.9082125603864735, "grad_norm": 7.19915771484375, "learning_rate": 0.00013655394524959743, "loss": 26.3311, "step": 790 }, { "epoch": 1.9130434782608696, "grad_norm": 7.374076843261719, "learning_rate": 
0.00013639291465378424, "loss": 27.5376, "step": 792 }, { "epoch": 1.9178743961352658, "grad_norm": 9.71866512298584, "learning_rate": 0.00013623188405797104, "loss": 25.3754, "step": 794 }, { "epoch": 1.922705314009662, "grad_norm": 7.384367942810059, "learning_rate": 0.00013607085346215782, "loss": 27.993, "step": 796 }, { "epoch": 1.927536231884058, "grad_norm": 8.255502700805664, "learning_rate": 0.00013590982286634462, "loss": 27.9883, "step": 798 }, { "epoch": 1.9323671497584543, "grad_norm": 6.8607306480407715, "learning_rate": 0.00013574879227053142, "loss": 27.9741, "step": 800 }, { "epoch": 1.9371980676328504, "grad_norm": 7.215616226196289, "learning_rate": 0.0001355877616747182, "loss": 28.1998, "step": 802 }, { "epoch": 1.9420289855072463, "grad_norm": 7.920051574707031, "learning_rate": 0.000135426731078905, "loss": 27.6914, "step": 804 }, { "epoch": 1.9468599033816425, "grad_norm": 7.799438953399658, "learning_rate": 0.0001352657004830918, "loss": 26.2782, "step": 806 }, { "epoch": 1.9516908212560387, "grad_norm": 7.846622943878174, "learning_rate": 0.00013510466988727858, "loss": 27.5173, "step": 808 }, { "epoch": 1.9565217391304348, "grad_norm": 7.30129861831665, "learning_rate": 0.00013494363929146539, "loss": 24.9119, "step": 810 }, { "epoch": 1.961352657004831, "grad_norm": 7.13409948348999, "learning_rate": 0.0001347826086956522, "loss": 27.9657, "step": 812 }, { "epoch": 1.966183574879227, "grad_norm": 9.307235717773438, "learning_rate": 0.00013462157809983897, "loss": 27.6966, "step": 814 }, { "epoch": 1.971014492753623, "grad_norm": 7.8404741287231445, "learning_rate": 0.00013446054750402577, "loss": 26.6173, "step": 816 }, { "epoch": 1.9758454106280192, "grad_norm": 8.165302276611328, "learning_rate": 0.00013429951690821257, "loss": 27.9967, "step": 818 }, { "epoch": 1.9806763285024154, "grad_norm": 7.126535892486572, "learning_rate": 0.00013413848631239935, "loss": 28.0344, "step": 820 }, { "epoch": 1.9855072463768115, "grad_norm": 
9.40721321105957, "learning_rate": 0.00013397745571658615, "loss": 26.6568, "step": 822 }, { "epoch": 1.9903381642512077, "grad_norm": 6.842724323272705, "learning_rate": 0.00013381642512077296, "loss": 27.3194, "step": 824 }, { "epoch": 1.9951690821256038, "grad_norm": 6.537780284881592, "learning_rate": 0.00013365539452495973, "loss": 27.7764, "step": 826 }, { "epoch": 2.0, "grad_norm": 6.422900199890137, "learning_rate": 0.00013349436392914654, "loss": 27.8163, "step": 828 }, { "epoch": 2.004830917874396, "grad_norm": 7.0895466804504395, "learning_rate": 0.00013333333333333334, "loss": 28.3841, "step": 830 }, { "epoch": 2.0096618357487923, "grad_norm": 6.439542293548584, "learning_rate": 0.00013317230273752012, "loss": 28.0774, "step": 832 }, { "epoch": 2.0144927536231885, "grad_norm": 7.766908645629883, "learning_rate": 0.00013301127214170692, "loss": 26.6744, "step": 834 }, { "epoch": 2.0193236714975846, "grad_norm": 9.178189277648926, "learning_rate": 0.00013285024154589372, "loss": 29.3364, "step": 836 }, { "epoch": 2.024154589371981, "grad_norm": 6.916229248046875, "learning_rate": 0.0001326892109500805, "loss": 28.0622, "step": 838 }, { "epoch": 2.028985507246377, "grad_norm": 7.51179838180542, "learning_rate": 0.0001325281803542673, "loss": 26.2849, "step": 840 }, { "epoch": 2.033816425120773, "grad_norm": 8.321070671081543, "learning_rate": 0.00013236714975845413, "loss": 27.222, "step": 842 }, { "epoch": 2.0386473429951693, "grad_norm": 6.450362205505371, "learning_rate": 0.0001322061191626409, "loss": 28.371, "step": 844 }, { "epoch": 2.0434782608695654, "grad_norm": 9.631372451782227, "learning_rate": 0.00013204508856682771, "loss": 29.1135, "step": 846 }, { "epoch": 2.0483091787439616, "grad_norm": 7.727206707000732, "learning_rate": 0.00013188405797101452, "loss": 27.9027, "step": 848 }, { "epoch": 2.0531400966183573, "grad_norm": 8.837319374084473, "learning_rate": 0.0001317230273752013, "loss": 27.5687, "step": 850 }, { "epoch": 
2.0579710144927534, "grad_norm": 8.151753425598145, "learning_rate": 0.0001315619967793881, "loss": 27.0424, "step": 852 }, { "epoch": 2.0628019323671496, "grad_norm": 7.2588605880737305, "learning_rate": 0.0001314009661835749, "loss": 26.2731, "step": 854 }, { "epoch": 2.0676328502415457, "grad_norm": 9.428071975708008, "learning_rate": 0.00013123993558776168, "loss": 27.1224, "step": 856 }, { "epoch": 2.072463768115942, "grad_norm": 8.864592552185059, "learning_rate": 0.00013107890499194848, "loss": 27.3137, "step": 858 }, { "epoch": 2.077294685990338, "grad_norm": 9.21855640411377, "learning_rate": 0.00013091787439613528, "loss": 26.886, "step": 860 }, { "epoch": 2.082125603864734, "grad_norm": 7.239558696746826, "learning_rate": 0.00013075684380032206, "loss": 28.2175, "step": 862 }, { "epoch": 2.0869565217391304, "grad_norm": 8.155842781066895, "learning_rate": 0.00013059581320450886, "loss": 27.7151, "step": 864 }, { "epoch": 2.0917874396135265, "grad_norm": 7.057051658630371, "learning_rate": 0.00013043478260869567, "loss": 26.3673, "step": 866 }, { "epoch": 2.0966183574879227, "grad_norm": 7.664299488067627, "learning_rate": 0.00013027375201288244, "loss": 25.7326, "step": 868 }, { "epoch": 2.101449275362319, "grad_norm": 6.310895919799805, "learning_rate": 0.00013011272141706925, "loss": 28.7024, "step": 870 }, { "epoch": 2.106280193236715, "grad_norm": 7.707338809967041, "learning_rate": 0.00012995169082125605, "loss": 27.724, "step": 872 }, { "epoch": 2.111111111111111, "grad_norm": 7.318761825561523, "learning_rate": 0.00012979066022544283, "loss": 27.2221, "step": 874 }, { "epoch": 2.1159420289855073, "grad_norm": 9.668201446533203, "learning_rate": 0.00012962962962962963, "loss": 27.0287, "step": 876 }, { "epoch": 2.1207729468599035, "grad_norm": 7.614035129547119, "learning_rate": 0.00012946859903381643, "loss": 26.1026, "step": 878 }, { "epoch": 2.1256038647342996, "grad_norm": 8.675333023071289, "learning_rate": 0.0001293075684380032, "loss": 
27.6808, "step": 880 }, { "epoch": 2.130434782608696, "grad_norm": 6.966851234436035, "learning_rate": 0.00012914653784219001, "loss": 27.6239, "step": 882 }, { "epoch": 2.135265700483092, "grad_norm": 6.5391974449157715, "learning_rate": 0.00012898550724637682, "loss": 27.9896, "step": 884 }, { "epoch": 2.140096618357488, "grad_norm": 8.508500099182129, "learning_rate": 0.0001288244766505636, "loss": 27.918, "step": 886 }, { "epoch": 2.1449275362318843, "grad_norm": 7.540635108947754, "learning_rate": 0.0001286634460547504, "loss": 28.694, "step": 888 }, { "epoch": 2.14975845410628, "grad_norm": 8.311809539794922, "learning_rate": 0.0001285024154589372, "loss": 27.7563, "step": 890 }, { "epoch": 2.154589371980676, "grad_norm": 7.208229064941406, "learning_rate": 0.000128341384863124, "loss": 28.0522, "step": 892 }, { "epoch": 2.1594202898550723, "grad_norm": 7.324676036834717, "learning_rate": 0.00012818035426731078, "loss": 28.7856, "step": 894 }, { "epoch": 2.1642512077294684, "grad_norm": 8.06933879852295, "learning_rate": 0.00012801932367149759, "loss": 26.2217, "step": 896 }, { "epoch": 2.1690821256038646, "grad_norm": 7.0082902908325195, "learning_rate": 0.0001278582930756844, "loss": 28.2109, "step": 898 }, { "epoch": 2.1739130434782608, "grad_norm": 6.494582176208496, "learning_rate": 0.00012769726247987117, "loss": 27.6243, "step": 900 }, { "epoch": 2.178743961352657, "grad_norm": 6.218760967254639, "learning_rate": 0.00012753623188405797, "loss": 26.7612, "step": 902 }, { "epoch": 2.183574879227053, "grad_norm": 9.239087104797363, "learning_rate": 0.00012737520128824477, "loss": 27.9478, "step": 904 }, { "epoch": 2.1884057971014492, "grad_norm": 6.95756196975708, "learning_rate": 0.00012721417069243158, "loss": 28.2791, "step": 906 }, { "epoch": 2.1932367149758454, "grad_norm": 7.1247944831848145, "learning_rate": 0.00012705314009661838, "loss": 26.1297, "step": 908 }, { "epoch": 2.1980676328502415, "grad_norm": 9.735993385314941, "learning_rate": 
0.00012689210950080516, "loss": 24.9398, "step": 910 }, { "epoch": 2.2028985507246377, "grad_norm": 10.508362770080566, "learning_rate": 0.00012673107890499196, "loss": 26.488, "step": 912 }, { "epoch": 2.207729468599034, "grad_norm": 8.847992897033691, "learning_rate": 0.00012657004830917876, "loss": 27.945, "step": 914 }, { "epoch": 2.21256038647343, "grad_norm": 6.917768478393555, "learning_rate": 0.00012640901771336554, "loss": 28.1242, "step": 916 }, { "epoch": 2.217391304347826, "grad_norm": 8.339996337890625, "learning_rate": 0.00012624798711755234, "loss": 26.9138, "step": 918 }, { "epoch": 2.2222222222222223, "grad_norm": 6.435300827026367, "learning_rate": 0.00012608695652173915, "loss": 27.6529, "step": 920 }, { "epoch": 2.2270531400966185, "grad_norm": 7.194887638092041, "learning_rate": 0.00012592592592592592, "loss": 27.1809, "step": 922 }, { "epoch": 2.2318840579710146, "grad_norm": 9.154160499572754, "learning_rate": 0.00012576489533011273, "loss": 27.0501, "step": 924 }, { "epoch": 2.236714975845411, "grad_norm": 7.581670761108398, "learning_rate": 0.00012560386473429953, "loss": 26.1787, "step": 926 }, { "epoch": 2.241545893719807, "grad_norm": 8.077373504638672, "learning_rate": 0.00012544283413848633, "loss": 27.04, "step": 928 }, { "epoch": 2.246376811594203, "grad_norm": 7.282364845275879, "learning_rate": 0.0001252818035426731, "loss": 27.0844, "step": 930 }, { "epoch": 2.2512077294685993, "grad_norm": 7.848824501037598, "learning_rate": 0.0001251207729468599, "loss": 29.1671, "step": 932 }, { "epoch": 2.2560386473429954, "grad_norm": 7.200251579284668, "learning_rate": 0.00012495974235104672, "loss": 26.1801, "step": 934 }, { "epoch": 2.260869565217391, "grad_norm": 7.419154167175293, "learning_rate": 0.0001247987117552335, "loss": 27.1106, "step": 936 }, { "epoch": 2.2657004830917873, "grad_norm": 8.16390609741211, "learning_rate": 0.0001246376811594203, "loss": 25.5017, "step": 938 }, { "epoch": 2.2705314009661834, "grad_norm": 
7.58992338180542, "learning_rate": 0.0001244766505636071, "loss": 27.0191, "step": 940 }, { "epoch": 2.2753623188405796, "grad_norm": 8.532602310180664, "learning_rate": 0.00012431561996779388, "loss": 28.0053, "step": 942 }, { "epoch": 2.2801932367149758, "grad_norm": 7.449092388153076, "learning_rate": 0.00012415458937198068, "loss": 25.7749, "step": 944 }, { "epoch": 2.285024154589372, "grad_norm": 7.38059139251709, "learning_rate": 0.00012399355877616748, "loss": 28.2566, "step": 946 }, { "epoch": 2.289855072463768, "grad_norm": 6.6862874031066895, "learning_rate": 0.00012383252818035426, "loss": 28.8852, "step": 948 }, { "epoch": 2.2946859903381642, "grad_norm": 7.916528701782227, "learning_rate": 0.00012367149758454106, "loss": 27.8083, "step": 950 }, { "epoch": 2.2995169082125604, "grad_norm": 6.143187522888184, "learning_rate": 0.00012351046698872787, "loss": 26.0691, "step": 952 }, { "epoch": 2.3043478260869565, "grad_norm": 8.420724868774414, "learning_rate": 0.00012334943639291464, "loss": 28.0858, "step": 954 }, { "epoch": 2.3091787439613527, "grad_norm": 7.883975505828857, "learning_rate": 0.00012318840579710145, "loss": 27.3439, "step": 956 }, { "epoch": 2.314009661835749, "grad_norm": 7.242871284484863, "learning_rate": 0.00012302737520128825, "loss": 27.1341, "step": 958 }, { "epoch": 2.318840579710145, "grad_norm": 7.858469009399414, "learning_rate": 0.00012286634460547503, "loss": 25.8494, "step": 960 }, { "epoch": 2.323671497584541, "grad_norm": 7.365942478179932, "learning_rate": 0.00012270531400966183, "loss": 26.9695, "step": 962 }, { "epoch": 2.3285024154589373, "grad_norm": 6.930251121520996, "learning_rate": 0.00012254428341384863, "loss": 25.7841, "step": 964 }, { "epoch": 2.3333333333333335, "grad_norm": 6.728757858276367, "learning_rate": 0.00012238325281803544, "loss": 28.053, "step": 966 }, { "epoch": 2.3381642512077296, "grad_norm": 6.711808681488037, "learning_rate": 0.00012222222222222224, "loss": 27.7962, "step": 968 }, { "epoch": 
2.342995169082126, "grad_norm": 7.4918951988220215, "learning_rate": 0.00012206119162640903, "loss": 27.0597, "step": 970 }, { "epoch": 2.3478260869565215, "grad_norm": 8.181355476379395, "learning_rate": 0.00012190016103059582, "loss": 28.2665, "step": 972 }, { "epoch": 2.3526570048309177, "grad_norm": 7.762918949127197, "learning_rate": 0.00012173913043478263, "loss": 28.546, "step": 974 }, { "epoch": 2.357487922705314, "grad_norm": 7.8778276443481445, "learning_rate": 0.00012157809983896942, "loss": 27.1973, "step": 976 }, { "epoch": 2.36231884057971, "grad_norm": 7.002277374267578, "learning_rate": 0.0001214170692431562, "loss": 26.2418, "step": 978 }, { "epoch": 2.367149758454106, "grad_norm": 7.298165321350098, "learning_rate": 0.00012125603864734301, "loss": 28.2059, "step": 980 }, { "epoch": 2.3719806763285023, "grad_norm": 7.899686336517334, "learning_rate": 0.0001210950080515298, "loss": 26.9666, "step": 982 }, { "epoch": 2.3768115942028984, "grad_norm": 7.3516669273376465, "learning_rate": 0.0001209339774557166, "loss": 27.9299, "step": 984 }, { "epoch": 2.3816425120772946, "grad_norm": 7.224858283996582, "learning_rate": 0.00012077294685990339, "loss": 25.816, "step": 986 }, { "epoch": 2.3864734299516908, "grad_norm": 7.0076494216918945, "learning_rate": 0.00012061191626409018, "loss": 24.8251, "step": 988 }, { "epoch": 2.391304347826087, "grad_norm": 6.74472188949585, "learning_rate": 0.00012045088566827699, "loss": 27.6622, "step": 990 }, { "epoch": 2.396135265700483, "grad_norm": 6.549550533294678, "learning_rate": 0.00012028985507246378, "loss": 28.284, "step": 992 }, { "epoch": 2.4009661835748792, "grad_norm": 6.806623458862305, "learning_rate": 0.00012012882447665057, "loss": 26.6694, "step": 994 }, { "epoch": 2.4057971014492754, "grad_norm": 8.050207138061523, "learning_rate": 0.00011996779388083737, "loss": 28.3372, "step": 996 }, { "epoch": 2.4106280193236715, "grad_norm": 7.284823417663574, "learning_rate": 0.00011980676328502416, "loss": 
26.9082, "step": 998 }, { "epoch": 2.4154589371980677, "grad_norm": 7.920591831207275, "learning_rate": 0.00011964573268921095, "loss": 29.7462, "step": 1000 }, { "epoch": 2.420289855072464, "grad_norm": 8.616438865661621, "learning_rate": 0.00011948470209339775, "loss": 26.7905, "step": 1002 }, { "epoch": 2.42512077294686, "grad_norm": 7.106829643249512, "learning_rate": 0.00011932367149758454, "loss": 27.5633, "step": 1004 }, { "epoch": 2.429951690821256, "grad_norm": 8.117084503173828, "learning_rate": 0.00011916264090177133, "loss": 26.9659, "step": 1006 }, { "epoch": 2.4347826086956523, "grad_norm": 7.732640743255615, "learning_rate": 0.00011900161030595814, "loss": 28.2114, "step": 1008 }, { "epoch": 2.4396135265700485, "grad_norm": 7.36362361907959, "learning_rate": 0.00011884057971014493, "loss": 26.3716, "step": 1010 }, { "epoch": 2.4444444444444446, "grad_norm": 8.114975929260254, "learning_rate": 0.00011867954911433172, "loss": 28.8353, "step": 1012 }, { "epoch": 2.449275362318841, "grad_norm": 7.141117095947266, "learning_rate": 0.00011851851851851852, "loss": 25.7371, "step": 1014 }, { "epoch": 2.454106280193237, "grad_norm": 7.491177558898926, "learning_rate": 0.00011835748792270531, "loss": 26.9146, "step": 1016 }, { "epoch": 2.4589371980676327, "grad_norm": 6.710269451141357, "learning_rate": 0.00011819645732689211, "loss": 25.5321, "step": 1018 }, { "epoch": 2.463768115942029, "grad_norm": 7.143400192260742, "learning_rate": 0.0001180354267310789, "loss": 29.9676, "step": 1020 }, { "epoch": 2.468599033816425, "grad_norm": 8.246957778930664, "learning_rate": 0.00011787439613526569, "loss": 29.2592, "step": 1022 }, { "epoch": 2.473429951690821, "grad_norm": 8.44863510131836, "learning_rate": 0.0001177133655394525, "loss": 26.0309, "step": 1024 }, { "epoch": 2.4782608695652173, "grad_norm": 7.821875095367432, "learning_rate": 0.00011755233494363929, "loss": 26.8746, "step": 1026 }, { "epoch": 2.4830917874396135, "grad_norm": 8.529960632324219, 
"learning_rate": 0.0001173913043478261, "loss": 27.0204, "step": 1028 }, { "epoch": 2.4879227053140096, "grad_norm": 6.8329339027404785, "learning_rate": 0.0001172302737520129, "loss": 25.2555, "step": 1030 }, { "epoch": 2.4927536231884058, "grad_norm": 6.804640769958496, "learning_rate": 0.0001170692431561997, "loss": 25.6537, "step": 1032 }, { "epoch": 2.497584541062802, "grad_norm": 7.089588642120361, "learning_rate": 0.00011690821256038649, "loss": 25.2568, "step": 1034 }, { "epoch": 2.502415458937198, "grad_norm": 11.241130828857422, "learning_rate": 0.00011674718196457328, "loss": 27.1132, "step": 1036 }, { "epoch": 2.5072463768115942, "grad_norm": 7.47288703918457, "learning_rate": 0.00011658615136876008, "loss": 25.9993, "step": 1038 }, { "epoch": 2.5120772946859904, "grad_norm": 8.372520446777344, "learning_rate": 0.00011642512077294687, "loss": 27.6641, "step": 1040 }, { "epoch": 2.5169082125603865, "grad_norm": 8.117879867553711, "learning_rate": 0.00011626409017713366, "loss": 26.6226, "step": 1042 }, { "epoch": 2.5217391304347827, "grad_norm": 8.319169044494629, "learning_rate": 0.00011610305958132046, "loss": 27.4311, "step": 1044 }, { "epoch": 2.526570048309179, "grad_norm": 7.18233585357666, "learning_rate": 0.00011594202898550725, "loss": 27.6304, "step": 1046 }, { "epoch": 2.531400966183575, "grad_norm": 7.594292640686035, "learning_rate": 0.00011578099838969404, "loss": 26.9063, "step": 1048 }, { "epoch": 2.536231884057971, "grad_norm": 8.392667770385742, "learning_rate": 0.00011561996779388085, "loss": 27.2786, "step": 1050 }, { "epoch": 2.541062801932367, "grad_norm": 6.698591709136963, "learning_rate": 0.00011545893719806764, "loss": 25.5416, "step": 1052 }, { "epoch": 2.545893719806763, "grad_norm": 6.185670375823975, "learning_rate": 0.00011529790660225444, "loss": 26.9696, "step": 1054 }, { "epoch": 2.550724637681159, "grad_norm": 7.676215648651123, "learning_rate": 0.00011513687600644123, "loss": 26.5383, "step": 1056 }, { "epoch": 
2.5555555555555554, "grad_norm": 6.880972385406494, "learning_rate": 0.00011497584541062802, "loss": 26.3302, "step": 1058 }, { "epoch": 2.5603864734299515, "grad_norm": 8.553890228271484, "learning_rate": 0.00011481481481481482, "loss": 26.0391, "step": 1060 }, { "epoch": 2.5652173913043477, "grad_norm": 6.153205394744873, "learning_rate": 0.00011465378421900161, "loss": 25.729, "step": 1062 }, { "epoch": 2.570048309178744, "grad_norm": 8.465208053588867, "learning_rate": 0.0001144927536231884, "loss": 26.5018, "step": 1064 }, { "epoch": 2.57487922705314, "grad_norm": 8.127817153930664, "learning_rate": 0.00011433172302737521, "loss": 26.3506, "step": 1066 }, { "epoch": 2.579710144927536, "grad_norm": 9.615152359008789, "learning_rate": 0.000114170692431562, "loss": 25.9415, "step": 1068 }, { "epoch": 2.5845410628019323, "grad_norm": 7.294039249420166, "learning_rate": 0.00011400966183574879, "loss": 26.7507, "step": 1070 }, { "epoch": 2.5893719806763285, "grad_norm": 8.261009216308594, "learning_rate": 0.00011384863123993559, "loss": 26.7187, "step": 1072 }, { "epoch": 2.5942028985507246, "grad_norm": 6.705962181091309, "learning_rate": 0.00011368760064412238, "loss": 26.6202, "step": 1074 }, { "epoch": 2.5990338164251208, "grad_norm": 10.057275772094727, "learning_rate": 0.00011352657004830917, "loss": 26.6226, "step": 1076 }, { "epoch": 2.603864734299517, "grad_norm": 8.795845031738281, "learning_rate": 0.00011336553945249598, "loss": 28.1032, "step": 1078 }, { "epoch": 2.608695652173913, "grad_norm": 7.4816131591796875, "learning_rate": 0.00011320450885668277, "loss": 25.8255, "step": 1080 }, { "epoch": 2.6135265700483092, "grad_norm": 7.060609340667725, "learning_rate": 0.00011304347826086956, "loss": 26.9353, "step": 1082 }, { "epoch": 2.6183574879227054, "grad_norm": 7.140244960784912, "learning_rate": 0.00011288244766505636, "loss": 27.3619, "step": 1084 }, { "epoch": 2.6231884057971016, "grad_norm": 7.22598934173584, "learning_rate": 
0.00011272141706924315, "loss": 25.3791, "step": 1086 }, { "epoch": 2.6280193236714977, "grad_norm": 7.098104953765869, "learning_rate": 0.00011256038647342995, "loss": 26.0269, "step": 1088 }, { "epoch": 2.632850241545894, "grad_norm": 6.918243408203125, "learning_rate": 0.00011239935587761677, "loss": 26.9077, "step": 1090 }, { "epoch": 2.63768115942029, "grad_norm": 7.557582378387451, "learning_rate": 0.00011223832528180356, "loss": 26.3413, "step": 1092 }, { "epoch": 2.642512077294686, "grad_norm": 7.406020164489746, "learning_rate": 0.00011207729468599035, "loss": 25.9181, "step": 1094 }, { "epoch": 2.6473429951690823, "grad_norm": 7.0549492835998535, "learning_rate": 0.00011191626409017715, "loss": 26.8606, "step": 1096 }, { "epoch": 2.6521739130434785, "grad_norm": 6.645535469055176, "learning_rate": 0.00011175523349436394, "loss": 27.9375, "step": 1098 }, { "epoch": 2.6570048309178746, "grad_norm": 7.90491247177124, "learning_rate": 0.00011159420289855073, "loss": 26.3062, "step": 1100 }, { "epoch": 2.661835748792271, "grad_norm": 6.990922927856445, "learning_rate": 0.00011143317230273754, "loss": 28.5585, "step": 1102 }, { "epoch": 2.6666666666666665, "grad_norm": 7.085525989532471, "learning_rate": 0.00011127214170692433, "loss": 25.2121, "step": 1104 }, { "epoch": 2.6714975845410627, "grad_norm": 8.292244911193848, "learning_rate": 0.00011111111111111112, "loss": 26.5729, "step": 1106 }, { "epoch": 2.676328502415459, "grad_norm": 7.650384426116943, "learning_rate": 0.00011095008051529792, "loss": 25.5093, "step": 1108 }, { "epoch": 2.681159420289855, "grad_norm": 9.83218765258789, "learning_rate": 0.00011078904991948471, "loss": 25.2708, "step": 1110 }, { "epoch": 2.685990338164251, "grad_norm": 6.258013725280762, "learning_rate": 0.0001106280193236715, "loss": 24.9544, "step": 1112 }, { "epoch": 2.6908212560386473, "grad_norm": 7.423259258270264, "learning_rate": 0.0001104669887278583, "loss": 27.2744, "step": 1114 }, { "epoch": 2.6956521739130435, 
"grad_norm": 7.9002814292907715, "learning_rate": 0.0001103059581320451, "loss": 26.9861, "step": 1116 }, { "epoch": 2.7004830917874396, "grad_norm": 7.641670227050781, "learning_rate": 0.00011014492753623188, "loss": 27.426, "step": 1118 }, { "epoch": 2.7053140096618358, "grad_norm": 7.658080577850342, "learning_rate": 0.00010998389694041869, "loss": 27.6252, "step": 1120 }, { "epoch": 2.710144927536232, "grad_norm": 7.938218116760254, "learning_rate": 0.00010982286634460548, "loss": 26.1781, "step": 1122 }, { "epoch": 2.714975845410628, "grad_norm": 7.96283483505249, "learning_rate": 0.00010966183574879228, "loss": 27.7596, "step": 1124 }, { "epoch": 2.7198067632850242, "grad_norm": 10.215167045593262, "learning_rate": 0.00010950080515297907, "loss": 26.9451, "step": 1126 }, { "epoch": 2.7246376811594204, "grad_norm": 7.972415924072266, "learning_rate": 0.00010933977455716586, "loss": 27.1329, "step": 1128 }, { "epoch": 2.7294685990338166, "grad_norm": 5.932509899139404, "learning_rate": 0.00010917874396135266, "loss": 28.5013, "step": 1130 }, { "epoch": 2.7342995169082127, "grad_norm": 8.786707878112793, "learning_rate": 0.00010901771336553945, "loss": 26.5279, "step": 1132 }, { "epoch": 2.7391304347826084, "grad_norm": 6.930019855499268, "learning_rate": 0.00010885668276972624, "loss": 27.3484, "step": 1134 }, { "epoch": 2.7439613526570046, "grad_norm": 7.4109015464782715, "learning_rate": 0.00010869565217391305, "loss": 26.4129, "step": 1136 }, { "epoch": 2.7487922705314007, "grad_norm": 6.286072731018066, "learning_rate": 0.00010853462157809984, "loss": 26.3836, "step": 1138 }, { "epoch": 2.753623188405797, "grad_norm": 8.696404457092285, "learning_rate": 0.00010837359098228663, "loss": 25.7786, "step": 1140 }, { "epoch": 2.758454106280193, "grad_norm": 8.277897834777832, "learning_rate": 0.00010821256038647343, "loss": 27.2492, "step": 1142 }, { "epoch": 2.763285024154589, "grad_norm": 7.653816223144531, "learning_rate": 0.00010805152979066022, "loss": 
27.0198, "step": 1144 }, { "epoch": 2.7681159420289854, "grad_norm": 7.8368144035339355, "learning_rate": 0.00010789049919484701, "loss": 28.3334, "step": 1146 }, { "epoch": 2.7729468599033815, "grad_norm": 6.9786529541015625, "learning_rate": 0.00010772946859903381, "loss": 26.5917, "step": 1148 }, { "epoch": 2.7777777777777777, "grad_norm": 7.004583358764648, "learning_rate": 0.0001075684380032206, "loss": 26.4706, "step": 1150 }, { "epoch": 2.782608695652174, "grad_norm": 8.017105102539062, "learning_rate": 0.00010740740740740742, "loss": 28.0672, "step": 1152 }, { "epoch": 2.78743961352657, "grad_norm": 6.233907699584961, "learning_rate": 0.00010724637681159421, "loss": 27.5043, "step": 1154 }, { "epoch": 2.792270531400966, "grad_norm": 7.529089450836182, "learning_rate": 0.00010708534621578102, "loss": 25.2191, "step": 1156 }, { "epoch": 2.7971014492753623, "grad_norm": 7.839463233947754, "learning_rate": 0.0001069243156199678, "loss": 25.6082, "step": 1158 }, { "epoch": 2.8019323671497585, "grad_norm": 8.686691284179688, "learning_rate": 0.00010676328502415461, "loss": 27.9281, "step": 1160 }, { "epoch": 2.8067632850241546, "grad_norm": 6.9186930656433105, "learning_rate": 0.0001066022544283414, "loss": 26.3933, "step": 1162 }, { "epoch": 2.8115942028985508, "grad_norm": 7.170950889587402, "learning_rate": 0.00010644122383252819, "loss": 26.5526, "step": 1164 }, { "epoch": 2.816425120772947, "grad_norm": 6.971534729003906, "learning_rate": 0.00010628019323671499, "loss": 26.1706, "step": 1166 }, { "epoch": 2.821256038647343, "grad_norm": 7.302921295166016, "learning_rate": 0.00010611916264090178, "loss": 28.0723, "step": 1168 }, { "epoch": 2.8260869565217392, "grad_norm": 7.918272495269775, "learning_rate": 0.00010595813204508857, "loss": 25.9546, "step": 1170 }, { "epoch": 2.8309178743961354, "grad_norm": 8.934640884399414, "learning_rate": 0.00010579710144927538, "loss": 28.0027, "step": 1172 }, { "epoch": 2.8357487922705316, "grad_norm": 9.624857902526855, 
"learning_rate": 0.00010563607085346217, "loss": 27.6072, "step": 1174 }, { "epoch": 2.8405797101449277, "grad_norm": 7.182722091674805, "learning_rate": 0.00010547504025764896, "loss": 25.9444, "step": 1176 }, { "epoch": 2.845410628019324, "grad_norm": 8.560644149780273, "learning_rate": 0.00010531400966183576, "loss": 24.5426, "step": 1178 }, { "epoch": 2.85024154589372, "grad_norm": 7.0820088386535645, "learning_rate": 0.00010515297906602255, "loss": 27.1353, "step": 1180 }, { "epoch": 2.855072463768116, "grad_norm": 7.135811805725098, "learning_rate": 0.00010499194847020934, "loss": 25.9438, "step": 1182 }, { "epoch": 2.8599033816425123, "grad_norm": 7.968995571136475, "learning_rate": 0.00010483091787439614, "loss": 25.7914, "step": 1184 }, { "epoch": 2.864734299516908, "grad_norm": 7.4556193351745605, "learning_rate": 0.00010466988727858293, "loss": 28.4208, "step": 1186 }, { "epoch": 2.869565217391304, "grad_norm": 8.124032974243164, "learning_rate": 0.00010450885668276972, "loss": 26.6249, "step": 1188 }, { "epoch": 2.8743961352657004, "grad_norm": 6.682657718658447, "learning_rate": 0.00010434782608695653, "loss": 27.7629, "step": 1190 }, { "epoch": 2.8792270531400965, "grad_norm": 7.784018516540527, "learning_rate": 0.00010418679549114332, "loss": 26.3142, "step": 1192 }, { "epoch": 2.8840579710144927, "grad_norm": 6.824240207672119, "learning_rate": 0.00010402576489533012, "loss": 26.4967, "step": 1194 }, { "epoch": 2.888888888888889, "grad_norm": 6.703210353851318, "learning_rate": 0.00010386473429951691, "loss": 27.9698, "step": 1196 }, { "epoch": 2.893719806763285, "grad_norm": 7.0591840744018555, "learning_rate": 0.0001037037037037037, "loss": 26.4026, "step": 1198 }, { "epoch": 2.898550724637681, "grad_norm": 6.3246564865112305, "learning_rate": 0.0001035426731078905, "loss": 26.839, "step": 1200 }, { "epoch": 2.9033816425120773, "grad_norm": 8.211289405822754, "learning_rate": 0.00010338164251207729, "loss": 27.0174, "step": 1202 }, { "epoch": 
2.9082125603864735, "grad_norm": 6.735382556915283, "learning_rate": 0.00010322061191626408, "loss": 26.3102, "step": 1204 }, { "epoch": 2.9130434782608696, "grad_norm": 8.0295991897583, "learning_rate": 0.00010305958132045089, "loss": 25.7761, "step": 1206 }, { "epoch": 2.917874396135266, "grad_norm": 8.097826957702637, "learning_rate": 0.00010289855072463768, "loss": 28.9129, "step": 1208 }, { "epoch": 2.922705314009662, "grad_norm": 8.124273300170898, "learning_rate": 0.00010273752012882447, "loss": 26.1519, "step": 1210 }, { "epoch": 2.927536231884058, "grad_norm": 8.470534324645996, "learning_rate": 0.00010257648953301127, "loss": 25.6004, "step": 1212 }, { "epoch": 2.9323671497584543, "grad_norm": 7.348142147064209, "learning_rate": 0.00010241545893719809, "loss": 27.1859, "step": 1214 }, { "epoch": 2.9371980676328504, "grad_norm": 8.258639335632324, "learning_rate": 0.00010225442834138488, "loss": 24.1802, "step": 1216 }, { "epoch": 2.942028985507246, "grad_norm": 8.160893440246582, "learning_rate": 0.00010209339774557167, "loss": 26.3956, "step": 1218 }, { "epoch": 2.9468599033816423, "grad_norm": 7.1116814613342285, "learning_rate": 0.00010193236714975847, "loss": 25.9712, "step": 1220 }, { "epoch": 2.9516908212560384, "grad_norm": 6.059470176696777, "learning_rate": 0.00010177133655394526, "loss": 27.1363, "step": 1222 }, { "epoch": 2.9565217391304346, "grad_norm": 7.71455192565918, "learning_rate": 0.00010161030595813205, "loss": 26.5133, "step": 1224 }, { "epoch": 2.9613526570048307, "grad_norm": 9.131839752197266, "learning_rate": 0.00010144927536231885, "loss": 27.7297, "step": 1226 }, { "epoch": 2.966183574879227, "grad_norm": 6.740046977996826, "learning_rate": 0.00010128824476650564, "loss": 25.8968, "step": 1228 }, { "epoch": 2.971014492753623, "grad_norm": 7.255392074584961, "learning_rate": 0.00010112721417069245, "loss": 26.343, "step": 1230 }, { "epoch": 2.975845410628019, "grad_norm": 7.241657733917236, "learning_rate": 
0.00010096618357487924, "loss": 26.2671, "step": 1232 }, { "epoch": 2.9806763285024154, "grad_norm": 8.625435829162598, "learning_rate": 0.00010080515297906603, "loss": 26.2536, "step": 1234 }, { "epoch": 2.9855072463768115, "grad_norm": 7.044302940368652, "learning_rate": 0.00010064412238325283, "loss": 27.3368, "step": 1236 }, { "epoch": 2.9903381642512077, "grad_norm": 7.077991485595703, "learning_rate": 0.00010048309178743962, "loss": 28.0877, "step": 1238 }, { "epoch": 2.995169082125604, "grad_norm": 7.624186992645264, "learning_rate": 0.00010032206119162641, "loss": 27.3155, "step": 1240 }, { "epoch": 3.0, "grad_norm": 7.315317630767822, "learning_rate": 0.00010016103059581321, "loss": 26.3495, "step": 1242 }, { "epoch": 3.004830917874396, "grad_norm": 6.168877124786377, "learning_rate": 0.0001, "loss": 27.1989, "step": 1244 }, { "epoch": 3.0096618357487923, "grad_norm": 7.338534832000732, "learning_rate": 9.98389694041868e-05, "loss": 26.6896, "step": 1246 }, { "epoch": 3.0144927536231885, "grad_norm": 7.950836658477783, "learning_rate": 9.96779388083736e-05, "loss": 26.1743, "step": 1248 }, { "epoch": 3.0193236714975846, "grad_norm": 7.836818218231201, "learning_rate": 9.951690821256039e-05, "loss": 24.6431, "step": 1250 }, { "epoch": 3.024154589371981, "grad_norm": 7.391972064971924, "learning_rate": 9.935587761674718e-05, "loss": 26.9987, "step": 1252 }, { "epoch": 3.028985507246377, "grad_norm": 6.927128314971924, "learning_rate": 9.919484702093398e-05, "loss": 26.3314, "step": 1254 }, { "epoch": 3.033816425120773, "grad_norm": 6.5931267738342285, "learning_rate": 9.903381642512077e-05, "loss": 28.486, "step": 1256 }, { "epoch": 3.0386473429951693, "grad_norm": 6.712624549865723, "learning_rate": 9.887278582930756e-05, "loss": 23.3735, "step": 1258 }, { "epoch": 3.0434782608695654, "grad_norm": 7.244742393493652, "learning_rate": 9.871175523349438e-05, "loss": 28.1393, "step": 1260 }, { "epoch": 3.0483091787439616, "grad_norm": 7.571489334106445, 
"learning_rate": 9.855072463768117e-05, "loss": 26.1208, "step": 1262 }, { "epoch": 3.0531400966183573, "grad_norm": 7.6882643699646, "learning_rate": 9.838969404186796e-05, "loss": 25.3927, "step": 1264 }, { "epoch": 3.0579710144927534, "grad_norm": 7.103066444396973, "learning_rate": 9.822866344605476e-05, "loss": 25.6778, "step": 1266 }, { "epoch": 3.0628019323671496, "grad_norm": 7.564841270446777, "learning_rate": 9.806763285024155e-05, "loss": 26.3471, "step": 1268 }, { "epoch": 3.0676328502415457, "grad_norm": 7.3738508224487305, "learning_rate": 9.790660225442834e-05, "loss": 26.4939, "step": 1270 }, { "epoch": 3.072463768115942, "grad_norm": 8.300433158874512, "learning_rate": 9.774557165861515e-05, "loss": 27.4497, "step": 1272 }, { "epoch": 3.077294685990338, "grad_norm": 6.373605251312256, "learning_rate": 9.758454106280194e-05, "loss": 27.1139, "step": 1274 }, { "epoch": 3.082125603864734, "grad_norm": 7.21131706237793, "learning_rate": 9.742351046698873e-05, "loss": 25.3131, "step": 1276 }, { "epoch": 3.0869565217391304, "grad_norm": 7.3897504806518555, "learning_rate": 9.726247987117553e-05, "loss": 24.7751, "step": 1278 }, { "epoch": 3.0917874396135265, "grad_norm": 6.666619777679443, "learning_rate": 9.710144927536232e-05, "loss": 25.6616, "step": 1280 }, { "epoch": 3.0966183574879227, "grad_norm": 6.16898250579834, "learning_rate": 9.694041867954912e-05, "loss": 23.4636, "step": 1282 }, { "epoch": 3.101449275362319, "grad_norm": 6.940250396728516, "learning_rate": 9.677938808373591e-05, "loss": 27.0285, "step": 1284 }, { "epoch": 3.106280193236715, "grad_norm": 8.428845405578613, "learning_rate": 9.66183574879227e-05, "loss": 26.6035, "step": 1286 }, { "epoch": 3.111111111111111, "grad_norm": 7.685654640197754, "learning_rate": 9.64573268921095e-05, "loss": 27.0342, "step": 1288 }, { "epoch": 3.1159420289855073, "grad_norm": 8.046797752380371, "learning_rate": 9.62962962962963e-05, "loss": 26.7352, "step": 1290 }, { "epoch": 3.1207729468599035, 
"grad_norm": 7.739950180053711, "learning_rate": 9.61352657004831e-05, "loss": 27.607, "step": 1292 }, { "epoch": 3.1256038647342996, "grad_norm": 8.301579475402832, "learning_rate": 9.597423510466989e-05, "loss": 26.7545, "step": 1294 }, { "epoch": 3.130434782608696, "grad_norm": 7.416752338409424, "learning_rate": 9.58132045088567e-05, "loss": 25.5911, "step": 1296 }, { "epoch": 3.135265700483092, "grad_norm": 7.364454746246338, "learning_rate": 9.565217391304348e-05, "loss": 25.8314, "step": 1298 }, { "epoch": 3.140096618357488, "grad_norm": 7.930257797241211, "learning_rate": 9.549114331723029e-05, "loss": 23.9096, "step": 1300 }, { "epoch": 3.1449275362318843, "grad_norm": 6.694441795349121, "learning_rate": 9.533011272141708e-05, "loss": 26.1153, "step": 1302 }, { "epoch": 3.14975845410628, "grad_norm": 6.781352996826172, "learning_rate": 9.516908212560387e-05, "loss": 26.7928, "step": 1304 }, { "epoch": 3.154589371980676, "grad_norm": 6.676225662231445, "learning_rate": 9.500805152979067e-05, "loss": 27.0461, "step": 1306 }, { "epoch": 3.1594202898550723, "grad_norm": 7.4368767738342285, "learning_rate": 9.484702093397746e-05, "loss": 26.7284, "step": 1308 }, { "epoch": 3.1642512077294684, "grad_norm": 7.008518695831299, "learning_rate": 9.468599033816425e-05, "loss": 27.4804, "step": 1310 }, { "epoch": 3.1690821256038646, "grad_norm": 7.6441850662231445, "learning_rate": 9.452495974235105e-05, "loss": 27.3274, "step": 1312 }, { "epoch": 3.1739130434782608, "grad_norm": 7.242411136627197, "learning_rate": 9.436392914653784e-05, "loss": 24.6985, "step": 1314 }, { "epoch": 3.178743961352657, "grad_norm": 6.712805271148682, "learning_rate": 9.420289855072463e-05, "loss": 25.8327, "step": 1316 }, { "epoch": 3.183574879227053, "grad_norm": 6.724958419799805, "learning_rate": 9.404186795491144e-05, "loss": 26.9393, "step": 1318 }, { "epoch": 3.1884057971014492, "grad_norm": 7.451432228088379, "learning_rate": 9.388083735909823e-05, "loss": 25.1356, "step": 1320 }, 
{ "epoch": 3.1932367149758454, "grad_norm": 7.7775421142578125, "learning_rate": 9.371980676328503e-05, "loss": 26.6738, "step": 1322 }, { "epoch": 3.1980676328502415, "grad_norm": 7.692890167236328, "learning_rate": 9.355877616747183e-05, "loss": 25.1647, "step": 1324 }, { "epoch": 3.2028985507246377, "grad_norm": 7.3927812576293945, "learning_rate": 9.339774557165862e-05, "loss": 26.9764, "step": 1326 }, { "epoch": 3.207729468599034, "grad_norm": 7.326320171356201, "learning_rate": 9.323671497584541e-05, "loss": 25.9006, "step": 1328 }, { "epoch": 3.21256038647343, "grad_norm": 8.4861421585083, "learning_rate": 9.307568438003222e-05, "loss": 26.148, "step": 1330 }, { "epoch": 3.217391304347826, "grad_norm": 8.520912170410156, "learning_rate": 9.291465378421901e-05, "loss": 26.3554, "step": 1332 }, { "epoch": 3.2222222222222223, "grad_norm": 7.051355361938477, "learning_rate": 9.27536231884058e-05, "loss": 26.3572, "step": 1334 }, { "epoch": 3.2270531400966185, "grad_norm": 9.287524223327637, "learning_rate": 9.25925925925926e-05, "loss": 25.9726, "step": 1336 }, { "epoch": 3.2318840579710146, "grad_norm": 7.160129070281982, "learning_rate": 9.243156199677939e-05, "loss": 28.7179, "step": 1338 }, { "epoch": 3.236714975845411, "grad_norm": 7.048616886138916, "learning_rate": 9.227053140096618e-05, "loss": 25.7061, "step": 1340 }, { "epoch": 3.241545893719807, "grad_norm": 7.782952785491943, "learning_rate": 9.210950080515299e-05, "loss": 26.6252, "step": 1342 }, { "epoch": 3.246376811594203, "grad_norm": 8.396957397460938, "learning_rate": 9.194847020933978e-05, "loss": 25.4261, "step": 1344 }, { "epoch": 3.2512077294685993, "grad_norm": 7.221895217895508, "learning_rate": 9.178743961352657e-05, "loss": 25.6437, "step": 1346 }, { "epoch": 3.2560386473429954, "grad_norm": 7.694455146789551, "learning_rate": 9.162640901771337e-05, "loss": 26.2562, "step": 1348 }, { "epoch": 3.260869565217391, "grad_norm": 7.642673492431641, "learning_rate": 9.146537842190016e-05, 
"loss": 25.1317, "step": 1350 }, { "epoch": 3.2657004830917873, "grad_norm": 6.599581241607666, "learning_rate": 9.130434782608696e-05, "loss": 25.9692, "step": 1352 }, { "epoch": 3.2705314009661834, "grad_norm": 8.950820922851562, "learning_rate": 9.114331723027377e-05, "loss": 27.4472, "step": 1354 }, { "epoch": 3.2753623188405796, "grad_norm": 6.30159854888916, "learning_rate": 9.098228663446056e-05, "loss": 25.9316, "step": 1356 }, { "epoch": 3.2801932367149758, "grad_norm": 6.927635192871094, "learning_rate": 9.082125603864735e-05, "loss": 27.5305, "step": 1358 }, { "epoch": 3.285024154589372, "grad_norm": 6.424526214599609, "learning_rate": 9.066022544283415e-05, "loss": 26.0866, "step": 1360 }, { "epoch": 3.289855072463768, "grad_norm": 8.260842323303223, "learning_rate": 9.049919484702094e-05, "loss": 25.4734, "step": 1362 }, { "epoch": 3.2946859903381642, "grad_norm": 6.600332736968994, "learning_rate": 9.033816425120773e-05, "loss": 25.1304, "step": 1364 }, { "epoch": 3.2995169082125604, "grad_norm": 6.801137447357178, "learning_rate": 9.017713365539453e-05, "loss": 27.4591, "step": 1366 }, { "epoch": 3.3043478260869565, "grad_norm": 7.686280250549316, "learning_rate": 9.001610305958132e-05, "loss": 26.2466, "step": 1368 }, { "epoch": 3.3091787439613527, "grad_norm": 6.084709644317627, "learning_rate": 8.985507246376813e-05, "loss": 25.2827, "step": 1370 }, { "epoch": 3.314009661835749, "grad_norm": 7.699804306030273, "learning_rate": 8.969404186795492e-05, "loss": 28.068, "step": 1372 }, { "epoch": 3.318840579710145, "grad_norm": 8.359792709350586, "learning_rate": 8.95330112721417e-05, "loss": 28.9643, "step": 1374 }, { "epoch": 3.323671497584541, "grad_norm": 7.701099872589111, "learning_rate": 8.937198067632851e-05, "loss": 26.1439, "step": 1376 }, { "epoch": 3.3285024154589373, "grad_norm": 8.339729309082031, "learning_rate": 8.92109500805153e-05, "loss": 26.0983, "step": 1378 }, { "epoch": 3.3333333333333335, "grad_norm": 8.924784660339355, 
"learning_rate": 8.904991948470209e-05, "loss": 25.8818, "step": 1380 }, { "epoch": 3.3381642512077296, "grad_norm": 8.396602630615234, "learning_rate": 8.888888888888889e-05, "loss": 27.7536, "step": 1382 }, { "epoch": 3.342995169082126, "grad_norm": 8.177582740783691, "learning_rate": 8.87278582930757e-05, "loss": 25.7908, "step": 1384 }, { "epoch": 3.3478260869565215, "grad_norm": 6.711874008178711, "learning_rate": 8.856682769726249e-05, "loss": 27.9945, "step": 1386 }, { "epoch": 3.3526570048309177, "grad_norm": 6.735175132751465, "learning_rate": 8.840579710144929e-05, "loss": 27.7595, "step": 1388 }, { "epoch": 3.357487922705314, "grad_norm": 8.890625, "learning_rate": 8.824476650563608e-05, "loss": 25.7886, "step": 1390 }, { "epoch": 3.36231884057971, "grad_norm": 7.918723106384277, "learning_rate": 8.808373590982287e-05, "loss": 27.3296, "step": 1392 }, { "epoch": 3.367149758454106, "grad_norm": 8.405486106872559, "learning_rate": 8.792270531400967e-05, "loss": 24.8263, "step": 1394 }, { "epoch": 3.3719806763285023, "grad_norm": 7.2000837326049805, "learning_rate": 8.776167471819646e-05, "loss": 27.6412, "step": 1396 }, { "epoch": 3.3768115942028984, "grad_norm": 9.657790184020996, "learning_rate": 8.760064412238325e-05, "loss": 24.8264, "step": 1398 }, { "epoch": 3.3816425120772946, "grad_norm": 7.06240177154541, "learning_rate": 8.743961352657006e-05, "loss": 26.162, "step": 1400 }, { "epoch": 3.3864734299516908, "grad_norm": 7.3674116134643555, "learning_rate": 8.727858293075685e-05, "loss": 27.8042, "step": 1402 }, { "epoch": 3.391304347826087, "grad_norm": 7.9507737159729, "learning_rate": 8.711755233494364e-05, "loss": 26.6252, "step": 1404 }, { "epoch": 3.396135265700483, "grad_norm": 8.195547103881836, "learning_rate": 8.695652173913044e-05, "loss": 26.886, "step": 1406 }, { "epoch": 3.4009661835748792, "grad_norm": 7.462141513824463, "learning_rate": 8.679549114331723e-05, "loss": 27.8522, "step": 1408 }, { "epoch": 3.4057971014492754, 
"grad_norm": 7.903439521789551, "learning_rate": 8.663446054750402e-05, "loss": 26.1915, "step": 1410 }, { "epoch": 3.4106280193236715, "grad_norm": 7.791518211364746, "learning_rate": 8.647342995169082e-05, "loss": 27.6484, "step": 1412 }, { "epoch": 3.4154589371980677, "grad_norm": 7.624407768249512, "learning_rate": 8.631239935587761e-05, "loss": 28.3851, "step": 1414 }, { "epoch": 3.420289855072464, "grad_norm": 7.524753570556641, "learning_rate": 8.615136876006442e-05, "loss": 25.3125, "step": 1416 }, { "epoch": 3.42512077294686, "grad_norm": 8.102710723876953, "learning_rate": 8.599033816425122e-05, "loss": 24.9681, "step": 1418 }, { "epoch": 3.429951690821256, "grad_norm": 6.52889347076416, "learning_rate": 8.582930756843801e-05, "loss": 27.6317, "step": 1420 }, { "epoch": 3.4347826086956523, "grad_norm": 8.491759300231934, "learning_rate": 8.56682769726248e-05, "loss": 26.7627, "step": 1422 }, { "epoch": 3.4396135265700485, "grad_norm": 8.082484245300293, "learning_rate": 8.55072463768116e-05, "loss": 25.5842, "step": 1424 }, { "epoch": 3.4444444444444446, "grad_norm": 8.158738136291504, "learning_rate": 8.53462157809984e-05, "loss": 27.7775, "step": 1426 }, { "epoch": 3.449275362318841, "grad_norm": 6.948888778686523, "learning_rate": 8.518518518518518e-05, "loss": 24.269, "step": 1428 }, { "epoch": 3.454106280193237, "grad_norm": 7.217655181884766, "learning_rate": 8.502415458937199e-05, "loss": 27.2054, "step": 1430 }, { "epoch": 3.4589371980676327, "grad_norm": 7.5419440269470215, "learning_rate": 8.486312399355878e-05, "loss": 25.9815, "step": 1432 }, { "epoch": 3.463768115942029, "grad_norm": 7.58052921295166, "learning_rate": 8.470209339774557e-05, "loss": 27.4913, "step": 1434 }, { "epoch": 3.468599033816425, "grad_norm": 7.221286296844482, "learning_rate": 8.454106280193237e-05, "loss": 26.7118, "step": 1436 }, { "epoch": 3.473429951690821, "grad_norm": 7.131877899169922, "learning_rate": 8.438003220611916e-05, "loss": 27.0661, "step": 1438 }, { 
"epoch": 3.4782608695652173, "grad_norm": 6.600888729095459, "learning_rate": 8.421900161030597e-05, "loss": 24.237, "step": 1440 }, { "epoch": 3.4830917874396135, "grad_norm": 7.91683292388916, "learning_rate": 8.405797101449276e-05, "loss": 24.8586, "step": 1442 }, { "epoch": 3.4879227053140096, "grad_norm": 6.824517250061035, "learning_rate": 8.389694041867955e-05, "loss": 26.9819, "step": 1444 }, { "epoch": 3.4927536231884058, "grad_norm": 6.753680229187012, "learning_rate": 8.373590982286635e-05, "loss": 26.0397, "step": 1446 }, { "epoch": 3.497584541062802, "grad_norm": 7.486673831939697, "learning_rate": 8.357487922705315e-05, "loss": 26.7425, "step": 1448 }, { "epoch": 3.502415458937198, "grad_norm": 8.475358009338379, "learning_rate": 8.341384863123994e-05, "loss": 26.1292, "step": 1450 }, { "epoch": 3.5072463768115942, "grad_norm": 6.859409332275391, "learning_rate": 8.325281803542673e-05, "loss": 26.4357, "step": 1452 }, { "epoch": 3.5120772946859904, "grad_norm": 7.169741630554199, "learning_rate": 8.309178743961354e-05, "loss": 27.2822, "step": 1454 }, { "epoch": 3.5169082125603865, "grad_norm": 8.31079387664795, "learning_rate": 8.293075684380033e-05, "loss": 27.7764, "step": 1456 }, { "epoch": 3.5217391304347827, "grad_norm": 6.888429164886475, "learning_rate": 8.276972624798713e-05, "loss": 26.5406, "step": 1458 }, { "epoch": 3.526570048309179, "grad_norm": 7.568389892578125, "learning_rate": 8.260869565217392e-05, "loss": 25.9647, "step": 1460 }, { "epoch": 3.531400966183575, "grad_norm": 6.64613151550293, "learning_rate": 8.244766505636071e-05, "loss": 26.9271, "step": 1462 }, { "epoch": 3.536231884057971, "grad_norm": 6.534989833831787, "learning_rate": 8.228663446054751e-05, "loss": 26.2684, "step": 1464 }, { "epoch": 3.541062801932367, "grad_norm": 7.926050662994385, "learning_rate": 8.21256038647343e-05, "loss": 26.845, "step": 1466 }, { "epoch": 3.545893719806763, "grad_norm": 7.452934741973877, "learning_rate": 8.196457326892109e-05, "loss": 
25.8417, "step": 1468 }, { "epoch": 3.550724637681159, "grad_norm": 7.26784086227417, "learning_rate": 8.18035426731079e-05, "loss": 26.0035, "step": 1470 }, { "epoch": 3.5555555555555554, "grad_norm": 7.318904399871826, "learning_rate": 8.164251207729469e-05, "loss": 27.4574, "step": 1472 }, { "epoch": 3.5603864734299515, "grad_norm": 6.999464511871338, "learning_rate": 8.148148148148148e-05, "loss": 25.949, "step": 1474 }, { "epoch": 3.5652173913043477, "grad_norm": 7.244204044342041, "learning_rate": 8.132045088566828e-05, "loss": 26.636, "step": 1476 }, { "epoch": 3.570048309178744, "grad_norm": 8.60554027557373, "learning_rate": 8.115942028985508e-05, "loss": 27.4116, "step": 1478 }, { "epoch": 3.57487922705314, "grad_norm": 6.701752662658691, "learning_rate": 8.099838969404187e-05, "loss": 25.0194, "step": 1480 }, { "epoch": 3.579710144927536, "grad_norm": 6.613931655883789, "learning_rate": 8.083735909822868e-05, "loss": 25.7569, "step": 1482 }, { "epoch": 3.5845410628019323, "grad_norm": 7.828546524047852, "learning_rate": 8.067632850241547e-05, "loss": 27.1483, "step": 1484 }, { "epoch": 3.5893719806763285, "grad_norm": 7.983916282653809, "learning_rate": 8.051529790660226e-05, "loss": 24.6017, "step": 1486 }, { "epoch": 3.5942028985507246, "grad_norm": 8.500826835632324, "learning_rate": 8.035426731078906e-05, "loss": 26.6575, "step": 1488 }, { "epoch": 3.5990338164251208, "grad_norm": 8.88049030303955, "learning_rate": 8.019323671497585e-05, "loss": 23.7421, "step": 1490 }, { "epoch": 3.603864734299517, "grad_norm": 7.034642696380615, "learning_rate": 8.003220611916264e-05, "loss": 27.8291, "step": 1492 }, { "epoch": 3.608695652173913, "grad_norm": 7.1023077964782715, "learning_rate": 7.987117552334944e-05, "loss": 26.7066, "step": 1494 }, { "epoch": 3.6135265700483092, "grad_norm": 8.332448959350586, "learning_rate": 7.971014492753623e-05, "loss": 25.7769, "step": 1496 }, { "epoch": 3.6183574879227054, "grad_norm": 7.105356693267822, "learning_rate": 
7.954911433172302e-05, "loss": 25.7133, "step": 1498 }, { "epoch": 3.6231884057971016, "grad_norm": 7.028257369995117, "learning_rate": 7.938808373590983e-05, "loss": 25.0051, "step": 1500 }, { "epoch": 3.6280193236714977, "grad_norm": 7.71824312210083, "learning_rate": 7.922705314009662e-05, "loss": 25.9737, "step": 1502 }, { "epoch": 3.632850241545894, "grad_norm": 5.497483253479004, "learning_rate": 7.906602254428341e-05, "loss": 27.4592, "step": 1504 }, { "epoch": 3.63768115942029, "grad_norm": 8.458606719970703, "learning_rate": 7.890499194847021e-05, "loss": 24.0378, "step": 1506 }, { "epoch": 3.642512077294686, "grad_norm": 8.406185150146484, "learning_rate": 7.874396135265701e-05, "loss": 26.3229, "step": 1508 }, { "epoch": 3.6473429951690823, "grad_norm": 7.685035228729248, "learning_rate": 7.85829307568438e-05, "loss": 25.97, "step": 1510 }, { "epoch": 3.6521739130434785, "grad_norm": 8.686131477355957, "learning_rate": 7.842190016103061e-05, "loss": 26.591, "step": 1512 }, { "epoch": 3.6570048309178746, "grad_norm": 6.984585285186768, "learning_rate": 7.82608695652174e-05, "loss": 25.8358, "step": 1514 }, { "epoch": 3.661835748792271, "grad_norm": 5.834330081939697, "learning_rate": 7.809983896940419e-05, "loss": 26.1456, "step": 1516 }, { "epoch": 3.6666666666666665, "grad_norm": 6.367677688598633, "learning_rate": 7.793880837359099e-05, "loss": 26.5751, "step": 1518 }, { "epoch": 3.6714975845410627, "grad_norm": 6.723855018615723, "learning_rate": 7.777777777777778e-05, "loss": 26.5181, "step": 1520 }, { "epoch": 3.676328502415459, "grad_norm": 6.305589199066162, "learning_rate": 7.761674718196457e-05, "loss": 25.6111, "step": 1522 }, { "epoch": 3.681159420289855, "grad_norm": 6.444118976593018, "learning_rate": 7.745571658615138e-05, "loss": 25.0445, "step": 1524 }, { "epoch": 3.685990338164251, "grad_norm": 7.176147937774658, "learning_rate": 7.729468599033817e-05, "loss": 25.2998, "step": 1526 }, { "epoch": 3.6908212560386473, "grad_norm": 
8.422863006591797, "learning_rate": 7.713365539452497e-05, "loss": 27.3324, "step": 1528 }, { "epoch": 3.6956521739130435, "grad_norm": 6.9630913734436035, "learning_rate": 7.697262479871176e-05, "loss": 25.7144, "step": 1530 }, { "epoch": 3.7004830917874396, "grad_norm": 5.846348285675049, "learning_rate": 7.681159420289855e-05, "loss": 27.7589, "step": 1532 }, { "epoch": 3.7053140096618358, "grad_norm": 7.343765735626221, "learning_rate": 7.665056360708535e-05, "loss": 25.8322, "step": 1534 }, { "epoch": 3.710144927536232, "grad_norm": 6.997490882873535, "learning_rate": 7.648953301127214e-05, "loss": 28.2401, "step": 1536 }, { "epoch": 3.714975845410628, "grad_norm": 6.830377101898193, "learning_rate": 7.632850241545893e-05, "loss": 25.1853, "step": 1538 }, { "epoch": 3.7198067632850242, "grad_norm": 7.353569030761719, "learning_rate": 7.616747181964574e-05, "loss": 27.8896, "step": 1540 }, { "epoch": 3.7246376811594204, "grad_norm": 6.923029899597168, "learning_rate": 7.600644122383254e-05, "loss": 27.466, "step": 1542 }, { "epoch": 3.7294685990338166, "grad_norm": 7.982540607452393, "learning_rate": 7.584541062801933e-05, "loss": 26.5827, "step": 1544 }, { "epoch": 3.7342995169082127, "grad_norm": 7.8132758140563965, "learning_rate": 7.568438003220612e-05, "loss": 25.5102, "step": 1546 }, { "epoch": 3.7391304347826084, "grad_norm": 6.979062557220459, "learning_rate": 7.552334943639292e-05, "loss": 26.6007, "step": 1548 }, { "epoch": 3.7439613526570046, "grad_norm": 6.988529682159424, "learning_rate": 7.536231884057971e-05, "loss": 27.2199, "step": 1550 }, { "epoch": 3.7487922705314007, "grad_norm": 6.884960174560547, "learning_rate": 7.520128824476652e-05, "loss": 28.3951, "step": 1552 }, { "epoch": 3.753623188405797, "grad_norm": 7.593159198760986, "learning_rate": 7.50402576489533e-05, "loss": 26.4899, "step": 1554 }, { "epoch": 3.758454106280193, "grad_norm": 7.603058815002441, "learning_rate": 7.48792270531401e-05, "loss": 25.2797, "step": 1556 }, { 
"epoch": 3.763285024154589, "grad_norm": 8.542155265808105, "learning_rate": 7.47181964573269e-05, "loss": 25.7991, "step": 1558 }, { "epoch": 3.7681159420289854, "grad_norm": 7.652464389801025, "learning_rate": 7.455716586151369e-05, "loss": 26.4845, "step": 1560 }, { "epoch": 3.7729468599033815, "grad_norm": 8.047564506530762, "learning_rate": 7.439613526570048e-05, "loss": 26.0127, "step": 1562 }, { "epoch": 3.7777777777777777, "grad_norm": 6.38883113861084, "learning_rate": 7.423510466988728e-05, "loss": 27.0549, "step": 1564 }, { "epoch": 3.782608695652174, "grad_norm": 6.353972434997559, "learning_rate": 7.407407407407407e-05, "loss": 25.2923, "step": 1566 }, { "epoch": 3.78743961352657, "grad_norm": 6.962271690368652, "learning_rate": 7.391304347826086e-05, "loss": 27.8826, "step": 1568 }, { "epoch": 3.792270531400966, "grad_norm": 6.521156311035156, "learning_rate": 7.375201288244767e-05, "loss": 28.1107, "step": 1570 }, { "epoch": 3.7971014492753623, "grad_norm": 8.195451736450195, "learning_rate": 7.359098228663447e-05, "loss": 26.5253, "step": 1572 }, { "epoch": 3.8019323671497585, "grad_norm": 6.806168556213379, "learning_rate": 7.342995169082126e-05, "loss": 27.1728, "step": 1574 }, { "epoch": 3.8067632850241546, "grad_norm": 7.229825973510742, "learning_rate": 7.326892109500806e-05, "loss": 25.333, "step": 1576 }, { "epoch": 3.8115942028985508, "grad_norm": 6.635615825653076, "learning_rate": 7.310789049919485e-05, "loss": 27.0733, "step": 1578 }, { "epoch": 3.816425120772947, "grad_norm": 6.50180721282959, "learning_rate": 7.294685990338164e-05, "loss": 27.4529, "step": 1580 }, { "epoch": 3.821256038647343, "grad_norm": 7.335048675537109, "learning_rate": 7.278582930756845e-05, "loss": 25.6855, "step": 1582 }, { "epoch": 3.8260869565217392, "grad_norm": 6.961329460144043, "learning_rate": 7.262479871175524e-05, "loss": 26.0103, "step": 1584 }, { "epoch": 3.8309178743961354, "grad_norm": 6.842545986175537, "learning_rate": 7.246376811594203e-05, 
"loss": 27.9322, "step": 1586 }, { "epoch": 3.8357487922705316, "grad_norm": 6.83944845199585, "learning_rate": 7.230273752012883e-05, "loss": 27.5723, "step": 1588 }, { "epoch": 3.8405797101449277, "grad_norm": 7.0610127449035645, "learning_rate": 7.214170692431562e-05, "loss": 26.011, "step": 1590 }, { "epoch": 3.845410628019324, "grad_norm": 7.726437568664551, "learning_rate": 7.198067632850241e-05, "loss": 26.6807, "step": 1592 }, { "epoch": 3.85024154589372, "grad_norm": 9.280223846435547, "learning_rate": 7.181964573268921e-05, "loss": 26.9886, "step": 1594 }, { "epoch": 3.855072463768116, "grad_norm": 6.995485782623291, "learning_rate": 7.1658615136876e-05, "loss": 27.2315, "step": 1596 }, { "epoch": 3.8599033816425123, "grad_norm": 7.200146198272705, "learning_rate": 7.14975845410628e-05, "loss": 25.7971, "step": 1598 }, { "epoch": 3.864734299516908, "grad_norm": 7.404515743255615, "learning_rate": 7.13365539452496e-05, "loss": 25.3168, "step": 1600 }, { "epoch": 3.869565217391304, "grad_norm": 7.142045497894287, "learning_rate": 7.11755233494364e-05, "loss": 24.6409, "step": 1602 }, { "epoch": 3.8743961352657004, "grad_norm": 7.02120304107666, "learning_rate": 7.101449275362319e-05, "loss": 27.3518, "step": 1604 }, { "epoch": 3.8792270531400965, "grad_norm": 7.604321002960205, "learning_rate": 7.085346215781e-05, "loss": 26.8926, "step": 1606 }, { "epoch": 3.8840579710144927, "grad_norm": 7.089973449707031, "learning_rate": 7.069243156199678e-05, "loss": 24.8074, "step": 1608 }, { "epoch": 3.888888888888889, "grad_norm": 8.049272537231445, "learning_rate": 7.053140096618357e-05, "loss": 25.8524, "step": 1610 }, { "epoch": 3.893719806763285, "grad_norm": 7.1630144119262695, "learning_rate": 7.037037037037038e-05, "loss": 26.9071, "step": 1612 }, { "epoch": 3.898550724637681, "grad_norm": 6.2005510330200195, "learning_rate": 7.020933977455717e-05, "loss": 27.107, "step": 1614 }, { "epoch": 3.9033816425120773, "grad_norm": 8.320915222167969, "learning_rate": 
7.004830917874396e-05, "loss": 27.0582, "step": 1616 }, { "epoch": 3.9082125603864735, "grad_norm": 6.952855110168457, "learning_rate": 6.988727858293076e-05, "loss": 26.4762, "step": 1618 }, { "epoch": 3.9130434782608696, "grad_norm": 7.927274227142334, "learning_rate": 6.972624798711755e-05, "loss": 24.2706, "step": 1620 }, { "epoch": 3.917874396135266, "grad_norm": 7.922103404998779, "learning_rate": 6.956521739130436e-05, "loss": 25.8205, "step": 1622 }, { "epoch": 3.922705314009662, "grad_norm": 7.824489116668701, "learning_rate": 6.940418679549115e-05, "loss": 26.4827, "step": 1624 }, { "epoch": 3.927536231884058, "grad_norm": 6.419587135314941, "learning_rate": 6.924315619967794e-05, "loss": 28.2682, "step": 1626 }, { "epoch": 3.9323671497584543, "grad_norm": 8.104780197143555, "learning_rate": 6.908212560386474e-05, "loss": 25.7621, "step": 1628 }, { "epoch": 3.9371980676328504, "grad_norm": 7.307147979736328, "learning_rate": 6.892109500805153e-05, "loss": 27.1352, "step": 1630 }, { "epoch": 3.942028985507246, "grad_norm": 7.069173812866211, "learning_rate": 6.876006441223832e-05, "loss": 25.538, "step": 1632 }, { "epoch": 3.9468599033816423, "grad_norm": 7.971487522125244, "learning_rate": 6.859903381642512e-05, "loss": 26.754, "step": 1634 }, { "epoch": 3.9516908212560384, "grad_norm": 7.200797080993652, "learning_rate": 6.843800322061193e-05, "loss": 25.5438, "step": 1636 }, { "epoch": 3.9565217391304346, "grad_norm": 8.00469970703125, "learning_rate": 6.827697262479872e-05, "loss": 26.6568, "step": 1638 }, { "epoch": 3.9613526570048307, "grad_norm": 6.9250359535217285, "learning_rate": 6.811594202898552e-05, "loss": 25.4743, "step": 1640 }, { "epoch": 3.966183574879227, "grad_norm": 6.473790168762207, "learning_rate": 6.795491143317231e-05, "loss": 25.9443, "step": 1642 }, { "epoch": 3.971014492753623, "grad_norm": 8.05759048461914, "learning_rate": 6.77938808373591e-05, "loss": 25.5339, "step": 1644 }, { "epoch": 3.975845410628019, "grad_norm": 
7.342809200286865, "learning_rate": 6.76328502415459e-05, "loss": 24.9969, "step": 1646 }, { "epoch": 3.9806763285024154, "grad_norm": 7.265125274658203, "learning_rate": 6.747181964573269e-05, "loss": 27.3797, "step": 1648 }, { "epoch": 3.9855072463768115, "grad_norm": 7.021026134490967, "learning_rate": 6.731078904991948e-05, "loss": 26.1091, "step": 1650 }, { "epoch": 3.9903381642512077, "grad_norm": 7.2072529792785645, "learning_rate": 6.714975845410629e-05, "loss": 27.5467, "step": 1652 }, { "epoch": 3.995169082125604, "grad_norm": 7.393160820007324, "learning_rate": 6.698872785829308e-05, "loss": 26.9415, "step": 1654 }, { "epoch": 4.0, "grad_norm": 7.511723518371582, "learning_rate": 6.682769726247987e-05, "loss": 26.0607, "step": 1656 }, { "epoch": 4.004830917874396, "grad_norm": 8.766012191772461, "learning_rate": 6.666666666666667e-05, "loss": 27.5218, "step": 1658 }, { "epoch": 4.009661835748792, "grad_norm": 6.866961479187012, "learning_rate": 6.650563607085346e-05, "loss": 26.818, "step": 1660 }, { "epoch": 4.0144927536231885, "grad_norm": 7.680884838104248, "learning_rate": 6.634460547504025e-05, "loss": 26.8022, "step": 1662 }, { "epoch": 4.019323671497585, "grad_norm": 7.392796039581299, "learning_rate": 6.618357487922707e-05, "loss": 26.8426, "step": 1664 }, { "epoch": 4.024154589371981, "grad_norm": 7.595928192138672, "learning_rate": 6.602254428341386e-05, "loss": 24.3986, "step": 1666 }, { "epoch": 4.028985507246377, "grad_norm": 7.379922866821289, "learning_rate": 6.586151368760065e-05, "loss": 28.0114, "step": 1668 }, { "epoch": 4.033816425120773, "grad_norm": 7.208115100860596, "learning_rate": 6.570048309178745e-05, "loss": 27.5758, "step": 1670 }, { "epoch": 4.038647342995169, "grad_norm": 7.357963562011719, "learning_rate": 6.553945249597424e-05, "loss": 24.8787, "step": 1672 }, { "epoch": 4.043478260869565, "grad_norm": 7.291189670562744, "learning_rate": 6.537842190016103e-05, "loss": 26.2749, "step": 1674 }, { "epoch": 
4.048309178743962, "grad_norm": 7.44353723526001, "learning_rate": 6.521739130434783e-05, "loss": 25.121, "step": 1676 }, { "epoch": 4.053140096618358, "grad_norm": 6.338862419128418, "learning_rate": 6.505636070853462e-05, "loss": 25.7155, "step": 1678 }, { "epoch": 4.057971014492754, "grad_norm": 6.6159162521362305, "learning_rate": 6.489533011272141e-05, "loss": 24.0727, "step": 1680 }, { "epoch": 4.06280193236715, "grad_norm": 6.825524806976318, "learning_rate": 6.473429951690822e-05, "loss": 24.1144, "step": 1682 }, { "epoch": 4.067632850241546, "grad_norm": 6.563850402832031, "learning_rate": 6.457326892109501e-05, "loss": 25.211, "step": 1684 }, { "epoch": 4.072463768115942, "grad_norm": 6.340920925140381, "learning_rate": 6.44122383252818e-05, "loss": 25.9026, "step": 1686 }, { "epoch": 4.0772946859903385, "grad_norm": 6.728626251220703, "learning_rate": 6.42512077294686e-05, "loss": 27.3648, "step": 1688 }, { "epoch": 4.082125603864735, "grad_norm": 6.788083553314209, "learning_rate": 6.409017713365539e-05, "loss": 26.6329, "step": 1690 }, { "epoch": 4.086956521739131, "grad_norm": 7.323519706726074, "learning_rate": 6.39291465378422e-05, "loss": 25.6293, "step": 1692 }, { "epoch": 4.091787439613527, "grad_norm": 6.454324245452881, "learning_rate": 6.376811594202898e-05, "loss": 26.1033, "step": 1694 }, { "epoch": 4.096618357487923, "grad_norm": 6.53643798828125, "learning_rate": 6.360708534621579e-05, "loss": 28.4655, "step": 1696 }, { "epoch": 4.101449275362318, "grad_norm": 8.033370971679688, "learning_rate": 6.344605475040258e-05, "loss": 26.4287, "step": 1698 }, { "epoch": 4.106280193236715, "grad_norm": 6.196560382843018, "learning_rate": 6.328502415458938e-05, "loss": 26.5747, "step": 1700 }, { "epoch": 4.111111111111111, "grad_norm": 6.994458198547363, "learning_rate": 6.312399355877617e-05, "loss": 25.3307, "step": 1702 }, { "epoch": 4.115942028985507, "grad_norm": 7.29825496673584, "learning_rate": 6.296296296296296e-05, "loss": 25.2931, "step": 
1704 }, { "epoch": 4.120772946859903, "grad_norm": 7.366706371307373, "learning_rate": 6.280193236714976e-05, "loss": 25.1327, "step": 1706 }, { "epoch": 4.125603864734299, "grad_norm": 7.066011428833008, "learning_rate": 6.264090177133655e-05, "loss": 27.8359, "step": 1708 }, { "epoch": 4.130434782608695, "grad_norm": 7.165285587310791, "learning_rate": 6.247987117552336e-05, "loss": 26.1166, "step": 1710 }, { "epoch": 4.1352657004830915, "grad_norm": 6.823864936828613, "learning_rate": 6.231884057971015e-05, "loss": 27.5943, "step": 1712 }, { "epoch": 4.140096618357488, "grad_norm": 7.767164707183838, "learning_rate": 6.215780998389694e-05, "loss": 24.9854, "step": 1714 }, { "epoch": 4.144927536231884, "grad_norm": 6.458461284637451, "learning_rate": 6.199677938808374e-05, "loss": 26.8271, "step": 1716 }, { "epoch": 4.14975845410628, "grad_norm": 7.082225322723389, "learning_rate": 6.183574879227053e-05, "loss": 25.678, "step": 1718 }, { "epoch": 4.154589371980676, "grad_norm": 7.867661476135254, "learning_rate": 6.167471819645732e-05, "loss": 26.7575, "step": 1720 }, { "epoch": 4.159420289855072, "grad_norm": 7.803908824920654, "learning_rate": 6.151368760064413e-05, "loss": 27.785, "step": 1722 }, { "epoch": 4.164251207729468, "grad_norm": 7.704416751861572, "learning_rate": 6.135265700483092e-05, "loss": 26.4086, "step": 1724 }, { "epoch": 4.169082125603865, "grad_norm": 7.166048049926758, "learning_rate": 6.119162640901772e-05, "loss": 25.6944, "step": 1726 }, { "epoch": 4.173913043478261, "grad_norm": 7.665358066558838, "learning_rate": 6.1030595813204516e-05, "loss": 25.7421, "step": 1728 }, { "epoch": 4.178743961352657, "grad_norm": 6.582197666168213, "learning_rate": 6.086956521739131e-05, "loss": 24.2085, "step": 1730 }, { "epoch": 4.183574879227053, "grad_norm": 6.641133785247803, "learning_rate": 6.07085346215781e-05, "loss": 26.5714, "step": 1732 }, { "epoch": 4.188405797101449, "grad_norm": 8.203088760375977, "learning_rate": 6.05475040257649e-05, 
"loss": 24.042, "step": 1734 }, { "epoch": 4.193236714975845, "grad_norm": 7.593963146209717, "learning_rate": 6.0386473429951696e-05, "loss": 26.1024, "step": 1736 }, { "epoch": 4.1980676328502415, "grad_norm": 6.2828450202941895, "learning_rate": 6.022544283413849e-05, "loss": 25.6751, "step": 1738 }, { "epoch": 4.202898550724638, "grad_norm": 7.936067581176758, "learning_rate": 6.006441223832528e-05, "loss": 26.5624, "step": 1740 }, { "epoch": 4.207729468599034, "grad_norm": 7.069867134094238, "learning_rate": 5.990338164251208e-05, "loss": 26.0663, "step": 1742 }, { "epoch": 4.21256038647343, "grad_norm": 7.237870693206787, "learning_rate": 5.9742351046698876e-05, "loss": 26.6947, "step": 1744 }, { "epoch": 4.217391304347826, "grad_norm": 6.671788692474365, "learning_rate": 5.9581320450885666e-05, "loss": 26.2133, "step": 1746 }, { "epoch": 4.222222222222222, "grad_norm": 6.456491947174072, "learning_rate": 5.942028985507246e-05, "loss": 25.4374, "step": 1748 }, { "epoch": 4.2270531400966185, "grad_norm": 6.428054332733154, "learning_rate": 5.925925925925926e-05, "loss": 27.421, "step": 1750 }, { "epoch": 4.231884057971015, "grad_norm": 6.948849678039551, "learning_rate": 5.9098228663446057e-05, "loss": 24.7258, "step": 1752 }, { "epoch": 4.236714975845411, "grad_norm": 7.914185047149658, "learning_rate": 5.8937198067632847e-05, "loss": 26.731, "step": 1754 }, { "epoch": 4.241545893719807, "grad_norm": 6.79870080947876, "learning_rate": 5.877616747181964e-05, "loss": 27.7706, "step": 1756 }, { "epoch": 4.246376811594203, "grad_norm": 7.053183078765869, "learning_rate": 5.861513687600645e-05, "loss": 25.8897, "step": 1758 }, { "epoch": 4.251207729468599, "grad_norm": 7.341165065765381, "learning_rate": 5.8454106280193244e-05, "loss": 25.7238, "step": 1760 }, { "epoch": 4.256038647342995, "grad_norm": 6.499047756195068, "learning_rate": 5.829307568438004e-05, "loss": 26.5338, "step": 1762 }, { "epoch": 4.260869565217392, "grad_norm": 6.891699314117432, 
"learning_rate": 5.813204508856683e-05, "loss": 26.3659, "step": 1764 }, { "epoch": 4.265700483091788, "grad_norm": 6.726503849029541, "learning_rate": 5.797101449275363e-05, "loss": 24.1357, "step": 1766 }, { "epoch": 4.270531400966184, "grad_norm": 7.38776159286499, "learning_rate": 5.7809983896940424e-05, "loss": 25.8197, "step": 1768 }, { "epoch": 4.27536231884058, "grad_norm": 6.880035400390625, "learning_rate": 5.764895330112722e-05, "loss": 26.0566, "step": 1770 }, { "epoch": 4.280193236714976, "grad_norm": 6.925288677215576, "learning_rate": 5.748792270531401e-05, "loss": 27.931, "step": 1772 }, { "epoch": 4.285024154589372, "grad_norm": 8.501145362854004, "learning_rate": 5.732689210950081e-05, "loss": 24.3897, "step": 1774 }, { "epoch": 4.2898550724637685, "grad_norm": 7.33554744720459, "learning_rate": 5.7165861513687604e-05, "loss": 26.0568, "step": 1776 }, { "epoch": 4.294685990338165, "grad_norm": 6.757916450500488, "learning_rate": 5.7004830917874394e-05, "loss": 26.4327, "step": 1778 }, { "epoch": 4.29951690821256, "grad_norm": 7.093183517456055, "learning_rate": 5.684380032206119e-05, "loss": 25.8601, "step": 1780 }, { "epoch": 4.304347826086957, "grad_norm": 5.872477054595947, "learning_rate": 5.668276972624799e-05, "loss": 26.353, "step": 1782 }, { "epoch": 4.309178743961352, "grad_norm": 5.949990272521973, "learning_rate": 5.652173913043478e-05, "loss": 27.0481, "step": 1784 }, { "epoch": 4.314009661835748, "grad_norm": 6.953137397766113, "learning_rate": 5.6360708534621574e-05, "loss": 26.569, "step": 1786 }, { "epoch": 4.318840579710145, "grad_norm": 7.875227928161621, "learning_rate": 5.6199677938808385e-05, "loss": 23.7947, "step": 1788 }, { "epoch": 4.323671497584541, "grad_norm": 6.187444686889648, "learning_rate": 5.6038647342995175e-05, "loss": 25.1237, "step": 1790 }, { "epoch": 4.328502415458937, "grad_norm": 6.970160961151123, "learning_rate": 5.587761674718197e-05, "loss": 25.7053, "step": 1792 }, { "epoch": 4.333333333333333, 
"grad_norm": 6.903000831604004, "learning_rate": 5.571658615136877e-05, "loss": 26.7737, "step": 1794 }, { "epoch": 4.338164251207729, "grad_norm": 7.370026111602783, "learning_rate": 5.555555555555556e-05, "loss": 25.3098, "step": 1796 }, { "epoch": 4.342995169082125, "grad_norm": 6.926233768463135, "learning_rate": 5.5394524959742355e-05, "loss": 24.9499, "step": 1798 }, { "epoch": 4.3478260869565215, "grad_norm": 6.8403544425964355, "learning_rate": 5.523349436392915e-05, "loss": 25.6363, "step": 1800 }, { "epoch": 4.352657004830918, "grad_norm": 7.1537089347839355, "learning_rate": 5.507246376811594e-05, "loss": 24.911, "step": 1802 }, { "epoch": 4.357487922705314, "grad_norm": 6.798279285430908, "learning_rate": 5.491143317230274e-05, "loss": 27.4748, "step": 1804 }, { "epoch": 4.36231884057971, "grad_norm": 5.993078231811523, "learning_rate": 5.4750402576489535e-05, "loss": 27.1362, "step": 1806 }, { "epoch": 4.367149758454106, "grad_norm": 7.4096574783325195, "learning_rate": 5.458937198067633e-05, "loss": 26.0591, "step": 1808 }, { "epoch": 4.371980676328502, "grad_norm": 6.903232574462891, "learning_rate": 5.442834138486312e-05, "loss": 26.8211, "step": 1810 }, { "epoch": 4.3768115942028984, "grad_norm": 7.838393211364746, "learning_rate": 5.426731078904992e-05, "loss": 26.2384, "step": 1812 }, { "epoch": 4.381642512077295, "grad_norm": 7.33106803894043, "learning_rate": 5.4106280193236716e-05, "loss": 26.5385, "step": 1814 }, { "epoch": 4.386473429951691, "grad_norm": 6.619305610656738, "learning_rate": 5.3945249597423505e-05, "loss": 22.8647, "step": 1816 }, { "epoch": 4.391304347826087, "grad_norm": 7.007352352142334, "learning_rate": 5.37842190016103e-05, "loss": 28.2365, "step": 1818 }, { "epoch": 4.396135265700483, "grad_norm": 7.026554584503174, "learning_rate": 5.3623188405797106e-05, "loss": 25.9467, "step": 1820 }, { "epoch": 4.400966183574879, "grad_norm": 8.395278930664062, "learning_rate": 5.34621578099839e-05, "loss": 24.3139, "step": 1822 }, 
{ "epoch": 4.405797101449275, "grad_norm": 6.9680495262146, "learning_rate": 5.33011272141707e-05, "loss": 24.9039, "step": 1824 }, { "epoch": 4.4106280193236715, "grad_norm": 7.212375164031982, "learning_rate": 5.3140096618357496e-05, "loss": 25.4465, "step": 1826 }, { "epoch": 4.415458937198068, "grad_norm": 6.966728210449219, "learning_rate": 5.2979066022544286e-05, "loss": 25.7792, "step": 1828 }, { "epoch": 4.420289855072464, "grad_norm": 6.4454522132873535, "learning_rate": 5.281803542673108e-05, "loss": 25.9821, "step": 1830 }, { "epoch": 4.42512077294686, "grad_norm": 7.032574653625488, "learning_rate": 5.265700483091788e-05, "loss": 26.4527, "step": 1832 }, { "epoch": 4.429951690821256, "grad_norm": 7.715813159942627, "learning_rate": 5.249597423510467e-05, "loss": 25.1597, "step": 1834 }, { "epoch": 4.434782608695652, "grad_norm": 8.366538047790527, "learning_rate": 5.2334943639291466e-05, "loss": 26.6993, "step": 1836 }, { "epoch": 4.4396135265700485, "grad_norm": 6.7702484130859375, "learning_rate": 5.217391304347826e-05, "loss": 25.7026, "step": 1838 }, { "epoch": 4.444444444444445, "grad_norm": 7.936936378479004, "learning_rate": 5.201288244766506e-05, "loss": 26.0788, "step": 1840 }, { "epoch": 4.449275362318841, "grad_norm": 9.027806282043457, "learning_rate": 5.185185185185185e-05, "loss": 26.8397, "step": 1842 }, { "epoch": 4.454106280193237, "grad_norm": 7.541802406311035, "learning_rate": 5.1690821256038647e-05, "loss": 25.2463, "step": 1844 }, { "epoch": 4.458937198067633, "grad_norm": 6.402732849121094, "learning_rate": 5.152979066022544e-05, "loss": 26.2031, "step": 1846 }, { "epoch": 4.463768115942029, "grad_norm": 9.23645305633545, "learning_rate": 5.136876006441223e-05, "loss": 25.5027, "step": 1848 }, { "epoch": 4.468599033816425, "grad_norm": 7.548840045928955, "learning_rate": 5.1207729468599044e-05, "loss": 25.3803, "step": 1850 }, { "epoch": 4.473429951690822, "grad_norm": 6.839424133300781, "learning_rate": 5.1046698872785834e-05, 
"loss": 25.4142, "step": 1852 }, { "epoch": 4.478260869565218, "grad_norm": 6.8843512535095215, "learning_rate": 5.088566827697263e-05, "loss": 25.0163, "step": 1854 }, { "epoch": 4.483091787439614, "grad_norm": 6.359217643737793, "learning_rate": 5.072463768115943e-05, "loss": 26.7153, "step": 1856 }, { "epoch": 4.48792270531401, "grad_norm": 7.048843860626221, "learning_rate": 5.0563607085346224e-05, "loss": 25.6665, "step": 1858 }, { "epoch": 4.492753623188406, "grad_norm": 7.086437702178955, "learning_rate": 5.0402576489533014e-05, "loss": 26.86, "step": 1860 }, { "epoch": 4.4975845410628015, "grad_norm": 6.8362507820129395, "learning_rate": 5.024154589371981e-05, "loss": 26.5117, "step": 1862 }, { "epoch": 4.5024154589371985, "grad_norm": 6.434200763702393, "learning_rate": 5.008051529790661e-05, "loss": 25.2265, "step": 1864 }, { "epoch": 4.507246376811594, "grad_norm": 8.030712127685547, "learning_rate": 4.99194847020934e-05, "loss": 24.4644, "step": 1866 }, { "epoch": 4.512077294685991, "grad_norm": 7.7696051597595215, "learning_rate": 4.9758454106280194e-05, "loss": 27.1793, "step": 1868 }, { "epoch": 4.516908212560386, "grad_norm": 6.404499530792236, "learning_rate": 4.959742351046699e-05, "loss": 26.4276, "step": 1870 }, { "epoch": 4.521739130434782, "grad_norm": 7.412373065948486, "learning_rate": 4.943639291465378e-05, "loss": 25.6971, "step": 1872 }, { "epoch": 4.526570048309178, "grad_norm": 7.425329685211182, "learning_rate": 4.9275362318840584e-05, "loss": 23.4725, "step": 1874 }, { "epoch": 4.531400966183575, "grad_norm": 6.722659587860107, "learning_rate": 4.911433172302738e-05, "loss": 26.158, "step": 1876 }, { "epoch": 4.536231884057971, "grad_norm": 7.206009387969971, "learning_rate": 4.895330112721417e-05, "loss": 26.8682, "step": 1878 }, { "epoch": 4.541062801932367, "grad_norm": 7.180261135101318, "learning_rate": 4.879227053140097e-05, "loss": 26.191, "step": 1880 }, { "epoch": 4.545893719806763, "grad_norm": 7.371028900146484, 
"learning_rate": 4.8631239935587765e-05, "loss": 23.7948, "step": 1882 }, { "epoch": 4.550724637681159, "grad_norm": 6.874049663543701, "learning_rate": 4.847020933977456e-05, "loss": 26.638, "step": 1884 }, { "epoch": 4.555555555555555, "grad_norm": 7.5235795974731445, "learning_rate": 4.830917874396135e-05, "loss": 26.9283, "step": 1886 }, { "epoch": 4.5603864734299515, "grad_norm": 7.371413707733154, "learning_rate": 4.814814814814815e-05, "loss": 26.5723, "step": 1888 }, { "epoch": 4.565217391304348, "grad_norm": 6.487553119659424, "learning_rate": 4.7987117552334945e-05, "loss": 25.9711, "step": 1890 }, { "epoch": 4.570048309178744, "grad_norm": 6.800736427307129, "learning_rate": 4.782608695652174e-05, "loss": 23.376, "step": 1892 }, { "epoch": 4.57487922705314, "grad_norm": 7.149484634399414, "learning_rate": 4.766505636070854e-05, "loss": 26.0393, "step": 1894 }, { "epoch": 4.579710144927536, "grad_norm": 7.532267093658447, "learning_rate": 4.7504025764895335e-05, "loss": 26.1861, "step": 1896 }, { "epoch": 4.584541062801932, "grad_norm": 7.492485046386719, "learning_rate": 4.7342995169082125e-05, "loss": 26.4485, "step": 1898 }, { "epoch": 4.5893719806763285, "grad_norm": 6.885655879974365, "learning_rate": 4.718196457326892e-05, "loss": 25.1794, "step": 1900 }, { "epoch": 4.594202898550725, "grad_norm": 6.430235862731934, "learning_rate": 4.702093397745572e-05, "loss": 25.6459, "step": 1902 }, { "epoch": 4.599033816425121, "grad_norm": 6.470332145690918, "learning_rate": 4.6859903381642516e-05, "loss": 26.0056, "step": 1904 }, { "epoch": 4.603864734299517, "grad_norm": 6.93711519241333, "learning_rate": 4.669887278582931e-05, "loss": 26.8588, "step": 1906 }, { "epoch": 4.608695652173913, "grad_norm": 7.658902168273926, "learning_rate": 4.653784219001611e-05, "loss": 25.5478, "step": 1908 }, { "epoch": 4.613526570048309, "grad_norm": 7.67640495300293, "learning_rate": 4.63768115942029e-05, "loss": 26.2871, "step": 1910 }, { "epoch": 4.618357487922705, 
"grad_norm": 7.06746244430542, "learning_rate": 4.6215780998389696e-05, "loss": 25.8891, "step": 1912 }, { "epoch": 4.6231884057971016, "grad_norm": 7.047806739807129, "learning_rate": 4.605475040257649e-05, "loss": 27.2715, "step": 1914 }, { "epoch": 4.628019323671498, "grad_norm": 7.097225189208984, "learning_rate": 4.589371980676328e-05, "loss": 24.1438, "step": 1916 }, { "epoch": 4.632850241545894, "grad_norm": 7.487665176391602, "learning_rate": 4.573268921095008e-05, "loss": 25.5925, "step": 1918 }, { "epoch": 4.63768115942029, "grad_norm": 6.561511516571045, "learning_rate": 4.557165861513688e-05, "loss": 27.0304, "step": 1920 }, { "epoch": 4.642512077294686, "grad_norm": 7.644463539123535, "learning_rate": 4.541062801932367e-05, "loss": 26.007, "step": 1922 }, { "epoch": 4.647342995169082, "grad_norm": 7.329721927642822, "learning_rate": 4.524959742351047e-05, "loss": 23.3239, "step": 1924 }, { "epoch": 4.6521739130434785, "grad_norm": 6.725891590118408, "learning_rate": 4.5088566827697266e-05, "loss": 25.8835, "step": 1926 }, { "epoch": 4.657004830917875, "grad_norm": 7.27399206161499, "learning_rate": 4.492753623188406e-05, "loss": 26.4273, "step": 1928 }, { "epoch": 4.661835748792271, "grad_norm": 6.614084720611572, "learning_rate": 4.476650563607085e-05, "loss": 25.8323, "step": 1930 }, { "epoch": 4.666666666666667, "grad_norm": 6.703570365905762, "learning_rate": 4.460547504025765e-05, "loss": 25.0444, "step": 1932 }, { "epoch": 4.671497584541063, "grad_norm": 7.8840556144714355, "learning_rate": 4.4444444444444447e-05, "loss": 26.0548, "step": 1934 }, { "epoch": 4.676328502415459, "grad_norm": 6.566593170166016, "learning_rate": 4.428341384863124e-05, "loss": 25.8758, "step": 1936 }, { "epoch": 4.681159420289855, "grad_norm": 6.961997985839844, "learning_rate": 4.412238325281804e-05, "loss": 26.1125, "step": 1938 }, { "epoch": 4.685990338164252, "grad_norm": 8.170991897583008, "learning_rate": 4.396135265700484e-05, "loss": 27.3513, "step": 1940 }, { 
"epoch": 4.690821256038648, "grad_norm": 6.823581218719482, "learning_rate": 4.380032206119163e-05, "loss": 25.9433, "step": 1942 }, { "epoch": 4.695652173913043, "grad_norm": 7.356668949127197, "learning_rate": 4.3639291465378424e-05, "loss": 27.2403, "step": 1944 }, { "epoch": 4.70048309178744, "grad_norm": 7.08234977722168, "learning_rate": 4.347826086956522e-05, "loss": 25.7369, "step": 1946 }, { "epoch": 4.705314009661835, "grad_norm": 6.981078147888184, "learning_rate": 4.331723027375201e-05, "loss": 25.5263, "step": 1948 }, { "epoch": 4.710144927536232, "grad_norm": 6.724111080169678, "learning_rate": 4.315619967793881e-05, "loss": 25.9395, "step": 1950 }, { "epoch": 4.714975845410628, "grad_norm": 6.105647563934326, "learning_rate": 4.299516908212561e-05, "loss": 26.302, "step": 1952 }, { "epoch": 4.719806763285024, "grad_norm": 7.321731090545654, "learning_rate": 4.28341384863124e-05, "loss": 25.4126, "step": 1954 }, { "epoch": 4.72463768115942, "grad_norm": 6.488819599151611, "learning_rate": 4.26731078904992e-05, "loss": 26.1355, "step": 1956 }, { "epoch": 4.729468599033816, "grad_norm": 6.578047752380371, "learning_rate": 4.2512077294685994e-05, "loss": 27.0345, "step": 1958 }, { "epoch": 4.734299516908212, "grad_norm": 6.070748805999756, "learning_rate": 4.2351046698872784e-05, "loss": 24.2139, "step": 1960 }, { "epoch": 4.739130434782608, "grad_norm": 6.960094451904297, "learning_rate": 4.219001610305958e-05, "loss": 26.6307, "step": 1962 }, { "epoch": 4.743961352657005, "grad_norm": 6.557458877563477, "learning_rate": 4.202898550724638e-05, "loss": 24.7468, "step": 1964 }, { "epoch": 4.748792270531401, "grad_norm": 7.3893656730651855, "learning_rate": 4.1867954911433174e-05, "loss": 25.1312, "step": 1966 }, { "epoch": 4.753623188405797, "grad_norm": 7.08898401260376, "learning_rate": 4.170692431561997e-05, "loss": 26.4543, "step": 1968 }, { "epoch": 4.758454106280193, "grad_norm": 7.590085029602051, "learning_rate": 4.154589371980677e-05, "loss": 
26.8427, "step": 1970 }, { "epoch": 4.763285024154589, "grad_norm": 6.841743469238281, "learning_rate": 4.1384863123993565e-05, "loss": 25.9606, "step": 1972 }, { "epoch": 4.768115942028985, "grad_norm": 7.612220764160156, "learning_rate": 4.1223832528180355e-05, "loss": 27.338, "step": 1974 }, { "epoch": 4.7729468599033815, "grad_norm": 6.759093761444092, "learning_rate": 4.106280193236715e-05, "loss": 26.6189, "step": 1976 }, { "epoch": 4.777777777777778, "grad_norm": 7.5177226066589355, "learning_rate": 4.090177133655395e-05, "loss": 26.6014, "step": 1978 }, { "epoch": 4.782608695652174, "grad_norm": 6.755998611450195, "learning_rate": 4.074074074074074e-05, "loss": 24.8074, "step": 1980 }, { "epoch": 4.78743961352657, "grad_norm": 7.969665050506592, "learning_rate": 4.057971014492754e-05, "loss": 26.8168, "step": 1982 }, { "epoch": 4.792270531400966, "grad_norm": 6.537661552429199, "learning_rate": 4.041867954911434e-05, "loss": 26.8878, "step": 1984 }, { "epoch": 4.797101449275362, "grad_norm": 7.462778091430664, "learning_rate": 4.025764895330113e-05, "loss": 26.6395, "step": 1986 }, { "epoch": 4.8019323671497585, "grad_norm": 7.199199199676514, "learning_rate": 4.0096618357487925e-05, "loss": 26.6467, "step": 1988 }, { "epoch": 4.806763285024155, "grad_norm": 6.970396995544434, "learning_rate": 3.993558776167472e-05, "loss": 27.3238, "step": 1990 }, { "epoch": 4.811594202898551, "grad_norm": 6.526374340057373, "learning_rate": 3.977455716586151e-05, "loss": 26.7327, "step": 1992 }, { "epoch": 4.816425120772947, "grad_norm": 7.019384384155273, "learning_rate": 3.961352657004831e-05, "loss": 24.6199, "step": 1994 }, { "epoch": 4.821256038647343, "grad_norm": 7.474978923797607, "learning_rate": 3.9452495974235105e-05, "loss": 26.8535, "step": 1996 }, { "epoch": 4.826086956521739, "grad_norm": 7.651355266571045, "learning_rate": 3.92914653784219e-05, "loss": 25.2036, "step": 1998 }, { "epoch": 4.830917874396135, "grad_norm": 6.540372848510742, "learning_rate": 
3.91304347826087e-05, "loss": 26.1222, "step": 2000 }, { "epoch": 4.835748792270532, "grad_norm": 7.769553184509277, "learning_rate": 3.8969404186795496e-05, "loss": 25.3271, "step": 2002 }, { "epoch": 4.840579710144928, "grad_norm": 7.059219837188721, "learning_rate": 3.8808373590982286e-05, "loss": 27.5878, "step": 2004 }, { "epoch": 4.845410628019324, "grad_norm": 7.040493011474609, "learning_rate": 3.864734299516908e-05, "loss": 24.8298, "step": 2006 }, { "epoch": 4.85024154589372, "grad_norm": 6.8158111572265625, "learning_rate": 3.848631239935588e-05, "loss": 25.9933, "step": 2008 }, { "epoch": 4.855072463768116, "grad_norm": 6.576706886291504, "learning_rate": 3.8325281803542676e-05, "loss": 25.7341, "step": 2010 }, { "epoch": 4.859903381642512, "grad_norm": 6.51364803314209, "learning_rate": 3.8164251207729466e-05, "loss": 27.156, "step": 2012 }, { "epoch": 4.8647342995169085, "grad_norm": 7.035210609436035, "learning_rate": 3.800322061191627e-05, "loss": 25.8662, "step": 2014 }, { "epoch": 4.869565217391305, "grad_norm": 8.57784366607666, "learning_rate": 3.784219001610306e-05, "loss": 27.2607, "step": 2016 }, { "epoch": 4.874396135265701, "grad_norm": 7.060666084289551, "learning_rate": 3.7681159420289856e-05, "loss": 25.5058, "step": 2018 }, { "epoch": 4.879227053140097, "grad_norm": 6.544167995452881, "learning_rate": 3.752012882447665e-05, "loss": 27.5042, "step": 2020 }, { "epoch": 4.884057971014493, "grad_norm": 7.82602071762085, "learning_rate": 3.735909822866345e-05, "loss": 25.1871, "step": 2022 }, { "epoch": 4.888888888888889, "grad_norm": 6.692302227020264, "learning_rate": 3.719806763285024e-05, "loss": 24.8878, "step": 2024 }, { "epoch": 4.8937198067632846, "grad_norm": 6.907380104064941, "learning_rate": 3.7037037037037037e-05, "loss": 26.2569, "step": 2026 }, { "epoch": 4.898550724637682, "grad_norm": 6.529886245727539, "learning_rate": 3.687600644122383e-05, "loss": 25.2621, "step": 2028 }, { "epoch": 4.903381642512077, "grad_norm": 
8.162117958068848, "learning_rate": 3.671497584541063e-05, "loss": 25.3648, "step": 2030 }, { "epoch": 4.908212560386474, "grad_norm": 7.2825422286987305, "learning_rate": 3.655394524959743e-05, "loss": 25.0396, "step": 2032 }, { "epoch": 4.913043478260869, "grad_norm": 7.4677886962890625, "learning_rate": 3.6392914653784224e-05, "loss": 26.3367, "step": 2034 }, { "epoch": 4.917874396135265, "grad_norm": 6.709794521331787, "learning_rate": 3.6231884057971014e-05, "loss": 25.5507, "step": 2036 }, { "epoch": 4.9227053140096615, "grad_norm": 6.555368423461914, "learning_rate": 3.607085346215781e-05, "loss": 24.5421, "step": 2038 }, { "epoch": 4.927536231884058, "grad_norm": 6.405154705047607, "learning_rate": 3.590982286634461e-05, "loss": 25.699, "step": 2040 }, { "epoch": 4.932367149758454, "grad_norm": 7.2418012619018555, "learning_rate": 3.57487922705314e-05, "loss": 25.3032, "step": 2042 }, { "epoch": 4.93719806763285, "grad_norm": 7.165282726287842, "learning_rate": 3.55877616747182e-05, "loss": 27.57, "step": 2044 }, { "epoch": 4.942028985507246, "grad_norm": 8.555087089538574, "learning_rate": 3.5426731078905e-05, "loss": 25.6545, "step": 2046 }, { "epoch": 4.946859903381642, "grad_norm": 7.7885613441467285, "learning_rate": 3.526570048309179e-05, "loss": 25.3195, "step": 2048 }, { "epoch": 4.951690821256038, "grad_norm": 6.383197784423828, "learning_rate": 3.5104669887278584e-05, "loss": 27.5458, "step": 2050 }, { "epoch": 4.956521739130435, "grad_norm": 7.210457801818848, "learning_rate": 3.494363929146538e-05, "loss": 26.0986, "step": 2052 }, { "epoch": 4.961352657004831, "grad_norm": 6.477179050445557, "learning_rate": 3.478260869565218e-05, "loss": 26.3113, "step": 2054 }, { "epoch": 4.966183574879227, "grad_norm": 6.750316619873047, "learning_rate": 3.462157809983897e-05, "loss": 26.5696, "step": 2056 }, { "epoch": 4.971014492753623, "grad_norm": 6.577611923217773, "learning_rate": 3.4460547504025764e-05, "loss": 26.4256, "step": 2058 }, { "epoch": 
4.975845410628019, "grad_norm": 7.024559020996094, "learning_rate": 3.429951690821256e-05, "loss": 24.9867, "step": 2060 }, { "epoch": 4.980676328502415, "grad_norm": 7.051502704620361, "learning_rate": 3.413848631239936e-05, "loss": 25.3915, "step": 2062 }, { "epoch": 4.9855072463768115, "grad_norm": 7.6836838722229, "learning_rate": 3.3977455716586155e-05, "loss": 24.8861, "step": 2064 }, { "epoch": 4.990338164251208, "grad_norm": 7.69392204284668, "learning_rate": 3.381642512077295e-05, "loss": 26.2732, "step": 2066 }, { "epoch": 4.995169082125604, "grad_norm": 7.139024257659912, "learning_rate": 3.365539452495974e-05, "loss": 24.6846, "step": 2068 }, { "epoch": 5.0, "grad_norm": 6.70409631729126, "learning_rate": 3.349436392914654e-05, "loss": 27.4742, "step": 2070 }, { "epoch": 5.004830917874396, "grad_norm": 6.803808212280273, "learning_rate": 3.3333333333333335e-05, "loss": 24.7629, "step": 2072 }, { "epoch": 5.009661835748792, "grad_norm": 6.341485977172852, "learning_rate": 3.3172302737520125e-05, "loss": 27.7422, "step": 2074 }, { "epoch": 5.0144927536231885, "grad_norm": 6.5449066162109375, "learning_rate": 3.301127214170693e-05, "loss": 26.3897, "step": 2076 }, { "epoch": 5.019323671497585, "grad_norm": 6.326546669006348, "learning_rate": 3.2850241545893725e-05, "loss": 26.7782, "step": 2078 }, { "epoch": 5.024154589371981, "grad_norm": 7.492796897888184, "learning_rate": 3.2689210950080515e-05, "loss": 25.2565, "step": 2080 }, { "epoch": 5.028985507246377, "grad_norm": 7.679995536804199, "learning_rate": 3.252818035426731e-05, "loss": 25.2879, "step": 2082 }, { "epoch": 5.033816425120773, "grad_norm": 6.634117126464844, "learning_rate": 3.236714975845411e-05, "loss": 27.5415, "step": 2084 }, { "epoch": 5.038647342995169, "grad_norm": 6.707841873168945, "learning_rate": 3.22061191626409e-05, "loss": 26.2413, "step": 2086 }, { "epoch": 5.043478260869565, "grad_norm": 7.303376197814941, "learning_rate": 3.2045088566827695e-05, "loss": 22.9713, "step": 
2088 }, { "epoch": 5.048309178743962, "grad_norm": 5.641716957092285, "learning_rate": 3.188405797101449e-05, "loss": 23.7231, "step": 2090 }, { "epoch": 5.053140096618358, "grad_norm": 7.5472636222839355, "learning_rate": 3.172302737520129e-05, "loss": 26.3035, "step": 2092 }, { "epoch": 5.057971014492754, "grad_norm": 6.629962921142578, "learning_rate": 3.1561996779388086e-05, "loss": 27.2519, "step": 2094 }, { "epoch": 5.06280193236715, "grad_norm": 6.610307216644287, "learning_rate": 3.140096618357488e-05, "loss": 24.3596, "step": 2096 }, { "epoch": 5.067632850241546, "grad_norm": 8.222330093383789, "learning_rate": 3.123993558776168e-05, "loss": 26.812, "step": 2098 }, { "epoch": 5.072463768115942, "grad_norm": 7.391679763793945, "learning_rate": 3.107890499194847e-05, "loss": 23.9302, "step": 2100 }, { "epoch": 5.0772946859903385, "grad_norm": 7.474515914916992, "learning_rate": 3.0917874396135266e-05, "loss": 26.0697, "step": 2102 }, { "epoch": 5.082125603864735, "grad_norm": 6.373252868652344, "learning_rate": 3.075684380032206e-05, "loss": 26.3179, "step": 2104 }, { "epoch": 5.086956521739131, "grad_norm": 7.464061260223389, "learning_rate": 3.059581320450886e-05, "loss": 25.6336, "step": 2106 }, { "epoch": 5.091787439613527, "grad_norm": 6.995118618011475, "learning_rate": 3.0434782608695656e-05, "loss": 26.2471, "step": 2108 }, { "epoch": 5.096618357487923, "grad_norm": 7.116311550140381, "learning_rate": 3.027375201288245e-05, "loss": 26.614, "step": 2110 }, { "epoch": 5.101449275362318, "grad_norm": 6.943987846374512, "learning_rate": 3.0112721417069246e-05, "loss": 25.0339, "step": 2112 }, { "epoch": 5.106280193236715, "grad_norm": 7.350955009460449, "learning_rate": 2.995169082125604e-05, "loss": 26.2694, "step": 2114 }, { "epoch": 5.111111111111111, "grad_norm": 6.849686622619629, "learning_rate": 2.9790660225442833e-05, "loss": 27.1826, "step": 2116 }, { "epoch": 5.115942028985507, "grad_norm": 7.7651567459106445, "learning_rate": 
2.962962962962963e-05, "loss": 24.8541, "step": 2118 }, { "epoch": 5.120772946859903, "grad_norm": 5.836477279663086, "learning_rate": 2.9468599033816423e-05, "loss": 25.0215, "step": 2120 }, { "epoch": 5.125603864734299, "grad_norm": 6.189184665679932, "learning_rate": 2.9307568438003223e-05, "loss": 26.9194, "step": 2122 }, { "epoch": 5.130434782608695, "grad_norm": 6.857696533203125, "learning_rate": 2.914653784219002e-05, "loss": 26.1156, "step": 2124 }, { "epoch": 5.1352657004830915, "grad_norm": 6.773160934448242, "learning_rate": 2.8985507246376814e-05, "loss": 25.4986, "step": 2126 }, { "epoch": 5.140096618357488, "grad_norm": 8.016234397888184, "learning_rate": 2.882447665056361e-05, "loss": 26.8887, "step": 2128 }, { "epoch": 5.144927536231884, "grad_norm": 7.765948295593262, "learning_rate": 2.8663446054750404e-05, "loss": 24.2822, "step": 2130 }, { "epoch": 5.14975845410628, "grad_norm": 7.044548511505127, "learning_rate": 2.8502415458937197e-05, "loss": 25.6158, "step": 2132 }, { "epoch": 5.154589371980676, "grad_norm": 6.452057361602783, "learning_rate": 2.8341384863123994e-05, "loss": 24.7033, "step": 2134 }, { "epoch": 5.159420289855072, "grad_norm": 6.443338394165039, "learning_rate": 2.8180354267310787e-05, "loss": 25.911, "step": 2136 }, { "epoch": 5.164251207729468, "grad_norm": 7.172874450683594, "learning_rate": 2.8019323671497587e-05, "loss": 25.0313, "step": 2138 }, { "epoch": 5.169082125603865, "grad_norm": 7.001052379608154, "learning_rate": 2.7858293075684384e-05, "loss": 27.4582, "step": 2140 }, { "epoch": 5.173913043478261, "grad_norm": 6.618391513824463, "learning_rate": 2.7697262479871177e-05, "loss": 25.3195, "step": 2142 }, { "epoch": 5.178743961352657, "grad_norm": 7.667540073394775, "learning_rate": 2.753623188405797e-05, "loss": 24.9384, "step": 2144 }, { "epoch": 5.183574879227053, "grad_norm": 7.570556163787842, "learning_rate": 2.7375201288244768e-05, "loss": 25.1559, "step": 2146 }, { "epoch": 5.188405797101449, "grad_norm": 
8.569737434387207, "learning_rate": 2.721417069243156e-05, "loss": 24.5403, "step": 2148 }, { "epoch": 5.193236714975845, "grad_norm": 6.5838623046875, "learning_rate": 2.7053140096618358e-05, "loss": 25.5745, "step": 2150 }, { "epoch": 5.1980676328502415, "grad_norm": 6.626333713531494, "learning_rate": 2.689210950080515e-05, "loss": 25.6172, "step": 2152 }, { "epoch": 5.202898550724638, "grad_norm": 7.9010186195373535, "learning_rate": 2.673107890499195e-05, "loss": 25.8519, "step": 2154 }, { "epoch": 5.207729468599034, "grad_norm": 6.161978244781494, "learning_rate": 2.6570048309178748e-05, "loss": 24.2309, "step": 2156 }, { "epoch": 5.21256038647343, "grad_norm": 6.870685577392578, "learning_rate": 2.640901771336554e-05, "loss": 27.1665, "step": 2158 }, { "epoch": 5.217391304347826, "grad_norm": 7.303822040557861, "learning_rate": 2.6247987117552335e-05, "loss": 25.556, "step": 2160 }, { "epoch": 5.222222222222222, "grad_norm": 6.584065914154053, "learning_rate": 2.608695652173913e-05, "loss": 25.468, "step": 2162 }, { "epoch": 5.2270531400966185, "grad_norm": 7.221360683441162, "learning_rate": 2.5925925925925925e-05, "loss": 25.8624, "step": 2164 }, { "epoch": 5.231884057971015, "grad_norm": 7.08326530456543, "learning_rate": 2.576489533011272e-05, "loss": 26.5428, "step": 2166 }, { "epoch": 5.236714975845411, "grad_norm": 6.360510349273682, "learning_rate": 2.5603864734299522e-05, "loss": 26.695, "step": 2168 }, { "epoch": 5.241545893719807, "grad_norm": 7.52411413192749, "learning_rate": 2.5442834138486315e-05, "loss": 25.9067, "step": 2170 }, { "epoch": 5.246376811594203, "grad_norm": 6.968140602111816, "learning_rate": 2.5281803542673112e-05, "loss": 25.0371, "step": 2172 }, { "epoch": 5.251207729468599, "grad_norm": 7.372687339782715, "learning_rate": 2.5120772946859905e-05, "loss": 26.3727, "step": 2174 }, { "epoch": 5.256038647342995, "grad_norm": 7.292659759521484, "learning_rate": 2.49597423510467e-05, "loss": 26.8115, "step": 2176 }, { "epoch": 
5.260869565217392, "grad_norm": 6.425929546356201, "learning_rate": 2.4798711755233495e-05, "loss": 27.7444, "step": 2178 }, { "epoch": 5.265700483091788, "grad_norm": 7.451976776123047, "learning_rate": 2.4637681159420292e-05, "loss": 26.5838, "step": 2180 }, { "epoch": 5.270531400966184, "grad_norm": 7.282567024230957, "learning_rate": 2.4476650563607086e-05, "loss": 25.9177, "step": 2182 }, { "epoch": 5.27536231884058, "grad_norm": 7.04587459564209, "learning_rate": 2.4315619967793882e-05, "loss": 25.8643, "step": 2184 }, { "epoch": 5.280193236714976, "grad_norm": 7.137731075286865, "learning_rate": 2.4154589371980676e-05, "loss": 26.6203, "step": 2186 }, { "epoch": 5.285024154589372, "grad_norm": 6.674662113189697, "learning_rate": 2.3993558776167472e-05, "loss": 25.3759, "step": 2188 }, { "epoch": 5.2898550724637685, "grad_norm": 6.6438164710998535, "learning_rate": 2.383252818035427e-05, "loss": 24.2837, "step": 2190 }, { "epoch": 5.294685990338165, "grad_norm": 7.651294708251953, "learning_rate": 2.3671497584541063e-05, "loss": 26.9551, "step": 2192 }, { "epoch": 5.29951690821256, "grad_norm": 6.606574058532715, "learning_rate": 2.351046698872786e-05, "loss": 24.8967, "step": 2194 }, { "epoch": 5.304347826086957, "grad_norm": 6.956263065338135, "learning_rate": 2.3349436392914656e-05, "loss": 26.3236, "step": 2196 }, { "epoch": 5.309178743961352, "grad_norm": 7.141554832458496, "learning_rate": 2.318840579710145e-05, "loss": 26.4252, "step": 2198 }, { "epoch": 5.314009661835748, "grad_norm": 6.030832290649414, "learning_rate": 2.3027375201288246e-05, "loss": 28.0596, "step": 2200 }, { "epoch": 5.318840579710145, "grad_norm": 6.431146621704102, "learning_rate": 2.286634460547504e-05, "loss": 26.1359, "step": 2202 }, { "epoch": 5.323671497584541, "grad_norm": 7.26776647567749, "learning_rate": 2.2705314009661836e-05, "loss": 23.1691, "step": 2204 }, { "epoch": 5.328502415458937, "grad_norm": 7.198235988616943, "learning_rate": 2.2544283413848633e-05, "loss": 
24.2558, "step": 2206 }, { "epoch": 5.333333333333333, "grad_norm": 7.205248832702637, "learning_rate": 2.2383252818035427e-05, "loss": 26.1497, "step": 2208 }, { "epoch": 5.338164251207729, "grad_norm": 6.834975242614746, "learning_rate": 2.2222222222222223e-05, "loss": 25.5315, "step": 2210 }, { "epoch": 5.342995169082125, "grad_norm": 6.981115341186523, "learning_rate": 2.206119162640902e-05, "loss": 25.0263, "step": 2212 }, { "epoch": 5.3478260869565215, "grad_norm": 6.798349380493164, "learning_rate": 2.1900161030595813e-05, "loss": 27.7364, "step": 2214 }, { "epoch": 5.352657004830918, "grad_norm": 7.136117458343506, "learning_rate": 2.173913043478261e-05, "loss": 26.0488, "step": 2216 }, { "epoch": 5.357487922705314, "grad_norm": 6.846739768981934, "learning_rate": 2.1578099838969404e-05, "loss": 24.9772, "step": 2218 }, { "epoch": 5.36231884057971, "grad_norm": 7.294228553771973, "learning_rate": 2.14170692431562e-05, "loss": 27.0872, "step": 2220 }, { "epoch": 5.367149758454106, "grad_norm": 7.222455978393555, "learning_rate": 2.1256038647342997e-05, "loss": 25.2418, "step": 2222 }, { "epoch": 5.371980676328502, "grad_norm": 6.867911338806152, "learning_rate": 2.109500805152979e-05, "loss": 26.6485, "step": 2224 }, { "epoch": 5.3768115942028984, "grad_norm": 7.119537353515625, "learning_rate": 2.0933977455716587e-05, "loss": 24.7069, "step": 2226 }, { "epoch": 5.381642512077295, "grad_norm": 6.486376762390137, "learning_rate": 2.0772946859903384e-05, "loss": 25.5981, "step": 2228 }, { "epoch": 5.386473429951691, "grad_norm": 6.030795097351074, "learning_rate": 2.0611916264090177e-05, "loss": 26.4169, "step": 2230 }, { "epoch": 5.391304347826087, "grad_norm": 6.1018171310424805, "learning_rate": 2.0450885668276974e-05, "loss": 25.8114, "step": 2232 }, { "epoch": 5.396135265700483, "grad_norm": 6.3123860359191895, "learning_rate": 2.028985507246377e-05, "loss": 27.2772, "step": 2234 }, { "epoch": 5.400966183574879, "grad_norm": 7.111965179443359, 
"learning_rate": 2.0128824476650564e-05, "loss": 22.8083, "step": 2236 }, { "epoch": 5.405797101449275, "grad_norm": 6.663313865661621, "learning_rate": 1.996779388083736e-05, "loss": 26.1006, "step": 2238 }, { "epoch": 5.4106280193236715, "grad_norm": 7.1827287673950195, "learning_rate": 1.9806763285024154e-05, "loss": 25.9993, "step": 2240 }, { "epoch": 5.415458937198068, "grad_norm": 6.989486217498779, "learning_rate": 1.964573268921095e-05, "loss": 27.1008, "step": 2242 }, { "epoch": 5.420289855072464, "grad_norm": 7.407745361328125, "learning_rate": 1.9484702093397748e-05, "loss": 25.4639, "step": 2244 }, { "epoch": 5.42512077294686, "grad_norm": 6.708901405334473, "learning_rate": 1.932367149758454e-05, "loss": 26.7837, "step": 2246 }, { "epoch": 5.429951690821256, "grad_norm": 6.670323848724365, "learning_rate": 1.9162640901771338e-05, "loss": 25.7905, "step": 2248 }, { "epoch": 5.434782608695652, "grad_norm": 7.481121063232422, "learning_rate": 1.9001610305958135e-05, "loss": 25.6084, "step": 2250 }, { "epoch": 5.4396135265700485, "grad_norm": 7.1586480140686035, "learning_rate": 1.8840579710144928e-05, "loss": 25.4086, "step": 2252 }, { "epoch": 5.444444444444445, "grad_norm": 6.693662166595459, "learning_rate": 1.8679549114331725e-05, "loss": 25.3542, "step": 2254 }, { "epoch": 5.449275362318841, "grad_norm": 6.597439289093018, "learning_rate": 1.8518518518518518e-05, "loss": 24.7352, "step": 2256 }, { "epoch": 5.454106280193237, "grad_norm": 6.5035400390625, "learning_rate": 1.8357487922705315e-05, "loss": 25.9761, "step": 2258 }, { "epoch": 5.458937198067633, "grad_norm": 6.170787811279297, "learning_rate": 1.8196457326892112e-05, "loss": 26.8993, "step": 2260 }, { "epoch": 5.463768115942029, "grad_norm": 6.216879367828369, "learning_rate": 1.8035426731078905e-05, "loss": 23.9469, "step": 2262 }, { "epoch": 5.468599033816425, "grad_norm": 6.804856777191162, "learning_rate": 1.78743961352657e-05, "loss": 27.1758, "step": 2264 }, { "epoch": 
5.473429951690822, "grad_norm": 7.740478038787842, "learning_rate": 1.77133655394525e-05, "loss": 26.7605, "step": 2266 }, { "epoch": 5.478260869565218, "grad_norm": 6.862391471862793, "learning_rate": 1.7552334943639292e-05, "loss": 26.0278, "step": 2268 }, { "epoch": 5.483091787439614, "grad_norm": 6.675685882568359, "learning_rate": 1.739130434782609e-05, "loss": 24.291, "step": 2270 }, { "epoch": 5.48792270531401, "grad_norm": 7.202348232269287, "learning_rate": 1.7230273752012882e-05, "loss": 25.3346, "step": 2272 }, { "epoch": 5.492753623188406, "grad_norm": 7.335130214691162, "learning_rate": 1.706924315619968e-05, "loss": 24.2558, "step": 2274 }, { "epoch": 5.4975845410628015, "grad_norm": 6.820517539978027, "learning_rate": 1.6908212560386476e-05, "loss": 24.8161, "step": 2276 }, { "epoch": 5.5024154589371985, "grad_norm": 6.23611307144165, "learning_rate": 1.674718196457327e-05, "loss": 25.4897, "step": 2278 }, { "epoch": 5.507246376811594, "grad_norm": 6.273251056671143, "learning_rate": 1.6586151368760062e-05, "loss": 25.4466, "step": 2280 }, { "epoch": 5.512077294685991, "grad_norm": 6.126486301422119, "learning_rate": 1.6425120772946863e-05, "loss": 26.4718, "step": 2282 }, { "epoch": 5.516908212560386, "grad_norm": 6.196963787078857, "learning_rate": 1.6264090177133656e-05, "loss": 25.5031, "step": 2284 }, { "epoch": 5.521739130434782, "grad_norm": 6.553043842315674, "learning_rate": 1.610305958132045e-05, "loss": 26.3876, "step": 2286 }, { "epoch": 5.526570048309178, "grad_norm": 6.308940887451172, "learning_rate": 1.5942028985507246e-05, "loss": 25.3395, "step": 2288 }, { "epoch": 5.531400966183575, "grad_norm": 5.9868059158325195, "learning_rate": 1.5780998389694043e-05, "loss": 26.1367, "step": 2290 }, { "epoch": 5.536231884057971, "grad_norm": 5.966738224029541, "learning_rate": 1.561996779388084e-05, "loss": 24.9832, "step": 2292 }, { "epoch": 5.541062801932367, "grad_norm": 6.130259990692139, "learning_rate": 1.5458937198067633e-05, "loss": 
26.0377, "step": 2294 }, { "epoch": 5.545893719806763, "grad_norm": 6.351025104522705, "learning_rate": 1.529790660225443e-05, "loss": 25.387, "step": 2296 }, { "epoch": 5.550724637681159, "grad_norm": 7.592315673828125, "learning_rate": 1.5136876006441225e-05, "loss": 25.8822, "step": 2298 }, { "epoch": 5.555555555555555, "grad_norm": 7.366810321807861, "learning_rate": 1.497584541062802e-05, "loss": 25.716, "step": 2300 }, { "epoch": 5.5603864734299515, "grad_norm": 6.494503974914551, "learning_rate": 1.4814814814814815e-05, "loss": 24.903, "step": 2302 }, { "epoch": 5.565217391304348, "grad_norm": 6.354084491729736, "learning_rate": 1.4653784219001612e-05, "loss": 24.19, "step": 2304 }, { "epoch": 5.570048309178744, "grad_norm": 6.83246374130249, "learning_rate": 1.4492753623188407e-05, "loss": 25.3202, "step": 2306 }, { "epoch": 5.57487922705314, "grad_norm": 7.3366379737854, "learning_rate": 1.4331723027375202e-05, "loss": 26.5993, "step": 2308 }, { "epoch": 5.579710144927536, "grad_norm": 6.854272842407227, "learning_rate": 1.4170692431561997e-05, "loss": 27.5142, "step": 2310 }, { "epoch": 5.584541062801932, "grad_norm": 7.033668041229248, "learning_rate": 1.4009661835748794e-05, "loss": 24.9908, "step": 2312 }, { "epoch": 5.5893719806763285, "grad_norm": 5.725836277008057, "learning_rate": 1.3848631239935589e-05, "loss": 26.9088, "step": 2314 }, { "epoch": 5.594202898550725, "grad_norm": 6.002683162689209, "learning_rate": 1.3687600644122384e-05, "loss": 26.1845, "step": 2316 }, { "epoch": 5.599033816425121, "grad_norm": 6.32890510559082, "learning_rate": 1.3526570048309179e-05, "loss": 24.4862, "step": 2318 }, { "epoch": 5.603864734299517, "grad_norm": 6.316839694976807, "learning_rate": 1.3365539452495976e-05, "loss": 25.2277, "step": 2320 }, { "epoch": 5.608695652173913, "grad_norm": 6.241401672363281, "learning_rate": 1.320450885668277e-05, "loss": 26.0008, "step": 2322 }, { "epoch": 5.613526570048309, "grad_norm": 6.929868221282959, "learning_rate": 
1.3043478260869566e-05, "loss": 24.5377, "step": 2324 }, { "epoch": 5.618357487922705, "grad_norm": 6.343822956085205, "learning_rate": 1.288244766505636e-05, "loss": 24.4674, "step": 2326 }, { "epoch": 5.6231884057971016, "grad_norm": 7.933018684387207, "learning_rate": 1.2721417069243158e-05, "loss": 25.4539, "step": 2328 }, { "epoch": 5.628019323671498, "grad_norm": 6.561947345733643, "learning_rate": 1.2560386473429953e-05, "loss": 25.2955, "step": 2330 }, { "epoch": 5.632850241545894, "grad_norm": 7.06411075592041, "learning_rate": 1.2399355877616748e-05, "loss": 26.2145, "step": 2332 }, { "epoch": 5.63768115942029, "grad_norm": 8.267963409423828, "learning_rate": 1.2238325281803543e-05, "loss": 24.2516, "step": 2334 }, { "epoch": 5.642512077294686, "grad_norm": 7.202125072479248, "learning_rate": 1.2077294685990338e-05, "loss": 27.0705, "step": 2336 }, { "epoch": 5.647342995169082, "grad_norm": 6.419391632080078, "learning_rate": 1.1916264090177135e-05, "loss": 23.7174, "step": 2338 }, { "epoch": 5.6521739130434785, "grad_norm": 6.510631561279297, "learning_rate": 1.175523349436393e-05, "loss": 26.1881, "step": 2340 }, { "epoch": 5.657004830917875, "grad_norm": 7.408875465393066, "learning_rate": 1.1594202898550725e-05, "loss": 24.7877, "step": 2342 }, { "epoch": 5.661835748792271, "grad_norm": 6.503478050231934, "learning_rate": 1.143317230273752e-05, "loss": 24.4056, "step": 2344 }, { "epoch": 5.666666666666667, "grad_norm": 6.382200241088867, "learning_rate": 1.1272141706924317e-05, "loss": 25.4992, "step": 2346 }, { "epoch": 5.671497584541063, "grad_norm": 6.437609672546387, "learning_rate": 1.1111111111111112e-05, "loss": 26.6456, "step": 2348 }, { "epoch": 5.676328502415459, "grad_norm": 6.871528625488281, "learning_rate": 1.0950080515297907e-05, "loss": 25.0839, "step": 2350 }, { "epoch": 5.681159420289855, "grad_norm": 8.16054630279541, "learning_rate": 1.0789049919484702e-05, "loss": 26.1491, "step": 2352 }, { "epoch": 5.685990338164252, "grad_norm": 
6.024045467376709, "learning_rate": 1.0628019323671499e-05, "loss": 25.0689, "step": 2354 }, { "epoch": 5.690821256038648, "grad_norm": 7.976418972015381, "learning_rate": 1.0466988727858294e-05, "loss": 26.3663, "step": 2356 }, { "epoch": 5.695652173913043, "grad_norm": 5.949817657470703, "learning_rate": 1.0305958132045089e-05, "loss": 27.0973, "step": 2358 }, { "epoch": 5.70048309178744, "grad_norm": 6.103696823120117, "learning_rate": 1.0144927536231885e-05, "loss": 27.1756, "step": 2360 }, { "epoch": 5.705314009661835, "grad_norm": 6.458801746368408, "learning_rate": 9.98389694041868e-06, "loss": 25.2463, "step": 2362 }, { "epoch": 5.710144927536232, "grad_norm": 7.07081413269043, "learning_rate": 9.822866344605476e-06, "loss": 26.1127, "step": 2364 }, { "epoch": 5.714975845410628, "grad_norm": 8.160017967224121, "learning_rate": 9.66183574879227e-06, "loss": 26.3955, "step": 2366 }, { "epoch": 5.719806763285024, "grad_norm": 6.197200775146484, "learning_rate": 9.500805152979067e-06, "loss": 26.8253, "step": 2368 }, { "epoch": 5.72463768115942, "grad_norm": 7.202108860015869, "learning_rate": 9.339774557165862e-06, "loss": 26.4504, "step": 2370 }, { "epoch": 5.729468599033816, "grad_norm": 6.539680480957031, "learning_rate": 9.178743961352658e-06, "loss": 25.6419, "step": 2372 }, { "epoch": 5.734299516908212, "grad_norm": 7.3082756996154785, "learning_rate": 9.017713365539453e-06, "loss": 23.5667, "step": 2374 }, { "epoch": 5.739130434782608, "grad_norm": 6.585788726806641, "learning_rate": 8.85668276972625e-06, "loss": 27.0034, "step": 2376 }, { "epoch": 5.743961352657005, "grad_norm": 8.16417121887207, "learning_rate": 8.695652173913044e-06, "loss": 25.4119, "step": 2378 }, { "epoch": 5.748792270531401, "grad_norm": 6.153932571411133, "learning_rate": 8.53462157809984e-06, "loss": 26.3388, "step": 2380 }, { "epoch": 5.753623188405797, "grad_norm": 7.043217182159424, "learning_rate": 8.373590982286635e-06, "loss": 27.3709, "step": 2382 }, { "epoch": 
5.758454106280193, "grad_norm": 6.4633588790893555, "learning_rate": 8.212560386473431e-06, "loss": 28.7053, "step": 2384 }, { "epoch": 5.763285024154589, "grad_norm": 7.188209056854248, "learning_rate": 8.051529790660225e-06, "loss": 26.8206, "step": 2386 }, { "epoch": 5.768115942028985, "grad_norm": 6.451449394226074, "learning_rate": 7.890499194847021e-06, "loss": 24.2479, "step": 2388 }, { "epoch": 5.7729468599033815, "grad_norm": 6.818403720855713, "learning_rate": 7.729468599033817e-06, "loss": 25.5073, "step": 2390 }, { "epoch": 5.777777777777778, "grad_norm": 7.128567218780518, "learning_rate": 7.568438003220612e-06, "loss": 25.6976, "step": 2392 }, { "epoch": 5.782608695652174, "grad_norm": 7.445803165435791, "learning_rate": 7.4074074074074075e-06, "loss": 25.5917, "step": 2394 }, { "epoch": 5.78743961352657, "grad_norm": 6.30618953704834, "learning_rate": 7.246376811594203e-06, "loss": 24.4957, "step": 2396 }, { "epoch": 5.792270531400966, "grad_norm": 6.549522399902344, "learning_rate": 7.0853462157809985e-06, "loss": 24.7214, "step": 2398 }, { "epoch": 5.797101449275362, "grad_norm": 7.38835334777832, "learning_rate": 6.924315619967794e-06, "loss": 25.5254, "step": 2400 }, { "epoch": 5.8019323671497585, "grad_norm": 5.928407669067383, "learning_rate": 6.7632850241545894e-06, "loss": 26.4047, "step": 2402 }, { "epoch": 5.806763285024155, "grad_norm": 6.4094014167785645, "learning_rate": 6.602254428341385e-06, "loss": 27.6215, "step": 2404 }, { "epoch": 5.811594202898551, "grad_norm": 6.558480739593506, "learning_rate": 6.44122383252818e-06, "loss": 26.5965, "step": 2406 }, { "epoch": 5.816425120772947, "grad_norm": 6.696255207061768, "learning_rate": 6.280193236714976e-06, "loss": 26.2955, "step": 2408 }, { "epoch": 5.821256038647343, "grad_norm": 6.232416152954102, "learning_rate": 6.119162640901771e-06, "loss": 27.1097, "step": 2410 }, { "epoch": 5.826086956521739, "grad_norm": 6.8521199226379395, "learning_rate": 5.958132045088567e-06, "loss": 
22.8195, "step": 2412 }, { "epoch": 5.830917874396135, "grad_norm": 6.833296298980713, "learning_rate": 5.797101449275362e-06, "loss": 26.155, "step": 2414 }, { "epoch": 5.835748792270532, "grad_norm": 7.534513473510742, "learning_rate": 5.636070853462158e-06, "loss": 25.9633, "step": 2416 }, { "epoch": 5.840579710144928, "grad_norm": 7.544939041137695, "learning_rate": 5.475040257648953e-06, "loss": 26.5301, "step": 2418 }, { "epoch": 5.845410628019324, "grad_norm": 6.818538188934326, "learning_rate": 5.314009661835749e-06, "loss": 27.4522, "step": 2420 }, { "epoch": 5.85024154589372, "grad_norm": 6.0586395263671875, "learning_rate": 5.152979066022544e-06, "loss": 24.9713, "step": 2422 }, { "epoch": 5.855072463768116, "grad_norm": 6.871267318725586, "learning_rate": 4.99194847020934e-06, "loss": 24.1245, "step": 2424 }, { "epoch": 5.859903381642512, "grad_norm": 6.431079387664795, "learning_rate": 4.830917874396135e-06, "loss": 26.3168, "step": 2426 }, { "epoch": 5.8647342995169085, "grad_norm": 6.309189319610596, "learning_rate": 4.669887278582931e-06, "loss": 25.1819, "step": 2428 }, { "epoch": 5.869565217391305, "grad_norm": 7.3601250648498535, "learning_rate": 4.508856682769726e-06, "loss": 25.487, "step": 2430 }, { "epoch": 5.874396135265701, "grad_norm": 6.830559730529785, "learning_rate": 4.347826086956522e-06, "loss": 24.2465, "step": 2432 }, { "epoch": 5.879227053140097, "grad_norm": 6.231956481933594, "learning_rate": 4.186795491143317e-06, "loss": 25.4524, "step": 2434 }, { "epoch": 5.884057971014493, "grad_norm": 7.170751094818115, "learning_rate": 4.025764895330112e-06, "loss": 25.4575, "step": 2436 }, { "epoch": 5.888888888888889, "grad_norm": 6.459787845611572, "learning_rate": 3.864734299516908e-06, "loss": 25.8052, "step": 2438 }, { "epoch": 5.8937198067632846, "grad_norm": 7.013184070587158, "learning_rate": 3.7037037037037037e-06, "loss": 25.0231, "step": 2440 }, { "epoch": 5.898550724637682, "grad_norm": 6.488290786743164, "learning_rate": 
3.5426731078904992e-06, "loss": 26.5961, "step": 2442 }, { "epoch": 5.903381642512077, "grad_norm": 6.819639205932617, "learning_rate": 3.3816425120772947e-06, "loss": 24.8935, "step": 2444 }, { "epoch": 5.908212560386474, "grad_norm": 6.606305122375488, "learning_rate": 3.22061191626409e-06, "loss": 25.7707, "step": 2446 }, { "epoch": 5.913043478260869, "grad_norm": 6.314495086669922, "learning_rate": 3.0595813204508857e-06, "loss": 25.5029, "step": 2448 }, { "epoch": 5.917874396135265, "grad_norm": 6.191902160644531, "learning_rate": 2.898550724637681e-06, "loss": 24.0935, "step": 2450 }, { "epoch": 5.9227053140096615, "grad_norm": 7.267618179321289, "learning_rate": 2.7375201288244767e-06, "loss": 26.7584, "step": 2452 }, { "epoch": 5.927536231884058, "grad_norm": 6.361123561859131, "learning_rate": 2.576489533011272e-06, "loss": 26.5634, "step": 2454 }, { "epoch": 5.932367149758454, "grad_norm": 6.8421173095703125, "learning_rate": 2.4154589371980677e-06, "loss": 25.2036, "step": 2456 }, { "epoch": 5.93719806763285, "grad_norm": 6.87398099899292, "learning_rate": 2.254428341384863e-06, "loss": 26.766, "step": 2458 }, { "epoch": 5.942028985507246, "grad_norm": 10.025320053100586, "learning_rate": 2.0933977455716586e-06, "loss": 24.7975, "step": 2460 }, { "epoch": 5.946859903381642, "grad_norm": 6.717752933502197, "learning_rate": 1.932367149758454e-06, "loss": 26.9315, "step": 2462 }, { "epoch": 5.951690821256038, "grad_norm": 6.499180793762207, "learning_rate": 1.7713365539452496e-06, "loss": 25.7196, "step": 2464 }, { "epoch": 5.956521739130435, "grad_norm": 6.772797107696533, "learning_rate": 1.610305958132045e-06, "loss": 23.9813, "step": 2466 }, { "epoch": 5.961352657004831, "grad_norm": 6.387327671051025, "learning_rate": 1.4492753623188406e-06, "loss": 24.631, "step": 2468 }, { "epoch": 5.966183574879227, "grad_norm": 6.289485931396484, "learning_rate": 1.288244766505636e-06, "loss": 26.311, "step": 2470 }, { "epoch": 5.971014492753623, "grad_norm": 
6.260473251342773, "learning_rate": 1.1272141706924316e-06, "loss": 25.3955, "step": 2472 }, { "epoch": 5.975845410628019, "grad_norm": 6.831587791442871, "learning_rate": 9.66183574879227e-07, "loss": 26.9048, "step": 2474 }, { "epoch": 5.980676328502415, "grad_norm": 7.09013032913208, "learning_rate": 8.051529790660226e-07, "loss": 26.662, "step": 2476 }, { "epoch": 5.9855072463768115, "grad_norm": 6.909030914306641, "learning_rate": 6.44122383252818e-07, "loss": 24.5214, "step": 2478 }, { "epoch": 5.990338164251208, "grad_norm": 6.548914432525635, "learning_rate": 4.830917874396135e-07, "loss": 26.6767, "step": 2480 }, { "epoch": 5.995169082125604, "grad_norm": 6.59926176071167, "learning_rate": 3.22061191626409e-07, "loss": 27.5884, "step": 2482 }, { "epoch": 6.0, "grad_norm": 6.204819679260254, "learning_rate": 1.610305958132045e-07, "loss": 25.388, "step": 2484 }, { "epoch": 6.0, "step": 2484, "total_flos": 629711046062928.0, "train_loss": 27.644595153857736, "train_runtime": 6156.0959, "train_samples_per_second": 6.456, "train_steps_per_second": 0.404 } ], "logging_steps": 2, "max_steps": 2484, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 629711046062928.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }