{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9986731534719153, "eval_steps": 142, "global_step": 1695, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.349672317504883, "learning_rate": 1.0000000000000002e-06, "loss": 3.4697, "step": 1 }, { "epoch": 0.0, "eval_loss": 3.547457695007324, "eval_runtime": 14.4843, "eval_samples_per_second": 32.863, "eval_steps_per_second": 8.216, "step": 1 }, { "epoch": 0.0, "grad_norm": 5.466770648956299, "learning_rate": 2.0000000000000003e-06, "loss": 3.4361, "step": 2 }, { "epoch": 0.01, "grad_norm": 5.768375873565674, "learning_rate": 3e-06, "loss": 3.5871, "step": 3 }, { "epoch": 0.01, "grad_norm": 5.6878485679626465, "learning_rate": 4.000000000000001e-06, "loss": 3.4894, "step": 4 }, { "epoch": 0.01, "grad_norm": 5.205628871917725, "learning_rate": 5e-06, "loss": 3.501, "step": 5 }, { "epoch": 0.01, "grad_norm": 5.880322456359863, "learning_rate": 6e-06, "loss": 3.5771, "step": 6 }, { "epoch": 0.01, "grad_norm": 5.782011032104492, "learning_rate": 7e-06, "loss": 3.5119, "step": 7 }, { "epoch": 0.01, "grad_norm": 5.285853385925293, "learning_rate": 8.000000000000001e-06, "loss": 3.4089, "step": 8 }, { "epoch": 0.02, "grad_norm": 5.527816295623779, "learning_rate": 9e-06, "loss": 3.4341, "step": 9 }, { "epoch": 0.02, "grad_norm": 5.505781650543213, "learning_rate": 1e-05, "loss": 3.4175, "step": 10 }, { "epoch": 0.02, "grad_norm": 5.26746940612793, "learning_rate": 9.999991309598975e-06, "loss": 3.3163, "step": 11 }, { "epoch": 0.02, "grad_norm": 5.086071968078613, "learning_rate": 9.999965238426104e-06, "loss": 3.274, "step": 12 }, { "epoch": 0.02, "grad_norm": 5.271662712097168, "learning_rate": 9.999921786572015e-06, "loss": 3.2964, "step": 13 }, { "epoch": 0.02, "grad_norm": 4.7532830238342285, "learning_rate": 9.999860954187756e-06, "loss": 3.1292, "step": 14 }, { "epoch": 0.03, "grad_norm": 4.79344367980957, "learning_rate": 9.99978274148479e-06, "loss": 3.0481, "step": 15 }, { "epoch": 0.03, "grad_norm": 4.707954406738281, "learning_rate": 9.999687148734996e-06, "loss": 2.9757, "step": 16 }, { "epoch": 0.03, "grad_norm": 4.709782600402832, "learning_rate": 9.999574176270667e-06, "loss": 2.8673, "step": 17 }, { "epoch": 0.03, "grad_norm": 4.751430988311768, "learning_rate": 9.999443824484519e-06, "loss": 2.8341, "step": 18 }, { "epoch": 0.03, "grad_norm": 4.603825092315674, "learning_rate": 9.999296093829672e-06, "loss": 2.6421, "step": 19 }, { "epoch": 0.04, "grad_norm": 4.514200210571289, "learning_rate": 9.999130984819662e-06, "loss": 2.5699, "step": 20 }, { "epoch": 0.04, "grad_norm": 4.427622318267822, "learning_rate": 9.998948498028435e-06, "loss": 2.434, "step": 21 }, { "epoch": 0.04, "grad_norm": 4.507719993591309, "learning_rate": 9.998748634090344e-06, "loss": 2.3301, "step": 22 }, { "epoch": 0.04, "grad_norm": 4.355949878692627, "learning_rate": 9.998531393700149e-06, "loss": 2.0848, "step": 23 }, { "epoch": 0.04, "grad_norm": 4.219519138336182, "learning_rate": 9.99829677761301e-06, "loss": 1.9115, "step": 24 }, { "epoch": 0.04, "grad_norm": 4.065847873687744, "learning_rate": 9.998044786644492e-06, "loss": 1.7682, "step": 25 }, { "epoch": 0.05, "grad_norm": 3.7316126823425293, "learning_rate": 9.997775421670558e-06, "loss": 1.6426, "step": 26 }, { "epoch": 0.05, "grad_norm": 3.5228142738342285, "learning_rate": 9.997488683627558e-06, "loss": 1.5855, "step": 27 }, { "epoch": 0.05, "grad_norm": 3.2372498512268066, "learning_rate": 9.997184573512245e-06, "loss": 1.4059, "step": 28 }, { "epoch": 0.05, "grad_norm": 3.072031259536743, "learning_rate": 9.996863092381753e-06, "loss": 1.2633, "step": 29 }, { "epoch": 0.05, "grad_norm": 2.941805124282837, "learning_rate": 9.9965242413536e-06, "loss": 1.1709, "step": 30 }, { "epoch": 0.05, "grad_norm": 2.8302178382873535, "learning_rate": 9.99616802160569e-06, "loss": 1.0615, "step": 31 }, { "epoch": 0.06, "grad_norm": 2.7408287525177, "learning_rate": 9.995794434376297e-06, "loss": 1.0031, "step": 32 }, { "epoch": 0.06, "grad_norm": 2.6635422706604004, "learning_rate": 9.995403480964072e-06, "loss": 0.9273, "step": 33 }, { "epoch": 0.06, "grad_norm": 2.538907766342163, "learning_rate": 9.994995162728029e-06, "loss": 0.8141, "step": 34 }, { "epoch": 0.06, "grad_norm": 2.457651138305664, "learning_rate": 9.994569481087552e-06, "loss": 0.7208, "step": 35 }, { "epoch": 0.06, "grad_norm": 2.383510112762451, "learning_rate": 9.994126437522376e-06, "loss": 0.6763, "step": 36 }, { "epoch": 0.07, "grad_norm": 2.170029401779175, "learning_rate": 9.99366603357259e-06, "loss": 0.6163, "step": 37 }, { "epoch": 0.07, "grad_norm": 2.0166823863983154, "learning_rate": 9.993188270838636e-06, "loss": 0.5146, "step": 38 }, { "epoch": 0.07, "grad_norm": 1.9549278020858765, "learning_rate": 9.992693150981293e-06, "loss": 0.4851, "step": 39 }, { "epoch": 0.07, "grad_norm": 1.7454789876937866, "learning_rate": 9.992180675721671e-06, "loss": 0.4015, "step": 40 }, { "epoch": 0.07, "grad_norm": 1.5932520627975464, "learning_rate": 9.991650846841226e-06, "loss": 0.3704, "step": 41 }, { "epoch": 0.07, "grad_norm": 1.4394928216934204, "learning_rate": 9.991103666181721e-06, "loss": 0.3194, "step": 42 }, { "epoch": 0.08, "grad_norm": 1.2608178853988647, "learning_rate": 9.990539135645246e-06, "loss": 0.2621, "step": 43 }, { "epoch": 0.08, "grad_norm": 1.0825426578521729, "learning_rate": 9.989957257194199e-06, "loss": 0.2489, "step": 44 }, { "epoch": 0.08, "grad_norm": 1.0383331775665283, "learning_rate": 9.989358032851283e-06, "loss": 0.249, "step": 45 }, { "epoch": 0.08, "grad_norm": 0.7754441499710083, "learning_rate": 9.9887414646995e-06, "loss": 0.162, "step": 46 }, { "epoch": 0.08, "grad_norm": 0.7348763942718506, "learning_rate": 9.988107554882138e-06, "loss": 0.1713, "step": 47 }, { "epoch": 0.08, "grad_norm": 0.5076927542686462, "learning_rate": 9.987456305602769e-06, "loss": 0.1438, "step": 48 }, { "epoch": 0.09, "grad_norm": 0.5017581582069397, "learning_rate": 9.986787719125241e-06, "loss": 0.1386, "step": 49 }, { "epoch": 0.09, "grad_norm": 0.42143169045448303, "learning_rate": 9.986101797773667e-06, "loss": 0.138, "step": 50 }, { "epoch": 0.09, "grad_norm": 0.2827802896499634, "learning_rate": 9.985398543932421e-06, "loss": 0.1165, "step": 51 }, { "epoch": 0.09, "grad_norm": 0.33519247174263, "learning_rate": 9.984677960046123e-06, "loss": 0.119, "step": 52 }, { "epoch": 0.09, "grad_norm": 0.2341219186782837, "learning_rate": 9.983940048619641e-06, "loss": 0.0748, "step": 53 }, { "epoch": 0.1, "grad_norm": 0.20707747340202332, "learning_rate": 9.983184812218071e-06, "loss": 0.0949, "step": 54 }, { "epoch": 0.1, "grad_norm": 0.3519020974636078, "learning_rate": 9.98241225346674e-06, "loss": 0.0964, "step": 55 }, { "epoch": 0.1, "grad_norm": 0.2268177717924118, "learning_rate": 9.981622375051183e-06, "loss": 0.0771, "step": 56 }, { "epoch": 0.1, "grad_norm": 0.16427360475063324, "learning_rate": 9.980815179717144e-06, "loss": 0.08, "step": 57 }, { "epoch": 0.1, "grad_norm": 0.3412397503852844, "learning_rate": 9.979990670270565e-06, "loss": 0.085, "step": 58 }, { "epoch": 0.1, "grad_norm": 0.13405166566371918, "learning_rate": 9.979148849577574e-06, "loss": 0.0852, "step": 59 }, { "epoch": 0.11, "grad_norm": 0.23308596014976501, "learning_rate": 9.978289720564471e-06, "loss": 0.0895, "step": 60 }, { "epoch": 0.11, "grad_norm": 0.20687691867351532, "learning_rate": 9.97741328621773e-06, "loss": 0.0875, "step": 61 }, { "epoch": 0.11, "grad_norm": 0.14851821959018707, "learning_rate": 9.976519549583974e-06, "loss": 0.0898, "step": 62 }, { "epoch": 0.11, "grad_norm": 0.31722521781921387, "learning_rate": 9.975608513769977e-06, "loss": 0.0902, "step": 63 }, { "epoch": 0.11, "grad_norm": 0.13900260627269745, "learning_rate": 9.974680181942645e-06, "loss": 0.0987, "step": 64 }, { "epoch": 0.11, "grad_norm": 0.21426311135292053, "learning_rate": 9.97373455732901e-06, "loss": 0.0765, "step": 65 }, { "epoch": 0.12, "grad_norm": 0.18942435085773468, "learning_rate": 9.972771643216213e-06, "loss": 0.092, "step": 66 }, { "epoch": 0.12, "grad_norm": 0.15527579188346863, "learning_rate": 9.971791442951498e-06, "loss": 0.0667, "step": 67 }, { "epoch": 0.12, "grad_norm": 0.2748473286628723, "learning_rate": 9.970793959942197e-06, "loss": 0.0905, "step": 68 }, { "epoch": 0.12, "grad_norm": 0.2710763216018677, "learning_rate": 9.969779197655726e-06, "loss": 0.0767, "step": 69 }, { "epoch": 0.12, "grad_norm": 0.19998699426651, "learning_rate": 9.968747159619556e-06, "loss": 0.0836, "step": 70 }, { "epoch": 0.13, "grad_norm": 0.5467928051948547, "learning_rate": 9.96769784942122e-06, "loss": 0.1069, "step": 71 }, { "epoch": 0.13, "grad_norm": 0.1530974954366684, "learning_rate": 9.966631270708288e-06, "loss": 0.0867, "step": 72 }, { "epoch": 0.13, "grad_norm": 0.2446594089269638, "learning_rate": 9.965547427188358e-06, "loss": 0.0771, "step": 73 }, { "epoch": 0.13, "grad_norm": 0.14469081163406372, "learning_rate": 9.964446322629044e-06, "loss": 0.0892, "step": 74 }, { "epoch": 0.13, "grad_norm": 0.2065022885799408, "learning_rate": 9.963327960857962e-06, "loss": 0.0729, "step": 75 }, { "epoch": 0.13, "grad_norm": 0.27265021204948425, "learning_rate": 9.962192345762717e-06, "loss": 0.0684, "step": 76 }, { "epoch": 0.14, "grad_norm": 0.3528543710708618, "learning_rate": 9.961039481290888e-06, "loss": 0.0656, "step": 77 }, { "epoch": 0.14, "grad_norm": 0.14887100458145142, "learning_rate": 9.959869371450022e-06, "loss": 0.0794, "step": 78 }, { "epoch": 0.14, "grad_norm": 0.24467024207115173, "learning_rate": 9.958682020307602e-06, "loss": 0.0749, "step": 79 }, { "epoch": 0.14, "grad_norm": 0.4109407961368561, "learning_rate": 9.957477431991053e-06, "loss": 0.1062, "step": 80 }, { "epoch": 0.14, "grad_norm": 0.20120254158973694, "learning_rate": 9.95625561068772e-06, "loss": 0.101, "step": 81 }, { "epoch": 0.15, "grad_norm": 0.30039364099502563, "learning_rate": 9.955016560644847e-06, "loss": 0.1015, "step": 82 }, { "epoch": 0.15, "grad_norm": 0.3657380938529968, "learning_rate": 9.953760286169571e-06, "loss": 0.1124, "step": 83 }, { "epoch": 0.15, "grad_norm": 0.3613692820072174, "learning_rate": 9.952486791628905e-06, "loss": 0.0836, "step": 84 }, { "epoch": 0.15, "grad_norm": 0.3250538110733032, "learning_rate": 9.95119608144972e-06, "loss": 0.0753, "step": 85 }, { "epoch": 0.15, "grad_norm": 0.21736566722393036, "learning_rate": 9.94988816011873e-06, "loss": 0.075, "step": 86 }, { "epoch": 0.15, "grad_norm": 0.8842391967773438, "learning_rate": 9.948563032182482e-06, "loss": 0.1067, "step": 87 }, { "epoch": 0.16, "grad_norm": 0.2460835725069046, "learning_rate": 9.947220702247329e-06, "loss": 0.0832, "step": 88 }, { "epoch": 0.16, "grad_norm": 0.5178576707839966, "learning_rate": 9.94586117497943e-06, "loss": 0.1007, "step": 89 }, { "epoch": 0.16, "grad_norm": 0.19302992522716522, "learning_rate": 9.944484455104716e-06, "loss": 0.0705, "step": 90 }, { "epoch": 0.16, "grad_norm": 0.5060548186302185, "learning_rate": 9.943090547408888e-06, "loss": 0.1216, "step": 91 }, { "epoch": 0.16, "grad_norm": 0.3466648757457733, "learning_rate": 9.941679456737395e-06, "loss": 0.0938, "step": 92 }, { "epoch": 0.16, "grad_norm": 0.22704587876796722, "learning_rate": 9.940251187995412e-06, "loss": 0.0803, "step": 93 }, { "epoch": 0.17, "grad_norm": 0.25966060161590576, "learning_rate": 9.938805746147827e-06, "loss": 0.0966, "step": 94 }, { "epoch": 0.17, "grad_norm": 0.20877382159233093, "learning_rate": 9.937343136219234e-06, "loss": 0.0667, "step": 95 }, { "epoch": 0.17, "grad_norm": 0.1972026228904724, "learning_rate": 9.935863363293896e-06, "loss": 0.0642, "step": 96 }, { "epoch": 0.17, "grad_norm": 0.21666616201400757, "learning_rate": 9.934366432515741e-06, "loss": 0.0943, "step": 97 }, { "epoch": 0.17, "grad_norm": 0.3361506760120392, "learning_rate": 9.932852349088342e-06, "loss": 0.0797, "step": 98 }, { "epoch": 0.18, "grad_norm": 0.3094469904899597, "learning_rate": 9.931321118274897e-06, "loss": 0.0762, "step": 99 }, { "epoch": 0.18, "grad_norm": 0.33961156010627747, "learning_rate": 9.929772745398207e-06, "loss": 0.0744, "step": 100 }, { "epoch": 0.18, "grad_norm": 0.3448229730129242, "learning_rate": 9.928207235840664e-06, "loss": 0.0562, "step": 101 }, { "epoch": 0.18, "grad_norm": 0.2657065689563751, "learning_rate": 9.926624595044235e-06, "loss": 0.0922, "step": 102 }, { "epoch": 0.18, "grad_norm": 0.2084828019142151, "learning_rate": 9.925024828510429e-06, "loss": 0.0616, "step": 103 }, { "epoch": 0.18, "grad_norm": 0.342433899641037, "learning_rate": 9.92340794180029e-06, "loss": 0.0827, "step": 104 }, { "epoch": 0.19, "grad_norm": 0.21574071049690247, "learning_rate": 9.921773940534382e-06, "loss": 0.0593, "step": 105 }, { "epoch": 0.19, "grad_norm": 0.15846671164035797, "learning_rate": 9.920122830392748e-06, "loss": 0.0732, "step": 106 }, { "epoch": 0.19, "grad_norm": 0.2687283456325531, "learning_rate": 9.91845461711492e-06, "loss": 0.0699, "step": 107 }, { "epoch": 0.19, "grad_norm": 0.18184144794940948, "learning_rate": 9.916769306499866e-06, "loss": 0.0632, "step": 108 }, { "epoch": 0.19, "grad_norm": 0.21744874119758606, "learning_rate": 9.915066904406e-06, "loss": 0.0805, "step": 109 }, { "epoch": 0.19, "grad_norm": 0.3575034737586975, "learning_rate": 9.913347416751148e-06, "loss": 0.0819, "step": 110 }, { "epoch": 0.2, "grad_norm": 0.23368307948112488, "learning_rate": 9.91161084951252e-06, "loss": 0.0576, "step": 111 }, { "epoch": 0.2, "grad_norm": 0.4599943161010742, "learning_rate": 9.909857208726705e-06, "loss": 0.0867, "step": 112 }, { "epoch": 0.2, "grad_norm": 0.26656806468963623, "learning_rate": 9.908086500489638e-06, "loss": 0.0586, "step": 113 }, { "epoch": 0.2, "grad_norm": 0.5311251282691956, "learning_rate": 9.906298730956585e-06, "loss": 0.102, "step": 114 }, { "epoch": 0.2, "grad_norm": 0.3186182677745819, "learning_rate": 9.904493906342124e-06, "loss": 0.0743, "step": 115 }, { "epoch": 0.21, "grad_norm": 0.20787174999713898, "learning_rate": 9.902672032920106e-06, "loss": 0.0536, "step": 116 }, { "epoch": 0.21, "grad_norm": 0.18734194338321686, "learning_rate": 9.900833117023665e-06, "loss": 0.06, "step": 117 }, { "epoch": 0.21, "grad_norm": 0.46386289596557617, "learning_rate": 9.898977165045161e-06, "loss": 0.0861, "step": 118 }, { "epoch": 0.21, "grad_norm": 0.2560313940048218, "learning_rate": 9.897104183436184e-06, "loss": 0.0574, "step": 119 }, { "epoch": 0.21, "grad_norm": 0.22062335908412933, "learning_rate": 9.895214178707516e-06, "loss": 0.0631, "step": 120 }, { "epoch": 0.21, "grad_norm": 0.2438971847295761, "learning_rate": 9.89330715742912e-06, "loss": 0.0906, "step": 121 }, { "epoch": 0.22, "grad_norm": 0.17582635581493378, "learning_rate": 9.891383126230105e-06, "loss": 0.0507, "step": 122 }, { "epoch": 0.22, "grad_norm": 0.2775309979915619, "learning_rate": 9.889442091798712e-06, "loss": 0.0741, "step": 123 }, { "epoch": 0.22, "grad_norm": 0.18693320453166962, "learning_rate": 9.887484060882292e-06, "loss": 0.0624, "step": 124 }, { "epoch": 0.22, "grad_norm": 0.5480987429618835, "learning_rate": 9.885509040287267e-06, "loss": 0.1104, "step": 125 }, { "epoch": 0.22, "grad_norm": 0.25776028633117676, "learning_rate": 9.883517036879133e-06, "loss": 0.0876, "step": 126 }, { "epoch": 0.22, "grad_norm": 0.6148080825805664, "learning_rate": 9.881508057582411e-06, "loss": 0.0678, "step": 127 }, { "epoch": 0.23, "grad_norm": 0.8079890608787537, "learning_rate": 9.879482109380634e-06, "loss": 0.0801, "step": 128 }, { "epoch": 0.23, "grad_norm": 0.22970955073833466, "learning_rate": 9.877439199316324e-06, "loss": 0.0662, "step": 129 }, { "epoch": 0.23, "grad_norm": 0.4607698917388916, "learning_rate": 9.875379334490962e-06, "loss": 0.0863, "step": 130 }, { "epoch": 0.23, "grad_norm": 0.4692334532737732, "learning_rate": 9.873302522064972e-06, "loss": 0.0968, "step": 131 }, { "epoch": 0.23, "grad_norm": 0.8688197135925293, "learning_rate": 9.871208769257686e-06, "loss": 0.0963, "step": 132 }, { "epoch": 0.24, "grad_norm": 0.2747116982936859, "learning_rate": 9.869098083347323e-06, "loss": 0.0801, "step": 133 }, { "epoch": 0.24, "grad_norm": 0.5444130897521973, "learning_rate": 9.866970471670968e-06, "loss": 0.0965, "step": 134 }, { "epoch": 0.24, "grad_norm": 0.303763210773468, "learning_rate": 9.864825941624538e-06, "loss": 0.0727, "step": 135 }, { "epoch": 0.24, "grad_norm": 0.16060137748718262, "learning_rate": 9.862664500662763e-06, "loss": 0.0468, "step": 136 }, { "epoch": 0.24, "grad_norm": 0.22011107206344604, "learning_rate": 9.860486156299164e-06, "loss": 0.0832, "step": 137 }, { "epoch": 0.24, "grad_norm": 0.38508448004722595, "learning_rate": 9.85829091610601e-06, "loss": 0.0947, "step": 138 }, { "epoch": 0.25, "grad_norm": 0.2385941594839096, "learning_rate": 9.856078787714309e-06, "loss": 0.0676, "step": 139 }, { "epoch": 0.25, "grad_norm": 0.15575379133224487, "learning_rate": 9.853849778813777e-06, "loss": 0.0833, "step": 140 }, { "epoch": 0.25, "grad_norm": 0.23725704848766327, "learning_rate": 9.851603897152804e-06, "loss": 0.0807, "step": 141 }, { "epoch": 0.25, "grad_norm": 0.20930880308151245, "learning_rate": 9.849341150538434e-06, "loss": 0.0881, "step": 142 }, { "epoch": 0.25, "eval_loss": 0.08187390118837357, "eval_runtime": 14.7383, "eval_samples_per_second": 32.297, "eval_steps_per_second": 8.074, "step": 142 }, { "epoch": 0.25, "grad_norm": 0.2966826260089874, "learning_rate": 9.84706154683634e-06, "loss": 0.0567, "step": 143 }, { "epoch": 0.25, "grad_norm": 0.3383941948413849, "learning_rate": 9.844765093970787e-06, "loss": 0.0597, "step": 144 }, { "epoch": 0.26, "grad_norm": 0.20434536039829254, "learning_rate": 9.842451799924616e-06, "loss": 0.0873, "step": 145 }, { "epoch": 0.26, "grad_norm": 0.27841946482658386, "learning_rate": 9.840121672739208e-06, "loss": 0.0746, "step": 146 }, { "epoch": 0.26, "grad_norm": 0.18767426908016205, "learning_rate": 9.837774720514456e-06, "loss": 0.0928, "step": 147 }, { "epoch": 0.26, "grad_norm": 0.4455524981021881, "learning_rate": 9.835410951408748e-06, "loss": 0.0692, "step": 148 }, { "epoch": 0.26, "grad_norm": 0.24763479828834534, "learning_rate": 9.83303037363892e-06, "loss": 0.0643, "step": 149 }, { "epoch": 0.27, "grad_norm": 0.4498063325881958, "learning_rate": 9.830632995480243e-06, "loss": 0.0736, "step": 150 }, { "epoch": 0.27, "grad_norm": 0.2298639714717865, "learning_rate": 9.828218825266389e-06, "loss": 0.0678, "step": 151 }, { "epoch": 0.27, "grad_norm": 0.46498408913612366, "learning_rate": 9.8257878713894e-06, "loss": 0.0775, "step": 152 }, { "epoch": 0.27, "grad_norm": 0.3503478169441223, "learning_rate": 9.823340142299662e-06, "loss": 0.0749, "step": 153 }, { "epoch": 0.27, "grad_norm": 0.3784450590610504, "learning_rate": 9.820875646505874e-06, "loss": 0.0806, "step": 154 }, { "epoch": 0.27, "grad_norm": 0.2675660252571106, "learning_rate": 9.818394392575018e-06, "loss": 0.1054, "step": 155 }, { "epoch": 0.28, "grad_norm": 0.18610945343971252, "learning_rate": 9.815896389132333e-06, "loss": 0.0793, "step": 156 }, { "epoch": 0.28, "grad_norm": 0.25484928488731384, "learning_rate": 9.813381644861276e-06, "loss": 0.1004, "step": 157 }, { "epoch": 0.28, "grad_norm": 0.2842435836791992, "learning_rate": 9.810850168503506e-06, "loss": 0.0413, "step": 158 }, { "epoch": 0.28, "grad_norm": 0.2782611548900604, "learning_rate": 9.808301968858838e-06, "loss": 0.1083, "step": 159 }, { "epoch": 0.28, "grad_norm": 0.1917373687028885, "learning_rate": 9.805737054785223e-06, "loss": 0.0727, "step": 160 }, { "epoch": 0.28, "grad_norm": 0.2308584451675415, "learning_rate": 9.803155435198713e-06, "loss": 0.0629, "step": 161 }, { "epoch": 0.29, "grad_norm": 0.20095452666282654, "learning_rate": 9.800557119073433e-06, "loss": 0.0857, "step": 162 }, { "epoch": 0.29, "grad_norm": 0.28956976532936096, "learning_rate": 9.797942115441546e-06, "loss": 0.053, "step": 163 }, { "epoch": 0.29, "grad_norm": 0.24081195890903473, "learning_rate": 9.795310433393227e-06, "loss": 0.0611, "step": 164 }, { "epoch": 0.29, "grad_norm": 0.4568108022212982, "learning_rate": 9.792662082076618e-06, "loss": 0.1011, "step": 165 }, { "epoch": 0.29, "grad_norm": 0.16725283861160278, "learning_rate": 9.789997070697821e-06, "loss": 0.0525, "step": 166 }, { "epoch": 0.3, "grad_norm": 0.27492183446884155, "learning_rate": 9.787315408520839e-06, "loss": 0.0581, "step": 167 }, { "epoch": 0.3, "grad_norm": 0.21994265913963318, "learning_rate": 9.78461710486756e-06, "loss": 0.068, "step": 168 }, { "epoch": 0.3, "grad_norm": 0.2217988818883896, "learning_rate": 9.78190216911772e-06, "loss": 0.0663, "step": 169 }, { "epoch": 0.3, "grad_norm": 0.43498608469963074, "learning_rate": 9.779170610708872e-06, "loss": 0.1003, "step": 170 }, { "epoch": 0.3, "grad_norm": 0.5494738817214966, "learning_rate": 9.776422439136351e-06, "loss": 0.0901, "step": 171 }, { "epoch": 0.3, "grad_norm": 0.30544596910476685, "learning_rate": 9.773657663953244e-06, "loss": 0.1049, "step": 172 }, { "epoch": 0.31, "grad_norm": 0.5173572301864624, "learning_rate": 9.77087629477035e-06, "loss": 0.084, "step": 173 }, { "epoch": 0.31, "grad_norm": 0.1275845617055893, "learning_rate": 9.768078341256156e-06, "loss": 0.0668, "step": 174 }, { "epoch": 0.31, "grad_norm": 0.28769540786743164, "learning_rate": 9.765263813136796e-06, "loss": 0.0743, "step": 175 }, { "epoch": 0.31, "grad_norm": 0.7122567296028137, "learning_rate": 9.762432720196024e-06, "loss": 0.129, "step": 176 }, { "epoch": 0.31, "grad_norm": 0.28987741470336914, "learning_rate": 9.759585072275171e-06, "loss": 0.077, "step": 177 }, { "epoch": 0.31, "grad_norm": 0.24338261783123016, "learning_rate": 9.756720879273117e-06, "loss": 0.0763, "step": 178 }, { "epoch": 0.32, "grad_norm": 0.3709527254104614, "learning_rate": 9.753840151146259e-06, "loss": 0.0639, "step": 179 }, { "epoch": 0.32, "grad_norm": 0.1488633006811142, "learning_rate": 9.750942897908468e-06, "loss": 0.0825, "step": 180 }, { "epoch": 0.32, "grad_norm": 0.13455910980701447, "learning_rate": 9.748029129631062e-06, "loss": 0.0594, "step": 181 }, { "epoch": 0.32, "grad_norm": 0.2483726590871811, "learning_rate": 9.745098856442769e-06, "loss": 0.0621, "step": 182 }, { "epoch": 0.32, "grad_norm": 0.34576350450515747, "learning_rate": 9.742152088529683e-06, "loss": 0.083, "step": 183 }, { "epoch": 0.33, "grad_norm": 0.28745996952056885, "learning_rate": 9.739188836135247e-06, "loss": 0.0517, "step": 184 }, { "epoch": 0.33, "grad_norm": 0.18788190186023712, "learning_rate": 9.736209109560201e-06, "loss": 0.0831, "step": 185 }, { "epoch": 0.33, "grad_norm": 0.1737545132637024, "learning_rate": 9.733212919162551e-06, "loss": 0.0597, "step": 186 }, { "epoch": 0.33, "grad_norm": 0.31250202655792236, "learning_rate": 9.730200275357535e-06, "loss": 0.0591, "step": 187 }, { "epoch": 0.33, "grad_norm": 0.20151746273040771, "learning_rate": 9.727171188617588e-06, "loss": 0.0687, "step": 188 }, { "epoch": 0.33, "grad_norm": 0.19415058195590973, "learning_rate": 9.7241256694723e-06, "loss": 0.0463, "step": 189 }, { "epoch": 0.34, "grad_norm": 0.1791050285100937, "learning_rate": 9.721063728508384e-06, "loss": 0.0727, "step": 190 }, { "epoch": 0.34, "grad_norm": 0.5859791040420532, "learning_rate": 9.71798537636964e-06, "loss": 0.0996, "step": 191 }, { "epoch": 0.34, "grad_norm": 0.4107828438282013, "learning_rate": 9.714890623756912e-06, "loss": 0.0675, "step": 192 }, { "epoch": 0.34, "grad_norm": 0.339235782623291, "learning_rate": 9.711779481428057e-06, "loss": 0.0916, "step": 193 }, { "epoch": 0.34, "grad_norm": 0.3153787851333618, "learning_rate": 9.708651960197904e-06, "loss": 0.0838, "step": 194 }, { "epoch": 0.34, "grad_norm": 0.38205188512802124, "learning_rate": 9.705508070938219e-06, "loss": 0.0773, "step": 195 }, { "epoch": 0.35, "grad_norm": 0.48745977878570557, "learning_rate": 9.702347824577667e-06, "loss": 0.0571, "step": 196 }, { "epoch": 0.35, "grad_norm": 0.3050192594528198, "learning_rate": 9.699171232101769e-06, "loss": 0.0473, "step": 197 }, { "epoch": 0.35, "grad_norm": 0.22496673464775085, "learning_rate": 9.695978304552871e-06, "loss": 0.0589, "step": 198 }, { "epoch": 0.35, "grad_norm": 0.3087387979030609, "learning_rate": 9.6927690530301e-06, "loss": 0.0814, "step": 199 }, { "epoch": 0.35, "grad_norm": 0.1646536886692047, "learning_rate": 9.689543488689332e-06, "loss": 0.0694, "step": 200 }, { "epoch": 0.36, "grad_norm": 0.3601110279560089, "learning_rate": 9.686301622743144e-06, "loss": 0.0919, "step": 201 }, { "epoch": 0.36, "grad_norm": 0.20077168941497803, "learning_rate": 9.683043466460783e-06, "loss": 0.0754, "step": 202 }, { "epoch": 0.36, "grad_norm": 0.2863204777240753, "learning_rate": 9.67976903116812e-06, "loss": 0.0814, "step": 203 }, { "epoch": 0.36, "grad_norm": 0.30959010124206543, "learning_rate": 9.676478328247623e-06, "loss": 0.1012, "step": 204 }, { "epoch": 0.36, "grad_norm": 0.2969251871109009, "learning_rate": 9.673171369138297e-06, "loss": 0.0983, "step": 205 }, { "epoch": 0.36, "grad_norm": 0.19835108518600464, "learning_rate": 9.669848165335668e-06, "loss": 0.0814, "step": 206 }, { "epoch": 0.37, "grad_norm": 0.30629757046699524, "learning_rate": 9.666508728391719e-06, "loss": 0.0985, "step": 207 }, { "epoch": 0.37, "grad_norm": 0.17222163081169128, "learning_rate": 9.663153069914874e-06, "loss": 0.0789, "step": 208 }, { "epoch": 0.37, "grad_norm": 0.4108015298843384, "learning_rate": 9.65978120156994e-06, "loss": 0.0797, "step": 209 }, { "epoch": 0.37, "grad_norm": 0.2489665299654007, "learning_rate": 9.656393135078067e-06, "loss": 0.0927, "step": 210 }, { "epoch": 0.37, "grad_norm": 0.27541467547416687, "learning_rate": 9.652988882216725e-06, "loss": 0.0496, "step": 211 }, { "epoch": 0.38, "grad_norm": 0.17665977776050568, "learning_rate": 9.649568454819637e-06, "loss": 0.0666, "step": 212 }, { "epoch": 0.38, "grad_norm": 0.20858129858970642, "learning_rate": 9.646131864776762e-06, "loss": 0.0708, "step": 213 }, { "epoch": 0.38, "grad_norm": 0.18341350555419922, "learning_rate": 9.642679124034234e-06, "loss": 0.0805, "step": 214 }, { "epoch": 0.38, "grad_norm": 0.26587384939193726, "learning_rate": 9.639210244594335e-06, "loss": 0.0744, "step": 215 }, { "epoch": 0.38, "grad_norm": 0.20824970304965973, "learning_rate": 9.635725238515447e-06, "loss": 0.0821, "step": 216 }, { "epoch": 0.38, "grad_norm": 0.20785243809223175, "learning_rate": 9.63222411791201e-06, "loss": 0.0435, "step": 217 }, { "epoch": 0.39, "grad_norm": 0.45304587483406067, "learning_rate": 9.628706894954481e-06, "loss": 0.0791, "step": 218 }, { "epoch": 0.39, "grad_norm": 0.34389665722846985, "learning_rate": 9.62517358186929e-06, "loss": 0.0829, "step": 219 }, { "epoch": 0.39, "grad_norm": 0.24149852991104126, "learning_rate": 9.621624190938802e-06, "loss": 0.0555, "step": 220 }, { "epoch": 0.39, "grad_norm": 0.24253778159618378, "learning_rate": 9.61805873450127e-06, "loss": 0.0948, "step": 221 }, { "epoch": 0.39, "grad_norm": 0.24377629160881042, "learning_rate": 9.614477224950788e-06, "loss": 0.0758, "step": 222 }, { "epoch": 0.39, "grad_norm": 0.1714078187942505, "learning_rate": 9.610879674737263e-06, "loss": 0.0773, "step": 223 }, { "epoch": 0.4, "grad_norm": 0.15262386202812195, "learning_rate": 9.607266096366353e-06, "loss": 0.0523, "step": 224 }, { "epoch": 0.4, "grad_norm": 0.34207943081855774, "learning_rate": 9.603636502399436e-06, "loss": 0.0981, "step": 225 }, { "epoch": 0.4, "grad_norm": 0.2515898048877716, "learning_rate": 9.599990905453567e-06, "loss": 0.0576, "step": 226 }, { "epoch": 0.4, "grad_norm": 0.509267270565033, "learning_rate": 9.59632931820142e-06, "loss": 0.0819, "step": 227 }, { "epoch": 0.4, "grad_norm": 0.21576029062271118, "learning_rate": 9.592651753371264e-06, "loss": 0.0758, "step": 228 }, { "epoch": 0.41, "grad_norm": 0.44684383273124695, "learning_rate": 9.588958223746903e-06, "loss": 0.0543, "step": 229 }, { "epoch": 0.41, "grad_norm": 0.4631918668746948, "learning_rate": 9.585248742167638e-06, "loss": 0.0795, "step": 230 }, { "epoch": 0.41, "grad_norm": 0.3433883488178253, "learning_rate": 9.581523321528224e-06, "loss": 0.0505, "step": 231 }, { "epoch": 0.41, "grad_norm": 0.23749905824661255, "learning_rate": 9.577781974778817e-06, "loss": 0.0463, "step": 232 }, { "epoch": 0.41, "grad_norm": 0.26895391941070557, "learning_rate": 9.574024714924941e-06, "loss": 0.0501, "step": 233 }, { "epoch": 0.41, "grad_norm": 0.45005208253860474, "learning_rate": 9.570251555027432e-06, "loss": 0.1112, "step": 234 }, { "epoch": 0.42, "grad_norm": 0.21089734137058258, "learning_rate": 9.566462508202403e-06, "loss": 0.0912, "step": 235 }, { "epoch": 0.42, "grad_norm": 0.22349245846271515, "learning_rate": 9.562657587621186e-06, "loss": 0.0671, "step": 236 }, { "epoch": 0.42, "grad_norm": 0.20209869742393494, "learning_rate": 9.558836806510292e-06, "loss": 0.0507, "step": 237 }, { "epoch": 0.42, "grad_norm": 0.29422205686569214, "learning_rate": 9.555000178151375e-06, "loss": 0.0744, "step": 238 }, { "epoch": 0.42, "grad_norm": 0.2201872318983078, "learning_rate": 9.551147715881167e-06, "loss": 0.0784, "step": 239 }, { "epoch": 0.42, "grad_norm": 0.2708396017551422, "learning_rate": 9.547279433091446e-06, "loss": 0.0574, "step": 240 }, { "epoch": 0.43, "grad_norm": 0.1722852736711502, "learning_rate": 9.543395343228984e-06, "loss": 0.0788, "step": 241 }, { "epoch": 0.43, "grad_norm": 0.5377947688102722, "learning_rate": 9.5394954597955e-06, "loss": 0.0908, "step": 242 }, { "epoch": 0.43, "grad_norm": 0.2269710898399353, "learning_rate": 9.535579796347612e-06, "loss": 0.078, "step": 243 }, { "epoch": 0.43, "grad_norm": 0.2239355891942978, "learning_rate": 9.531648366496799e-06, "loss": 0.0501, "step": 244 }, { "epoch": 0.43, "grad_norm": 0.20963357388973236, "learning_rate": 9.527701183909336e-06, "loss": 0.0611, "step": 245 }, { "epoch": 0.44, "grad_norm": 0.2847237288951874, "learning_rate": 9.52373826230627e-06, "loss": 0.0761, "step": 246 }, { "epoch": 0.44, "grad_norm": 0.20428815484046936, "learning_rate": 9.519759615463346e-06, "loss": 0.0684, "step": 247 }, { "epoch": 0.44, "grad_norm": 0.18516795337200165, "learning_rate": 9.51576525721098e-06, "loss": 0.0667, "step": 248 }, { "epoch": 0.44, "grad_norm": 0.4787046015262604, "learning_rate": 9.511755201434206e-06, "loss": 0.1339, "step": 249 }, { "epoch": 0.44, "grad_norm": 0.5016088485717773, "learning_rate": 9.507729462072615e-06, "loss": 0.0723, "step": 250 }, { "epoch": 0.44, "grad_norm": 0.24285417795181274, "learning_rate": 9.503688053120327e-06, "loss": 0.0553, "step": 251 }, { "epoch": 0.45, "grad_norm": 0.44292014837265015, "learning_rate": 9.499630988625926e-06, "loss": 0.071, "step": 252 }, { "epoch": 0.45, "grad_norm": 0.7392637133598328, "learning_rate": 9.495558282692421e-06, "loss": 0.1074, "step": 253 }, { "epoch": 0.45, "grad_norm": 0.24650004506111145, "learning_rate": 9.491469949477189e-06, "loss": 0.0505, "step": 254 }, { "epoch": 0.45, "grad_norm": 0.22753530740737915, "learning_rate": 9.48736600319193e-06, "loss": 0.065, "step": 255 }, { "epoch": 0.45, "grad_norm": 0.3731183111667633, "learning_rate": 9.483246458102626e-06, "loss": 0.0809, "step": 256 }, { "epoch": 0.45, "grad_norm": 0.16581465303897858, "learning_rate": 9.479111328529473e-06, "loss": 0.0575, "step": 257 }, { "epoch": 0.46, "grad_norm": 0.2750982344150543, "learning_rate": 9.474960628846844e-06, "loss": 0.0967, "step": 258 }, { "epoch": 0.46, "grad_norm": 0.3518216609954834, "learning_rate": 9.470794373483236e-06, "loss": 0.09, "step": 259 }, { "epoch": 0.46, "grad_norm": 0.3871642053127289, "learning_rate": 9.466612576921223e-06, "loss": 0.0741, "step": 260 }, { "epoch": 0.46, "grad_norm": 0.23743928968906403, "learning_rate": 9.462415253697402e-06, "loss": 0.0764, "step": 261 }, { "epoch": 0.46, "grad_norm": 0.4438982903957367, "learning_rate": 9.458202418402339e-06, "loss": 0.089, "step": 262 }, { "epoch": 0.47, "grad_norm": 0.1571800857782364, "learning_rate": 9.453974085680527e-06, "loss": 0.0481, "step": 263 }, { "epoch": 0.47, "grad_norm": 0.243282288312912, "learning_rate": 9.449730270230326e-06, "loss": 0.0843, "step": 264 }, { "epoch": 0.47, "grad_norm": 0.21889477968215942, "learning_rate": 9.445470986803922e-06, "loss": 0.0456, "step": 265 }, { "epoch": 0.47, "grad_norm": 0.14643190801143646, "learning_rate": 9.441196250207267e-06, "loss": 0.0555, "step": 266 }, { "epoch": 0.47, "grad_norm": 0.6666358709335327, "learning_rate": 9.436906075300032e-06, "loss": 0.0775, "step": 267 }, { "epoch": 0.47, "grad_norm": 0.16846437752246857, "learning_rate": 9.432600476995552e-06, "loss": 0.0354, "step": 268 }, { "epoch": 0.48, "grad_norm": 0.23625656962394714, "learning_rate": 9.428279470260776e-06, "loss": 0.0837, "step": 269 }, { "epoch": 0.48, "grad_norm": 0.25802698731422424, "learning_rate": 9.423943070116219e-06, "loss": 0.0685, "step": 270 }, { "epoch": 0.48, "grad_norm": 0.1842992752790451, "learning_rate": 9.419591291635901e-06, "loss": 0.0418, "step": 271 }, { "epoch": 0.48, "grad_norm": 0.3693784773349762, "learning_rate": 9.415224149947307e-06, "loss": 0.0619, "step": 272 }, { "epoch": 0.48, "grad_norm": 0.36359575390815735, "learning_rate": 9.410841660231315e-06, "loss": 0.0675, "step": 273 }, { "epoch": 0.48, "grad_norm": 0.25318172574043274, "learning_rate": 9.406443837722168e-06, "loss": 0.0521, "step": 274 }, { "epoch": 0.49, "grad_norm": 0.32068488001823425, "learning_rate": 9.402030697707398e-06, "loss": 0.0744, "step": 275 }, { "epoch": 0.49, "grad_norm": 0.2709636390209198, "learning_rate": 9.397602255527792e-06, "loss": 0.0446, "step": 276 }, { "epoch": 0.49, "grad_norm": 0.4686291813850403, "learning_rate": 9.393158526577322e-06, "loss": 0.0683, "step": 277 }, { "epoch": 0.49, "grad_norm": 0.32740774750709534, "learning_rate": 9.388699526303106e-06, "loss": 0.061, "step": 278 }, { "epoch": 0.49, "grad_norm": 0.5102832913398743, "learning_rate": 9.38422527020534e-06, "loss": 0.0904, "step": 279 }, { "epoch": 0.5, "grad_norm": 0.3581833243370056, "learning_rate": 9.37973577383726e-06, "loss": 0.0691, "step": 280 }, { "epoch": 0.5, "grad_norm": 0.22778642177581787, "learning_rate": 9.375231052805074e-06, "loss": 0.0507, "step": 281 }, { "epoch": 0.5, "grad_norm": 0.3017262816429138, "learning_rate": 9.370711122767912e-06, "loss": 0.0909, "step": 282 }, { "epoch": 0.5, "grad_norm": 0.24568532407283783, "learning_rate": 9.36617599943778e-06, "loss": 0.0622, "step": 283 }, { "epoch": 0.5, "grad_norm": 0.3963547348976135, "learning_rate": 9.361625698579493e-06, "loss": 0.1131, "step": 284 }, { "epoch": 0.5, "eval_loss": 0.07626692205667496, "eval_runtime": 14.7121, "eval_samples_per_second": 32.354, "eval_steps_per_second": 8.089, "step": 284 }, { "epoch": 0.5, "grad_norm": 0.6276136636734009, "learning_rate": 9.357060236010626e-06, "loss": 0.1067, "step": 285 }, { "epoch": 0.51, "grad_norm": 0.30459901690483093, "learning_rate": 9.35247962760146e-06, "loss": 0.0863, "step": 286 }, { "epoch": 0.51, "grad_norm": 0.5768241286277771, "learning_rate": 9.347883889274922e-06, "loss": 0.0966, "step": 287 }, { "epoch": 0.51, "grad_norm": 0.14496010541915894, "learning_rate": 9.34327303700654e-06, "loss": 0.0242, "step": 288 }, { "epoch": 0.51, "grad_norm": 0.25412389636039734, "learning_rate": 9.338647086824373e-06, "loss": 0.071, "step": 289 }, { "epoch": 0.51, "grad_norm": 0.23926912248134613, "learning_rate": 9.334006054808966e-06, "loss": 0.0378, "step": 290 }, { "epoch": 0.51, "grad_norm": 0.7999410629272461, "learning_rate": 9.329349957093293e-06, "loss": 0.0912, "step": 291 }, { "epoch": 0.52, "grad_norm": 0.18663926422595978, "learning_rate": 9.324678809862696e-06, "loss": 0.0658, "step": 292 }, { "epoch": 0.52, "grad_norm": 0.27844029664993286, "learning_rate": 9.319992629354828e-06, "loss": 0.0657, "step": 293 }, { "epoch": 0.52, "grad_norm": 0.295076847076416, "learning_rate": 9.31529143185961e-06, "loss": 0.0497, "step": 294 }, { "epoch": 0.52, "grad_norm": 0.25130167603492737, "learning_rate": 9.310575233719155e-06, "loss": 0.0771, "step": 295 }, { "epoch": 0.52, "grad_norm": 0.3607313632965088, "learning_rate": 9.305844051327725e-06, "loss": 0.0852, "step": 296 }, { "epoch": 0.53, "grad_norm": 0.2247592657804489, "learning_rate": 9.301097901131671e-06, "loss": 0.0793, "step": 297 }, { "epoch": 0.53, "grad_norm": 0.34789037704467773, "learning_rate": 9.296336799629368e-06, "loss": 0.0602, "step": 298 }, { "epoch": 0.53, "grad_norm": 0.27349668741226196, "learning_rate": 9.291560763371173e-06, "loss": 0.0546, "step": 299 }, { "epoch": 0.53, "grad_norm": 0.15801368653774261, "learning_rate": 9.28676980895935e-06, "loss": 0.0545, "step": 300 }, { "epoch": 0.53, "grad_norm": 0.22296564280986786, "learning_rate": 9.28196395304803e-06, "loss": 0.0512, "step": 301 }, { "epoch": 0.53, "grad_norm": 0.18935079872608185, "learning_rate": 9.277143212343134e-06, "loss": 0.0382, "step": 302 }, { "epoch": 0.54, "grad_norm": 0.41481491923332214, "learning_rate": 9.272307603602334e-06, "loss": 0.0924, "step": 303 }, { "epoch": 0.54, "grad_norm": 0.4681742489337921, "learning_rate": 9.26745714363498e-06, "loss": 0.0644, "step": 304 }, { "epoch": 0.54, "grad_norm": 0.2106870412826538, "learning_rate": 9.262591849302049e-06, "loss": 0.0562, "step": 305 }, { "epoch": 0.54, "grad_norm": 0.45636868476867676, "learning_rate": 9.257711737516083e-06, "loss": 0.0751, "step": 306 }, { "epoch": 0.54, "grad_norm": 0.21162806451320648, "learning_rate": 9.252816825241135e-06, "loss": 0.0356, "step": 307 }, { "epoch": 0.54, "grad_norm": 0.31407129764556885, "learning_rate": 9.247907129492707e-06, "loss": 0.0713, "step": 308 }, { "epoch": 0.55, "grad_norm": 0.15091249346733093, "learning_rate": 9.242982667337686e-06, "loss": 0.066, "step": 309 }, { "epoch": 0.55, "grad_norm": 0.22152362763881683, "learning_rate": 9.238043455894294e-06, "loss": 0.0732, "step": 310 }, { "epoch": 0.55, "grad_norm": 0.21332816779613495, "learning_rate": 9.233089512332021e-06, "loss": 0.0744, "step": 311 }, { "epoch": 0.55, "grad_norm": 0.3001808524131775, "learning_rate": 9.228120853871571e-06, "loss": 0.0337, "step": 312 }, { "epoch": 0.55, "grad_norm": 0.45393407344818115, "learning_rate": 9.223137497784798e-06, "loss": 0.0704, "step": 313 }, { "epoch": 0.56, "grad_norm": 0.36986440420150757, "learning_rate": 9.218139461394644e-06, "loss": 0.0751, "step": 314 }, { "epoch": 0.56, "grad_norm": 0.25236037373542786, "learning_rate": 9.213126762075088e-06, "loss": 0.0782, "step": 315 }, { "epoch": 0.56, "grad_norm": 0.19866494834423065, "learning_rate": 9.208099417251077e-06, "loss": 0.0404, "step": 316 }, { "epoch": 0.56, "grad_norm": 0.18358244001865387, "learning_rate": 9.203057444398469e-06, "loss": 0.0362, "step": 317 }, { "epoch": 0.56, "grad_norm": 0.38209205865859985, "learning_rate": 9.198000861043967e-06, "loss": 0.0531, "step": 318 }, { "epoch": 0.56, "grad_norm": 0.2181481420993805, "learning_rate": 9.19292968476507e-06, "loss": 0.0607, "step": 319 }, { "epoch": 0.57, "grad_norm": 0.19979895651340485, "learning_rate": 9.187843933189994e-06, "loss": 0.0654, "step": 320 }, { "epoch": 0.57, "grad_norm": 0.46315455436706543, "learning_rate": 9.182743623997634e-06, "loss": 0.0654, "step": 321 }, { "epoch": 0.57, "grad_norm": 0.31687167286872864, "learning_rate": 9.17762877491748e-06, "loss": 0.0628, "step": 322 }, { "epoch": 0.57, "grad_norm": 0.3118394613265991, "learning_rate": 9.172499403729567e-06, "loss": 0.0808, "step": 323 }, { "epoch": 0.57, "grad_norm": 0.8999722599983215, "learning_rate": 9.167355528264415e-06, "loss": 0.1028, "step": 324 }, { "epoch": 0.57, "grad_norm": 0.41446566581726074, "learning_rate": 9.162197166402957e-06, "loss": 0.0896, "step": 325 }, { "epoch": 0.58, "grad_norm": 0.23004719614982605, "learning_rate": 9.157024336076488e-06, "loss": 0.067, "step": 326 }, { "epoch": 0.58, "grad_norm": 0.42118194699287415, "learning_rate": 9.151837055266595e-06, "loss": 0.0391, "step": 327 }, { "epoch": 0.58, "grad_norm": 0.21042917668819427, "learning_rate": 9.1466353420051e-06, "loss": 0.0677, "step": 328 }, { "epoch": 0.58, "grad_norm": 0.22170236706733704, "learning_rate": 9.14141921437399e-06, "loss": 0.0551, "step": 329 }, { "epoch": 0.58, "grad_norm": 0.24892501533031464, "learning_rate": 9.136188690505363e-06, "loss": 0.0527, "step": 330 }, { "epoch": 0.59, "grad_norm": 0.319352924823761, "learning_rate": 9.130943788581359e-06, "loss": 0.0843, "step": 331 }, { "epoch": 0.59, "grad_norm": 0.36097249388694763, "learning_rate": 9.1256845268341e-06, "loss": 0.108, "step": 332 }, { "epoch": 0.59, "grad_norm": 0.39498621225357056, "learning_rate": 9.120410923545619e-06, "loss": 0.053, "step": 333 }, { "epoch": 0.59, "grad_norm": 0.5976508855819702, "learning_rate": 9.115122997047812e-06, "loss": 0.093, "step": 334 }, { "epoch": 0.59, "grad_norm": 0.3573974072933197, "learning_rate": 9.109820765722357e-06, "loss": 0.0988, "step": 335 }, { "epoch": 0.59, "grad_norm": 0.3447941839694977, "learning_rate": 9.10450424800066e-06, "loss": 0.1083, "step": 336 }, { "epoch": 0.6, "grad_norm": 0.36789995431900024, "learning_rate": 9.099173462363794e-06, "loss": 0.048, "step": 337 }, { "epoch": 0.6, "grad_norm": 0.23102298378944397, "learning_rate": 9.093828427342419e-06, "loss": 0.0615, "step": 338 }, { "epoch": 0.6, "grad_norm": 0.2154015451669693, "learning_rate": 9.088469161516735e-06, "loss": 0.0775, "step": 339 }, { "epoch": 0.6, "grad_norm": 0.2312682718038559, "learning_rate": 9.083095683516414e-06, "loss": 0.0708, "step": 340 }, { "epoch": 0.6, "grad_norm": 0.1561180055141449, "learning_rate": 9.077708012020525e-06, "loss": 0.0628, "step": 341 }, { "epoch": 0.61, "grad_norm": 0.2561321556568146, "learning_rate": 9.072306165757476e-06, "loss": 0.0913, "step": 342 }, { "epoch": 0.61, "grad_norm": 0.4026300013065338, "learning_rate": 9.066890163504956e-06, "loss": 0.0757, "step": 343 }, { "epoch": 0.61, "grad_norm": 0.2731042802333832, "learning_rate": 9.061460024089853e-06, "loss": 0.1009, "step": 344 }, { "epoch": 0.61, "grad_norm": 0.32153868675231934, "learning_rate": 9.056015766388205e-06, "loss": 0.0841, "step": 345 }, { "epoch": 0.61, "grad_norm": 0.2845655381679535, "learning_rate": 9.050557409325126e-06, "loss": 0.0573, "step": 346 }, { "epoch": 0.61, "grad_norm": 0.2339751124382019, "learning_rate": 9.045084971874738e-06, "loss": 0.083, "step": 347 }, { "epoch": 0.62, "grad_norm": 0.1685064285993576, "learning_rate": 9.039598473060114e-06, "loss": 0.0598, "step": 348 }, { "epoch": 0.62, "grad_norm": 0.2221630960702896, "learning_rate": 9.0340979319532e-06, "loss": 0.0639, "step": 349 }, { "epoch": 0.62, "grad_norm": 0.24915754795074463, "learning_rate": 9.028583367674767e-06, "loss": 0.1017, "step": 350 }, { "epoch": 0.62, "grad_norm": 0.21576786041259766, "learning_rate": 9.023054799394316e-06, "loss": 0.0598, "step": 351 }, { "epoch": 0.62, "grad_norm": 0.46793073415756226, "learning_rate": 9.017512246330043e-06, "loss": 0.0845, "step": 352 }, { "epoch": 0.62, "grad_norm": 0.23132359981536865, "learning_rate": 9.01195572774875e-06, "loss": 0.0865, "step": 353 }, { "epoch": 0.63, "grad_norm": 0.1407833993434906, "learning_rate": 9.006385262965786e-06, "loss": 0.0771, "step": 354 }, { "epoch": 0.63, "grad_norm": 0.34604090452194214, "learning_rate": 9.00080087134498e-06, "loss": 0.1058, "step": 355 }, { "epoch": 0.63, "grad_norm": 0.31735068559646606, "learning_rate": 8.995202572298575e-06, "loss": 0.0833, "step": 356 }, { "epoch": 0.63, "grad_norm": 0.2618115246295929, "learning_rate": 8.989590385287156e-06, "loss": 0.0688, "step": 357 }, { "epoch": 0.63, "grad_norm": 0.22336295247077942, "learning_rate": 8.983964329819584e-06, "loss": 0.0963, "step": 358 }, { "epoch": 0.64, "grad_norm": 0.27905017137527466, "learning_rate": 8.97832442545293e-06, "loss": 0.068, "step": 359 }, { "epoch": 0.64, "grad_norm": 0.16449055075645447, "learning_rate": 8.972670691792409e-06, "loss": 0.0951, "step": 360 }, { "epoch": 0.64, "grad_norm": 0.2156919538974762, "learning_rate": 8.967003148491305e-06, "loss": 0.0495, "step": 361 }, { "epoch": 0.64, "grad_norm": 0.17048591375350952, "learning_rate": 8.961321815250905e-06, "loss": 0.0739, "step": 362 }, { "epoch": 0.64, "grad_norm": 0.21207985281944275, "learning_rate": 8.955626711820438e-06, "loss": 0.0817, "step": 363 }, { "epoch": 0.64, "grad_norm": 0.17985820770263672, "learning_rate": 8.949917857996996e-06, "loss": 0.0798, "step": 364 }, { "epoch": 0.65, "grad_norm": 0.18554697930812836, "learning_rate": 8.944195273625472e-06, "loss": 0.0511, "step": 365 }, { "epoch": 0.65, "grad_norm": 0.41460761427879333, "learning_rate": 8.938458978598483e-06, "loss": 0.104, "step": 366 }, { "epoch": 0.65, "grad_norm": 0.1609378457069397, "learning_rate": 8.932708992856315e-06, "loss": 0.078, "step": 367 }, { "epoch": 0.65, "grad_norm": 0.5837603211402893, "learning_rate": 8.926945336386838e-06, "loss": 0.0916, "step": 368 }, { "epoch": 0.65, "grad_norm": 0.20917074382305145, "learning_rate": 8.921168029225448e-06, "loss": 0.0439, "step": 369 }, { "epoch": 0.65, "grad_norm": 0.16059182584285736, "learning_rate": 8.915377091454992e-06, "loss": 0.0622, "step": 370 }, { "epoch": 0.66, "grad_norm": 0.42982298135757446, "learning_rate": 8.909572543205697e-06, "loss": 0.0688, "step": 371 }, { "epoch": 0.66, "grad_norm": 0.204355388879776, "learning_rate": 8.903754404655107e-06, "loss": 0.0355, "step": 372 }, { "epoch": 0.66, "grad_norm": 0.1616220325231552, "learning_rate": 8.897922696027998e-06, "loss": 0.0751, "step": 373 }, { "epoch": 0.66, "grad_norm": 0.22931227087974548, "learning_rate": 8.892077437596333e-06, "loss": 0.064, "step": 374 }, { "epoch": 0.66, "grad_norm": 0.24884217977523804, "learning_rate": 8.886218649679162e-06, "loss": 0.092, "step": 375 }, { "epoch": 0.67, "grad_norm": 0.23468102514743805, "learning_rate": 8.880346352642575e-06, "loss": 0.074, "step": 376 }, { "epoch": 0.67, "grad_norm": 0.40000632405281067, "learning_rate": 8.874460566899616e-06, "loss": 0.0553, "step": 377 }, { "epoch": 0.67, "grad_norm": 0.24387776851654053, "learning_rate": 8.868561312910222e-06, "loss": 0.0469, "step": 378 }, { "epoch": 0.67, "grad_norm": 0.27059826254844666, "learning_rate": 8.862648611181145e-06, "loss": 0.0561, "step": 379 }, { "epoch": 0.67, "grad_norm": 0.2047024518251419, "learning_rate": 8.856722482265886e-06, "loss": 0.0425, "step": 380 }, { "epoch": 0.67, "grad_norm": 0.19120073318481445, "learning_rate": 8.850782946764618e-06, "loss": 0.0683, "step": 381 }, { "epoch": 0.68, "grad_norm": 0.25501665472984314, "learning_rate": 8.844830025324123e-06, "loss": 0.0625, "step": 382 }, { "epoch": 0.68, "grad_norm": 0.5250377058982849, "learning_rate": 8.838863738637707e-06, "loss": 0.0729, "step": 383 }, { "epoch": 0.68, "grad_norm": 0.18302053213119507, "learning_rate": 8.83288410744514e-06, "loss": 0.0509, "step": 384 }, { "epoch": 0.68, "grad_norm": 0.3579452633857727, "learning_rate": 8.826891152532579e-06, "loss": 0.0815, "step": 385 }, { "epoch": 0.68, "grad_norm": 0.4426077902317047, "learning_rate": 8.820884894732498e-06, "loss": 0.0868, "step": 386 }, { "epoch": 0.68, "grad_norm": 0.18046371638774872, "learning_rate": 8.814865354923614e-06, "loss": 0.0545, "step": 387 }, { "epoch": 0.69, "grad_norm": 0.17153623700141907, "learning_rate": 8.808832554030809e-06, "loss": 0.0407, "step": 388 }, { "epoch": 0.69, "grad_norm": 0.29191163182258606, "learning_rate": 8.802786513025069e-06, "loss": 0.0551, "step": 389 }, { "epoch": 0.69, "grad_norm": 0.3224787414073944, "learning_rate": 8.796727252923403e-06, "loss": 0.0435, "step": 390 }, { "epoch": 0.69, "grad_norm": 0.5621923208236694, "learning_rate": 8.79065479478877e-06, "loss": 0.0522, "step": 391 }, { "epoch": 0.69, "grad_norm": 0.3209080398082733, "learning_rate": 8.784569159730008e-06, "loss": 0.0546, "step": 392 }, { "epoch": 0.7, "grad_norm": 0.6716540455818176, "learning_rate": 8.778470368901761e-06, "loss": 0.0754, "step": 393 }, { "epoch": 0.7, "grad_norm": 0.19775713980197906, "learning_rate": 8.772358443504406e-06, "loss": 0.013, "step": 394 }, { "epoch": 0.7, "grad_norm": 0.25971564650535583, "learning_rate": 8.766233404783975e-06, "loss": 0.0344, "step": 395 }, { "epoch": 0.7, "grad_norm": 0.5102748870849609, "learning_rate": 8.760095274032083e-06, "loss": 0.0646, "step": 396 }, { "epoch": 0.7, "grad_norm": 0.30724096298217773, "learning_rate": 8.75394407258586e-06, "loss": 0.0444, "step": 397 }, { "epoch": 0.7, "grad_norm": 0.2920438051223755, "learning_rate": 8.747779821827868e-06, "loss": 0.0714, "step": 398 }, { "epoch": 0.71, "grad_norm": 0.5672811269760132, "learning_rate": 8.741602543186032e-06, "loss": 0.144, "step": 399 }, { "epoch": 0.71, "grad_norm": 0.29323235154151917, "learning_rate": 8.735412258133562e-06, "loss": 0.0467, "step": 400 }, { "epoch": 0.71, "grad_norm": 0.7952798008918762, "learning_rate": 8.729208988188882e-06, "loss": 0.1006, "step": 401 }, { "epoch": 0.71, "grad_norm": 0.1980268657207489, "learning_rate": 8.722992754915555e-06, "loss": 0.0282, "step": 402 }, { "epoch": 0.71, "grad_norm": 0.3340957462787628, "learning_rate": 8.716763579922203e-06, "loss": 0.0587, "step": 403 }, { "epoch": 0.71, "grad_norm": 0.3219829499721527, "learning_rate": 8.71052148486244e-06, "loss": 0.0484, "step": 404 }, { "epoch": 0.72, "grad_norm": 0.4473950266838074, "learning_rate": 8.704266491434787e-06, "loss": 0.0613, "step": 405 }, { "epoch": 0.72, "grad_norm": 0.4419131875038147, "learning_rate": 8.697998621382608e-06, "loss": 0.0569, "step": 406 }, { "epoch": 0.72, "grad_norm": 0.3174911141395569, "learning_rate": 8.69171789649402e-06, "loss": 0.0381, "step": 407 }, { "epoch": 0.72, "grad_norm": 0.445470929145813, "learning_rate": 8.685424338601833e-06, "loss": 0.0818, "step": 408 }, { "epoch": 0.72, "grad_norm": 0.30242660641670227, "learning_rate": 8.679117969583464e-06, "loss": 0.0733, "step": 409 }, { "epoch": 0.73, "grad_norm": 0.30104532837867737, "learning_rate": 8.672798811360863e-06, "loss": 0.0635, "step": 410 }, { "epoch": 0.73, "grad_norm": 0.4942661225795746, "learning_rate": 8.66646688590044e-06, "loss": 0.0652, "step": 411 }, { "epoch": 0.73, "grad_norm": 0.48222100734710693, "learning_rate": 8.660122215212976e-06, "loss": 0.0519, "step": 412 }, { "epoch": 0.73, "grad_norm": 0.18950317800045013, "learning_rate": 8.653764821353575e-06, "loss": 0.0372, "step": 413 }, { "epoch": 0.73, "grad_norm": 1.0479586124420166, "learning_rate": 8.647394726421547e-06, "loss": 0.1133, "step": 414 }, { "epoch": 0.73, "grad_norm": 0.9393060803413391, "learning_rate": 8.641011952560372e-06, "loss": 0.1114, "step": 415 }, { "epoch": 0.74, "grad_norm": 0.6437638998031616, "learning_rate": 8.63461652195759e-06, "loss": 0.0834, "step": 416 }, { "epoch": 0.74, "grad_norm": 0.5016691088676453, "learning_rate": 8.628208456844749e-06, "loss": 0.1051, "step": 417 }, { "epoch": 0.74, "grad_norm": 0.2746430039405823, "learning_rate": 8.621787779497307e-06, "loss": 0.0756, "step": 418 }, { "epoch": 0.74, "grad_norm": 0.1799677610397339, "learning_rate": 8.615354512234569e-06, "loss": 0.0748, "step": 419 }, { "epoch": 0.74, "grad_norm": 0.32306981086730957, "learning_rate": 8.608908677419606e-06, "loss": 0.09, "step": 420 }, { "epoch": 0.74, "grad_norm": 0.40375712513923645, "learning_rate": 8.602450297459173e-06, "loss": 0.1321, "step": 421 }, { "epoch": 0.75, "grad_norm": 0.1651470959186554, "learning_rate": 8.595979394803633e-06, "loss": 0.0613, "step": 422 }, { "epoch": 0.75, "grad_norm": 0.17097824811935425, "learning_rate": 8.589495991946885e-06, "loss": 0.0701, "step": 423 }, { "epoch": 0.75, "grad_norm": 0.5410835146903992, "learning_rate": 8.583000111426277e-06, "loss": 0.0655, "step": 424 }, { "epoch": 0.75, "grad_norm": 0.2845303416252136, "learning_rate": 8.576491775822527e-06, "loss": 0.0859, "step": 425 }, { "epoch": 0.75, "grad_norm": 0.2452799528837204, "learning_rate": 8.569971007759657e-06, "loss": 0.0538, "step": 426 }, { "epoch": 0.75, "eval_loss": 0.07323230057954788, "eval_runtime": 14.7111, "eval_samples_per_second": 32.356, "eval_steps_per_second": 8.089, "step": 426 }, { "epoch": 0.76, "grad_norm": 0.6456173062324524, "learning_rate": 8.563437829904904e-06, "loss": 0.1131, "step": 427 }, { "epoch": 0.76, "grad_norm": 0.3903690278530121, "learning_rate": 8.556892264968639e-06, "loss": 0.0997, "step": 428 }, { "epoch": 0.76, "grad_norm": 0.33470049500465393, "learning_rate": 8.550334335704298e-06, "loss": 0.0987, "step": 429 }, { "epoch": 0.76, "grad_norm": 0.1498459130525589, "learning_rate": 8.543764064908295e-06, "loss": 0.0524, "step": 430 }, { "epoch": 0.76, "grad_norm": 0.21974940598011017, "learning_rate": 8.537181475419944e-06, "loss": 0.0789, "step": 431 }, { "epoch": 0.76, "grad_norm": 0.16694916784763336, "learning_rate": 8.530586590121384e-06, "loss": 0.0731, "step": 432 }, { "epoch": 0.77, "grad_norm": 0.12150876969099045, "learning_rate": 8.523979431937493e-06, "loss": 0.0694, "step": 433 }, { "epoch": 0.77, "grad_norm": 0.17073531448841095, "learning_rate": 8.51736002383581e-06, "loss": 0.0698, "step": 434 }, { "epoch": 0.77, "grad_norm": 0.2708394527435303, "learning_rate": 8.510728388826464e-06, "loss": 0.0739, "step": 435 }, { "epoch": 0.77, "grad_norm": 0.1602393388748169, "learning_rate": 8.504084549962079e-06, "loss": 0.0709, "step": 436 }, { "epoch": 0.77, "grad_norm": 0.2071549892425537, "learning_rate": 8.497428530337707e-06, "loss": 0.0757, "step": 437 }, { "epoch": 0.77, "grad_norm": 0.1717323213815689, "learning_rate": 8.490760353090738e-06, "loss": 0.0802, "step": 438 }, { "epoch": 0.78, "grad_norm": 0.2539728879928589, "learning_rate": 8.484080041400827e-06, "loss": 0.0852, "step": 439 }, { "epoch": 0.78, "grad_norm": 0.15313653647899628, "learning_rate": 8.477387618489808e-06, "loss": 0.0788, "step": 440 }, { "epoch": 0.78, "grad_norm": 0.20286281406879425, "learning_rate": 8.470683107621616e-06, "loss": 0.0423, "step": 441 }, { "epoch": 0.78, "grad_norm": 0.21172399818897247, "learning_rate": 8.463966532102207e-06, "loss": 0.0575, "step": 442 }, { "epoch": 0.78, "grad_norm": 0.23021037876605988, "learning_rate": 8.457237915279477e-06, "loss": 0.0774, "step": 443 }, { "epoch": 0.79, "grad_norm": 0.14592242240905762, "learning_rate": 8.450497280543174e-06, "loss": 0.0699, "step": 444 }, { "epoch": 0.79, "grad_norm": 0.2122061848640442, "learning_rate": 8.443744651324828e-06, "loss": 0.0801, "step": 445 }, { "epoch": 0.79, "grad_norm": 0.21682047843933105, "learning_rate": 8.43698005109766e-06, "loss": 0.0478, "step": 446 }, { "epoch": 0.79, "grad_norm": 0.19426396489143372, "learning_rate": 8.430203503376506e-06, "loss": 0.0508, "step": 447 }, { "epoch": 0.79, "grad_norm": 0.14614954590797424, "learning_rate": 8.423415031717734e-06, "loss": 0.0712, "step": 448 }, { "epoch": 0.79, "grad_norm": 0.1340399831533432, "learning_rate": 8.416614659719158e-06, "loss": 0.0723, "step": 449 }, { "epoch": 0.8, "grad_norm": 0.16521310806274414, "learning_rate": 8.409802411019962e-06, "loss": 0.0486, "step": 450 }, { "epoch": 0.8, "grad_norm": 0.3560049533843994, "learning_rate": 8.40297830930062e-06, "loss": 0.0868, "step": 451 }, { "epoch": 0.8, "grad_norm": 0.1964522898197174, "learning_rate": 8.396142378282799e-06, "loss": 0.046, "step": 452 }, { "epoch": 0.8, "grad_norm": 0.175230011343956, "learning_rate": 8.389294641729293e-06, "loss": 0.0659, "step": 453 }, { "epoch": 0.8, "grad_norm": 0.22042769193649292, "learning_rate": 8.382435123443934e-06, "loss": 0.0746, "step": 454 }, { "epoch": 0.8, "grad_norm": 0.19862250983715057, "learning_rate": 8.375563847271506e-06, "loss": 0.055, "step": 455 }, { "epoch": 0.81, "grad_norm": 0.24993905425071716, "learning_rate": 8.36868083709767e-06, "loss": 0.0858, "step": 456 }, { "epoch": 0.81, "grad_norm": 0.14945238828659058, "learning_rate": 8.361786116848871e-06, "loss": 0.0573, "step": 457 }, { "epoch": 0.81, "grad_norm": 0.43160539865493774, "learning_rate": 8.354879710492264e-06, "loss": 0.094, "step": 458 }, { "epoch": 0.81, "grad_norm": 0.5086230635643005, "learning_rate": 8.347961642035624e-06, "loss": 0.0956, "step": 459 }, { "epoch": 0.81, "grad_norm": 0.1694391518831253, "learning_rate": 8.341031935527267e-06, "loss": 0.0539, "step": 460 }, { "epoch": 0.82, "grad_norm": 0.2055732011795044, "learning_rate": 8.334090615055966e-06, "loss": 0.0564, "step": 461 }, { "epoch": 0.82, "grad_norm": 0.14689050614833832, "learning_rate": 8.327137704750863e-06, "loss": 0.0554, "step": 462 }, { "epoch": 0.82, "grad_norm": 0.3445436656475067, "learning_rate": 8.32017322878139e-06, "loss": 0.1029, "step": 463 }, { "epoch": 0.82, "grad_norm": 0.1985045075416565, "learning_rate": 8.31319721135718e-06, "loss": 0.0703, "step": 464 }, { "epoch": 0.82, "grad_norm": 0.24056944251060486, "learning_rate": 8.306209676727994e-06, "loss": 0.0909, "step": 465 }, { "epoch": 0.82, "grad_norm": 0.20218642055988312, "learning_rate": 8.29921064918362e-06, "loss": 0.0658, "step": 466 }, { "epoch": 0.83, "grad_norm": 0.2052248865365982, "learning_rate": 8.2922001530538e-06, "loss": 0.0522, "step": 467 }, { "epoch": 0.83, "grad_norm": 0.2361009120941162, "learning_rate": 8.285178212708143e-06, "loss": 0.0674, "step": 468 }, { "epoch": 0.83, "grad_norm": 0.30261772871017456, "learning_rate": 8.278144852556042e-06, "loss": 0.0672, "step": 469 }, { "epoch": 0.83, "grad_norm": 0.3114418685436249, "learning_rate": 8.271100097046585e-06, "loss": 0.0762, "step": 470 }, { "epoch": 0.83, "grad_norm": 0.4094521701335907, "learning_rate": 8.26404397066847e-06, "loss": 0.0725, "step": 471 }, { "epoch": 0.84, "grad_norm": 0.23902684450149536, "learning_rate": 8.256976497949924e-06, "loss": 0.0408, "step": 472 }, { "epoch": 0.84, "grad_norm": 0.2393728494644165, "learning_rate": 8.249897703458619e-06, "loss": 0.0608, "step": 473 }, { "epoch": 0.84, "grad_norm": 0.24134708940982819, "learning_rate": 8.242807611801578e-06, "loss": 0.0369, "step": 474 }, { "epoch": 0.84, "grad_norm": 0.14594919979572296, "learning_rate": 8.235706247625098e-06, "loss": 0.0261, "step": 475 }, { "epoch": 0.84, "grad_norm": 0.32239043712615967, "learning_rate": 8.228593635614659e-06, "loss": 0.1011, "step": 476 }, { "epoch": 0.84, "grad_norm": 0.2879891097545624, "learning_rate": 8.22146980049484e-06, "loss": 0.0742, "step": 477 }, { "epoch": 0.85, "grad_norm": 0.5778201818466187, "learning_rate": 8.21433476702924e-06, "loss": 0.1146, "step": 478 }, { "epoch": 0.85, "grad_norm": 0.214900940656662, "learning_rate": 8.207188560020374e-06, "loss": 0.037, "step": 479 }, { "epoch": 0.85, "grad_norm": 0.22797343134880066, "learning_rate": 8.200031204309604e-06, "loss": 0.0595, "step": 480 }, { "epoch": 0.85, "grad_norm": 0.19148996472358704, "learning_rate": 8.192862724777052e-06, "loss": 0.0339, "step": 481 }, { "epoch": 0.85, "grad_norm": 0.15512730181217194, "learning_rate": 8.185683146341496e-06, "loss": 0.051, "step": 482 }, { "epoch": 0.85, "grad_norm": 0.20564667880535126, "learning_rate": 8.178492493960309e-06, "loss": 0.0453, "step": 483 }, { "epoch": 0.86, "grad_norm": 0.3061021566390991, "learning_rate": 8.171290792629348e-06, "loss": 0.033, "step": 484 }, { "epoch": 0.86, "grad_norm": 0.23765882849693298, "learning_rate": 8.16407806738288e-06, "loss": 0.0614, "step": 485 }, { "epoch": 0.86, "grad_norm": 0.3384253978729248, "learning_rate": 8.156854343293501e-06, "loss": 0.0662, "step": 486 }, { "epoch": 0.86, "grad_norm": 0.1890600174665451, "learning_rate": 8.149619645472031e-06, "loss": 0.0495, "step": 487 }, { "epoch": 0.86, "grad_norm": 0.30018725991249084, "learning_rate": 8.14237399906744e-06, "loss": 0.0722, "step": 488 }, { "epoch": 0.87, "grad_norm": 0.31263217329978943, "learning_rate": 8.135117429266756e-06, "loss": 0.0693, "step": 489 }, { "epoch": 0.87, "grad_norm": 0.18034443259239197, "learning_rate": 8.127849961294984e-06, "loss": 0.0409, "step": 490 }, { "epoch": 0.87, "grad_norm": 0.23813746869564056, "learning_rate": 8.120571620415007e-06, "loss": 0.0433, "step": 491 }, { "epoch": 0.87, "grad_norm": 0.23636382818222046, "learning_rate": 8.113282431927502e-06, "loss": 0.0544, "step": 492 }, { "epoch": 0.87, "grad_norm": 0.5150755047798157, "learning_rate": 8.10598242117086e-06, "loss": 0.1072, "step": 493 }, { "epoch": 0.87, "grad_norm": 0.3001669645309448, "learning_rate": 8.09867161352109e-06, "loss": 0.0414, "step": 494 }, { "epoch": 0.88, "grad_norm": 0.228012353181839, "learning_rate": 8.091350034391732e-06, "loss": 0.0701, "step": 495 }, { "epoch": 0.88, "grad_norm": 0.3816164433956146, "learning_rate": 8.084017709233767e-06, "loss": 0.0723, "step": 496 }, { "epoch": 0.88, "grad_norm": 0.32659652829170227, "learning_rate": 8.076674663535537e-06, "loss": 0.0697, "step": 497 }, { "epoch": 0.88, "grad_norm": 0.48343512415885925, "learning_rate": 8.069320922822644e-06, "loss": 0.1034, "step": 498 }, { "epoch": 0.88, "grad_norm": 0.29286321997642517, "learning_rate": 8.061956512657872e-06, "loss": 0.075, "step": 499 }, { "epoch": 0.88, "grad_norm": 0.4552519917488098, "learning_rate": 8.05458145864109e-06, "loss": 0.0568, "step": 500 }, { "epoch": 0.89, "grad_norm": 0.3469892740249634, "learning_rate": 8.047195786409172e-06, "loss": 0.0661, "step": 501 }, { "epoch": 0.89, "grad_norm": 0.1452968716621399, "learning_rate": 8.039799521635896e-06, "loss": 0.0226, "step": 502 }, { "epoch": 0.89, "grad_norm": 0.25091221928596497, "learning_rate": 8.032392690031868e-06, "loss": 0.0486, "step": 503 }, { "epoch": 0.89, "grad_norm": 0.1830379068851471, "learning_rate": 8.024975317344421e-06, "loss": 0.0448, "step": 504 }, { "epoch": 0.89, "grad_norm": 0.32811227440834045, "learning_rate": 8.017547429357532e-06, "loss": 0.055, "step": 505 }, { "epoch": 0.9, "grad_norm": 0.24694731831550598, "learning_rate": 8.010109051891731e-06, "loss": 0.0614, "step": 506 }, { "epoch": 0.9, "grad_norm": 0.3512776792049408, "learning_rate": 8.002660210804011e-06, "loss": 0.0699, "step": 507 }, { "epoch": 0.9, "grad_norm": 0.2562338709831238, "learning_rate": 7.995200931987744e-06, "loss": 0.0726, "step": 508 }, { "epoch": 0.9, "grad_norm": 0.38721486926078796, "learning_rate": 7.987731241372572e-06, "loss": 0.0798, "step": 509 }, { "epoch": 0.9, "grad_norm": 0.24697037041187286, "learning_rate": 7.980251164924342e-06, "loss": 0.0657, "step": 510 }, { "epoch": 0.9, "grad_norm": 0.5245312452316284, "learning_rate": 7.972760728644995e-06, "loss": 0.0575, "step": 511 }, { "epoch": 0.91, "grad_norm": 0.29805997014045715, "learning_rate": 7.965259958572495e-06, "loss": 0.052, "step": 512 }, { "epoch": 0.91, "grad_norm": 0.21135053038597107, "learning_rate": 7.957748880780722e-06, "loss": 0.0378, "step": 513 }, { "epoch": 0.91, "grad_norm": 0.23149773478507996, "learning_rate": 7.950227521379382e-06, "loss": 0.0726, "step": 514 }, { "epoch": 0.91, "grad_norm": 0.19880171120166779, "learning_rate": 7.94269590651393e-06, "loss": 0.0714, "step": 515 }, { "epoch": 0.91, "grad_norm": 0.28021228313446045, "learning_rate": 7.935154062365468e-06, "loss": 0.0634, "step": 516 }, { "epoch": 0.91, "grad_norm": 0.21841171383857727, "learning_rate": 7.927602015150655e-06, "loss": 0.0625, "step": 517 }, { "epoch": 0.92, "grad_norm": 0.1774914562702179, "learning_rate": 7.920039791121617e-06, "loss": 0.0321, "step": 518 }, { "epoch": 0.92, "grad_norm": 0.38757917284965515, "learning_rate": 7.91246741656586e-06, "loss": 0.1069, "step": 519 }, { "epoch": 0.92, "grad_norm": 0.19643576443195343, "learning_rate": 7.904884917806174e-06, "loss": 0.0409, "step": 520 }, { "epoch": 0.92, "grad_norm": 0.2885828912258148, "learning_rate": 7.897292321200538e-06, "loss": 0.0514, "step": 521 }, { "epoch": 0.92, "grad_norm": 0.4019085168838501, "learning_rate": 7.889689653142037e-06, "loss": 0.089, "step": 522 }, { "epoch": 0.93, "grad_norm": 0.350379079580307, "learning_rate": 7.882076940058764e-06, "loss": 0.0505, "step": 523 }, { "epoch": 0.93, "grad_norm": 0.3123965859413147, "learning_rate": 7.87445420841373e-06, "loss": 0.0555, "step": 524 }, { "epoch": 0.93, "grad_norm": 0.21883922815322876, "learning_rate": 7.866821484704777e-06, "loss": 0.0439, "step": 525 }, { "epoch": 0.93, "grad_norm": 0.2795931100845337, "learning_rate": 7.859178795464473e-06, "loss": 0.0757, "step": 526 }, { "epoch": 0.93, "grad_norm": 0.3848627507686615, "learning_rate": 7.851526167260034e-06, "loss": 0.0995, "step": 527 }, { "epoch": 0.93, "grad_norm": 0.5650475025177002, "learning_rate": 7.843863626693221e-06, "loss": 0.0956, "step": 528 }, { "epoch": 0.94, "grad_norm": 0.20236073434352875, "learning_rate": 7.836191200400257e-06, "loss": 0.0629, "step": 529 }, { "epoch": 0.94, "grad_norm": 0.19835765659809113, "learning_rate": 7.828508915051724e-06, "loss": 0.0661, "step": 530 }, { "epoch": 0.94, "grad_norm": 0.2083461880683899, "learning_rate": 7.82081679735248e-06, "loss": 0.051, "step": 531 }, { "epoch": 0.94, "grad_norm": 0.4042919874191284, "learning_rate": 7.813114874041558e-06, "loss": 0.0736, "step": 532 }, { "epoch": 0.94, "grad_norm": 0.20774157345294952, "learning_rate": 7.80540317189208e-06, "loss": 0.0578, "step": 533 }, { "epoch": 0.94, "grad_norm": 0.20473290979862213, "learning_rate": 7.797681717711162e-06, "loss": 0.0471, "step": 534 }, { "epoch": 0.95, "grad_norm": 0.2514810562133789, "learning_rate": 7.789950538339813e-06, "loss": 0.0665, "step": 535 }, { "epoch": 0.95, "grad_norm": 0.43802833557128906, "learning_rate": 7.782209660652855e-06, "loss": 0.087, "step": 536 }, { "epoch": 0.95, "grad_norm": 0.2503105103969574, "learning_rate": 7.774459111558821e-06, "loss": 0.0819, "step": 537 }, { "epoch": 0.95, "grad_norm": 0.16841083765029907, "learning_rate": 7.766698917999862e-06, "loss": 0.0505, "step": 538 }, { "epoch": 0.95, "grad_norm": 0.3313782513141632, "learning_rate": 7.758929106951656e-06, "loss": 0.0713, "step": 539 }, { "epoch": 0.96, "grad_norm": 0.16334685683250427, "learning_rate": 7.751149705423313e-06, "loss": 0.044, "step": 540 }, { "epoch": 0.96, "grad_norm": 0.4028722941875458, "learning_rate": 7.743360740457278e-06, "loss": 0.0847, "step": 541 }, { "epoch": 0.96, "grad_norm": 0.24219095706939697, "learning_rate": 7.735562239129248e-06, "loss": 0.0848, "step": 542 }, { "epoch": 0.96, "grad_norm": 0.19837279617786407, "learning_rate": 7.72775422854806e-06, "loss": 0.0331, "step": 543 }, { "epoch": 0.96, "grad_norm": 0.21855801343917847, "learning_rate": 7.719936735855611e-06, "loss": 0.0473, "step": 544 }, { "epoch": 0.96, "grad_norm": 0.1909436583518982, "learning_rate": 7.712109788226763e-06, "loss": 0.0694, "step": 545 }, { "epoch": 0.97, "grad_norm": 0.37223267555236816, "learning_rate": 7.704273412869239e-06, "loss": 0.077, "step": 546 }, { "epoch": 0.97, "grad_norm": 0.33873534202575684, "learning_rate": 7.696427637023537e-06, "loss": 0.0391, "step": 547 }, { "epoch": 0.97, "grad_norm": 0.22075894474983215, "learning_rate": 7.688572487962836e-06, "loss": 0.0363, "step": 548 }, { "epoch": 0.97, "grad_norm": 0.3139945864677429, "learning_rate": 7.680707992992889e-06, "loss": 0.0676, "step": 549 }, { "epoch": 0.97, "grad_norm": 0.24971792101860046, "learning_rate": 7.672834179451943e-06, "loss": 0.0561, "step": 550 }, { "epoch": 0.97, "grad_norm": 0.2864936292171478, "learning_rate": 7.664951074710638e-06, "loss": 0.0885, "step": 551 }, { "epoch": 0.98, "grad_norm": 0.32206228375434875, "learning_rate": 7.657058706171912e-06, "loss": 0.065, "step": 552 }, { "epoch": 0.98, "grad_norm": 0.28918203711509705, "learning_rate": 7.649157101270904e-06, "loss": 0.108, "step": 553 }, { "epoch": 0.98, "grad_norm": 0.2585963308811188, "learning_rate": 7.641246287474856e-06, "loss": 0.0536, "step": 554 }, { "epoch": 0.98, "grad_norm": 0.4212508797645569, "learning_rate": 7.633326292283028e-06, "loss": 0.0857, "step": 555 }, { "epoch": 0.98, "grad_norm": 0.1916050761938095, "learning_rate": 7.625397143226596e-06, "loss": 0.0597, "step": 556 }, { "epoch": 0.99, "grad_norm": 0.678855299949646, "learning_rate": 7.617458867868554e-06, "loss": 0.0759, "step": 557 }, { "epoch": 0.99, "grad_norm": 0.3328985273838043, "learning_rate": 7.609511493803616e-06, "loss": 0.0545, "step": 558 }, { "epoch": 0.99, "grad_norm": 0.2722048759460449, "learning_rate": 7.601555048658133e-06, "loss": 0.0633, "step": 559 }, { "epoch": 0.99, "grad_norm": 0.2508256733417511, "learning_rate": 7.593589560089984e-06, "loss": 0.0571, "step": 560 }, { "epoch": 0.99, "grad_norm": 0.49203789234161377, "learning_rate": 7.585615055788484e-06, "loss": 0.0834, "step": 561 }, { "epoch": 0.99, "grad_norm": 0.2127446085214615, "learning_rate": 7.577631563474291e-06, "loss": 0.0658, "step": 562 }, { "epoch": 1.0, "grad_norm": 0.6191400289535522, "learning_rate": 7.569639110899303e-06, "loss": 0.0848, "step": 563 }, { "epoch": 1.0, "grad_norm": 0.22974306344985962, "learning_rate": 7.561637725846568e-06, "loss": 0.0493, "step": 564 }, { "epoch": 1.0, "grad_norm": 0.6743472814559937, "learning_rate": 7.553627436130183e-06, "loss": 0.139, "step": 565 }, { "epoch": 1.0, "grad_norm": 0.2306680679321289, "learning_rate": 7.545608269595201e-06, "loss": 0.0575, "step": 566 }, { "epoch": 1.0, "grad_norm": 0.25629836320877075, "learning_rate": 7.537580254117531e-06, "loss": 0.0596, "step": 567 }, { "epoch": 1.0, "grad_norm": 0.27573418617248535, "learning_rate": 7.529543417603844e-06, "loss": 0.0425, "step": 568 }, { "epoch": 1.0, "eval_loss": 0.06561362743377686, "eval_runtime": 14.6894, "eval_samples_per_second": 32.404, "eval_steps_per_second": 8.101, "step": 568 }, { "epoch": 1.01, "grad_norm": 0.34876716136932373, "learning_rate": 7.521497787991472e-06, "loss": 0.1116, "step": 569 }, { "epoch": 1.01, "grad_norm": 0.17311374843120575, "learning_rate": 7.513443393248312e-06, "loss": 0.0543, "step": 570 }, { "epoch": 1.01, "grad_norm": 0.38628146052360535, "learning_rate": 7.505380261372735e-06, "loss": 0.0912, "step": 571 }, { "epoch": 1.01, "grad_norm": 0.3152383863925934, "learning_rate": 7.497308420393478e-06, "loss": 0.0964, "step": 572 }, { "epoch": 1.01, "grad_norm": 0.2237791121006012, "learning_rate": 7.489227898369558e-06, "loss": 0.0776, "step": 573 }, { "epoch": 1.02, "grad_norm": 0.3081691563129425, "learning_rate": 7.481138723390165e-06, "loss": 0.091, "step": 574 }, { "epoch": 1.02, "grad_norm": 0.23158672451972961, "learning_rate": 7.473040923574567e-06, "loss": 0.042, "step": 575 }, { "epoch": 1.02, "grad_norm": 0.20039266347885132, "learning_rate": 7.464934527072016e-06, "loss": 0.0467, "step": 576 }, { "epoch": 1.02, "grad_norm": 0.15118126571178436, "learning_rate": 7.456819562061649e-06, "loss": 0.0573, "step": 577 }, { "epoch": 1.02, "grad_norm": 0.25211474299430847, "learning_rate": 7.448696056752383e-06, "loss": 0.0767, "step": 578 }, { "epoch": 1.02, "grad_norm": 0.1856004297733307, "learning_rate": 7.440564039382827e-06, "loss": 0.0446, "step": 579 }, { "epoch": 1.03, "grad_norm": 0.19141128659248352, "learning_rate": 7.432423538221179e-06, "loss": 0.0481, "step": 580 }, { "epoch": 1.03, "grad_norm": 0.3133082091808319, "learning_rate": 7.424274581565123e-06, "loss": 0.0444, "step": 581 }, { "epoch": 1.03, "grad_norm": 0.15396596491336823, "learning_rate": 7.416117197741742e-06, "loss": 0.052, "step": 582 }, { "epoch": 1.03, "grad_norm": 0.21921683847904205, "learning_rate": 7.407951415107413e-06, "loss": 0.0577, "step": 583 }, { "epoch": 1.03, "grad_norm": 0.2533078193664551, "learning_rate": 7.3997772620477e-06, "loss": 0.0769, "step": 584 }, { "epoch": 1.03, "grad_norm": 0.16214172542095184, "learning_rate": 7.391594766977277e-06, "loss": 0.0415, "step": 585 }, { "epoch": 1.04, "grad_norm": 0.1633971482515335, "learning_rate": 7.383403958339806e-06, "loss": 0.0589, "step": 586 }, { "epoch": 1.04, "grad_norm": 0.223504438996315, "learning_rate": 7.375204864607852e-06, "loss": 0.0644, "step": 587 }, { "epoch": 1.04, "grad_norm": 0.3524682819843292, "learning_rate": 7.366997514282782e-06, "loss": 0.0793, "step": 588 }, { "epoch": 1.04, "grad_norm": 0.2214106321334839, "learning_rate": 7.358781935894659e-06, "loss": 0.059, "step": 589 }, { "epoch": 1.04, "grad_norm": 0.23937495052814484, "learning_rate": 7.350558158002154e-06, "loss": 0.0286, "step": 590 }, { "epoch": 1.05, "grad_norm": 0.2844971716403961, "learning_rate": 7.342326209192435e-06, "loss": 0.0516, "step": 591 }, { "epoch": 1.05, "grad_norm": 0.2881384491920471, "learning_rate": 7.334086118081081e-06, "loss": 0.0672, "step": 592 }, { "epoch": 1.05, "grad_norm": 0.20290230214595795, "learning_rate": 7.3258379133119665e-06, "loss": 0.0411, "step": 593 }, { "epoch": 1.05, "grad_norm": 0.26997798681259155, "learning_rate": 7.317581623557177e-06, "loss": 0.048, "step": 594 }, { "epoch": 1.05, "grad_norm": 0.296772301197052, "learning_rate": 7.3093172775169e-06, "loss": 0.0662, "step": 595 }, { "epoch": 1.05, "grad_norm": 0.3392322063446045, "learning_rate": 7.3010449039193255e-06, "loss": 0.0745, "step": 596 }, { "epoch": 1.06, "grad_norm": 0.3285764455795288, "learning_rate": 7.2927645315205535e-06, "loss": 0.0848, "step": 597 }, { "epoch": 1.06, "grad_norm": 0.20147007703781128, "learning_rate": 7.284476189104486e-06, "loss": 0.0544, "step": 598 }, { "epoch": 1.06, "grad_norm": 0.2233363389968872, "learning_rate": 7.27617990548273e-06, "loss": 0.048, "step": 599 }, { "epoch": 1.06, "grad_norm": 0.3063652217388153, "learning_rate": 7.2678757094945e-06, "loss": 0.0879, "step": 600 }, { "epoch": 1.06, "grad_norm": 0.34670838713645935, "learning_rate": 7.259563630006512e-06, "loss": 0.0753, "step": 601 }, { "epoch": 1.07, "grad_norm": 0.2491820603609085, "learning_rate": 7.251243695912886e-06, "loss": 0.0492, "step": 602 }, { "epoch": 1.07, "grad_norm": 0.2033078521490097, "learning_rate": 7.242915936135052e-06, "loss": 0.0468, "step": 603 }, { "epoch": 1.07, "grad_norm": 0.47694021463394165, "learning_rate": 7.234580379621636e-06, "loss": 0.1053, "step": 604 }, { "epoch": 1.07, "grad_norm": 0.25937044620513916, "learning_rate": 7.226237055348369e-06, "loss": 0.0632, "step": 605 }, { "epoch": 1.07, "grad_norm": 0.45980414748191833, "learning_rate": 7.2178859923179855e-06, "loss": 0.0611, "step": 606 }, { "epoch": 1.07, "grad_norm": 0.2813967168331146, "learning_rate": 7.20952721956012e-06, "loss": 0.0826, "step": 607 }, { "epoch": 1.08, "grad_norm": 0.35964691638946533, "learning_rate": 7.201160766131208e-06, "loss": 0.0573, "step": 608 }, { "epoch": 1.08, "grad_norm": 0.4538634717464447, "learning_rate": 7.192786661114384e-06, "loss": 0.0615, "step": 609 }, { "epoch": 1.08, "grad_norm": 0.43191802501678467, "learning_rate": 7.184404933619377e-06, "loss": 0.071, "step": 610 }, { "epoch": 1.08, "grad_norm": 0.24433721601963043, "learning_rate": 7.176015612782421e-06, "loss": 0.032, "step": 611 }, { "epoch": 1.08, "grad_norm": 0.3911650478839874, "learning_rate": 7.167618727766138e-06, "loss": 0.0901, "step": 612 }, { "epoch": 1.08, "grad_norm": 0.25476253032684326, "learning_rate": 7.1592143077594475e-06, "loss": 0.0386, "step": 613 }, { "epoch": 1.09, "grad_norm": 0.2659028172492981, "learning_rate": 7.1508023819774644e-06, "loss": 0.0617, "step": 614 }, { "epoch": 1.09, "grad_norm": 0.26452651619911194, "learning_rate": 7.142382979661386e-06, "loss": 0.0489, "step": 615 }, { "epoch": 1.09, "grad_norm": 0.2677484154701233, "learning_rate": 7.133956130078412e-06, "loss": 0.0444, "step": 616 }, { "epoch": 1.09, "grad_norm": 0.2430453896522522, "learning_rate": 7.12552186252162e-06, "loss": 0.0632, "step": 617 }, { "epoch": 1.09, "grad_norm": 0.2421427220106125, "learning_rate": 7.117080206309879e-06, "loss": 0.0597, "step": 618 }, { "epoch": 1.1, "grad_norm": 0.45028990507125854, "learning_rate": 7.1086311907877346e-06, "loss": 0.0779, "step": 619 }, { "epoch": 1.1, "grad_norm": 0.4141206443309784, "learning_rate": 7.100174845325327e-06, "loss": 0.0991, "step": 620 }, { "epoch": 1.1, "grad_norm": 0.23673787713050842, "learning_rate": 7.091711199318265e-06, "loss": 0.0728, "step": 621 }, { "epoch": 1.1, "grad_norm": 0.3629487454891205, "learning_rate": 7.083240282187542e-06, "loss": 0.0547, "step": 622 }, { "epoch": 1.1, "grad_norm": 0.23040874302387238, "learning_rate": 7.074762123379424e-06, "loss": 0.0398, "step": 623 }, { "epoch": 1.1, "grad_norm": 0.31620001792907715, "learning_rate": 7.0662767523653515e-06, "loss": 0.0509, "step": 624 }, { "epoch": 1.11, "grad_norm": 0.2546658217906952, "learning_rate": 7.057784198641835e-06, "loss": 0.0577, "step": 625 }, { "epoch": 1.11, "grad_norm": 0.3494987487792969, "learning_rate": 7.0492844917303535e-06, "loss": 0.088, "step": 626 }, { "epoch": 1.11, "grad_norm": 0.4441275894641876, "learning_rate": 7.040777661177251e-06, "loss": 0.06, "step": 627 }, { "epoch": 1.11, "grad_norm": 0.21604092419147491, "learning_rate": 7.032263736553635e-06, "loss": 0.0424, "step": 628 }, { "epoch": 1.11, "grad_norm": 0.2695225179195404, "learning_rate": 7.0237427474552755e-06, "loss": 0.0622, "step": 629 }, { "epoch": 1.11, "grad_norm": 0.2582091689109802, "learning_rate": 7.015214723502496e-06, "loss": 0.0777, "step": 630 }, { "epoch": 1.12, "grad_norm": 0.348019540309906, "learning_rate": 7.006679694340074e-06, "loss": 0.0666, "step": 631 }, { "epoch": 1.12, "grad_norm": 0.21345749497413635, "learning_rate": 6.998137689637142e-06, "loss": 0.0245, "step": 632 }, { "epoch": 1.12, "grad_norm": 0.19356510043144226, "learning_rate": 6.989588739087079e-06, "loss": 0.0445, "step": 633 }, { "epoch": 1.12, "grad_norm": 0.33389925956726074, "learning_rate": 6.981032872407406e-06, "loss": 0.0793, "step": 634 }, { "epoch": 1.12, "grad_norm": 0.21888035535812378, "learning_rate": 6.972470119339692e-06, "loss": 0.0583, "step": 635 }, { "epoch": 1.13, "grad_norm": 0.4273500442504883, "learning_rate": 6.963900509649435e-06, "loss": 0.0739, "step": 636 }, { "epoch": 1.13, "grad_norm": 0.2740592956542969, "learning_rate": 6.955324073125979e-06, "loss": 0.065, "step": 637 }, { "epoch": 1.13, "grad_norm": 0.4859815537929535, "learning_rate": 6.946740839582388e-06, "loss": 0.0498, "step": 638 }, { "epoch": 1.13, "grad_norm": 0.1959264874458313, "learning_rate": 6.93815083885536e-06, "loss": 0.0375, "step": 639 }, { "epoch": 1.13, "grad_norm": 0.28902924060821533, "learning_rate": 6.929554100805118e-06, "loss": 0.0493, "step": 640 }, { "epoch": 1.13, "grad_norm": 0.18830162286758423, "learning_rate": 6.920950655315298e-06, "loss": 0.0271, "step": 641 }, { "epoch": 1.14, "grad_norm": 0.31182587146759033, "learning_rate": 6.912340532292861e-06, "loss": 0.086, "step": 642 }, { "epoch": 1.14, "grad_norm": 0.4169754683971405, "learning_rate": 6.903723761667973e-06, "loss": 0.0639, "step": 643 }, { "epoch": 1.14, "grad_norm": 0.48527055978775024, "learning_rate": 6.8951003733939125e-06, "loss": 0.0852, "step": 644 }, { "epoch": 1.14, "grad_norm": 0.21746531128883362, "learning_rate": 6.886470397446958e-06, "loss": 0.06, "step": 645 }, { "epoch": 1.14, "grad_norm": 0.18846824765205383, "learning_rate": 6.877833863826295e-06, "loss": 0.0486, "step": 646 }, { "epoch": 1.14, "grad_norm": 0.24251620471477509, "learning_rate": 6.869190802553895e-06, "loss": 0.0663, "step": 647 }, { "epoch": 1.15, "grad_norm": 0.175017848610878, "learning_rate": 6.860541243674427e-06, "loss": 0.022, "step": 648 }, { "epoch": 1.15, "grad_norm": 0.33023568987846375, "learning_rate": 6.8518852172551454e-06, "loss": 0.0799, "step": 649 }, { "epoch": 1.15, "grad_norm": 0.4299830198287964, "learning_rate": 6.843222753385785e-06, "loss": 0.0401, "step": 650 }, { "epoch": 1.15, "grad_norm": 0.24229450523853302, "learning_rate": 6.834553882178464e-06, "loss": 0.0481, "step": 651 }, { "epoch": 1.15, "grad_norm": 0.21505345404148102, "learning_rate": 6.825878633767564e-06, "loss": 0.05, "step": 652 }, { "epoch": 1.16, "grad_norm": 0.29249686002731323, "learning_rate": 6.817197038309644e-06, "loss": 0.0344, "step": 653 }, { "epoch": 1.16, "grad_norm": 0.3737218677997589, "learning_rate": 6.808509125983321e-06, "loss": 0.068, "step": 654 }, { "epoch": 1.16, "grad_norm": 0.2989976108074188, "learning_rate": 6.799814926989171e-06, "loss": 0.0583, "step": 655 }, { "epoch": 1.16, "grad_norm": 0.2579036056995392, "learning_rate": 6.791114471549627e-06, "loss": 0.0347, "step": 656 }, { "epoch": 1.16, "grad_norm": 0.19712986052036285, "learning_rate": 6.782407789908864e-06, "loss": 0.056, "step": 657 }, { "epoch": 1.16, "grad_norm": 0.3034021854400635, "learning_rate": 6.773694912332706e-06, "loss": 0.0694, "step": 658 }, { "epoch": 1.17, "grad_norm": 0.2495863288640976, "learning_rate": 6.764975869108514e-06, "loss": 0.0562, "step": 659 }, { "epoch": 1.17, "grad_norm": 0.22996436059474945, "learning_rate": 6.756250690545079e-06, "loss": 0.0519, "step": 660 }, { "epoch": 1.17, "grad_norm": 0.42392170429229736, "learning_rate": 6.747519406972525e-06, "loss": 0.0487, "step": 661 }, { "epoch": 1.17, "grad_norm": 0.502571702003479, "learning_rate": 6.738782048742187e-06, "loss": 0.0956, "step": 662 }, { "epoch": 1.17, "grad_norm": 0.4921998083591461, "learning_rate": 6.730038646226532e-06, "loss": 0.037, "step": 663 }, { "epoch": 1.17, "grad_norm": 0.5764889717102051, "learning_rate": 6.721289229819024e-06, "loss": 0.0761, "step": 664 }, { "epoch": 1.18, "grad_norm": 0.6852768659591675, "learning_rate": 6.712533829934042e-06, "loss": 0.0516, "step": 665 }, { "epoch": 1.18, "grad_norm": 0.32368728518486023, "learning_rate": 6.703772477006758e-06, "loss": 0.0279, "step": 666 }, { "epoch": 1.18, "grad_norm": 0.3027501702308655, "learning_rate": 6.6950052014930375e-06, "loss": 0.0823, "step": 667 }, { "epoch": 1.18, "grad_norm": 0.21375176310539246, "learning_rate": 6.686232033869343e-06, "loss": 0.0565, "step": 668 }, { "epoch": 1.18, "grad_norm": 0.25710931420326233, "learning_rate": 6.677453004632608e-06, "loss": 0.0462, "step": 669 }, { "epoch": 1.19, "grad_norm": 0.3554043471813202, "learning_rate": 6.6686681443001485e-06, "loss": 0.0639, "step": 670 }, { "epoch": 1.19, "grad_norm": 0.44096800684928894, "learning_rate": 6.659877483409545e-06, "loss": 0.0585, "step": 671 }, { "epoch": 1.19, "grad_norm": 0.33739519119262695, "learning_rate": 6.65108105251855e-06, "loss": 0.0794, "step": 672 }, { "epoch": 1.19, "grad_norm": 0.3041416108608246, "learning_rate": 6.6422788822049635e-06, "loss": 0.0762, "step": 673 }, { "epoch": 1.19, "grad_norm": 0.45933637022972107, "learning_rate": 6.633471003066543e-06, "loss": 0.0652, "step": 674 }, { "epoch": 1.19, "grad_norm": 0.30925846099853516, "learning_rate": 6.62465744572089e-06, "loss": 0.0746, "step": 675 }, { "epoch": 1.2, "grad_norm": 0.9105139970779419, "learning_rate": 6.615838240805344e-06, "loss": 0.1138, "step": 676 }, { "epoch": 1.2, "grad_norm": 0.33594557642936707, "learning_rate": 6.607013418976874e-06, "loss": 0.0375, "step": 677 }, { "epoch": 1.2, "grad_norm": 0.22061163187026978, "learning_rate": 6.598183010911978e-06, "loss": 0.0472, "step": 678 }, { "epoch": 1.2, "grad_norm": 0.3593815863132477, "learning_rate": 6.5893470473065716e-06, "loss": 0.0636, "step": 679 }, { "epoch": 1.2, "grad_norm": 0.29337939620018005, "learning_rate": 6.580505558875878e-06, "loss": 0.0529, "step": 680 }, { "epoch": 1.2, "grad_norm": 0.16969534754753113, "learning_rate": 6.571658576354334e-06, "loss": 0.0741, "step": 681 }, { "epoch": 1.21, "grad_norm": 0.23921915888786316, "learning_rate": 6.562806130495467e-06, "loss": 0.0423, "step": 682 }, { "epoch": 1.21, "grad_norm": 0.22711874544620514, "learning_rate": 6.5539482520718e-06, "loss": 0.0615, "step": 683 }, { "epoch": 1.21, "grad_norm": 0.17673003673553467, "learning_rate": 6.545084971874738e-06, "loss": 0.0521, "step": 684 }, { "epoch": 1.21, "grad_norm": 0.22866903245449066, "learning_rate": 6.536216320714466e-06, "loss": 0.0452, "step": 685 }, { "epoch": 1.21, "grad_norm": 0.5617073178291321, "learning_rate": 6.527342329419837e-06, "loss": 0.0838, "step": 686 }, { "epoch": 1.22, "grad_norm": 0.45371171832084656, "learning_rate": 6.518463028838271e-06, "loss": 0.0485, "step": 687 }, { "epoch": 1.22, "grad_norm": 0.27730685472488403, "learning_rate": 6.5095784498356365e-06, "loss": 0.0348, "step": 688 }, { "epoch": 1.22, "grad_norm": 0.35711053013801575, "learning_rate": 6.5006886232961585e-06, "loss": 0.0639, "step": 689 }, { "epoch": 1.22, "grad_norm": 0.5427911281585693, "learning_rate": 6.491793580122301e-06, "loss": 0.0729, "step": 690 }, { "epoch": 1.22, "grad_norm": 0.4389806091785431, "learning_rate": 6.482893351234658e-06, "loss": 0.0547, "step": 691 }, { "epoch": 1.22, "grad_norm": 0.2546471357345581, "learning_rate": 6.473987967571855e-06, "loss": 0.0419, "step": 692 }, { "epoch": 1.23, "grad_norm": 0.28974685072898865, "learning_rate": 6.465077460090431e-06, "loss": 0.035, "step": 693 }, { "epoch": 1.23, "grad_norm": 0.4222591519355774, "learning_rate": 6.4561618597647445e-06, "loss": 0.047, "step": 694 }, { "epoch": 1.23, "grad_norm": 0.45247283577919006, "learning_rate": 6.447241197586847e-06, "loss": 0.1075, "step": 695 }, { "epoch": 1.23, "grad_norm": 0.5222399830818176, "learning_rate": 6.438315504566397e-06, "loss": 0.1131, "step": 696 }, { "epoch": 1.23, "grad_norm": 0.36488956212997437, "learning_rate": 6.429384811730528e-06, "loss": 0.0498, "step": 697 }, { "epoch": 1.23, "grad_norm": 0.39750203490257263, "learning_rate": 6.420449150123768e-06, "loss": 0.0623, "step": 698 }, { "epoch": 1.24, "grad_norm": 0.27273041009902954, "learning_rate": 6.411508550807905e-06, "loss": 0.058, "step": 699 }, { "epoch": 1.24, "grad_norm": 0.4950941205024719, "learning_rate": 6.402563044861899e-06, "loss": 0.0491, "step": 700 }, { "epoch": 1.24, "grad_norm": 0.30463680624961853, "learning_rate": 6.393612663381763e-06, "loss": 0.0677, "step": 701 }, { "epoch": 1.24, "grad_norm": 0.2015472948551178, "learning_rate": 6.384657437480458e-06, "loss": 0.0454, "step": 702 }, { "epoch": 1.24, "grad_norm": 0.2115863561630249, "learning_rate": 6.375697398287788e-06, "loss": 0.0437, "step": 703 }, { "epoch": 1.25, "grad_norm": 0.1840757578611374, "learning_rate": 6.3667325769502845e-06, "loss": 0.0712, "step": 704 }, { "epoch": 1.25, "grad_norm": 0.3236899971961975, "learning_rate": 6.357763004631104e-06, "loss": 0.057, "step": 705 }, { "epoch": 1.25, "grad_norm": 0.2529219686985016, "learning_rate": 6.34878871250992e-06, "loss": 0.0453, "step": 706 }, { "epoch": 1.25, "grad_norm": 0.20748858153820038, "learning_rate": 6.3398097317828114e-06, "loss": 0.0433, "step": 707 }, { "epoch": 1.25, "grad_norm": 0.3028950095176697, "learning_rate": 6.330826093662157e-06, "loss": 0.0545, "step": 708 }, { "epoch": 1.25, "grad_norm": 1.2200806140899658, "learning_rate": 6.321837829376521e-06, "loss": 0.1283, "step": 709 }, { "epoch": 1.26, "grad_norm": 0.44462844729423523, "learning_rate": 6.312844970170551e-06, "loss": 0.0866, "step": 710 }, { "epoch": 1.26, "eval_loss": 0.05823693051934242, "eval_runtime": 14.7341, "eval_samples_per_second": 32.306, "eval_steps_per_second": 8.076, "step": 710 }, { "epoch": 1.26, "grad_norm": 0.23898513615131378, "learning_rate": 6.303847547304872e-06, "loss": 0.0644, "step": 711 }, { "epoch": 1.26, "grad_norm": 0.2102598398923874, "learning_rate": 6.294845592055967e-06, "loss": 0.065, "step": 712 }, { "epoch": 1.26, "grad_norm": 0.13285306096076965, "learning_rate": 6.2858391357160785e-06, "loss": 0.0537, "step": 713 }, { "epoch": 1.26, "grad_norm": 0.5347241163253784, "learning_rate": 6.27682820959309e-06, "loss": 0.0458, "step": 714 }, { "epoch": 1.26, "grad_norm": 0.4614933431148529, "learning_rate": 6.267812845010431e-06, "loss": 0.07, "step": 715 }, { "epoch": 1.27, "grad_norm": 0.2043062001466751, "learning_rate": 6.258793073306949e-06, "loss": 0.0506, "step": 716 }, { "epoch": 1.27, "grad_norm": 0.2690112292766571, "learning_rate": 6.2497689258368225e-06, "loss": 0.0924, "step": 717 }, { "epoch": 1.27, "grad_norm": 0.3479662537574768, "learning_rate": 6.2407404339694324e-06, "loss": 0.0802, "step": 718 }, { "epoch": 1.27, "grad_norm": 0.20581212639808655, "learning_rate": 6.231707629089263e-06, "loss": 0.0677, "step": 719 }, { "epoch": 1.27, "grad_norm": 0.22926701605319977, "learning_rate": 6.2226705425958e-06, "loss": 0.0446, "step": 720 }, { "epoch": 1.28, "grad_norm": 0.39380067586898804, "learning_rate": 6.2136292059034e-06, "loss": 0.0418, "step": 721 }, { "epoch": 1.28, "grad_norm": 0.22926107048988342, "learning_rate": 6.204583650441201e-06, "loss": 0.0553, "step": 722 }, { "epoch": 1.28, "grad_norm": 0.3458244502544403, "learning_rate": 6.1955339076530045e-06, "loss": 0.0834, "step": 723 }, { "epoch": 1.28, "grad_norm": 0.5059794783592224, "learning_rate": 6.18648000899717e-06, "loss": 0.0524, "step": 724 }, { "epoch": 1.28, "grad_norm": 0.19327516853809357, "learning_rate": 6.177421985946499e-06, "loss": 0.0324, "step": 725 }, { "epoch": 1.28, "grad_norm": 0.27781131863594055, "learning_rate": 6.168359869988134e-06, "loss": 0.0677, "step": 726 }, { "epoch": 1.29, "grad_norm": 0.6419954299926758, "learning_rate": 6.159293692623443e-06, "loss": 0.1251, "step": 727 }, { "epoch": 1.29, "grad_norm": 0.1929435431957245, "learning_rate": 6.150223485367914e-06, "loss": 0.0547, "step": 728 }, { "epoch": 1.29, "grad_norm": 0.39410218596458435, "learning_rate": 6.141149279751043e-06, "loss": 0.0516, "step": 729 }, { "epoch": 1.29, "grad_norm": 0.2835167348384857, "learning_rate": 6.132071107316221e-06, "loss": 0.0719, "step": 730 }, { "epoch": 1.29, "grad_norm": 0.4182420074939728, "learning_rate": 6.122988999620634e-06, "loss": 0.0346, "step": 731 }, { "epoch": 1.3, "grad_norm": 0.3160066604614258, "learning_rate": 6.113902988235145e-06, "loss": 0.08, "step": 732 }, { "epoch": 1.3, "grad_norm": 0.24593767523765564, "learning_rate": 6.1048131047441876e-06, "loss": 0.052, "step": 733 }, { "epoch": 1.3, "grad_norm": 0.2121860831975937, "learning_rate": 6.095719380745654e-06, "loss": 0.0437, "step": 734 }, { "epoch": 1.3, "grad_norm": 0.605929970741272, "learning_rate": 6.0866218478507875e-06, "loss": 0.0886, "step": 735 }, { "epoch": 1.3, "grad_norm": 0.5460467338562012, "learning_rate": 6.0775205376840715e-06, "loss": 0.0617, "step": 736 }, { "epoch": 1.3, "grad_norm": 0.2423592209815979, "learning_rate": 6.068415481883122e-06, "loss": 0.0599, "step": 737 }, { "epoch": 1.31, "grad_norm": 0.38910943269729614, "learning_rate": 6.059306712098571e-06, "loss": 0.0756, "step": 738 }, { "epoch": 1.31, "grad_norm": 0.16872254014015198, "learning_rate": 6.050194259993967e-06, "loss": 0.0342, "step": 739 }, { "epoch": 1.31, "grad_norm": 0.44383174180984497, "learning_rate": 6.041078157245649e-06, "loss": 0.1051, "step": 740 }, { "epoch": 1.31, "grad_norm": 0.5203137397766113, "learning_rate": 6.031958435542659e-06, "loss": 0.1132, "step": 741 }, { "epoch": 1.31, "grad_norm": 0.614219605922699, "learning_rate": 6.022835126586609e-06, "loss": 0.0534, "step": 742 }, { "epoch": 1.31, "grad_norm": 0.1791837066411972, "learning_rate": 6.0137082620915865e-06, "loss": 0.0508, "step": 743 }, { "epoch": 1.32, "grad_norm": 0.28818053007125854, "learning_rate": 6.004577873784035e-06, "loss": 0.0623, "step": 744 }, { "epoch": 1.32, "grad_norm": 0.43602147698402405, "learning_rate": 5.995443993402647e-06, "loss": 0.0659, "step": 745 }, { "epoch": 1.32, "grad_norm": 0.2554510235786438, "learning_rate": 5.986306652698261e-06, "loss": 0.0762, "step": 746 }, { "epoch": 1.32, "grad_norm": 0.22898496687412262, "learning_rate": 5.977165883433734e-06, "loss": 0.0363, "step": 747 }, { "epoch": 1.32, "grad_norm": 0.5783344507217407, "learning_rate": 5.968021717383849e-06, "loss": 0.0927, "step": 748 }, { "epoch": 1.33, "grad_norm": 0.25347182154655457, "learning_rate": 5.958874186335193e-06, "loss": 0.0391, "step": 749 }, { "epoch": 1.33, "grad_norm": 0.2889784872531891, "learning_rate": 5.949723322086053e-06, "loss": 0.0589, "step": 750 }, { "epoch": 1.33, "grad_norm": 0.3338702321052551, "learning_rate": 5.940569156446299e-06, "loss": 0.0779, "step": 751 }, { "epoch": 1.33, "grad_norm": 0.23281054198741913, "learning_rate": 5.931411721237279e-06, "loss": 0.0398, "step": 752 }, { "epoch": 1.33, "grad_norm": 0.24785956740379333, "learning_rate": 5.9222510482917075e-06, "loss": 0.0445, "step": 753 }, { "epoch": 1.33, "grad_norm": 0.42631295323371887, "learning_rate": 5.9130871694535545e-06, "loss": 0.0522, "step": 754 }, { "epoch": 1.34, "grad_norm": 0.4015826880931854, "learning_rate": 5.9039201165779315e-06, "loss": 0.0434, "step": 755 }, { "epoch": 1.34, "grad_norm": 0.2290300726890564, "learning_rate": 5.8947499215309836e-06, "loss": 0.0587, "step": 756 }, { "epoch": 1.34, "grad_norm": 0.328885942697525, "learning_rate": 5.885576616189781e-06, "loss": 0.049, "step": 757 }, { "epoch": 1.34, "grad_norm": 0.5737412571907043, "learning_rate": 5.876400232442206e-06, "loss": 0.0765, "step": 758 }, { "epoch": 1.34, "grad_norm": 0.3481307327747345, "learning_rate": 5.867220802186837e-06, "loss": 0.0432, "step": 759 }, { "epoch": 1.34, "grad_norm": 0.20464670658111572, "learning_rate": 5.858038357332851e-06, "loss": 0.0589, "step": 760 }, { "epoch": 1.35, "grad_norm": 0.645138144493103, "learning_rate": 5.8488529297998946e-06, "loss": 0.0763, "step": 761 }, { "epoch": 1.35, "grad_norm": 0.22710151970386505, "learning_rate": 5.839664551517989e-06, "loss": 0.0452, "step": 762 }, { "epoch": 1.35, "grad_norm": 0.2501658499240875, "learning_rate": 5.83047325442741e-06, "loss": 0.0392, "step": 763 }, { "epoch": 1.35, "grad_norm": 0.40542712807655334, "learning_rate": 5.821279070478582e-06, "loss": 0.0449, "step": 764 }, { "epoch": 1.35, "grad_norm": 0.49357643723487854, "learning_rate": 5.812082031631966e-06, "loss": 0.0848, "step": 765 }, { "epoch": 1.36, "grad_norm": 0.33942776918411255, "learning_rate": 5.8028821698579385e-06, "loss": 0.0674, "step": 766 }, { "epoch": 1.36, "grad_norm": 0.33811476826667786, "learning_rate": 5.7936795171367e-06, "loss": 0.0854, "step": 767 }, { "epoch": 1.36, "grad_norm": 0.35965660214424133, "learning_rate": 5.784474105458143e-06, "loss": 0.0532, "step": 768 }, { "epoch": 1.36, "grad_norm": 0.27447643876075745, "learning_rate": 5.77526596682176e-06, "loss": 0.0496, "step": 769 }, { "epoch": 1.36, "grad_norm": 0.3869398534297943, "learning_rate": 5.766055133236513e-06, "loss": 0.09, "step": 770 }, { "epoch": 1.36, "grad_norm": 0.3004852831363678, "learning_rate": 5.75684163672074e-06, "loss": 0.0729, "step": 771 }, { "epoch": 1.37, "grad_norm": 0.30032235383987427, "learning_rate": 5.747625509302032e-06, "loss": 0.0611, "step": 772 }, { "epoch": 1.37, "grad_norm": 0.3252098858356476, "learning_rate": 5.7384067830171276e-06, "loss": 0.074, "step": 773 }, { "epoch": 1.37, "grad_norm": 0.43575194478034973, "learning_rate": 5.729185489911797e-06, "loss": 0.065, "step": 774 }, { "epoch": 1.37, "grad_norm": 0.2744060158729553, "learning_rate": 5.7199616620407325e-06, "loss": 0.0684, "step": 775 }, { "epoch": 1.37, "grad_norm": 0.47385725378990173, "learning_rate": 5.710735331467444e-06, "loss": 0.1052, "step": 776 }, { "epoch": 1.37, "grad_norm": 0.23851534724235535, "learning_rate": 5.701506530264133e-06, "loss": 0.0372, "step": 777 }, { "epoch": 1.38, "grad_norm": 0.44428861141204834, "learning_rate": 5.692275290511592e-06, "loss": 0.0685, "step": 778 }, { "epoch": 1.38, "grad_norm": 0.23831063508987427, "learning_rate": 5.683041644299094e-06, "loss": 0.055, "step": 779 }, { "epoch": 1.38, "grad_norm": 0.3996879458427429, "learning_rate": 5.673805623724272e-06, "loss": 0.0709, "step": 780 }, { "epoch": 1.38, "grad_norm": 0.29323792457580566, "learning_rate": 5.664567260893019e-06, "loss": 0.0638, "step": 781 }, { "epoch": 1.38, "grad_norm": 0.2788830101490021, "learning_rate": 5.655326587919361e-06, "loss": 0.0652, "step": 782 }, { "epoch": 1.39, "grad_norm": 0.2126566469669342, "learning_rate": 5.646083636925363e-06, "loss": 0.0463, "step": 783 }, { "epoch": 1.39, "grad_norm": 0.3818773925304413, "learning_rate": 5.636838440041004e-06, "loss": 0.058, "step": 784 }, { "epoch": 1.39, "grad_norm": 0.20874321460723877, "learning_rate": 5.627591029404072e-06, "loss": 0.069, "step": 785 }, { "epoch": 1.39, "grad_norm": 0.25264501571655273, "learning_rate": 5.61834143716005e-06, "loss": 0.0669, "step": 786 }, { "epoch": 1.39, "grad_norm": 0.22084520757198334, "learning_rate": 5.609089695462002e-06, "loss": 0.0435, "step": 787 }, { "epoch": 1.39, "grad_norm": 0.22601231932640076, "learning_rate": 5.599835836470469e-06, "loss": 0.0571, "step": 788 }, { "epoch": 1.4, "grad_norm": 0.19402346014976501, "learning_rate": 5.590579892353348e-06, "loss": 0.0418, "step": 789 }, { "epoch": 1.4, "grad_norm": 0.40445828437805176, "learning_rate": 5.581321895285787e-06, "loss": 0.0777, "step": 790 }, { "epoch": 1.4, "grad_norm": 0.25165826082229614, "learning_rate": 5.572061877450068e-06, "loss": 0.0469, "step": 791 }, { "epoch": 1.4, "grad_norm": 0.22140294313430786, "learning_rate": 5.562799871035496e-06, "loss": 0.059, "step": 792 }, { "epoch": 1.4, "grad_norm": 0.3147905170917511, "learning_rate": 5.553535908238295e-06, "loss": 0.0815, "step": 793 }, { "epoch": 1.4, "grad_norm": 0.2372606098651886, "learning_rate": 5.544270021261483e-06, "loss": 0.0699, "step": 794 }, { "epoch": 1.41, "grad_norm": 0.5909821391105652, "learning_rate": 5.535002242314772e-06, "loss": 0.0957, "step": 795 }, { "epoch": 1.41, "grad_norm": 0.3787746727466583, "learning_rate": 5.5257326036144446e-06, "loss": 0.0716, "step": 796 }, { "epoch": 1.41, "grad_norm": 0.20804928243160248, "learning_rate": 5.516461137383254e-06, "loss": 0.0392, "step": 797 }, { "epoch": 1.41, "grad_norm": 0.20450638234615326, "learning_rate": 5.507187875850305e-06, "loss": 0.0365, "step": 798 }, { "epoch": 1.41, "grad_norm": 0.30582720041275024, "learning_rate": 5.497912851250941e-06, "loss": 0.0455, "step": 799 }, { "epoch": 1.42, "grad_norm": 0.3252048194408417, "learning_rate": 5.488636095826636e-06, "loss": 0.0542, "step": 800 }, { "epoch": 1.42, "grad_norm": 0.44586294889450073, "learning_rate": 5.4793576418248775e-06, "loss": 0.0769, "step": 801 }, { "epoch": 1.42, "grad_norm": 0.21377098560333252, "learning_rate": 5.470077521499063e-06, "loss": 0.0517, "step": 802 }, { "epoch": 1.42, "grad_norm": 0.39969006180763245, "learning_rate": 5.460795767108379e-06, "loss": 0.0743, "step": 803 }, { "epoch": 1.42, "grad_norm": 0.2593373656272888, "learning_rate": 5.451512410917691e-06, "loss": 0.0364, "step": 804 }, { "epoch": 1.42, "grad_norm": 0.2312588095664978, "learning_rate": 5.4422274851974356e-06, "loss": 0.0743, "step": 805 }, { "epoch": 1.43, "grad_norm": 0.3301038444042206, "learning_rate": 5.432941022223503e-06, "loss": 0.0278, "step": 806 }, { "epoch": 1.43, "grad_norm": 0.1854841262102127, "learning_rate": 5.42365305427713e-06, "loss": 0.0617, "step": 807 }, { "epoch": 1.43, "grad_norm": 0.2691740095615387, "learning_rate": 5.414363613644782e-06, "loss": 0.0476, "step": 808 }, { "epoch": 1.43, "grad_norm": 0.3060298562049866, "learning_rate": 5.4050727326180426e-06, "loss": 0.053, "step": 809 }, { "epoch": 1.43, "grad_norm": 0.34074532985687256, "learning_rate": 5.395780443493508e-06, "loss": 0.0706, "step": 810 }, { "epoch": 1.43, "grad_norm": 0.7154234051704407, "learning_rate": 5.386486778572665e-06, "loss": 0.0592, "step": 811 }, { "epoch": 1.44, "grad_norm": 0.2346647083759308, "learning_rate": 5.377191770161783e-06, "loss": 0.0525, "step": 812 }, { "epoch": 1.44, "grad_norm": 0.46330153942108154, "learning_rate": 5.3678954505718016e-06, "loss": 0.0729, "step": 813 }, { "epoch": 1.44, "grad_norm": 0.3810093402862549, "learning_rate": 5.358597852118219e-06, "loss": 0.0993, "step": 814 }, { "epoch": 1.44, "grad_norm": 0.22972767055034637, "learning_rate": 5.34929900712098e-06, "loss": 0.0517, "step": 815 }, { "epoch": 1.44, "grad_norm": 0.2932167053222656, "learning_rate": 5.339998947904362e-06, "loss": 0.0583, "step": 816 }, { "epoch": 1.45, "grad_norm": 0.43843695521354675, "learning_rate": 5.330697706796861e-06, "loss": 0.0528, "step": 817 }, { "epoch": 1.45, "grad_norm": 0.29898717999458313, "learning_rate": 5.3213953161310825e-06, "loss": 0.0722, "step": 818 }, { "epoch": 1.45, "grad_norm": 0.23077327013015747, "learning_rate": 5.3120918082436315e-06, "loss": 0.045, "step": 819 }, { "epoch": 1.45, "grad_norm": 0.31443125009536743, "learning_rate": 5.302787215474992e-06, "loss": 0.0686, "step": 820 }, { "epoch": 1.45, "grad_norm": 0.30536675453186035, "learning_rate": 5.293481570169421e-06, "loss": 0.056, "step": 821 }, { "epoch": 1.45, "grad_norm": 0.8416706919670105, "learning_rate": 5.284174904674835e-06, "loss": 0.1033, "step": 822 }, { "epoch": 1.46, "grad_norm": 0.35498178005218506, "learning_rate": 5.274867251342694e-06, "loss": 0.0428, "step": 823 }, { "epoch": 1.46, "grad_norm": 0.3787842094898224, "learning_rate": 5.265558642527897e-06, "loss": 0.0896, "step": 824 }, { "epoch": 1.46, "grad_norm": 0.3150663673877716, "learning_rate": 5.256249110588659e-06, "loss": 0.0612, "step": 825 }, { "epoch": 1.46, "grad_norm": 0.4420692026615143, "learning_rate": 5.246938687886409e-06, "loss": 0.102, "step": 826 }, { "epoch": 1.46, "grad_norm": 0.5929310321807861, "learning_rate": 5.237627406785667e-06, "loss": 0.0862, "step": 827 }, { "epoch": 1.46, "grad_norm": 1.0055999755859375, "learning_rate": 5.228315299653942e-06, "loss": 0.1209, "step": 828 }, { "epoch": 1.47, "grad_norm": 0.37658628821372986, "learning_rate": 5.219002398861611e-06, "loss": 0.0911, "step": 829 }, { "epoch": 1.47, "grad_norm": 0.26989948749542236, "learning_rate": 5.209688736781811e-06, "loss": 0.0696, "step": 830 }, { "epoch": 1.47, "grad_norm": 0.24650172889232635, "learning_rate": 5.200374345790326e-06, "loss": 0.0712, "step": 831 }, { "epoch": 1.47, "grad_norm": 0.2733075022697449, "learning_rate": 5.1910592582654715e-06, "loss": 0.0739, "step": 832 }, { "epoch": 1.47, "grad_norm": 0.20648646354675293, "learning_rate": 5.18174350658799e-06, "loss": 0.0509, "step": 833 }, { "epoch": 1.48, "grad_norm": 0.5087945461273193, "learning_rate": 5.172427123140923e-06, "loss": 0.1139, "step": 834 }, { "epoch": 1.48, "grad_norm": 0.27857109904289246, "learning_rate": 5.163110140309518e-06, "loss": 0.0565, "step": 835 }, { "epoch": 1.48, "grad_norm": 0.2023928463459015, "learning_rate": 5.1537925904811004e-06, "loss": 0.0718, "step": 836 }, { "epoch": 1.48, "grad_norm": 0.2805935740470886, "learning_rate": 5.144474506044968e-06, "loss": 0.0558, "step": 837 }, { "epoch": 1.48, "grad_norm": 0.22690273821353912, "learning_rate": 5.13515591939228e-06, "loss": 0.0534, "step": 838 }, { "epoch": 1.48, "grad_norm": 0.2374301701784134, "learning_rate": 5.125836862915934e-06, "loss": 0.0447, "step": 839 }, { "epoch": 1.49, "grad_norm": 0.2171458601951599, "learning_rate": 5.116517369010467e-06, "loss": 0.0429, "step": 840 }, { "epoch": 1.49, "grad_norm": 0.22108136117458344, "learning_rate": 5.107197470071933e-06, "loss": 0.0556, "step": 841 }, { "epoch": 1.49, "grad_norm": 0.3796743154525757, "learning_rate": 5.0978771984978e-06, "loss": 0.0854, "step": 842 }, { "epoch": 1.49, "grad_norm": 0.2067977786064148, "learning_rate": 5.088556586686822e-06, "loss": 0.0368, "step": 843 }, { "epoch": 1.49, "grad_norm": 0.3032953441143036, "learning_rate": 5.079235667038944e-06, "loss": 0.0785, "step": 844 }, { "epoch": 1.49, "grad_norm": 0.46437495946884155, "learning_rate": 5.069914471955179e-06, "loss": 0.0834, "step": 845 }, { "epoch": 1.5, "grad_norm": 0.25711363554000854, "learning_rate": 5.060593033837493e-06, "loss": 0.0309, "step": 846 }, { "epoch": 1.5, "grad_norm": 0.4127390384674072, "learning_rate": 5.051271385088702e-06, "loss": 0.0653, "step": 847 }, { "epoch": 1.5, "grad_norm": 0.2039380520582199, "learning_rate": 5.041949558112351e-06, "loss": 0.0469, "step": 848 }, { "epoch": 1.5, "grad_norm": 0.1909148395061493, "learning_rate": 5.032627585312608e-06, "loss": 0.0569, "step": 849 }, { "epoch": 1.5, "grad_norm": 0.3275524973869324, "learning_rate": 5.023305499094145e-06, "loss": 0.075, "step": 850 }, { "epoch": 1.51, "grad_norm": 0.2979578971862793, "learning_rate": 5.013983331862027e-06, "loss": 0.0748, "step": 851 }, { "epoch": 1.51, "grad_norm": 0.36494141817092896, "learning_rate": 5.004661116021605e-06, "loss": 0.0705, "step": 852 }, { "epoch": 1.51, "eval_loss": 0.05927066504955292, "eval_runtime": 14.7269, "eval_samples_per_second": 32.322, "eval_steps_per_second": 8.08, "step": 852 }, { "epoch": 1.51, "grad_norm": 0.2674633860588074, "learning_rate": 4.995338883978396e-06, "loss": 0.0612, "step": 853 }, { "epoch": 1.51, "grad_norm": 0.22576095163822174, "learning_rate": 4.986016668137975e-06, "loss": 0.0552, "step": 854 }, { "epoch": 1.51, "grad_norm": 0.3576613664627075, "learning_rate": 4.976694500905858e-06, "loss": 0.0536, "step": 855 }, { "epoch": 1.51, "grad_norm": 0.4389832615852356, "learning_rate": 4.967372414687394e-06, "loss": 0.0794, "step": 856 }, { "epoch": 1.52, "grad_norm": 0.40479084849357605, "learning_rate": 4.958050441887651e-06, "loss": 0.0661, "step": 857 }, { "epoch": 1.52, "grad_norm": 0.2826189398765564, "learning_rate": 4.948728614911299e-06, "loss": 0.0494, "step": 858 }, { "epoch": 1.52, "grad_norm": 0.24314090609550476, "learning_rate": 4.939406966162508e-06, "loss": 0.0708, "step": 859 }, { "epoch": 1.52, "grad_norm": 0.2984601557254791, "learning_rate": 4.930085528044823e-06, "loss": 0.092, "step": 860 }, { "epoch": 1.52, "grad_norm": 0.19549059867858887, "learning_rate": 4.9207643329610565e-06, "loss": 0.0616, "step": 861 }, { "epoch": 1.52, "grad_norm": 0.28568658232688904, "learning_rate": 4.911443413313179e-06, "loss": 0.0416, "step": 862 }, { "epoch": 1.53, "grad_norm": 0.49393466114997864, "learning_rate": 4.902122801502202e-06, "loss": 0.0493, "step": 863 }, { "epoch": 1.53, "grad_norm": 0.21126003563404083, "learning_rate": 4.892802529928067e-06, "loss": 0.0302, "step": 864 }, { "epoch": 1.53, "grad_norm": 0.21899864077568054, "learning_rate": 4.883482630989536e-06, "loss": 0.0364, "step": 865 }, { "epoch": 1.53, "grad_norm": 0.2775663733482361, "learning_rate": 4.874163137084068e-06, "loss": 0.0582, "step": 866 }, { "epoch": 1.53, "grad_norm": 0.472811222076416, "learning_rate": 4.864844080607723e-06, "loss": 0.0564, "step": 867 }, { "epoch": 1.54, "grad_norm": 0.2039356380701065, "learning_rate": 4.855525493955033e-06, "loss": 0.0583, "step": 868 }, { "epoch": 1.54, "grad_norm": 0.2668206989765167, "learning_rate": 4.8462074095188995e-06, "loss": 0.036, "step": 869 }, { "epoch": 1.54, "grad_norm": 0.29921385645866394, "learning_rate": 4.8368898596904834e-06, "loss": 0.047, "step": 870 }, { "epoch": 1.54, "grad_norm": 0.2845795154571533, "learning_rate": 4.827572876859078e-06, "loss": 0.0254, "step": 871 }, { "epoch": 1.54, "grad_norm": 0.361094206571579, "learning_rate": 4.818256493412011e-06, "loss": 0.0297, "step": 872 }, { "epoch": 1.54, "grad_norm": 0.6772926449775696, "learning_rate": 4.80894074173453e-06, "loss": 0.0751, "step": 873 }, { "epoch": 1.55, "grad_norm": 0.14277713000774384, "learning_rate": 4.799625654209675e-06, "loss": 0.0156, "step": 874 }, { "epoch": 1.55, "grad_norm": 0.32318535447120667, "learning_rate": 4.790311263218191e-06, "loss": 0.0722, "step": 875 }, { "epoch": 1.55, "grad_norm": 0.6377431750297546, "learning_rate": 4.7809976011383905e-06, "loss": 0.0614, "step": 876 }, { "epoch": 1.55, "grad_norm": 0.26989302039146423, "learning_rate": 4.771684700346059e-06, "loss": 0.071, "step": 877 }, { "epoch": 1.55, "grad_norm": 0.25494495034217834, "learning_rate": 4.762372593214335e-06, "loss": 0.046, "step": 878 }, { "epoch": 1.56, "grad_norm": 0.2811807096004486, "learning_rate": 4.753061312113592e-06, "loss": 0.0369, "step": 879 }, { "epoch": 1.56, "grad_norm": 0.1959286481142044, "learning_rate": 4.743750889411342e-06, "loss": 0.0342, "step": 880 }, { "epoch": 1.56, "grad_norm": 0.7420495748519897, "learning_rate": 4.734441357472105e-06, "loss": 0.0808, "step": 881 }, { "epoch": 1.56, "grad_norm": 0.28202396631240845, "learning_rate": 4.725132748657307e-06, "loss": 0.0277, "step": 882 }, { "epoch": 1.56, "grad_norm": 0.2835392653942108, "learning_rate": 4.715825095325168e-06, "loss": 0.0685, "step": 883 }, { "epoch": 1.56, "grad_norm": 0.2612256109714508, "learning_rate": 4.70651842983058e-06, "loss": 0.0442, "step": 884 }, { "epoch": 1.57, "grad_norm": 0.5025742053985596, "learning_rate": 4.697212784525009e-06, "loss": 0.0935, "step": 885 }, { "epoch": 1.57, "grad_norm": 0.46131956577301025, "learning_rate": 4.687908191756369e-06, "loss": 0.0718, "step": 886 }, { "epoch": 1.57, "grad_norm": 0.2350883036851883, "learning_rate": 4.678604683868918e-06, "loss": 0.0404, "step": 887 }, { "epoch": 1.57, "grad_norm": 0.4978535771369934, "learning_rate": 4.6693022932031415e-06, "loss": 0.042, "step": 888 }, { "epoch": 1.57, "grad_norm": 0.1883852779865265, "learning_rate": 4.660001052095639e-06, "loss": 0.0417, "step": 889 }, { "epoch": 1.57, "grad_norm": 0.5793114900588989, "learning_rate": 4.65070099287902e-06, "loss": 0.0787, "step": 890 }, { "epoch": 1.58, "grad_norm": 0.4304198920726776, "learning_rate": 4.641402147881782e-06, "loss": 0.0608, "step": 891 }, { "epoch": 1.58, "grad_norm": 0.22804735600948334, "learning_rate": 4.6321045494282e-06, "loss": 0.0373, "step": 892 }, { "epoch": 1.58, "grad_norm": 0.4823177754878998, "learning_rate": 4.62280822983822e-06, "loss": 0.0896, "step": 893 }, { "epoch": 1.58, "grad_norm": 1.048827052116394, "learning_rate": 4.613513221427337e-06, "loss": 0.0711, "step": 894 }, { "epoch": 1.58, "grad_norm": 0.2886340618133545, "learning_rate": 4.604219556506492e-06, "loss": 0.031, "step": 895 }, { "epoch": 1.59, "grad_norm": 0.5323426723480225, "learning_rate": 4.594927267381958e-06, "loss": 0.1087, "step": 896 }, { "epoch": 1.59, "grad_norm": 0.2986535429954529, "learning_rate": 4.58563638635522e-06, "loss": 0.0636, "step": 897 }, { "epoch": 1.59, "grad_norm": 0.24155835807323456, "learning_rate": 4.57634694572287e-06, "loss": 0.0415, "step": 898 }, { "epoch": 1.59, "grad_norm": 0.23381805419921875, "learning_rate": 4.567058977776498e-06, "loss": 0.0354, "step": 899 }, { "epoch": 1.59, "grad_norm": 0.17703460156917572, "learning_rate": 4.557772514802564e-06, "loss": 0.0408, "step": 900 }, { "epoch": 1.59, "grad_norm": 0.3805024325847626, "learning_rate": 4.548487589082311e-06, "loss": 0.0612, "step": 901 }, { "epoch": 1.6, "grad_norm": 0.341226726770401, "learning_rate": 4.539204232891622e-06, "loss": 0.0843, "step": 902 }, { "epoch": 1.6, "grad_norm": 0.38434794545173645, "learning_rate": 4.529922478500938e-06, "loss": 0.0444, "step": 903 }, { "epoch": 1.6, "grad_norm": 0.2903810739517212, "learning_rate": 4.520642358175125e-06, "loss": 0.0323, "step": 904 }, { "epoch": 1.6, "grad_norm": 0.30602797865867615, "learning_rate": 4.511363904173366e-06, "loss": 0.0592, "step": 905 }, { "epoch": 1.6, "grad_norm": 0.5753944516181946, "learning_rate": 4.502087148749061e-06, "loss": 0.0891, "step": 906 }, { "epoch": 1.6, "grad_norm": 0.29496291279792786, "learning_rate": 4.492812124149696e-06, "loss": 0.0684, "step": 907 }, { "epoch": 1.61, "grad_norm": 0.348178505897522, "learning_rate": 4.483538862616747e-06, "loss": 0.0948, "step": 908 }, { "epoch": 1.61, "grad_norm": 0.2786281108856201, "learning_rate": 4.474267396385558e-06, "loss": 0.0505, "step": 909 }, { "epoch": 1.61, "grad_norm": 0.4709155559539795, "learning_rate": 4.46499775768523e-06, "loss": 0.0698, "step": 910 }, { "epoch": 1.61, "grad_norm": 0.23245489597320557, "learning_rate": 4.4557299787385175e-06, "loss": 0.0348, "step": 911 }, { "epoch": 1.61, "grad_norm": 0.22803007066249847, "learning_rate": 4.446464091761706e-06, "loss": 0.0523, "step": 912 }, { "epoch": 1.62, "grad_norm": 0.4116624593734741, "learning_rate": 4.437200128964505e-06, "loss": 0.087, "step": 913 }, { "epoch": 1.62, "grad_norm": 0.26835620403289795, "learning_rate": 4.427938122549935e-06, "loss": 0.0751, "step": 914 }, { "epoch": 1.62, "grad_norm": 0.23349997401237488, "learning_rate": 4.418678104714214e-06, "loss": 0.0575, "step": 915 }, { "epoch": 1.62, "grad_norm": 0.37686672806739807, "learning_rate": 4.409420107646652e-06, "loss": 0.0421, "step": 916 }, { "epoch": 1.62, "grad_norm": 0.17440485954284668, "learning_rate": 4.400164163529532e-06, "loss": 0.0545, "step": 917 }, { "epoch": 1.62, "grad_norm": 0.3214772939682007, "learning_rate": 4.390910304537999e-06, "loss": 0.0605, "step": 918 }, { "epoch": 1.63, "grad_norm": 0.5040317177772522, "learning_rate": 4.381658562839954e-06, "loss": 0.0842, "step": 919 }, { "epoch": 1.63, "grad_norm": 0.21060919761657715, "learning_rate": 4.372408970595931e-06, "loss": 0.0416, "step": 920 }, { "epoch": 1.63, "grad_norm": 0.279563307762146, "learning_rate": 4.363161559958996e-06, "loss": 0.0385, "step": 921 }, { "epoch": 1.63, "grad_norm": 0.3074188530445099, "learning_rate": 4.353916363074638e-06, "loss": 0.0521, "step": 922 }, { "epoch": 1.63, "grad_norm": 0.32682177424430847, "learning_rate": 4.34467341208064e-06, "loss": 0.0619, "step": 923 }, { "epoch": 1.63, "grad_norm": 0.2511276602745056, "learning_rate": 4.335432739106983e-06, "loss": 0.0349, "step": 924 }, { "epoch": 1.64, "grad_norm": 0.4092172682285309, "learning_rate": 4.326194376275729e-06, "loss": 0.0671, "step": 925 }, { "epoch": 1.64, "grad_norm": 0.5157366394996643, "learning_rate": 4.316958355700906e-06, "loss": 0.082, "step": 926 }, { "epoch": 1.64, "grad_norm": 0.2773836553096771, "learning_rate": 4.307724709488409e-06, "loss": 0.0346, "step": 927 }, { "epoch": 1.64, "grad_norm": 0.32867851853370667, "learning_rate": 4.2984934697358695e-06, "loss": 0.0746, "step": 928 }, { "epoch": 1.64, "grad_norm": 0.29042449593544006, "learning_rate": 4.2892646685325575e-06, "loss": 0.06, "step": 929 }, { "epoch": 1.65, "grad_norm": 0.22214840352535248, "learning_rate": 4.280038337959268e-06, "loss": 0.0406, "step": 930 }, { "epoch": 1.65, "grad_norm": 0.3768647611141205, "learning_rate": 4.270814510088203e-06, "loss": 0.0633, "step": 931 }, { "epoch": 1.65, "grad_norm": 0.22813941538333893, "learning_rate": 4.261593216982874e-06, "loss": 0.0676, "step": 932 }, { "epoch": 1.65, "grad_norm": 0.24750201404094696, "learning_rate": 4.2523744906979684e-06, "loss": 0.0681, "step": 933 }, { "epoch": 1.65, "grad_norm": 0.538628101348877, "learning_rate": 4.243158363279261e-06, "loss": 0.0895, "step": 934 }, { "epoch": 1.65, "grad_norm": 0.32601863145828247, "learning_rate": 4.2339448667634885e-06, "loss": 0.0547, "step": 935 }, { "epoch": 1.66, "grad_norm": 0.33620086312294006, "learning_rate": 4.224734033178242e-06, "loss": 0.0581, "step": 936 }, { "epoch": 1.66, "grad_norm": 0.41226306557655334, "learning_rate": 4.215525894541856e-06, "loss": 0.0638, "step": 937 }, { "epoch": 1.66, "grad_norm": 0.42651426792144775, "learning_rate": 4.206320482863301e-06, "loss": 0.0627, "step": 938 }, { "epoch": 1.66, "grad_norm": 0.18828719854354858, "learning_rate": 4.1971178301420615e-06, "loss": 0.0237, "step": 939 }, { "epoch": 1.66, "grad_norm": 0.1825932115316391, "learning_rate": 4.187917968368036e-06, "loss": 0.0409, "step": 940 }, { "epoch": 1.66, "grad_norm": 0.168092742562294, "learning_rate": 4.1787209295214186e-06, "loss": 0.0442, "step": 941 }, { "epoch": 1.67, "grad_norm": 0.31606152653694153, "learning_rate": 4.1695267455725904e-06, "loss": 0.053, "step": 942 }, { "epoch": 1.67, "grad_norm": 0.22704976797103882, "learning_rate": 4.160335448482014e-06, "loss": 0.0483, "step": 943 }, { "epoch": 1.67, "grad_norm": 0.26612523198127747, "learning_rate": 4.151147070200108e-06, "loss": 0.0632, "step": 944 }, { "epoch": 1.67, "grad_norm": 0.5621999502182007, "learning_rate": 4.141961642667152e-06, "loss": 0.0786, "step": 945 }, { "epoch": 1.67, "grad_norm": 0.26850804686546326, "learning_rate": 4.132779197813165e-06, "loss": 0.0448, "step": 946 }, { "epoch": 1.68, "grad_norm": 0.6089004278182983, "learning_rate": 4.123599767557795e-06, "loss": 0.0744, "step": 947 }, { "epoch": 1.68, "grad_norm": 0.19964046776294708, "learning_rate": 4.11442338381022e-06, "loss": 0.0297, "step": 948 }, { "epoch": 1.68, "grad_norm": 0.360899955034256, "learning_rate": 4.105250078469018e-06, "loss": 0.0699, "step": 949 }, { "epoch": 1.68, "grad_norm": 0.45353251695632935, "learning_rate": 4.09607988342207e-06, "loss": 0.0543, "step": 950 }, { "epoch": 1.68, "grad_norm": 0.26752933859825134, "learning_rate": 4.086912830546448e-06, "loss": 0.0612, "step": 951 }, { "epoch": 1.68, "grad_norm": 0.57179856300354, "learning_rate": 4.0777489517082925e-06, "loss": 0.0855, "step": 952 }, { "epoch": 1.69, "grad_norm": 0.36693987250328064, "learning_rate": 4.068588278762723e-06, "loss": 0.0508, "step": 953 }, { "epoch": 1.69, "grad_norm": 0.2643117606639862, "learning_rate": 4.059430843553703e-06, "loss": 0.0332, "step": 954 }, { "epoch": 1.69, "grad_norm": 0.3400687277317047, "learning_rate": 4.0502766779139485e-06, "loss": 0.0736, "step": 955 }, { "epoch": 1.69, "grad_norm": 0.20526453852653503, "learning_rate": 4.041125813664809e-06, "loss": 0.0319, "step": 956 }, { "epoch": 1.69, "grad_norm": 0.28753894567489624, "learning_rate": 4.0319782826161516e-06, "loss": 0.0782, "step": 957 }, { "epoch": 1.69, "grad_norm": 0.22514447569847107, "learning_rate": 4.022834116566269e-06, "loss": 0.0407, "step": 958 }, { "epoch": 1.7, "grad_norm": 0.27989277243614197, "learning_rate": 4.013693347301741e-06, "loss": 0.0438, "step": 959 }, { "epoch": 1.7, "grad_norm": 0.33890578150749207, "learning_rate": 4.0045560065973535e-06, "loss": 0.0524, "step": 960 }, { "epoch": 1.7, "grad_norm": 0.32162007689476013, "learning_rate": 3.995422126215968e-06, "loss": 0.0756, "step": 961 }, { "epoch": 1.7, "grad_norm": 0.4270891845226288, "learning_rate": 3.986291737908414e-06, "loss": 0.0644, "step": 962 }, { "epoch": 1.7, "grad_norm": 0.7270297408103943, "learning_rate": 3.977164873413391e-06, "loss": 0.075, "step": 963 }, { "epoch": 1.71, "grad_norm": 0.47613292932510376, "learning_rate": 3.968041564457342e-06, "loss": 0.0467, "step": 964 }, { "epoch": 1.71, "grad_norm": 0.17602606117725372, "learning_rate": 3.958921842754351e-06, "loss": 0.0412, "step": 965 }, { "epoch": 1.71, "grad_norm": 0.23278112709522247, "learning_rate": 3.949805740006037e-06, "loss": 0.0569, "step": 966 }, { "epoch": 1.71, "grad_norm": 0.6249252557754517, "learning_rate": 3.94069328790143e-06, "loss": 0.1109, "step": 967 }, { "epoch": 1.71, "grad_norm": 0.3838892877101898, "learning_rate": 3.931584518116878e-06, "loss": 0.0634, "step": 968 }, { "epoch": 1.71, "grad_norm": 0.2674984931945801, "learning_rate": 3.922479462315929e-06, "loss": 0.0396, "step": 969 }, { "epoch": 1.72, "grad_norm": 0.7437862753868103, "learning_rate": 3.913378152149214e-06, "loss": 0.1037, "step": 970 }, { "epoch": 1.72, "grad_norm": 0.22365398705005646, "learning_rate": 3.904280619254348e-06, "loss": 0.0472, "step": 971 }, { "epoch": 1.72, "grad_norm": 0.3518374264240265, "learning_rate": 3.895186895255814e-06, "loss": 0.0822, "step": 972 }, { "epoch": 1.72, "grad_norm": 0.4127620458602905, "learning_rate": 3.886097011764856e-06, "loss": 0.0435, "step": 973 }, { "epoch": 1.72, "grad_norm": 0.20354340970516205, "learning_rate": 3.877011000379367e-06, "loss": 0.0277, "step": 974 }, { "epoch": 1.72, "grad_norm": 0.23527513444423676, "learning_rate": 3.86792889268378e-06, "loss": 0.0511, "step": 975 }, { "epoch": 1.73, "grad_norm": 0.2532998025417328, "learning_rate": 3.858850720248959e-06, "loss": 0.0565, "step": 976 }, { "epoch": 1.73, "grad_norm": 0.22200430929660797, "learning_rate": 3.8497765146320874e-06, "loss": 0.0542, "step": 977 }, { "epoch": 1.73, "grad_norm": 0.39259976148605347, "learning_rate": 3.8407063073765574e-06, "loss": 0.0631, "step": 978 }, { "epoch": 1.73, "grad_norm": 0.21652863919734955, "learning_rate": 3.831640130011867e-06, "loss": 0.0513, "step": 979 }, { "epoch": 1.73, "grad_norm": 0.21479672193527222, "learning_rate": 3.8225780140535025e-06, "loss": 0.0448, "step": 980 }, { "epoch": 1.74, "grad_norm": 0.21375016868114471, "learning_rate": 3.8135199910028314e-06, "loss": 0.0368, "step": 981 }, { "epoch": 1.74, "grad_norm": 0.37601420283317566, "learning_rate": 3.8044660923469968e-06, "loss": 0.0438, "step": 982 }, { "epoch": 1.74, "grad_norm": 0.4459959864616394, "learning_rate": 3.7954163495588e-06, "loss": 0.0918, "step": 983 }, { "epoch": 1.74, "grad_norm": 0.3222822844982147, "learning_rate": 3.786370794096603e-06, "loss": 0.0356, "step": 984 }, { "epoch": 1.74, "grad_norm": 0.26596394181251526, "learning_rate": 3.777329457404202e-06, "loss": 0.0698, "step": 985 }, { "epoch": 1.74, "grad_norm": 0.4063988924026489, "learning_rate": 3.7682923709107367e-06, "loss": 0.0759, "step": 986 }, { "epoch": 1.75, "grad_norm": 0.34790676832199097, "learning_rate": 3.759259566030571e-06, "loss": 0.0471, "step": 987 }, { "epoch": 1.75, "grad_norm": 0.2744903266429901, "learning_rate": 3.750231074163179e-06, "loss": 0.0402, "step": 988 }, { "epoch": 1.75, "grad_norm": 0.18864701688289642, "learning_rate": 3.741206926693052e-06, "loss": 0.0257, "step": 989 }, { "epoch": 1.75, "grad_norm": 0.3292123079299927, "learning_rate": 3.7321871549895715e-06, "loss": 0.0645, "step": 990 }, { "epoch": 1.75, "grad_norm": 0.2857760488986969, "learning_rate": 3.7231717904069097e-06, "loss": 0.0747, "step": 991 }, { "epoch": 1.75, "grad_norm": 0.4177703857421875, "learning_rate": 3.714160864283923e-06, "loss": 0.0628, "step": 992 }, { "epoch": 1.76, "grad_norm": 0.3333231806755066, "learning_rate": 3.705154407944034e-06, "loss": 0.0559, "step": 993 }, { "epoch": 1.76, "grad_norm": 0.45318400859832764, "learning_rate": 3.696152452695128e-06, "loss": 0.0848, "step": 994 }, { "epoch": 1.76, "eval_loss": 0.056184083223342896, "eval_runtime": 14.7268, "eval_samples_per_second": 32.322, "eval_steps_per_second": 8.081, "step": 994 }, { "epoch": 1.76, "grad_norm": 0.36879318952560425, "learning_rate": 3.68715502982945e-06, "loss": 0.0497, "step": 995 }, { "epoch": 1.76, "grad_norm": 0.2791776657104492, "learning_rate": 3.6781621706234815e-06, "loss": 0.0385, "step": 996 }, { "epoch": 1.76, "grad_norm": 0.7810603380203247, "learning_rate": 3.6691739063378462e-06, "loss": 0.0805, "step": 997 }, { "epoch": 1.77, "grad_norm": 0.4113273620605469, "learning_rate": 3.66019026821719e-06, "loss": 0.0466, "step": 998 }, { "epoch": 1.77, "grad_norm": 0.2155979722738266, "learning_rate": 3.65121128749008e-06, "loss": 0.0199, "step": 999 }, { "epoch": 1.77, "grad_norm": 0.2696681320667267, "learning_rate": 3.6422369953688973e-06, "loss": 0.0518, "step": 1000 }, { "epoch": 1.77, "grad_norm": 0.36340588331222534, "learning_rate": 3.633267423049717e-06, "loss": 0.0644, "step": 1001 }, { "epoch": 1.77, "grad_norm": 0.219515860080719, "learning_rate": 3.624302601712213e-06, "loss": 0.0393, "step": 1002 }, { "epoch": 1.77, "grad_norm": 0.2098739892244339, "learning_rate": 3.6153425625195424e-06, "loss": 0.0492, "step": 1003 }, { "epoch": 1.78, "grad_norm": 0.4837384819984436, "learning_rate": 3.606387336618237e-06, "loss": 0.1312, "step": 1004 }, { "epoch": 1.78, "grad_norm": 0.3313275873661041, "learning_rate": 3.5974369551381023e-06, "loss": 0.0638, "step": 1005 }, { "epoch": 1.78, "grad_norm": 0.2670367658138275, "learning_rate": 3.5884914491920963e-06, "loss": 0.056, "step": 1006 }, { "epoch": 1.78, "grad_norm": 0.32359379529953003, "learning_rate": 3.579550849876233e-06, "loss": 0.0736, "step": 1007 }, { "epoch": 1.78, "grad_norm": 0.23541584610939026, "learning_rate": 3.5706151882694727e-06, "loss": 0.0354, "step": 1008 }, { "epoch": 1.79, "grad_norm": 0.3773065507411957, "learning_rate": 3.561684495433605e-06, "loss": 0.0552, "step": 1009 }, { "epoch": 1.79, "grad_norm": 0.4034823179244995, "learning_rate": 3.5527588024131542e-06, "loss": 0.0614, "step": 1010 }, { "epoch": 1.79, "grad_norm": 0.2549152970314026, "learning_rate": 3.543838140235257e-06, "loss": 0.0603, "step": 1011 }, { "epoch": 1.79, "grad_norm": 0.2435804307460785, "learning_rate": 3.5349225399095693e-06, "loss": 0.0422, "step": 1012 }, { "epoch": 1.79, "grad_norm": 0.6781402230262756, "learning_rate": 3.526012032428148e-06, "loss": 0.0816, "step": 1013 }, { "epoch": 1.79, "grad_norm": 0.4394122064113617, "learning_rate": 3.5171066487653427e-06, "loss": 0.0885, "step": 1014 }, { "epoch": 1.8, "grad_norm": 0.3086816072463989, "learning_rate": 3.5082064198777e-06, "loss": 0.0616, "step": 1015 }, { "epoch": 1.8, "grad_norm": 0.3581552803516388, "learning_rate": 3.4993113767038423e-06, "loss": 0.0557, "step": 1016 }, { "epoch": 1.8, "grad_norm": 0.28149986267089844, "learning_rate": 3.4904215501643647e-06, "loss": 0.0674, "step": 1017 }, { "epoch": 1.8, "grad_norm": 0.4265075623989105, "learning_rate": 3.481536971161732e-06, "loss": 0.0604, "step": 1018 }, { "epoch": 1.8, "grad_norm": 0.28389066457748413, "learning_rate": 3.472657670580164e-06, "loss": 0.032, "step": 1019 }, { "epoch": 1.8, "grad_norm": 0.48513075709342957, "learning_rate": 3.463783679285535e-06, "loss": 0.0518, "step": 1020 }, { "epoch": 1.81, "grad_norm": 0.2950093746185303, "learning_rate": 3.4549150281252635e-06, "loss": 0.0516, "step": 1021 }, { "epoch": 1.81, "grad_norm": 0.23902879655361176, "learning_rate": 3.446051747928202e-06, "loss": 0.0493, "step": 1022 }, { "epoch": 1.81, "grad_norm": 0.4256305694580078, "learning_rate": 3.4371938695045347e-06, "loss": 0.0729, "step": 1023 }, { "epoch": 1.81, "grad_norm": 0.22448065876960754, "learning_rate": 3.428341423645668e-06, "loss": 0.0501, "step": 1024 }, { "epoch": 1.81, "grad_norm": 0.19932492077350616, "learning_rate": 3.4194944411241213e-06, "loss": 0.0257, "step": 1025 }, { "epoch": 1.82, "grad_norm": 0.5429794788360596, "learning_rate": 3.4106529526934305e-06, "loss": 0.0788, "step": 1026 }, { "epoch": 1.82, "grad_norm": 0.19093726575374603, "learning_rate": 3.4018169890880227e-06, "loss": 0.027, "step": 1027 }, { "epoch": 1.82, "grad_norm": 0.6629717946052551, "learning_rate": 3.3929865810231264e-06, "loss": 0.0627, "step": 1028 }, { "epoch": 1.82, "grad_norm": 0.48709404468536377, "learning_rate": 3.3841617591946584e-06, "loss": 0.041, "step": 1029 }, { "epoch": 1.82, "grad_norm": 0.5192286968231201, "learning_rate": 3.3753425542791106e-06, "loss": 0.0729, "step": 1030 }, { "epoch": 1.82, "grad_norm": 0.30288419127464294, "learning_rate": 3.3665289969334587e-06, "loss": 0.0409, "step": 1031 }, { "epoch": 1.83, "grad_norm": 0.3584707975387573, "learning_rate": 3.3577211177950386e-06, "loss": 0.055, "step": 1032 }, { "epoch": 1.83, "grad_norm": 0.43994244933128357, "learning_rate": 3.348918947481452e-06, "loss": 0.0682, "step": 1033 }, { "epoch": 1.83, "grad_norm": 0.6027707457542419, "learning_rate": 3.340122516590456e-06, "loss": 0.0659, "step": 1034 }, { "epoch": 1.83, "grad_norm": 0.49970120191574097, "learning_rate": 3.3313318556998523e-06, "loss": 0.0611, "step": 1035 }, { "epoch": 1.83, "grad_norm": 0.21730153262615204, "learning_rate": 3.322546995367394e-06, "loss": 0.048, "step": 1036 }, { "epoch": 1.83, "grad_norm": 0.3189135193824768, "learning_rate": 3.3137679661306578e-06, "loss": 0.0866, "step": 1037 }, { "epoch": 1.84, "grad_norm": 0.798090934753418, "learning_rate": 3.304994798506962e-06, "loss": 0.0869, "step": 1038 }, { "epoch": 1.84, "grad_norm": 0.3200780153274536, "learning_rate": 3.296227522993245e-06, "loss": 0.0613, "step": 1039 }, { "epoch": 1.84, "grad_norm": 0.3401470482349396, "learning_rate": 3.2874661700659586e-06, "loss": 0.0498, "step": 1040 }, { "epoch": 1.84, "grad_norm": 0.2279331088066101, "learning_rate": 3.2787107701809757e-06, "loss": 0.0557, "step": 1041 }, { "epoch": 1.84, "grad_norm": 0.47060710191726685, "learning_rate": 3.2699613537734693e-06, "loss": 0.0604, "step": 1042 }, { "epoch": 1.85, "grad_norm": 0.22766916453838348, "learning_rate": 3.261217951257813e-06, "loss": 0.0394, "step": 1043 }, { "epoch": 1.85, "grad_norm": 0.6578858494758606, "learning_rate": 3.252480593027478e-06, "loss": 0.0754, "step": 1044 }, { "epoch": 1.85, "grad_norm": 0.6935117840766907, "learning_rate": 3.2437493094549223e-06, "loss": 0.0778, "step": 1045 }, { "epoch": 1.85, "grad_norm": 0.2019956409931183, "learning_rate": 3.2350241308914865e-06, "loss": 0.0369, "step": 1046 }, { "epoch": 1.85, "grad_norm": 0.4244823455810547, "learning_rate": 3.2263050876672954e-06, "loss": 0.0549, "step": 1047 }, { "epoch": 1.85, "grad_norm": 0.24040935933589935, "learning_rate": 3.217592210091137e-06, "loss": 0.0394, "step": 1048 }, { "epoch": 1.86, "grad_norm": 0.8403865098953247, "learning_rate": 3.2088855284503762e-06, "loss": 0.0904, "step": 1049 }, { "epoch": 1.86, "grad_norm": 0.21365146338939667, "learning_rate": 3.200185073010831e-06, "loss": 0.0267, "step": 1050 }, { "epoch": 1.86, "grad_norm": 0.7642082571983337, "learning_rate": 3.1914908740166793e-06, "loss": 0.0951, "step": 1051 }, { "epoch": 1.86, "grad_norm": 0.6344515085220337, "learning_rate": 3.182802961690357e-06, "loss": 0.0809, "step": 1052 }, { "epoch": 1.86, "grad_norm": 0.5710294246673584, "learning_rate": 3.1741213662324365e-06, "loss": 0.0858, "step": 1053 }, { "epoch": 1.86, "grad_norm": 0.3214423656463623, "learning_rate": 3.165446117821538e-06, "loss": 0.0686, "step": 1054 }, { "epoch": 1.87, "grad_norm": 0.2226896584033966, "learning_rate": 3.1567772466142156e-06, "loss": 0.0177, "step": 1055 }, { "epoch": 1.87, "grad_norm": 0.2788223326206207, "learning_rate": 3.1481147827448554e-06, "loss": 0.0411, "step": 1056 }, { "epoch": 1.87, "grad_norm": 0.22315210103988647, "learning_rate": 3.139458756325576e-06, "loss": 0.0398, "step": 1057 }, { "epoch": 1.87, "grad_norm": 0.2029338926076889, "learning_rate": 3.1308091974461064e-06, "loss": 0.0309, "step": 1058 }, { "epoch": 1.87, "grad_norm": 0.445560097694397, "learning_rate": 3.1221661361737065e-06, "loss": 0.0659, "step": 1059 }, { "epoch": 1.88, "grad_norm": 0.7532091736793518, "learning_rate": 3.1135296025530426e-06, "loss": 0.0883, "step": 1060 }, { "epoch": 1.88, "grad_norm": 0.5283082127571106, "learning_rate": 3.1048996266060883e-06, "loss": 0.0812, "step": 1061 }, { "epoch": 1.88, "grad_norm": 0.19848276674747467, "learning_rate": 3.0962762383320288e-06, "loss": 0.0294, "step": 1062 }, { "epoch": 1.88, "grad_norm": 0.3289884030818939, "learning_rate": 3.0876594677071405e-06, "loss": 0.0691, "step": 1063 }, { "epoch": 1.88, "grad_norm": 0.475301057100296, "learning_rate": 3.0790493446847024e-06, "loss": 0.0552, "step": 1064 }, { "epoch": 1.88, "grad_norm": 0.20955568552017212, "learning_rate": 3.070445899194885e-06, "loss": 0.0336, "step": 1065 }, { "epoch": 1.89, "grad_norm": 0.280180424451828, "learning_rate": 3.061849161144641e-06, "loss": 0.0732, "step": 1066 }, { "epoch": 1.89, "grad_norm": 0.23523566126823425, "learning_rate": 3.0532591604176132e-06, "loss": 0.0615, "step": 1067 }, { "epoch": 1.89, "grad_norm": 0.4371294379234314, "learning_rate": 3.044675926874023e-06, "loss": 0.0431, "step": 1068 }, { "epoch": 1.89, "grad_norm": 0.34492623805999756, "learning_rate": 3.0360994903505654e-06, "loss": 0.0539, "step": 1069 }, { "epoch": 1.89, "grad_norm": 0.3771841824054718, "learning_rate": 3.0275298806603102e-06, "loss": 0.0707, "step": 1070 }, { "epoch": 1.89, "grad_norm": 0.32946136593818665, "learning_rate": 3.0189671275925954e-06, "loss": 0.0485, "step": 1071 }, { "epoch": 1.9, "grad_norm": 0.34278154373168945, "learning_rate": 3.010411260912922e-06, "loss": 0.072, "step": 1072 }, { "epoch": 1.9, "grad_norm": 0.2122594267129898, "learning_rate": 3.00186231036286e-06, "loss": 0.0386, "step": 1073 }, { "epoch": 1.9, "grad_norm": 0.46752992272377014, "learning_rate": 2.9933203056599277e-06, "loss": 0.1058, "step": 1074 }, { "epoch": 1.9, "grad_norm": 0.4876655638217926, "learning_rate": 2.984785276497507e-06, "loss": 0.0755, "step": 1075 }, { "epoch": 1.9, "grad_norm": 0.4009245038032532, "learning_rate": 2.9762572525447266e-06, "loss": 0.0801, "step": 1076 }, { "epoch": 1.91, "grad_norm": 0.31884658336639404, "learning_rate": 2.9677362634463647e-06, "loss": 0.0491, "step": 1077 }, { "epoch": 1.91, "grad_norm": 0.4274621903896332, "learning_rate": 2.9592223388227505e-06, "loss": 0.0647, "step": 1078 }, { "epoch": 1.91, "grad_norm": 0.2394295334815979, "learning_rate": 2.950715508269648e-06, "loss": 0.0646, "step": 1079 }, { "epoch": 1.91, "grad_norm": 0.3970087170600891, "learning_rate": 2.9422158013581658e-06, "loss": 0.0751, "step": 1080 }, { "epoch": 1.91, "grad_norm": 0.33805954456329346, "learning_rate": 2.93372324763465e-06, "loss": 0.064, "step": 1081 }, { "epoch": 1.91, "grad_norm": 0.27474018931388855, "learning_rate": 2.925237876620576e-06, "loss": 0.0646, "step": 1082 }, { "epoch": 1.92, "grad_norm": 0.3348204791545868, "learning_rate": 2.9167597178124584e-06, "loss": 0.0602, "step": 1083 }, { "epoch": 1.92, "grad_norm": 0.22010396420955658, "learning_rate": 2.908288800681737e-06, "loss": 0.0502, "step": 1084 }, { "epoch": 1.92, "grad_norm": 0.46095865964889526, "learning_rate": 2.899825154674674e-06, "loss": 0.0698, "step": 1085 }, { "epoch": 1.92, "grad_norm": 0.3379755914211273, "learning_rate": 2.8913688092122667e-06, "loss": 0.0737, "step": 1086 }, { "epoch": 1.92, "grad_norm": 0.2667890787124634, "learning_rate": 2.882919793690123e-06, "loss": 0.0331, "step": 1087 }, { "epoch": 1.92, "grad_norm": 0.28333139419555664, "learning_rate": 2.8744781374783813e-06, "loss": 0.044, "step": 1088 }, { "epoch": 1.93, "grad_norm": 0.5258477330207825, "learning_rate": 2.8660438699215896e-06, "loss": 0.0611, "step": 1089 }, { "epoch": 1.93, "grad_norm": 0.37528035044670105, "learning_rate": 2.8576170203386144e-06, "loss": 0.0593, "step": 1090 }, { "epoch": 1.93, "grad_norm": 0.28744783997535706, "learning_rate": 2.849197618022539e-06, "loss": 0.056, "step": 1091 }, { "epoch": 1.93, "grad_norm": 0.2404329478740692, "learning_rate": 2.840785692240553e-06, "loss": 0.0526, "step": 1092 }, { "epoch": 1.93, "grad_norm": 0.41300275921821594, "learning_rate": 2.832381272233864e-06, "loss": 0.1055, "step": 1093 }, { "epoch": 1.94, "grad_norm": 0.37923821806907654, "learning_rate": 2.8239843872175814e-06, "loss": 0.0611, "step": 1094 }, { "epoch": 1.94, "grad_norm": 0.5538582801818848, "learning_rate": 2.8155950663806234e-06, "loss": 0.0666, "step": 1095 }, { "epoch": 1.94, "grad_norm": 0.44852039217948914, "learning_rate": 2.8072133388856194e-06, "loss": 0.0601, "step": 1096 }, { "epoch": 1.94, "grad_norm": 0.2169903963804245, "learning_rate": 2.7988392338687925e-06, "loss": 0.054, "step": 1097 }, { "epoch": 1.94, "grad_norm": 0.3337094187736511, "learning_rate": 2.7904727804398813e-06, "loss": 0.0637, "step": 1098 }, { "epoch": 1.94, "grad_norm": 0.5035146474838257, "learning_rate": 2.782114007682016e-06, "loss": 0.0875, "step": 1099 }, { "epoch": 1.95, "grad_norm": 0.2593541145324707, "learning_rate": 2.7737629446516325e-06, "loss": 0.0629, "step": 1100 }, { "epoch": 1.95, "grad_norm": 0.28386053442955017, "learning_rate": 2.765419620378366e-06, "loss": 0.0544, "step": 1101 }, { "epoch": 1.95, "grad_norm": 0.18949826061725616, "learning_rate": 2.7570840638649487e-06, "loss": 0.0438, "step": 1102 }, { "epoch": 1.95, "grad_norm": 0.26150980591773987, "learning_rate": 2.7487563040871145e-06, "loss": 0.0513, "step": 1103 }, { "epoch": 1.95, "grad_norm": 0.3108008801937103, "learning_rate": 2.740436369993491e-06, "loss": 0.0714, "step": 1104 }, { "epoch": 1.95, "grad_norm": 0.26925361156463623, "learning_rate": 2.732124290505501e-06, "loss": 0.0334, "step": 1105 }, { "epoch": 1.96, "grad_norm": 0.2529263496398926, "learning_rate": 2.72382009451727e-06, "loss": 0.0334, "step": 1106 }, { "epoch": 1.96, "grad_norm": 0.5129886865615845, "learning_rate": 2.7155238108955153e-06, "loss": 0.0989, "step": 1107 }, { "epoch": 1.96, "grad_norm": 0.5947487354278564, "learning_rate": 2.707235468479449e-06, "loss": 0.0673, "step": 1108 }, { "epoch": 1.96, "grad_norm": 0.3080863058567047, "learning_rate": 2.698955096080677e-06, "loss": 0.0565, "step": 1109 }, { "epoch": 1.96, "grad_norm": 0.24549928307533264, "learning_rate": 2.6906827224831024e-06, "loss": 0.0551, "step": 1110 }, { "epoch": 1.97, "grad_norm": 0.24393980205059052, "learning_rate": 2.6824183764428226e-06, "loss": 0.0549, "step": 1111 }, { "epoch": 1.97, "grad_norm": 0.5429656505584717, "learning_rate": 2.6741620866880335e-06, "loss": 0.1288, "step": 1112 }, { "epoch": 1.97, "grad_norm": 0.22410114109516144, "learning_rate": 2.665913881918921e-06, "loss": 0.0699, "step": 1113 }, { "epoch": 1.97, "grad_norm": 0.6905423998832703, "learning_rate": 2.6576737908075667e-06, "loss": 0.0894, "step": 1114 }, { "epoch": 1.97, "grad_norm": 0.23629222810268402, "learning_rate": 2.6494418419978485e-06, "loss": 0.057, "step": 1115 }, { "epoch": 1.97, "grad_norm": 0.24294273555278778, "learning_rate": 2.641218064105341e-06, "loss": 0.057, "step": 1116 }, { "epoch": 1.98, "grad_norm": 0.3973044753074646, "learning_rate": 2.6330024857172193e-06, "loss": 0.0545, "step": 1117 }, { "epoch": 1.98, "grad_norm": 0.35339024662971497, "learning_rate": 2.6247951353921484e-06, "loss": 0.0443, "step": 1118 }, { "epoch": 1.98, "grad_norm": 0.33749720454216003, "learning_rate": 2.6165960416601944e-06, "loss": 0.0633, "step": 1119 }, { "epoch": 1.98, "grad_norm": 0.19248297810554504, "learning_rate": 2.608405233022724e-06, "loss": 0.0657, "step": 1120 }, { "epoch": 1.98, "grad_norm": 0.30044496059417725, "learning_rate": 2.600222737952299e-06, "loss": 0.0596, "step": 1121 }, { "epoch": 1.98, "grad_norm": 0.18807631731033325, "learning_rate": 2.5920485848925914e-06, "loss": 0.0474, "step": 1122 }, { "epoch": 1.99, "grad_norm": 0.2754729688167572, "learning_rate": 2.5838828022582595e-06, "loss": 0.0382, "step": 1123 }, { "epoch": 1.99, "grad_norm": 0.38777849078178406, "learning_rate": 2.575725418434878e-06, "loss": 0.04, "step": 1124 }, { "epoch": 1.99, "grad_norm": 0.3121282756328583, "learning_rate": 2.5675764617788233e-06, "loss": 0.0592, "step": 1125 }, { "epoch": 1.99, "grad_norm": 0.2995060086250305, "learning_rate": 2.5594359606171728e-06, "loss": 0.0607, "step": 1126 }, { "epoch": 1.99, "grad_norm": 0.22326309978961945, "learning_rate": 2.5513039432476195e-06, "loss": 0.0661, "step": 1127 }, { "epoch": 2.0, "grad_norm": 0.21206530928611755, "learning_rate": 2.543180437938352e-06, "loss": 0.0389, "step": 1128 }, { "epoch": 2.0, "grad_norm": 0.26785749197006226, "learning_rate": 2.5350654729279832e-06, "loss": 0.0393, "step": 1129 }, { "epoch": 2.0, "grad_norm": 0.27896589040756226, "learning_rate": 2.526959076425434e-06, "loss": 0.084, "step": 1130 }, { "epoch": 2.0, "grad_norm": 0.24047411978244781, "learning_rate": 2.5188612766098373e-06, "loss": 0.0489, "step": 1131 }, { "epoch": 2.0, "grad_norm": 0.18415462970733643, "learning_rate": 2.5107721016304425e-06, "loss": 0.0444, "step": 1132 }, { "epoch": 2.0, "grad_norm": 0.2761925756931305, "learning_rate": 2.5026915796065233e-06, "loss": 0.0322, "step": 1133 }, { "epoch": 2.01, "grad_norm": 0.22625890374183655, "learning_rate": 2.4946197386272665e-06, "loss": 0.0395, "step": 1134 }, { "epoch": 2.01, "grad_norm": 0.35277360677719116, "learning_rate": 2.4865566067516896e-06, "loss": 0.0749, "step": 1135 }, { "epoch": 2.01, "grad_norm": 0.4235279858112335, "learning_rate": 2.4785022120085305e-06, "loss": 0.0631, "step": 1136 }, { "epoch": 2.01, "eval_loss": 0.05521814897656441, "eval_runtime": 14.6908, "eval_samples_per_second": 32.401, "eval_steps_per_second": 8.1, "step": 1136 }, { "epoch": 2.01, "grad_norm": 0.26577240228652954, "learning_rate": 2.470456582396156e-06, "loss": 0.0529, "step": 1137 }, { "epoch": 2.01, "grad_norm": 0.28920263051986694, "learning_rate": 2.4624197458824693e-06, "loss": 0.071, "step": 1138 }, { "epoch": 2.02, "grad_norm": 0.20264554023742676, "learning_rate": 2.4543917304047988e-06, "loss": 0.0273, "step": 1139 }, { "epoch": 2.02, "grad_norm": 0.26269856095314026, "learning_rate": 2.4463725638698182e-06, "loss": 0.0583, "step": 1140 }, { "epoch": 2.02, "grad_norm": 0.3714780807495117, "learning_rate": 2.4383622741534345e-06, "loss": 0.0879, "step": 1141 }, { "epoch": 2.02, "grad_norm": 0.294224351644516, "learning_rate": 2.4303608891006986e-06, "loss": 0.0291, "step": 1142 }, { "epoch": 2.02, "grad_norm": 0.28095486760139465, "learning_rate": 2.422368436525711e-06, "loss": 0.0766, "step": 1143 }, { "epoch": 2.02, "grad_norm": 0.4065314829349518, "learning_rate": 2.4143849442115157e-06, "loss": 0.0684, "step": 1144 }, { "epoch": 2.03, "grad_norm": 0.23143674433231354, "learning_rate": 2.406410439910017e-06, "loss": 0.0265, "step": 1145 }, { "epoch": 2.03, "grad_norm": 0.24622800946235657, "learning_rate": 2.3984449513418687e-06, "loss": 0.0368, "step": 1146 }, { "epoch": 2.03, "grad_norm": 0.2713804841041565, "learning_rate": 2.3904885061963844e-06, "loss": 0.0618, "step": 1147 }, { "epoch": 2.03, "grad_norm": 0.4864218533039093, "learning_rate": 2.382541132131449e-06, "loss": 0.0523, "step": 1148 }, { "epoch": 2.03, "grad_norm": 0.3260977864265442, "learning_rate": 2.374602856773404e-06, "loss": 0.0532, "step": 1149 }, { "epoch": 2.03, "grad_norm": 0.39952418208122253, "learning_rate": 2.366673707716973e-06, "loss": 0.0494, "step": 1150 }, { "epoch": 2.04, "grad_norm": 0.2501808702945709, "learning_rate": 2.358753712525147e-06, "loss": 0.0567, "step": 1151 }, { "epoch": 2.04, "grad_norm": 0.30919313430786133, "learning_rate": 2.350842898729099e-06, "loss": 0.0345, "step": 1152 }, { "epoch": 2.04, "grad_norm": 0.2551981508731842, "learning_rate": 2.3429412938280897e-06, "loss": 0.0341, "step": 1153 }, { "epoch": 2.04, "grad_norm": 0.2361391931772232, "learning_rate": 2.335048925289362e-06, "loss": 0.0454, "step": 1154 }, { "epoch": 2.04, "grad_norm": 0.247438445687294, "learning_rate": 2.327165820548059e-06, "loss": 0.0243, "step": 1155 }, { "epoch": 2.05, "grad_norm": 0.3898051977157593, "learning_rate": 2.3192920070071145e-06, "loss": 0.0652, "step": 1156 }, { "epoch": 2.05, "grad_norm": 0.2327452450990677, "learning_rate": 2.311427512037166e-06, "loss": 0.0329, "step": 1157 }, { "epoch": 2.05, "grad_norm": 0.2756439447402954, "learning_rate": 2.303572362976462e-06, "loss": 0.0399, "step": 1158 }, { "epoch": 2.05, "grad_norm": 0.7089345455169678, "learning_rate": 2.295726587130761e-06, "loss": 0.0944, "step": 1159 }, { "epoch": 2.05, "grad_norm": 0.192116841673851, "learning_rate": 2.287890211773238e-06, "loss": 0.0316, "step": 1160 }, { "epoch": 2.05, "grad_norm": 0.35979101061820984, "learning_rate": 2.2800632641443905e-06, "loss": 0.0397, "step": 1161 }, { "epoch": 2.06, "grad_norm": 0.24005566537380219, "learning_rate": 2.2722457714519418e-06, "loss": 0.0456, "step": 1162 }, { "epoch": 2.06, "grad_norm": 0.3431236445903778, "learning_rate": 2.2644377608707525e-06, "loss": 0.0566, "step": 1163 }, { "epoch": 2.06, "grad_norm": 0.3675476312637329, "learning_rate": 2.256639259542722e-06, "loss": 0.0533, "step": 1164 }, { "epoch": 2.06, "grad_norm": 0.26388221979141235, "learning_rate": 2.2488502945766893e-06, "loss": 0.0398, "step": 1165 }, { "epoch": 2.06, "grad_norm": 0.3835342824459076, "learning_rate": 2.2410708930483467e-06, "loss": 0.0633, "step": 1166 }, { "epoch": 2.06, "grad_norm": 0.3148863613605499, "learning_rate": 2.2333010820001395e-06, "loss": 0.0432, "step": 1167 }, { "epoch": 2.07, "grad_norm": 0.435087651014328, "learning_rate": 2.2255408884411794e-06, "loss": 0.069, "step": 1168 }, { "epoch": 2.07, "grad_norm": 0.5503972172737122, "learning_rate": 2.2177903393471463e-06, "loss": 0.0668, "step": 1169 }, { "epoch": 2.07, "grad_norm": 0.37396740913391113, "learning_rate": 2.210049461660189e-06, "loss": 0.0608, "step": 1170 }, { "epoch": 2.07, "grad_norm": 0.2814396917819977, "learning_rate": 2.20231828228884e-06, "loss": 0.0634, "step": 1171 }, { "epoch": 2.07, "grad_norm": 0.2502109110355377, "learning_rate": 2.194596828107921e-06, "loss": 0.0308, "step": 1172 }, { "epoch": 2.08, "grad_norm": 0.594089150428772, "learning_rate": 2.1868851259584427e-06, "loss": 0.0489, "step": 1173 }, { "epoch": 2.08, "grad_norm": 0.5948416590690613, "learning_rate": 2.179183202647524e-06, "loss": 0.0496, "step": 1174 }, { "epoch": 2.08, "grad_norm": 0.3347499966621399, "learning_rate": 2.1714910849482777e-06, "loss": 0.0356, "step": 1175 }, { "epoch": 2.08, "grad_norm": 0.37250491976737976, "learning_rate": 2.1638087995997444e-06, "loss": 0.0737, "step": 1176 }, { "epoch": 2.08, "grad_norm": 0.30487823486328125, "learning_rate": 2.1561363733067798e-06, "loss": 0.0351, "step": 1177 }, { "epoch": 2.08, "grad_norm": 0.4062784016132355, "learning_rate": 2.1484738327399686e-06, "loss": 0.0722, "step": 1178 }, { "epoch": 2.09, "grad_norm": 0.3428146243095398, "learning_rate": 2.140821204535529e-06, "loss": 0.0408, "step": 1179 }, { "epoch": 2.09, "grad_norm": 0.6917352676391602, "learning_rate": 2.1331785152952243e-06, "loss": 0.0907, "step": 1180 }, { "epoch": 2.09, "grad_norm": 0.5540242791175842, "learning_rate": 2.125545791586269e-06, "loss": 0.0625, "step": 1181 }, { "epoch": 2.09, "grad_norm": 0.20859470963478088, "learning_rate": 2.1179230599412374e-06, "loss": 0.0332, "step": 1182 }, { "epoch": 2.09, "grad_norm": 0.3827586770057678, "learning_rate": 2.1103103468579656e-06, "loss": 0.0821, "step": 1183 }, { "epoch": 2.09, "grad_norm": 0.5123510360717773, "learning_rate": 2.102707678799463e-06, "loss": 0.0604, "step": 1184 }, { "epoch": 2.1, "grad_norm": 0.33549225330352783, "learning_rate": 2.0951150821938278e-06, "loss": 0.051, "step": 1185 }, { "epoch": 2.1, "grad_norm": 0.2780905067920685, "learning_rate": 2.087532583434139e-06, "loss": 0.0506, "step": 1186 }, { "epoch": 2.1, "grad_norm": 0.3690384030342102, "learning_rate": 2.079960208878384e-06, "loss": 0.0795, "step": 1187 }, { "epoch": 2.1, "grad_norm": 0.2806644141674042, "learning_rate": 2.0723979848493476e-06, "loss": 0.0355, "step": 1188 }, { "epoch": 2.1, "grad_norm": 0.2811732292175293, "learning_rate": 2.064845937634533e-06, "loss": 0.0416, "step": 1189 }, { "epoch": 2.11, "grad_norm": 0.21322007477283478, "learning_rate": 2.0573040934860717e-06, "loss": 0.0337, "step": 1190 }, { "epoch": 2.11, "grad_norm": 0.27920645475387573, "learning_rate": 2.0497724786206187e-06, "loss": 0.0368, "step": 1191 }, { "epoch": 2.11, "grad_norm": 0.3612511157989502, "learning_rate": 2.04225111921928e-06, "loss": 0.057, "step": 1192 }, { "epoch": 2.11, "grad_norm": 0.2806641757488251, "learning_rate": 2.034740041427506e-06, "loss": 0.0267, "step": 1193 }, { "epoch": 2.11, "grad_norm": 0.28522801399230957, "learning_rate": 2.0272392713550047e-06, "loss": 0.0609, "step": 1194 }, { "epoch": 2.11, "grad_norm": 0.2815048396587372, "learning_rate": 2.0197488350756618e-06, "loss": 0.0363, "step": 1195 }, { "epoch": 2.12, "grad_norm": 0.27890756726264954, "learning_rate": 2.0122687586274297e-06, "loss": 0.0377, "step": 1196 }, { "epoch": 2.12, "grad_norm": 0.5223634243011475, "learning_rate": 2.0047990680122585e-06, "loss": 0.0778, "step": 1197 }, { "epoch": 2.12, "grad_norm": 0.3649851381778717, "learning_rate": 1.9973397891959896e-06, "loss": 0.043, "step": 1198 }, { "epoch": 2.12, "grad_norm": 0.29412540793418884, "learning_rate": 1.9898909481082703e-06, "loss": 0.0753, "step": 1199 }, { "epoch": 2.12, "grad_norm": 0.4066426157951355, "learning_rate": 1.98245257064247e-06, "loss": 0.057, "step": 1200 }, { "epoch": 2.12, "grad_norm": 0.4344981014728546, "learning_rate": 1.9750246826555803e-06, "loss": 0.0519, "step": 1201 }, { "epoch": 2.13, "grad_norm": 0.4167618751525879, "learning_rate": 1.967607309968134e-06, "loss": 0.0712, "step": 1202 }, { "epoch": 2.13, "grad_norm": 0.3872796893119812, "learning_rate": 1.9602004783641053e-06, "loss": 0.0533, "step": 1203 }, { "epoch": 2.13, "grad_norm": 0.5844889283180237, "learning_rate": 1.9528042135908293e-06, "loss": 0.0505, "step": 1204 }, { "epoch": 2.13, "grad_norm": 0.2862803637981415, "learning_rate": 1.945418541358911e-06, "loss": 0.0369, "step": 1205 }, { "epoch": 2.13, "grad_norm": 0.3237147927284241, "learning_rate": 1.9380434873421295e-06, "loss": 0.0671, "step": 1206 }, { "epoch": 2.14, "grad_norm": 0.4440895915031433, "learning_rate": 1.9306790771773575e-06, "loss": 0.0353, "step": 1207 }, { "epoch": 2.14, "grad_norm": 0.44251200556755066, "learning_rate": 1.9233253364644653e-06, "loss": 0.0379, "step": 1208 }, { "epoch": 2.14, "grad_norm": 0.27800270915031433, "learning_rate": 1.9159822907662335e-06, "loss": 0.0237, "step": 1209 }, { "epoch": 2.14, "grad_norm": 0.7781839370727539, "learning_rate": 1.9086499656082685e-06, "loss": 0.1048, "step": 1210 }, { "epoch": 2.14, "grad_norm": 0.4274277985095978, "learning_rate": 1.9013283864789107e-06, "loss": 0.0525, "step": 1211 }, { "epoch": 2.14, "grad_norm": 0.3520543575286865, "learning_rate": 1.8940175788291409e-06, "loss": 0.0508, "step": 1212 }, { "epoch": 2.15, "grad_norm": 0.5555064678192139, "learning_rate": 1.8867175680725004e-06, "loss": 0.0667, "step": 1213 }, { "epoch": 2.15, "grad_norm": 0.5980702638626099, "learning_rate": 1.879428379584995e-06, "loss": 0.0511, "step": 1214 }, { "epoch": 2.15, "grad_norm": 0.2821686565876007, "learning_rate": 1.872150038705015e-06, "loss": 0.0446, "step": 1215 }, { "epoch": 2.15, "grad_norm": 0.5620594024658203, "learning_rate": 1.8648825707332436e-06, "loss": 0.0957, "step": 1216 }, { "epoch": 2.15, "grad_norm": 0.3683249354362488, "learning_rate": 1.857626000932562e-06, "loss": 0.0405, "step": 1217 }, { "epoch": 2.15, "grad_norm": 0.595662534236908, "learning_rate": 1.850380354527972e-06, "loss": 0.1199, "step": 1218 }, { "epoch": 2.16, "grad_norm": 0.6600557565689087, "learning_rate": 1.8431456567065004e-06, "loss": 0.0636, "step": 1219 }, { "epoch": 2.16, "grad_norm": 0.5318007469177246, "learning_rate": 1.835921932617119e-06, "loss": 0.0565, "step": 1220 }, { "epoch": 2.16, "grad_norm": 0.6550355553627014, "learning_rate": 1.828709207370656e-06, "loss": 0.0884, "step": 1221 }, { "epoch": 2.16, "grad_norm": 0.29406094551086426, "learning_rate": 1.821507506039693e-06, "loss": 0.067, "step": 1222 }, { "epoch": 2.16, "grad_norm": 0.3510172367095947, "learning_rate": 1.814316853658503e-06, "loss": 0.0542, "step": 1223 }, { "epoch": 2.17, "grad_norm": 0.4210337996482849, "learning_rate": 1.80713727522295e-06, "loss": 0.0657, "step": 1224 }, { "epoch": 2.17, "grad_norm": 0.8575435876846313, "learning_rate": 1.7999687956903955e-06, "loss": 0.0631, "step": 1225 }, { "epoch": 2.17, "grad_norm": 0.5311256051063538, "learning_rate": 1.7928114399796297e-06, "loss": 0.0714, "step": 1226 }, { "epoch": 2.17, "grad_norm": 0.25541508197784424, "learning_rate": 1.7856652329707624e-06, "loss": 0.0307, "step": 1227 }, { "epoch": 2.17, "grad_norm": 0.24216127395629883, "learning_rate": 1.778530199505159e-06, "loss": 0.0579, "step": 1228 }, { "epoch": 2.17, "grad_norm": 0.4642407298088074, "learning_rate": 1.7714063643853425e-06, "loss": 0.0584, "step": 1229 }, { "epoch": 2.18, "grad_norm": 0.24120020866394043, "learning_rate": 1.7642937523749038e-06, "loss": 0.0582, "step": 1230 }, { "epoch": 2.18, "grad_norm": 0.49397069215774536, "learning_rate": 1.7571923881984238e-06, "loss": 0.0902, "step": 1231 }, { "epoch": 2.18, "grad_norm": 0.5537900924682617, "learning_rate": 1.7501022965413822e-06, "loss": 0.1185, "step": 1232 }, { "epoch": 2.18, "grad_norm": 0.2753561735153198, "learning_rate": 1.7430235020500757e-06, "loss": 0.0336, "step": 1233 }, { "epoch": 2.18, "grad_norm": 0.4124187231063843, "learning_rate": 1.735956029331532e-06, "loss": 0.0593, "step": 1234 }, { "epoch": 2.18, "grad_norm": 0.3811124861240387, "learning_rate": 1.7288999029534177e-06, "loss": 0.0692, "step": 1235 }, { "epoch": 2.19, "grad_norm": 0.443025678396225, "learning_rate": 1.7218551474439588e-06, "loss": 0.0536, "step": 1236 }, { "epoch": 2.19, "grad_norm": 0.42648717761039734, "learning_rate": 1.7148217872918581e-06, "loss": 0.0724, "step": 1237 }, { "epoch": 2.19, "grad_norm": 0.6200788617134094, "learning_rate": 1.7077998469462009e-06, "loss": 0.0882, "step": 1238 }, { "epoch": 2.19, "grad_norm": 0.23458677530288696, "learning_rate": 1.700789350816382e-06, "loss": 0.0284, "step": 1239 }, { "epoch": 2.19, "grad_norm": 0.35964828729629517, "learning_rate": 1.6937903232720076e-06, "loss": 0.0661, "step": 1240 }, { "epoch": 2.2, "grad_norm": 0.454102486371994, "learning_rate": 1.6868027886428195e-06, "loss": 0.0541, "step": 1241 }, { "epoch": 2.2, "grad_norm": 0.26902759075164795, "learning_rate": 1.6798267712186122e-06, "loss": 0.0484, "step": 1242 }, { "epoch": 2.2, "grad_norm": 0.3189028799533844, "learning_rate": 1.672862295249138e-06, "loss": 0.0653, "step": 1243 }, { "epoch": 2.2, "grad_norm": 0.34128284454345703, "learning_rate": 1.6659093849440355e-06, "loss": 0.0545, "step": 1244 }, { "epoch": 2.2, "grad_norm": 0.5419310927391052, "learning_rate": 1.6589680644727347e-06, "loss": 0.0586, "step": 1245 }, { "epoch": 2.2, "grad_norm": 0.284701943397522, "learning_rate": 1.6520383579643768e-06, "loss": 0.058, "step": 1246 }, { "epoch": 2.21, "grad_norm": 0.3617006540298462, "learning_rate": 1.6451202895077377e-06, "loss": 0.0349, "step": 1247 }, { "epoch": 2.21, "grad_norm": 0.29853832721710205, "learning_rate": 1.638213883151129e-06, "loss": 0.0591, "step": 1248 }, { "epoch": 2.21, "grad_norm": 0.3536060154438019, "learning_rate": 1.631319162902331e-06, "loss": 0.0969, "step": 1249 }, { "epoch": 2.21, "grad_norm": 0.579561173915863, "learning_rate": 1.6244361527284953e-06, "loss": 0.048, "step": 1250 }, { "epoch": 2.21, "grad_norm": 0.28079932928085327, "learning_rate": 1.617564876556067e-06, "loss": 0.0375, "step": 1251 }, { "epoch": 2.21, "grad_norm": 0.37491875886917114, "learning_rate": 1.6107053582707082e-06, "loss": 0.0701, "step": 1252 }, { "epoch": 2.22, "grad_norm": 0.2765287458896637, "learning_rate": 1.603857621717202e-06, "loss": 0.0625, "step": 1253 }, { "epoch": 2.22, "grad_norm": 0.3059149384498596, "learning_rate": 1.5970216906993818e-06, "loss": 0.0358, "step": 1254 }, { "epoch": 2.22, "grad_norm": 0.18991141021251678, "learning_rate": 1.5901975889800387e-06, "loss": 0.0378, "step": 1255 }, { "epoch": 2.22, "grad_norm": 0.33146369457244873, "learning_rate": 1.5833853402808436e-06, "loss": 0.0877, "step": 1256 }, { "epoch": 2.22, "grad_norm": 0.25877922773361206, "learning_rate": 1.5765849682822686e-06, "loss": 0.036, "step": 1257 }, { "epoch": 2.23, "grad_norm": 0.23875732719898224, "learning_rate": 1.5697964966234946e-06, "loss": 0.0324, "step": 1258 }, { "epoch": 2.23, "grad_norm": 0.24545472860336304, "learning_rate": 1.5630199489023417e-06, "loss": 0.0624, "step": 1259 }, { "epoch": 2.23, "grad_norm": 0.24156877398490906, "learning_rate": 1.556255348675174e-06, "loss": 0.0442, "step": 1260 }, { "epoch": 2.23, "grad_norm": 0.23376604914665222, "learning_rate": 1.5495027194568269e-06, "loss": 0.0425, "step": 1261 }, { "epoch": 2.23, "grad_norm": 0.22100867331027985, "learning_rate": 1.5427620847205239e-06, "loss": 0.0548, "step": 1262 }, { "epoch": 2.23, "grad_norm": 0.23252639174461365, "learning_rate": 1.5360334678977935e-06, "loss": 0.0512, "step": 1263 }, { "epoch": 2.24, "grad_norm": 0.30251815915107727, "learning_rate": 1.5293168923783857e-06, "loss": 0.0484, "step": 1264 }, { "epoch": 2.24, "grad_norm": 0.4926506280899048, "learning_rate": 1.522612381510195e-06, "loss": 0.0828, "step": 1265 }, { "epoch": 2.24, "grad_norm": 0.35899999737739563, "learning_rate": 1.5159199585991745e-06, "loss": 0.0809, "step": 1266 }, { "epoch": 2.24, "grad_norm": 0.42756742238998413, "learning_rate": 1.5092396469092618e-06, "loss": 0.077, "step": 1267 }, { "epoch": 2.24, "grad_norm": 0.6002163290977478, "learning_rate": 1.5025714696622933e-06, "loss": 0.0639, "step": 1268 }, { "epoch": 2.25, "grad_norm": 0.22693727910518646, "learning_rate": 1.4959154500379213e-06, "loss": 0.029, "step": 1269 }, { "epoch": 2.25, "grad_norm": 0.23469500243663788, "learning_rate": 1.489271611173538e-06, "loss": 0.0732, "step": 1270 }, { "epoch": 2.25, "grad_norm": 0.2830025255680084, "learning_rate": 1.4826399761641907e-06, "loss": 0.0452, "step": 1271 }, { "epoch": 2.25, "grad_norm": 0.26557472348213196, "learning_rate": 1.4760205680625083e-06, "loss": 0.0271, "step": 1272 }, { "epoch": 2.25, "grad_norm": 0.36435171961784363, "learning_rate": 1.4694134098786183e-06, "loss": 0.0602, "step": 1273 }, { "epoch": 2.25, "grad_norm": 0.4482146203517914, "learning_rate": 1.462818524580057e-06, "loss": 0.0628, "step": 1274 }, { "epoch": 2.26, "grad_norm": 0.32687318325042725, "learning_rate": 1.4562359350917054e-06, "loss": 0.0728, "step": 1275 }, { "epoch": 2.26, "grad_norm": 0.24356895685195923, "learning_rate": 1.4496656642957024e-06, "loss": 0.024, "step": 1276 }, { "epoch": 2.26, "grad_norm": 0.2981789708137512, "learning_rate": 1.443107735031361e-06, "loss": 0.0671, "step": 1277 }, { "epoch": 2.26, "grad_norm": 0.227223202586174, "learning_rate": 1.4365621700950989e-06, "loss": 0.0299, "step": 1278 }, { "epoch": 2.26, "eval_loss": 0.05510137230157852, "eval_runtime": 14.7157, "eval_samples_per_second": 32.346, "eval_steps_per_second": 8.087, "step": 1278 }, { "epoch": 2.26, "grad_norm": 0.24957701563835144, "learning_rate": 1.4300289922403443e-06, "loss": 0.0361, "step": 1279 }, { "epoch": 2.26, "grad_norm": 0.44862160086631775, "learning_rate": 1.423508224177474e-06, "loss": 0.0553, "step": 1280 }, { "epoch": 2.27, "grad_norm": 0.4574562907218933, "learning_rate": 1.4169998885737257e-06, "loss": 0.0476, "step": 1281 }, { "epoch": 2.27, "grad_norm": 0.34160852432250977, "learning_rate": 1.4105040080531162e-06, "loss": 0.0608, "step": 1282 }, { "epoch": 2.27, "grad_norm": 0.43555450439453125, "learning_rate": 1.4040206051963678e-06, "loss": 0.0518, "step": 1283 }, { "epoch": 2.27, "grad_norm": 0.26400384306907654, "learning_rate": 1.3975497025408285e-06, "loss": 0.0622, "step": 1284 }, { "epoch": 2.27, "grad_norm": 0.5399185419082642, "learning_rate": 1.3910913225803946e-06, "loss": 0.0767, "step": 1285 }, { "epoch": 2.28, "grad_norm": 0.22166672348976135, "learning_rate": 1.384645487765432e-06, "loss": 0.0385, "step": 1286 }, { "epoch": 2.28, "grad_norm": 0.24417129158973694, "learning_rate": 1.378212220502696e-06, "loss": 0.035, "step": 1287 }, { "epoch": 2.28, "grad_norm": 0.27536171674728394, "learning_rate": 1.3717915431552532e-06, "loss": 0.0465, "step": 1288 }, { "epoch": 2.28, "grad_norm": 0.3064848780632019, "learning_rate": 1.3653834780424112e-06, "loss": 0.0385, "step": 1289 }, { "epoch": 2.28, "grad_norm": 0.48406317830085754, "learning_rate": 1.35898804743963e-06, "loss": 0.0812, "step": 1290 }, { "epoch": 2.28, "grad_norm": 0.2509705126285553, "learning_rate": 1.352605273578454e-06, "loss": 0.0544, "step": 1291 }, { "epoch": 2.29, "grad_norm": 0.49904265999794006, "learning_rate": 1.3462351786464285e-06, "loss": 0.0797, "step": 1292 }, { "epoch": 2.29, "grad_norm": 0.274941086769104, "learning_rate": 1.3398777847870236e-06, "loss": 0.0427, "step": 1293 }, { "epoch": 2.29, "grad_norm": 0.370650053024292, "learning_rate": 1.3335331140995627e-06, "loss": 0.0527, "step": 1294 }, { "epoch": 2.29, "grad_norm": 0.3093520700931549, "learning_rate": 1.3272011886391368e-06, "loss": 0.0445, "step": 1295 }, { "epoch": 2.29, "grad_norm": 0.3960571587085724, "learning_rate": 1.3208820304165365e-06, "loss": 0.0723, "step": 1296 }, { "epoch": 2.29, "grad_norm": 0.2450592964887619, "learning_rate": 1.3145756613981682e-06, "loss": 0.0256, "step": 1297 }, { "epoch": 2.3, "grad_norm": 0.3382833003997803, "learning_rate": 1.308282103505981e-06, "loss": 0.0697, "step": 1298 }, { "epoch": 2.3, "grad_norm": 0.27255892753601074, "learning_rate": 1.3020013786173946e-06, "loss": 0.0268, "step": 1299 }, { "epoch": 2.3, "grad_norm": 0.19009582698345184, "learning_rate": 1.295733508565213e-06, "loss": 0.0329, "step": 1300 }, { "epoch": 2.3, "grad_norm": 0.4398796558380127, "learning_rate": 1.289478515137561e-06, "loss": 0.0634, "step": 1301 }, { "epoch": 2.3, "grad_norm": 0.373238205909729, "learning_rate": 1.283236420077798e-06, "loss": 0.0446, "step": 1302 }, { "epoch": 2.31, "grad_norm": 0.5806798338890076, "learning_rate": 1.2770072450844462e-06, "loss": 0.068, "step": 1303 }, { "epoch": 2.31, "grad_norm": 0.4200795888900757, "learning_rate": 1.2707910118111195e-06, "loss": 0.0492, "step": 1304 }, { "epoch": 2.31, "grad_norm": 0.25758886337280273, "learning_rate": 1.2645877418664394e-06, "loss": 0.0389, "step": 1305 }, { "epoch": 2.31, "grad_norm": 0.3216612637042999, "learning_rate": 1.25839745681397e-06, "loss": 0.0435, "step": 1306 }, { "epoch": 2.31, "grad_norm": 0.6248094439506531, "learning_rate": 1.2522201781721338e-06, "loss": 0.0454, "step": 1307 }, { "epoch": 2.31, "grad_norm": 0.39597734808921814, "learning_rate": 1.246055927414141e-06, "loss": 0.0738, "step": 1308 }, { "epoch": 2.32, "grad_norm": 0.4096474051475525, "learning_rate": 1.2399047259679182e-06, "loss": 0.0315, "step": 1309 }, { "epoch": 2.32, "grad_norm": 0.4106234312057495, "learning_rate": 1.2337665952160266e-06, "loss": 0.0722, "step": 1310 }, { "epoch": 2.32, "grad_norm": 0.345395028591156, "learning_rate": 1.227641556495595e-06, "loss": 0.0602, "step": 1311 }, { "epoch": 2.32, "grad_norm": 0.23210279643535614, "learning_rate": 1.2215296310982405e-06, "loss": 0.0425, "step": 1312 }, { "epoch": 2.32, "grad_norm": 0.5029785633087158, "learning_rate": 1.2154308402699933e-06, "loss": 0.095, "step": 1313 }, { "epoch": 2.32, "grad_norm": 0.19261600077152252, "learning_rate": 1.209345205211231e-06, "loss": 0.0272, "step": 1314 }, { "epoch": 2.33, "grad_norm": 0.32310032844543457, "learning_rate": 1.2032727470765982e-06, "loss": 0.0655, "step": 1315 }, { "epoch": 2.33, "grad_norm": 0.30620020627975464, "learning_rate": 1.1972134869749325e-06, "loss": 0.0299, "step": 1316 }, { "epoch": 2.33, "grad_norm": 0.21526066958904266, "learning_rate": 1.191167445969193e-06, "loss": 0.0412, "step": 1317 }, { "epoch": 2.33, "grad_norm": 0.4346780478954315, "learning_rate": 1.1851346450763879e-06, "loss": 0.0367, "step": 1318 }, { "epoch": 2.33, "grad_norm": 0.2232600748538971, "learning_rate": 1.179115105267502e-06, "loss": 0.0337, "step": 1319 }, { "epoch": 2.34, "grad_norm": 0.22668038308620453, "learning_rate": 1.1731088474674235e-06, "loss": 0.0325, "step": 1320 }, { "epoch": 2.34, "grad_norm": 0.5103705525398254, "learning_rate": 1.1671158925548625e-06, "loss": 0.0483, "step": 1321 }, { "epoch": 2.34, "grad_norm": 0.5469282269477844, "learning_rate": 1.1611362613622962e-06, "loss": 0.0784, "step": 1322 }, { "epoch": 2.34, "grad_norm": 0.4577953815460205, "learning_rate": 1.1551699746758788e-06, "loss": 0.0461, "step": 1323 }, { "epoch": 2.34, "grad_norm": 0.45581528544425964, "learning_rate": 1.1492170532353814e-06, "loss": 0.0304, "step": 1324 }, { "epoch": 2.34, "grad_norm": 0.661368727684021, "learning_rate": 1.1432775177341165e-06, "loss": 0.1014, "step": 1325 }, { "epoch": 2.35, "grad_norm": 0.408263236284256, "learning_rate": 1.1373513888188564e-06, "loss": 0.0459, "step": 1326 }, { "epoch": 2.35, "grad_norm": 0.38288700580596924, "learning_rate": 1.1314386870897793e-06, "loss": 0.0487, "step": 1327 }, { "epoch": 2.35, "grad_norm": 0.32982730865478516, "learning_rate": 1.1255394331003855e-06, "loss": 0.0241, "step": 1328 }, { "epoch": 2.35, "grad_norm": 0.3562586009502411, "learning_rate": 1.1196536473574277e-06, "loss": 0.0353, "step": 1329 }, { "epoch": 2.35, "grad_norm": 0.371602863073349, "learning_rate": 1.11378135032084e-06, "loss": 0.0784, "step": 1330 }, { "epoch": 2.35, "grad_norm": 1.1282484531402588, "learning_rate": 1.1079225624036687e-06, "loss": 0.0912, "step": 1331 }, { "epoch": 2.36, "grad_norm": 0.1809844821691513, "learning_rate": 1.1020773039720018e-06, "loss": 0.0265, "step": 1332 }, { "epoch": 2.36, "grad_norm": 0.3818490207195282, "learning_rate": 1.0962455953448952e-06, "loss": 0.0373, "step": 1333 }, { "epoch": 2.36, "grad_norm": 0.2761133313179016, "learning_rate": 1.0904274567943041e-06, "loss": 0.0361, "step": 1334 }, { "epoch": 2.36, "grad_norm": 0.19086763262748718, "learning_rate": 1.08462290854501e-06, "loss": 0.0377, "step": 1335 }, { "epoch": 2.36, "grad_norm": 0.43346959352493286, "learning_rate": 1.0788319707745526e-06, "loss": 0.0799, "step": 1336 }, { "epoch": 2.37, "grad_norm": 0.26036912202835083, "learning_rate": 1.0730546636131622e-06, "loss": 0.0602, "step": 1337 }, { "epoch": 2.37, "grad_norm": 0.31943923234939575, "learning_rate": 1.0672910071436865e-06, "loss": 0.0419, "step": 1338 }, { "epoch": 2.37, "grad_norm": 0.26354357600212097, "learning_rate": 1.0615410214015188e-06, "loss": 0.0495, "step": 1339 }, { "epoch": 2.37, "grad_norm": 0.5490753054618835, "learning_rate": 1.0558047263745297e-06, "loss": 0.0708, "step": 1340 }, { "epoch": 2.37, "grad_norm": 0.2513953149318695, "learning_rate": 1.050082142003005e-06, "loss": 0.0343, "step": 1341 }, { "epoch": 2.37, "grad_norm": 0.41870027780532837, "learning_rate": 1.0443732881795614e-06, "loss": 0.0573, "step": 1342 }, { "epoch": 2.38, "grad_norm": 0.41868382692337036, "learning_rate": 1.0386781847490951e-06, "loss": 0.0568, "step": 1343 }, { "epoch": 2.38, "grad_norm": 0.31021198630332947, "learning_rate": 1.0329968515086969e-06, "loss": 0.0606, "step": 1344 }, { "epoch": 2.38, "grad_norm": 0.6934405565261841, "learning_rate": 1.0273293082075914e-06, "loss": 0.0764, "step": 1345 }, { "epoch": 2.38, "grad_norm": 0.45182326436042786, "learning_rate": 1.0216755745470702e-06, "loss": 0.0739, "step": 1346 }, { "epoch": 2.38, "grad_norm": 0.5464628338813782, "learning_rate": 1.0160356701804169e-06, "loss": 0.0322, "step": 1347 }, { "epoch": 2.38, "grad_norm": 0.27524539828300476, "learning_rate": 1.0104096147128456e-06, "loss": 0.0555, "step": 1348 }, { "epoch": 2.39, "grad_norm": 0.4733411371707916, "learning_rate": 1.0047974277014267e-06, "loss": 0.0644, "step": 1349 }, { "epoch": 2.39, "grad_norm": 0.2954682409763336, "learning_rate": 9.991991286550207e-07, "loss": 0.0536, "step": 1350 }, { "epoch": 2.39, "grad_norm": 0.23338478803634644, "learning_rate": 9.936147370342164e-07, "loss": 0.0315, "step": 1351 }, { "epoch": 2.39, "grad_norm": 0.7352632880210876, "learning_rate": 9.880442722512518e-07, "loss": 0.0637, "step": 1352 }, { "epoch": 2.39, "grad_norm": 0.3377096354961395, "learning_rate": 9.824877536699584e-07, "loss": 0.0407, "step": 1353 }, { "epoch": 2.4, "grad_norm": 0.23006029427051544, "learning_rate": 9.769452006056857e-07, "loss": 0.0331, "step": 1354 }, { "epoch": 2.4, "grad_norm": 0.23662589490413666, "learning_rate": 9.71416632325235e-07, "loss": 0.0452, "step": 1355 }, { "epoch": 2.4, "grad_norm": 0.30282366275787354, "learning_rate": 9.659020680468e-07, "loss": 0.0407, "step": 1356 }, { "epoch": 2.4, "grad_norm": 0.533392071723938, "learning_rate": 9.604015269398874e-07, "loss": 0.0719, "step": 1357 }, { "epoch": 2.4, "grad_norm": 0.43559810519218445, "learning_rate": 9.549150281252633e-07, "loss": 0.0753, "step": 1358 }, { "epoch": 2.4, "grad_norm": 0.27676114439964294, "learning_rate": 9.494425906748761e-07, "loss": 0.0222, "step": 1359 }, { "epoch": 2.41, "grad_norm": 0.24139194190502167, "learning_rate": 9.439842336117954e-07, "loss": 0.0418, "step": 1360 }, { "epoch": 2.41, "grad_norm": 0.3086250126361847, "learning_rate": 9.385399759101482e-07, "loss": 0.046, "step": 1361 }, { "epoch": 2.41, "grad_norm": 0.2295229136943817, "learning_rate": 9.331098364950452e-07, "loss": 0.0356, "step": 1362 }, { "epoch": 2.41, "grad_norm": 0.4115982949733734, "learning_rate": 9.276938342425246e-07, "loss": 0.0762, "step": 1363 }, { "epoch": 2.41, "grad_norm": 0.2722083628177643, "learning_rate": 9.222919879794772e-07, "loss": 0.0559, "step": 1364 }, { "epoch": 2.41, "grad_norm": 0.42166340351104736, "learning_rate": 9.169043164835867e-07, "loss": 0.0485, "step": 1365 }, { "epoch": 2.42, "grad_norm": 0.19337353110313416, "learning_rate": 9.115308384832639e-07, "loss": 0.0259, "step": 1366 }, { "epoch": 2.42, "grad_norm": 0.31872624158859253, "learning_rate": 9.061715726575826e-07, "loss": 0.0527, "step": 1367 }, { "epoch": 2.42, "grad_norm": 0.4855688810348511, "learning_rate": 9.00826537636208e-07, "loss": 0.0765, "step": 1368 }, { "epoch": 2.42, "grad_norm": 0.3524826467037201, "learning_rate": 8.954957519993401e-07, "loss": 0.0553, "step": 1369 }, { "epoch": 2.42, "grad_norm": 0.28216788172721863, "learning_rate": 8.901792342776439e-07, "loss": 0.0214, "step": 1370 }, { "epoch": 2.43, "grad_norm": 0.3330092132091522, "learning_rate": 8.848770029521875e-07, "loss": 0.0587, "step": 1371 }, { "epoch": 2.43, "grad_norm": 0.23494839668273926, "learning_rate": 8.795890764543818e-07, "loss": 0.0387, "step": 1372 }, { "epoch": 2.43, "grad_norm": 0.21391533315181732, "learning_rate": 8.74315473165902e-07, "loss": 0.0336, "step": 1373 }, { "epoch": 2.43, "grad_norm": 0.3179157078266144, "learning_rate": 8.690562114186424e-07, "loss": 0.0414, "step": 1374 }, { "epoch": 2.43, "grad_norm": 0.8653597235679626, "learning_rate": 8.638113094946382e-07, "loss": 0.0641, "step": 1375 }, { "epoch": 2.43, "grad_norm": 0.35044169425964355, "learning_rate": 8.585807856260109e-07, "loss": 0.0398, "step": 1376 }, { "epoch": 2.44, "grad_norm": 0.35807642340660095, "learning_rate": 8.533646579949034e-07, "loss": 0.0753, "step": 1377 }, { "epoch": 2.44, "grad_norm": 0.2990626096725464, "learning_rate": 8.481629447334067e-07, "loss": 0.0529, "step": 1378 }, { "epoch": 2.44, "grad_norm": 0.296956866979599, "learning_rate": 8.429756639235137e-07, "loss": 0.0293, "step": 1379 }, { "epoch": 2.44, "grad_norm": 0.29938268661499023, "learning_rate": 8.378028335970451e-07, "loss": 0.0472, "step": 1380 }, { "epoch": 2.44, "grad_norm": 0.42714786529541016, "learning_rate": 8.326444717355875e-07, "loss": 0.0727, "step": 1381 }, { "epoch": 2.44, "grad_norm": 0.41626355051994324, "learning_rate": 8.275005962704347e-07, "loss": 0.0905, "step": 1382 }, { "epoch": 2.45, "grad_norm": 0.5657181739807129, "learning_rate": 8.223712250825216e-07, "loss": 0.1088, "step": 1383 }, { "epoch": 2.45, "grad_norm": 0.2844931483268738, "learning_rate": 8.172563760023666e-07, "loss": 0.0521, "step": 1384 }, { "epoch": 2.45, "grad_norm": 0.3464542031288147, "learning_rate": 8.121560668100065e-07, "loss": 0.0417, "step": 1385 }, { "epoch": 2.45, "grad_norm": 0.45611000061035156, "learning_rate": 8.070703152349336e-07, "loss": 0.053, "step": 1386 }, { "epoch": 2.45, "grad_norm": 0.44498467445373535, "learning_rate": 8.01999138956035e-07, "loss": 0.0686, "step": 1387 }, { "epoch": 2.46, "grad_norm": 0.3243922293186188, "learning_rate": 7.969425556015326e-07, "loss": 0.0527, "step": 1388 }, { "epoch": 2.46, "grad_norm": 0.43788135051727295, "learning_rate": 7.919005827489229e-07, "loss": 0.0736, "step": 1389 }, { "epoch": 2.46, "grad_norm": 0.47388508915901184, "learning_rate": 7.868732379249122e-07, "loss": 0.039, "step": 1390 }, { "epoch": 2.46, "grad_norm": 0.27618956565856934, "learning_rate": 7.818605386053574e-07, "loss": 0.0304, "step": 1391 }, { "epoch": 2.46, "grad_norm": 0.6428837180137634, "learning_rate": 7.768625022152038e-07, "loss": 0.068, "step": 1392 }, { "epoch": 2.46, "grad_norm": 0.5347036123275757, "learning_rate": 7.718791461284303e-07, "loss": 0.062, "step": 1393 }, { "epoch": 2.47, "grad_norm": 0.16088564693927765, "learning_rate": 7.669104876679794e-07, "loss": 0.0122, "step": 1394 }, { "epoch": 2.47, "grad_norm": 0.3364282548427582, "learning_rate": 7.619565441057075e-07, "loss": 0.0509, "step": 1395 }, { "epoch": 2.47, "grad_norm": 0.8042171001434326, "learning_rate": 7.570173326623154e-07, "loss": 0.0748, "step": 1396 }, { "epoch": 2.47, "grad_norm": 0.21089555323123932, "learning_rate": 7.520928705072939e-07, "loss": 0.032, "step": 1397 }, { "epoch": 2.47, "grad_norm": 0.4967990815639496, "learning_rate": 7.471831747588654e-07, "loss": 0.0426, "step": 1398 }, { "epoch": 2.48, "grad_norm": 0.2728939354419708, "learning_rate": 7.422882624839178e-07, "loss": 0.0425, "step": 1399 }, { "epoch": 2.48, "grad_norm": 0.43230342864990234, "learning_rate": 7.37408150697953e-07, "loss": 0.0596, "step": 1400 }, { "epoch": 2.48, "grad_norm": 0.24495214223861694, "learning_rate": 7.325428563650222e-07, "loss": 0.0323, "step": 1401 }, { "epoch": 2.48, "grad_norm": 0.38741186261177063, "learning_rate": 7.276923963976667e-07, "loss": 0.0588, "step": 1402 }, { "epoch": 2.48, "grad_norm": 0.6461971402168274, "learning_rate": 7.228567876568665e-07, "loss": 0.0561, "step": 1403 }, { "epoch": 2.48, "grad_norm": 0.4546187222003937, "learning_rate": 7.180360469519715e-07, "loss": 0.0687, "step": 1404 }, { "epoch": 2.49, "grad_norm": 0.3014548420906067, "learning_rate": 7.132301910406503e-07, "loss": 0.0488, "step": 1405 }, { "epoch": 2.49, "grad_norm": 0.3899924159049988, "learning_rate": 7.084392366288295e-07, "loss": 0.0821, "step": 1406 }, { "epoch": 2.49, "grad_norm": 0.5840611457824707, "learning_rate": 7.036632003706329e-07, "loss": 0.0535, "step": 1407 }, { "epoch": 2.49, "grad_norm": 0.4866988956928253, "learning_rate": 6.989020988683315e-07, "loss": 0.0585, "step": 1408 }, { "epoch": 2.49, "grad_norm": 0.2743123471736908, "learning_rate": 6.941559486722748e-07, "loss": 0.0448, "step": 1409 }, { "epoch": 2.49, "grad_norm": 0.3817217946052551, "learning_rate": 6.894247662808456e-07, "loss": 0.0712, "step": 1410 }, { "epoch": 2.5, "grad_norm": 0.234280526638031, "learning_rate": 6.847085681403914e-07, "loss": 0.03, "step": 1411 }, { "epoch": 2.5, "grad_norm": 0.5997692942619324, "learning_rate": 6.800073706451721e-07, "loss": 0.1133, "step": 1412 }, { "epoch": 2.5, "grad_norm": 0.5731911063194275, "learning_rate": 6.753211901373064e-07, "loss": 0.0695, "step": 1413 }, { "epoch": 2.5, "grad_norm": 0.37482261657714844, "learning_rate": 6.706500429067075e-07, "loss": 0.0701, "step": 1414 }, { "epoch": 2.5, "grad_norm": 0.3871476948261261, "learning_rate": 6.659939451910341e-07, "loss": 0.071, "step": 1415 }, { "epoch": 2.51, "grad_norm": 0.46698176860809326, "learning_rate": 6.613529131756286e-07, "loss": 0.0697, "step": 1416 }, { "epoch": 2.51, "grad_norm": 0.24281850457191467, "learning_rate": 6.567269629934614e-07, "loss": 0.0412, "step": 1417 }, { "epoch": 2.51, "grad_norm": 0.4457477033138275, "learning_rate": 6.521161107250778e-07, "loss": 0.0374, "step": 1418 }, { "epoch": 2.51, "grad_norm": 0.20889078080654144, "learning_rate": 6.475203723985419e-07, "loss": 0.0292, "step": 1419 }, { "epoch": 2.51, "grad_norm": 0.46522101759910583, "learning_rate": 6.429397639893758e-07, "loss": 0.0494, "step": 1420 }, { "epoch": 2.51, "eval_loss": 0.054507263004779816, "eval_runtime": 14.7198, "eval_samples_per_second": 32.337, "eval_steps_per_second": 8.084, "step": 1420 }, { "epoch": 2.51, "grad_norm": 0.5021933317184448, "learning_rate": 6.38374301420509e-07, "loss": 0.0421, "step": 1421 }, { "epoch": 2.52, "grad_norm": 0.30449554324150085, "learning_rate": 6.33824000562221e-07, "loss": 0.0589, "step": 1422 }, { "epoch": 2.52, "grad_norm": 0.25865137577056885, "learning_rate": 6.292888772320882e-07, "loss": 0.0304, "step": 1423 }, { "epoch": 2.52, "grad_norm": 0.21872945129871368, "learning_rate": 6.247689471949292e-07, "loss": 0.0372, "step": 1424 }, { "epoch": 2.52, "grad_norm": 0.28561004996299744, "learning_rate": 6.202642261627411e-07, "loss": 0.0614, "step": 1425 }, { "epoch": 2.52, "grad_norm": 0.3216119110584259, "learning_rate": 6.157747297946609e-07, "loss": 0.0538, "step": 1426 }, { "epoch": 2.52, "grad_norm": 0.25649017095565796, "learning_rate": 6.113004736968953e-07, "loss": 0.0533, "step": 1427 }, { "epoch": 2.53, "grad_norm": 0.3557852804660797, "learning_rate": 6.068414734226774e-07, "loss": 0.0412, "step": 1428 }, { "epoch": 2.53, "grad_norm": 0.4288893938064575, "learning_rate": 6.023977444722096e-07, "loss": 0.0444, "step": 1429 }, { "epoch": 2.53, "grad_norm": 0.7468822002410889, "learning_rate": 5.979693022926025e-07, "loss": 0.0917, "step": 1430 }, { "epoch": 2.53, "grad_norm": 0.32618439197540283, "learning_rate": 5.935561622778335e-07, "loss": 0.0522, "step": 1431 }, { "epoch": 2.53, "grad_norm": 0.5251040458679199, "learning_rate": 5.891583397686862e-07, "loss": 0.0734, "step": 1432 }, { "epoch": 2.54, "grad_norm": 0.4528569281101227, "learning_rate": 5.847758500526957e-07, "loss": 0.0458, "step": 1433 }, { "epoch": 2.54, "grad_norm": 0.30320999026298523, "learning_rate": 5.804087083641002e-07, "loss": 0.0615, "step": 1434 }, { "epoch": 2.54, "grad_norm": 0.41306984424591064, "learning_rate": 5.760569298837825e-07, "loss": 0.0648, "step": 1435 }, { "epoch": 2.54, "grad_norm": 0.5027666091918945, "learning_rate": 5.717205297392247e-07, "loss": 0.0621, "step": 1436 }, { "epoch": 2.54, "grad_norm": 0.27558037638664246, "learning_rate": 5.673995230044499e-07, "loss": 0.0418, "step": 1437 }, { "epoch": 2.54, "grad_norm": 0.3972420394420624, "learning_rate": 5.630939246999694e-07, "loss": 0.0707, "step": 1438 }, { "epoch": 2.55, "grad_norm": 0.5827575325965881, "learning_rate": 5.58803749792734e-07, "loss": 0.0997, "step": 1439 }, { "epoch": 2.55, "grad_norm": 0.5362456440925598, "learning_rate": 5.54529013196079e-07, "loss": 0.093, "step": 1440 }, { "epoch": 2.55, "grad_norm": 0.6558713316917419, "learning_rate": 5.502697297696746e-07, "loss": 0.0874, "step": 1441 }, { "epoch": 2.55, "grad_norm": 0.4873327314853668, "learning_rate": 5.460259143194751e-07, "loss": 0.0943, "step": 1442 }, { "epoch": 2.55, "grad_norm": 0.3433365821838379, "learning_rate": 5.417975815976628e-07, "loss": 0.0497, "step": 1443 }, { "epoch": 2.55, "grad_norm": 0.35977843403816223, "learning_rate": 5.37584746302599e-07, "loss": 0.0603, "step": 1444 }, { "epoch": 2.56, "grad_norm": 0.33071574568748474, "learning_rate": 5.333874230787772e-07, "loss": 0.0652, "step": 1445 }, { "epoch": 2.56, "grad_norm": 0.5084921717643738, "learning_rate": 5.292056265167645e-07, "loss": 0.0508, "step": 1446 }, { "epoch": 2.56, "grad_norm": 0.3442612886428833, "learning_rate": 5.250393711531582e-07, "loss": 0.0583, "step": 1447 }, { "epoch": 2.56, "grad_norm": 0.25802913308143616, "learning_rate": 5.208886714705291e-07, "loss": 0.0463, "step": 1448 }, { "epoch": 2.56, "grad_norm": 0.24014657735824585, "learning_rate": 5.16753541897374e-07, "loss": 0.0335, "step": 1449 }, { "epoch": 2.57, "grad_norm": 0.3322998881340027, "learning_rate": 5.126339968080696e-07, "loss": 0.0696, "step": 1450 }, { "epoch": 2.57, "grad_norm": 0.7653951644897461, "learning_rate": 5.085300505228125e-07, "loss": 0.0757, "step": 1451 }, { "epoch": 2.57, "grad_norm": 0.2684485614299774, "learning_rate": 5.044417173075806e-07, "loss": 0.0561, "step": 1452 }, { "epoch": 2.57, "grad_norm": 0.2133631557226181, "learning_rate": 5.00369011374075e-07, "loss": 0.0344, "step": 1453 }, { "epoch": 2.57, "grad_norm": 0.18056386709213257, "learning_rate": 4.963119468796739e-07, "loss": 0.032, "step": 1454 }, { "epoch": 2.57, "grad_norm": 0.3504900634288788, "learning_rate": 4.922705379273862e-07, "loss": 0.0383, "step": 1455 }, { "epoch": 2.58, "grad_norm": 0.30957111716270447, "learning_rate": 4.882447985657957e-07, "loss": 0.0451, "step": 1456 }, { "epoch": 2.58, "grad_norm": 0.22308412194252014, "learning_rate": 4.842347427890199e-07, "loss": 0.0257, "step": 1457 }, { "epoch": 2.58, "grad_norm": 0.5668940544128418, "learning_rate": 4.802403845366554e-07, "loss": 0.0752, "step": 1458 }, { "epoch": 2.58, "grad_norm": 0.4146835505962372, "learning_rate": 4.7626173769373116e-07, "loss": 0.0601, "step": 1459 }, { "epoch": 2.58, "grad_norm": 0.2994007170200348, "learning_rate": 4.7229881609066387e-07, "loss": 0.0647, "step": 1460 }, { "epoch": 2.58, "grad_norm": 0.36780592799186707, "learning_rate": 4.6835163350320176e-07, "loss": 0.0396, "step": 1461 }, { "epoch": 2.59, "grad_norm": 0.46702075004577637, "learning_rate": 4.6442020365238813e-07, "loss": 0.0601, "step": 1462 }, { "epoch": 2.59, "grad_norm": 0.2682836949825287, "learning_rate": 4.605045402045022e-07, "loss": 0.0235, "step": 1463 }, { "epoch": 2.59, "grad_norm": 0.5452938079833984, "learning_rate": 4.5660465677101693e-07, "loss": 0.095, "step": 1464 }, { "epoch": 2.59, "grad_norm": 0.7652738094329834, "learning_rate": 4.5272056690855494e-07, "loss": 0.0712, "step": 1465 }, { "epoch": 2.59, "grad_norm": 0.4131929278373718, "learning_rate": 4.488522841188336e-07, "loss": 0.0701, "step": 1466 }, { "epoch": 2.6, "grad_norm": 0.6171278357505798, "learning_rate": 4.4499982184862623e-07, "loss": 0.0705, "step": 1467 }, { "epoch": 2.6, "grad_norm": 0.84730064868927, "learning_rate": 4.4116319348970924e-07, "loss": 0.088, "step": 1468 }, { "epoch": 2.6, "grad_norm": 0.3159200847148895, "learning_rate": 4.373424123788167e-07, "loss": 0.0255, "step": 1469 }, { "epoch": 2.6, "grad_norm": 0.3903068006038666, "learning_rate": 4.335374917975982e-07, "loss": 0.0756, "step": 1470 }, { "epoch": 2.6, "grad_norm": 0.4379557967185974, "learning_rate": 4.2974844497256917e-07, "loss": 0.0521, "step": 1471 }, { "epoch": 2.6, "grad_norm": 0.35017457604408264, "learning_rate": 4.2597528507506094e-07, "loss": 0.0354, "step": 1472 }, { "epoch": 2.61, "grad_norm": 0.41873300075531006, "learning_rate": 4.2221802522118493e-07, "loss": 0.0757, "step": 1473 }, { "epoch": 2.61, "grad_norm": 0.18189111351966858, "learning_rate": 4.1847667847177754e-07, "loss": 0.0215, "step": 1474 }, { "epoch": 2.61, "grad_norm": 0.2842899262905121, "learning_rate": 4.147512578323615e-07, "loss": 0.0415, "step": 1475 }, { "epoch": 2.61, "grad_norm": 0.6110818982124329, "learning_rate": 4.110417762530977e-07, "loss": 0.1139, "step": 1476 }, { "epoch": 2.61, "grad_norm": 0.411731094121933, "learning_rate": 4.0734824662873593e-07, "loss": 0.0635, "step": 1477 }, { "epoch": 2.61, "grad_norm": 0.32420873641967773, "learning_rate": 4.036706817985803e-07, "loss": 0.0608, "step": 1478 }, { "epoch": 2.62, "grad_norm": 0.6847043633460999, "learning_rate": 4.000090945464341e-07, "loss": 0.0545, "step": 1479 }, { "epoch": 2.62, "grad_norm": 0.2487315833568573, "learning_rate": 3.9636349760056427e-07, "loss": 0.0431, "step": 1480 }, { "epoch": 2.62, "grad_norm": 0.365068644285202, "learning_rate": 3.927339036336486e-07, "loss": 0.067, "step": 1481 }, { "epoch": 2.62, "grad_norm": 0.6532365679740906, "learning_rate": 3.8912032526273844e-07, "loss": 0.0663, "step": 1482 }, { "epoch": 2.62, "grad_norm": 0.3708239197731018, "learning_rate": 3.8552277504921185e-07, "loss": 0.0543, "step": 1483 }, { "epoch": 2.63, "grad_norm": 0.6864272356033325, "learning_rate": 3.8194126549873144e-07, "loss": 0.1031, "step": 1484 }, { "epoch": 2.63, "grad_norm": 0.3340238928794861, "learning_rate": 3.783758090611983e-07, "loss": 0.045, "step": 1485 }, { "epoch": 2.63, "grad_norm": 0.3528662621974945, "learning_rate": 3.7482641813071095e-07, "loss": 0.0275, "step": 1486 }, { "epoch": 2.63, "grad_norm": 0.5660943388938904, "learning_rate": 3.712931050455204e-07, "loss": 0.1154, "step": 1487 }, { "epoch": 2.63, "grad_norm": 0.2540256381034851, "learning_rate": 3.677758820879912e-07, "loss": 0.0452, "step": 1488 }, { "epoch": 2.63, "grad_norm": 0.2318311631679535, "learning_rate": 3.6427476148455486e-07, "loss": 0.0309, "step": 1489 }, { "epoch": 2.64, "grad_norm": 0.20368333160877228, "learning_rate": 3.6078975540566716e-07, "loss": 0.0296, "step": 1490 }, { "epoch": 2.64, "grad_norm": 0.6395195126533508, "learning_rate": 3.5732087596576867e-07, "loss": 0.1028, "step": 1491 }, { "epoch": 2.64, "grad_norm": 0.47410356998443604, "learning_rate": 3.538681352232404e-07, "loss": 0.0451, "step": 1492 }, { "epoch": 2.64, "grad_norm": 0.2875344753265381, "learning_rate": 3.5043154518036305e-07, "loss": 0.0323, "step": 1493 }, { "epoch": 2.64, "grad_norm": 0.27395549416542053, "learning_rate": 3.470111177832758e-07, "loss": 0.0413, "step": 1494 }, { "epoch": 2.64, "grad_norm": 0.2487516701221466, "learning_rate": 3.4360686492193263e-07, "loss": 0.0481, "step": 1495 }, { "epoch": 2.65, "grad_norm": 0.3354431986808777, "learning_rate": 3.4021879843006144e-07, "loss": 0.0785, "step": 1496 }, { "epoch": 2.65, "grad_norm": 0.265449583530426, "learning_rate": 3.3684693008512626e-07, "loss": 0.043, "step": 1497 }, { "epoch": 2.65, "grad_norm": 0.2650119662284851, "learning_rate": 3.334912716082811e-07, "loss": 0.0497, "step": 1498 }, { "epoch": 2.65, "grad_norm": 0.3735601603984833, "learning_rate": 3.30151834664334e-07, "loss": 0.0406, "step": 1499 }, { "epoch": 2.65, "grad_norm": 0.23895516991615295, "learning_rate": 3.268286308617041e-07, "loss": 0.0371, "step": 1500 }, { "epoch": 2.66, "grad_norm": 0.7068232297897339, "learning_rate": 3.235216717523787e-07, "loss": 0.1153, "step": 1501 }, { "epoch": 2.66, "grad_norm": 0.24763190746307373, "learning_rate": 3.2023096883187997e-07, "loss": 0.0331, "step": 1502 }, { "epoch": 2.66, "grad_norm": 0.2691015899181366, "learning_rate": 3.169565335392183e-07, "loss": 0.0408, "step": 1503 }, { "epoch": 2.66, "grad_norm": 0.41367876529693604, "learning_rate": 3.136983772568569e-07, "loss": 0.0224, "step": 1504 }, { "epoch": 2.66, "grad_norm": 0.2690843343734741, "learning_rate": 3.104565113106689e-07, "loss": 0.0531, "step": 1505 }, { "epoch": 2.66, "grad_norm": 0.4415769577026367, "learning_rate": 3.072309469699003e-07, "loss": 0.0322, "step": 1506 }, { "epoch": 2.67, "grad_norm": 0.2419007122516632, "learning_rate": 3.040216954471309e-07, "loss": 0.0305, "step": 1507 }, { "epoch": 2.67, "grad_norm": 0.2643618583679199, "learning_rate": 3.0082876789823244e-07, "loss": 0.0534, "step": 1508 }, { "epoch": 2.67, "grad_norm": 0.6275476813316345, "learning_rate": 2.9765217542233436e-07, "loss": 0.0512, "step": 1509 }, { "epoch": 2.67, "grad_norm": 0.34075793623924255, "learning_rate": 2.9449192906178205e-07, "loss": 0.0674, "step": 1510 }, { "epoch": 2.67, "grad_norm": 0.2609458863735199, "learning_rate": 2.913480398020974e-07, "loss": 0.0504, "step": 1511 }, { "epoch": 2.67, "grad_norm": 0.19163215160369873, "learning_rate": 2.88220518571945e-07, "loss": 0.0242, "step": 1512 }, { "epoch": 2.68, "grad_norm": 0.29438456892967224, "learning_rate": 2.8510937624308956e-07, "loss": 0.0451, "step": 1513 }, { "epoch": 2.68, "grad_norm": 0.5320554375648499, "learning_rate": 2.8201462363036114e-07, "loss": 0.0506, "step": 1514 }, { "epoch": 2.68, "grad_norm": 0.3245362639427185, "learning_rate": 2.789362714916172e-07, "loss": 0.0762, "step": 1515 }, { "epoch": 2.68, "grad_norm": 0.31104961037635803, "learning_rate": 2.758743305277012e-07, "loss": 0.0485, "step": 1516 }, { "epoch": 2.68, "grad_norm": 0.44795843958854675, "learning_rate": 2.72828811382414e-07, "loss": 0.0975, "step": 1517 }, { "epoch": 2.69, "grad_norm": 0.37962451577186584, "learning_rate": 2.6979972464246607e-07, "loss": 0.0552, "step": 1518 }, { "epoch": 2.69, "grad_norm": 0.4587259292602539, "learning_rate": 2.6678708083745064e-07, "loss": 0.0774, "step": 1519 }, { "epoch": 2.69, "grad_norm": 0.27291038632392883, "learning_rate": 2.6379089043980064e-07, "loss": 0.0464, "step": 1520 }, { "epoch": 2.69, "grad_norm": 0.3441392183303833, "learning_rate": 2.6081116386475314e-07, "loss": 0.044, "step": 1521 }, { "epoch": 2.69, "grad_norm": 0.5960782170295715, "learning_rate": 2.578479114703164e-07, "loss": 0.0648, "step": 1522 }, { "epoch": 2.69, "grad_norm": 0.3497147858142853, "learning_rate": 2.5490114355723296e-07, "loss": 0.0598, "step": 1523 }, { "epoch": 2.7, "grad_norm": 0.6180885434150696, "learning_rate": 2.519708703689377e-07, "loss": 0.0435, "step": 1524 }, { "epoch": 2.7, "grad_norm": 0.3750920295715332, "learning_rate": 2.4905710209153224e-07, "loss": 0.0647, "step": 1525 }, { "epoch": 2.7, "grad_norm": 0.3500516712665558, "learning_rate": 2.4615984885374145e-07, "loss": 0.0918, "step": 1526 }, { "epoch": 2.7, "grad_norm": 0.26493725180625916, "learning_rate": 2.432791207268831e-07, "loss": 0.0424, "step": 1527 }, { "epoch": 2.7, "grad_norm": 0.6299352049827576, "learning_rate": 2.4041492772483134e-07, "loss": 0.0577, "step": 1528 }, { "epoch": 2.7, "grad_norm": 0.41473886370658875, "learning_rate": 2.3756727980397742e-07, "loss": 0.0413, "step": 1529 }, { "epoch": 2.71, "grad_norm": 0.2447299361228943, "learning_rate": 2.3473618686320477e-07, "loss": 0.0377, "step": 1530 }, { "epoch": 2.71, "grad_norm": 0.36810627579689026, "learning_rate": 2.3192165874384552e-07, "loss": 0.0772, "step": 1531 }, { "epoch": 2.71, "grad_norm": 0.2932353913784027, "learning_rate": 2.2912370522965133e-07, "loss": 0.0807, "step": 1532 }, { "epoch": 2.71, "grad_norm": 0.3338657319545746, "learning_rate": 2.2634233604675815e-07, "loss": 0.0577, "step": 1533 }, { "epoch": 2.71, "grad_norm": 0.4013504981994629, "learning_rate": 2.2357756086364923e-07, "loss": 0.088, "step": 1534 }, { "epoch": 2.72, "grad_norm": 0.22966735064983368, "learning_rate": 2.208293892911284e-07, "loss": 0.0302, "step": 1535 }, { "epoch": 2.72, "grad_norm": 0.3082610070705414, "learning_rate": 2.180978308822812e-07, "loss": 0.0374, "step": 1536 }, { "epoch": 2.72, "grad_norm": 0.271543025970459, "learning_rate": 2.1538289513244216e-07, "loss": 0.0368, "step": 1537 }, { "epoch": 2.72, "grad_norm": 0.4861089289188385, "learning_rate": 2.126845914791631e-07, "loss": 0.0562, "step": 1538 }, { "epoch": 2.72, "grad_norm": 0.20223701000213623, "learning_rate": 2.1000292930217992e-07, "loss": 0.0273, "step": 1539 }, { "epoch": 2.72, "grad_norm": 0.3119894862174988, "learning_rate": 2.0733791792338197e-07, "loss": 0.0348, "step": 1540 }, { "epoch": 2.73, "grad_norm": 0.5191625952720642, "learning_rate": 2.0468956660677552e-07, "loss": 0.0553, "step": 1541 }, { "epoch": 2.73, "grad_norm": 0.4338687062263489, "learning_rate": 2.0205788455845478e-07, "loss": 0.0937, "step": 1542 }, { "epoch": 2.73, "grad_norm": 0.20047542452812195, "learning_rate": 1.994428809265686e-07, "loss": 0.0311, "step": 1543 }, { "epoch": 2.73, "grad_norm": 0.4586922526359558, "learning_rate": 1.9684456480128845e-07, "loss": 0.0376, "step": 1544 }, { "epoch": 2.73, "grad_norm": 0.43704912066459656, "learning_rate": 1.9426294521477874e-07, "loss": 0.0779, "step": 1545 }, { "epoch": 2.74, "grad_norm": 0.589794397354126, "learning_rate": 1.9169803114116316e-07, "loss": 0.0658, "step": 1546 }, { "epoch": 2.74, "grad_norm": 0.5062592625617981, "learning_rate": 1.8914983149649513e-07, "loss": 0.068, "step": 1547 }, { "epoch": 2.74, "grad_norm": 0.3097711503505707, "learning_rate": 1.866183551387235e-07, "loss": 0.048, "step": 1548 }, { "epoch": 2.74, "grad_norm": 0.46899113059043884, "learning_rate": 1.8410361086766803e-07, "loss": 0.0653, "step": 1549 }, { "epoch": 2.74, "grad_norm": 0.5864695310592651, "learning_rate": 1.8160560742498223e-07, "loss": 0.0655, "step": 1550 }, { "epoch": 2.74, "grad_norm": 0.24746407568454742, "learning_rate": 1.7912435349412728e-07, "loss": 0.0475, "step": 1551 }, { "epoch": 2.75, "grad_norm": 0.8418275117874146, "learning_rate": 1.7665985770033976e-07, "loss": 0.0827, "step": 1552 }, { "epoch": 2.75, "grad_norm": 0.383968710899353, "learning_rate": 1.7421212861060132e-07, "loss": 0.0456, "step": 1553 }, { "epoch": 2.75, "grad_norm": 0.27309536933898926, "learning_rate": 1.717811747336129e-07, "loss": 0.0382, "step": 1554 }, { "epoch": 2.75, "grad_norm": 0.3256092071533203, "learning_rate": 1.6936700451975818e-07, "loss": 0.0577, "step": 1555 }, { "epoch": 2.75, "grad_norm": 0.2512308359146118, "learning_rate": 1.669696263610815e-07, "loss": 0.0311, "step": 1556 }, { "epoch": 2.75, "grad_norm": 0.3675495684146881, "learning_rate": 1.6458904859125325e-07, "loss": 0.047, "step": 1557 }, { "epoch": 2.76, "grad_norm": 0.4766847789287567, "learning_rate": 1.6222527948554334e-07, "loss": 0.091, "step": 1558 }, { "epoch": 2.76, "grad_norm": 0.46767014265060425, "learning_rate": 1.5987832726079344e-07, "loss": 0.0531, "step": 1559 }, { "epoch": 2.76, "grad_norm": 0.27260783314704895, "learning_rate": 1.5754820007538473e-07, "loss": 0.0352, "step": 1560 }, { "epoch": 2.76, "grad_norm": 0.5761088728904724, "learning_rate": 1.5523490602921353e-07, "loss": 0.0816, "step": 1561 }, { "epoch": 2.76, "grad_norm": 0.5682116150856018, "learning_rate": 1.5293845316366184e-07, "loss": 0.0417, "step": 1562 }, { "epoch": 2.76, "eval_loss": 0.053992871195077896, "eval_runtime": 14.7441, "eval_samples_per_second": 32.284, "eval_steps_per_second": 8.071, "step": 1562 }, { "epoch": 2.77, "grad_norm": 0.2451232671737671, "learning_rate": 1.5065884946156684e-07, "loss": 0.0647, "step": 1563 }, { "epoch": 2.77, "grad_norm": 0.3262011706829071, "learning_rate": 1.483961028471975e-07, "loss": 0.073, "step": 1564 }, { "epoch": 2.77, "grad_norm": 0.3662097156047821, "learning_rate": 1.461502211862237e-07, "loss": 0.0922, "step": 1565 }, { "epoch": 2.77, "grad_norm": 0.32570990920066833, "learning_rate": 1.4392121228569088e-07, "loss": 0.0754, "step": 1566 }, { "epoch": 2.77, "grad_norm": 0.3529726266860962, "learning_rate": 1.4170908389399108e-07, "loss": 0.0701, "step": 1567 }, { "epoch": 2.77, "grad_norm": 0.28727978467941284, "learning_rate": 1.39513843700837e-07, "loss": 0.0662, "step": 1568 }, { "epoch": 2.78, "grad_norm": 0.3806636333465576, "learning_rate": 1.3733549933723665e-07, "loss": 0.0319, "step": 1569 }, { "epoch": 2.78, "grad_norm": 0.27902525663375854, "learning_rate": 1.3517405837546404e-07, "loss": 0.0379, "step": 1570 }, { "epoch": 2.78, "grad_norm": 0.2431209832429886, "learning_rate": 1.3302952832903392e-07, "loss": 0.0398, "step": 1571 }, { "epoch": 2.78, "grad_norm": 0.42527928948402405, "learning_rate": 1.3090191665267814e-07, "loss": 0.0632, "step": 1572 }, { "epoch": 2.78, "grad_norm": 0.2568511664867401, "learning_rate": 1.2879123074231502e-07, "loss": 0.0433, "step": 1573 }, { "epoch": 2.78, "grad_norm": 0.22257547080516815, "learning_rate": 1.2669747793502828e-07, "loss": 0.0459, "step": 1574 }, { "epoch": 2.79, "grad_norm": 0.35752183198928833, "learning_rate": 1.2462066550903818e-07, "loss": 0.0369, "step": 1575 }, { "epoch": 2.79, "grad_norm": 0.4617360532283783, "learning_rate": 1.2256080068367758e-07, "loss": 0.0651, "step": 1576 }, { "epoch": 2.79, "grad_norm": 0.3756648898124695, "learning_rate": 1.2051789061936713e-07, "loss": 0.0462, "step": 1577 }, { "epoch": 2.79, "grad_norm": 0.28262534737586975, "learning_rate": 1.184919424175901e-07, "loss": 0.0503, "step": 1578 }, { "epoch": 2.79, "grad_norm": 0.41798335313796997, "learning_rate": 1.1648296312086748e-07, "loss": 0.0834, "step": 1579 }, { "epoch": 2.8, "grad_norm": 0.20020505785942078, "learning_rate": 1.1449095971273305e-07, "loss": 0.0284, "step": 1580 }, { "epoch": 2.8, "grad_norm": 0.5879182815551758, "learning_rate": 1.1251593911771053e-07, "loss": 0.0625, "step": 1581 }, { "epoch": 2.8, "grad_norm": 0.4198421537876129, "learning_rate": 1.105579082012892e-07, "loss": 0.0537, "step": 1582 }, { "epoch": 2.8, "grad_norm": 0.416815847158432, "learning_rate": 1.0861687376989672e-07, "loss": 0.0533, "step": 1583 }, { "epoch": 2.8, "grad_norm": 0.5320203304290771, "learning_rate": 1.0669284257088186e-07, "loss": 0.0914, "step": 1584 }, { "epoch": 2.8, "grad_norm": 0.18403738737106323, "learning_rate": 1.0478582129248516e-07, "loss": 0.0341, "step": 1585 }, { "epoch": 2.81, "grad_norm": 0.30317169427871704, "learning_rate": 1.0289581656381776e-07, "loss": 0.0582, "step": 1586 }, { "epoch": 2.81, "grad_norm": 0.2497711032629013, "learning_rate": 1.0102283495483978e-07, "loss": 0.0395, "step": 1587 }, { "epoch": 2.81, "grad_norm": 0.23093391954898834, "learning_rate": 9.916688297633647e-08, "loss": 0.0376, "step": 1588 }, { "epoch": 2.81, "grad_norm": 0.4418516457080841, "learning_rate": 9.732796707989378e-08, "loss": 0.0852, "step": 1589 }, { "epoch": 2.81, "grad_norm": 0.3121102750301361, "learning_rate": 9.550609365787888e-08, "loss": 0.0658, "step": 1590 }, { "epoch": 2.81, "grad_norm": 0.7046557664871216, "learning_rate": 9.37012690434147e-08, "loss": 0.1215, "step": 1591 }, { "epoch": 2.82, "grad_norm": 0.27790728211402893, "learning_rate": 9.191349951036266e-08, "loss": 0.0507, "step": 1592 }, { "epoch": 2.82, "grad_norm": 0.39402008056640625, "learning_rate": 9.014279127329607e-08, "loss": 0.052, "step": 1593 }, { "epoch": 2.82, "grad_norm": 0.39507508277893066, "learning_rate": 8.838915048748064e-08, "loss": 0.0691, "step": 1594 }, { "epoch": 2.82, "grad_norm": 0.1711668223142624, "learning_rate": 8.66525832488535e-08, "loss": 0.0196, "step": 1595 }, { "epoch": 2.82, "grad_norm": 0.5731654763221741, "learning_rate": 8.493309559399976e-08, "loss": 0.0889, "step": 1596 }, { "epoch": 2.83, "grad_norm": 0.4735415279865265, "learning_rate": 8.32306935001348e-08, "loss": 0.0686, "step": 1597 }, { "epoch": 2.83, "grad_norm": 0.32221677899360657, "learning_rate": 8.154538288508207e-08, "loss": 0.0587, "step": 1598 }, { "epoch": 2.83, "grad_norm": 0.28600865602493286, "learning_rate": 7.987716960725144e-08, "loss": 0.0551, "step": 1599 }, { "epoch": 2.83, "grad_norm": 0.30813297629356384, "learning_rate": 7.822605946561923e-08, "loss": 0.0509, "step": 1600 }, { "epoch": 2.83, "grad_norm": 0.22793856263160706, "learning_rate": 7.659205819970927e-08, "loss": 0.0259, "step": 1601 }, { "epoch": 2.83, "grad_norm": 0.29576751589775085, "learning_rate": 7.497517148957245e-08, "loss": 0.0549, "step": 1602 }, { "epoch": 2.84, "grad_norm": 0.34206637740135193, "learning_rate": 7.337540495576668e-08, "loss": 0.0474, "step": 1603 }, { "epoch": 2.84, "grad_norm": 0.4131069779396057, "learning_rate": 7.179276415933634e-08, "loss": 0.0826, "step": 1604 }, { "epoch": 2.84, "grad_norm": 0.2831874191761017, "learning_rate": 7.022725460179459e-08, "loss": 0.0588, "step": 1605 }, { "epoch": 2.84, "grad_norm": 0.1814374476671219, "learning_rate": 6.86788817251044e-08, "loss": 0.0398, "step": 1606 }, { "epoch": 2.84, "grad_norm": 0.5143124461174011, "learning_rate": 6.71476509116581e-08, "loss": 0.0554, "step": 1607 }, { "epoch": 2.84, "grad_norm": 0.29758432507514954, "learning_rate": 6.563356748425953e-08, "loss": 0.0564, "step": 1608 }, { "epoch": 2.85, "grad_norm": 0.39581355452537537, "learning_rate": 6.413663670610526e-08, "loss": 0.0508, "step": 1609 }, { "epoch": 2.85, "grad_norm": 0.2540724575519562, "learning_rate": 6.265686378076729e-08, "loss": 0.0356, "step": 1610 }, { "epoch": 2.85, "grad_norm": 0.5823767781257629, "learning_rate": 6.119425385217314e-08, "loss": 0.0577, "step": 1611 }, { "epoch": 2.85, "grad_norm": 0.23085378110408783, "learning_rate": 5.974881200459026e-08, "loss": 0.036, "step": 1612 }, { "epoch": 2.85, "grad_norm": 0.28562742471694946, "learning_rate": 5.8320543262606055e-08, "loss": 0.0718, "step": 1613 }, { "epoch": 2.86, "grad_norm": 0.4784908890724182, "learning_rate": 5.6909452591111804e-08, "loss": 0.0411, "step": 1614 }, { "epoch": 2.86, "grad_norm": 0.455228716135025, "learning_rate": 5.5515544895284324e-08, "loss": 0.0696, "step": 1615 }, { "epoch": 2.86, "grad_norm": 0.6585636138916016, "learning_rate": 5.413882502057155e-08, "loss": 0.0601, "step": 1616 }, { "epoch": 2.86, "grad_norm": 0.2227751612663269, "learning_rate": 5.277929775267143e-08, "loss": 0.0367, "step": 1617 }, { "epoch": 2.86, "grad_norm": 0.6290801167488098, "learning_rate": 5.1436967817519724e-08, "loss": 0.1083, "step": 1618 }, { "epoch": 2.86, "grad_norm": 0.46188145875930786, "learning_rate": 5.011183988127055e-08, "loss": 0.0647, "step": 1619 }, { "epoch": 2.87, "grad_norm": 0.2553968131542206, "learning_rate": 4.880391855028088e-08, "loss": 0.0519, "step": 1620 }, { "epoch": 2.87, "grad_norm": 0.299845427274704, "learning_rate": 4.751320837109552e-08, "loss": 0.0672, "step": 1621 }, { "epoch": 2.87, "grad_norm": 0.7285204529762268, "learning_rate": 4.6239713830429355e-08, "loss": 0.0819, "step": 1622 }, { "epoch": 2.87, "grad_norm": 0.2504163980484009, "learning_rate": 4.498343935515348e-08, "loss": 0.0595, "step": 1623 }, { "epoch": 2.87, "grad_norm": 0.3244449198246002, "learning_rate": 4.374438931228076e-08, "loss": 0.045, "step": 1624 }, { "epoch": 2.87, "grad_norm": 0.6529621481895447, "learning_rate": 4.252256800894694e-08, "loss": 0.1136, "step": 1625 }, { "epoch": 2.88, "grad_norm": 0.3478367328643799, "learning_rate": 4.131797969239903e-08, "loss": 0.0235, "step": 1626 }, { "epoch": 2.88, "grad_norm": 0.3354520797729492, "learning_rate": 4.013062854998029e-08, "loss": 0.0409, "step": 1627 }, { "epoch": 2.88, "grad_norm": 0.39834845066070557, "learning_rate": 3.896051870911188e-08, "loss": 0.1093, "step": 1628 }, { "epoch": 2.88, "grad_norm": 0.2333243191242218, "learning_rate": 3.7807654237284606e-08, "loss": 0.0378, "step": 1629 }, { "epoch": 2.88, "grad_norm": 0.30735358595848083, "learning_rate": 3.6672039142039426e-08, "loss": 0.064, "step": 1630 }, { "epoch": 2.89, "grad_norm": 0.4167795181274414, "learning_rate": 3.55536773709575e-08, "loss": 0.0668, "step": 1631 }, { "epoch": 2.89, "grad_norm": 0.2189754992723465, "learning_rate": 3.4452572811643494e-08, "loss": 0.0438, "step": 1632 }, { "epoch": 2.89, "grad_norm": 0.3518463373184204, "learning_rate": 3.336872929171286e-08, "loss": 0.0563, "step": 1633 }, { "epoch": 2.89, "grad_norm": 0.4295467138290405, "learning_rate": 3.230215057878017e-08, "loss": 0.0543, "step": 1634 }, { "epoch": 2.89, "grad_norm": 0.33514928817749023, "learning_rate": 3.125284038044407e-08, "loss": 0.0378, "step": 1635 }, { "epoch": 2.89, "grad_norm": 0.2797882854938507, "learning_rate": 3.022080234427516e-08, "loss": 0.026, "step": 1636 }, { "epoch": 2.9, "grad_norm": 0.41786250472068787, "learning_rate": 2.9206040057802587e-08, "loss": 0.0786, "step": 1637 }, { "epoch": 2.9, "grad_norm": 0.2052823305130005, "learning_rate": 2.820855704850356e-08, "loss": 0.0244, "step": 1638 }, { "epoch": 2.9, "grad_norm": 0.2519056797027588, "learning_rate": 2.722835678378888e-08, "loss": 0.0282, "step": 1639 }, { "epoch": 2.9, "grad_norm": 0.715143620967865, "learning_rate": 2.6265442670991293e-08, "loss": 0.0566, "step": 1640 }, { "epoch": 2.9, "grad_norm": 0.7333081960678101, "learning_rate": 2.531981805735606e-08, "loss": 0.0822, "step": 1641 }, { "epoch": 2.9, "grad_norm": 0.2924201488494873, "learning_rate": 2.4391486230024296e-08, "loss": 0.0589, "step": 1642 }, { "epoch": 2.91, "grad_norm": 0.2207706719636917, "learning_rate": 2.3480450416027422e-08, "loss": 0.0285, "step": 1643 }, { "epoch": 2.91, "grad_norm": 0.40874069929122925, "learning_rate": 2.2586713782272173e-08, "loss": 0.0599, "step": 1644 }, { "epoch": 2.91, "grad_norm": 0.21905629336833954, "learning_rate": 2.1710279435530058e-08, "loss": 0.0234, "step": 1645 }, { "epoch": 2.91, "grad_norm": 0.5945326685905457, "learning_rate": 2.085115042242791e-08, "loss": 0.0587, "step": 1646 }, { "epoch": 2.91, "grad_norm": 0.32817357778549194, "learning_rate": 2.0009329729435144e-08, "loss": 0.0619, "step": 1647 }, { "epoch": 2.92, "grad_norm": 0.4224092662334442, "learning_rate": 1.9184820282855954e-08, "loss": 0.068, "step": 1648 }, { "epoch": 2.92, "grad_norm": 0.37711820006370544, "learning_rate": 1.8377624948817674e-08, "loss": 0.0333, "step": 1649 }, { "epoch": 2.92, "grad_norm": 0.27166062593460083, "learning_rate": 1.7587746533260786e-08, "loss": 0.0523, "step": 1650 }, { "epoch": 2.92, "grad_norm": 0.3207380473613739, "learning_rate": 1.6815187781928922e-08, "loss": 0.0687, "step": 1651 }, { "epoch": 2.92, "grad_norm": 0.5786346197128296, "learning_rate": 1.6059951380359983e-08, "loss": 0.0792, "step": 1652 }, { "epoch": 2.92, "grad_norm": 0.35675281286239624, "learning_rate": 1.5322039953878377e-08, "loss": 0.0567, "step": 1653 }, { "epoch": 2.93, "grad_norm": 0.5655875205993652, "learning_rate": 1.4601456067580566e-08, "loss": 0.0789, "step": 1654 }, { "epoch": 2.93, "grad_norm": 0.26283755898475647, "learning_rate": 1.3898202226333424e-08, "loss": 0.0381, "step": 1655 }, { "epoch": 2.93, "grad_norm": 0.3109844923019409, "learning_rate": 1.3212280874759232e-08, "loss": 0.0409, "step": 1656 }, { "epoch": 2.93, "grad_norm": 0.37702232599258423, "learning_rate": 1.2543694397230689e-08, "loss": 0.0317, "step": 1657 }, { "epoch": 2.93, "grad_norm": 0.23807105422019958, "learning_rate": 1.1892445117862028e-08, "loss": 0.0306, "step": 1658 }, { "epoch": 2.93, "grad_norm": 0.3923358917236328, "learning_rate": 1.1258535300499585e-08, "loss": 0.059, "step": 1659 }, { "epoch": 2.94, "grad_norm": 0.2805118262767792, "learning_rate": 1.0641967148716236e-08, "loss": 0.0441, "step": 1660 }, { "epoch": 2.94, "grad_norm": 0.3381422162055969, "learning_rate": 1.004274280580142e-08, "loss": 0.0761, "step": 1661 }, { "epoch": 2.94, "grad_norm": 0.2109929323196411, "learning_rate": 9.460864354755016e-09, "loss": 0.0271, "step": 1662 }, { "epoch": 2.94, "grad_norm": 0.48929575085639954, "learning_rate": 8.896333818280145e-09, "loss": 0.0663, "step": 1663 }, { "epoch": 2.94, "grad_norm": 0.49233031272888184, "learning_rate": 8.349153158774825e-09, "loss": 0.0896, "step": 1664 }, { "epoch": 2.95, "grad_norm": 0.3483905494213104, "learning_rate": 7.819324278328099e-09, "loss": 0.0476, "step": 1665 }, { "epoch": 2.95, "grad_norm": 0.23602411150932312, "learning_rate": 7.306849018708928e-09, "loss": 0.0263, "step": 1666 }, { "epoch": 2.95, "grad_norm": 0.1845821589231491, "learning_rate": 6.811729161363967e-09, "loss": 0.0161, "step": 1667 }, { "epoch": 2.95, "grad_norm": 0.4176895320415497, "learning_rate": 6.333966427409244e-09, "loss": 0.0618, "step": 1668 }, { "epoch": 2.95, "grad_norm": 0.36043837666511536, "learning_rate": 5.873562477624606e-09, "loss": 0.0407, "step": 1669 }, { "epoch": 2.95, "grad_norm": 0.3081381618976593, "learning_rate": 5.430518912448169e-09, "loss": 0.0386, "step": 1670 }, { "epoch": 2.96, "grad_norm": 0.550875186920166, "learning_rate": 5.0048372719707635e-09, "loss": 0.0822, "step": 1671 }, { "epoch": 2.96, "grad_norm": 0.7006358504295349, "learning_rate": 4.596519035929281e-09, "loss": 0.0628, "step": 1672 }, { "epoch": 2.96, "grad_norm": 0.3571973741054535, "learning_rate": 4.205565623703889e-09, "loss": 0.0437, "step": 1673 }, { "epoch": 2.96, "grad_norm": 0.3002576529979706, "learning_rate": 3.83197839431082e-09, "loss": 0.0588, "step": 1674 }, { "epoch": 2.96, "grad_norm": 0.4107789993286133, "learning_rate": 3.4757586464001513e-09, "loss": 0.0792, "step": 1675 }, { "epoch": 2.97, "grad_norm": 0.27844133973121643, "learning_rate": 3.136907618248031e-09, "loss": 0.049, "step": 1676 }, { "epoch": 2.97, "grad_norm": 0.33063793182373047, "learning_rate": 2.815426487755568e-09, "loss": 0.0484, "step": 1677 }, { "epoch": 2.97, "grad_norm": 0.28345605731010437, "learning_rate": 2.511316372442729e-09, "loss": 0.0487, "step": 1678 }, { "epoch": 2.97, "grad_norm": 0.3538432717323303, "learning_rate": 2.224578329444449e-09, "loss": 0.0779, "step": 1679 }, { "epoch": 2.97, "grad_norm": 0.38014301657676697, "learning_rate": 1.9552133555084117e-09, "loss": 0.0682, "step": 1680 }, { "epoch": 2.97, "grad_norm": 0.43046075105667114, "learning_rate": 1.7032223869911656e-09, "loss": 0.0921, "step": 1681 }, { "epoch": 2.98, "grad_norm": 0.30206775665283203, "learning_rate": 1.4686062998525708e-09, "loss": 0.0654, "step": 1682 }, { "epoch": 2.98, "grad_norm": 0.36271175742149353, "learning_rate": 1.2513659096569096e-09, "loss": 0.0682, "step": 1683 }, { "epoch": 2.98, "grad_norm": 0.4958398640155792, "learning_rate": 1.0515019715656716e-09, "loss": 0.0649, "step": 1684 }, { "epoch": 2.98, "grad_norm": 0.3885645866394043, "learning_rate": 8.690151803386615e-10, "loss": 0.065, "step": 1685 }, { "epoch": 2.98, "grad_norm": 0.25948581099510193, "learning_rate": 7.0390617032845e-10, "loss": 0.0488, "step": 1686 }, { "epoch": 2.98, "grad_norm": 0.3367011249065399, "learning_rate": 5.561755154814829e-10, "loss": 0.0263, "step": 1687 }, { "epoch": 2.99, "grad_norm": 0.3822374641895294, "learning_rate": 4.258237293325307e-10, "loss": 0.0677, "step": 1688 }, { "epoch": 2.99, "grad_norm": 0.3856826424598694, "learning_rate": 3.12851265005798e-10, "loss": 0.0586, "step": 1689 }, { "epoch": 2.99, "grad_norm": 0.4796358644962311, "learning_rate": 2.1725851521103847e-10, "loss": 0.0572, "step": 1690 }, { "epoch": 2.99, "grad_norm": 0.22888964414596558, "learning_rate": 1.3904581224410962e-10, "loss": 0.0396, "step": 1691 }, { "epoch": 2.99, "grad_norm": 0.32935765385627747, "learning_rate": 7.821342798530751e-11, "loss": 0.0447, "step": 1692 }, { "epoch": 3.0, "grad_norm": 0.6059675812721252, "learning_rate": 3.4761573897701406e-11, "loss": 0.1114, "step": 1693 }, { "epoch": 3.0, "grad_norm": 0.36333170533180237, "learning_rate": 8.6904010265787e-12, "loss": 0.0752, "step": 1694 }, { "epoch": 3.0, "grad_norm": 0.4029941260814667, "learning_rate": 0.0, "loss": 0.0896, "step": 1695 } ], "logging_steps": 1, "max_steps": 1695, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 565, "total_flos": 1.582741075221545e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }