{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0045385779122542, "eval_steps": 500, "global_step": 166, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006051437216338881, "grad_norm": 0.23612292110919952, "learning_rate": 2e-05, "loss": 1.6094, "step": 1 }, { "epoch": 0.012102874432677761, "grad_norm": 0.21936853229999542, "learning_rate": 4e-05, "loss": 1.7958, "step": 2 }, { "epoch": 0.018154311649016642, "grad_norm": 0.2684606909751892, "learning_rate": 6e-05, "loss": 2.3944, "step": 3 }, { "epoch": 0.024205748865355523, "grad_norm": 0.2940954267978668, "learning_rate": 8e-05, "loss": 2.3674, "step": 4 }, { "epoch": 0.030257186081694403, "grad_norm": 0.2846325933933258, "learning_rate": 0.0001, "loss": 2.1885, "step": 5 }, { "epoch": 0.036308623298033284, "grad_norm": 0.3098210096359253, "learning_rate": 9.999048137490364e-05, "loss": 1.8546, "step": 6 }, { "epoch": 0.04236006051437216, "grad_norm": 0.33546316623687744, "learning_rate": 9.99619291237835e-05, "loss": 2.073, "step": 7 }, { "epoch": 0.048411497730711045, "grad_norm": 0.3401203453540802, "learning_rate": 9.991435411776654e-05, "loss": 1.9222, "step": 8 }, { "epoch": 0.05446293494704992, "grad_norm": 0.48030051589012146, "learning_rate": 9.984777447079862e-05, "loss": 2.4656, "step": 9 }, { "epoch": 0.060514372163388806, "grad_norm": 0.41418835520744324, "learning_rate": 9.976221553274767e-05, "loss": 2.5115, "step": 10 }, { "epoch": 0.06656580937972768, "grad_norm": 0.5238416790962219, "learning_rate": 9.965770987975189e-05, "loss": 2.4795, "step": 11 }, { "epoch": 0.07261724659606657, "grad_norm": 0.48317399621009827, "learning_rate": 9.953429730181653e-05, "loss": 2.505, "step": 12 }, { "epoch": 0.07866868381240545, "grad_norm": 0.6496583223342896, "learning_rate": 9.939202478766407e-05, "loss": 2.6298, "step": 13 }, { "epoch": 0.08472012102874432, "grad_norm": 0.7371865510940552, "learning_rate": 9.923094650684345e-05, "loss": 2.5862, "step": 14 }, { "epoch": 0.0907715582450832, "grad_norm": 0.7955237030982971, "learning_rate": 9.905112378910532e-05, "loss": 2.6021, "step": 15 }, { "epoch": 0.09682299546142209, "grad_norm": 0.747599720954895, "learning_rate": 9.885262510105102e-05, "loss": 2.3921, "step": 16 }, { "epoch": 0.10287443267776097, "grad_norm": 0.8165279626846313, "learning_rate": 9.863552602006435e-05, "loss": 2.3954, "step": 17 }, { "epoch": 0.10892586989409984, "grad_norm": 0.8175137042999268, "learning_rate": 9.839990920553565e-05, "loss": 2.5323, "step": 18 }, { "epoch": 0.11497730711043873, "grad_norm": 0.8728551864624023, "learning_rate": 9.814586436738998e-05, "loss": 2.3727, "step": 19 }, { "epoch": 0.12102874432677761, "grad_norm": 0.9302012920379639, "learning_rate": 9.787348823193013e-05, "loss": 2.4606, "step": 20 }, { "epoch": 0.12708018154311648, "grad_norm": 0.9512920379638672, "learning_rate": 9.75828845050089e-05, "loss": 2.201, "step": 21 }, { "epoch": 0.13313161875945537, "grad_norm": 1.1705322265625, "learning_rate": 9.72741638325434e-05, "loss": 2.3474, "step": 22 }, { "epoch": 0.13918305597579425, "grad_norm": 1.1705471277236938, "learning_rate": 9.694744375838725e-05, "loss": 2.3085, "step": 23 }, { "epoch": 0.14523449319213314, "grad_norm": 1.2117750644683838, "learning_rate": 9.660284867957636e-05, "loss": 2.0941, "step": 24 }, { "epoch": 0.15128593040847202, "grad_norm": 1.2110291719436646, "learning_rate": 9.624050979896533e-05, "loss": 2.0143, "step": 25 }, { "epoch": 0.1573373676248109, "grad_norm": 1.1926169395446777, "learning_rate": 9.586056507527266e-05, "loss": 1.9534, "step": 26 }, { "epoch": 0.16338880484114976, "grad_norm": 1.1735974550247192, "learning_rate": 9.546315917055361e-05, "loss": 1.9372, "step": 27 }, { "epoch": 0.16944024205748864, "grad_norm": 1.21225905418396, "learning_rate": 9.504844339512095e-05, "loss": 1.8305, "step": 28 }, { "epoch": 0.17549167927382753, "grad_norm": 1.3545117378234863, "learning_rate": 9.461657564993418e-05, "loss": 1.8428, "step": 29 }, { "epoch": 0.1815431164901664, "grad_norm": 1.6355072259902954, "learning_rate": 9.416772036647958e-05, "loss": 1.8407, "step": 30 }, { "epoch": 0.1875945537065053, "grad_norm": 2.1837329864501953, "learning_rate": 9.370204844416382e-05, "loss": 2.284, "step": 31 }, { "epoch": 0.19364599092284418, "grad_norm": 1.917792558670044, "learning_rate": 9.321973718524472e-05, "loss": 2.086, "step": 32 }, { "epoch": 0.19969742813918306, "grad_norm": 2.3043246269226074, "learning_rate": 9.272097022732443e-05, "loss": 2.1633, "step": 33 }, { "epoch": 0.20574886535552195, "grad_norm": 2.235715627670288, "learning_rate": 9.220593747343027e-05, "loss": 2.3737, "step": 34 }, { "epoch": 0.2118003025718608, "grad_norm": 2.3425846099853516, "learning_rate": 9.16748350197101e-05, "loss": 2.2327, "step": 35 }, { "epoch": 0.2178517397881997, "grad_norm": 2.3417069911956787, "learning_rate": 9.112786508076972e-05, "loss": 2.0814, "step": 36 }, { "epoch": 0.22390317700453857, "grad_norm": 3.114316701889038, "learning_rate": 9.056523591268064e-05, "loss": 2.2132, "step": 37 }, { "epoch": 0.22995461422087746, "grad_norm": 3.6260831356048584, "learning_rate": 8.998716173368762e-05, "loss": 2.1062, "step": 38 }, { "epoch": 0.23600605143721634, "grad_norm": 4.580545902252197, "learning_rate": 8.939386264264616e-05, "loss": 2.199, "step": 39 }, { "epoch": 0.24205748865355523, "grad_norm": 5.529226779937744, "learning_rate": 8.8785564535221e-05, "loss": 2.0428, "step": 40 }, { "epoch": 0.2481089258698941, "grad_norm": 8.231687545776367, "learning_rate": 8.816249901787737e-05, "loss": 2.1102, "step": 41 }, { "epoch": 0.25416036308623297, "grad_norm": 1.3886011838912964, "learning_rate": 8.752490331969807e-05, "loss": 1.3385, "step": 42 }, { "epoch": 0.26021180030257185, "grad_norm": 1.6931300163269043, "learning_rate": 8.687302020205968e-05, "loss": 1.8471, "step": 43 }, { "epoch": 0.26626323751891073, "grad_norm": 1.7250523567199707, "learning_rate": 8.620709786620231e-05, "loss": 1.9711, "step": 44 }, { "epoch": 0.2723146747352496, "grad_norm": 1.4766898155212402, "learning_rate": 8.552738985872833e-05, "loss": 1.9272, "step": 45 }, { "epoch": 0.2783661119515885, "grad_norm": 1.3497235774993896, "learning_rate": 8.483415497506567e-05, "loss": 1.7261, "step": 46 }, { "epoch": 0.2844175491679274, "grad_norm": 1.4176321029663086, "learning_rate": 8.412765716093272e-05, "loss": 1.5276, "step": 47 }, { "epoch": 0.29046898638426627, "grad_norm": 1.4237560033798218, "learning_rate": 8.340816541184249e-05, "loss": 1.3707, "step": 48 }, { "epoch": 0.29652042360060515, "grad_norm": 1.1526674032211304, "learning_rate": 8.267595367068375e-05, "loss": 1.1937, "step": 49 }, { "epoch": 0.30257186081694404, "grad_norm": 1.3353413343429565, "learning_rate": 8.193130072341873e-05, "loss": 1.3607, "step": 50 }, { "epoch": 0.3086232980332829, "grad_norm": 1.4287394285202026, "learning_rate": 8.117449009293668e-05, "loss": 1.5557, "step": 51 }, { "epoch": 0.3146747352496218, "grad_norm": 1.5747747421264648, "learning_rate": 8.040580993110404e-05, "loss": 1.6074, "step": 52 }, { "epoch": 0.3207261724659607, "grad_norm": 1.8766899108886719, "learning_rate": 7.962555290905197e-05, "loss": 1.5941, "step": 53 }, { "epoch": 0.3267776096822995, "grad_norm": 1.840211033821106, "learning_rate": 7.883401610574336e-05, "loss": 1.3652, "step": 54 }, { "epoch": 0.3328290468986384, "grad_norm": 2.002328872680664, "learning_rate": 7.803150089486144e-05, "loss": 1.2883, "step": 55 }, { "epoch": 0.3388804841149773, "grad_norm": 2.1058497428894043, "learning_rate": 7.721831283006322e-05, "loss": 1.079, "step": 56 }, { "epoch": 0.34493192133131617, "grad_norm": 1.7998846769332886, "learning_rate": 7.639476152864162e-05, "loss": 0.9134, "step": 57 }, { "epoch": 0.35098335854765506, "grad_norm": 1.864816427230835, "learning_rate": 7.556116055364008e-05, "loss": 0.82, "step": 58 }, { "epoch": 0.35703479576399394, "grad_norm": 1.9277801513671875, "learning_rate": 7.4717827294465e-05, "loss": 0.7148, "step": 59 }, { "epoch": 0.3630862329803328, "grad_norm": 2.2417094707489014, "learning_rate": 7.386508284604141e-05, "loss": 0.7864, "step": 60 }, { "epoch": 0.3691376701966717, "grad_norm": 2.0547397136688232, "learning_rate": 7.300325188655761e-05, "loss": 0.7012, "step": 61 }, { "epoch": 0.3751891074130106, "grad_norm": 2.6025710105895996, "learning_rate": 7.21326625538456e-05, "loss": 0.6169, "step": 62 }, { "epoch": 0.3812405446293495, "grad_norm": 2.259634017944336, "learning_rate": 7.125364632044422e-05, "loss": 0.6292, "step": 63 }, { "epoch": 0.38729198184568836, "grad_norm": 1.6727221012115479, "learning_rate": 7.036653786739263e-05, "loss": 0.5009, "step": 64 }, { "epoch": 0.39334341906202724, "grad_norm": 2.62076997756958, "learning_rate": 6.947167495680224e-05, "loss": 0.3475, "step": 65 }, { "epoch": 0.39939485627836613, "grad_norm": 2.0237042903900146, "learning_rate": 6.856939830325534e-05, "loss": 0.3402, "step": 66 }, { "epoch": 0.405446293494705, "grad_norm": 2.6460840702056885, "learning_rate": 6.76600514440799e-05, "loss": 0.4223, "step": 67 }, { "epoch": 0.4114977307110439, "grad_norm": 1.5343953371047974, "learning_rate": 6.674398060854931e-05, "loss": 0.3363, "step": 68 }, { "epoch": 0.4175491679273828, "grad_norm": 2.540858268737793, "learning_rate": 6.582153458605738e-05, "loss": 0.521, "step": 69 }, { "epoch": 0.4236006051437216, "grad_norm": 4.079755783081055, "learning_rate": 6.48930645933185e-05, "loss": 0.7838, "step": 70 }, { "epoch": 0.4296520423600605, "grad_norm": 2.2050883769989014, "learning_rate": 6.395892414064362e-05, "loss": 0.3974, "step": 71 }, { "epoch": 0.4357034795763994, "grad_norm": 3.4968700408935547, "learning_rate": 6.301946889734302e-05, "loss": 0.5544, "step": 72 }, { "epoch": 0.44175491679273826, "grad_norm": 3.0294439792633057, "learning_rate": 6.20750565563069e-05, "loss": 0.4241, "step": 73 }, { "epoch": 0.44780635400907715, "grad_norm": 2.7060914039611816, "learning_rate": 6.112604669781572e-05, "loss": 0.3418, "step": 74 }, { "epoch": 0.45385779122541603, "grad_norm": 1.8465664386749268, "learning_rate": 6.01728006526317e-05, "loss": 0.2793, "step": 75 }, { "epoch": 0.4599092284417549, "grad_norm": 2.7685420513153076, "learning_rate": 5.921568136442397e-05, "loss": 0.2668, "step": 76 }, { "epoch": 0.4659606656580938, "grad_norm": 3.251429319381714, "learning_rate": 5.8255053251579616e-05, "loss": 0.2514, "step": 77 }, { "epoch": 0.4720121028744327, "grad_norm": 3.9114441871643066, "learning_rate": 5.7291282068453166e-05, "loss": 0.2437, "step": 78 }, { "epoch": 0.47806354009077157, "grad_norm": 2.475526809692383, "learning_rate": 5.632473476610748e-05, "loss": 0.1536, "step": 79 }, { "epoch": 0.48411497730711045, "grad_norm": 4.316705703735352, "learning_rate": 5.535577935259888e-05, "loss": 0.2268, "step": 80 }, { "epoch": 0.49016641452344933, "grad_norm": 7.602482795715332, "learning_rate": 5.438478475286003e-05, "loss": 0.2635, "step": 81 }, { "epoch": 0.4962178517397882, "grad_norm": 6.1896653175354, "learning_rate": 5.341212066823355e-05, "loss": 0.3162, "step": 82 }, { "epoch": 0.5022692889561271, "grad_norm": 3.891818046569824, "learning_rate": 5.243815743571009e-05, "loss": 0.7189, "step": 83 }, { "epoch": 0.5083207261724659, "grad_norm": 7.040829658508301, "learning_rate": 5.146326588692438e-05, "loss": 1.1875, "step": 84 }, { "epoch": 0.5143721633888049, "grad_norm": 9.615294456481934, "learning_rate": 5.048781720696291e-05, "loss": 1.5413, "step": 85 }, { "epoch": 0.5204236006051437, "grad_norm": 8.135547637939453, "learning_rate": 4.95121827930371e-05, "loss": 1.5035, "step": 86 }, { "epoch": 0.5264750378214826, "grad_norm": 5.731124401092529, "learning_rate": 4.853673411307564e-05, "loss": 1.2925, "step": 87 }, { "epoch": 0.5325264750378215, "grad_norm": 5.864335060119629, "learning_rate": 4.756184256428992e-05, "loss": 0.8643, "step": 88 }, { "epoch": 0.5385779122541604, "grad_norm": 6.120126247406006, "learning_rate": 4.658787933176646e-05, "loss": 0.8852, "step": 89 }, { "epoch": 0.5446293494704992, "grad_norm": 3.3239099979400635, "learning_rate": 4.561521524713997e-05, "loss": 0.6142, "step": 90 }, { "epoch": 0.5506807866868382, "grad_norm": 3.291146993637085, "learning_rate": 4.4644220647401136e-05, "loss": 0.6233, "step": 91 }, { "epoch": 0.556732223903177, "grad_norm": 2.9767818450927734, "learning_rate": 4.367526523389253e-05, "loss": 0.4502, "step": 92 }, { "epoch": 0.5627836611195158, "grad_norm": 2.5687639713287354, "learning_rate": 4.2708717931546825e-05, "loss": 0.4639, "step": 93 }, { "epoch": 0.5688350983358548, "grad_norm": 2.8956291675567627, "learning_rate": 4.174494674842038e-05, "loss": 0.3833, "step": 94 }, { "epoch": 0.5748865355521936, "grad_norm": 3.41869854927063, "learning_rate": 4.0784318635576055e-05, "loss": 0.3458, "step": 95 }, { "epoch": 0.5809379727685325, "grad_norm": 2.012956142425537, "learning_rate": 3.982719934736832e-05, "loss": 0.2622, "step": 96 }, { "epoch": 0.5869894099848714, "grad_norm": 1.7718323469161987, "learning_rate": 3.887395330218429e-05, "loss": 0.1466, "step": 97 }, { "epoch": 0.5930408472012103, "grad_norm": 1.5787506103515625, "learning_rate": 3.792494344369311e-05, "loss": 0.1388, "step": 98 }, { "epoch": 0.5990922844175491, "grad_norm": 1.3233414888381958, "learning_rate": 3.698053110265699e-05, "loss": 0.117, "step": 99 }, { "epoch": 0.6051437216338881, "grad_norm": 1.1028014421463013, "learning_rate": 3.604107585935638e-05, "loss": 0.12, "step": 100 }, { "epoch": 0.6111951588502269, "grad_norm": 1.057910442352295, "learning_rate": 3.510693540668151e-05, "loss": 0.0996, "step": 101 }, { "epoch": 0.6172465960665658, "grad_norm": 0.8907783031463623, "learning_rate": 3.4178465413942625e-05, "loss": 0.0811, "step": 102 }, { "epoch": 0.6232980332829047, "grad_norm": 0.7459267377853394, "learning_rate": 3.325601939145069e-05, "loss": 0.0353, "step": 103 }, { "epoch": 0.6293494704992436, "grad_norm": 0.9089831113815308, "learning_rate": 3.23399485559201e-05, "loss": 0.0709, "step": 104 }, { "epoch": 0.6354009077155824, "grad_norm": 0.7159168124198914, "learning_rate": 3.143060169674468e-05, "loss": 0.0451, "step": 105 }, { "epoch": 0.6414523449319214, "grad_norm": 0.5289613008499146, "learning_rate": 3.0528325043197785e-05, "loss": 0.0278, "step": 106 }, { "epoch": 0.6475037821482602, "grad_norm": 0.5118010640144348, "learning_rate": 2.963346213260737e-05, "loss": 0.0221, "step": 107 }, { "epoch": 0.653555219364599, "grad_norm": 0.354955792427063, "learning_rate": 2.874635367955579e-05, "loss": 0.0104, "step": 108 }, { "epoch": 0.659606656580938, "grad_norm": 1.1922907829284668, "learning_rate": 2.7867337446154396e-05, "loss": 0.0275, "step": 109 }, { "epoch": 0.6656580937972768, "grad_norm": 0.9284811615943909, "learning_rate": 2.6996748113442394e-05, "loss": 0.0236, "step": 110 }, { "epoch": 0.6717095310136157, "grad_norm": 0.6479511857032776, "learning_rate": 2.613491715395861e-05, "loss": 0.0328, "step": 111 }, { "epoch": 0.6777609682299546, "grad_norm": 0.3264722526073456, "learning_rate": 2.5282172705535013e-05, "loss": 0.0126, "step": 112 }, { "epoch": 0.6838124054462935, "grad_norm": 0.5275784134864807, "learning_rate": 2.4438839446359933e-05, "loss": 0.0267, "step": 113 }, { "epoch": 0.6898638426626323, "grad_norm": 1.2883987426757812, "learning_rate": 2.360523847135838e-05, "loss": 0.0482, "step": 114 }, { "epoch": 0.6959152798789713, "grad_norm": 0.6634830832481384, "learning_rate": 2.2781687169936795e-05, "loss": 0.0287, "step": 115 }, { "epoch": 0.7019667170953101, "grad_norm": 0.8741732239723206, "learning_rate": 2.196849910513858e-05, "loss": 0.037, "step": 116 }, { "epoch": 0.708018154311649, "grad_norm": 0.7417043447494507, "learning_rate": 2.1165983894256647e-05, "loss": 0.0244, "step": 117 }, { "epoch": 0.7140695915279879, "grad_norm": 1.1073297262191772, "learning_rate": 2.037444709094804e-05, "loss": 0.0391, "step": 118 }, { "epoch": 0.7201210287443268, "grad_norm": 0.9330146908760071, "learning_rate": 1.9594190068895968e-05, "loss": 0.0312, "step": 119 }, { "epoch": 0.7261724659606656, "grad_norm": 0.8450430631637573, "learning_rate": 1.8825509907063327e-05, "loss": 0.0212, "step": 120 }, { "epoch": 0.7322239031770046, "grad_norm": 0.6666033864021301, "learning_rate": 1.8068699276581285e-05, "loss": 0.0202, "step": 121 }, { "epoch": 0.7382753403933434, "grad_norm": 0.8927512168884277, "learning_rate": 1.732404632931625e-05, "loss": 0.0204, "step": 122 }, { "epoch": 0.7443267776096822, "grad_norm": 0.919458270072937, "learning_rate": 1.6591834588157523e-05, "loss": 0.0185, "step": 123 }, { "epoch": 0.7503782148260212, "grad_norm": 1.465542197227478, "learning_rate": 1.5872342839067306e-05, "loss": 0.5584, "step": 124 }, { "epoch": 0.75642965204236, "grad_norm": 1.944527506828308, "learning_rate": 1.5165845024934366e-05, "loss": 0.7492, "step": 125 }, { "epoch": 0.762481089258699, "grad_norm": 2.006526470184326, "learning_rate": 1.447261014127167e-05, "loss": 0.725, "step": 126 }, { "epoch": 0.7685325264750378, "grad_norm": 2.1773271560668945, "learning_rate": 1.3792902133797692e-05, "loss": 0.6446, "step": 127 }, { "epoch": 0.7745839636913767, "grad_norm": 2.4729340076446533, "learning_rate": 1.3126979797940336e-05, "loss": 0.8293, "step": 128 }, { "epoch": 0.7806354009077155, "grad_norm": 2.1801207065582275, "learning_rate": 1.2475096680301934e-05, "loss": 0.6321, "step": 129 }, { "epoch": 0.7866868381240545, "grad_norm": 1.851625680923462, "learning_rate": 1.1837500982122645e-05, "loss": 0.2963, "step": 130 }, { "epoch": 0.7927382753403933, "grad_norm": 2.0671639442443848, "learning_rate": 1.1214435464779006e-05, "loss": 0.2112, "step": 131 }, { "epoch": 0.7987897125567323, "grad_norm": 1.930985450744629, "learning_rate": 1.060613735735384e-05, "loss": 0.2137, "step": 132 }, { "epoch": 0.8048411497730711, "grad_norm": 2.326185464859009, "learning_rate": 1.0012838266312396e-05, "loss": 0.1678, "step": 133 }, { "epoch": 0.81089258698941, "grad_norm": 2.306683301925659, "learning_rate": 9.434764087319376e-06, "loss": 0.1384, "step": 134 }, { "epoch": 0.8169440242057489, "grad_norm": 4.009836673736572, "learning_rate": 8.87213491923029e-06, "loss": 0.1649, "step": 135 }, { "epoch": 0.8229954614220878, "grad_norm": 3.4153988361358643, "learning_rate": 8.325164980289896e-06, "loss": 0.1384, "step": 136 }, { "epoch": 0.8290468986384266, "grad_norm": 4.114608287811279, "learning_rate": 7.794062526569734e-06, "loss": 0.0969, "step": 137 }, { "epoch": 0.8350983358547656, "grad_norm": 1.5541014671325684, "learning_rate": 7.2790297726755716e-06, "loss": 0.1201, "step": 138 }, { "epoch": 0.8411497730711044, "grad_norm": 0.6836862564086914, "learning_rate": 6.780262814755284e-06, "loss": 0.0259, "step": 139 }, { "epoch": 0.8472012102874432, "grad_norm": 0.5702232718467712, "learning_rate": 6.29795155583619e-06, "loss": 0.0119, "step": 140 }, { "epoch": 0.8532526475037822, "grad_norm": 0.27770644426345825, "learning_rate": 5.83227963352041e-06, "loss": 0.0051, "step": 141 }, { "epoch": 0.859304084720121, "grad_norm": 0.44976603984832764, "learning_rate": 5.383424350065824e-06, "loss": 0.0063, "step": 142 }, { "epoch": 0.8653555219364599, "grad_norm": 0.40759414434432983, "learning_rate": 4.951556604879048e-06, "loss": 0.0091, "step": 143 }, { "epoch": 0.8714069591527988, "grad_norm": 0.573310136795044, "learning_rate": 4.536840829446387e-06, "loss": 0.0135, "step": 144 }, { "epoch": 0.8774583963691377, "grad_norm": 0.3286815285682678, "learning_rate": 4.139434924727359e-06, "loss": 0.0067, "step": 145 }, { "epoch": 0.8835098335854765, "grad_norm": 0.12208826839923859, "learning_rate": 3.7594902010346768e-06, "loss": 0.0034, "step": 146 }, { "epoch": 0.8895612708018155, "grad_norm": 0.17780545353889465, "learning_rate": 3.397151320423647e-06, "loss": 0.0044, "step": 147 }, { "epoch": 0.8956127080181543, "grad_norm": 0.4996080696582794, "learning_rate": 3.0525562416127472e-06, "loss": 0.0241, "step": 148 }, { "epoch": 0.9016641452344932, "grad_norm": 0.3310739994049072, "learning_rate": 2.725836167456608e-06, "loss": 0.0066, "step": 149 }, { "epoch": 0.9077155824508321, "grad_norm": 0.7427249550819397, "learning_rate": 2.4171154949911067e-06, "loss": 0.0224, "step": 150 }, { "epoch": 0.913767019667171, "grad_norm": 0.7192853689193726, "learning_rate": 2.1265117680698665e-06, "loss": 0.0321, "step": 151 }, { "epoch": 0.9198184568835098, "grad_norm": 0.653380274772644, "learning_rate": 1.8541356326100433e-06, "loss": 0.0235, "step": 152 }, { "epoch": 0.9258698940998488, "grad_norm": 0.7149347066879272, "learning_rate": 1.6000907944643428e-06, "loss": 0.0172, "step": 153 }, { "epoch": 0.9319213313161876, "grad_norm": 0.7151485681533813, "learning_rate": 1.3644739799356764e-06, "loss": 0.0246, "step": 154 }, { "epoch": 0.9379727685325264, "grad_norm": 0.21587873995304108, "learning_rate": 1.1473748989489774e-06, "loss": 0.0053, "step": 155 }, { "epoch": 0.9440242057488654, "grad_norm": 0.3646269738674164, "learning_rate": 9.488762108946892e-07, "loss": 0.0073, "step": 156 }, { "epoch": 0.9500756429652042, "grad_norm": 0.2228485345840454, "learning_rate": 7.690534931565518e-07, "loss": 0.0057, "step": 157 }, { "epoch": 0.9561270801815431, "grad_norm": 0.5082440376281738, "learning_rate": 6.079752123359261e-07, "loss": 0.0128, "step": 158 }, { "epoch": 0.962178517397882, "grad_norm": 0.9114623069763184, "learning_rate": 4.6570269818346224e-07, "loss": 0.0233, "step": 159 }, { "epoch": 0.9682299546142209, "grad_norm": 0.4167875349521637, "learning_rate": 3.4229012024811214e-07, "loss": 0.0073, "step": 160 }, { "epoch": 0.9742813918305597, "grad_norm": 0.13948678970336914, "learning_rate": 2.3778446725233327e-07, "loss": 0.0032, "step": 161 }, { "epoch": 0.9803328290468987, "grad_norm": 0.34661903977394104, "learning_rate": 1.5222552920138856e-07, "loss": 0.0053, "step": 162 }, { "epoch": 0.9863842662632375, "grad_norm": 0.15658196806907654, "learning_rate": 8.564588223346492e-08, "loss": 0.0029, "step": 163 }, { "epoch": 0.9924357034795764, "grad_norm": 0.40814340114593506, "learning_rate": 3.8070876216506066e-08, "loss": 0.0051, "step": 164 }, { "epoch": 0.9984871406959153, "grad_norm": 0.8116607069969177, "learning_rate": 9.518625096366363e-09, "loss": 0.1255, "step": 165 }, { "epoch": 0.9984871406959153, "eval_loss": 0.06893022358417511, "eval_runtime": 7.1896, "eval_samples_per_second": 19.473, "eval_steps_per_second": 4.868, "step": 165 }, { "epoch": 1.0045385779122542, "grad_norm": 1.0615386962890625, "learning_rate": 0.0, "loss": 0.2848, "step": 166 } ], "logging_steps": 1, "max_steps": 166, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 42, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1627739017537126e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }