{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.023545108503708355, "eval_steps": 38, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015696739002472236, "grad_norm": 12.95445442199707, "learning_rate": 4e-05, "loss": 3.7815, "step": 1 }, { "epoch": 0.00015696739002472236, "eval_loss": 0.7498059868812561, "eval_runtime": 821.0788, "eval_samples_per_second": 3.268, "eval_steps_per_second": 1.634, "step": 1 }, { "epoch": 0.0003139347800494447, "grad_norm": 14.029548645019531, "learning_rate": 8e-05, "loss": 3.2859, "step": 2 }, { "epoch": 0.0004709021700741671, "grad_norm": 6.270756244659424, "learning_rate": 0.00012, "loss": 2.9561, "step": 3 }, { "epoch": 0.0006278695600988894, "grad_norm": 14.352215766906738, "learning_rate": 0.00016, "loss": 4.7327, "step": 4 }, { "epoch": 0.0007848369501236119, "grad_norm": 10.848075866699219, "learning_rate": 0.0002, "loss": 2.3772, "step": 5 }, { "epoch": 0.0009418043401483342, "grad_norm": 11.940299987792969, "learning_rate": 0.00024, "loss": 2.1429, "step": 6 }, { "epoch": 0.0010987717301730565, "grad_norm": 9.965449333190918, "learning_rate": 0.00028, "loss": 2.3103, "step": 7 }, { "epoch": 0.0012557391201977789, "grad_norm": 14.837979316711426, "learning_rate": 0.00032, "loss": 2.7905, "step": 8 }, { "epoch": 0.0014127065102225013, "grad_norm": 11.992790222167969, "learning_rate": 0.00036, "loss": 2.418, "step": 9 }, { "epoch": 0.0015696739002472237, "grad_norm": 16.958616256713867, "learning_rate": 0.0004, "loss": 3.5232, "step": 10 }, { "epoch": 0.001726641290271946, "grad_norm": 6.701943874359131, "learning_rate": 0.0003999496469885013, "loss": 2.2088, "step": 11 }, { "epoch": 0.0018836086802966683, "grad_norm": 5.240756511688232, "learning_rate": 0.00039979861330826294, "loss": 1.9175, "step": 12 }, { "epoch": 0.0020405760703213907, "grad_norm": 11.867003440856934, "learning_rate": 0.0003995469750092912, "loss": 2.6973, "step": 13 }, { "epoch": 0.002197543460346113, "grad_norm": 11.783756256103516, "learning_rate": 0.00039919485879904784, "loss": 3.5406, "step": 14 }, { "epoch": 0.0023545108503708356, "grad_norm": 10.173484802246094, "learning_rate": 0.00039874244197864856, "loss": 2.6159, "step": 15 }, { "epoch": 0.0025114782403955578, "grad_norm": 6.714694499969482, "learning_rate": 0.00039818995235358696, "loss": 1.4429, "step": 16 }, { "epoch": 0.00266844563042028, "grad_norm": 11.201931953430176, "learning_rate": 0.00039753766811902755, "loss": 2.7839, "step": 17 }, { "epoch": 0.0028254130204450026, "grad_norm": 7.3074951171875, "learning_rate": 0.0003967859177197259, "loss": 1.5164, "step": 18 }, { "epoch": 0.002982380410469725, "grad_norm": 8.446715354919434, "learning_rate": 0.00039593507968464716, "loss": 2.2741, "step": 19 }, { "epoch": 0.0031393478004944474, "grad_norm": 5.973799705505371, "learning_rate": 0.0003949855824363647, "loss": 1.5339, "step": 20 }, { "epoch": 0.0032963151905191696, "grad_norm": 6.163540363311768, "learning_rate": 0.0003939379040753374, "loss": 1.755, "step": 21 }, { "epoch": 0.003453282580543892, "grad_norm": 7.136409759521484, "learning_rate": 0.00039279257213917066, "loss": 2.1914, "step": 22 }, { "epoch": 0.0036102499705686145, "grad_norm": 7.554473400115967, "learning_rate": 0.0003915501633369861, "loss": 2.1009, "step": 23 }, { "epoch": 0.0037672173605933367, "grad_norm": 37.26305389404297, "learning_rate": 0.00039021130325903074, "loss": 4.9365, "step": 24 }, { "epoch": 0.003924184750618059, "grad_norm": 10.216833114624023, "learning_rate": 0.00038877666606167355, "loss": 1.8749, "step": 25 }, { "epoch": 0.0040811521406427815, "grad_norm": 11.548598289489746, "learning_rate": 0.00038724697412794747, "loss": 2.5951, "step": 26 }, { "epoch": 0.004238119530667504, "grad_norm": 10.462121963500977, "learning_rate": 0.0003856229977038078, "loss": 2.2098, "step": 27 }, { "epoch": 0.004395086920692226, "grad_norm": 9.643802642822266, "learning_rate": 0.0003839055545102902, "loss": 1.9106, "step": 28 }, { "epoch": 0.0045520543107169485, "grad_norm": 9.896982192993164, "learning_rate": 0.00038209550933176323, "loss": 2.4819, "step": 29 }, { "epoch": 0.004709021700741671, "grad_norm": 15.647576332092285, "learning_rate": 0.0003801937735804838, "loss": 3.3157, "step": 30 }, { "epoch": 0.004865989090766393, "grad_norm": 7.6956634521484375, "learning_rate": 0.0003782013048376736, "loss": 2.4637, "step": 31 }, { "epoch": 0.0050229564807911155, "grad_norm": 7.653892993927002, "learning_rate": 0.0003761191063713476, "loss": 1.8644, "step": 32 }, { "epoch": 0.005179923870815838, "grad_norm": 20.444591522216797, "learning_rate": 0.0003739482266311391, "loss": 2.7017, "step": 33 }, { "epoch": 0.00533689126084056, "grad_norm": 7.471802711486816, "learning_rate": 0.00037168975872037323, "loss": 2.9944, "step": 34 }, { "epoch": 0.005493858650865283, "grad_norm": 6.789485454559326, "learning_rate": 0.00036934483984565685, "loss": 1.5157, "step": 35 }, { "epoch": 0.005650826040890005, "grad_norm": 6.496057510375977, "learning_rate": 0.00036691465074426054, "loss": 2.2969, "step": 36 }, { "epoch": 0.005807793430914728, "grad_norm": 7.474277496337891, "learning_rate": 0.00036440041508958203, "loss": 1.8918, "step": 37 }, { "epoch": 0.00596476082093945, "grad_norm": 12.531452178955078, "learning_rate": 0.0003618033988749895, "loss": 2.7539, "step": 38 }, { "epoch": 0.00596476082093945, "eval_loss": 0.6094685196876526, "eval_runtime": 820.3189, "eval_samples_per_second": 3.271, "eval_steps_per_second": 1.636, "step": 38 }, { "epoch": 0.006121728210964172, "grad_norm": 11.514887809753418, "learning_rate": 0.00035912490977635625, "loss": 3.1396, "step": 39 }, { "epoch": 0.006278695600988895, "grad_norm": 9.372078895568848, "learning_rate": 0.000356366296493606, "loss": 2.4996, "step": 40 }, { "epoch": 0.006435662991013617, "grad_norm": 12.012513160705566, "learning_rate": 0.0003535289480716022, "loss": 2.7457, "step": 41 }, { "epoch": 0.006592630381038339, "grad_norm": 9.123496055603027, "learning_rate": 0.00035061429320072223, "loss": 2.8422, "step": 42 }, { "epoch": 0.006749597771063062, "grad_norm": 6.817448139190674, "learning_rate": 0.00034762379949746815, "loss": 2.2889, "step": 43 }, { "epoch": 0.006906565161087784, "grad_norm": 5.610447883605957, "learning_rate": 0.0003445589727654783, "loss": 1.5969, "step": 44 }, { "epoch": 0.007063532551112506, "grad_norm": 7.098076343536377, "learning_rate": 0.0003414213562373095, "loss": 2.2073, "step": 45 }, { "epoch": 0.007220499941137229, "grad_norm": 9.250218391418457, "learning_rate": 0.00033821252979737297, "loss": 2.2856, "step": 46 }, { "epoch": 0.007377467331161951, "grad_norm": 6.528243064880371, "learning_rate": 0.0003349341091864149, "loss": 1.8539, "step": 47 }, { "epoch": 0.007534434721186673, "grad_norm": 11.957141876220703, "learning_rate": 0.00033158774518794254, "loss": 3.4304, "step": 48 }, { "epoch": 0.007691402111211396, "grad_norm": 5.783234596252441, "learning_rate": 0.0003281751227970048, "loss": 1.9596, "step": 49 }, { "epoch": 0.007848369501236118, "grad_norm": 15.507065773010254, "learning_rate": 0.00032469796037174674, "loss": 3.9566, "step": 50 }, { "epoch": 0.008005336891260841, "grad_norm": 7.186437606811523, "learning_rate": 0.000321158008768164, "loss": 1.9811, "step": 51 }, { "epoch": 0.008162304281285563, "grad_norm": 7.9519124031066895, "learning_rate": 0.00031755705045849464, "loss": 2.5435, "step": 52 }, { "epoch": 0.008319271671310285, "grad_norm": 11.823431968688965, "learning_rate": 0.0003138968986336904, "loss": 2.5919, "step": 53 }, { "epoch": 0.008476239061335008, "grad_norm": 5.554260730743408, "learning_rate": 0.0003101793962904205, "loss": 1.4945, "step": 54 }, { "epoch": 0.00863320645135973, "grad_norm": 7.568012237548828, "learning_rate": 0.00030640641530306733, "loss": 2.155, "step": 55 }, { "epoch": 0.008790173841384452, "grad_norm": 31.618026733398438, "learning_rate": 0.00030257985548118126, "loss": 2.7959, "step": 56 }, { "epoch": 0.008947141231409175, "grad_norm": 11.720658302307129, "learning_rate": 0.0002987016436128694, "loss": 3.1511, "step": 57 }, { "epoch": 0.009104108621433897, "grad_norm": 9.653231620788574, "learning_rate": 0.0002947737324945997, "loss": 2.2899, "step": 58 }, { "epoch": 0.009261076011458619, "grad_norm": 8.848836898803711, "learning_rate": 0.00029079809994790937, "loss": 1.7292, "step": 59 }, { "epoch": 0.009418043401483342, "grad_norm": 7.872807025909424, "learning_rate": 0.00028677674782351165, "loss": 2.3457, "step": 60 }, { "epoch": 0.009575010791508064, "grad_norm": 7.429281711578369, "learning_rate": 0.00028271170099330415, "loss": 1.9487, "step": 61 }, { "epoch": 0.009731978181532786, "grad_norm": 12.662517547607422, "learning_rate": 0.00027860500633078477, "loss": 2.1071, "step": 62 }, { "epoch": 0.00988894557155751, "grad_norm": 7.445736885070801, "learning_rate": 0.00027445873168038907, "loss": 2.6263, "step": 63 }, { "epoch": 0.010045912961582231, "grad_norm": 8.707975387573242, "learning_rate": 0.0002702749648162686, "loss": 2.4687, "step": 64 }, { "epoch": 0.010202880351606953, "grad_norm": 7.881085395812988, "learning_rate": 0.00026605581239103347, "loss": 2.033, "step": 65 }, { "epoch": 0.010359847741631676, "grad_norm": 10.121798515319824, "learning_rate": 0.00026180339887498953, "loss": 1.9015, "step": 66 }, { "epoch": 0.010516815131656398, "grad_norm": 21.383502960205078, "learning_rate": 0.00025751986548640346, "loss": 2.6845, "step": 67 }, { "epoch": 0.01067378252168112, "grad_norm": 13.87905216217041, "learning_rate": 0.00025320736911333503, "loss": 2.0689, "step": 68 }, { "epoch": 0.010830749911705843, "grad_norm": 20.752147674560547, "learning_rate": 0.0002488680812275788, "loss": 2.3047, "step": 69 }, { "epoch": 0.010987717301730565, "grad_norm": 8.677228927612305, "learning_rate": 0.0002445041867912629, "loss": 2.2145, "step": 70 }, { "epoch": 0.011144684691755289, "grad_norm": 16.90841293334961, "learning_rate": 0.00024011788315665458, "loss": 2.6634, "step": 71 }, { "epoch": 0.01130165208178001, "grad_norm": 58.644561767578125, "learning_rate": 0.00023571137895972733, "loss": 3.7571, "step": 72 }, { "epoch": 0.011458619471804732, "grad_norm": 9.913825035095215, "learning_rate": 0.0002312868930080462, "loss": 2.2174, "step": 73 }, { "epoch": 0.011615586861829456, "grad_norm": 7.47474479675293, "learning_rate": 0.0002268466531635311, "loss": 1.7509, "step": 74 }, { "epoch": 0.011772554251854177, "grad_norm": 8.850740432739258, "learning_rate": 0.00022239289522066157, "loss": 1.9296, "step": 75 }, { "epoch": 0.0119295216418789, "grad_norm": 6.602862358093262, "learning_rate": 0.00021792786178068672, "loss": 2.5602, "step": 76 }, { "epoch": 0.0119295216418789, "eval_loss": 0.5898057818412781, "eval_runtime": 820.7489, "eval_samples_per_second": 3.269, "eval_steps_per_second": 1.635, "step": 76 }, { "epoch": 0.012086489031903623, "grad_norm": 6.227441787719727, "learning_rate": 0.00021345380112240797, "loss": 1.5853, "step": 77 }, { "epoch": 0.012243456421928344, "grad_norm": 6.998437881469727, "learning_rate": 0.00020897296607010301, "loss": 2.4302, "step": 78 }, { "epoch": 0.012400423811953066, "grad_norm": 7.076618671417236, "learning_rate": 0.00020448761285916104, "loss": 2.2843, "step": 79 }, { "epoch": 0.01255739120197779, "grad_norm": 7.667428016662598, "learning_rate": 0.0002, "loss": 2.4866, "step": 80 }, { "epoch": 0.012714358592002512, "grad_norm": 8.555462837219238, "learning_rate": 0.00019551238714083903, "loss": 2.2074, "step": 81 }, { "epoch": 0.012871325982027233, "grad_norm": 13.07919979095459, "learning_rate": 0.00019102703392989709, "loss": 2.0399, "step": 82 }, { "epoch": 0.013028293372051957, "grad_norm": 7.338263034820557, "learning_rate": 0.00018654619887759207, "loss": 2.3211, "step": 83 }, { "epoch": 0.013185260762076679, "grad_norm": 15.829663276672363, "learning_rate": 0.00018207213821931333, "loss": 2.6278, "step": 84 }, { "epoch": 0.0133422281521014, "grad_norm": 24.069778442382812, "learning_rate": 0.00017760710477933845, "loss": 2.8608, "step": 85 }, { "epoch": 0.013499195542126124, "grad_norm": 10.94772720336914, "learning_rate": 0.00017315334683646897, "loss": 2.7356, "step": 86 }, { "epoch": 0.013656162932150846, "grad_norm": 11.706610679626465, "learning_rate": 0.00016871310699195379, "loss": 2.2746, "step": 87 }, { "epoch": 0.013813130322175567, "grad_norm": 6.187313556671143, "learning_rate": 0.00016428862104027268, "loss": 1.8093, "step": 88 }, { "epoch": 0.01397009771220029, "grad_norm": 21.33131980895996, "learning_rate": 0.00015988211684334546, "loss": 3.0504, "step": 89 }, { "epoch": 0.014127065102225013, "grad_norm": 5.162511825561523, "learning_rate": 0.00015549581320873715, "loss": 1.5756, "step": 90 }, { "epoch": 0.014284032492249734, "grad_norm": 7.791429042816162, "learning_rate": 0.00015113191877242117, "loss": 1.8701, "step": 91 }, { "epoch": 0.014440999882274458, "grad_norm": 10.379227638244629, "learning_rate": 0.00014679263088666499, "loss": 2.3252, "step": 92 }, { "epoch": 0.01459796727229918, "grad_norm": 10.369537353515625, "learning_rate": 0.00014248013451359656, "loss": 2.2013, "step": 93 }, { "epoch": 0.014754934662323901, "grad_norm": 7.18357515335083, "learning_rate": 0.00013819660112501054, "loss": 1.4809, "step": 94 }, { "epoch": 0.014911902052348625, "grad_norm": 7.497470855712891, "learning_rate": 0.00013394418760896666, "loss": 1.8012, "step": 95 }, { "epoch": 0.015068869442373347, "grad_norm": 8.430214881896973, "learning_rate": 0.00012972503518373144, "loss": 1.6045, "step": 96 }, { "epoch": 0.01522583683239807, "grad_norm": 7.286400318145752, "learning_rate": 0.00012554126831961098, "loss": 1.9472, "step": 97 }, { "epoch": 0.015382804222422792, "grad_norm": 6.470623970031738, "learning_rate": 0.0001213949936692153, "loss": 1.9869, "step": 98 }, { "epoch": 0.015539771612447514, "grad_norm": 13.478861808776855, "learning_rate": 0.00011728829900669591, "loss": 2.1083, "step": 99 }, { "epoch": 0.015696739002472235, "grad_norm": 6.1007585525512695, "learning_rate": 0.00011322325217648839, "loss": 1.8082, "step": 100 }, { "epoch": 0.015853706392496957, "grad_norm": 7.112922668457031, "learning_rate": 0.00010920190005209065, "loss": 1.9454, "step": 101 }, { "epoch": 0.016010673782521682, "grad_norm": 12.862048149108887, "learning_rate": 0.00010522626750540028, "loss": 2.4716, "step": 102 }, { "epoch": 0.016167641172546404, "grad_norm": 6.302742958068848, "learning_rate": 0.00010129835638713063, "loss": 1.688, "step": 103 }, { "epoch": 0.016324608562571126, "grad_norm": 11.294086456298828, "learning_rate": 9.74201445188188e-05, "loss": 2.2397, "step": 104 }, { "epoch": 0.016481575952595848, "grad_norm": 12.633040428161621, "learning_rate": 9.359358469693271e-05, "loss": 2.1285, "step": 105 }, { "epoch": 0.01663854334262057, "grad_norm": 6.535755157470703, "learning_rate": 8.982060370957952e-05, "loss": 1.8518, "step": 106 }, { "epoch": 0.01679551073264529, "grad_norm": 7.625582695007324, "learning_rate": 8.610310136630962e-05, "loss": 2.0861, "step": 107 }, { "epoch": 0.016952478122670016, "grad_norm": 6.302818775177002, "learning_rate": 8.24429495415054e-05, "loss": 1.6471, "step": 108 }, { "epoch": 0.017109445512694738, "grad_norm": 7.8976030349731445, "learning_rate": 7.884199123183605e-05, "loss": 1.7234, "step": 109 }, { "epoch": 0.01726641290271946, "grad_norm": 19.286394119262695, "learning_rate": 7.530203962825331e-05, "loss": 3.6885, "step": 110 }, { "epoch": 0.017423380292744182, "grad_norm": 7.020461082458496, "learning_rate": 7.182487720299517e-05, "loss": 1.7969, "step": 111 }, { "epoch": 0.017580347682768904, "grad_norm": 10.761330604553223, "learning_rate": 6.841225481205749e-05, "loss": 2.5873, "step": 112 }, { "epoch": 0.01773731507279363, "grad_norm": 6.884322166442871, "learning_rate": 6.506589081358514e-05, "loss": 1.5864, "step": 113 }, { "epoch": 0.01789428246281835, "grad_norm": 8.130167007446289, "learning_rate": 6.178747020262707e-05, "loss": 1.8232, "step": 114 }, { "epoch": 0.01789428246281835, "eval_loss": 0.5430928468704224, "eval_runtime": 820.8054, "eval_samples_per_second": 3.269, "eval_steps_per_second": 1.635, "step": 114 }, { "epoch": 0.018051249852843072, "grad_norm": 6.698558807373047, "learning_rate": 5.857864376269051e-05, "loss": 2.5849, "step": 115 }, { "epoch": 0.018208217242867794, "grad_norm": 10.639579772949219, "learning_rate": 5.544102723452171e-05, "loss": 2.6, "step": 116 }, { "epoch": 0.018365184632892516, "grad_norm": 8.183032989501953, "learning_rate": 5.237620050253189e-05, "loss": 2.6161, "step": 117 }, { "epoch": 0.018522152022917238, "grad_norm": 6.063802242279053, "learning_rate": 4.938570679927783e-05, "loss": 1.3668, "step": 118 }, { "epoch": 0.018679119412941963, "grad_norm": 5.778029441833496, "learning_rate": 4.647105192839778e-05, "loss": 1.6742, "step": 119 }, { "epoch": 0.018836086802966685, "grad_norm": 9.222543716430664, "learning_rate": 4.363370350639404e-05, "loss": 2.0035, "step": 120 }, { "epoch": 0.018993054192991406, "grad_norm": 10.651296615600586, "learning_rate": 4.087509022364382e-05, "loss": 2.5014, "step": 121 }, { "epoch": 0.019150021583016128, "grad_norm": 11.831440925598145, "learning_rate": 3.819660112501053e-05, "loss": 2.2869, "step": 122 }, { "epoch": 0.01930698897304085, "grad_norm": 7.712268829345703, "learning_rate": 3.5599584910418035e-05, "loss": 2.0851, "step": 123 }, { "epoch": 0.01946395636306557, "grad_norm": 9.053482055664062, "learning_rate": 3.3085349255739474e-05, "loss": 1.8142, "step": 124 }, { "epoch": 0.019620923753090297, "grad_norm": 8.176172256469727, "learning_rate": 3.0655160154343174e-05, "loss": 2.3284, "step": 125 }, { "epoch": 0.01977789114311502, "grad_norm": 18.283300399780273, "learning_rate": 2.831024127962678e-05, "loss": 1.9836, "step": 126 }, { "epoch": 0.01993485853313974, "grad_norm": 7.915673732757568, "learning_rate": 2.6051773368860934e-05, "loss": 2.3106, "step": 127 }, { "epoch": 0.020091825923164462, "grad_norm": 6.272395133972168, "learning_rate": 2.38808936286524e-05, "loss": 1.862, "step": 128 }, { "epoch": 0.020248793313189184, "grad_norm": 9.373351097106934, "learning_rate": 2.1798695162326442e-05, "loss": 2.0214, "step": 129 }, { "epoch": 0.020405760703213906, "grad_norm": 8.452390670776367, "learning_rate": 1.9806226419516192e-05, "loss": 1.8931, "step": 130 }, { "epoch": 0.02056272809323863, "grad_norm": 6.104604721069336, "learning_rate": 1.790449066823683e-05, "loss": 1.9672, "step": 131 }, { "epoch": 0.020719695483263353, "grad_norm": 5.282866477966309, "learning_rate": 1.6094445489709885e-05, "loss": 2.1532, "step": 132 }, { "epoch": 0.020876662873288074, "grad_norm": 8.8634672164917, "learning_rate": 1.4377002296192233e-05, "loss": 1.9147, "step": 133 }, { "epoch": 0.021033630263312796, "grad_norm": 29.37977409362793, "learning_rate": 1.275302587205256e-05, "loss": 3.5516, "step": 134 }, { "epoch": 0.021190597653337518, "grad_norm": 12.045405387878418, "learning_rate": 1.1223333938326485e-05, "loss": 2.0136, "step": 135 }, { "epoch": 0.02134756504336224, "grad_norm": 6.3739399909973145, "learning_rate": 9.788696740969295e-06, "loss": 1.7538, "step": 136 }, { "epoch": 0.021504532433386965, "grad_norm": 10.208410263061523, "learning_rate": 8.44983666301391e-06, "loss": 1.9149, "step": 137 }, { "epoch": 0.021661499823411687, "grad_norm": 6.10170841217041, "learning_rate": 7.2074278608293525e-06, "loss": 1.7403, "step": 138 }, { "epoch": 0.02181846721343641, "grad_norm": 11.614684104919434, "learning_rate": 6.062095924662625e-06, "loss": 2.1914, "step": 139 }, { "epoch": 0.02197543460346113, "grad_norm": 13.544835090637207, "learning_rate": 5.0144175636352765e-06, "loss": 2.4752, "step": 140 }, { "epoch": 0.022132401993485852, "grad_norm": 10.997184753417969, "learning_rate": 4.064920315352904e-06, "loss": 1.691, "step": 141 }, { "epoch": 0.022289369383510577, "grad_norm": 5.709866046905518, "learning_rate": 3.2140822802740668e-06, "loss": 1.5808, "step": 142 }, { "epoch": 0.0224463367735353, "grad_norm": 8.197881698608398, "learning_rate": 2.462331880972468e-06, "loss": 2.0699, "step": 143 }, { "epoch": 0.02260330416356002, "grad_norm": 9.883748054504395, "learning_rate": 1.81004764641306e-06, "loss": 2.9457, "step": 144 }, { "epoch": 0.022760271553584743, "grad_norm": 9.526391983032227, "learning_rate": 1.2575580213514792e-06, "loss": 2.7024, "step": 145 }, { "epoch": 0.022917238943609464, "grad_norm": 6.334656715393066, "learning_rate": 8.051412009521864e-07, "loss": 2.2296, "step": 146 }, { "epoch": 0.023074206333634186, "grad_norm": 6.532505989074707, "learning_rate": 4.530249907087836e-07, "loss": 2.0629, "step": 147 }, { "epoch": 0.02323117372365891, "grad_norm": 6.112399578094482, "learning_rate": 2.0138669173708213e-07, "loss": 1.8538, "step": 148 }, { "epoch": 0.023388141113683633, "grad_norm": 6.967254161834717, "learning_rate": 5.035301149869387e-08, "loss": 1.9779, "step": 149 }, { "epoch": 0.023545108503708355, "grad_norm": 6.000161647796631, "learning_rate": 0.0, "loss": 1.649, "step": 150 } ], "logging_steps": 1, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 38, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.333035535623782e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }