{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 500, "global_step": 504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07976071784646062, "grad_norm": 8.951485022406759, "learning_rate": 3.1372549019607846e-06, "loss": 1.6286, "step": 5 }, { "epoch": 0.15952143569292124, "grad_norm": 1.7911944426834843, "learning_rate": 7.058823529411766e-06, "loss": 1.4082, "step": 10 }, { "epoch": 0.23928215353938184, "grad_norm": 1.188948888919745, "learning_rate": 1.0980392156862747e-05, "loss": 1.2277, "step": 15 }, { "epoch": 0.3190428713858425, "grad_norm": 0.6895721187833729, "learning_rate": 1.4901960784313726e-05, "loss": 1.1183, "step": 20 }, { "epoch": 0.3988035892323031, "grad_norm": 0.5800787932266839, "learning_rate": 1.8823529411764708e-05, "loss": 1.065, "step": 25 }, { "epoch": 0.4785643070787637, "grad_norm": 0.5106042574703776, "learning_rate": 2.274509803921569e-05, "loss": 1.0376, "step": 30 }, { "epoch": 0.5583250249252243, "grad_norm": 0.5130368281162043, "learning_rate": 2.6666666666666667e-05, "loss": 1.0193, "step": 35 }, { "epoch": 0.638085742771685, "grad_norm": 0.5048169792373989, "learning_rate": 3.0588235294117644e-05, "loss": 1.0101, "step": 40 }, { "epoch": 0.7178464606181456, "grad_norm": 0.49100483855991833, "learning_rate": 3.450980392156863e-05, "loss": 0.9855, "step": 45 }, { "epoch": 0.7976071784646062, "grad_norm": 0.5082070499580861, "learning_rate": 3.8431372549019614e-05, "loss": 0.9472, "step": 50 }, { "epoch": 0.8773678963110668, "grad_norm": 0.45791310359750687, "learning_rate": 3.999567157212646e-05, "loss": 0.9543, "step": 55 }, { "epoch": 0.9571286141575274, "grad_norm": 0.45379188386242486, "learning_rate": 3.996922685294587e-05, "loss": 0.9476, "step": 60 }, { "epoch": 1.0319042871385842, "grad_norm": 0.6240889944377968, "learning_rate": 3.991877385171789e-05, "loss": 0.897, "step": 65 }, { "epoch": 1.111665004985045, "grad_norm": 0.5440466161961147, "learning_rate": 3.9844373226268305e-05, "loss": 0.8082, "step": 70 }, { "epoch": 1.1914257228315055, "grad_norm": 0.5107070923561905, "learning_rate": 3.97461144257888e-05, "loss": 0.8094, "step": 75 }, { "epoch": 1.271186440677966, "grad_norm": 0.4651202387725849, "learning_rate": 3.9624115583295375e-05, "loss": 0.7905, "step": 80 }, { "epoch": 1.3509471585244266, "grad_norm": 0.4774453312775275, "learning_rate": 3.9478523373601325e-05, "loss": 0.8298, "step": 85 }, { "epoch": 1.4307078763708874, "grad_norm": 0.4556406153190642, "learning_rate": 3.930951283697534e-05, "loss": 0.7867, "step": 90 }, { "epoch": 1.510468594217348, "grad_norm": 0.4830106629278, "learning_rate": 3.9117287168696956e-05, "loss": 0.8198, "step": 95 }, { "epoch": 1.5902293120638085, "grad_norm": 0.5022923875137891, "learning_rate": 3.8902077474762155e-05, "loss": 0.7834, "step": 100 }, { "epoch": 1.6699900299102692, "grad_norm": 0.44123948565588444, "learning_rate": 3.866414249403295e-05, "loss": 0.8185, "step": 105 }, { "epoch": 1.7497507477567298, "grad_norm": 0.4430352770475653, "learning_rate": 3.840376828716499e-05, "loss": 0.8124, "step": 110 }, { "epoch": 1.8295114656031903, "grad_norm": 0.4924851273312515, "learning_rate": 3.812126789268712e-05, "loss": 0.8057, "step": 115 }, { "epoch": 1.909272183449651, "grad_norm": 0.46057083006170463, "learning_rate": 3.781698095064647e-05, "loss": 0.8145, "step": 120 }, { "epoch": 1.9890329012961117, "grad_norm": 0.4611804285939624, "learning_rate": 3.7491273294271386e-05, "loss": 0.7913, "step": 125 }, { "epoch": 2.0638085742771684, "grad_norm": 0.6959783330087537, "learning_rate": 3.7144536510143436e-05, "loss": 0.6452, "step": 130 }, { "epoch": 2.143569292123629, "grad_norm": 0.5166517982510289, "learning_rate": 3.6777187467406857e-05, "loss": 0.5773, "step": 135 }, { "epoch": 2.22333000997009, "grad_norm": 0.571267388696848, "learning_rate": 3.638966781658187e-05, "loss": 0.5875, "step": 140 }, { "epoch": 2.30309072781655, "grad_norm": 0.5939430686190696, "learning_rate": 3.598244345858412e-05, "loss": 0.5823, "step": 145 }, { "epoch": 2.382851445663011, "grad_norm": 0.5778600291078902, "learning_rate": 3.555600398458885e-05, "loss": 0.582, "step": 150 }, { "epoch": 2.4626121635094718, "grad_norm": 0.5529898865422133, "learning_rate": 3.511086208741303e-05, "loss": 0.5911, "step": 155 }, { "epoch": 2.542372881355932, "grad_norm": 0.5081756947326533, "learning_rate": 3.464755294512325e-05, "loss": 0.5844, "step": 160 }, { "epoch": 2.622133599202393, "grad_norm": 0.4848619192082748, "learning_rate": 3.4166633577610425e-05, "loss": 0.5893, "step": 165 }, { "epoch": 2.701894317048853, "grad_norm": 0.5322251164040472, "learning_rate": 3.366868217690482e-05, "loss": 0.5912, "step": 170 }, { "epoch": 2.781655034895314, "grad_norm": 0.48557366731521723, "learning_rate": 3.315429741203666e-05, "loss": 0.5813, "step": 175 }, { "epoch": 2.8614157527417747, "grad_norm": 0.48422520390869345, "learning_rate": 3.2624097709277855e-05, "loss": 0.5943, "step": 180 }, { "epoch": 2.9411764705882355, "grad_norm": 0.447236488520265, "learning_rate": 3.2078720508630427e-05, "loss": 0.5924, "step": 185 }, { "epoch": 3.015952143569292, "grad_norm": 1.2170595347725597, "learning_rate": 3.1518821497455326e-05, "loss": 0.5568, "step": 190 }, { "epoch": 3.0957128614157527, "grad_norm": 0.9128571202802258, "learning_rate": 3.094507382216312e-05, "loss": 0.3975, "step": 195 }, { "epoch": 3.1754735792622135, "grad_norm": 0.6352191401696539, "learning_rate": 3.0358167278914387e-05, "loss": 0.3864, "step": 200 }, { "epoch": 3.255234297108674, "grad_norm": 0.5562946553628527, "learning_rate": 2.9758807484302566e-05, "loss": 0.3815, "step": 205 }, { "epoch": 3.3349950149551346, "grad_norm": 0.5968079984333592, "learning_rate": 2.9147715027016593e-05, "loss": 0.3902, "step": 210 }, { "epoch": 3.4147557328015954, "grad_norm": 0.5987175013101326, "learning_rate": 2.8525624601503055e-05, "loss": 0.3884, "step": 215 }, { "epoch": 3.4945164506480557, "grad_norm": 0.582626716390902, "learning_rate": 2.789328412466953e-05, "loss": 0.4097, "step": 220 }, { "epoch": 3.5742771684945165, "grad_norm": 0.5696325846022255, "learning_rate": 2.725145383669106e-05, "loss": 0.3813, "step": 225 }, { "epoch": 3.6540378863409773, "grad_norm": 0.6291341426490248, "learning_rate": 2.6600905387000716e-05, "loss": 0.3874, "step": 230 }, { "epoch": 3.7337986041874376, "grad_norm": 0.5512196870000226, "learning_rate": 2.594242090656335e-05, "loss": 0.3864, "step": 235 }, { "epoch": 3.8135593220338984, "grad_norm": 0.5172511656987956, "learning_rate": 2.5276792067547672e-05, "loss": 0.4047, "step": 240 }, { "epoch": 3.8933200398803587, "grad_norm": 0.5434124466868848, "learning_rate": 2.460481913152734e-05, "loss": 0.3849, "step": 245 }, { "epoch": 3.9730807577268195, "grad_norm": 0.5364334555823091, "learning_rate": 2.392730998735529e-05, "loss": 0.3827, "step": 250 }, { "epoch": 4.047856430707876, "grad_norm": 0.6202233480341806, "learning_rate": 2.3245079179868054e-05, "loss": 0.2996, "step": 255 }, { "epoch": 4.127617148554337, "grad_norm": 0.675034415488875, "learning_rate": 2.2558946930587907e-05, "loss": 0.2318, "step": 260 }, { "epoch": 4.2073778664007975, "grad_norm": 0.6378422283616842, "learning_rate": 2.18697381516e-05, "loss": 0.2264, "step": 265 }, { "epoch": 4.287138584247258, "grad_norm": 0.5828333823927044, "learning_rate": 2.1178281453790358e-05, "loss": 0.2249, "step": 270 }, { "epoch": 4.366899302093719, "grad_norm": 0.5645506561383866, "learning_rate": 2.0485408150636804e-05, "loss": 0.2257, "step": 275 }, { "epoch": 4.44666001994018, "grad_norm": 0.5286311194195279, "learning_rate": 1.979195125875072e-05, "loss": 0.2258, "step": 280 }, { "epoch": 4.526420737786641, "grad_norm": 0.5354213636574895, "learning_rate": 1.909874449637122e-05, "loss": 0.2277, "step": 285 }, { "epoch": 4.6061814556331, "grad_norm": 0.5132597119160957, "learning_rate": 1.84066212810157e-05, "loss": 0.2263, "step": 290 }, { "epoch": 4.685942173479561, "grad_norm": 0.4992674146867158, "learning_rate": 1.7716413727492035e-05, "loss": 0.2309, "step": 295 }, { "epoch": 4.765702891326022, "grad_norm": 0.5871955206074188, "learning_rate": 1.7028951647476862e-05, "loss": 0.2256, "step": 300 }, { "epoch": 4.845463609172483, "grad_norm": 0.5483461557535811, "learning_rate": 1.634506155186295e-05, "loss": 0.2366, "step": 305 }, { "epoch": 4.9252243270189435, "grad_norm": 0.5615291707528146, "learning_rate": 1.5665565657074874e-05, "loss": 0.2177, "step": 310 }, { "epoch": 5.0, "grad_norm": 0.5560610126898546, "learning_rate": 1.4991280896547893e-05, "loss": 0.217, "step": 315 }, { "epoch": 5.079760717846461, "grad_norm": 0.63285574638701, "learning_rate": 1.4323017938558245e-05, "loss": 0.1192, "step": 320 }, { "epoch": 5.1595214356929215, "grad_norm": 0.51986072558017, "learning_rate": 1.3661580211585947e-05, "loss": 0.1174, "step": 325 }, { "epoch": 5.239282153539381, "grad_norm": 0.4103698636395131, "learning_rate": 1.3007762938381619e-05, "loss": 0.1121, "step": 330 }, { "epoch": 5.319042871385842, "grad_norm": 0.47784813487742994, "learning_rate": 1.2362352179898855e-05, "loss": 0.1191, "step": 335 }, { "epoch": 5.398803589232303, "grad_norm": 0.424232714663781, "learning_rate": 1.1726123890241439e-05, "loss": 0.1128, "step": 340 }, { "epoch": 5.478564307078764, "grad_norm": 0.48412806315859497, "learning_rate": 1.1099842983761712e-05, "loss": 0.1144, "step": 345 }, { "epoch": 5.5583250249252245, "grad_norm": 0.4266976645996111, "learning_rate": 1.0484262415431536e-05, "loss": 0.1056, "step": 350 }, { "epoch": 5.638085742771685, "grad_norm": 0.3812408369839083, "learning_rate": 9.880122275591752e-06, "loss": 0.1106, "step": 355 }, { "epoch": 5.717846460618146, "grad_norm": 0.422453041096817, "learning_rate": 9.288148900168122e-06, "loss": 0.1056, "step": 360 }, { "epoch": 5.797607178464606, "grad_norm": 0.4322042305854276, "learning_rate": 8.70905399742389e-06, "loss": 0.112, "step": 365 }, { "epoch": 5.877367896311067, "grad_norm": 0.41261441396205645, "learning_rate": 8.143533792298545e-06, "loss": 0.1055, "step": 370 }, { "epoch": 5.9571286141575275, "grad_norm": 0.4218939861247821, "learning_rate": 7.59226818936166e-06, "loss": 0.1054, "step": 375 }, { "epoch": 6.031904287138584, "grad_norm": 0.4601781395305167, "learning_rate": 7.055919955388122e-06, "loss": 0.0903, "step": 380 }, { "epoch": 6.111665004985045, "grad_norm": 0.42572965163147103, "learning_rate": 6.535133922537513e-06, "loss": 0.056, "step": 385 }, { "epoch": 6.1914257228315055, "grad_norm": 0.39199127923177135, "learning_rate": 6.0305362130956504e-06, "loss": 0.0553, "step": 390 }, { "epoch": 6.271186440677966, "grad_norm": 0.30614111878955885, "learning_rate": 5.542733486710299e-06, "loss": 0.0538, "step": 395 }, { "epoch": 6.350947158524427, "grad_norm": 0.2838511806116735, "learning_rate": 5.072312211026125e-06, "loss": 0.0533, "step": 400 }, { "epoch": 6.430707876370887, "grad_norm": 0.3527112901949325, "learning_rate": 4.619837956595825e-06, "loss": 0.0579, "step": 405 }, { "epoch": 6.510468594217348, "grad_norm": 0.2802030569387254, "learning_rate": 4.185854716914952e-06, "loss": 0.0516, "step": 410 }, { "epoch": 6.5902293120638085, "grad_norm": 0.28919103613267777, "learning_rate": 3.7708842543981928e-06, "loss": 0.0522, "step": 415 }, { "epoch": 6.669990029910269, "grad_norm": 0.28238569460732055, "learning_rate": 3.375425473083185e-06, "loss": 0.0554, "step": 420 }, { "epoch": 6.74975074775673, "grad_norm": 0.35381160740706563, "learning_rate": 2.9999538188161705e-06, "loss": 0.0536, "step": 425 }, { "epoch": 6.829511465603191, "grad_norm": 0.27756519386816786, "learning_rate": 2.6449207076405857e-06, "loss": 0.0511, "step": 430 }, { "epoch": 6.909272183449651, "grad_norm": 0.3349698709554607, "learning_rate": 2.310752983075819e-06, "loss": 0.0531, "step": 435 }, { "epoch": 6.989032901296111, "grad_norm": 0.3120620614394932, "learning_rate": 1.9978524029386026e-06, "loss": 0.0538, "step": 440 }, { "epoch": 7.063808574277169, "grad_norm": 0.2631675945808927, "learning_rate": 1.7065951563241022e-06, "loss": 0.0479, "step": 445 }, { "epoch": 7.14356929212363, "grad_norm": 0.26021291176981515, "learning_rate": 1.437331411327274e-06, "loss": 0.0451, "step": 450 }, { "epoch": 7.2233300099700894, "grad_norm": 0.2211041691097816, "learning_rate": 1.1903848940484241e-06, "loss": 0.0373, "step": 455 }, { "epoch": 7.30309072781655, "grad_norm": 0.30062035959934313, "learning_rate": 9.660524993889386e-07, "loss": 0.0389, "step": 460 }, { "epoch": 7.382851445663011, "grad_norm": 0.24836493024466927, "learning_rate": 7.646039341052747e-07, "loss": 0.0345, "step": 465 }, { "epoch": 7.462612163509472, "grad_norm": 0.2128207343872171, "learning_rate": 5.862813925502209e-07, "loss": 0.0335, "step": 470 }, { "epoch": 7.5423728813559325, "grad_norm": 0.22526753960255325, "learning_rate": 4.3129926549136057e-07, "loss": 0.0356, "step": 475 }, { "epoch": 7.622133599202392, "grad_norm": 0.21640137907033566, "learning_rate": 2.99843882356774e-07, "loss": 0.0352, "step": 480 }, { "epoch": 7.701894317048853, "grad_norm": 0.22957439567149518, "learning_rate": 1.9207328721788653e-07, "loss": 0.0366, "step": 485 }, { "epoch": 7.781655034895314, "grad_norm": 0.20281851671161966, "learning_rate": 1.0811704877875528e-07, "loss": 0.0355, "step": 490 }, { "epoch": 7.861415752741775, "grad_norm": 0.2399792418034644, "learning_rate": 4.807610460030976e-08, "loss": 0.0345, "step": 495 }, { "epoch": 7.9411764705882355, "grad_norm": 0.2284150095647677, "learning_rate": 1.202263974674045e-08, "loss": 0.0367, "step": 500 }, { "epoch": 8.0, "step": 504, "total_flos": 718744629805056.0, "train_loss": 0.414989875806939, "train_runtime": 39904.848, "train_samples_per_second": 1.608, "train_steps_per_second": 0.013 } ], "logging_steps": 5, "max_steps": 504, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 718744629805056.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }