qwen3-8B-sft-mix-v20250907-8ep / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.0,
"eval_steps": 500,
"global_step": 504,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07976071784646062,
"grad_norm": 8.951485022406759,
"learning_rate": 3.1372549019607846e-06,
"loss": 1.6286,
"step": 5
},
{
"epoch": 0.15952143569292124,
"grad_norm": 1.7911944426834843,
"learning_rate": 7.058823529411766e-06,
"loss": 1.4082,
"step": 10
},
{
"epoch": 0.23928215353938184,
"grad_norm": 1.188948888919745,
"learning_rate": 1.0980392156862747e-05,
"loss": 1.2277,
"step": 15
},
{
"epoch": 0.3190428713858425,
"grad_norm": 0.6895721187833729,
"learning_rate": 1.4901960784313726e-05,
"loss": 1.1183,
"step": 20
},
{
"epoch": 0.3988035892323031,
"grad_norm": 0.5800787932266839,
"learning_rate": 1.8823529411764708e-05,
"loss": 1.065,
"step": 25
},
{
"epoch": 0.4785643070787637,
"grad_norm": 0.5106042574703776,
"learning_rate": 2.274509803921569e-05,
"loss": 1.0376,
"step": 30
},
{
"epoch": 0.5583250249252243,
"grad_norm": 0.5130368281162043,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.0193,
"step": 35
},
{
"epoch": 0.638085742771685,
"grad_norm": 0.5048169792373989,
"learning_rate": 3.0588235294117644e-05,
"loss": 1.0101,
"step": 40
},
{
"epoch": 0.7178464606181456,
"grad_norm": 0.49100483855991833,
"learning_rate": 3.450980392156863e-05,
"loss": 0.9855,
"step": 45
},
{
"epoch": 0.7976071784646062,
"grad_norm": 0.5082070499580861,
"learning_rate": 3.8431372549019614e-05,
"loss": 0.9472,
"step": 50
},
{
"epoch": 0.8773678963110668,
"grad_norm": 0.45791310359750687,
"learning_rate": 3.999567157212646e-05,
"loss": 0.9543,
"step": 55
},
{
"epoch": 0.9571286141575274,
"grad_norm": 0.45379188386242486,
"learning_rate": 3.996922685294587e-05,
"loss": 0.9476,
"step": 60
},
{
"epoch": 1.0319042871385842,
"grad_norm": 0.6240889944377968,
"learning_rate": 3.991877385171789e-05,
"loss": 0.897,
"step": 65
},
{
"epoch": 1.111665004985045,
"grad_norm": 0.5440466161961147,
"learning_rate": 3.9844373226268305e-05,
"loss": 0.8082,
"step": 70
},
{
"epoch": 1.1914257228315055,
"grad_norm": 0.5107070923561905,
"learning_rate": 3.97461144257888e-05,
"loss": 0.8094,
"step": 75
},
{
"epoch": 1.271186440677966,
"grad_norm": 0.4651202387725849,
"learning_rate": 3.9624115583295375e-05,
"loss": 0.7905,
"step": 80
},
{
"epoch": 1.3509471585244266,
"grad_norm": 0.4774453312775275,
"learning_rate": 3.9478523373601325e-05,
"loss": 0.8298,
"step": 85
},
{
"epoch": 1.4307078763708874,
"grad_norm": 0.4556406153190642,
"learning_rate": 3.930951283697534e-05,
"loss": 0.7867,
"step": 90
},
{
"epoch": 1.510468594217348,
"grad_norm": 0.4830106629278,
"learning_rate": 3.9117287168696956e-05,
"loss": 0.8198,
"step": 95
},
{
"epoch": 1.5902293120638085,
"grad_norm": 0.5022923875137891,
"learning_rate": 3.8902077474762155e-05,
"loss": 0.7834,
"step": 100
},
{
"epoch": 1.6699900299102692,
"grad_norm": 0.44123948565588444,
"learning_rate": 3.866414249403295e-05,
"loss": 0.8185,
"step": 105
},
{
"epoch": 1.7497507477567298,
"grad_norm": 0.4430352770475653,
"learning_rate": 3.840376828716499e-05,
"loss": 0.8124,
"step": 110
},
{
"epoch": 1.8295114656031903,
"grad_norm": 0.4924851273312515,
"learning_rate": 3.812126789268712e-05,
"loss": 0.8057,
"step": 115
},
{
"epoch": 1.909272183449651,
"grad_norm": 0.46057083006170463,
"learning_rate": 3.781698095064647e-05,
"loss": 0.8145,
"step": 120
},
{
"epoch": 1.9890329012961117,
"grad_norm": 0.4611804285939624,
"learning_rate": 3.7491273294271386e-05,
"loss": 0.7913,
"step": 125
},
{
"epoch": 2.0638085742771684,
"grad_norm": 0.6959783330087537,
"learning_rate": 3.7144536510143436e-05,
"loss": 0.6452,
"step": 130
},
{
"epoch": 2.143569292123629,
"grad_norm": 0.5166517982510289,
"learning_rate": 3.6777187467406857e-05,
"loss": 0.5773,
"step": 135
},
{
"epoch": 2.22333000997009,
"grad_norm": 0.571267388696848,
"learning_rate": 3.638966781658187e-05,
"loss": 0.5875,
"step": 140
},
{
"epoch": 2.30309072781655,
"grad_norm": 0.5939430686190696,
"learning_rate": 3.598244345858412e-05,
"loss": 0.5823,
"step": 145
},
{
"epoch": 2.382851445663011,
"grad_norm": 0.5778600291078902,
"learning_rate": 3.555600398458885e-05,
"loss": 0.582,
"step": 150
},
{
"epoch": 2.4626121635094718,
"grad_norm": 0.5529898865422133,
"learning_rate": 3.511086208741303e-05,
"loss": 0.5911,
"step": 155
},
{
"epoch": 2.542372881355932,
"grad_norm": 0.5081756947326533,
"learning_rate": 3.464755294512325e-05,
"loss": 0.5844,
"step": 160
},
{
"epoch": 2.622133599202393,
"grad_norm": 0.4848619192082748,
"learning_rate": 3.4166633577610425e-05,
"loss": 0.5893,
"step": 165
},
{
"epoch": 2.701894317048853,
"grad_norm": 0.5322251164040472,
"learning_rate": 3.366868217690482e-05,
"loss": 0.5912,
"step": 170
},
{
"epoch": 2.781655034895314,
"grad_norm": 0.48557366731521723,
"learning_rate": 3.315429741203666e-05,
"loss": 0.5813,
"step": 175
},
{
"epoch": 2.8614157527417747,
"grad_norm": 0.48422520390869345,
"learning_rate": 3.2624097709277855e-05,
"loss": 0.5943,
"step": 180
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.447236488520265,
"learning_rate": 3.2078720508630427e-05,
"loss": 0.5924,
"step": 185
},
{
"epoch": 3.015952143569292,
"grad_norm": 1.2170595347725597,
"learning_rate": 3.1518821497455326e-05,
"loss": 0.5568,
"step": 190
},
{
"epoch": 3.0957128614157527,
"grad_norm": 0.9128571202802258,
"learning_rate": 3.094507382216312e-05,
"loss": 0.3975,
"step": 195
},
{
"epoch": 3.1754735792622135,
"grad_norm": 0.6352191401696539,
"learning_rate": 3.0358167278914387e-05,
"loss": 0.3864,
"step": 200
},
{
"epoch": 3.255234297108674,
"grad_norm": 0.5562946553628527,
"learning_rate": 2.9758807484302566e-05,
"loss": 0.3815,
"step": 205
},
{
"epoch": 3.3349950149551346,
"grad_norm": 0.5968079984333592,
"learning_rate": 2.9147715027016593e-05,
"loss": 0.3902,
"step": 210
},
{
"epoch": 3.4147557328015954,
"grad_norm": 0.5987175013101326,
"learning_rate": 2.8525624601503055e-05,
"loss": 0.3884,
"step": 215
},
{
"epoch": 3.4945164506480557,
"grad_norm": 0.582626716390902,
"learning_rate": 2.789328412466953e-05,
"loss": 0.4097,
"step": 220
},
{
"epoch": 3.5742771684945165,
"grad_norm": 0.5696325846022255,
"learning_rate": 2.725145383669106e-05,
"loss": 0.3813,
"step": 225
},
{
"epoch": 3.6540378863409773,
"grad_norm": 0.6291341426490248,
"learning_rate": 2.6600905387000716e-05,
"loss": 0.3874,
"step": 230
},
{
"epoch": 3.7337986041874376,
"grad_norm": 0.5512196870000226,
"learning_rate": 2.594242090656335e-05,
"loss": 0.3864,
"step": 235
},
{
"epoch": 3.8135593220338984,
"grad_norm": 0.5172511656987956,
"learning_rate": 2.5276792067547672e-05,
"loss": 0.4047,
"step": 240
},
{
"epoch": 3.8933200398803587,
"grad_norm": 0.5434124466868848,
"learning_rate": 2.460481913152734e-05,
"loss": 0.3849,
"step": 245
},
{
"epoch": 3.9730807577268195,
"grad_norm": 0.5364334555823091,
"learning_rate": 2.392730998735529e-05,
"loss": 0.3827,
"step": 250
},
{
"epoch": 4.047856430707876,
"grad_norm": 0.6202233480341806,
"learning_rate": 2.3245079179868054e-05,
"loss": 0.2996,
"step": 255
},
{
"epoch": 4.127617148554337,
"grad_norm": 0.675034415488875,
"learning_rate": 2.2558946930587907e-05,
"loss": 0.2318,
"step": 260
},
{
"epoch": 4.2073778664007975,
"grad_norm": 0.6378422283616842,
"learning_rate": 2.18697381516e-05,
"loss": 0.2264,
"step": 265
},
{
"epoch": 4.287138584247258,
"grad_norm": 0.5828333823927044,
"learning_rate": 2.1178281453790358e-05,
"loss": 0.2249,
"step": 270
},
{
"epoch": 4.366899302093719,
"grad_norm": 0.5645506561383866,
"learning_rate": 2.0485408150636804e-05,
"loss": 0.2257,
"step": 275
},
{
"epoch": 4.44666001994018,
"grad_norm": 0.5286311194195279,
"learning_rate": 1.979195125875072e-05,
"loss": 0.2258,
"step": 280
},
{
"epoch": 4.526420737786641,
"grad_norm": 0.5354213636574895,
"learning_rate": 1.909874449637122e-05,
"loss": 0.2277,
"step": 285
},
{
"epoch": 4.6061814556331,
"grad_norm": 0.5132597119160957,
"learning_rate": 1.84066212810157e-05,
"loss": 0.2263,
"step": 290
},
{
"epoch": 4.685942173479561,
"grad_norm": 0.4992674146867158,
"learning_rate": 1.7716413727492035e-05,
"loss": 0.2309,
"step": 295
},
{
"epoch": 4.765702891326022,
"grad_norm": 0.5871955206074188,
"learning_rate": 1.7028951647476862e-05,
"loss": 0.2256,
"step": 300
},
{
"epoch": 4.845463609172483,
"grad_norm": 0.5483461557535811,
"learning_rate": 1.634506155186295e-05,
"loss": 0.2366,
"step": 305
},
{
"epoch": 4.9252243270189435,
"grad_norm": 0.5615291707528146,
"learning_rate": 1.5665565657074874e-05,
"loss": 0.2177,
"step": 310
},
{
"epoch": 5.0,
"grad_norm": 0.5560610126898546,
"learning_rate": 1.4991280896547893e-05,
"loss": 0.217,
"step": 315
},
{
"epoch": 5.079760717846461,
"grad_norm": 0.63285574638701,
"learning_rate": 1.4323017938558245e-05,
"loss": 0.1192,
"step": 320
},
{
"epoch": 5.1595214356929215,
"grad_norm": 0.51986072558017,
"learning_rate": 1.3661580211585947e-05,
"loss": 0.1174,
"step": 325
},
{
"epoch": 5.239282153539381,
"grad_norm": 0.4103698636395131,
"learning_rate": 1.3007762938381619e-05,
"loss": 0.1121,
"step": 330
},
{
"epoch": 5.319042871385842,
"grad_norm": 0.47784813487742994,
"learning_rate": 1.2362352179898855e-05,
"loss": 0.1191,
"step": 335
},
{
"epoch": 5.398803589232303,
"grad_norm": 0.424232714663781,
"learning_rate": 1.1726123890241439e-05,
"loss": 0.1128,
"step": 340
},
{
"epoch": 5.478564307078764,
"grad_norm": 0.48412806315859497,
"learning_rate": 1.1099842983761712e-05,
"loss": 0.1144,
"step": 345
},
{
"epoch": 5.5583250249252245,
"grad_norm": 0.4266976645996111,
"learning_rate": 1.0484262415431536e-05,
"loss": 0.1056,
"step": 350
},
{
"epoch": 5.638085742771685,
"grad_norm": 0.3812408369839083,
"learning_rate": 9.880122275591752e-06,
"loss": 0.1106,
"step": 355
},
{
"epoch": 5.717846460618146,
"grad_norm": 0.422453041096817,
"learning_rate": 9.288148900168122e-06,
"loss": 0.1056,
"step": 360
},
{
"epoch": 5.797607178464606,
"grad_norm": 0.4322042305854276,
"learning_rate": 8.70905399742389e-06,
"loss": 0.112,
"step": 365
},
{
"epoch": 5.877367896311067,
"grad_norm": 0.41261441396205645,
"learning_rate": 8.143533792298545e-06,
"loss": 0.1055,
"step": 370
},
{
"epoch": 5.9571286141575275,
"grad_norm": 0.4218939861247821,
"learning_rate": 7.59226818936166e-06,
"loss": 0.1054,
"step": 375
},
{
"epoch": 6.031904287138584,
"grad_norm": 0.4601781395305167,
"learning_rate": 7.055919955388122e-06,
"loss": 0.0903,
"step": 380
},
{
"epoch": 6.111665004985045,
"grad_norm": 0.42572965163147103,
"learning_rate": 6.535133922537513e-06,
"loss": 0.056,
"step": 385
},
{
"epoch": 6.1914257228315055,
"grad_norm": 0.39199127923177135,
"learning_rate": 6.0305362130956504e-06,
"loss": 0.0553,
"step": 390
},
{
"epoch": 6.271186440677966,
"grad_norm": 0.30614111878955885,
"learning_rate": 5.542733486710299e-06,
"loss": 0.0538,
"step": 395
},
{
"epoch": 6.350947158524427,
"grad_norm": 0.2838511806116735,
"learning_rate": 5.072312211026125e-06,
"loss": 0.0533,
"step": 400
},
{
"epoch": 6.430707876370887,
"grad_norm": 0.3527112901949325,
"learning_rate": 4.619837956595825e-06,
"loss": 0.0579,
"step": 405
},
{
"epoch": 6.510468594217348,
"grad_norm": 0.2802030569387254,
"learning_rate": 4.185854716914952e-06,
"loss": 0.0516,
"step": 410
},
{
"epoch": 6.5902293120638085,
"grad_norm": 0.28919103613267777,
"learning_rate": 3.7708842543981928e-06,
"loss": 0.0522,
"step": 415
},
{
"epoch": 6.669990029910269,
"grad_norm": 0.28238569460732055,
"learning_rate": 3.375425473083185e-06,
"loss": 0.0554,
"step": 420
},
{
"epoch": 6.74975074775673,
"grad_norm": 0.35381160740706563,
"learning_rate": 2.9999538188161705e-06,
"loss": 0.0536,
"step": 425
},
{
"epoch": 6.829511465603191,
"grad_norm": 0.27756519386816786,
"learning_rate": 2.6449207076405857e-06,
"loss": 0.0511,
"step": 430
},
{
"epoch": 6.909272183449651,
"grad_norm": 0.3349698709554607,
"learning_rate": 2.310752983075819e-06,
"loss": 0.0531,
"step": 435
},
{
"epoch": 6.989032901296111,
"grad_norm": 0.3120620614394932,
"learning_rate": 1.9978524029386026e-06,
"loss": 0.0538,
"step": 440
},
{
"epoch": 7.063808574277169,
"grad_norm": 0.2631675945808927,
"learning_rate": 1.7065951563241022e-06,
"loss": 0.0479,
"step": 445
},
{
"epoch": 7.14356929212363,
"grad_norm": 0.26021291176981515,
"learning_rate": 1.437331411327274e-06,
"loss": 0.0451,
"step": 450
},
{
"epoch": 7.2233300099700894,
"grad_norm": 0.2211041691097816,
"learning_rate": 1.1903848940484241e-06,
"loss": 0.0373,
"step": 455
},
{
"epoch": 7.30309072781655,
"grad_norm": 0.30062035959934313,
"learning_rate": 9.660524993889386e-07,
"loss": 0.0389,
"step": 460
},
{
"epoch": 7.382851445663011,
"grad_norm": 0.24836493024466927,
"learning_rate": 7.646039341052747e-07,
"loss": 0.0345,
"step": 465
},
{
"epoch": 7.462612163509472,
"grad_norm": 0.2128207343872171,
"learning_rate": 5.862813925502209e-07,
"loss": 0.0335,
"step": 470
},
{
"epoch": 7.5423728813559325,
"grad_norm": 0.22526753960255325,
"learning_rate": 4.3129926549136057e-07,
"loss": 0.0356,
"step": 475
},
{
"epoch": 7.622133599202392,
"grad_norm": 0.21640137907033566,
"learning_rate": 2.99843882356774e-07,
"loss": 0.0352,
"step": 480
},
{
"epoch": 7.701894317048853,
"grad_norm": 0.22957439567149518,
"learning_rate": 1.9207328721788653e-07,
"loss": 0.0366,
"step": 485
},
{
"epoch": 7.781655034895314,
"grad_norm": 0.20281851671161966,
"learning_rate": 1.0811704877875528e-07,
"loss": 0.0355,
"step": 490
},
{
"epoch": 7.861415752741775,
"grad_norm": 0.2399792418034644,
"learning_rate": 4.807610460030976e-08,
"loss": 0.0345,
"step": 495
},
{
"epoch": 7.9411764705882355,
"grad_norm": 0.2284150095647677,
"learning_rate": 1.202263974674045e-08,
"loss": 0.0367,
"step": 500
},
{
"epoch": 8.0,
"step": 504,
"total_flos": 718744629805056.0,
"train_loss": 0.414989875806939,
"train_runtime": 39904.848,
"train_samples_per_second": 1.608,
"train_steps_per_second": 0.013
}
],
"logging_steps": 5,
"max_steps": 504,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 718744629805056.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
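
For reference, a minimal sketch of how the log_history above could be inspected, assuming the file has been downloaded locally as trainer_state.json (the path and the output image name are illustrative, not part of this upload):

import json

import matplotlib.pyplot as plt

# Load the trainer state written by the HuggingFace Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging entries; the final entry holds run-level
# summary fields (train_loss, train_runtime, ...) rather than a "loss" value.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

# Plot the training-loss curve and the learning-rate schedule side by side.
fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
fig.savefig("trainer_state_curves.png")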