ItsAndy0's picture
Upload folder using huggingface_hub
a9d35b6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 798,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03766478342749529,
"grad_norm": 4.520346641540527,
"learning_rate": 0.0001986466165413534,
"loss": 1.7976,
"mean_token_accuracy": 0.5935400031507015,
"num_tokens": 6488.0,
"step": 10
},
{
"epoch": 0.07532956685499058,
"grad_norm": 4.291996955871582,
"learning_rate": 0.00019714285714285716,
"loss": 1.5155,
"mean_token_accuracy": 0.648256602883339,
"num_tokens": 12907.0,
"step": 20
},
{
"epoch": 0.11299435028248588,
"grad_norm": 4.346818447113037,
"learning_rate": 0.00019578947368421054,
"loss": 1.4412,
"mean_token_accuracy": 0.6501220405101776,
"num_tokens": 19242.0,
"step": 30
},
{
"epoch": 0.15065913370998116,
"grad_norm": 3.3988215923309326,
"learning_rate": 0.0001942857142857143,
"loss": 1.2435,
"mean_token_accuracy": 0.7024012058973312,
"num_tokens": 25647.0,
"step": 40
},
{
"epoch": 0.18832391713747645,
"grad_norm": 3.1845004558563232,
"learning_rate": 0.00019278195488721807,
"loss": 1.2935,
"mean_token_accuracy": 0.6964049726724625,
"num_tokens": 32099.0,
"step": 50
},
{
"epoch": 0.22598870056497175,
"grad_norm": 4.023508071899414,
"learning_rate": 0.0001912781954887218,
"loss": 1.1738,
"mean_token_accuracy": 0.7159222587943077,
"num_tokens": 38623.0,
"step": 60
},
{
"epoch": 0.263653483992467,
"grad_norm": 3.649766206741333,
"learning_rate": 0.0001897744360902256,
"loss": 1.0487,
"mean_token_accuracy": 0.735117182135582,
"num_tokens": 45055.0,
"step": 70
},
{
"epoch": 0.3013182674199623,
"grad_norm": 4.424032688140869,
"learning_rate": 0.00018827067669172933,
"loss": 1.0402,
"mean_token_accuracy": 0.7437395498156547,
"num_tokens": 51352.0,
"step": 80
},
{
"epoch": 0.3389830508474576,
"grad_norm": 3.893348455429077,
"learning_rate": 0.0001867669172932331,
"loss": 0.9936,
"mean_token_accuracy": 0.7585602343082428,
"num_tokens": 57835.0,
"step": 90
},
{
"epoch": 0.3766478342749529,
"grad_norm": 4.915287494659424,
"learning_rate": 0.00018526315789473685,
"loss": 0.9204,
"mean_token_accuracy": 0.7651444330811501,
"num_tokens": 64237.0,
"step": 100
},
{
"epoch": 0.4143126177024482,
"grad_norm": 3.8559658527374268,
"learning_rate": 0.00018375939849624062,
"loss": 0.9346,
"mean_token_accuracy": 0.7480458408594132,
"num_tokens": 70645.0,
"step": 110
},
{
"epoch": 0.4519774011299435,
"grad_norm": 4.7108025550842285,
"learning_rate": 0.00018225563909774438,
"loss": 0.8954,
"mean_token_accuracy": 0.777623999118805,
"num_tokens": 76950.0,
"step": 120
},
{
"epoch": 0.4896421845574388,
"grad_norm": 3.1379129886627197,
"learning_rate": 0.00018075187969924814,
"loss": 0.9075,
"mean_token_accuracy": 0.769656004011631,
"num_tokens": 83477.0,
"step": 130
},
{
"epoch": 0.527306967984934,
"grad_norm": 3.6787190437316895,
"learning_rate": 0.00017924812030075188,
"loss": 0.8276,
"mean_token_accuracy": 0.7966844961047173,
"num_tokens": 89666.0,
"step": 140
},
{
"epoch": 0.5649717514124294,
"grad_norm": 3.936288595199585,
"learning_rate": 0.00017774436090225567,
"loss": 0.754,
"mean_token_accuracy": 0.7978162422776223,
"num_tokens": 96045.0,
"step": 150
},
{
"epoch": 0.6026365348399246,
"grad_norm": 3.9623541831970215,
"learning_rate": 0.0001762406015037594,
"loss": 0.7443,
"mean_token_accuracy": 0.8057964265346527,
"num_tokens": 102612.0,
"step": 160
},
{
"epoch": 0.64030131826742,
"grad_norm": 3.3509137630462646,
"learning_rate": 0.00017473684210526317,
"loss": 0.7718,
"mean_token_accuracy": 0.8012749120593071,
"num_tokens": 109028.0,
"step": 170
},
{
"epoch": 0.6779661016949152,
"grad_norm": 3.897428512573242,
"learning_rate": 0.00017323308270676693,
"loss": 0.7048,
"mean_token_accuracy": 0.8161927372217178,
"num_tokens": 115373.0,
"step": 180
},
{
"epoch": 0.7156308851224106,
"grad_norm": 2.7902867794036865,
"learning_rate": 0.0001717293233082707,
"loss": 0.6571,
"mean_token_accuracy": 0.8328385710716247,
"num_tokens": 121714.0,
"step": 190
},
{
"epoch": 0.7532956685499058,
"grad_norm": 3.233376979827881,
"learning_rate": 0.00017022556390977443,
"loss": 0.6449,
"mean_token_accuracy": 0.8228127270936966,
"num_tokens": 128163.0,
"step": 200
},
{
"epoch": 0.7909604519774012,
"grad_norm": 3.536149501800537,
"learning_rate": 0.00016872180451127822,
"loss": 0.5095,
"mean_token_accuracy": 0.8665768191218376,
"num_tokens": 134507.0,
"step": 210
},
{
"epoch": 0.8286252354048964,
"grad_norm": 3.885110378265381,
"learning_rate": 0.00016721804511278196,
"loss": 0.6011,
"mean_token_accuracy": 0.8428927510976791,
"num_tokens": 140804.0,
"step": 220
},
{
"epoch": 0.8662900188323918,
"grad_norm": 3.187678098678589,
"learning_rate": 0.00016571428571428575,
"loss": 0.6367,
"mean_token_accuracy": 0.8326474368572235,
"num_tokens": 147072.0,
"step": 230
},
{
"epoch": 0.903954802259887,
"grad_norm": 3.375328779220581,
"learning_rate": 0.00016421052631578948,
"loss": 0.5193,
"mean_token_accuracy": 0.8652853280305862,
"num_tokens": 153356.0,
"step": 240
},
{
"epoch": 0.9416195856873822,
"grad_norm": 3.7706921100616455,
"learning_rate": 0.00016270676691729325,
"loss": 0.5301,
"mean_token_accuracy": 0.8526319354772568,
"num_tokens": 159757.0,
"step": 250
},
{
"epoch": 0.9792843691148776,
"grad_norm": 3.51759934425354,
"learning_rate": 0.00016135338345864663,
"loss": 0.5719,
"mean_token_accuracy": 0.847623547911644,
"num_tokens": 166079.0,
"step": 260
},
{
"epoch": 1.015065913370998,
"grad_norm": 3.8684723377227783,
"learning_rate": 0.0001598496240601504,
"loss": 0.3488,
"mean_token_accuracy": 0.8969152930535769,
"num_tokens": 172086.0,
"step": 270
},
{
"epoch": 1.0527306967984935,
"grad_norm": 2.8484208583831787,
"learning_rate": 0.00015834586466165416,
"loss": 0.2515,
"mean_token_accuracy": 0.9283478140830994,
"num_tokens": 178513.0,
"step": 280
},
{
"epoch": 1.0903954802259888,
"grad_norm": 2.833587884902954,
"learning_rate": 0.0001568421052631579,
"loss": 0.2459,
"mean_token_accuracy": 0.9338237583637238,
"num_tokens": 184863.0,
"step": 290
},
{
"epoch": 1.128060263653484,
"grad_norm": 2.4005439281463623,
"learning_rate": 0.00015533834586466168,
"loss": 0.2355,
"mean_token_accuracy": 0.9258405908942222,
"num_tokens": 191326.0,
"step": 300
},
{
"epoch": 1.1657250470809792,
"grad_norm": 2.199273109436035,
"learning_rate": 0.00015383458646616542,
"loss": 0.28,
"mean_token_accuracy": 0.9157944962382316,
"num_tokens": 197503.0,
"step": 310
},
{
"epoch": 1.2033898305084745,
"grad_norm": 1.9038857221603394,
"learning_rate": 0.00015233082706766918,
"loss": 0.3046,
"mean_token_accuracy": 0.9144061759114266,
"num_tokens": 203923.0,
"step": 320
},
{
"epoch": 1.24105461393597,
"grad_norm": 4.188675880432129,
"learning_rate": 0.00015082706766917294,
"loss": 0.2209,
"mean_token_accuracy": 0.9331182524561882,
"num_tokens": 210174.0,
"step": 330
},
{
"epoch": 1.2787193973634652,
"grad_norm": 3.8147690296173096,
"learning_rate": 0.0001493233082706767,
"loss": 0.2594,
"mean_token_accuracy": 0.9269167870283127,
"num_tokens": 216633.0,
"step": 340
},
{
"epoch": 1.3163841807909604,
"grad_norm": 2.800786256790161,
"learning_rate": 0.00014781954887218047,
"loss": 0.2326,
"mean_token_accuracy": 0.931777173280716,
"num_tokens": 223210.0,
"step": 350
},
{
"epoch": 1.3540489642184557,
"grad_norm": 3.3343842029571533,
"learning_rate": 0.00014631578947368423,
"loss": 0.2586,
"mean_token_accuracy": 0.9242904737591744,
"num_tokens": 229716.0,
"step": 360
},
{
"epoch": 1.3917137476459511,
"grad_norm": 2.7086610794067383,
"learning_rate": 0.00014481203007518797,
"loss": 0.2315,
"mean_token_accuracy": 0.9330500423908233,
"num_tokens": 236091.0,
"step": 370
},
{
"epoch": 1.4293785310734464,
"grad_norm": 2.444577693939209,
"learning_rate": 0.00014330827067669176,
"loss": 0.262,
"mean_token_accuracy": 0.9290693372488021,
"num_tokens": 242422.0,
"step": 380
},
{
"epoch": 1.4670433145009416,
"grad_norm": 2.5115718841552734,
"learning_rate": 0.0001418045112781955,
"loss": 0.2257,
"mean_token_accuracy": 0.9272137597203255,
"num_tokens": 248767.0,
"step": 390
},
{
"epoch": 1.5047080979284368,
"grad_norm": 2.618799924850464,
"learning_rate": 0.00014030075187969926,
"loss": 0.2231,
"mean_token_accuracy": 0.9364230826497077,
"num_tokens": 254965.0,
"step": 400
},
{
"epoch": 1.542372881355932,
"grad_norm": 3.186830997467041,
"learning_rate": 0.00013879699248120302,
"loss": 0.2071,
"mean_token_accuracy": 0.9404660388827324,
"num_tokens": 261348.0,
"step": 410
},
{
"epoch": 1.5800376647834273,
"grad_norm": 3.5681018829345703,
"learning_rate": 0.00013729323308270676,
"loss": 0.2567,
"mean_token_accuracy": 0.9265485420823097,
"num_tokens": 267657.0,
"step": 420
},
{
"epoch": 1.6177024482109228,
"grad_norm": 2.8113648891448975,
"learning_rate": 0.00013578947368421055,
"loss": 0.2537,
"mean_token_accuracy": 0.9251923531293869,
"num_tokens": 274158.0,
"step": 430
},
{
"epoch": 1.655367231638418,
"grad_norm": 3.763946771621704,
"learning_rate": 0.00013428571428571428,
"loss": 0.2339,
"mean_token_accuracy": 0.930957356095314,
"num_tokens": 280543.0,
"step": 440
},
{
"epoch": 1.6930320150659135,
"grad_norm": 2.3775360584259033,
"learning_rate": 0.00013278195488721804,
"loss": 0.1749,
"mean_token_accuracy": 0.9463336855173111,
"num_tokens": 286815.0,
"step": 450
},
{
"epoch": 1.7306967984934087,
"grad_norm": 6.756045818328857,
"learning_rate": 0.0001312781954887218,
"loss": 0.181,
"mean_token_accuracy": 0.9514397040009499,
"num_tokens": 293146.0,
"step": 460
},
{
"epoch": 1.768361581920904,
"grad_norm": 1.9489086866378784,
"learning_rate": 0.00012977443609022557,
"loss": 0.173,
"mean_token_accuracy": 0.947341488301754,
"num_tokens": 299536.0,
"step": 470
},
{
"epoch": 1.8060263653483992,
"grad_norm": 2.5123419761657715,
"learning_rate": 0.0001282706766917293,
"loss": 0.2269,
"mean_token_accuracy": 0.9323122307658196,
"num_tokens": 305960.0,
"step": 480
},
{
"epoch": 1.8436911487758945,
"grad_norm": 2.6642258167266846,
"learning_rate": 0.0001267669172932331,
"loss": 0.2046,
"mean_token_accuracy": 0.9483947545289994,
"num_tokens": 312339.0,
"step": 490
},
{
"epoch": 1.8813559322033897,
"grad_norm": 2.118013620376587,
"learning_rate": 0.00012526315789473683,
"loss": 0.1925,
"mean_token_accuracy": 0.9490439668297768,
"num_tokens": 318841.0,
"step": 500
},
{
"epoch": 1.9190207156308852,
"grad_norm": 1.8715368509292603,
"learning_rate": 0.0001237593984962406,
"loss": 0.1834,
"mean_token_accuracy": 0.94656672924757,
"num_tokens": 325239.0,
"step": 510
},
{
"epoch": 1.9566854990583804,
"grad_norm": 4.930976867675781,
"learning_rate": 0.00012225563909774436,
"loss": 0.1391,
"mean_token_accuracy": 0.9627918288111686,
"num_tokens": 331667.0,
"step": 520
},
{
"epoch": 1.9943502824858759,
"grad_norm": 2.284675359725952,
"learning_rate": 0.00012075187969924812,
"loss": 0.1153,
"mean_token_accuracy": 0.9665953874588012,
"num_tokens": 338092.0,
"step": 530
},
{
"epoch": 2.030131826741996,
"grad_norm": 3.0949432849884033,
"learning_rate": 0.00011924812030075187,
"loss": 0.0908,
"mean_token_accuracy": 0.9738617178640867,
"num_tokens": 344133.0,
"step": 540
},
{
"epoch": 2.0677966101694913,
"grad_norm": 2.162048578262329,
"learning_rate": 0.00011774436090225565,
"loss": 0.0896,
"mean_token_accuracy": 0.9742276027798653,
"num_tokens": 350507.0,
"step": 550
},
{
"epoch": 2.105461393596987,
"grad_norm": 2.7815120220184326,
"learning_rate": 0.0001162406015037594,
"loss": 0.1341,
"mean_token_accuracy": 0.9573554307222366,
"num_tokens": 356943.0,
"step": 560
},
{
"epoch": 2.1431261770244823,
"grad_norm": 1.86222505569458,
"learning_rate": 0.00011473684210526316,
"loss": 0.0943,
"mean_token_accuracy": 0.9724091812968254,
"num_tokens": 363213.0,
"step": 570
},
{
"epoch": 2.1807909604519775,
"grad_norm": 0.9379479885101318,
"learning_rate": 0.00011323308270676691,
"loss": 0.0797,
"mean_token_accuracy": 0.9795262023806572,
"num_tokens": 369571.0,
"step": 580
},
{
"epoch": 2.2184557438794728,
"grad_norm": 1.6179430484771729,
"learning_rate": 0.00011172932330827069,
"loss": 0.1002,
"mean_token_accuracy": 0.9702985122799873,
"num_tokens": 375929.0,
"step": 590
},
{
"epoch": 2.256120527306968,
"grad_norm": 3.739664077758789,
"learning_rate": 0.00011022556390977444,
"loss": 0.0895,
"mean_token_accuracy": 0.9730613097548485,
"num_tokens": 382286.0,
"step": 600
},
{
"epoch": 2.2937853107344632,
"grad_norm": 2.5955328941345215,
"learning_rate": 0.0001087218045112782,
"loss": 0.1093,
"mean_token_accuracy": 0.96891999989748,
"num_tokens": 388642.0,
"step": 610
},
{
"epoch": 2.3314500941619585,
"grad_norm": 1.9484935998916626,
"learning_rate": 0.00010721804511278195,
"loss": 0.0956,
"mean_token_accuracy": 0.9696866631507873,
"num_tokens": 394906.0,
"step": 620
},
{
"epoch": 2.3691148775894537,
"grad_norm": 1.6120738983154297,
"learning_rate": 0.00010571428571428572,
"loss": 0.0924,
"mean_token_accuracy": 0.9751150533556938,
"num_tokens": 401508.0,
"step": 630
},
{
"epoch": 2.406779661016949,
"grad_norm": 1.1444437503814697,
"learning_rate": 0.00010421052631578947,
"loss": 0.086,
"mean_token_accuracy": 0.9736846208572387,
"num_tokens": 407893.0,
"step": 640
},
{
"epoch": 2.4444444444444446,
"grad_norm": 1.6979761123657227,
"learning_rate": 0.00010270676691729324,
"loss": 0.083,
"mean_token_accuracy": 0.9740224435925484,
"num_tokens": 414233.0,
"step": 650
},
{
"epoch": 2.48210922787194,
"grad_norm": 0.8328177332878113,
"learning_rate": 0.00010120300751879699,
"loss": 0.0821,
"mean_token_accuracy": 0.9786466941237449,
"num_tokens": 420630.0,
"step": 660
},
{
"epoch": 2.519774011299435,
"grad_norm": 2.2006723880767822,
"learning_rate": 9.969924812030076e-05,
"loss": 0.0957,
"mean_token_accuracy": 0.9709162205457688,
"num_tokens": 426919.0,
"step": 670
},
{
"epoch": 2.5574387947269304,
"grad_norm": 1.046185851097107,
"learning_rate": 9.819548872180451e-05,
"loss": 0.0861,
"mean_token_accuracy": 0.9735636353492737,
"num_tokens": 433364.0,
"step": 680
},
{
"epoch": 2.5951035781544256,
"grad_norm": 3.8349199295043945,
"learning_rate": 9.669172932330828e-05,
"loss": 0.0865,
"mean_token_accuracy": 0.9770249351859093,
"num_tokens": 439584.0,
"step": 690
},
{
"epoch": 2.632768361581921,
"grad_norm": 1.839320421218872,
"learning_rate": 9.518796992481204e-05,
"loss": 0.0758,
"mean_token_accuracy": 0.976737704873085,
"num_tokens": 445907.0,
"step": 700
},
{
"epoch": 2.670433145009416,
"grad_norm": 2.620213031768799,
"learning_rate": 9.36842105263158e-05,
"loss": 0.0782,
"mean_token_accuracy": 0.9791989624500275,
"num_tokens": 452290.0,
"step": 710
},
{
"epoch": 2.7080979284369113,
"grad_norm": 3.2904560565948486,
"learning_rate": 9.218045112781955e-05,
"loss": 0.0723,
"mean_token_accuracy": 0.981065520644188,
"num_tokens": 458747.0,
"step": 720
},
{
"epoch": 2.7457627118644066,
"grad_norm": 1.8491514921188354,
"learning_rate": 9.067669172932331e-05,
"loss": 0.0775,
"mean_token_accuracy": 0.9784526824951172,
"num_tokens": 465249.0,
"step": 730
},
{
"epoch": 2.7834274952919023,
"grad_norm": 0.4412521421909332,
"learning_rate": 8.917293233082708e-05,
"loss": 0.0664,
"mean_token_accuracy": 0.9805421829223633,
"num_tokens": 471730.0,
"step": 740
},
{
"epoch": 2.8210922787193975,
"grad_norm": 3.756784439086914,
"learning_rate": 8.766917293233084e-05,
"loss": 0.0765,
"mean_token_accuracy": 0.9758831724524498,
"num_tokens": 478195.0,
"step": 750
},
{
"epoch": 2.8587570621468927,
"grad_norm": 1.6190953254699707,
"learning_rate": 8.616541353383459e-05,
"loss": 0.0578,
"mean_token_accuracy": 0.9829958915710449,
"num_tokens": 484653.0,
"step": 760
},
{
"epoch": 2.896421845574388,
"grad_norm": 0.9864803552627563,
"learning_rate": 8.466165413533835e-05,
"loss": 0.07,
"mean_token_accuracy": 0.9816947236657143,
"num_tokens": 490920.0,
"step": 770
},
{
"epoch": 2.934086629001883,
"grad_norm": 0.9541674852371216,
"learning_rate": 8.315789473684212e-05,
"loss": 0.0747,
"mean_token_accuracy": 0.9807368606328964,
"num_tokens": 497396.0,
"step": 780
},
{
"epoch": 2.9717514124293785,
"grad_norm": 2.457627058029175,
"learning_rate": 8.165413533834588e-05,
"loss": 0.061,
"mean_token_accuracy": 0.9817963764071465,
"num_tokens": 503722.0,
"step": 790
}
],
"logging_steps": 10,
"max_steps": 1330,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.324640904814592e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}