{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 798, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03766478342749529, "grad_norm": 4.520346641540527, "learning_rate": 0.0001986466165413534, "loss": 1.7976, "mean_token_accuracy": 0.5935400031507015, "num_tokens": 6488.0, "step": 10 }, { "epoch": 0.07532956685499058, "grad_norm": 4.291996955871582, "learning_rate": 0.00019714285714285716, "loss": 1.5155, "mean_token_accuracy": 0.648256602883339, "num_tokens": 12907.0, "step": 20 }, { "epoch": 0.11299435028248588, "grad_norm": 4.346818447113037, "learning_rate": 0.00019578947368421054, "loss": 1.4412, "mean_token_accuracy": 0.6501220405101776, "num_tokens": 19242.0, "step": 30 }, { "epoch": 0.15065913370998116, "grad_norm": 3.3988215923309326, "learning_rate": 0.0001942857142857143, "loss": 1.2435, "mean_token_accuracy": 0.7024012058973312, "num_tokens": 25647.0, "step": 40 }, { "epoch": 0.18832391713747645, "grad_norm": 3.1845004558563232, "learning_rate": 0.00019278195488721807, "loss": 1.2935, "mean_token_accuracy": 0.6964049726724625, "num_tokens": 32099.0, "step": 50 }, { "epoch": 0.22598870056497175, "grad_norm": 4.023508071899414, "learning_rate": 0.0001912781954887218, "loss": 1.1738, "mean_token_accuracy": 0.7159222587943077, "num_tokens": 38623.0, "step": 60 }, { "epoch": 0.263653483992467, "grad_norm": 3.649766206741333, "learning_rate": 0.0001897744360902256, "loss": 1.0487, "mean_token_accuracy": 0.735117182135582, "num_tokens": 45055.0, "step": 70 }, { "epoch": 0.3013182674199623, "grad_norm": 4.424032688140869, "learning_rate": 0.00018827067669172933, "loss": 1.0402, "mean_token_accuracy": 0.7437395498156547, "num_tokens": 51352.0, "step": 80 }, { "epoch": 0.3389830508474576, "grad_norm": 3.893348455429077, "learning_rate": 0.0001867669172932331, "loss": 0.9936, "mean_token_accuracy": 0.7585602343082428, "num_tokens": 57835.0, "step": 90 }, { "epoch": 0.3766478342749529, "grad_norm": 4.915287494659424, "learning_rate": 0.00018526315789473685, "loss": 0.9204, "mean_token_accuracy": 0.7651444330811501, "num_tokens": 64237.0, "step": 100 }, { "epoch": 0.4143126177024482, "grad_norm": 3.8559658527374268, "learning_rate": 0.00018375939849624062, "loss": 0.9346, "mean_token_accuracy": 0.7480458408594132, "num_tokens": 70645.0, "step": 110 }, { "epoch": 0.4519774011299435, "grad_norm": 4.7108025550842285, "learning_rate": 0.00018225563909774438, "loss": 0.8954, "mean_token_accuracy": 0.777623999118805, "num_tokens": 76950.0, "step": 120 }, { "epoch": 0.4896421845574388, "grad_norm": 3.1379129886627197, "learning_rate": 0.00018075187969924814, "loss": 0.9075, "mean_token_accuracy": 0.769656004011631, "num_tokens": 83477.0, "step": 130 }, { "epoch": 0.527306967984934, "grad_norm": 3.6787190437316895, "learning_rate": 0.00017924812030075188, "loss": 0.8276, "mean_token_accuracy": 0.7966844961047173, "num_tokens": 89666.0, "step": 140 }, { "epoch": 0.5649717514124294, "grad_norm": 3.936288595199585, "learning_rate": 0.00017774436090225567, "loss": 0.754, "mean_token_accuracy": 0.7978162422776223, "num_tokens": 96045.0, "step": 150 }, { "epoch": 0.6026365348399246, "grad_norm": 3.9623541831970215, "learning_rate": 0.0001762406015037594, "loss": 0.7443, "mean_token_accuracy": 0.8057964265346527, "num_tokens": 102612.0, "step": 160 }, { "epoch": 0.64030131826742, "grad_norm": 3.3509137630462646, "learning_rate": 0.00017473684210526317, "loss": 0.7718, "mean_token_accuracy": 0.8012749120593071, "num_tokens": 109028.0, "step": 170 }, { "epoch": 0.6779661016949152, "grad_norm": 3.897428512573242, "learning_rate": 0.00017323308270676693, "loss": 0.7048, "mean_token_accuracy": 0.8161927372217178, "num_tokens": 115373.0, "step": 180 }, { "epoch": 0.7156308851224106, "grad_norm": 2.7902867794036865, "learning_rate": 0.0001717293233082707, "loss": 0.6571, "mean_token_accuracy": 0.8328385710716247, "num_tokens": 121714.0, "step": 190 }, { "epoch": 0.7532956685499058, "grad_norm": 3.233376979827881, "learning_rate": 0.00017022556390977443, "loss": 0.6449, "mean_token_accuracy": 0.8228127270936966, "num_tokens": 128163.0, "step": 200 }, { "epoch": 0.7909604519774012, "grad_norm": 3.536149501800537, "learning_rate": 0.00016872180451127822, "loss": 0.5095, "mean_token_accuracy": 0.8665768191218376, "num_tokens": 134507.0, "step": 210 }, { "epoch": 0.8286252354048964, "grad_norm": 3.885110378265381, "learning_rate": 0.00016721804511278196, "loss": 0.6011, "mean_token_accuracy": 0.8428927510976791, "num_tokens": 140804.0, "step": 220 }, { "epoch": 0.8662900188323918, "grad_norm": 3.187678098678589, "learning_rate": 0.00016571428571428575, "loss": 0.6367, "mean_token_accuracy": 0.8326474368572235, "num_tokens": 147072.0, "step": 230 }, { "epoch": 0.903954802259887, "grad_norm": 3.375328779220581, "learning_rate": 0.00016421052631578948, "loss": 0.5193, "mean_token_accuracy": 0.8652853280305862, "num_tokens": 153356.0, "step": 240 }, { "epoch": 0.9416195856873822, "grad_norm": 3.7706921100616455, "learning_rate": 0.00016270676691729325, "loss": 0.5301, "mean_token_accuracy": 0.8526319354772568, "num_tokens": 159757.0, "step": 250 }, { "epoch": 0.9792843691148776, "grad_norm": 3.51759934425354, "learning_rate": 0.00016135338345864663, "loss": 0.5719, "mean_token_accuracy": 0.847623547911644, "num_tokens": 166079.0, "step": 260 }, { "epoch": 1.015065913370998, "grad_norm": 3.8684723377227783, "learning_rate": 0.0001598496240601504, "loss": 0.3488, "mean_token_accuracy": 0.8969152930535769, "num_tokens": 172086.0, "step": 270 }, { "epoch": 1.0527306967984935, "grad_norm": 2.8484208583831787, "learning_rate": 0.00015834586466165416, "loss": 0.2515, "mean_token_accuracy": 0.9283478140830994, "num_tokens": 178513.0, "step": 280 }, { "epoch": 1.0903954802259888, "grad_norm": 2.833587884902954, "learning_rate": 0.0001568421052631579, "loss": 0.2459, "mean_token_accuracy": 0.9338237583637238, "num_tokens": 184863.0, "step": 290 }, { "epoch": 1.128060263653484, "grad_norm": 2.4005439281463623, "learning_rate": 0.00015533834586466168, "loss": 0.2355, "mean_token_accuracy": 0.9258405908942222, "num_tokens": 191326.0, "step": 300 }, { "epoch": 1.1657250470809792, "grad_norm": 2.199273109436035, "learning_rate": 0.00015383458646616542, "loss": 0.28, "mean_token_accuracy": 0.9157944962382316, "num_tokens": 197503.0, "step": 310 }, { "epoch": 1.2033898305084745, "grad_norm": 1.9038857221603394, "learning_rate": 0.00015233082706766918, "loss": 0.3046, "mean_token_accuracy": 0.9144061759114266, "num_tokens": 203923.0, "step": 320 }, { "epoch": 1.24105461393597, "grad_norm": 4.188675880432129, "learning_rate": 0.00015082706766917294, "loss": 0.2209, "mean_token_accuracy": 0.9331182524561882, "num_tokens": 210174.0, "step": 330 }, { "epoch": 1.2787193973634652, "grad_norm": 3.8147690296173096, "learning_rate": 0.0001493233082706767, "loss": 0.2594, "mean_token_accuracy": 0.9269167870283127, "num_tokens": 216633.0, "step": 340 }, { "epoch": 1.3163841807909604, "grad_norm": 2.800786256790161, "learning_rate": 0.00014781954887218047, "loss": 0.2326, "mean_token_accuracy": 0.931777173280716, "num_tokens": 223210.0, "step": 350 }, { "epoch": 1.3540489642184557, "grad_norm": 3.3343842029571533, "learning_rate": 0.00014631578947368423, "loss": 0.2586, "mean_token_accuracy": 0.9242904737591744, "num_tokens": 229716.0, "step": 360 }, { "epoch": 1.3917137476459511, "grad_norm": 2.7086610794067383, "learning_rate": 0.00014481203007518797, "loss": 0.2315, "mean_token_accuracy": 0.9330500423908233, "num_tokens": 236091.0, "step": 370 }, { "epoch": 1.4293785310734464, "grad_norm": 2.444577693939209, "learning_rate": 0.00014330827067669176, "loss": 0.262, "mean_token_accuracy": 0.9290693372488021, "num_tokens": 242422.0, "step": 380 }, { "epoch": 1.4670433145009416, "grad_norm": 2.5115718841552734, "learning_rate": 0.0001418045112781955, "loss": 0.2257, "mean_token_accuracy": 0.9272137597203255, "num_tokens": 248767.0, "step": 390 }, { "epoch": 1.5047080979284368, "grad_norm": 2.618799924850464, "learning_rate": 0.00014030075187969926, "loss": 0.2231, "mean_token_accuracy": 0.9364230826497077, "num_tokens": 254965.0, "step": 400 }, { "epoch": 1.542372881355932, "grad_norm": 3.186830997467041, "learning_rate": 0.00013879699248120302, "loss": 0.2071, "mean_token_accuracy": 0.9404660388827324, "num_tokens": 261348.0, "step": 410 }, { "epoch": 1.5800376647834273, "grad_norm": 3.5681018829345703, "learning_rate": 0.00013729323308270676, "loss": 0.2567, "mean_token_accuracy": 0.9265485420823097, "num_tokens": 267657.0, "step": 420 }, { "epoch": 1.6177024482109228, "grad_norm": 2.8113648891448975, "learning_rate": 0.00013578947368421055, "loss": 0.2537, "mean_token_accuracy": 0.9251923531293869, "num_tokens": 274158.0, "step": 430 }, { "epoch": 1.655367231638418, "grad_norm": 3.763946771621704, "learning_rate": 0.00013428571428571428, "loss": 0.2339, "mean_token_accuracy": 0.930957356095314, "num_tokens": 280543.0, "step": 440 }, { "epoch": 1.6930320150659135, "grad_norm": 2.3775360584259033, "learning_rate": 0.00013278195488721804, "loss": 0.1749, "mean_token_accuracy": 0.9463336855173111, "num_tokens": 286815.0, "step": 450 }, { "epoch": 1.7306967984934087, "grad_norm": 6.756045818328857, "learning_rate": 0.0001312781954887218, "loss": 0.181, "mean_token_accuracy": 0.9514397040009499, "num_tokens": 293146.0, "step": 460 }, { "epoch": 1.768361581920904, "grad_norm": 1.9489086866378784, "learning_rate": 0.00012977443609022557, "loss": 0.173, "mean_token_accuracy": 0.947341488301754, "num_tokens": 299536.0, "step": 470 }, { "epoch": 1.8060263653483992, "grad_norm": 2.5123419761657715, "learning_rate": 0.0001282706766917293, "loss": 0.2269, "mean_token_accuracy": 0.9323122307658196, "num_tokens": 305960.0, "step": 480 }, { "epoch": 1.8436911487758945, "grad_norm": 2.6642258167266846, "learning_rate": 0.0001267669172932331, "loss": 0.2046, "mean_token_accuracy": 0.9483947545289994, "num_tokens": 312339.0, "step": 490 }, { "epoch": 1.8813559322033897, "grad_norm": 2.118013620376587, "learning_rate": 0.00012526315789473683, "loss": 0.1925, "mean_token_accuracy": 0.9490439668297768, "num_tokens": 318841.0, "step": 500 }, { "epoch": 1.9190207156308852, "grad_norm": 1.8715368509292603, "learning_rate": 0.0001237593984962406, "loss": 0.1834, "mean_token_accuracy": 0.94656672924757, "num_tokens": 325239.0, "step": 510 }, { "epoch": 1.9566854990583804, "grad_norm": 4.930976867675781, "learning_rate": 0.00012225563909774436, "loss": 0.1391, "mean_token_accuracy": 0.9627918288111686, "num_tokens": 331667.0, "step": 520 }, { "epoch": 1.9943502824858759, "grad_norm": 2.284675359725952, "learning_rate": 0.00012075187969924812, "loss": 0.1153, "mean_token_accuracy": 0.9665953874588012, "num_tokens": 338092.0, "step": 530 }, { "epoch": 2.030131826741996, "grad_norm": 3.0949432849884033, "learning_rate": 0.00011924812030075187, "loss": 0.0908, "mean_token_accuracy": 0.9738617178640867, "num_tokens": 344133.0, "step": 540 }, { "epoch": 2.0677966101694913, "grad_norm": 2.162048578262329, "learning_rate": 0.00011774436090225565, "loss": 0.0896, "mean_token_accuracy": 0.9742276027798653, "num_tokens": 350507.0, "step": 550 }, { "epoch": 2.105461393596987, "grad_norm": 2.7815120220184326, "learning_rate": 0.0001162406015037594, "loss": 0.1341, "mean_token_accuracy": 0.9573554307222366, "num_tokens": 356943.0, "step": 560 }, { "epoch": 2.1431261770244823, "grad_norm": 1.86222505569458, "learning_rate": 0.00011473684210526316, "loss": 0.0943, "mean_token_accuracy": 0.9724091812968254, "num_tokens": 363213.0, "step": 570 }, { "epoch": 2.1807909604519775, "grad_norm": 0.9379479885101318, "learning_rate": 0.00011323308270676691, "loss": 0.0797, "mean_token_accuracy": 0.9795262023806572, "num_tokens": 369571.0, "step": 580 }, { "epoch": 2.2184557438794728, "grad_norm": 1.6179430484771729, "learning_rate": 0.00011172932330827069, "loss": 0.1002, "mean_token_accuracy": 0.9702985122799873, "num_tokens": 375929.0, "step": 590 }, { "epoch": 2.256120527306968, "grad_norm": 3.739664077758789, "learning_rate": 0.00011022556390977444, "loss": 0.0895, "mean_token_accuracy": 0.9730613097548485, "num_tokens": 382286.0, "step": 600 }, { "epoch": 2.2937853107344632, "grad_norm": 2.5955328941345215, "learning_rate": 0.0001087218045112782, "loss": 0.1093, "mean_token_accuracy": 0.96891999989748, "num_tokens": 388642.0, "step": 610 }, { "epoch": 2.3314500941619585, "grad_norm": 1.9484935998916626, "learning_rate": 0.00010721804511278195, "loss": 0.0956, "mean_token_accuracy": 0.9696866631507873, "num_tokens": 394906.0, "step": 620 }, { "epoch": 2.3691148775894537, "grad_norm": 1.6120738983154297, "learning_rate": 0.00010571428571428572, "loss": 0.0924, "mean_token_accuracy": 0.9751150533556938, "num_tokens": 401508.0, "step": 630 }, { "epoch": 2.406779661016949, "grad_norm": 1.1444437503814697, "learning_rate": 0.00010421052631578947, "loss": 0.086, "mean_token_accuracy": 0.9736846208572387, "num_tokens": 407893.0, "step": 640 }, { "epoch": 2.4444444444444446, "grad_norm": 1.6979761123657227, "learning_rate": 0.00010270676691729324, "loss": 0.083, "mean_token_accuracy": 0.9740224435925484, "num_tokens": 414233.0, "step": 650 }, { "epoch": 2.48210922787194, "grad_norm": 0.8328177332878113, "learning_rate": 0.00010120300751879699, "loss": 0.0821, "mean_token_accuracy": 0.9786466941237449, "num_tokens": 420630.0, "step": 660 }, { "epoch": 2.519774011299435, "grad_norm": 2.2006723880767822, "learning_rate": 9.969924812030076e-05, "loss": 0.0957, "mean_token_accuracy": 0.9709162205457688, "num_tokens": 426919.0, "step": 670 }, { "epoch": 2.5574387947269304, "grad_norm": 1.046185851097107, "learning_rate": 9.819548872180451e-05, "loss": 0.0861, "mean_token_accuracy": 0.9735636353492737, "num_tokens": 433364.0, "step": 680 }, { "epoch": 2.5951035781544256, "grad_norm": 3.8349199295043945, "learning_rate": 9.669172932330828e-05, "loss": 0.0865, "mean_token_accuracy": 0.9770249351859093, "num_tokens": 439584.0, "step": 690 }, { "epoch": 2.632768361581921, "grad_norm": 1.839320421218872, "learning_rate": 9.518796992481204e-05, "loss": 0.0758, "mean_token_accuracy": 0.976737704873085, "num_tokens": 445907.0, "step": 700 }, { "epoch": 2.670433145009416, "grad_norm": 2.620213031768799, "learning_rate": 9.36842105263158e-05, "loss": 0.0782, "mean_token_accuracy": 0.9791989624500275, "num_tokens": 452290.0, "step": 710 }, { "epoch": 2.7080979284369113, "grad_norm": 3.2904560565948486, "learning_rate": 9.218045112781955e-05, "loss": 0.0723, "mean_token_accuracy": 0.981065520644188, "num_tokens": 458747.0, "step": 720 }, { "epoch": 2.7457627118644066, "grad_norm": 1.8491514921188354, "learning_rate": 9.067669172932331e-05, "loss": 0.0775, "mean_token_accuracy": 0.9784526824951172, "num_tokens": 465249.0, "step": 730 }, { "epoch": 2.7834274952919023, "grad_norm": 0.4412521421909332, "learning_rate": 8.917293233082708e-05, "loss": 0.0664, "mean_token_accuracy": 0.9805421829223633, "num_tokens": 471730.0, "step": 740 }, { "epoch": 2.8210922787193975, "grad_norm": 3.756784439086914, "learning_rate": 8.766917293233084e-05, "loss": 0.0765, "mean_token_accuracy": 0.9758831724524498, "num_tokens": 478195.0, "step": 750 }, { "epoch": 2.8587570621468927, "grad_norm": 1.6190953254699707, "learning_rate": 8.616541353383459e-05, "loss": 0.0578, "mean_token_accuracy": 0.9829958915710449, "num_tokens": 484653.0, "step": 760 }, { "epoch": 2.896421845574388, "grad_norm": 0.9864803552627563, "learning_rate": 8.466165413533835e-05, "loss": 0.07, "mean_token_accuracy": 0.9816947236657143, "num_tokens": 490920.0, "step": 770 }, { "epoch": 2.934086629001883, "grad_norm": 0.9541674852371216, "learning_rate": 8.315789473684212e-05, "loss": 0.0747, "mean_token_accuracy": 0.9807368606328964, "num_tokens": 497396.0, "step": 780 }, { "epoch": 2.9717514124293785, "grad_norm": 2.457627058029175, "learning_rate": 8.165413533834588e-05, "loss": 0.061, "mean_token_accuracy": 0.9817963764071465, "num_tokens": 503722.0, "step": 790 } ], "logging_steps": 10, "max_steps": 1330, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.324640904814592e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }