{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 798,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03766478342749529,
      "grad_norm": 4.520346641540527,
      "learning_rate": 0.0001986466165413534,
      "loss": 1.7976,
      "mean_token_accuracy": 0.5935400031507015,
      "num_tokens": 6488.0,
      "step": 10
    },
    {
      "epoch": 0.07532956685499058,
      "grad_norm": 4.291996955871582,
      "learning_rate": 0.00019714285714285716,
      "loss": 1.5155,
      "mean_token_accuracy": 0.648256602883339,
      "num_tokens": 12907.0,
      "step": 20
    },
    {
      "epoch": 0.11299435028248588,
      "grad_norm": 4.346818447113037,
      "learning_rate": 0.00019578947368421054,
      "loss": 1.4412,
      "mean_token_accuracy": 0.6501220405101776,
      "num_tokens": 19242.0,
      "step": 30
    },
    {
      "epoch": 0.15065913370998116,
      "grad_norm": 3.3988215923309326,
      "learning_rate": 0.0001942857142857143,
      "loss": 1.2435,
      "mean_token_accuracy": 0.7024012058973312,
      "num_tokens": 25647.0,
      "step": 40
    },
    {
      "epoch": 0.18832391713747645,
      "grad_norm": 3.1845004558563232,
      "learning_rate": 0.00019278195488721807,
      "loss": 1.2935,
      "mean_token_accuracy": 0.6964049726724625,
      "num_tokens": 32099.0,
      "step": 50
    },
    {
      "epoch": 0.22598870056497175,
      "grad_norm": 4.023508071899414,
      "learning_rate": 0.0001912781954887218,
      "loss": 1.1738,
      "mean_token_accuracy": 0.7159222587943077,
      "num_tokens": 38623.0,
      "step": 60
    },
    {
      "epoch": 0.263653483992467,
      "grad_norm": 3.649766206741333,
      "learning_rate": 0.0001897744360902256,
      "loss": 1.0487,
      "mean_token_accuracy": 0.735117182135582,
      "num_tokens": 45055.0,
      "step": 70
    },
    {
      "epoch": 0.3013182674199623,
      "grad_norm": 4.424032688140869,
      "learning_rate": 0.00018827067669172933,
      "loss": 1.0402,
      "mean_token_accuracy": 0.7437395498156547,
      "num_tokens": 51352.0,
      "step": 80
    },
    {
      "epoch": 0.3389830508474576,
      "grad_norm": 3.893348455429077,
      "learning_rate": 0.0001867669172932331,
      "loss": 0.9936,
      "mean_token_accuracy": 0.7585602343082428,
      "num_tokens": 57835.0,
      "step": 90
    },
    {
      "epoch": 0.3766478342749529,
      "grad_norm": 4.915287494659424,
      "learning_rate": 0.00018526315789473685,
      "loss": 0.9204,
      "mean_token_accuracy": 0.7651444330811501,
      "num_tokens": 64237.0,
      "step": 100
    },
    {
      "epoch": 0.4143126177024482,
      "grad_norm": 3.8559658527374268,
      "learning_rate": 0.00018375939849624062,
      "loss": 0.9346,
      "mean_token_accuracy": 0.7480458408594132,
      "num_tokens": 70645.0,
      "step": 110
    },
    {
      "epoch": 0.4519774011299435,
      "grad_norm": 4.7108025550842285,
      "learning_rate": 0.00018225563909774438,
      "loss": 0.8954,
      "mean_token_accuracy": 0.777623999118805,
      "num_tokens": 76950.0,
      "step": 120
    },
    {
      "epoch": 0.4896421845574388,
      "grad_norm": 3.1379129886627197,
      "learning_rate": 0.00018075187969924814,
      "loss": 0.9075,
      "mean_token_accuracy": 0.769656004011631,
      "num_tokens": 83477.0,
      "step": 130
    },
    {
      "epoch": 0.527306967984934,
      "grad_norm": 3.6787190437316895,
      "learning_rate": 0.00017924812030075188,
      "loss": 0.8276,
      "mean_token_accuracy": 0.7966844961047173,
      "num_tokens": 89666.0,
      "step": 140
    },
    {
      "epoch": 0.5649717514124294,
      "grad_norm": 3.936288595199585,
      "learning_rate": 0.00017774436090225567,
      "loss": 0.754,
      "mean_token_accuracy": 0.7978162422776223,
      "num_tokens": 96045.0,
      "step": 150
    },
    {
      "epoch": 0.6026365348399246,
      "grad_norm": 3.9623541831970215,
      "learning_rate": 0.0001762406015037594,
      "loss": 0.7443,
      "mean_token_accuracy": 0.8057964265346527,
      "num_tokens": 102612.0,
      "step": 160
    },
    {
      "epoch": 0.64030131826742,
      "grad_norm": 3.3509137630462646,
      "learning_rate": 0.00017473684210526317,
      "loss": 0.7718,
      "mean_token_accuracy": 0.8012749120593071,
      "num_tokens": 109028.0,
      "step": 170
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 3.897428512573242,
      "learning_rate": 0.00017323308270676693,
      "loss": 0.7048,
      "mean_token_accuracy": 0.8161927372217178,
      "num_tokens": 115373.0,
      "step": 180
    },
    {
      "epoch": 0.7156308851224106,
      "grad_norm": 2.7902867794036865,
      "learning_rate": 0.0001717293233082707,
      "loss": 0.6571,
      "mean_token_accuracy": 0.8328385710716247,
      "num_tokens": 121714.0,
      "step": 190
    },
    {
      "epoch": 0.7532956685499058,
      "grad_norm": 3.233376979827881,
      "learning_rate": 0.00017022556390977443,
      "loss": 0.6449,
      "mean_token_accuracy": 0.8228127270936966,
      "num_tokens": 128163.0,
      "step": 200
    },
    {
      "epoch": 0.7909604519774012,
      "grad_norm": 3.536149501800537,
      "learning_rate": 0.00016872180451127822,
      "loss": 0.5095,
      "mean_token_accuracy": 0.8665768191218376,
      "num_tokens": 134507.0,
      "step": 210
    },
    {
      "epoch": 0.8286252354048964,
      "grad_norm": 3.885110378265381,
      "learning_rate": 0.00016721804511278196,
      "loss": 0.6011,
      "mean_token_accuracy": 0.8428927510976791,
      "num_tokens": 140804.0,
      "step": 220
    },
    {
      "epoch": 0.8662900188323918,
      "grad_norm": 3.187678098678589,
      "learning_rate": 0.00016571428571428575,
      "loss": 0.6367,
      "mean_token_accuracy": 0.8326474368572235,
      "num_tokens": 147072.0,
      "step": 230
    },
    {
      "epoch": 0.903954802259887,
      "grad_norm": 3.375328779220581,
      "learning_rate": 0.00016421052631578948,
      "loss": 0.5193,
      "mean_token_accuracy": 0.8652853280305862,
      "num_tokens": 153356.0,
      "step": 240
    },
    {
      "epoch": 0.9416195856873822,
      "grad_norm": 3.7706921100616455,
      "learning_rate": 0.00016270676691729325,
      "loss": 0.5301,
      "mean_token_accuracy": 0.8526319354772568,
      "num_tokens": 159757.0,
      "step": 250
    },
    {
      "epoch": 0.9792843691148776,
      "grad_norm": 3.51759934425354,
      "learning_rate": 0.00016135338345864663,
      "loss": 0.5719,
      "mean_token_accuracy": 0.847623547911644,
      "num_tokens": 166079.0,
      "step": 260
    },
    {
      "epoch": 1.015065913370998,
      "grad_norm": 3.8684723377227783,
      "learning_rate": 0.0001598496240601504,
      "loss": 0.3488,
      "mean_token_accuracy": 0.8969152930535769,
      "num_tokens": 172086.0,
      "step": 270
    },
    {
      "epoch": 1.0527306967984935,
      "grad_norm": 2.8484208583831787,
      "learning_rate": 0.00015834586466165416,
      "loss": 0.2515,
      "mean_token_accuracy": 0.9283478140830994,
      "num_tokens": 178513.0,
      "step": 280
    },
    {
      "epoch": 1.0903954802259888,
      "grad_norm": 2.833587884902954,
      "learning_rate": 0.0001568421052631579,
      "loss": 0.2459,
      "mean_token_accuracy": 0.9338237583637238,
      "num_tokens": 184863.0,
      "step": 290
    },
    {
      "epoch": 1.128060263653484,
      "grad_norm": 2.4005439281463623,
      "learning_rate": 0.00015533834586466168,
      "loss": 0.2355,
      "mean_token_accuracy": 0.9258405908942222,
      "num_tokens": 191326.0,
      "step": 300
    },
    {
      "epoch": 1.1657250470809792,
      "grad_norm": 2.199273109436035,
      "learning_rate": 0.00015383458646616542,
      "loss": 0.28,
      "mean_token_accuracy": 0.9157944962382316,
      "num_tokens": 197503.0,
      "step": 310
    },
    {
      "epoch": 1.2033898305084745,
      "grad_norm": 1.9038857221603394,
      "learning_rate": 0.00015233082706766918,
      "loss": 0.3046,
      "mean_token_accuracy": 0.9144061759114266,
      "num_tokens": 203923.0,
      "step": 320
    },
    {
      "epoch": 1.24105461393597,
      "grad_norm": 4.188675880432129,
      "learning_rate": 0.00015082706766917294,
      "loss": 0.2209,
      "mean_token_accuracy": 0.9331182524561882,
      "num_tokens": 210174.0,
      "step": 330
    },
    {
      "epoch": 1.2787193973634652,
      "grad_norm": 3.8147690296173096,
      "learning_rate": 0.0001493233082706767,
      "loss": 0.2594,
      "mean_token_accuracy": 0.9269167870283127,
      "num_tokens": 216633.0,
      "step": 340
    },
    {
      "epoch": 1.3163841807909604,
      "grad_norm": 2.800786256790161,
      "learning_rate": 0.00014781954887218047,
      "loss": 0.2326,
      "mean_token_accuracy": 0.931777173280716,
      "num_tokens": 223210.0,
      "step": 350
    },
    {
      "epoch": 1.3540489642184557,
      "grad_norm": 3.3343842029571533,
      "learning_rate": 0.00014631578947368423,
      "loss": 0.2586,
      "mean_token_accuracy": 0.9242904737591744,
      "num_tokens": 229716.0,
      "step": 360
    },
    {
      "epoch": 1.3917137476459511,
      "grad_norm": 2.7086610794067383,
      "learning_rate": 0.00014481203007518797,
      "loss": 0.2315,
      "mean_token_accuracy": 0.9330500423908233,
      "num_tokens": 236091.0,
      "step": 370
    },
    {
      "epoch": 1.4293785310734464,
      "grad_norm": 2.444577693939209,
      "learning_rate": 0.00014330827067669176,
      "loss": 0.262,
      "mean_token_accuracy": 0.9290693372488021,
      "num_tokens": 242422.0,
      "step": 380
    },
    {
      "epoch": 1.4670433145009416,
      "grad_norm": 2.5115718841552734,
      "learning_rate": 0.0001418045112781955,
      "loss": 0.2257,
      "mean_token_accuracy": 0.9272137597203255,
      "num_tokens": 248767.0,
      "step": 390
    },
    {
      "epoch": 1.5047080979284368,
      "grad_norm": 2.618799924850464,
      "learning_rate": 0.00014030075187969926,
      "loss": 0.2231,
      "mean_token_accuracy": 0.9364230826497077,
      "num_tokens": 254965.0,
      "step": 400
    },
    {
      "epoch": 1.542372881355932,
      "grad_norm": 3.186830997467041,
      "learning_rate": 0.00013879699248120302,
      "loss": 0.2071,
      "mean_token_accuracy": 0.9404660388827324,
      "num_tokens": 261348.0,
      "step": 410
    },
    {
      "epoch": 1.5800376647834273,
      "grad_norm": 3.5681018829345703,
      "learning_rate": 0.00013729323308270676,
      "loss": 0.2567,
      "mean_token_accuracy": 0.9265485420823097,
      "num_tokens": 267657.0,
      "step": 420
    },
    {
      "epoch": 1.6177024482109228,
      "grad_norm": 2.8113648891448975,
      "learning_rate": 0.00013578947368421055,
      "loss": 0.2537,
      "mean_token_accuracy": 0.9251923531293869,
      "num_tokens": 274158.0,
      "step": 430
    },
    {
      "epoch": 1.655367231638418,
      "grad_norm": 3.763946771621704,
      "learning_rate": 0.00013428571428571428,
      "loss": 0.2339,
      "mean_token_accuracy": 0.930957356095314,
      "num_tokens": 280543.0,
      "step": 440
    },
    {
      "epoch": 1.6930320150659135,
      "grad_norm": 2.3775360584259033,
      "learning_rate": 0.00013278195488721804,
      "loss": 0.1749,
      "mean_token_accuracy": 0.9463336855173111,
      "num_tokens": 286815.0,
      "step": 450
    },
    {
      "epoch": 1.7306967984934087,
      "grad_norm": 6.756045818328857,
      "learning_rate": 0.0001312781954887218,
      "loss": 0.181,
      "mean_token_accuracy": 0.9514397040009499,
      "num_tokens": 293146.0,
      "step": 460
    },
    {
      "epoch": 1.768361581920904,
      "grad_norm": 1.9489086866378784,
      "learning_rate": 0.00012977443609022557,
      "loss": 0.173,
      "mean_token_accuracy": 0.947341488301754,
      "num_tokens": 299536.0,
      "step": 470
    },
    {
      "epoch": 1.8060263653483992,
      "grad_norm": 2.5123419761657715,
      "learning_rate": 0.0001282706766917293,
      "loss": 0.2269,
      "mean_token_accuracy": 0.9323122307658196,
      "num_tokens": 305960.0,
      "step": 480
    },
    {
      "epoch": 1.8436911487758945,
      "grad_norm": 2.6642258167266846,
      "learning_rate": 0.0001267669172932331,
      "loss": 0.2046,
      "mean_token_accuracy": 0.9483947545289994,
      "num_tokens": 312339.0,
      "step": 490
    },
    {
      "epoch": 1.8813559322033897,
      "grad_norm": 2.118013620376587,
      "learning_rate": 0.00012526315789473683,
      "loss": 0.1925,
      "mean_token_accuracy": 0.9490439668297768,
      "num_tokens": 318841.0,
      "step": 500
    },
    {
      "epoch": 1.9190207156308852,
      "grad_norm": 1.8715368509292603,
      "learning_rate": 0.0001237593984962406,
      "loss": 0.1834,
      "mean_token_accuracy": 0.94656672924757,
      "num_tokens": 325239.0,
      "step": 510
    },
    {
      "epoch": 1.9566854990583804,
      "grad_norm": 4.930976867675781,
      "learning_rate": 0.00012225563909774436,
      "loss": 0.1391,
      "mean_token_accuracy": 0.9627918288111686,
      "num_tokens": 331667.0,
      "step": 520
    },
    {
      "epoch": 1.9943502824858759,
      "grad_norm": 2.284675359725952,
      "learning_rate": 0.00012075187969924812,
      "loss": 0.1153,
      "mean_token_accuracy": 0.9665953874588012,
      "num_tokens": 338092.0,
      "step": 530
    },
    {
      "epoch": 2.030131826741996,
      "grad_norm": 3.0949432849884033,
      "learning_rate": 0.00011924812030075187,
      "loss": 0.0908,
      "mean_token_accuracy": 0.9738617178640867,
      "num_tokens": 344133.0,
      "step": 540
    },
    {
      "epoch": 2.0677966101694913,
      "grad_norm": 2.162048578262329,
      "learning_rate": 0.00011774436090225565,
      "loss": 0.0896,
      "mean_token_accuracy": 0.9742276027798653,
      "num_tokens": 350507.0,
      "step": 550
    },
    {
      "epoch": 2.105461393596987,
      "grad_norm": 2.7815120220184326,
      "learning_rate": 0.0001162406015037594,
      "loss": 0.1341,
      "mean_token_accuracy": 0.9573554307222366,
      "num_tokens": 356943.0,
      "step": 560
    },
    {
      "epoch": 2.1431261770244823,
      "grad_norm": 1.86222505569458,
      "learning_rate": 0.00011473684210526316,
      "loss": 0.0943,
      "mean_token_accuracy": 0.9724091812968254,
      "num_tokens": 363213.0,
      "step": 570
    },
    {
      "epoch": 2.1807909604519775,
      "grad_norm": 0.9379479885101318,
      "learning_rate": 0.00011323308270676691,
      "loss": 0.0797,
      "mean_token_accuracy": 0.9795262023806572,
      "num_tokens": 369571.0,
      "step": 580
    },
    {
      "epoch": 2.2184557438794728,
      "grad_norm": 1.6179430484771729,
      "learning_rate": 0.00011172932330827069,
      "loss": 0.1002,
      "mean_token_accuracy": 0.9702985122799873,
      "num_tokens": 375929.0,
      "step": 590
    },
    {
      "epoch": 2.256120527306968,
      "grad_norm": 3.739664077758789,
      "learning_rate": 0.00011022556390977444,
      "loss": 0.0895,
      "mean_token_accuracy": 0.9730613097548485,
      "num_tokens": 382286.0,
      "step": 600
    },
    {
      "epoch": 2.2937853107344632,
      "grad_norm": 2.5955328941345215,
      "learning_rate": 0.0001087218045112782,
      "loss": 0.1093,
      "mean_token_accuracy": 0.96891999989748,
      "num_tokens": 388642.0,
      "step": 610
    },
    {
      "epoch": 2.3314500941619585,
      "grad_norm": 1.9484935998916626,
      "learning_rate": 0.00010721804511278195,
      "loss": 0.0956,
      "mean_token_accuracy": 0.9696866631507873,
      "num_tokens": 394906.0,
      "step": 620
    },
    {
      "epoch": 2.3691148775894537,
      "grad_norm": 1.6120738983154297,
      "learning_rate": 0.00010571428571428572,
      "loss": 0.0924,
      "mean_token_accuracy": 0.9751150533556938,
      "num_tokens": 401508.0,
      "step": 630
    },
    {
      "epoch": 2.406779661016949,
      "grad_norm": 1.1444437503814697,
      "learning_rate": 0.00010421052631578947,
      "loss": 0.086,
      "mean_token_accuracy": 0.9736846208572387,
      "num_tokens": 407893.0,
      "step": 640
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 1.6979761123657227,
      "learning_rate": 0.00010270676691729324,
      "loss": 0.083,
      "mean_token_accuracy": 0.9740224435925484,
      "num_tokens": 414233.0,
      "step": 650
    },
    {
      "epoch": 2.48210922787194,
      "grad_norm": 0.8328177332878113,
      "learning_rate": 0.00010120300751879699,
      "loss": 0.0821,
      "mean_token_accuracy": 0.9786466941237449,
      "num_tokens": 420630.0,
      "step": 660
    },
    {
      "epoch": 2.519774011299435,
      "grad_norm": 2.2006723880767822,
      "learning_rate": 9.969924812030076e-05,
      "loss": 0.0957,
      "mean_token_accuracy": 0.9709162205457688,
      "num_tokens": 426919.0,
      "step": 670
    },
    {
      "epoch": 2.5574387947269304,
      "grad_norm": 1.046185851097107,
      "learning_rate": 9.819548872180451e-05,
      "loss": 0.0861,
      "mean_token_accuracy": 0.9735636353492737,
      "num_tokens": 433364.0,
      "step": 680
    },
    {
      "epoch": 2.5951035781544256,
      "grad_norm": 3.8349199295043945,
      "learning_rate": 9.669172932330828e-05,
      "loss": 0.0865,
      "mean_token_accuracy": 0.9770249351859093,
      "num_tokens": 439584.0,
      "step": 690
    },
    {
      "epoch": 2.632768361581921,
      "grad_norm": 1.839320421218872,
      "learning_rate": 9.518796992481204e-05,
      "loss": 0.0758,
      "mean_token_accuracy": 0.976737704873085,
      "num_tokens": 445907.0,
      "step": 700
    },
    {
      "epoch": 2.670433145009416,
      "grad_norm": 2.620213031768799,
      "learning_rate": 9.36842105263158e-05,
      "loss": 0.0782,
      "mean_token_accuracy": 0.9791989624500275,
      "num_tokens": 452290.0,
      "step": 710
    },
    {
      "epoch": 2.7080979284369113,
      "grad_norm": 3.2904560565948486,
      "learning_rate": 9.218045112781955e-05,
      "loss": 0.0723,
      "mean_token_accuracy": 0.981065520644188,
      "num_tokens": 458747.0,
      "step": 720
    },
    {
      "epoch": 2.7457627118644066,
      "grad_norm": 1.8491514921188354,
      "learning_rate": 9.067669172932331e-05,
      "loss": 0.0775,
      "mean_token_accuracy": 0.9784526824951172,
      "num_tokens": 465249.0,
      "step": 730
    },
    {
      "epoch": 2.7834274952919023,
      "grad_norm": 0.4412521421909332,
      "learning_rate": 8.917293233082708e-05,
      "loss": 0.0664,
      "mean_token_accuracy": 0.9805421829223633,
      "num_tokens": 471730.0,
      "step": 740
    },
    {
      "epoch": 2.8210922787193975,
      "grad_norm": 3.756784439086914,
      "learning_rate": 8.766917293233084e-05,
      "loss": 0.0765,
      "mean_token_accuracy": 0.9758831724524498,
      "num_tokens": 478195.0,
      "step": 750
    },
    {
      "epoch": 2.8587570621468927,
      "grad_norm": 1.6190953254699707,
      "learning_rate": 8.616541353383459e-05,
      "loss": 0.0578,
      "mean_token_accuracy": 0.9829958915710449,
      "num_tokens": 484653.0,
      "step": 760
    },
    {
      "epoch": 2.896421845574388,
      "grad_norm": 0.9864803552627563,
      "learning_rate": 8.466165413533835e-05,
      "loss": 0.07,
      "mean_token_accuracy": 0.9816947236657143,
      "num_tokens": 490920.0,
      "step": 770
    },
    {
      "epoch": 2.934086629001883,
      "grad_norm": 0.9541674852371216,
      "learning_rate": 8.315789473684212e-05,
      "loss": 0.0747,
      "mean_token_accuracy": 0.9807368606328964,
      "num_tokens": 497396.0,
      "step": 780
    },
    {
      "epoch": 2.9717514124293785,
      "grad_norm": 2.457627058029175,
      "learning_rate": 8.165413533834588e-05,
      "loss": 0.061,
      "mean_token_accuracy": 0.9817963764071465,
      "num_tokens": 503722.0,
      "step": 790
    }
  ],
  "logging_steps": 10,
  "max_steps": 1330,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.324640904814592e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}