{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 266, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03766478342749529, "grad_norm": 4.520346641540527, "learning_rate": 0.0001986466165413534, "loss": 1.7976, "mean_token_accuracy": 0.5935400031507015, "num_tokens": 6488.0, "step": 10 }, { "epoch": 0.07532956685499058, "grad_norm": 4.291996955871582, "learning_rate": 0.00019714285714285716, "loss": 1.5155, "mean_token_accuracy": 0.648256602883339, "num_tokens": 12907.0, "step": 20 }, { "epoch": 0.11299435028248588, "grad_norm": 4.346818447113037, "learning_rate": 0.00019578947368421054, "loss": 1.4412, "mean_token_accuracy": 0.6501220405101776, "num_tokens": 19242.0, "step": 30 }, { "epoch": 0.15065913370998116, "grad_norm": 3.3988215923309326, "learning_rate": 0.0001942857142857143, "loss": 1.2435, "mean_token_accuracy": 0.7024012058973312, "num_tokens": 25647.0, "step": 40 }, { "epoch": 0.18832391713747645, "grad_norm": 3.1845004558563232, "learning_rate": 0.00019278195488721807, "loss": 1.2935, "mean_token_accuracy": 0.6964049726724625, "num_tokens": 32099.0, "step": 50 }, { "epoch": 0.22598870056497175, "grad_norm": 4.023508071899414, "learning_rate": 0.0001912781954887218, "loss": 1.1738, "mean_token_accuracy": 0.7159222587943077, "num_tokens": 38623.0, "step": 60 }, { "epoch": 0.263653483992467, "grad_norm": 3.649766206741333, "learning_rate": 0.0001897744360902256, "loss": 1.0487, "mean_token_accuracy": 0.735117182135582, "num_tokens": 45055.0, "step": 70 }, { "epoch": 0.3013182674199623, "grad_norm": 4.424032688140869, "learning_rate": 0.00018827067669172933, "loss": 1.0402, "mean_token_accuracy": 0.7437395498156547, "num_tokens": 51352.0, "step": 80 }, { "epoch": 0.3389830508474576, "grad_norm": 3.893348455429077, "learning_rate": 0.0001867669172932331, "loss": 0.9936, "mean_token_accuracy": 0.7585602343082428, "num_tokens": 57835.0, "step": 90 }, { "epoch": 0.3766478342749529, "grad_norm": 4.915287494659424, "learning_rate": 0.00018526315789473685, "loss": 0.9204, "mean_token_accuracy": 0.7651444330811501, "num_tokens": 64237.0, "step": 100 }, { "epoch": 0.4143126177024482, "grad_norm": 3.8559658527374268, "learning_rate": 0.00018375939849624062, "loss": 0.9346, "mean_token_accuracy": 0.7480458408594132, "num_tokens": 70645.0, "step": 110 }, { "epoch": 0.4519774011299435, "grad_norm": 4.7108025550842285, "learning_rate": 0.00018225563909774438, "loss": 0.8954, "mean_token_accuracy": 0.777623999118805, "num_tokens": 76950.0, "step": 120 }, { "epoch": 0.4896421845574388, "grad_norm": 3.1379129886627197, "learning_rate": 0.00018075187969924814, "loss": 0.9075, "mean_token_accuracy": 0.769656004011631, "num_tokens": 83477.0, "step": 130 }, { "epoch": 0.527306967984934, "grad_norm": 3.6787190437316895, "learning_rate": 0.00017924812030075188, "loss": 0.8276, "mean_token_accuracy": 0.7966844961047173, "num_tokens": 89666.0, "step": 140 }, { "epoch": 0.5649717514124294, "grad_norm": 3.936288595199585, "learning_rate": 0.00017774436090225567, "loss": 0.754, "mean_token_accuracy": 0.7978162422776223, "num_tokens": 96045.0, "step": 150 }, { "epoch": 0.6026365348399246, "grad_norm": 3.9623541831970215, "learning_rate": 0.0001762406015037594, "loss": 0.7443, "mean_token_accuracy": 0.8057964265346527, "num_tokens": 102612.0, "step": 160 }, { "epoch": 0.64030131826742, "grad_norm": 3.3509137630462646, "learning_rate": 0.00017473684210526317, "loss": 0.7718, "mean_token_accuracy": 0.8012749120593071, "num_tokens": 109028.0, "step": 170 }, { "epoch": 0.6779661016949152, "grad_norm": 3.897428512573242, "learning_rate": 0.00017323308270676693, "loss": 0.7048, "mean_token_accuracy": 0.8161927372217178, "num_tokens": 115373.0, "step": 180 }, { "epoch": 0.7156308851224106, "grad_norm": 2.7902867794036865, "learning_rate": 0.0001717293233082707, "loss": 0.6571, "mean_token_accuracy": 0.8328385710716247, "num_tokens": 121714.0, "step": 190 }, { "epoch": 0.7532956685499058, "grad_norm": 3.233376979827881, "learning_rate": 0.00017022556390977443, "loss": 0.6449, "mean_token_accuracy": 0.8228127270936966, "num_tokens": 128163.0, "step": 200 }, { "epoch": 0.7909604519774012, "grad_norm": 3.536149501800537, "learning_rate": 0.00016872180451127822, "loss": 0.5095, "mean_token_accuracy": 0.8665768191218376, "num_tokens": 134507.0, "step": 210 }, { "epoch": 0.8286252354048964, "grad_norm": 3.885110378265381, "learning_rate": 0.00016721804511278196, "loss": 0.6011, "mean_token_accuracy": 0.8428927510976791, "num_tokens": 140804.0, "step": 220 }, { "epoch": 0.8662900188323918, "grad_norm": 3.187678098678589, "learning_rate": 0.00016571428571428575, "loss": 0.6367, "mean_token_accuracy": 0.8326474368572235, "num_tokens": 147072.0, "step": 230 }, { "epoch": 0.903954802259887, "grad_norm": 3.375328779220581, "learning_rate": 0.00016421052631578948, "loss": 0.5193, "mean_token_accuracy": 0.8652853280305862, "num_tokens": 153356.0, "step": 240 }, { "epoch": 0.9416195856873822, "grad_norm": 3.7706921100616455, "learning_rate": 0.00016270676691729325, "loss": 0.5301, "mean_token_accuracy": 0.8526319354772568, "num_tokens": 159757.0, "step": 250 }, { "epoch": 0.9792843691148776, "grad_norm": 3.51759934425354, "learning_rate": 0.00016135338345864663, "loss": 0.5719, "mean_token_accuracy": 0.847623547911644, "num_tokens": 166079.0, "step": 260 } ], "logging_steps": 10, "max_steps": 1330, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7741528751800320.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }