{ "best_metric": 0.9128336310386658, "best_model_checkpoint": "./outputs/public-irc-mistral-24b/checkpoint-24", "epoch": 0.9746192893401016, "eval_steps": 500, "global_step": 24, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04060913705583756, "grad_norm": 3.373054027557373, "learning_rate": 2e-05, "loss": 1.2957, "step": 1 }, { "epoch": 0.08121827411167512, "grad_norm": 3.194347381591797, "learning_rate": 4e-05, "loss": 1.3221, "step": 2 }, { "epoch": 0.1218274111675127, "grad_norm": 1.2986558675765991, "learning_rate": 6.000000000000001e-05, "loss": 1.2683, "step": 3 }, { "epoch": 0.16243654822335024, "grad_norm": 1.1605945825576782, "learning_rate": 8e-05, "loss": 1.1636, "step": 4 }, { "epoch": 0.20304568527918782, "grad_norm": 0.811213493347168, "learning_rate": 7.997668089464696e-05, "loss": 1.0964, "step": 5 }, { "epoch": 0.2436548223350254, "grad_norm": 0.7070867419242859, "learning_rate": 7.990675076762158e-05, "loss": 1.0897, "step": 6 }, { "epoch": 0.28426395939086296, "grad_norm": 0.674956738948822, "learning_rate": 7.97902911543238e-05, "loss": 1.0602, "step": 7 }, { "epoch": 0.3248730964467005, "grad_norm": 0.6653350591659546, "learning_rate": 7.962743784145323e-05, "loss": 1.0097, "step": 8 }, { "epoch": 0.36548223350253806, "grad_norm": 0.6503349542617798, "learning_rate": 7.941838070868787e-05, "loss": 1.0102, "step": 9 }, { "epoch": 0.40609137055837563, "grad_norm": 0.49681031703948975, "learning_rate": 7.916336350729293e-05, "loss": 1.0227, "step": 10 }, { "epoch": 0.4467005076142132, "grad_norm": 0.5602617263793945, "learning_rate": 7.886268357591766e-05, "loss": 0.9935, "step": 11 }, { "epoch": 0.4873096446700508, "grad_norm": 0.48682689666748047, "learning_rate": 7.851669149391198e-05, "loss": 0.9811, "step": 12 }, { "epoch": 0.5279187817258884, "grad_norm": 0.5210645198822021, "learning_rate": 7.812579067256644e-05, "loss": 0.9828, "step": 13 }, { "epoch": 0.5685279187817259, "grad_norm": 0.46042340993881226, "learning_rate": 7.769043688475283e-05, "loss": 0.9629, "step": 14 }, { "epoch": 0.6091370558375635, "grad_norm": 0.4750231206417084, "learning_rate": 7.721113773351333e-05, "loss": 0.9599, "step": 15 }, { "epoch": 0.649746192893401, "grad_norm": 0.44704967737197876, "learning_rate": 7.668845206021812e-05, "loss": 0.9417, "step": 16 }, { "epoch": 0.6903553299492385, "grad_norm": 0.45399850606918335, "learning_rate": 7.612298929298132e-05, "loss": 0.9442, "step": 17 }, { "epoch": 0.7309644670050761, "grad_norm": 0.48387065529823303, "learning_rate": 7.551540873609502e-05, "loss": 0.9388, "step": 18 }, { "epoch": 0.7715736040609137, "grad_norm": 0.4469858705997467, "learning_rate": 7.486641880131006e-05, "loss": 0.9357, "step": 19 }, { "epoch": 0.8121827411167513, "grad_norm": 0.4322652816772461, "learning_rate": 7.417677618185955e-05, "loss": 0.9191, "step": 20 }, { "epoch": 0.8527918781725888, "grad_norm": 0.4220181703567505, "learning_rate": 7.344728497018844e-05, "loss": 0.9269, "step": 21 }, { "epoch": 0.8934010152284264, "grad_norm": 0.4223184287548065, "learning_rate": 7.267879572041768e-05, "loss": 0.9092, "step": 22 }, { "epoch": 0.934010152284264, "grad_norm": 0.5329856276512146, "learning_rate": 7.187220445663618e-05, "loss": 0.8954, "step": 23 }, { "epoch": 0.9746192893401016, "grad_norm": 0.504860520362854, "learning_rate": 7.10284516281768e-05, "loss": 0.9145, "step": 24 }, { "epoch": 0.9746192893401016, "eval_loss": 0.9128336310386658, "eval_runtime": 89.3062, "eval_samples_per_second": 0.84, "eval_steps_per_second": 0.426, "step": 24 } ], "logging_steps": 1, "max_steps": 96, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.4736262854148096e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }