{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100.0, "global_step": 218, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009223674096848577, "grad_norm": 0.8444623351097107, "learning_rate": 7.142857142857143e-07, "loss": 0.9354559779167175, "memory(GiB)": 31.86, "step": 1, "token_acc": 0.7403846153846154, "train_speed(iter/s)": 0.091482 }, { "epoch": 0.04611837048424289, "grad_norm": 0.86482173204422, "learning_rate": 3.5714285714285718e-06, "loss": 0.9165018796920776, "memory(GiB)": 38.56, "step": 5, "token_acc": 0.7583969031082773, "train_speed(iter/s)": 0.140848 }, { "epoch": 0.09223674096848578, "grad_norm": 0.9186728000640869, "learning_rate": 4.997506466835171e-06, "loss": 0.8853721618652344, "memory(GiB)": 38.56, "step": 10, "token_acc": 0.7468285807221078, "train_speed(iter/s)": 0.152667 }, { "epoch": 0.13835511145272866, "grad_norm": 1.031731128692627, "learning_rate": 4.982286218320023e-06, "loss": 0.9246766090393066, "memory(GiB)": 38.58, "step": 15, "token_acc": 0.7695923163379994, "train_speed(iter/s)": 0.15883 }, { "epoch": 0.18447348193697155, "grad_norm": 1.0214720964431763, "learning_rate": 4.953315228402512e-06, "loss": 0.8758598327636719, "memory(GiB)": 42.66, "step": 20, "token_acc": 0.7553402334287602, "train_speed(iter/s)": 0.159997 }, { "epoch": 0.23059185242121444, "grad_norm": 0.8352872729301453, "learning_rate": 4.910753983245589e-06, "loss": 0.9050235748291016, "memory(GiB)": 42.66, "step": 25, "token_acc": 0.72705078125, "train_speed(iter/s)": 0.160969 }, { "epoch": 0.2767102229054573, "grad_norm": 0.8205368518829346, "learning_rate": 4.854838252871097e-06, "loss": 0.8648523330688477, "memory(GiB)": 42.66, "step": 30, "token_acc": 0.7498213009292352, "train_speed(iter/s)": 0.161648 }, { "epoch": 0.3228285933897002, "grad_norm": 0.7823546528816223, "learning_rate": 4.785877785100633e-06, "loss": 0.827482032775879, "memory(GiB)": 42.66, "step": 35, "token_acc": 0.7509648370497427, "train_speed(iter/s)": 0.162779 }, { "epoch": 0.3689469638739431, "grad_norm": 0.7273194789886475, "learning_rate": 4.704254589692903e-06, "loss": 0.8310216903686524, "memory(GiB)": 42.66, "step": 40, "token_acc": 0.7523450244698205, "train_speed(iter/s)": 0.163121 }, { "epoch": 0.415065334358186, "grad_norm": 0.6990888714790344, "learning_rate": 4.610420822182671e-06, "loss": 0.8238757133483887, "memory(GiB)": 42.66, "step": 45, "token_acc": 0.7666873320934078, "train_speed(iter/s)": 0.163448 }, { "epoch": 0.4611837048424289, "grad_norm": 0.6725892424583435, "learning_rate": 4.5048962791438885e-06, "loss": 0.8079369544982911, "memory(GiB)": 42.66, "step": 50, "token_acc": 0.753395913859746, "train_speed(iter/s)": 0.163817 }, { "epoch": 0.5073020753266718, "grad_norm": 0.6294735074043274, "learning_rate": 4.388265518752085e-06, "loss": 0.7883205890655518, "memory(GiB)": 42.66, "step": 55, "token_acc": 0.7696396300627193, "train_speed(iter/s)": 0.164121 }, { "epoch": 0.5534204458109147, "grad_norm": 0.6453043818473816, "learning_rate": 4.261174622596835e-06, "loss": 0.7576101303100586, "memory(GiB)": 42.66, "step": 60, "token_acc": 0.7681740125930167, "train_speed(iter/s)": 0.164365 }, { "epoch": 0.5995388162951576, "grad_norm": 0.6130831241607666, "learning_rate": 4.124327616682362e-06, "loss": 0.7497624397277832, "memory(GiB)": 42.66, "step": 65, "token_acc": 0.7717697420299651, "train_speed(iter/s)": 0.164524 }, { "epoch": 0.6456571867794004, "grad_norm": 0.5963612198829651, "learning_rate": 3.978482571442339e-06, "loss": 0.7395827770233154, "memory(GiB)": 42.66, "step": 70, "token_acc": 0.7635451123531425, "train_speed(iter/s)": 0.165147 }, { "epoch": 0.6917755572636434, "grad_norm": 0.5419044494628906, "learning_rate": 3.8244474023730155e-06, "loss": 0.7070199489593506, "memory(GiB)": 42.66, "step": 75, "token_acc": 0.7876491956408925, "train_speed(iter/s)": 0.165396 }, { "epoch": 0.7378939277478862, "grad_norm": 0.6451313495635986, "learning_rate": 3.6630753945472854e-06, "loss": 0.7012932777404786, "memory(GiB)": 42.66, "step": 80, "token_acc": 0.7734261375441512, "train_speed(iter/s)": 0.165398 }, { "epoch": 0.7840122982321291, "grad_norm": 0.5576961636543274, "learning_rate": 3.495260475801841e-06, "loss": 0.6922179222106933, "memory(GiB)": 42.66, "step": 85, "token_acc": 0.786723163841808, "train_speed(iter/s)": 0.165703 }, { "epoch": 0.830130668716372, "grad_norm": 0.5406575202941895, "learning_rate": 3.321932264781822e-06, "loss": 0.6766807079315186, "memory(GiB)": 42.66, "step": 90, "token_acc": 0.7791790873652832, "train_speed(iter/s)": 0.166072 }, { "epoch": 0.8762490392006149, "grad_norm": 0.5346857309341431, "learning_rate": 3.1440509212745584e-06, "loss": 0.6620856285095215, "memory(GiB)": 42.66, "step": 95, "token_acc": 0.782299192520187, "train_speed(iter/s)": 0.166276 }, { "epoch": 0.9223674096848578, "grad_norm": 0.5536125898361206, "learning_rate": 2.962601827359208e-06, "loss": 0.6514341354370117, "memory(GiB)": 42.66, "step": 100, "token_acc": 0.7910814606741573, "train_speed(iter/s)": 0.166341 }, { "epoch": 0.9684857801691007, "grad_norm": 0.5314112305641174, "learning_rate": 2.7785901288363253e-06, "loss": 0.6576204299926758, "memory(GiB)": 42.66, "step": 105, "token_acc": 0.7999794955915522, "train_speed(iter/s)": 0.159886 }, { "epoch": 1.0092236740968485, "grad_norm": 0.5262110233306885, "learning_rate": 2.5930351671753707e-06, "loss": 0.6351086616516113, "memory(GiB)": 42.66, "step": 110, "token_acc": 0.7974320618299676, "train_speed(iter/s)": 0.161025 }, { "epoch": 1.0553420445810915, "grad_norm": 0.47874826192855835, "learning_rate": 2.4069648328246305e-06, "loss": 0.6372325420379639, "memory(GiB)": 42.66, "step": 115, "token_acc": 0.800898684069755, "train_speed(iter/s)": 0.161424 }, { "epoch": 1.1014604150653344, "grad_norm": 0.4817003011703491, "learning_rate": 2.221409871163675e-06, "loss": 0.6104538917541504, "memory(GiB)": 42.66, "step": 120, "token_acc": 0.7949481710949601, "train_speed(iter/s)": 0.16161 }, { "epoch": 1.1475787855495772, "grad_norm": 0.4979017376899719, "learning_rate": 2.037398172640793e-06, "loss": 0.6325568675994873, "memory(GiB)": 42.66, "step": 125, "token_acc": 0.7952386536882843, "train_speed(iter/s)": 0.161944 }, { "epoch": 1.19369715603382, "grad_norm": 0.4578051269054413, "learning_rate": 1.8559490787254423e-06, "loss": 0.6286015033721923, "memory(GiB)": 42.66, "step": 130, "token_acc": 0.795903701042041, "train_speed(iter/s)": 0.162238 }, { "epoch": 1.239815526518063, "grad_norm": 0.5420213937759399, "learning_rate": 1.6780677352181781e-06, "loss": 0.6169236183166504, "memory(GiB)": 42.66, "step": 135, "token_acc": 0.8036298049825273, "train_speed(iter/s)": 0.162512 }, { "epoch": 1.285933897002306, "grad_norm": 0.4351955056190491, "learning_rate": 1.5047395241981606e-06, "loss": 0.6146146774291992, "memory(GiB)": 42.66, "step": 140, "token_acc": 0.800178412132025, "train_speed(iter/s)": 0.162628 }, { "epoch": 1.332052267486549, "grad_norm": 0.48030343651771545, "learning_rate": 1.3369246054527152e-06, "loss": 0.6185456275939941, "memory(GiB)": 42.66, "step": 145, "token_acc": 0.7926472310033954, "train_speed(iter/s)": 0.162744 }, { "epoch": 1.3781706379707916, "grad_norm": 0.4361996352672577, "learning_rate": 1.1755525976269851e-06, "loss": 0.614862060546875, "memory(GiB)": 42.66, "step": 150, "token_acc": 0.8130265050339018, "train_speed(iter/s)": 0.16294 }, { "epoch": 1.4242890084550346, "grad_norm": 0.520553469657898, "learning_rate": 1.0215174285576615e-06, "loss": 0.6011684894561767, "memory(GiB)": 42.66, "step": 155, "token_acc": 0.814690350456073, "train_speed(iter/s)": 0.16305 }, { "epoch": 1.4704073789392775, "grad_norm": 0.4834914207458496, "learning_rate": 8.756723833176376e-07, "loss": 0.6098825454711914, "memory(GiB)": 42.66, "step": 160, "token_acc": 0.8096282487114815, "train_speed(iter/s)": 0.163347 }, { "epoch": 1.5165257494235203, "grad_norm": 0.45448240637779236, "learning_rate": 7.388253774031659e-07, "loss": 0.5952889442443847, "memory(GiB)": 42.66, "step": 165, "token_acc": 0.8119583104772353, "train_speed(iter/s)": 0.163326 }, { "epoch": 1.5626441199077634, "grad_norm": 0.470241904258728, "learning_rate": 6.117344812479154e-07, "loss": 0.5944385051727294, "memory(GiB)": 42.66, "step": 170, "token_acc": 0.8014315400600324, "train_speed(iter/s)": 0.163452 }, { "epoch": 1.6087624903920061, "grad_norm": 0.48042455315589905, "learning_rate": 4.951037208561116e-07, "loss": 0.5971737861633301, "memory(GiB)": 42.66, "step": 175, "token_acc": 0.8101413840756884, "train_speed(iter/s)": 0.163514 }, { "epoch": 1.654880860876249, "grad_norm": 0.4469975531101227, "learning_rate": 3.8957917781732883e-07, "loss": 0.5962778091430664, "memory(GiB)": 42.66, "step": 180, "token_acc": 0.802772808586762, "train_speed(iter/s)": 0.163665 }, { "epoch": 1.700999231360492, "grad_norm": 0.42613735795021057, "learning_rate": 2.957454103070978e-07, "loss": 0.5963385581970215, "memory(GiB)": 42.66, "step": 185, "token_acc": 0.7997950119576358, "train_speed(iter/s)": 0.163744 }, { "epoch": 1.7471176018447347, "grad_norm": 0.42817509174346924, "learning_rate": 2.1412221489936796e-07, "loss": 0.5908424377441406, "memory(GiB)": 42.66, "step": 190, "token_acc": 0.8128774928774929, "train_speed(iter/s)": 0.163747 }, { "epoch": 1.7932359723289777, "grad_norm": 0.47536778450012207, "learning_rate": 1.4516174712890406e-07, "loss": 0.5989221572875977, "memory(GiB)": 42.66, "step": 195, "token_acc": 0.8069948186528497, "train_speed(iter/s)": 0.163951 }, { "epoch": 1.8393543428132206, "grad_norm": 0.4647919237613678, "learning_rate": 8.924601675441207e-08, "loss": 0.5813657283782959, "memory(GiB)": 42.66, "step": 200, "token_acc": 0.818207654414551, "train_speed(iter/s)": 0.163995 }, { "epoch": 1.8854727132974634, "grad_norm": 0.4954111874103546, "learning_rate": 4.668477159748858e-08, "loss": 0.5898621559143067, "memory(GiB)": 42.66, "step": 205, "token_acc": 0.8052279635258359, "train_speed(iter/s)": 0.162393 }, { "epoch": 1.9315910837817065, "grad_norm": 0.5124484300613403, "learning_rate": 1.771378167997745e-08, "loss": 0.5992970943450928, "memory(GiB)": 42.66, "step": 210, "token_acc": 0.8106964634410935, "train_speed(iter/s)": 0.162483 }, { "epoch": 1.9777094542659492, "grad_norm": 0.5024097561836243, "learning_rate": 2.4935331648298644e-09, "loss": 0.5985934734344482, "memory(GiB)": 42.66, "step": 215, "token_acc": 0.7960237258347979, "train_speed(iter/s)": 0.162605 } ], "logging_steps": 5, "max_steps": 218, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.33248995656781e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }