{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9832402234636871, "eval_steps": 50, "global_step": 44, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0223463687150838, "grad_norm": 2577.74755859375, "learning_rate": 2e-05, "loss": 50.7069, "step": 1 }, { "epoch": 0.0223463687150838, "eval_loss": 3.2835566997528076, "eval_runtime": 2.3872, "eval_samples_per_second": 63.255, "eval_steps_per_second": 15.918, "step": 1 }, { "epoch": 0.0446927374301676, "grad_norm": 6546.3173828125, "learning_rate": 4e-05, "loss": 52.4505, "step": 2 }, { "epoch": 0.0670391061452514, "grad_norm": 2879.14501953125, "learning_rate": 6e-05, "loss": 52.5919, "step": 3 }, { "epoch": 0.0893854748603352, "grad_norm": 2090.73974609375, "learning_rate": 8e-05, "loss": 52.7518, "step": 4 }, { "epoch": 0.11173184357541899, "grad_norm": 1940.23193359375, "learning_rate": 0.0001, "loss": 51.2834, "step": 5 }, { "epoch": 0.1340782122905028, "grad_norm": 2605.8798828125, "learning_rate": 0.00012, "loss": 52.4079, "step": 6 }, { "epoch": 0.1564245810055866, "grad_norm": 6682.8388671875, "learning_rate": 0.00014, "loss": 51.4155, "step": 7 }, { "epoch": 0.1787709497206704, "grad_norm": 2514.529296875, "learning_rate": 0.00016, "loss": 50.5981, "step": 8 }, { "epoch": 0.2011173184357542, "grad_norm": 2855.07421875, "learning_rate": 0.00018, "loss": 49.7885, "step": 9 }, { "epoch": 0.22346368715083798, "grad_norm": 2247.414794921875, "learning_rate": 0.0002, "loss": 51.4268, "step": 10 }, { "epoch": 0.24581005586592178, "grad_norm": 3424.6533203125, "learning_rate": 0.00019957341762950344, "loss": 51.0585, "step": 11 }, { "epoch": 0.2681564245810056, "grad_norm": 3296.65185546875, "learning_rate": 0.0001982973099683902, "loss": 50.0642, "step": 12 }, { "epoch": 0.2905027932960894, "grad_norm": 1766.2926025390625, "learning_rate": 0.00019618256431728194, "loss": 49.1072, "step": 13 }, { "epoch": 0.3128491620111732, "grad_norm": 2249.23876953125, "learning_rate": 0.00019324722294043558, "loss": 49.0022, "step": 14 }, { "epoch": 0.33519553072625696, "grad_norm": 2969.0947265625, "learning_rate": 0.00018951632913550626, "loss": 48.8406, "step": 15 }, { "epoch": 0.3575418994413408, "grad_norm": 2778.11328125, "learning_rate": 0.00018502171357296144, "loss": 47.0118, "step": 16 }, { "epoch": 0.37988826815642457, "grad_norm": 3201.85693359375, "learning_rate": 0.000179801722728024, "loss": 47.7868, "step": 17 }, { "epoch": 0.4022346368715084, "grad_norm": 4596.56591796875, "learning_rate": 0.00017390089172206592, "loss": 46.4997, "step": 18 }, { "epoch": 0.4245810055865922, "grad_norm": 3318.73828125, "learning_rate": 0.00016736956436465573, "loss": 47.2409, "step": 19 }, { "epoch": 0.44692737430167595, "grad_norm": 2099.513671875, "learning_rate": 0.00016026346363792567, "loss": 48.2933, "step": 20 }, { "epoch": 0.4692737430167598, "grad_norm": 4453.8125, "learning_rate": 0.0001526432162877356, "loss": 48.2358, "step": 21 }, { "epoch": 0.49162011173184356, "grad_norm": 3568.524658203125, "learning_rate": 0.00014457383557765386, "loss": 46.9575, "step": 22 }, { "epoch": 0.5139664804469274, "grad_norm": 4010.7314453125, "learning_rate": 0.00013612416661871533, "loss": 46.9159, "step": 23 }, { "epoch": 0.5363128491620112, "grad_norm": 2880.6123046875, "learning_rate": 0.0001273662990072083, "loss": 45.0099, "step": 24 }, { "epoch": 0.5586592178770949, "grad_norm": 3565.14404296875, "learning_rate": 0.00011837495178165706, "loss": 46.3621, "step": 25 }, { "epoch": 0.5810055865921788, "grad_norm": 3099.607177734375, "learning_rate": 0.00010922683594633021, "loss": 45.2878, "step": 26 }, { "epoch": 0.6033519553072626, "grad_norm": 2318.313720703125, "learning_rate": 0.0001, "loss": 46.6766, "step": 27 }, { "epoch": 0.6256983240223464, "grad_norm": 3364.091552734375, "learning_rate": 9.077316405366981e-05, "loss": 45.6603, "step": 28 }, { "epoch": 0.6480446927374302, "grad_norm": 3509.09619140625, "learning_rate": 8.162504821834295e-05, "loss": 46.9604, "step": 29 }, { "epoch": 0.6703910614525139, "grad_norm": 3948.4111328125, "learning_rate": 7.263370099279172e-05, "loss": 45.7947, "step": 30 }, { "epoch": 0.6927374301675978, "grad_norm": 3185.07861328125, "learning_rate": 6.387583338128471e-05, "loss": 47.6725, "step": 31 }, { "epoch": 0.7150837988826816, "grad_norm": 3412.48486328125, "learning_rate": 5.542616442234618e-05, "loss": 45.6106, "step": 32 }, { "epoch": 0.7374301675977654, "grad_norm": 2618.543701171875, "learning_rate": 4.735678371226441e-05, "loss": 45.2124, "step": 33 }, { "epoch": 0.7597765363128491, "grad_norm": 3468.01318359375, "learning_rate": 3.973653636207437e-05, "loss": 45.141, "step": 34 }, { "epoch": 0.7821229050279329, "grad_norm": 2938.23681640625, "learning_rate": 3.263043563534428e-05, "loss": 42.9445, "step": 35 }, { "epoch": 0.8044692737430168, "grad_norm": 4326.49169921875, "learning_rate": 2.6099108277934103e-05, "loss": 47.6381, "step": 36 }, { "epoch": 0.8268156424581006, "grad_norm": 4710.1123046875, "learning_rate": 2.0198277271976052e-05, "loss": 45.6724, "step": 37 }, { "epoch": 0.8491620111731844, "grad_norm": 2800.705322265625, "learning_rate": 1.4978286427038601e-05, "loss": 45.8585, "step": 38 }, { "epoch": 0.8715083798882681, "grad_norm": 4585.00244140625, "learning_rate": 1.0483670864493778e-05, "loss": 46.6726, "step": 39 }, { "epoch": 0.8938547486033519, "grad_norm": 4168.4462890625, "learning_rate": 6.75277705956443e-06, "loss": 44.954, "step": 40 }, { "epoch": 0.9162011173184358, "grad_norm": 3351.935791015625, "learning_rate": 3.817435682718096e-06, "loss": 46.8225, "step": 41 }, { "epoch": 0.9385474860335196, "grad_norm": 3822.043212890625, "learning_rate": 1.7026900316098215e-06, "loss": 46.403, "step": 42 }, { "epoch": 0.9608938547486033, "grad_norm": 5116.40283203125, "learning_rate": 4.2658237049655323e-07, "loss": 46.336, "step": 43 }, { "epoch": 0.9832402234636871, "grad_norm": 4652.48291015625, "learning_rate": 0.0, "loss": 48.1853, "step": 44 } ], "logging_steps": 1, "max_steps": 44, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 30, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2180833263747072.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }