benfielding's picture
End of training
90f82bb verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 20,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 188.90625,
"epoch": 0.5714285714285714,
"grad_norm": 11.504660606384277,
"kl": 0.0,
"learning_rate": 4.965903258506806e-07,
"loss": 0.0,
"reward": 3.6299600526690483,
"reward_std": 0.651573613169603,
"rewards/concensus_correctness_reward_func": 0.7819999903440475,
"rewards/consensus_reward_func": 1.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.77814756706357,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.171875,
"rewards/xmlcount_reward_func": 0.8198125020135194,
"step": 2
},
{
"completion_length": 139.29166666666666,
"epoch": 1.0,
"grad_norm": 19.680736541748047,
"kl": 0.018907852994743735,
"learning_rate": 4.698684378016222e-07,
"loss": 0.0,
"reward": 6.1616571346918745,
"reward_std": 0.4250478910592695,
"rewards/concensus_correctness_reward_func": 1.862833318610986,
"rewards/consensus_reward_func": 1.5833333333333333,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.16666666666666666,
"rewards/question_recreation_reward_func": 0.9973237961530685,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3541666666666667,
"rewards/xmlcount_reward_func": 1.1973333358764648,
"step": 4
},
{
"completion_length": 144.9375,
"epoch": 1.5714285714285714,
"grad_norm": 23.490110397338867,
"kl": 0.24136288941372186,
"learning_rate": 4.193203929064353e-07,
"loss": 0.0002,
"reward": 5.961412996053696,
"reward_std": 0.5539119137974922,
"rewards/concensus_correctness_reward_func": 1.8196874968707561,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.9034442752599716,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.390625,
"rewards/xmlcount_reward_func": 1.22265625,
"step": 6
},
{
"completion_length": 157.95833333333334,
"epoch": 2.0,
"grad_norm": 11.857107162475586,
"kl": 0.6173685067333281,
"learning_rate": 3.5042385616324236e-07,
"loss": 0.0005,
"reward": 5.671488285064697,
"reward_std": 0.6377726250017682,
"rewards/concensus_correctness_reward_func": 1.3618333016832669,
"rewards/consensus_reward_func": 1.8333333333333333,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.940946638584137,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.3125,
"rewards/xmlcount_reward_func": 1.2228749990463257,
"step": 8
},
{
"completion_length": 137.90625,
"epoch": 2.571428571428571,
"grad_norm": 19.347959518432617,
"kl": 7.246092613320798,
"learning_rate": 2.706448363680831e-07,
"loss": 0.0072,
"reward": 5.676198855042458,
"reward_std": 0.36480392375960946,
"rewards/concensus_correctness_reward_func": 1.6264374908059835,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.8661676824558526,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.328125,
"rewards/xmlcount_reward_func": 1.16796875,
"step": 10
},
{
"completion_length": 143.5,
"epoch": 3.0,
"grad_norm": 9.921439170837402,
"kl": 0.7624614595746001,
"learning_rate": 1.886286282148002e-07,
"loss": 0.0006,
"reward": 6.447582880655925,
"reward_std": 0.052944420681645475,
"rewards/concensus_correctness_reward_func": 1.7801666458447774,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9799163043498993,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.25,
"step": 12
},
{
"completion_length": 143.15625,
"epoch": 3.571428571428571,
"grad_norm": 22.13187026977539,
"kl": 1.8785493820905685,
"learning_rate": 1.1326296046939333e-07,
"loss": 0.0019,
"reward": 5.9165933430194855,
"reward_std": 0.3377845502400305,
"rewards/concensus_correctness_reward_func": 1.6038749888539314,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.9151558466255665,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.359375,
"rewards/xmlcount_reward_func": 1.2256875038146973,
"step": 14
},
{
"completion_length": 136.83333333333334,
"epoch": 4.0,
"grad_norm": 6.5841965675354,
"kl": 14.041547794515887,
"learning_rate": 5.271487265090163e-08,
"loss": 0.0105,
"reward": 5.688777546087901,
"reward_std": 0.5299860953819007,
"rewards/concensus_correctness_reward_func": 1.557333316653967,
"rewards/consensus_reward_func": 1.3333333333333333,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.16666666666666666,
"rewards/question_recreation_reward_func": 0.9697359601656595,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.2242083350817363,
"step": 16
},
{
"completion_length": 142.4375,
"epoch": 4.571428571428571,
"grad_norm": 77.4472885131836,
"kl": 2.3539611261803657,
"learning_rate": 1.3545689574841341e-08,
"loss": 0.0024,
"reward": 6.306179732084274,
"reward_std": 0.5243307306227507,
"rewards/concensus_correctness_reward_func": 1.7168749906122684,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.976023480296135,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.23828125,
"step": 18
},
{
"completion_length": 146.875,
"epoch": 5.0,
"grad_norm": 674.2899780273438,
"kl": 40.880931643148266,
"learning_rate": 0.0,
"loss": 0.0307,
"reward": 5.47615905602773,
"reward_std": 0.3092364342495178,
"rewards/concensus_correctness_reward_func": 1.635416644314925,
"rewards/consensus_reward_func": 1.4166666666666667,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9729507366816202,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.2708333333333333,
"rewards/xmlcount_reward_func": 1.1802916675806046,
"step": 20
},
{
"epoch": 5.0,
"step": 20,
"total_flos": 0.0,
"train_loss": 0.0053957260796778424,
"train_runtime": 95.7573,
"train_samples_per_second": 3.342,
"train_steps_per_second": 0.209
}
],
"logging_steps": 2,
"max_steps": 20,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}