| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.02, | |
| "eval_steps": 500, | |
| "global_step": 450, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1577.0, | |
| "completions/max_terminated_length": 1577.0, | |
| "completions/mean_length": 577.25, | |
| "completions/mean_terminated_length": 577.25, | |
| "completions/min_length": 236.5, | |
| "completions/min_terminated_length": 236.5, | |
| "entropy": 0.22603079956024885, | |
| "epoch": 8.888888888888889e-05, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0, | |
| "kl": 0.0, | |
| "learning_rate": 3.571428571428571e-08, | |
| "loss": 0.0, | |
| "num_tokens": 23312.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1521.0, | |
| "completions/max_terminated_length": 1521.0, | |
| "completions/mean_length": 474.9375, | |
| "completions/mean_terminated_length": 474.9375, | |
| "completions/min_length": 212.0, | |
| "completions/min_terminated_length": 212.0, | |
| "entropy": 0.24615928065031767, | |
| "epoch": 0.00017777777777777779, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0010758156719471042, | |
| "kl": 0.0003032498079846846, | |
| "learning_rate": 1.0714285714285713e-07, | |
| "loss": 0.0, | |
| "num_tokens": 43302.0, | |
| "reward": 0.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2886.0, | |
| "completions/mean_length": 1106.625, | |
| "completions/mean_terminated_length": 903.5262451171875, | |
| "completions/min_length": 253.5, | |
| "completions/min_terminated_length": 253.5, | |
| "entropy": 0.27944554202258587, | |
| "epoch": 0.0002666666666666667, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.1026674172522337, | |
| "kl": 0.0005087878689664649, | |
| "learning_rate": 1.7857142857142858e-07, | |
| "loss": 0.0, | |
| "num_tokens": 83522.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.22201896458864212, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.34860680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2156.5, | |
| "completions/mean_length": 1165.78125, | |
| "completions/mean_terminated_length": 659.8333435058594, | |
| "completions/min_length": 292.5, | |
| "completions/min_terminated_length": 292.5, | |
| "entropy": 0.2345265755429864, | |
| "epoch": 0.00035555555555555557, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.5848473823308938, | |
| "kl": 0.0005193246015551267, | |
| "learning_rate": 2.5e-07, | |
| "loss": 0.0, | |
| "num_tokens": 125675.0, | |
| "reward": 0.75, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.3811737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2664.0, | |
| "completions/max_terminated_length": 2224.0, | |
| "completions/mean_length": 739.375, | |
| "completions/mean_terminated_length": 666.3250122070312, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "entropy": 0.24358350411057472, | |
| "epoch": 0.00044444444444444447, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0005102182872665691, | |
| "kl": 0.0005887709885428194, | |
| "learning_rate": 3.2142857142857145e-07, | |
| "loss": 0.0, | |
| "num_tokens": 154151.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1164.5, | |
| "completions/max_terminated_length": 1164.5, | |
| "completions/mean_length": 522.90625, | |
| "completions/mean_terminated_length": 522.90625, | |
| "completions/min_length": 252.0, | |
| "completions/min_terminated_length": 252.0, | |
| "entropy": 0.23033921141177416, | |
| "epoch": 0.0005333333333333334, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0003883087974099818, | |
| "kl": 0.0004703981048805872, | |
| "learning_rate": 3.928571428571428e-07, | |
| "loss": 0.0, | |
| "num_tokens": 175748.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1743.5, | |
| "completions/max_terminated_length": 1717.0, | |
| "completions/mean_length": 1200.6875, | |
| "completions/mean_terminated_length": 820.0625, | |
| "completions/min_length": 469.5, | |
| "completions/min_terminated_length": 469.5, | |
| "entropy": 0.22865951620042324, | |
| "epoch": 0.0006222222222222223, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.9029769583901943, | |
| "kl": 0.000541954672371503, | |
| "learning_rate": 4.6428571428571427e-07, | |
| "loss": 0.0, | |
| "num_tokens": 219058.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1865.5, | |
| "completions/max_terminated_length": 524.0, | |
| "completions/mean_length": 1023.0625, | |
| "completions/mean_terminated_length": 318.9375, | |
| "completions/min_length": 203.0, | |
| "completions/min_terminated_length": 203.0, | |
| "entropy": 0.2518942877650261, | |
| "epoch": 0.0007111111111111111, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.2651085166655804, | |
| "kl": 0.0006022736051818356, | |
| "learning_rate": 4.999935101463869e-07, | |
| "loss": 0.0, | |
| "num_tokens": 256604.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.38319888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1280.5, | |
| "completions/max_terminated_length": 1280.5, | |
| "completions/mean_length": 584.65625, | |
| "completions/mean_terminated_length": 584.65625, | |
| "completions/min_length": 280.5, | |
| "completions/min_terminated_length": 280.5, | |
| "entropy": 0.23922867327928543, | |
| "epoch": 0.0008, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0003891599078429636, | |
| "kl": 0.0005218808764766436, | |
| "learning_rate": 4.999415933391384e-07, | |
| "loss": 0.0, | |
| "num_tokens": 280153.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2230.5, | |
| "completions/max_terminated_length": 1331.5, | |
| "completions/mean_length": 1207.5, | |
| "completions/mean_terminated_length": 657.40625, | |
| "completions/min_length": 431.5, | |
| "completions/min_terminated_length": 431.5, | |
| "entropy": 0.23088860977441072, | |
| "epoch": 0.0008888888888888889, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0002444108443410019, | |
| "kl": 0.0005470474952744553, | |
| "learning_rate": 4.998377705063407e-07, | |
| "loss": 0.0, | |
| "num_tokens": 323689.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1031.0, | |
| "completions/max_terminated_length": 1031.0, | |
| "completions/mean_length": 529.09375, | |
| "completions/mean_terminated_length": 529.09375, | |
| "completions/min_length": 231.0, | |
| "completions/min_terminated_length": 231.0, | |
| "entropy": 0.22655892837792635, | |
| "epoch": 0.0009777777777777777, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0007743500195651646, | |
| "kl": 0.0006699018595099915, | |
| "learning_rate": 4.996820632091536e-07, | |
| "loss": 0.0, | |
| "num_tokens": 345444.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 1943.5, | |
| "completions/max_terminated_length": 1827.0, | |
| "completions/mean_length": 1000.34375, | |
| "completions/mean_terminated_length": 648.9517211914062, | |
| "completions/min_length": 242.5, | |
| "completions/min_terminated_length": 242.5, | |
| "entropy": 0.2282683216035366, | |
| "epoch": 0.0010666666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.001017224455435619, | |
| "kl": 0.0007511216499551665, | |
| "learning_rate": 4.994745037837194e-07, | |
| "loss": 0.0, | |
| "num_tokens": 382311.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2529.0, | |
| "completions/mean_length": 2014.625, | |
| "completions/mean_terminated_length": 1184.2785949707031, | |
| "completions/min_length": 604.0, | |
| "completions/min_terminated_length": 604.0, | |
| "entropy": 0.27375217340886593, | |
| "epoch": 0.0011555555555555555, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0008308273702719758, | |
| "kl": 0.0006844787167210598, | |
| "learning_rate": 4.992151353344481e-07, | |
| "loss": 0.0, | |
| "num_tokens": 451651.0, | |
| "reward": 0.4375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.4375, | |
| "rewards/equation_reward_func/std": 0.5081988871097565, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 700.0, | |
| "completions/max_terminated_length": 700.0, | |
| "completions/mean_length": 453.9375, | |
| "completions/mean_terminated_length": 453.9375, | |
| "completions/min_length": 254.0, | |
| "completions/min_terminated_length": 254.0, | |
| "entropy": 0.23610753938555717, | |
| "epoch": 0.0012444444444444445, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0008685447557227168, | |
| "kl": 0.0007076808778947452, | |
| "learning_rate": 4.989040117250646e-07, | |
| "loss": 0.0, | |
| "num_tokens": 471057.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2293.0, | |
| "completions/mean_length": 1640.375, | |
| "completions/mean_terminated_length": 526.888916015625, | |
| "completions/min_length": 243.0, | |
| "completions/min_terminated_length": 243.0, | |
| "entropy": 0.23777490202337503, | |
| "epoch": 0.0013333333333333333, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6274088265459657, | |
| "kl": 0.0006857724620203953, | |
| "learning_rate": 4.985411975674243e-07, | |
| "loss": 0.0, | |
| "num_tokens": 528445.0, | |
| "reward": 0.53125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.53125, | |
| "rewards/equation_reward_func/std": 0.5143726766109467, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2285.5, | |
| "completions/max_terminated_length": 1055.5, | |
| "completions/mean_length": 1233.78125, | |
| "completions/mean_terminated_length": 601.4375, | |
| "completions/min_length": 342.0, | |
| "completions/min_terminated_length": 342.0, | |
| "entropy": 0.24325580801814795, | |
| "epoch": 0.0014222222222222223, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.000655337400503361, | |
| "kl": 0.0007609008825966157, | |
| "learning_rate": 4.981267682080939e-07, | |
| "loss": 0.0, | |
| "num_tokens": 572822.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2631.5, | |
| "completions/mean_length": 1312.71875, | |
| "completions/mean_terminated_length": 913.6030578613281, | |
| "completions/min_length": 283.0, | |
| "completions/min_terminated_length": 283.0, | |
| "entropy": 0.23677545227110386, | |
| "epoch": 0.001511111111111111, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 10.794765092710524, | |
| "kl": 0.000754591055738274, | |
| "learning_rate": 4.976608097127043e-07, | |
| "loss": 0.0, | |
| "num_tokens": 619725.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.36435678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2004.0, | |
| "completions/mean_length": 1334.03125, | |
| "completions/mean_terminated_length": 739.835693359375, | |
| "completions/min_length": 296.5, | |
| "completions/min_terminated_length": 296.5, | |
| "entropy": 0.26550517696887255, | |
| "epoch": 0.0016, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6416140180865778, | |
| "kl": 0.0007265451895364095, | |
| "learning_rate": 4.97143418848077e-07, | |
| "loss": 0.0, | |
| "num_tokens": 667294.0, | |
| "reward": 0.75, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.44091323018074036, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 1726.5, | |
| "completions/max_terminated_length": 1495.0, | |
| "completions/mean_length": 878.875, | |
| "completions/mean_terminated_length": 694.1370239257812, | |
| "completions/min_length": 368.5, | |
| "completions/min_terminated_length": 368.5, | |
| "entropy": 0.22233000118285418, | |
| "epoch": 0.0016888888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7522096940055117, | |
| "kl": 0.0009702143179310951, | |
| "learning_rate": 4.965747030621286e-07, | |
| "loss": 0.0, | |
| "num_tokens": 700266.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.90625, | |
| "rewards/equation_reward_func/std": 0.20155644416809082, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1813.5, | |
| "completions/max_terminated_length": 1028.0, | |
| "completions/mean_length": 949.8125, | |
| "completions/mean_terminated_length": 372.1666717529297, | |
| "completions/min_length": 232.5, | |
| "completions/min_terminated_length": 232.5, | |
| "entropy": 0.2340000979602337, | |
| "epoch": 0.0017777777777777779, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7589222336313214, | |
| "kl": 0.0008960766499512829, | |
| "learning_rate": 4.959547804615562e-07, | |
| "loss": 0.0, | |
| "num_tokens": 735476.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 1754.5, | |
| "completions/max_terminated_length": 1719.5, | |
| "completions/mean_length": 1365.96875, | |
| "completions/mean_terminated_length": 838.2083740234375, | |
| "completions/min_length": 378.0, | |
| "completions/min_terminated_length": 378.0, | |
| "entropy": 0.26818372309207916, | |
| "epoch": 0.0018666666666666666, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0008688646141334615, | |
| "kl": 0.0008683820233272854, | |
| "learning_rate": 4.952837797873106e-07, | |
| "loss": 0.0, | |
| "num_tokens": 784019.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.6875, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2572.0, | |
| "completions/max_terminated_length": 2570.5, | |
| "completions/mean_length": 1079.21875, | |
| "completions/mean_terminated_length": 873.141845703125, | |
| "completions/min_length": 297.5, | |
| "completions/min_terminated_length": 297.5, | |
| "entropy": 0.2423563925549388, | |
| "epoch": 0.0019555555555555554, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7020501913144348, | |
| "kl": 0.0009873026720015332, | |
| "learning_rate": 4.9456184038786e-07, | |
| "loss": 0.0, | |
| "num_tokens": 823410.0, | |
| "reward": 0.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1686.5, | |
| "completions/mean_length": 1232.625, | |
| "completions/mean_terminated_length": 719.4333801269531, | |
| "completions/min_length": 300.5, | |
| "completions/min_terminated_length": 300.5, | |
| "entropy": 0.25999774504452944, | |
| "epoch": 0.0020444444444444447, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0005542284098518003, | |
| "kl": 0.001003881188808009, | |
| "learning_rate": 4.937891121902508e-07, | |
| "loss": 0.0, | |
| "num_tokens": 867702.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.38319888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1775.5, | |
| "completions/mean_length": 1823.84375, | |
| "completions/mean_terminated_length": 1082.8750610351562, | |
| "completions/min_length": 487.0, | |
| "completions/min_terminated_length": 487.0, | |
| "entropy": 0.31278051622211933, | |
| "epoch": 0.0021333333333333334, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.9767853720165567, | |
| "kl": 0.0008782508339209016, | |
| "learning_rate": 4.929657556689726e-07, | |
| "loss": 0.0, | |
| "num_tokens": 930961.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/equation_reward_func/mean": 0.6875, | |
| "rewards/equation_reward_func/std": 0.42898140847682953, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1420.0, | |
| "completions/max_terminated_length": 1420.0, | |
| "completions/mean_length": 628.59375, | |
| "completions/mean_terminated_length": 628.59375, | |
| "completions/min_length": 332.0, | |
| "completions/min_terminated_length": 332.0, | |
| "entropy": 0.23106891848146915, | |
| "epoch": 0.0022222222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0008790964938972672, | |
| "kl": 0.0011074417707277462, | |
| "learning_rate": 4.920919418126312e-07, | |
| "loss": 0.0, | |
| "num_tokens": 955956.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1836.0, | |
| "completions/max_terminated_length": 1599.0, | |
| "completions/mean_length": 1298.78125, | |
| "completions/mean_terminated_length": 968.2257080078125, | |
| "completions/min_length": 341.0, | |
| "completions/min_terminated_length": 341.0, | |
| "entropy": 0.23789576161652803, | |
| "epoch": 0.002311111111111111, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0015075554431588705, | |
| "kl": 0.0011714456777554005, | |
| "learning_rate": 4.911678520884398e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1002381.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.2630178928375244, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 1992.5, | |
| "completions/max_terminated_length": 1889.0, | |
| "completions/mean_length": 1640.78125, | |
| "completions/mean_terminated_length": 1111.7083740234375, | |
| "completions/min_length": 332.0, | |
| "completions/min_terminated_length": 332.0, | |
| "entropy": 0.26475983764976263, | |
| "epoch": 0.0024, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.5139488912304642, | |
| "kl": 0.0012290262056922074, | |
| "learning_rate": 4.901936784045324e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1059734.0, | |
| "reward": 0.5625, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/equation_reward_func/mean": 0.5625, | |
| "rewards/equation_reward_func/std": 0.3265564441680908, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2003.5, | |
| "completions/max_terminated_length": 739.5, | |
| "completions/mean_length": 1126.65625, | |
| "completions/mean_terminated_length": 461.5625, | |
| "completions/min_length": 268.0, | |
| "completions/min_terminated_length": 268.0, | |
| "entropy": 0.2474461616948247, | |
| "epoch": 0.002488888888888889, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0012532198813457072, | |
| "kl": 0.0015458752604899928, | |
| "learning_rate": 4.891696230701103e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1100603.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1490.5, | |
| "completions/max_terminated_length": 1490.5, | |
| "completions/mean_length": 529.59375, | |
| "completions/mean_terminated_length": 529.59375, | |
| "completions/min_length": 220.0, | |
| "completions/min_terminated_length": 220.0, | |
| "entropy": 0.24761256389319897, | |
| "epoch": 0.002577777777777778, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0013617152892329606, | |
| "kl": 0.0018745916750049219, | |
| "learning_rate": 4.880958987534282e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1122326.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 1818.0, | |
| "completions/max_terminated_length": 860.0, | |
| "completions/mean_length": 781.25, | |
| "completions/mean_terminated_length": 365.4545440673828, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "entropy": 0.26473226584494114, | |
| "epoch": 0.0026666666666666666, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0029793463244017996, | |
| "kl": 0.002165006830182392, | |
| "learning_rate": 4.869727284376277e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1152150.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2081.0, | |
| "completions/max_terminated_length": 1650.5, | |
| "completions/mean_length": 921.78125, | |
| "completions/mean_terminated_length": 636.7291870117188, | |
| "completions/min_length": 239.5, | |
| "completions/min_terminated_length": 239.5, | |
| "entropy": 0.25862254202365875, | |
| "epoch": 0.0027555555555555554, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7791104007321142, | |
| "kl": 0.0020706086870632134, | |
| "learning_rate": 4.858003453744314e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1186463.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1726.5, | |
| "completions/max_terminated_length": 1477.0, | |
| "completions/mean_length": 979.03125, | |
| "completions/mean_terminated_length": 442.2430725097656, | |
| "completions/min_length": 233.5, | |
| "completions/min_terminated_length": 233.5, | |
| "entropy": 0.2315123314037919, | |
| "epoch": 0.0028444444444444446, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.5664425570824678, | |
| "kl": 0.0022492043499369174, | |
| "learning_rate": 4.845789930357016e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1222616.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2144.5, | |
| "completions/max_terminated_length": 1759.0, | |
| "completions/mean_length": 916.84375, | |
| "completions/mean_terminated_length": 718.3966674804688, | |
| "completions/min_length": 228.0, | |
| "completions/min_terminated_length": 228.0, | |
| "entropy": 0.2546883439645171, | |
| "epoch": 0.0029333333333333334, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0018449164314454892, | |
| "kl": 0.0023287632793653756, | |
| "learning_rate": 4.833089250628786e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1256779.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2946.0, | |
| "completions/mean_length": 1932.3125, | |
| "completions/mean_terminated_length": 1132.9555969238281, | |
| "completions/min_length": 373.5, | |
| "completions/min_terminated_length": 373.5, | |
| "entropy": 0.2742412742227316, | |
| "epoch": 0.003022222222222222, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.0264964672382928, | |
| "kl": 0.0020347888057585806, | |
| "learning_rate": 4.819904052143058e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1323517.0, | |
| "reward": 0.59375, | |
| "reward_std": 0.3377464786171913, | |
| "rewards/equation_reward_func/mean": 0.59375, | |
| "rewards/equation_reward_func/std": 0.5061737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1004.0, | |
| "completions/max_terminated_length": 1004.0, | |
| "completions/mean_length": 432.4375, | |
| "completions/mean_terminated_length": 432.4375, | |
| "completions/min_length": 185.0, | |
| "completions/min_terminated_length": 185.0, | |
| "entropy": 0.24551625549793243, | |
| "epoch": 0.003111111111111111, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0034440763335736547, | |
| "kl": 0.004841565096285194, | |
| "learning_rate": 4.806237073104548e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1342163.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1816.5, | |
| "completions/max_terminated_length": 1451.0, | |
| "completions/mean_length": 742.375, | |
| "completions/mean_terminated_length": 606.3526916503906, | |
| "completions/min_length": 237.0, | |
| "completions/min_terminated_length": 237.0, | |
| "entropy": 0.2519435351714492, | |
| "epoch": 0.0032, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.032328735168464, | |
| "kl": 0.0035425843234406784, | |
| "learning_rate": 4.792091151770602e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1370711.0, | |
| "reward": 0.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1044.5, | |
| "completions/mean_length": 1702.125, | |
| "completions/mean_terminated_length": 484.8958435058594, | |
| "completions/min_length": 306.5, | |
| "completions/min_terminated_length": 306.5, | |
| "entropy": 0.24912292044609785, | |
| "epoch": 0.003288888888888889, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0017951204099864006, | |
| "kl": 0.0034496651496738195, | |
| "learning_rate": 4.777469225861765e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1430067.0, | |
| "reward": 0.5, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.5, | |
| "rewards/equation_reward_func/std": 0.5163977742195129, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 914.5, | |
| "completions/mean_length": 2169.8125, | |
| "completions/mean_terminated_length": 333.0833435058594, | |
| "completions/min_length": 1697.0, | |
| "completions/min_terminated_length": 161.0, | |
| "entropy": 0.25269814021885395, | |
| "epoch": 0.0033777777777777777, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7495021801377318, | |
| "kl": 0.0026823820953723043, | |
| "learning_rate": 4.762374331951703e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1504389.0, | |
| "reward": 0.375, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.375, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 1744.5, | |
| "completions/max_terminated_length": 672.0, | |
| "completions/mean_length": 1152.46875, | |
| "completions/mean_terminated_length": 474.77679443359375, | |
| "completions/min_length": 300.5, | |
| "completions/min_terminated_length": 300.5, | |
| "entropy": 0.2373593281954527, | |
| "epoch": 0.0034666666666666665, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0032166698633040733, | |
| "kl": 0.0036603061016649008, | |
| "learning_rate": 4.7468096048365814e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1546092.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2471.5, | |
| "completions/max_terminated_length": 2177.5, | |
| "completions/mean_length": 1190.25, | |
| "completions/mean_terminated_length": 821.2301330566406, | |
| "completions/min_length": 291.5, | |
| "completions/min_terminated_length": 291.5, | |
| "entropy": 0.26422596722841263, | |
| "epoch": 0.0035555555555555557, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.001992271528938085, | |
| "kl": 0.004376799814053811, | |
| "learning_rate": 4.730778276884061e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1588988.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2363.0, | |
| "completions/mean_length": 1536.46875, | |
| "completions/mean_terminated_length": 701.8055725097656, | |
| "completions/min_length": 234.0, | |
| "completions/min_terminated_length": 234.0, | |
| "entropy": 0.28139131516218185, | |
| "epoch": 0.0036444444444444445, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7357119490341348, | |
| "kl": 0.003145620590657927, | |
| "learning_rate": 4.7142836773620227e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1643003.0, | |
| "reward": 0.59375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.59375, | |
| "rewards/equation_reward_func/std": 0.497555673122406, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.59375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2767.5, | |
| "completions/mean_length": 2251.09375, | |
| "completions/mean_terminated_length": 1607.1363830566406, | |
| "completions/min_length": 1232.5, | |
| "completions/min_terminated_length": 1232.5, | |
| "entropy": 0.27658269740641117, | |
| "epoch": 0.0037333333333333333, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0015005186459580411, | |
| "kl": 0.0028125419485149905, | |
| "learning_rate": 4.6973292317471635e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1719918.0, | |
| "reward": 0.3125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.3125, | |
| "rewards/equation_reward_func/std": 0.42898140847682953, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.65625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2763.5, | |
| "completions/mean_length": 2389.09375, | |
| "completions/mean_terminated_length": 1365.8055725097656, | |
| "completions/min_length": 718.0, | |
| "completions/min_terminated_length": 718.0, | |
| "entropy": 0.27335013449192047, | |
| "epoch": 0.003822222222222222, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.5794927896037598, | |
| "kl": 0.0027770966989919543, | |
| "learning_rate": 4.679918461013627e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1801257.0, | |
| "reward": 0.3125, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/equation_reward_func/mean": 0.3125, | |
| "rewards/equation_reward_func/std": 0.3811737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 823.0, | |
| "completions/max_terminated_length": 823.0, | |
| "completions/mean_length": 513.0625, | |
| "completions/mean_terminated_length": 513.0625, | |
| "completions/min_length": 294.5, | |
| "completions/min_terminated_length": 294.5, | |
| "entropy": 0.23741111066192389, | |
| "epoch": 0.003911111111111111, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0017861260868517065, | |
| "kl": 0.0039054618537193164, | |
| "learning_rate": 4.6620549809017885e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1822475.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2744.0, | |
| "completions/max_terminated_length": 1381.0, | |
| "completions/mean_length": 1148.5625, | |
| "completions/mean_terminated_length": 445.875, | |
| "completions/min_length": 207.5, | |
| "completions/min_terminated_length": 207.5, | |
| "entropy": 0.2619064189493656, | |
| "epoch": 0.004, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7985547598733307, | |
| "kl": 0.003670994978165254, | |
| "learning_rate": 4.643742501167366e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1864045.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.38319888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1842.0, | |
| "completions/max_terminated_length": 1402.0, | |
| "completions/mean_length": 724.15625, | |
| "completions/mean_terminated_length": 657.5062866210938, | |
| "completions/min_length": 211.0, | |
| "completions/min_terminated_length": 211.0, | |
| "entropy": 0.2663749074563384, | |
| "epoch": 0.004088888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6611432476993873, | |
| "kl": 0.004101754348084796, | |
| "learning_rate": 4.624984824811006e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1892010.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2289.5, | |
| "completions/mean_length": 1697.5, | |
| "completions/mean_terminated_length": 1072.7273254394531, | |
| "completions/min_length": 314.0, | |
| "completions/min_terminated_length": 314.0, | |
| "entropy": 0.2442193143069744, | |
| "epoch": 0.004177777777777778, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.9689055012957616, | |
| "kl": 0.003427549614571035, | |
| "learning_rate": 4.605785847288502e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1951210.0, | |
| "reward": 0.65625, | |
| "reward_std": 0.3471629321575165, | |
| "rewards/equation_reward_func/mean": 0.65625, | |
| "rewards/equation_reward_func/std": 0.48935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2042.0, | |
| "completions/max_terminated_length": 1845.0, | |
| "completions/mean_length": 885.15625, | |
| "completions/mean_terminated_length": 675.1778869628906, | |
| "completions/min_length": 258.0, | |
| "completions/min_terminated_length": 258.0, | |
| "entropy": 0.25679061096161604, | |
| "epoch": 0.004266666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7668184777361081, | |
| "kl": 0.005221109124249779, | |
| "learning_rate": 4.5861495557018206e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1984351.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.90625, | |
| "rewards/equation_reward_func/std": 0.20155644416809082, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2058.0, | |
| "completions/max_terminated_length": 2058.0, | |
| "completions/mean_length": 713.0625, | |
| "completions/mean_terminated_length": 713.0625, | |
| "completions/min_length": 280.0, | |
| "completions/min_terminated_length": 280.0, | |
| "entropy": 0.25331663712859154, | |
| "epoch": 0.004355555555555555, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00194731319056398, | |
| "kl": 0.004550013778498396, | |
| "learning_rate": 4.566080027971082e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2012001.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1068.0, | |
| "completions/mean_length": 1476.90625, | |
| "completions/mean_terminated_length": 550.0494689941406, | |
| "completions/min_length": 289.5, | |
| "completions/min_terminated_length": 289.5, | |
| "entropy": 0.27285506669431925, | |
| "epoch": 0.0044444444444444444, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.8264287211470831, | |
| "kl": 0.003749549054191448, | |
| "learning_rate": 4.545581431987694e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2064150.0, | |
| "reward": 0.625, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/equation_reward_func/mean": 0.625, | |
| "rewards/equation_reward_func/std": 0.457730233669281, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 530.5, | |
| "completions/max_terminated_length": 530.5, | |
| "completions/mean_length": 319.5625, | |
| "completions/mean_terminated_length": 319.5625, | |
| "completions/min_length": 200.5, | |
| "completions/min_terminated_length": 200.5, | |
| "entropy": 0.24814098048955202, | |
| "epoch": 0.004533333333333334, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00292471693097075, | |
| "kl": 0.005984036062727682, | |
| "learning_rate": 4.5246580247487933e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2079152.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2575.0, | |
| "completions/max_terminated_length": 2125.0, | |
| "completions/mean_length": 1022.125, | |
| "completions/mean_terminated_length": 832.829345703125, | |
| "completions/min_length": 418.0, | |
| "completions/min_terminated_length": 418.0, | |
| "entropy": 0.2612212775275111, | |
| "epoch": 0.004622222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7744589581145311, | |
| "kl": 0.004202435855404474, | |
| "learning_rate": 4.5033141514731786e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2116748.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.90625, | |
| "rewards/equation_reward_func/std": 0.20155644416809082, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2367.5, | |
| "completions/mean_length": 1707.96875, | |
| "completions/mean_terminated_length": 1121.2916870117188, | |
| "completions/min_length": 421.5, | |
| "completions/min_terminated_length": 421.5, | |
| "entropy": 0.2849529664963484, | |
| "epoch": 0.004711111111111111, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.8055922294032352, | |
| "kl": 0.0038093408074928448, | |
| "learning_rate": 4.4815542446989373e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2176243.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.249358132481575, | |
| "rewards/equation_reward_func/mean": 0.6875, | |
| "rewards/equation_reward_func/std": 0.47360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 952.5, | |
| "completions/max_terminated_length": 952.5, | |
| "completions/mean_length": 322.4375, | |
| "completions/mean_terminated_length": 322.4375, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "entropy": 0.2402123035863042, | |
| "epoch": 0.0048, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0026144022076216205, | |
| "kl": 0.00510270893573761, | |
| "learning_rate": 4.4593828233629214e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2191353.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 905.5, | |
| "completions/mean_length": 1221.875, | |
| "completions/mean_terminated_length": 486.4125061035156, | |
| "completions/min_length": 256.5, | |
| "completions/min_terminated_length": 256.5, | |
| "entropy": 0.26087356358766556, | |
| "epoch": 0.004888888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8632217331595174, | |
| "kl": 0.00461755899596028, | |
| "learning_rate": 4.4368044918622893e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2235285.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.38319888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2083.5, | |
| "completions/max_terminated_length": 1983.0, | |
| "completions/mean_length": 1154.28125, | |
| "completions/mean_terminated_length": 640.7534790039062, | |
| "completions/min_length": 305.5, | |
| "completions/min_terminated_length": 305.5, | |
| "entropy": 0.245620877481997, | |
| "epoch": 0.004977777777777778, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0014729909165334624, | |
| "kl": 0.0038956179923843592, | |
| "learning_rate": 4.4138239390983e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2277102.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 1987.5, | |
| "completions/max_terminated_length": 1762.5, | |
| "completions/mean_length": 829.34375, | |
| "completions/mean_terminated_length": 543.4895935058594, | |
| "completions/min_length": 215.0, | |
| "completions/min_terminated_length": 215.0, | |
| "entropy": 0.2480917638167739, | |
| "epoch": 0.005066666666666666, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7305596650806964, | |
| "kl": 0.005493182397913188, | |
| "learning_rate": 4.390445937502557e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2308457.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1659.5, | |
| "completions/max_terminated_length": 1659.5, | |
| "completions/mean_length": 595.0625, | |
| "completions/mean_terminated_length": 595.0625, | |
| "completions/min_length": 295.0, | |
| "completions/min_terminated_length": 295.0, | |
| "entropy": 0.2502336846664548, | |
| "epoch": 0.005155555555555556, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.001567908068070067, | |
| "kl": 0.0051197968859924, | |
| "learning_rate": 4.3666753420459023e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2332387.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2570.5, | |
| "completions/mean_length": 2503.6875, | |
| "completions/mean_terminated_length": 1840.6875, | |
| "completions/min_length": 1507.5, | |
| "completions/min_terminated_length": 1507.5, | |
| "entropy": 0.2708489568904042, | |
| "epoch": 0.005244444444444445, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 1.5058277210442557, | |
| "kl": 0.003387732562259771, | |
| "learning_rate": 4.3425170892301764e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2417425.0, | |
| "reward": 0.3125, | |
| "reward_std": 0.4355512708425522, | |
| "rewards/equation_reward_func/mean": 0.3125, | |
| "rewards/equation_reward_func/std": 0.42898140847682953, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2686.5, | |
| "completions/mean_length": 2226.1875, | |
| "completions/mean_terminated_length": 1788.4166870117188, | |
| "completions/min_length": 1331.5, | |
| "completions/min_terminated_length": 1331.5, | |
| "entropy": 0.2982914987951517, | |
| "epoch": 0.005333333333333333, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6845989637013021, | |
| "kl": 0.003966670599766076, | |
| "learning_rate": 4.3179761960630357e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2493511.0, | |
| "reward": 0.34375, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/equation_reward_func/mean": 0.34375, | |
| "rewards/equation_reward_func/std": 0.42695631086826324, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2067.5, | |
| "completions/max_terminated_length": 847.0, | |
| "completions/mean_length": 1148.78125, | |
| "completions/mean_terminated_length": 482.71875, | |
| "completions/min_length": 305.5, | |
| "completions/min_terminated_length": 305.5, | |
| "entropy": 0.26174278277903795, | |
| "epoch": 0.005422222222222222, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0016187374368629333, | |
| "kl": 0.00422936627001036, | |
| "learning_rate": 4.293057759016063e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2535120.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1857.5, | |
| "completions/max_terminated_length": 1041.0, | |
| "completions/mean_length": 583.75, | |
| "completions/mean_terminated_length": 507.26458740234375, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "entropy": 0.25596251245588064, | |
| "epoch": 0.005511111111111111, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6322511630069138, | |
| "kl": 0.005413578634033911, | |
| "learning_rate": 4.2677669529663686e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2558648.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 1878.5, | |
| "completions/max_terminated_length": 1508.0, | |
| "completions/mean_length": 908.8125, | |
| "completions/mean_terminated_length": 537.2784118652344, | |
| "completions/min_length": 247.0, | |
| "completions/min_terminated_length": 247.0, | |
| "entropy": 0.2507998961955309, | |
| "epoch": 0.0056, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7030376954792115, | |
| "kl": 0.00650369533104822, | |
| "learning_rate": 4.2421090301219077e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2592586.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1994.0, | |
| "completions/max_terminated_length": 817.0, | |
| "completions/mean_length": 1162.65625, | |
| "completions/mean_terminated_length": 494.8125, | |
| "completions/min_length": 329.0, | |
| "completions/min_terminated_length": 329.0, | |
| "entropy": 0.2358728414401412, | |
| "epoch": 0.005688888888888889, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0014426004757466125, | |
| "kl": 0.004484620047151111, | |
| "learning_rate": 4.216089318930741e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2634703.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1812.0, | |
| "completions/mean_length": 1705.46875, | |
| "completions/mean_terminated_length": 735.9261474609375, | |
| "completions/min_length": 333.0, | |
| "completions/min_terminated_length": 333.0, | |
| "entropy": 0.27035616524517536, | |
| "epoch": 0.0057777777777777775, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.605428435592805, | |
| "kl": 0.005202807413297705, | |
| "learning_rate": 4.189713222974466e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2694166.0, | |
| "reward": 0.59375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.59375, | |
| "rewards/equation_reward_func/std": 0.497555673122406, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1858.5, | |
| "completions/max_terminated_length": 1547.0, | |
| "completions/mean_length": 928.84375, | |
| "completions/mean_terminated_length": 468.0812683105469, | |
| "completions/min_length": 224.5, | |
| "completions/min_terminated_length": 224.5, | |
| "entropy": 0.2494833106175065, | |
| "epoch": 0.005866666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0032201336301060854, | |
| "kl": 0.0067589654645416886, | |
| "learning_rate": 4.162986219846037e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2728713.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1941.0, | |
| "completions/max_terminated_length": 1046.0, | |
| "completions/mean_length": 1116.59375, | |
| "completions/mean_terminated_length": 614.3923645019531, | |
| "completions/min_length": 370.0, | |
| "completions/min_terminated_length": 370.0, | |
| "entropy": 0.2735691536217928, | |
| "epoch": 0.005955555555555556, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.7950120981284405, | |
| "kl": 0.005726380710257217, | |
| "learning_rate": 4.135913860012219e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2769260.0, | |
| "reward": 0.75, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.3811737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1792.5, | |
| "completions/max_terminated_length": 1133.0, | |
| "completions/mean_length": 1138.1875, | |
| "completions/mean_terminated_length": 565.8125, | |
| "completions/min_length": 289.0, | |
| "completions/min_terminated_length": 289.0, | |
| "entropy": 0.2514611016958952, | |
| "epoch": 0.006044444444444444, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0018004806044451867, | |
| "kl": 0.0059647281595971435, | |
| "learning_rate": 4.10850176566091e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2810538.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2790.5, | |
| "completions/max_terminated_length": 2470.5, | |
| "completions/mean_length": 1291.46875, | |
| "completions/mean_terminated_length": 1078.6041870117188, | |
| "completions/min_length": 469.5, | |
| "completions/min_terminated_length": 469.5, | |
| "entropy": 0.27105455938726664, | |
| "epoch": 0.0061333333333333335, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6507725934734265, | |
| "kl": 0.004865752707701176, | |
| "learning_rate": 4.080755629533566e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2856697.0, | |
| "reward": 0.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1858.0, | |
| "completions/mean_length": 1279.09375, | |
| "completions/mean_terminated_length": 676.388916015625, | |
| "completions/min_length": 355.5, | |
| "completions/min_terminated_length": 355.5, | |
| "entropy": 0.2739897835999727, | |
| "epoch": 0.006222222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7611818739738083, | |
| "kl": 0.006271678488701582, | |
| "learning_rate": 4.052681213742971e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2902516.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.38319888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1764.5, | |
| "completions/max_terminated_length": 1272.5, | |
| "completions/mean_length": 680.40625, | |
| "completions/mean_terminated_length": 613.7375183105469, | |
| "completions/min_length": 218.5, | |
| "completions/min_terminated_length": 218.5, | |
| "entropy": 0.2625574329867959, | |
| "epoch": 0.006311111111111111, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.641499811439406, | |
| "kl": 0.006318398402072489, | |
| "learning_rate": 4.024284348576611e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2929105.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 825.0, | |
| "completions/max_terminated_length": 825.0, | |
| "completions/mean_length": 357.28125, | |
| "completions/mean_terminated_length": 357.28125, | |
| "completions/min_length": 194.5, | |
| "completions/min_terminated_length": 194.5, | |
| "entropy": 0.25569348596036434, | |
| "epoch": 0.0064, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002224877364676125, | |
| "kl": 0.007314969494473189, | |
| "learning_rate": 3.9955709312858744e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2945338.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2148.5, | |
| "completions/max_terminated_length": 2137.5, | |
| "completions/mean_length": 813.75, | |
| "completions/mean_terminated_length": 749.6458740234375, | |
| "completions/min_length": 257.0, | |
| "completions/min_terminated_length": 257.0, | |
| "entropy": 0.26268062368035316, | |
| "epoch": 0.006488888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0020979525232143028, | |
| "kl": 0.006386323366314173, | |
| "learning_rate": 3.9665469248613616e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2976250.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1002.0, | |
| "completions/max_terminated_length": 1002.0, | |
| "completions/mean_length": 431.71875, | |
| "completions/mean_terminated_length": 431.71875, | |
| "completions/min_length": 238.5, | |
| "completions/min_terminated_length": 238.5, | |
| "entropy": 0.26534419134259224, | |
| "epoch": 0.006577777777777778, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0023952318007118166, | |
| "kl": 0.00811207268270664, | |
| "learning_rate": 3.9372183567945314e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2994913.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2330.0, | |
| "completions/max_terminated_length": 2274.0, | |
| "completions/mean_length": 933.59375, | |
| "completions/mean_terminated_length": 796.6964416503906, | |
| "completions/min_length": 304.5, | |
| "completions/min_terminated_length": 304.5, | |
| "entropy": 0.24762420449405909, | |
| "epoch": 0.006666666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0018580747675341872, | |
| "kl": 0.006500544230220839, | |
| "learning_rate": 3.907591317825956e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3029636.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1187.0, | |
| "completions/mean_length": 1713.03125, | |
| "completions/mean_terminated_length": 499.8055725097656, | |
| "completions/min_length": 253.0, | |
| "completions/min_terminated_length": 253.0, | |
| "entropy": 0.27569348085671663, | |
| "epoch": 0.0067555555555555554, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0021576179813316406, | |
| "kl": 0.007041995238978416, | |
| "learning_rate": 3.877671960680443e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3089309.0, | |
| "reward": 0.5, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.5, | |
| "rewards/equation_reward_func/std": 0.5163977742195129, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1730.5, | |
| "completions/max_terminated_length": 1377.0, | |
| "completions/mean_length": 913.4375, | |
| "completions/mean_terminated_length": 442.98126220703125, | |
| "completions/min_length": 237.5, | |
| "completions/min_terminated_length": 237.5, | |
| "entropy": 0.24052318930625916, | |
| "epoch": 0.006844444444444445, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0037383834531385493, | |
| "kl": 0.008585786272305995, | |
| "learning_rate": 3.847466498789282e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3123355.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1781.5, | |
| "completions/mean_length": 1636.40625, | |
| "completions/mean_terminated_length": 516.0249938964844, | |
| "completions/min_length": 218.0, | |
| "completions/min_terminated_length": 218.0, | |
| "entropy": 0.28433873131871223, | |
| "epoch": 0.006933333333333333, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0026989103207334733, | |
| "kl": 0.006938680307939649, | |
| "learning_rate": 3.816981204999882e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3180568.0, | |
| "reward": 0.5625, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.5625, | |
| "rewards/equation_reward_func/std": 0.5081988871097565, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1877.0, | |
| "completions/max_terminated_length": 1717.5, | |
| "completions/mean_length": 1203.28125, | |
| "completions/mean_terminated_length": 857.4750366210938, | |
| "completions/min_length": 471.5, | |
| "completions/min_terminated_length": 471.5, | |
| "entropy": 0.26476599369198084, | |
| "epoch": 0.007022222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6434589123374275, | |
| "kl": 0.00672344581107609, | |
| "learning_rate": 3.786222410273078e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3223945.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 3006.5, | |
| "completions/max_terminated_length": 2928.5, | |
| "completions/mean_length": 1742.5625, | |
| "completions/mean_terminated_length": 1458.9403686523438, | |
| "completions/min_length": 514.0, | |
| "completions/min_terminated_length": 514.0, | |
| "entropy": 0.2928483448922634, | |
| "epoch": 0.0071111111111111115, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0017971739857914673, | |
| "kl": 0.006342295295326039, | |
| "learning_rate": 3.755196502368361e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3284571.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.22201895713806152, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1899.0, | |
| "completions/max_terminated_length": 896.5, | |
| "completions/mean_length": 1095.1875, | |
| "completions/mean_terminated_length": 473.375, | |
| "completions/min_length": 336.0, | |
| "completions/min_terminated_length": 336.0, | |
| "entropy": 0.25843985099345446, | |
| "epoch": 0.0072, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0031502250054640376, | |
| "kl": 0.007663089840207249, | |
| "learning_rate": 3.723909924517314e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3324441.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2010.5, | |
| "completions/max_terminated_length": 1225.5, | |
| "completions/mean_length": 693.65625, | |
| "completions/mean_terminated_length": 541.5401916503906, | |
| "completions/min_length": 196.0, | |
| "completions/min_terminated_length": 196.0, | |
| "entropy": 0.2597637241706252, | |
| "epoch": 0.007288888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7702693435320899, | |
| "kl": 0.008306328963954002, | |
| "learning_rate": 3.692369174085534e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3351494.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 1958.0, | |
| "completions/max_terminated_length": 1272.0, | |
| "completions/mean_length": 1699.03125, | |
| "completions/mean_terminated_length": 1055.90625, | |
| "completions/min_length": 982.5, | |
| "completions/min_terminated_length": 982.5, | |
| "entropy": 0.2800039369612932, | |
| "epoch": 0.007377777777777777, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0027883239668124267, | |
| "kl": 0.00744300993392244, | |
| "learning_rate": 3.6605808012233004e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3410759.0, | |
| "reward": 0.53125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.53125, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2152.0, | |
| "completions/max_terminated_length": 2147.0, | |
| "completions/mean_length": 1513.59375, | |
| "completions/mean_terminated_length": 1326.09375, | |
| "completions/min_length": 522.5, | |
| "completions/min_terminated_length": 522.5, | |
| "entropy": 0.26186632737517357, | |
| "epoch": 0.007466666666666667, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.9198608744208149, | |
| "kl": 0.006979152763960883, | |
| "learning_rate": 3.628551407505292e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3464082.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 692.0, | |
| "completions/max_terminated_length": 692.0, | |
| "completions/mean_length": 430.03125, | |
| "completions/mean_terminated_length": 430.03125, | |
| "completions/min_length": 273.5, | |
| "completions/min_terminated_length": 273.5, | |
| "entropy": 0.23170531447976828, | |
| "epoch": 0.007555555555555556, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002541713449292013, | |
| "kl": 0.008697336044861004, | |
| "learning_rate": 3.5962876445596224e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3482691.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1732.0, | |
| "completions/mean_length": 1689.3125, | |
| "completions/mean_terminated_length": 716.3295593261719, | |
| "completions/min_length": 363.0, | |
| "completions/min_terminated_length": 363.0, | |
| "entropy": 0.2743770433589816, | |
| "epoch": 0.007644444444444444, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6462770111570193, | |
| "kl": 0.00819350325036794, | |
| "learning_rate": 3.563796212686475e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3541653.0, | |
| "reward": 0.59375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.59375, | |
| "rewards/equation_reward_func/std": 0.497555673122406, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2642.5, | |
| "completions/max_terminated_length": 1885.5, | |
| "completions/mean_length": 1243.96875, | |
| "completions/mean_terminated_length": 603.65625, | |
| "completions/min_length": 296.0, | |
| "completions/min_terminated_length": 296.0, | |
| "entropy": 0.2883901707828045, | |
| "epoch": 0.007733333333333333, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0018479443351030222, | |
| "kl": 0.007616912451339886, | |
| "learning_rate": 3.531083859466635e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3586348.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2715.0, | |
| "completions/max_terminated_length": 2480.0, | |
| "completions/mean_length": 1095.03125, | |
| "completions/mean_terminated_length": 626.0375061035156, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "entropy": 0.2639634981751442, | |
| "epoch": 0.007822222222222222, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.006148680335338723, | |
| "kl": 0.008246932789916173, | |
| "learning_rate": 3.498157378360204e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3626229.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2908.5, | |
| "completions/max_terminated_length": 2566.5, | |
| "completions/mean_length": 986.8125, | |
| "completions/mean_terminated_length": 844.5803833007812, | |
| "completions/min_length": 244.5, | |
| "completions/min_terminated_length": 244.5, | |
| "entropy": 0.26056686975061893, | |
| "epoch": 0.007911111111111112, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6854510293518095, | |
| "kl": 0.009517568425508216, | |
| "learning_rate": 3.465023607295784e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3662655.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1047.5, | |
| "completions/max_terminated_length": 1047.5, | |
| "completions/mean_length": 464.34375, | |
| "completions/mean_terminated_length": 464.34375, | |
| "completions/min_length": 237.5, | |
| "completions/min_terminated_length": 237.5, | |
| "entropy": 0.27534451708197594, | |
| "epoch": 0.008, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.2111208036108543, | |
| "kl": 0.010279384237946942, | |
| "learning_rate": 3.4316894272504225e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3682330.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2305.5, | |
| "completions/mean_length": 1692.4375, | |
| "completions/mean_terminated_length": 1048.8803405761719, | |
| "completions/min_length": 294.5, | |
| "completions/min_terminated_length": 294.5, | |
| "entropy": 0.2586697665974498, | |
| "epoch": 0.008088888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6255903792292131, | |
| "kl": 0.008654644683701918, | |
| "learning_rate": 3.398161760820628e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3741408.0, | |
| "reward": 0.65625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.65625, | |
| "rewards/equation_reward_func/std": 0.4597553312778473, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2933.5, | |
| "completions/max_terminated_length": 2582.5, | |
| "completions/mean_length": 1454.15625, | |
| "completions/mean_terminated_length": 1066.7437438964844, | |
| "completions/min_length": 516.5, | |
| "completions/min_terminated_length": 516.5, | |
| "entropy": 0.3029740732163191, | |
| "epoch": 0.008177777777777779, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0022089392124720916, | |
| "kl": 0.008055398386204615, | |
| "learning_rate": 3.364447570784731e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3792829.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2534.5, | |
| "completions/max_terminated_length": 1977.0, | |
| "completions/mean_length": 1022.34375, | |
| "completions/mean_terminated_length": 726.3333435058594, | |
| "completions/min_length": 272.5, | |
| "completions/min_terminated_length": 272.5, | |
| "entropy": 0.25912539288401604, | |
| "epoch": 0.008266666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8681055932592283, | |
| "kl": 0.00938627275172621, | |
| "learning_rate": 3.3305538586569116e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3830440.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2095.5, | |
| "completions/max_terminated_length": 1841.5, | |
| "completions/mean_length": 944.5, | |
| "completions/mean_terminated_length": 740.860595703125, | |
| "completions/min_length": 283.0, | |
| "completions/min_terminated_length": 283.0, | |
| "entropy": 0.251262541860342, | |
| "epoch": 0.008355555555555555, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6350731005119447, | |
| "kl": 0.010139893798623234, | |
| "learning_rate": 3.296487663233168e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3865512.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.90625, | |
| "rewards/equation_reward_func/std": 0.20155644416809082, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 462.5, | |
| "completions/max_terminated_length": 462.5, | |
| "completions/mean_length": 306.125, | |
| "completions/mean_terminated_length": 306.125, | |
| "completions/min_length": 189.0, | |
| "completions/min_terminated_length": 189.0, | |
| "entropy": 0.22235615644603968, | |
| "epoch": 0.008444444444444444, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0031605861188310326, | |
| "kl": 0.01150068684364669, | |
| "learning_rate": 3.2622560591295606e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3880108.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1820.0, | |
| "completions/max_terminated_length": 797.5, | |
| "completions/mean_length": 1109.0625, | |
| "completions/mean_terminated_length": 513.5, | |
| "completions/min_length": 306.0, | |
| "completions/min_terminated_length": 306.0, | |
| "entropy": 0.2766123227775097, | |
| "epoch": 0.008533333333333334, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002693976312764259, | |
| "kl": 0.009999056375818327, | |
| "learning_rate": 3.227866155313002e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3920438.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2416.5, | |
| "completions/mean_length": 1054.34375, | |
| "completions/mean_terminated_length": 680.9667053222656, | |
| "completions/min_length": 280.5, | |
| "completions/min_terminated_length": 280.5, | |
| "entropy": 0.29231622349470854, | |
| "epoch": 0.008622222222222222, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.8717361985224948, | |
| "kl": 0.009583401377312839, | |
| "learning_rate": 3.1933250936249213e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3959009.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.22201896458864212, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.34860680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1548.5, | |
| "completions/max_terminated_length": 1548.5, | |
| "completions/mean_length": 621.3125, | |
| "completions/mean_terminated_length": 621.3125, | |
| "completions/min_length": 273.0, | |
| "completions/min_terminated_length": 273.0, | |
| "entropy": 0.26802380103617907, | |
| "epoch": 0.00871111111111111, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002640439329048923, | |
| "kl": 0.010297555243596435, | |
| "learning_rate": 3.158640047298098e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3983691.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2787.0, | |
| "completions/max_terminated_length": 2773.5, | |
| "completions/mean_length": 894.3125, | |
| "completions/mean_terminated_length": 825.9396057128906, | |
| "completions/min_length": 207.0, | |
| "completions/min_terminated_length": 207.0, | |
| "entropy": 0.260735678486526, | |
| "epoch": 0.0088, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6876160757154334, | |
| "kl": 0.009146726137259975, | |
| "learning_rate": 3.123818219466981e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4017157.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1940.5, | |
| "completions/max_terminated_length": 1940.5, | |
| "completions/mean_length": 574.5, | |
| "completions/mean_terminated_length": 574.5, | |
| "completions/min_length": 222.0, | |
| "completions/min_terminated_length": 222.0, | |
| "entropy": 0.24010000098496675, | |
| "epoch": 0.008888888888888889, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.005430605739914178, | |
| "kl": 0.00986800153623335, | |
| "learning_rate": 3.088866841671789e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4040381.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2159.5, | |
| "completions/mean_length": 1421.6875, | |
| "completions/mean_terminated_length": 817.5857696533203, | |
| "completions/min_length": 277.5, | |
| "completions/min_terminated_length": 277.5, | |
| "entropy": 0.2729024589061737, | |
| "epoch": 0.008977777777777777, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.8441320423470235, | |
| "kl": 0.008688551635714248, | |
| "learning_rate": 3.0537931723567253e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4090771.0, | |
| "reward": 0.75, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.42078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2166.0, | |
| "completions/max_terminated_length": 1986.5, | |
| "completions/mean_length": 1150.1875, | |
| "completions/mean_terminated_length": 619.1840209960938, | |
| "completions/min_length": 263.5, | |
| "completions/min_terminated_length": 263.5, | |
| "entropy": 0.24514910019934177, | |
| "epoch": 0.009066666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00302042576058633, | |
| "kl": 0.010161265439819545, | |
| "learning_rate": 3.01860449536259e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4132473.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 1779.0, | |
| "completions/max_terminated_length": 1766.0, | |
| "completions/mean_length": 1532.96875, | |
| "completions/mean_terminated_length": 1267.1875, | |
| "completions/min_length": 781.5, | |
| "completions/min_terminated_length": 781.5, | |
| "entropy": 0.2567154373973608, | |
| "epoch": 0.009155555555555556, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.003281200405250708, | |
| "kl": 0.008494329609675333, | |
| "learning_rate": 2.983308118414131e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4186376.0, | |
| "reward": 0.65625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.65625, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 928.0, | |
| "completions/max_terminated_length": 928.0, | |
| "completions/mean_length": 469.5, | |
| "completions/mean_terminated_length": 469.5, | |
| "completions/min_length": 262.5, | |
| "completions/min_terminated_length": 262.5, | |
| "entropy": 0.2405125731602311, | |
| "epoch": 0.009244444444444444, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0032161521894528843, | |
| "kl": 0.008923913293983787, | |
| "learning_rate": 2.9479113716024275e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4206216.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2232.5, | |
| "completions/max_terminated_length": 1459.0, | |
| "completions/mean_length": 877.40625, | |
| "completions/mean_terminated_length": 638.1778869628906, | |
| "completions/min_length": 293.0, | |
| "completions/min_terminated_length": 293.0, | |
| "entropy": 0.24397206585854292, | |
| "epoch": 0.009333333333333334, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.9097697460274851, | |
| "kl": 0.010490661457879469, | |
| "learning_rate": 2.912421605862632e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4239101.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.90625, | |
| "rewards/equation_reward_func/std": 0.20155644416809082, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2090.0, | |
| "completions/mean_length": 1876.65625, | |
| "completions/mean_terminated_length": 927.8375244140625, | |
| "completions/min_length": 418.5, | |
| "completions/min_terminated_length": 418.5, | |
| "entropy": 0.25669932272285223, | |
| "epoch": 0.009422222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00271911961262999, | |
| "kl": 0.008709978050319478, | |
| "learning_rate": 2.8768461914473794e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4304026.0, | |
| "reward": 0.5625, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.5625, | |
| "rewards/equation_reward_func/std": 0.5081988871097565, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 908.0, | |
| "completions/max_terminated_length": 908.0, | |
| "completions/mean_length": 414.3125, | |
| "completions/mean_terminated_length": 414.3125, | |
| "completions/min_length": 207.0, | |
| "completions/min_terminated_length": 207.0, | |
| "entropy": 0.25329437758773565, | |
| "epoch": 0.00951111111111111, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003267497300157984, | |
| "kl": 0.01166740502230823, | |
| "learning_rate": 2.8411925163961926e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4322108.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 1865.5, | |
| "completions/max_terminated_length": 1799.5, | |
| "completions/mean_length": 883.4375, | |
| "completions/mean_terminated_length": 608.4166870117188, | |
| "completions/min_length": 250.5, | |
| "completions/min_terminated_length": 250.5, | |
| "entropy": 0.27052732463926077, | |
| "epoch": 0.0096, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.006071708906705738, | |
| "kl": 0.010494905174709857, | |
| "learning_rate": 2.8054679850011825e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4355202.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.22201895713806152, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1109.5, | |
| "completions/max_terminated_length": 1109.5, | |
| "completions/mean_length": 530.875, | |
| "completions/mean_terminated_length": 530.875, | |
| "completions/min_length": 265.5, | |
| "completions/min_terminated_length": 265.5, | |
| "entropy": 0.21879185363650322, | |
| "epoch": 0.00968888888888889, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0024693973812315844, | |
| "kl": 0.010472121328348294, | |
| "learning_rate": 2.769680016269385e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4377110.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1888.5, | |
| "completions/max_terminated_length": 1553.0, | |
| "completions/mean_length": 795.8125, | |
| "completions/mean_terminated_length": 662.4866333007812, | |
| "completions/min_length": 247.0, | |
| "completions/min_terminated_length": 247.0, | |
| "entropy": 0.2686642771586776, | |
| "epoch": 0.009777777777777778, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6128253340962238, | |
| "kl": 0.011848014197312295, | |
| "learning_rate": 2.7338360423820327e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4407376.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 964.5, | |
| "completions/max_terminated_length": 964.5, | |
| "completions/mean_length": 345.875, | |
| "completions/mean_terminated_length": 345.875, | |
| "completions/min_length": 203.0, | |
| "completions/min_terminated_length": 203.0, | |
| "entropy": 0.252207750454545, | |
| "epoch": 0.009866666666666666, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002946676892555618, | |
| "kl": 0.012307984463404864, | |
| "learning_rate": 2.6979435071510956e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4423260.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2080.5, | |
| "completions/max_terminated_length": 2070.0, | |
| "completions/mean_length": 966.1875, | |
| "completions/mean_terminated_length": 777.90869140625, | |
| "completions/min_length": 295.5, | |
| "completions/min_terminated_length": 295.5, | |
| "entropy": 0.2508456828072667, | |
| "epoch": 0.009955555555555556, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.6106400201424906, | |
| "kl": 0.012187447311589494, | |
| "learning_rate": 2.662009864473406e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4459066.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.90625, | |
| "rewards/equation_reward_func/std": 0.20155644416809082, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1453.5, | |
| "completions/max_terminated_length": 1453.5, | |
| "completions/mean_length": 601.1875, | |
| "completions/mean_terminated_length": 601.1875, | |
| "completions/min_length": 232.0, | |
| "completions/min_terminated_length": 232.0, | |
| "entropy": 0.26608254946768284, | |
| "epoch": 0.010044444444444444, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0024192669923231076, | |
| "kl": 0.012214825808769092, | |
| "learning_rate": 2.626042576782687e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4483160.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2300.5, | |
| "completions/max_terminated_length": 1112.0, | |
| "completions/mean_length": 1102.59375, | |
| "completions/mean_terminated_length": 413.6875, | |
| "completions/min_length": 206.0, | |
| "completions/min_terminated_length": 206.0, | |
| "entropy": 0.2653536135330796, | |
| "epoch": 0.010133333333333333, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003141380751052536, | |
| "kl": 0.011402795265894383, | |
| "learning_rate": 2.590049113499809e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4523275.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1841.5, | |
| "completions/max_terminated_length": 1778.0, | |
| "completions/mean_length": 1348.21875, | |
| "completions/mean_terminated_length": 969.0, | |
| "completions/min_length": 446.5, | |
| "completions/min_terminated_length": 446.5, | |
| "entropy": 0.2285262243822217, | |
| "epoch": 0.010222222222222223, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.002841794206765057, | |
| "kl": 0.011010473070200533, | |
| "learning_rate": 2.5540369494815966e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4571314.0, | |
| "reward": 0.75, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2224.5, | |
| "completions/mean_length": 1477.28125, | |
| "completions/mean_terminated_length": 817.2619323730469, | |
| "completions/min_length": 238.5, | |
| "completions/min_terminated_length": 238.5, | |
| "entropy": 0.2778350468724966, | |
| "epoch": 0.010311111111111111, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6689780153086349, | |
| "kl": 0.010737988399341702, | |
| "learning_rate": 2.5180135634685064e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4623483.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.6875, | |
| "rewards/equation_reward_func/std": 0.42898140847682953, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2414.5, | |
| "completions/mean_length": 1444.4375, | |
| "completions/mean_terminated_length": 795.5530700683594, | |
| "completions/min_length": 239.5, | |
| "completions/min_terminated_length": 239.5, | |
| "entropy": 0.2818821109831333, | |
| "epoch": 0.0104, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.9879929334941168, | |
| "kl": 0.011097497888840735, | |
| "learning_rate": 2.4819864365314934e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4674561.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/equation_reward_func/mean": 0.6875, | |
| "rewards/equation_reward_func/std": 0.4787135720252991, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 1913.0, | |
| "completions/max_terminated_length": 1698.0, | |
| "completions/mean_length": 1022.125, | |
| "completions/mean_terminated_length": 678.53125, | |
| "completions/min_length": 270.5, | |
| "completions/min_terminated_length": 270.5, | |
| "entropy": 0.2688545901328325, | |
| "epoch": 0.01048888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6315402563299277, | |
| "kl": 0.01124519360018894, | |
| "learning_rate": 2.445963050518403e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4712149.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1819.5, | |
| "completions/max_terminated_length": 505.5, | |
| "completions/mean_length": 995.8125, | |
| "completions/mean_terminated_length": 311.28125, | |
| "completions/min_length": 228.0, | |
| "completions/min_terminated_length": 228.0, | |
| "entropy": 0.27648669946938753, | |
| "epoch": 0.010577777777777778, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0023729104405685922, | |
| "kl": 0.012640737142646685, | |
| "learning_rate": 2.4099508865001914e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4748831.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2162.5, | |
| "completions/max_terminated_length": 2162.5, | |
| "completions/mean_length": 693.84375, | |
| "completions/mean_terminated_length": 693.84375, | |
| "completions/min_length": 272.5, | |
| "completions/min_terminated_length": 272.5, | |
| "entropy": 0.2688616942614317, | |
| "epoch": 0.010666666666666666, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.004133074831770589, | |
| "kl": 0.01435062033124268, | |
| "learning_rate": 2.3739574232173134e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4775898.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 651.0, | |
| "completions/max_terminated_length": 651.0, | |
| "completions/mean_length": 408.5, | |
| "completions/mean_terminated_length": 408.5, | |
| "completions/min_length": 281.5, | |
| "completions/min_terminated_length": 281.5, | |
| "entropy": 0.23651507124304771, | |
| "epoch": 0.010755555555555556, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0029914986702280033, | |
| "kl": 0.014157850993797183, | |
| "learning_rate": 2.3379901355265936e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4793826.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2794.5, | |
| "completions/max_terminated_length": 2252.5, | |
| "completions/mean_length": 1407.5, | |
| "completions/mean_terminated_length": 984.7250061035156, | |
| "completions/min_length": 348.0, | |
| "completions/min_terminated_length": 348.0, | |
| "entropy": 0.2798434291034937, | |
| "epoch": 0.010844444444444445, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8787443536732897, | |
| "kl": 0.012309474579524249, | |
| "learning_rate": 2.3020564928489041e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4843714.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1956.0, | |
| "completions/mean_length": 1603.625, | |
| "completions/mean_terminated_length": 596.8500061035156, | |
| "completions/min_length": 298.5, | |
| "completions/min_terminated_length": 298.5, | |
| "entropy": 0.2638617567718029, | |
| "epoch": 0.010933333333333333, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7666509207211404, | |
| "kl": 0.011061000754125416, | |
| "learning_rate": 2.2661639576179676e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4899894.0, | |
| "reward": 0.59375, | |
| "reward_std": 0.2041158601641655, | |
| "rewards/equation_reward_func/mean": 0.59375, | |
| "rewards/equation_reward_func/std": 0.5061737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 1990.0, | |
| "completions/max_terminated_length": 1649.0, | |
| "completions/mean_length": 1623.96875, | |
| "completions/mean_terminated_length": 1129.4375, | |
| "completions/min_length": 704.5, | |
| "completions/min_terminated_length": 704.5, | |
| "entropy": 0.2599087553098798, | |
| "epoch": 0.011022222222222221, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0028186538614616394, | |
| "kl": 0.011108465492725372, | |
| "learning_rate": 2.2303199837306153e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4956749.0, | |
| "reward": 0.625, | |
| "reward_std": 0.2177756428718567, | |
| "rewards/equation_reward_func/mean": 0.625, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2025.0, | |
| "completions/max_terminated_length": 1847.5, | |
| "completions/mean_length": 1289.53125, | |
| "completions/mean_terminated_length": 1127.0729370117188, | |
| "completions/min_length": 547.5, | |
| "completions/min_terminated_length": 547.5, | |
| "entropy": 0.2873380035161972, | |
| "epoch": 0.011111111111111112, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.1290088171475576, | |
| "kl": 0.012026124983094633, | |
| "learning_rate": 2.194532014998817e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5002854.0, | |
| "reward": 0.875, | |
| "reward_std": 0.2177756428718567, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1747.0, | |
| "completions/max_terminated_length": 417.5, | |
| "completions/mean_length": 1000.25, | |
| "completions/mean_terminated_length": 308.5, | |
| "completions/min_length": 246.5, | |
| "completions/min_terminated_length": 246.5, | |
| "entropy": 0.2919534966349602, | |
| "epoch": 0.0112, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0025806344488486584, | |
| "kl": 0.013488087366567925, | |
| "learning_rate": 2.1588074836038071e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5039686.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1922.5, | |
| "completions/max_terminated_length": 1082.5, | |
| "completions/mean_length": 1145.96875, | |
| "completions/mean_terminated_length": 525.375, | |
| "completions/min_length": 287.5, | |
| "completions/min_terminated_length": 287.5, | |
| "entropy": 0.2488407287746668, | |
| "epoch": 0.011288888888888888, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.498668519279294, | |
| "kl": 0.014777076430618763, | |
| "learning_rate": 2.1231538085526204e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5081189.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.38319888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1815.0, | |
| "completions/max_terminated_length": 1722.0, | |
| "completions/mean_length": 656.6875, | |
| "completions/mean_terminated_length": 507.6294860839844, | |
| "completions/min_length": 222.0, | |
| "completions/min_terminated_length": 222.0, | |
| "entropy": 0.25390581879764795, | |
| "epoch": 0.011377777777777778, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8200840056513704, | |
| "kl": 0.013640801305882633, | |
| "learning_rate": 2.0875783941373686e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5107027.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 583.0, | |
| "completions/max_terminated_length": 583.0, | |
| "completions/mean_length": 335.8125, | |
| "completions/mean_terminated_length": 335.8125, | |
| "completions/min_length": 205.5, | |
| "completions/min_terminated_length": 205.5, | |
| "entropy": 0.2633366733789444, | |
| "epoch": 0.011466666666666667, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0032217440107119217, | |
| "kl": 0.013921725505497307, | |
| "learning_rate": 2.052088628397572e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5122565.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1858.0, | |
| "completions/max_terminated_length": 1650.0, | |
| "completions/mean_length": 846.8125, | |
| "completions/mean_terminated_length": 789.0354614257812, | |
| "completions/min_length": 302.0, | |
| "completions/min_terminated_length": 302.0, | |
| "entropy": 0.25422646198421717, | |
| "epoch": 0.011555555555555555, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6192911531673088, | |
| "kl": 0.0131575966370292, | |
| "learning_rate": 2.0166918815858688e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5154495.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1515.5, | |
| "completions/max_terminated_length": 1515.5, | |
| "completions/mean_length": 593.78125, | |
| "completions/mean_terminated_length": 593.78125, | |
| "completions/min_length": 263.5, | |
| "completions/min_terminated_length": 263.5, | |
| "entropy": 0.2436890648677945, | |
| "epoch": 0.011644444444444445, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00379025728998696, | |
| "kl": 0.013373794557992369, | |
| "learning_rate": 1.9813955046374102e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5178360.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1448.0, | |
| "completions/max_terminated_length": 1448.0, | |
| "completions/mean_length": 659.90625, | |
| "completions/mean_terminated_length": 659.90625, | |
| "completions/min_length": 318.0, | |
| "completions/min_terminated_length": 318.0, | |
| "entropy": 0.27821108512580395, | |
| "epoch": 0.011733333333333333, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002902300626200292, | |
| "kl": 0.011531222582561895, | |
| "learning_rate": 1.946206827643275e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5204293.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 1972.5, | |
| "completions/max_terminated_length": 1638.5, | |
| "completions/mean_length": 1453.71875, | |
| "completions/mean_terminated_length": 982.8333740234375, | |
| "completions/min_length": 579.5, | |
| "completions/min_terminated_length": 579.5, | |
| "entropy": 0.28553890995681286, | |
| "epoch": 0.011822222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7618530543419798, | |
| "kl": 0.012410368886776268, | |
| "learning_rate": 1.9111331583282103e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5255644.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.6875, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 1782.5, | |
| "completions/max_terminated_length": 1336.0, | |
| "completions/mean_length": 811.6875, | |
| "completions/mean_terminated_length": 521.4375, | |
| "completions/min_length": 265.0, | |
| "completions/min_terminated_length": 265.0, | |
| "entropy": 0.27573296427726746, | |
| "epoch": 0.011911111111111112, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.6613583754850176, | |
| "kl": 0.01382189046125859, | |
| "learning_rate": 1.8761817805330195e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5286434.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.249358132481575, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 833.0, | |
| "completions/max_terminated_length": 833.0, | |
| "completions/mean_length": 389.71875, | |
| "completions/mean_terminated_length": 389.71875, | |
| "completions/min_length": 225.5, | |
| "completions/min_terminated_length": 225.5, | |
| "entropy": 0.2621759483590722, | |
| "epoch": 0.012, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.007007814627563382, | |
| "kl": 0.014533480803947896, | |
| "learning_rate": 1.8413599527019018e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5303721.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2546.5, | |
| "completions/mean_length": 1690.15625, | |
| "completions/mean_terminated_length": 1133.9886474609375, | |
| "completions/min_length": 433.5, | |
| "completions/min_terminated_length": 433.5, | |
| "entropy": 0.26791701279580593, | |
| "epoch": 0.012088888888888889, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.0865010274613978, | |
| "kl": 0.011928621912375093, | |
| "learning_rate": 1.806674906375079e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5362734.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.3471629321575165, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.46296359598636627, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1968.0, | |
| "completions/max_terminated_length": 1741.0, | |
| "completions/mean_length": 1352.90625, | |
| "completions/mean_terminated_length": 1023.9305419921875, | |
| "completions/min_length": 596.0, | |
| "completions/min_terminated_length": 596.0, | |
| "entropy": 0.2802997101098299, | |
| "epoch": 0.012177777777777777, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0028160156857841676, | |
| "kl": 0.011874206480570138, | |
| "learning_rate": 1.7721338446869976e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5410915.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 940.0, | |
| "completions/max_terminated_length": 940.0, | |
| "completions/mean_length": 401.34375, | |
| "completions/mean_terminated_length": 401.34375, | |
| "completions/min_length": 269.5, | |
| "completions/min_terminated_length": 269.5, | |
| "entropy": 0.25402139965444803, | |
| "epoch": 0.012266666666666667, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003056421139936211, | |
| "kl": 0.012792080786311999, | |
| "learning_rate": 1.7377439408704392e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5428606.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2440.5, | |
| "completions/max_terminated_length": 2271.5, | |
| "completions/mean_length": 1083.34375, | |
| "completions/mean_terminated_length": 794.0104370117188, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "entropy": 0.28792093601077795, | |
| "epoch": 0.012355555555555555, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8645122139887629, | |
| "kl": 0.011338822660036385, | |
| "learning_rate": 1.7035123367668323e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5468121.0, | |
| "reward": 0.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1582.5, | |
| "completions/max_terminated_length": 1582.5, | |
| "completions/mean_length": 617.75, | |
| "completions/mean_terminated_length": 617.75, | |
| "completions/min_length": 314.0, | |
| "completions/min_terminated_length": 314.0, | |
| "entropy": 0.26330281142145395, | |
| "epoch": 0.012444444444444444, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003203365010013785, | |
| "kl": 0.01245744532207027, | |
| "learning_rate": 1.6694461413430893e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5492769.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2827.0, | |
| "completions/mean_length": 1535.59375, | |
| "completions/mean_terminated_length": 1023.4583435058594, | |
| "completions/min_length": 277.5, | |
| "completions/min_terminated_length": 277.5, | |
| "entropy": 0.2922232113778591, | |
| "epoch": 0.012533333333333334, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6984483514858193, | |
| "kl": 0.012172643968369812, | |
| "learning_rate": 1.6355524292152684e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5546756.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.2630179077386856, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.46296359598636627, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1831.5, | |
| "completions/max_terminated_length": 1012.5, | |
| "completions/mean_length": 1000.125, | |
| "completions/mean_terminated_length": 435.96875, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "entropy": 0.2578113954514265, | |
| "epoch": 0.012622222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7839498455938455, | |
| "kl": 0.011324702645651996, | |
| "learning_rate": 1.6018382391793722e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5583616.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 960.5, | |
| "completions/max_terminated_length": 960.5, | |
| "completions/mean_length": 461.9375, | |
| "completions/mean_terminated_length": 461.9375, | |
| "completions/min_length": 290.5, | |
| "completions/min_terminated_length": 290.5, | |
| "entropy": 0.2691855514422059, | |
| "epoch": 0.01271111111111111, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003558811604246946, | |
| "kl": 0.012289426987990737, | |
| "learning_rate": 1.5683105727495778e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5603238.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 1837.0, | |
| "completions/max_terminated_length": 1507.0, | |
| "completions/mean_length": 641.28125, | |
| "completions/mean_terminated_length": 487.58038330078125, | |
| "completions/min_length": 268.5, | |
| "completions/min_terminated_length": 268.5, | |
| "entropy": 0.2638643169775605, | |
| "epoch": 0.0128, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00331423594586153, | |
| "kl": 0.013494796236045659, | |
| "learning_rate": 1.5349763927042168e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5628583.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 1766.5, | |
| "completions/max_terminated_length": 1478.5, | |
| "completions/mean_length": 854.34375, | |
| "completions/mean_terminated_length": 577.59375, | |
| "completions/min_length": 258.5, | |
| "completions/min_terminated_length": 258.5, | |
| "entropy": 0.25728738587349653, | |
| "epoch": 0.012888888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0032389845802218683, | |
| "kl": 0.013152709056157619, | |
| "learning_rate": 1.501842621639796e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5660738.0, | |
| "reward": 0.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1958.0, | |
| "completions/max_terminated_length": 1507.5, | |
| "completions/mean_length": 1080.875, | |
| "completions/mean_terminated_length": 558.1840209960938, | |
| "completions/min_length": 266.0, | |
| "completions/min_terminated_length": 266.0, | |
| "entropy": 0.2784293610602617, | |
| "epoch": 0.012977777777777777, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6177267044875747, | |
| "kl": 0.012857195571996272, | |
| "learning_rate": 1.4689161405333652e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5700190.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.59375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1906.5, | |
| "completions/mean_length": 2204.4375, | |
| "completions/mean_terminated_length": 1260.9166717529297, | |
| "completions/min_length": 714.5, | |
| "completions/min_terminated_length": 714.5, | |
| "entropy": 0.27723210770636797, | |
| "epoch": 0.013066666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7415118165249563, | |
| "kl": 0.011066248407587409, | |
| "learning_rate": 1.4362037873135255e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5775620.0, | |
| "reward": 0.28125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.28125, | |
| "rewards/equation_reward_func/std": 0.38319888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2425.0, | |
| "completions/max_terminated_length": 2049.0, | |
| "completions/mean_length": 1567.46875, | |
| "completions/mean_terminated_length": 1257.46875, | |
| "completions/min_length": 493.0, | |
| "completions/min_terminated_length": 493.0, | |
| "entropy": 0.27383615262806416, | |
| "epoch": 0.013155555555555556, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.5944976991687992, | |
| "kl": 0.011643942736554891, | |
| "learning_rate": 1.403712355440378e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5830659.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1815.5, | |
| "completions/max_terminated_length": 1656.5, | |
| "completions/mean_length": 897.5625, | |
| "completions/mean_terminated_length": 427.38751220703125, | |
| "completions/min_length": 231.5, | |
| "completions/min_terminated_length": 231.5, | |
| "entropy": 0.2819562489166856, | |
| "epoch": 0.013244444444444444, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.004012844388284618, | |
| "kl": 0.01389238511910662, | |
| "learning_rate": 1.371448592494707e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5864173.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1141.0, | |
| "completions/max_terminated_length": 1141.0, | |
| "completions/mean_length": 413.8125, | |
| "completions/mean_terminated_length": 413.8125, | |
| "completions/min_length": 193.0, | |
| "completions/min_terminated_length": 193.0, | |
| "entropy": 0.255883046425879, | |
| "epoch": 0.013333333333333334, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003103920043368309, | |
| "kl": 0.011319157638354227, | |
| "learning_rate": 1.3394191987766996e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5882255.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1775.0, | |
| "completions/max_terminated_length": 1775.0, | |
| "completions/mean_length": 624.5625, | |
| "completions/mean_terminated_length": 624.5625, | |
| "completions/min_length": 262.5, | |
| "completions/min_terminated_length": 262.5, | |
| "entropy": 0.2605795245617628, | |
| "epoch": 0.013422222222222223, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00298458314438669, | |
| "kl": 0.013170374266337603, | |
| "learning_rate": 1.3076308259144652e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5907073.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 828.5, | |
| "completions/max_terminated_length": 828.5, | |
| "completions/mean_length": 405.78125, | |
| "completions/mean_terminated_length": 405.78125, | |
| "completions/min_length": 200.5, | |
| "completions/min_terminated_length": 200.5, | |
| "entropy": 0.2499817917123437, | |
| "epoch": 0.013511111111111111, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0038414119612760764, | |
| "kl": 0.013066469924524426, | |
| "learning_rate": 1.2760900754826858e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5924850.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1068.5, | |
| "completions/mean_length": 1562.1875, | |
| "completions/mean_terminated_length": 495.4886474609375, | |
| "completions/min_length": 200.0, | |
| "completions/min_terminated_length": 200.0, | |
| "entropy": 0.27882583532482386, | |
| "epoch": 0.0136, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6649376114013511, | |
| "kl": 0.013456764572765678, | |
| "learning_rate": 1.2448034976316394e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5979680.0, | |
| "reward": 0.59375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.59375, | |
| "rewards/equation_reward_func/std": 0.497555673122406, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2961.0, | |
| "completions/mean_length": 1758.9375, | |
| "completions/mean_terminated_length": 1347.0, | |
| "completions/min_length": 305.0, | |
| "completions/min_terminated_length": 305.0, | |
| "entropy": 0.28104583360254765, | |
| "epoch": 0.01368888888888889, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6774759505887689, | |
| "kl": 0.011262226558756083, | |
| "learning_rate": 1.213777589726922e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6040846.0, | |
| "reward": 0.625, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/equation_reward_func/mean": 0.625, | |
| "rewards/equation_reward_func/std": 0.4955305755138397, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1761.0, | |
| "completions/max_terminated_length": 1554.5, | |
| "completions/mean_length": 648.5, | |
| "completions/mean_terminated_length": 579.08544921875, | |
| "completions/min_length": 213.5, | |
| "completions/min_terminated_length": 213.5, | |
| "entropy": 0.26855653896927834, | |
| "epoch": 0.013777777777777778, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0036323414110195915, | |
| "kl": 0.013169703772291541, | |
| "learning_rate": 1.183018795000118e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6066398.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2624.5, | |
| "completions/max_terminated_length": 2549.5, | |
| "completions/mean_length": 1283.78125, | |
| "completions/mean_terminated_length": 907.1875, | |
| "completions/min_length": 379.0, | |
| "completions/min_terminated_length": 379.0, | |
| "entropy": 0.27201704028993845, | |
| "epoch": 0.013866666666666666, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.002257188322713532, | |
| "kl": 0.011060904245823622, | |
| "learning_rate": 1.1525335012107188e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6112335.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2512.0, | |
| "completions/max_terminated_length": 1334.0, | |
| "completions/mean_length": 1107.65625, | |
| "completions/mean_terminated_length": 527.8958282470703, | |
| "completions/min_length": 273.5, | |
| "completions/min_terminated_length": 273.5, | |
| "entropy": 0.2640869989991188, | |
| "epoch": 0.013955555555555556, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0025896482570957236, | |
| "kl": 0.012067800795193762, | |
| "learning_rate": 1.1223280393195566e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6152588.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 588.5, | |
| "completions/max_terminated_length": 588.5, | |
| "completions/mean_length": 321.96875, | |
| "completions/mean_terminated_length": 321.96875, | |
| "completions/min_length": 226.5, | |
| "completions/min_terminated_length": 226.5, | |
| "entropy": 0.25738913659006357, | |
| "epoch": 0.014044444444444444, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0028261410702532233, | |
| "kl": 0.010619041451718658, | |
| "learning_rate": 1.0924086821740436e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6167683.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1067.0, | |
| "completions/mean_length": 1269.25, | |
| "completions/mean_terminated_length": 529.7416687011719, | |
| "completions/min_length": 304.5, | |
| "completions/min_terminated_length": 304.5, | |
| "entropy": 0.2710387809202075, | |
| "epoch": 0.014133333333333333, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7163321341525302, | |
| "kl": 0.011698946240358055, | |
| "learning_rate": 1.0627816432054689e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6213211.0, | |
| "reward": 0.71875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.71875, | |
| "rewards/equation_reward_func/std": 0.38319888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2626.0, | |
| "completions/max_terminated_length": 2082.0, | |
| "completions/mean_length": 1737.0625, | |
| "completions/mean_terminated_length": 1189.125, | |
| "completions/min_length": 716.0, | |
| "completions/min_terminated_length": 716.0, | |
| "entropy": 0.30040886998176575, | |
| "epoch": 0.014222222222222223, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.002674510389367636, | |
| "kl": 0.010718741046730429, | |
| "learning_rate": 1.0334530751386386e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6273685.0, | |
| "reward": 0.625, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.625, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 1868.0, | |
| "completions/max_terminated_length": 1647.5, | |
| "completions/mean_length": 1601.4375, | |
| "completions/mean_terminated_length": 1256.625, | |
| "completions/min_length": 943.5, | |
| "completions/min_terminated_length": 943.5, | |
| "entropy": 0.2695089690387249, | |
| "epoch": 0.014311111111111111, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.8756831429213762, | |
| "kl": 0.012713596923276782, | |
| "learning_rate": 1.0044290687141255e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6329771.0, | |
| "reward": 0.625, | |
| "reward_std": 0.2177756428718567, | |
| "rewards/equation_reward_func/mean": 0.625, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2595.0, | |
| "completions/max_terminated_length": 1631.5, | |
| "completions/mean_length": 926.0625, | |
| "completions/mean_terminated_length": 625.90625, | |
| "completions/min_length": 293.5, | |
| "completions/min_terminated_length": 293.5, | |
| "entropy": 0.26283825282007456, | |
| "epoch": 0.0144, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.140546805041934, | |
| "kl": 0.012503173289587721, | |
| "learning_rate": 9.757156514233892e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6364277.0, | |
| "reward": 0.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2831.5, | |
| "completions/max_terminated_length": 1721.5, | |
| "completions/mean_length": 1427.34375, | |
| "completions/mean_terminated_length": 810.53125, | |
| "completions/min_length": 366.5, | |
| "completions/min_terminated_length": 366.5, | |
| "entropy": 0.28494337759912014, | |
| "epoch": 0.01448888888888889, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0033609260382285277, | |
| "kl": 0.012984584202058613, | |
| "learning_rate": 9.473187862570289e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6414800.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2215.5, | |
| "completions/max_terminated_length": 2132.5, | |
| "completions/mean_length": 1407.46875, | |
| "completions/mean_terminated_length": 1155.7875366210938, | |
| "completions/min_length": 375.0, | |
| "completions/min_terminated_length": 375.0, | |
| "entropy": 0.2730356818065047, | |
| "epoch": 0.014577777777777778, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.0014872222355606, | |
| "kl": 0.010669022798538208, | |
| "learning_rate": 9.192443704664344e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6464727.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2444.0, | |
| "completions/max_terminated_length": 2240.5, | |
| "completions/mean_length": 1488.28125, | |
| "completions/mean_terminated_length": 1031.9375, | |
| "completions/min_length": 493.0, | |
| "completions/min_terminated_length": 493.0, | |
| "entropy": 0.28267885465174913, | |
| "epoch": 0.014666666666666666, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0028641374969204676, | |
| "kl": 0.013437119021546096, | |
| "learning_rate": 8.914982343390895e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6517240.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1373.0, | |
| "completions/max_terminated_length": 1373.0, | |
| "completions/mean_length": 464.25, | |
| "completions/mean_terminated_length": 464.25, | |
| "completions/min_length": 226.0, | |
| "completions/min_terminated_length": 226.0, | |
| "entropy": 0.27360291965305805, | |
| "epoch": 0.014755555555555555, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0031912292292720448, | |
| "kl": 0.011018336488632485, | |
| "learning_rate": 8.640861399877805e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6536952.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2387.0, | |
| "completions/max_terminated_length": 1884.5, | |
| "completions/mean_length": 622.5625, | |
| "completions/mean_terminated_length": 544.7062683105469, | |
| "completions/min_length": 297.0, | |
| "completions/min_terminated_length": 297.0, | |
| "entropy": 0.270102976821363, | |
| "epoch": 0.014844444444444445, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7085645926713029, | |
| "kl": 0.013982411939650774, | |
| "learning_rate": 8.370137801539634e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6561706.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 670.5, | |
| "completions/max_terminated_length": 670.5, | |
| "completions/mean_length": 347.03125, | |
| "completions/mean_terminated_length": 347.03125, | |
| "completions/min_length": 238.5, | |
| "completions/min_terminated_length": 238.5, | |
| "entropy": 0.2622284069657326, | |
| "epoch": 0.014933333333333333, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002958625037089681, | |
| "kl": 0.012627416843315586, | |
| "learning_rate": 8.102867770255337e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6577603.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2189.0, | |
| "completions/mean_length": 2281.78125, | |
| "completions/mean_terminated_length": 1114.7222595214844, | |
| "completions/min_length": 488.0, | |
| "completions/min_terminated_length": 488.0, | |
| "entropy": 0.2811094503849745, | |
| "epoch": 0.015022222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.632578356189579, | |
| "kl": 0.012064321548677981, | |
| "learning_rate": 7.839106810692589e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6655484.0, | |
| "reward": 0.3125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.3125, | |
| "rewards/equation_reward_func/std": 0.42898140847682953, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2268.0, | |
| "completions/max_terminated_length": 912.5, | |
| "completions/mean_length": 1051.40625, | |
| "completions/mean_terminated_length": 356.5, | |
| "completions/min_length": 236.0, | |
| "completions/min_terminated_length": 236.0, | |
| "entropy": 0.26706848200410604, | |
| "epoch": 0.015111111111111112, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0028140986587067572, | |
| "kl": 0.013069912674836814, | |
| "learning_rate": 7.57890969878093e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6693953.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 839.0, | |
| "completions/max_terminated_length": 839.0, | |
| "completions/mean_length": 403.90625, | |
| "completions/mean_terminated_length": 403.90625, | |
| "completions/min_length": 236.0, | |
| "completions/min_terminated_length": 236.0, | |
| "entropy": 0.24486979842185974, | |
| "epoch": 0.0152, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.004798301159813211, | |
| "kl": 0.013417759502772242, | |
| "learning_rate": 7.322330470336313e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6711686.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2934.0, | |
| "completions/max_terminated_length": 2934.0, | |
| "completions/mean_length": 634.4375, | |
| "completions/mean_terminated_length": 634.4375, | |
| "completions/min_length": 230.5, | |
| "completions/min_terminated_length": 230.5, | |
| "entropy": 0.25119878351688385, | |
| "epoch": 0.015288888888888888, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0033714467626650446, | |
| "kl": 0.013649999862536788, | |
| "learning_rate": 7.069422409839363e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6736836.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2214.5, | |
| "completions/max_terminated_length": 1771.5, | |
| "completions/mean_length": 1115.0, | |
| "completions/mean_terminated_length": 599.6007080078125, | |
| "completions/min_length": 261.5, | |
| "completions/min_terminated_length": 261.5, | |
| "entropy": 0.25783967413008213, | |
| "epoch": 0.015377777777777778, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0036622729451236693, | |
| "kl": 0.01107509626308456, | |
| "learning_rate": 6.820238039369647e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6777332.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 3035.0, | |
| "completions/mean_length": 1394.3125, | |
| "completions/mean_terminated_length": 829.5874633789062, | |
| "completions/min_length": 227.5, | |
| "completions/min_terminated_length": 227.5, | |
| "entropy": 0.274350737221539, | |
| "epoch": 0.015466666666666667, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6737440602776203, | |
| "kl": 0.01305691129527986, | |
| "learning_rate": 6.574829107698238e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6826806.0, | |
| "reward": 0.75, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.44091323018074036, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1940.5, | |
| "completions/max_terminated_length": 1940.5, | |
| "completions/mean_length": 704.875, | |
| "completions/mean_terminated_length": 704.875, | |
| "completions/min_length": 318.5, | |
| "completions/min_terminated_length": 318.5, | |
| "entropy": 0.26101984456181526, | |
| "epoch": 0.015555555555555555, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0023647808678910177, | |
| "kl": 0.012259377690497786, | |
| "learning_rate": 6.333246579540971e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6854226.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 1845.0, | |
| "completions/max_terminated_length": 1726.5, | |
| "completions/mean_length": 845.90625, | |
| "completions/mean_terminated_length": 646.5505065917969, | |
| "completions/min_length": 236.5, | |
| "completions/min_terminated_length": 236.5, | |
| "entropy": 0.24565229751169682, | |
| "epoch": 0.015644444444444443, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7264834384366701, | |
| "kl": 0.012497128162067384, | |
| "learning_rate": 6.095540624974435e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6886135.0, | |
| "reward": 0.90625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.90625, | |
| "rewards/equation_reward_func/std": 0.20155644416809082, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1843.5, | |
| "completions/max_terminated_length": 749.0, | |
| "completions/mean_length": 499.5, | |
| "completions/mean_terminated_length": 417.7875061035156, | |
| "completions/min_length": 233.0, | |
| "completions/min_terminated_length": 233.0, | |
| "entropy": 0.2693540593609214, | |
| "epoch": 0.015733333333333332, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0028934413742537947, | |
| "kl": 0.011744226852897555, | |
| "learning_rate": 5.861760609017002e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6906975.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1108.5, | |
| "completions/mean_length": 1465.5, | |
| "completions/mean_terminated_length": 465.375, | |
| "completions/min_length": 206.5, | |
| "completions/min_terminated_length": 206.5, | |
| "entropy": 0.2571452846750617, | |
| "epoch": 0.015822222222222224, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.002685926338613922, | |
| "kl": 0.012025278876535594, | |
| "learning_rate": 5.63195508137711e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6958719.0, | |
| "reward": 0.625, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.625, | |
| "rewards/equation_reward_func/std": 0.4818056970834732, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 390.5, | |
| "completions/max_terminated_length": 390.5, | |
| "completions/mean_length": 270.09375, | |
| "completions/mean_terminated_length": 270.09375, | |
| "completions/min_length": 181.0, | |
| "completions/min_terminated_length": 181.0, | |
| "entropy": 0.2511430708691478, | |
| "epoch": 0.015911111111111112, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0031781465152464555, | |
| "kl": 0.012126730172894895, | |
| "learning_rate": 5.4061717663707843e-08, | |
| "loss": 0.0, | |
| "num_tokens": 6972154.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2784.0, | |
| "completions/max_terminated_length": 2210.5, | |
| "completions/mean_length": 1096.84375, | |
| "completions/mean_terminated_length": 723.633544921875, | |
| "completions/min_length": 265.0, | |
| "completions/min_terminated_length": 265.0, | |
| "entropy": 0.2540301540866494, | |
| "epoch": 0.016, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.002946265639527776, | |
| "kl": 0.01303106703562662, | |
| "learning_rate": 5.1844575530106265e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7012133.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1216.0, | |
| "completions/max_terminated_length": 1216.0, | |
| "completions/mean_length": 568.03125, | |
| "completions/mean_terminated_length": 568.03125, | |
| "completions/min_length": 254.0, | |
| "completions/min_terminated_length": 254.0, | |
| "entropy": 0.275842048227787, | |
| "epoch": 0.01608888888888889, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003142009163240758, | |
| "kl": 0.012856011278927326, | |
| "learning_rate": 4.9668584852682134e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7035126.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 1867.5, | |
| "completions/max_terminated_length": 1599.0, | |
| "completions/mean_length": 858.125, | |
| "completions/mean_terminated_length": 567.03125, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "entropy": 0.26426226925104856, | |
| "epoch": 0.016177777777777777, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.8508128064563231, | |
| "kl": 0.012756991724018008, | |
| "learning_rate": 4.753419752512072e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7067402.0, | |
| "reward": 0.84375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.84375, | |
| "rewards/equation_reward_func/std": 0.23935678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 892.0, | |
| "completions/max_terminated_length": 892.0, | |
| "completions/mean_length": 380.28125, | |
| "completions/mean_terminated_length": 380.28125, | |
| "completions/min_length": 186.0, | |
| "completions/min_terminated_length": 186.0, | |
| "entropy": 0.2523947898298502, | |
| "epoch": 0.016266666666666665, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0031902645325912102, | |
| "kl": 0.011268698493950069, | |
| "learning_rate": 4.5441856801230525e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7084363.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1857.5, | |
| "completions/mean_length": 1984.6875, | |
| "completions/mean_terminated_length": 1133.2500610351562, | |
| "completions/min_length": 629.0, | |
| "completions/min_terminated_length": 629.0, | |
| "entropy": 0.26920368149876595, | |
| "epoch": 0.016355555555555557, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.8209724135025811, | |
| "kl": 0.011945405742153525, | |
| "learning_rate": 4.3391997202891825e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7152753.0, | |
| "reward": 0.53125, | |
| "reward_std": 0.3608423173427582, | |
| "rewards/equation_reward_func/mean": 0.53125, | |
| "rewards/equation_reward_func/std": 0.5061737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2582.5, | |
| "completions/mean_length": 1663.5, | |
| "completions/mean_terminated_length": 924.6394348144531, | |
| "completions/min_length": 361.5, | |
| "completions/min_terminated_length": 361.5, | |
| "entropy": 0.2724431995302439, | |
| "epoch": 0.016444444444444446, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.002375931629879322, | |
| "kl": 0.011462729802588001, | |
| "learning_rate": 4.1385044429817966e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7210897.0, | |
| "reward": 0.59375, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.59375, | |
| "rewards/equation_reward_func/std": 0.497555673122406, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2543.5, | |
| "completions/max_terminated_length": 2048.5, | |
| "completions/mean_length": 1225.5, | |
| "completions/mean_terminated_length": 771.75, | |
| "completions/min_length": 356.5, | |
| "completions/min_terminated_length": 356.5, | |
| "entropy": 0.2872797902673483, | |
| "epoch": 0.016533333333333334, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0036013988233853434, | |
| "kl": 0.013051826565060765, | |
| "learning_rate": 3.942141527114978e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7254921.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1786.5, | |
| "completions/max_terminated_length": 1704.5, | |
| "completions/mean_length": 676.3125, | |
| "completions/mean_terminated_length": 607.3291931152344, | |
| "completions/min_length": 258.0, | |
| "completions/min_terminated_length": 258.0, | |
| "entropy": 0.2516833422705531, | |
| "epoch": 0.016622222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0032409740755153012, | |
| "kl": 0.013011956005357206, | |
| "learning_rate": 3.7501517518899486e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7281419.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2043.5, | |
| "completions/mean_length": 1477.40625, | |
| "completions/mean_terminated_length": 752.5909118652344, | |
| "completions/min_length": 290.5, | |
| "completions/min_terminated_length": 290.5, | |
| "entropy": 0.26651648059487343, | |
| "epoch": 0.01671111111111111, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7197186929191686, | |
| "kl": 0.013189784425776452, | |
| "learning_rate": 3.562574988326342e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7333592.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.2587745785713196, | |
| "rewards/equation_reward_func/mean": 0.6875, | |
| "rewards/equation_reward_func/std": 0.4787135720252991, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1807.5, | |
| "completions/max_terminated_length": 543.5, | |
| "completions/mean_length": 1050.15625, | |
| "completions/mean_terminated_length": 384.28125, | |
| "completions/min_length": 293.0, | |
| "completions/min_terminated_length": 293.0, | |
| "entropy": 0.28364898823201656, | |
| "epoch": 0.0168, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002229579900740854, | |
| "kl": 0.010572923871222883, | |
| "learning_rate": 3.379450190982114e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7372061.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1590.0, | |
| "completions/max_terminated_length": 1590.0, | |
| "completions/mean_length": 460.46875, | |
| "completions/mean_terminated_length": 460.46875, | |
| "completions/min_length": 288.0, | |
| "completions/min_terminated_length": 288.0, | |
| "entropy": 0.27252288814634085, | |
| "epoch": 0.016888888888888887, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0024168024524627665, | |
| "kl": 0.01104135494097136, | |
| "learning_rate": 3.2008153898637255e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7391644.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 587.5, | |
| "completions/max_terminated_length": 587.5, | |
| "completions/mean_length": 321.25, | |
| "completions/mean_terminated_length": 321.25, | |
| "completions/min_length": 195.5, | |
| "completions/min_terminated_length": 195.5, | |
| "entropy": 0.26600412372499704, | |
| "epoch": 0.01697777777777778, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.5467716903361135, | |
| "kl": 0.013095528644043952, | |
| "learning_rate": 3.026707682528365e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7406716.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1082.5, | |
| "completions/mean_length": 1442.59375, | |
| "completions/mean_terminated_length": 554.8702087402344, | |
| "completions/min_length": 234.5, | |
| "completions/min_terminated_length": 234.5, | |
| "entropy": 0.2811947613954544, | |
| "epoch": 0.017066666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0023612765147716713, | |
| "kl": 0.012181783735286444, | |
| "learning_rate": 2.8571632263797745e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7457711.0, | |
| "reward": 0.65625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.65625, | |
| "rewards/equation_reward_func/std": 0.4597553312778473, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2230.5, | |
| "completions/max_terminated_length": 2110.5, | |
| "completions/mean_length": 863.5625, | |
| "completions/mean_terminated_length": 798.5021057128906, | |
| "completions/min_length": 211.5, | |
| "completions/min_terminated_length": 211.5, | |
| "entropy": 0.25374304968863726, | |
| "epoch": 0.017155555555555556, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 2.300851451670029, | |
| "kl": 0.013115475769154727, | |
| "learning_rate": 2.6922172311593884e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7490137.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 536.0, | |
| "completions/max_terminated_length": 536.0, | |
| "completions/mean_length": 329.40625, | |
| "completions/mean_terminated_length": 329.40625, | |
| "completions/min_length": 239.0, | |
| "completions/min_terminated_length": 239.0, | |
| "entropy": 0.2687660912051797, | |
| "epoch": 0.017244444444444444, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003236188807490885, | |
| "kl": 0.013288187910802662, | |
| "learning_rate": 2.5319039516341844e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7505510.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2899.5, | |
| "completions/max_terminated_length": 2529.0, | |
| "completions/mean_length": 2049.125, | |
| "completions/mean_terminated_length": 1644.6687622070312, | |
| "completions/min_length": 784.0, | |
| "completions/min_terminated_length": 784.0, | |
| "entropy": 0.2617297563701868, | |
| "epoch": 0.017333333333333333, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.7057223749958403, | |
| "kl": 0.01200129883363843, | |
| "learning_rate": 2.3762566804829742e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7575994.0, | |
| "reward": 0.625, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/equation_reward_func/mean": 0.625, | |
| "rewards/equation_reward_func/std": 0.36435678601264954, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2027.0, | |
| "completions/max_terminated_length": 1576.0, | |
| "completions/mean_length": 882.0625, | |
| "completions/mean_terminated_length": 762.65625, | |
| "completions/min_length": 260.0, | |
| "completions/min_terminated_length": 260.0, | |
| "entropy": 0.2711814185604453, | |
| "epoch": 0.01742222222222222, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.002678885528367909, | |
| "kl": 0.01261142164003104, | |
| "learning_rate": 2.2253077413823458e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7609036.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1152.0, | |
| "completions/max_terminated_length": 1152.0, | |
| "completions/mean_length": 482.75, | |
| "completions/mean_terminated_length": 482.75, | |
| "completions/min_length": 216.0, | |
| "completions/min_terminated_length": 216.0, | |
| "entropy": 0.27470015175640583, | |
| "epoch": 0.017511111111111113, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0034936681657812877, | |
| "kl": 0.011282922176178545, | |
| "learning_rate": 2.0790884822939836e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7629316.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 583.5, | |
| "completions/max_terminated_length": 583.5, | |
| "completions/mean_length": 353.03125, | |
| "completions/mean_terminated_length": 353.03125, | |
| "completions/min_length": 218.5, | |
| "completions/min_terminated_length": 218.5, | |
| "entropy": 0.2574036065489054, | |
| "epoch": 0.0176, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0034114881771838503, | |
| "kl": 0.01028229494113475, | |
| "learning_rate": 1.9376292689545158e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7645429.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2915.5, | |
| "completions/mean_length": 2265.375, | |
| "completions/mean_terminated_length": 1267.5556030273438, | |
| "completions/min_length": 499.0, | |
| "completions/min_terminated_length": 499.0, | |
| "entropy": 0.28383847046643496, | |
| "epoch": 0.01768888888888889, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 1.1894129573830192, | |
| "kl": 0.01126208424102515, | |
| "learning_rate": 1.800959478569422e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7722809.0, | |
| "reward": 0.34375, | |
| "reward_std": 0.2041158676147461, | |
| "rewards/equation_reward_func/mean": 0.34375, | |
| "rewards/equation_reward_func/std": 0.4597553312778473, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1687.5, | |
| "completions/mean_length": 1554.78125, | |
| "completions/mean_terminated_length": 674.7259521484375, | |
| "completions/min_length": 325.5, | |
| "completions/min_terminated_length": 325.5, | |
| "entropy": 0.29043113626539707, | |
| "epoch": 0.017777777777777778, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.9429187670404626, | |
| "kl": 0.012492028530687094, | |
| "learning_rate": 1.6691074937121407e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7777394.0, | |
| "reward": 0.65625, | |
| "reward_std": 0.1293872892856598, | |
| "rewards/equation_reward_func/mean": 0.65625, | |
| "rewards/equation_reward_func/std": 0.4597553312778473, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1728.0, | |
| "completions/max_terminated_length": 383.5, | |
| "completions/mean_length": 983.375, | |
| "completions/mean_terminated_length": 290.9375, | |
| "completions/min_length": 235.0, | |
| "completions/min_terminated_length": 235.0, | |
| "entropy": 0.26652571372687817, | |
| "epoch": 0.017866666666666666, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00423530355082865, | |
| "kl": 0.013721562922000885, | |
| "learning_rate": 1.5421006964298377e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7813678.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2300.0, | |
| "completions/mean_length": 1527.1875, | |
| "completions/mean_terminated_length": 1120.86669921875, | |
| "completions/min_length": 399.0, | |
| "completions/min_terminated_length": 399.0, | |
| "entropy": 0.27129553351551294, | |
| "epoch": 0.017955555555555554, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 2.832976146674529, | |
| "kl": 0.010713240539189428, | |
| "learning_rate": 1.4199654625568575e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7867468.0, | |
| "reward": 0.75, | |
| "reward_std": 0.3514062389731407, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.3811737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1719.0, | |
| "completions/max_terminated_length": 1388.5, | |
| "completions/mean_length": 1190.53125, | |
| "completions/mean_terminated_length": 902.3624877929688, | |
| "completions/min_length": 531.5, | |
| "completions/min_terminated_length": 531.5, | |
| "entropy": 0.26110014878213406, | |
| "epoch": 0.018044444444444443, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.9105665973454747, | |
| "kl": 0.012254673813004047, | |
| "learning_rate": 1.302727156237224e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7910397.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.249358132481575, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1916.0, | |
| "completions/mean_length": 1700.84375, | |
| "completions/mean_terminated_length": 626.5249938964844, | |
| "completions/min_length": 287.0, | |
| "completions/min_terminated_length": 287.0, | |
| "entropy": 0.26668123714625835, | |
| "epoch": 0.018133333333333335, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0029355418790128285, | |
| "kl": 0.013148852216545492, | |
| "learning_rate": 1.1904101246571874e-08, | |
| "loss": 0.0, | |
| "num_tokens": 7969656.0, | |
| "reward": 0.5625, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.5625, | |
| "rewards/equation_reward_func/std": 0.5081988871097565, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2229.0, | |
| "completions/max_terminated_length": 2178.5, | |
| "completions/mean_length": 1085.9375, | |
| "completions/mean_terminated_length": 977.763427734375, | |
| "completions/min_length": 255.0, | |
| "completions/min_terminated_length": 255.0, | |
| "entropy": 0.2600689213722944, | |
| "epoch": 0.018222222222222223, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.5613127220073695, | |
| "kl": 0.01210535824066028, | |
| "learning_rate": 1.0830376929889612e-08, | |
| "loss": 0.0, | |
| "num_tokens": 8009222.0, | |
| "reward": 0.9375, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.9375, | |
| "rewards/equation_reward_func/std": 0.17078252136707306, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1955.0, | |
| "completions/max_terminated_length": 1225.0, | |
| "completions/mean_length": 1088.0625, | |
| "completions/mean_terminated_length": 514.1111145019531, | |
| "completions/min_length": 269.0, | |
| "completions/min_terminated_length": 269.0, | |
| "entropy": 0.2509449180215597, | |
| "epoch": 0.01831111111111111, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002559214424379042, | |
| "kl": 0.012904529517982155, | |
| "learning_rate": 9.806321595467598e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8048896.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.59375, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2563.0, | |
| "completions/mean_length": 2233.84375, | |
| "completions/mean_terminated_length": 1334.3055572509766, | |
| "completions/min_length": 761.0, | |
| "completions/min_terminated_length": 761.0, | |
| "entropy": 0.2827332355082035, | |
| "epoch": 0.0184, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.7520508296235054, | |
| "kl": 0.011386299796868116, | |
| "learning_rate": 8.832147911560173e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8125267.0, | |
| "reward": 0.375, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/equation_reward_func/mean": 0.375, | |
| "rewards/equation_reward_func/std": 0.457730233669281, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2780.0, | |
| "completions/max_terminated_length": 2169.0, | |
| "completions/mean_length": 1257.53125, | |
| "completions/mean_terminated_length": 1037.7291870117188, | |
| "completions/min_length": 378.0, | |
| "completions/min_terminated_length": 378.0, | |
| "entropy": 0.2801125952973962, | |
| "epoch": 0.018488888888888888, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.7066233355374787, | |
| "kl": 0.012001977243926376, | |
| "learning_rate": 7.908058187368726e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8170364.0, | |
| "reward": 0.875, | |
| "reward_std": 0.13363061845302582, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 1795.5, | |
| "completions/max_terminated_length": 1576.5, | |
| "completions/mean_length": 998.3125, | |
| "completions/mean_terminated_length": 767.65625, | |
| "completions/min_length": 270.0, | |
| "completions/min_terminated_length": 270.0, | |
| "entropy": 0.2634145403280854, | |
| "epoch": 0.018577777777777776, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0032855917738229988, | |
| "kl": 0.01313964050495997, | |
| "learning_rate": 7.0342443310273665e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8207150.0, | |
| "reward": 0.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.22360680997371674, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2799.0, | |
| "completions/max_terminated_length": 2799.0, | |
| "completions/mean_length": 983.03125, | |
| "completions/mean_terminated_length": 983.03125, | |
| "completions/min_length": 244.5, | |
| "completions/min_terminated_length": 244.5, | |
| "entropy": 0.27339703403413296, | |
| "epoch": 0.018666666666666668, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.003186100415722168, | |
| "kl": 0.010260674695018679, | |
| "learning_rate": 6.210887809749099e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8243487.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1876.5, | |
| "completions/max_terminated_length": 1254.5, | |
| "completions/mean_length": 997.0, | |
| "completions/mean_terminated_length": 453.1388854980469, | |
| "completions/min_length": 257.5, | |
| "completions/min_terminated_length": 257.5, | |
| "entropy": 0.25085126888006926, | |
| "epoch": 0.018755555555555557, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.003471742651114961, | |
| "kl": 0.011486861592857167, | |
| "learning_rate": 5.4381596121399476e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8280215.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1553.0, | |
| "completions/mean_length": 1449.65625, | |
| "completions/mean_terminated_length": 612.8303833007812, | |
| "completions/min_length": 223.0, | |
| "completions/min_terminated_length": 223.0, | |
| "entropy": 0.2599188946187496, | |
| "epoch": 0.018844444444444445, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.0021310158987967853, | |
| "kl": 0.010764601000119, | |
| "learning_rate": 4.716220212689332e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8331460.0, | |
| "reward": 0.6875, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.6875, | |
| "rewards/equation_reward_func/std": 0.42898140847682953, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1283.0, | |
| "completions/max_terminated_length": 1283.0, | |
| "completions/mean_length": 482.65625, | |
| "completions/mean_terminated_length": 482.65625, | |
| "completions/min_length": 246.0, | |
| "completions/min_terminated_length": 246.0, | |
| "entropy": 0.2782833958044648, | |
| "epoch": 0.018933333333333333, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0030275165486116904, | |
| "kl": 0.013003175088670105, | |
| "learning_rate": 4.045219538443778e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8351729.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2238.5, | |
| "completions/max_terminated_length": 927.5, | |
| "completions/mean_length": 1134.78125, | |
| "completions/mean_terminated_length": 433.71875, | |
| "completions/min_length": 205.0, | |
| "completions/min_terminated_length": 205.0, | |
| "entropy": 0.2606539400294423, | |
| "epoch": 0.01902222222222222, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0025799788396094247, | |
| "kl": 0.012180730293039232, | |
| "learning_rate": 3.4252969378714134e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8392898.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2623.5, | |
| "completions/max_terminated_length": 1766.0, | |
| "completions/mean_length": 1437.03125, | |
| "completions/mean_terminated_length": 993.0659790039062, | |
| "completions/min_length": 503.5, | |
| "completions/min_terminated_length": 503.5, | |
| "entropy": 0.2920260410755873, | |
| "epoch": 0.01911111111111111, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 1.1090578401338118, | |
| "kl": 0.011862305458635092, | |
| "learning_rate": 2.856581151922943e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8443763.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1883.0, | |
| "completions/mean_length": 1990.8125, | |
| "completions/mean_terminated_length": 1052.0625, | |
| "completions/min_length": 680.5, | |
| "completions/min_terminated_length": 680.5, | |
| "entropy": 0.2943479251116514, | |
| "epoch": 0.0192, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 1.1633775097626482, | |
| "kl": 0.010395180608611554, | |
| "learning_rate": 2.339190287295678e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8512357.0, | |
| "reward": 0.53125, | |
| "reward_std": 0.35564958304166794, | |
| "rewards/equation_reward_func/mean": 0.53125, | |
| "rewards/equation_reward_func/std": 0.5143726766109467, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 1847.5, | |
| "completions/max_terminated_length": 1103.5, | |
| "completions/mean_length": 923.84375, | |
| "completions/mean_terminated_length": 471.8937683105469, | |
| "completions/min_length": 240.0, | |
| "completions/min_terminated_length": 240.0, | |
| "entropy": 0.25271155778318644, | |
| "epoch": 0.01928888888888889, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.003008452213961997, | |
| "kl": 0.012103472923627123, | |
| "learning_rate": 1.8732317919060715e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8546760.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 2304.5, | |
| "completions/mean_length": 1056.8125, | |
| "completions/mean_terminated_length": 770.851318359375, | |
| "completions/min_length": 248.0, | |
| "completions/min_terminated_length": 248.0, | |
| "entropy": 0.2532212445512414, | |
| "epoch": 0.01937777777777778, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6794073285446092, | |
| "kl": 0.01284879056038335, | |
| "learning_rate": 1.4588024325756788e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8585450.0, | |
| "reward": 0.875, | |
| "reward_std": 0.2177756354212761, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.3265564441680908, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2099.5, | |
| "completions/max_terminated_length": 1801.0, | |
| "completions/mean_length": 1114.53125, | |
| "completions/mean_terminated_length": 680.0187683105469, | |
| "completions/min_length": 322.0, | |
| "completions/min_terminated_length": 322.0, | |
| "entropy": 0.26436334289610386, | |
| "epoch": 0.019466666666666667, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6266343864555947, | |
| "kl": 0.011212630255613476, | |
| "learning_rate": 1.0959882749354277e-09, | |
| "loss": 0.0, | |
| "num_tokens": 8625971.0, | |
| "reward": 0.8125, | |
| "reward_std": 0.1157275140285492, | |
| "rewards/equation_reward_func/mean": 0.8125, | |
| "rewards/equation_reward_func/std": 0.25, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1318.0, | |
| "completions/max_terminated_length": 1318.0, | |
| "completions/mean_length": 444.9375, | |
| "completions/mean_terminated_length": 444.9375, | |
| "completions/min_length": 207.0, | |
| "completions/min_terminated_length": 207.0, | |
| "entropy": 0.2647299263626337, | |
| "epoch": 0.019555555555555555, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0035374009167824442, | |
| "kl": 0.012473908253014088, | |
| "learning_rate": 7.848646655519986e-10, | |
| "loss": 0.0, | |
| "num_tokens": 8644993.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 1.0, | |
| "rewards/equation_reward_func/std": 0.0, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2224.5, | |
| "completions/max_terminated_length": 846.0, | |
| "completions/mean_length": 1158.34375, | |
| "completions/mean_terminated_length": 457.5625, | |
| "completions/min_length": 250.5, | |
| "completions/min_terminated_length": 250.5, | |
| "entropy": 0.28921834006905556, | |
| "epoch": 0.019644444444444444, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0027064951159668604, | |
| "kl": 0.012806050304789096, | |
| "learning_rate": 5.254962162804799e-10, | |
| "loss": 0.0, | |
| "num_tokens": 8686924.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2069.5, | |
| "completions/max_terminated_length": 682.0, | |
| "completions/mean_length": 1083.59375, | |
| "completions/mean_terminated_length": 380.3125, | |
| "completions/min_length": 213.0, | |
| "completions/min_terminated_length": 213.0, | |
| "entropy": 0.2702910928055644, | |
| "epoch": 0.019733333333333332, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0027219806874225795, | |
| "kl": 0.011863501626066864, | |
| "learning_rate": 3.1793679084632375e-10, | |
| "loss": 0.0, | |
| "num_tokens": 8726423.0, | |
| "reward": 0.75, | |
| "reward_std": 0.0, | |
| "rewards/equation_reward_func/mean": 0.75, | |
| "rewards/equation_reward_func/std": 0.25819888710975647, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 1817.5, | |
| "completions/max_terminated_length": 1292.0, | |
| "completions/mean_length": 990.65625, | |
| "completions/mean_terminated_length": 457.5138854980469, | |
| "completions/min_length": 284.5, | |
| "completions/min_terminated_length": 284.5, | |
| "entropy": 0.25941435527056456, | |
| "epoch": 0.019822222222222224, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.00358339852977573, | |
| "kl": 0.01235124742379412, | |
| "learning_rate": 1.6222949365926608e-10, | |
| "loss": 0.0, | |
| "num_tokens": 8762980.0, | |
| "reward": 0.78125, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.78125, | |
| "rewards/equation_reward_func/std": 0.2561737895011902, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 1766.5, | |
| "completions/max_terminated_length": 1612.0, | |
| "completions/mean_length": 730.78125, | |
| "completions/mean_terminated_length": 667.4208374023438, | |
| "completions/min_length": 225.0, | |
| "completions/min_terminated_length": 225.0, | |
| "entropy": 0.27090085577219725, | |
| "epoch": 0.019911111111111112, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.6441581681704772, | |
| "kl": 0.013042959035374224, | |
| "learning_rate": 5.84066608615985e-11, | |
| "loss": 0.0, | |
| "num_tokens": 8791165.0, | |
| "reward": 0.96875, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/equation_reward_func/mean": 0.96875, | |
| "rewards/equation_reward_func/std": 0.125, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 3072.0, | |
| "completions/max_terminated_length": 1963.0, | |
| "completions/mean_length": 1123.75, | |
| "completions/mean_terminated_length": 845.4285888671875, | |
| "completions/min_length": 334.0, | |
| "completions/min_terminated_length": 334.0, | |
| "entropy": 0.24659618083387613, | |
| "epoch": 0.02, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.6981668381022528, | |
| "kl": 0.014014697342645377, | |
| "learning_rate": 6.489853613067531e-12, | |
| "loss": 0.0, | |
| "num_tokens": 8831965.0, | |
| "reward": 0.875, | |
| "reward_std": 0.2314550280570984, | |
| "rewards/equation_reward_func/mean": 0.875, | |
| "rewards/equation_reward_func/std": 0.3415650427341461, | |
| "rewards/format_reward_func/mean": 0.0, | |
| "rewards/format_reward_func/std": 0.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "step": 450, | |
| "total_flos": 0.0, | |
| "train_loss": 8.708548251913978e-06, | |
| "train_runtime": 15573.3321, | |
| "train_samples_per_second": 0.462, | |
| "train_steps_per_second": 0.029 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 450, | |
| "num_input_tokens_seen": 8831965, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |