qwen-3-4b-instruct-r1-countdown / trainer_state.json
LlameUser's picture
Model save
70ca4d9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.02,
"eval_steps": 500,
"global_step": 450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1577.0,
"completions/max_terminated_length": 1577.0,
"completions/mean_length": 577.25,
"completions/mean_terminated_length": 577.25,
"completions/min_length": 236.5,
"completions/min_terminated_length": 236.5,
"entropy": 0.22603079956024885,
"epoch": 8.888888888888889e-05,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 3.571428571428571e-08,
"loss": 0.0,
"num_tokens": 23312.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1521.0,
"completions/max_terminated_length": 1521.0,
"completions/mean_length": 474.9375,
"completions/mean_terminated_length": 474.9375,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"entropy": 0.24615928065031767,
"epoch": 0.00017777777777777779,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0010758156719471042,
"kl": 0.0003032498079846846,
"learning_rate": 1.0714285714285713e-07,
"loss": 0.0,
"num_tokens": 43302.0,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2886.0,
"completions/mean_length": 1106.625,
"completions/mean_terminated_length": 903.5262451171875,
"completions/min_length": 253.5,
"completions/min_terminated_length": 253.5,
"entropy": 0.27944554202258587,
"epoch": 0.0002666666666666667,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.1026674172522337,
"kl": 0.0005087878689664649,
"learning_rate": 1.7857142857142858e-07,
"loss": 0.0,
"num_tokens": 83522.0,
"reward": 0.84375,
"reward_std": 0.22201896458864212,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.34860680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2156.5,
"completions/mean_length": 1165.78125,
"completions/mean_terminated_length": 659.8333435058594,
"completions/min_length": 292.5,
"completions/min_terminated_length": 292.5,
"entropy": 0.2345265755429864,
"epoch": 0.00035555555555555557,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.5848473823308938,
"kl": 0.0005193246015551267,
"learning_rate": 2.5e-07,
"loss": 0.0,
"num_tokens": 125675.0,
"reward": 0.75,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.3811737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2664.0,
"completions/max_terminated_length": 2224.0,
"completions/mean_length": 739.375,
"completions/mean_terminated_length": 666.3250122070312,
"completions/min_length": 248.0,
"completions/min_terminated_length": 248.0,
"entropy": 0.24358350411057472,
"epoch": 0.00044444444444444447,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0005102182872665691,
"kl": 0.0005887709885428194,
"learning_rate": 3.2142857142857145e-07,
"loss": 0.0,
"num_tokens": 154151.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1164.5,
"completions/max_terminated_length": 1164.5,
"completions/mean_length": 522.90625,
"completions/mean_terminated_length": 522.90625,
"completions/min_length": 252.0,
"completions/min_terminated_length": 252.0,
"entropy": 0.23033921141177416,
"epoch": 0.0005333333333333334,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0003883087974099818,
"kl": 0.0004703981048805872,
"learning_rate": 3.928571428571428e-07,
"loss": 0.0,
"num_tokens": 175748.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1743.5,
"completions/max_terminated_length": 1717.0,
"completions/mean_length": 1200.6875,
"completions/mean_terminated_length": 820.0625,
"completions/min_length": 469.5,
"completions/min_terminated_length": 469.5,
"entropy": 0.22865951620042324,
"epoch": 0.0006222222222222223,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.9029769583901943,
"kl": 0.000541954672371503,
"learning_rate": 4.6428571428571427e-07,
"loss": 0.0,
"num_tokens": 219058.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1865.5,
"completions/max_terminated_length": 524.0,
"completions/mean_length": 1023.0625,
"completions/mean_terminated_length": 318.9375,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"entropy": 0.2518942877650261,
"epoch": 0.0007111111111111111,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.2651085166655804,
"kl": 0.0006022736051818356,
"learning_rate": 4.999935101463869e-07,
"loss": 0.0,
"num_tokens": 256604.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.38319888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1280.5,
"completions/max_terminated_length": 1280.5,
"completions/mean_length": 584.65625,
"completions/mean_terminated_length": 584.65625,
"completions/min_length": 280.5,
"completions/min_terminated_length": 280.5,
"entropy": 0.23922867327928543,
"epoch": 0.0008,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0003891599078429636,
"kl": 0.0005218808764766436,
"learning_rate": 4.999415933391384e-07,
"loss": 0.0,
"num_tokens": 280153.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2230.5,
"completions/max_terminated_length": 1331.5,
"completions/mean_length": 1207.5,
"completions/mean_terminated_length": 657.40625,
"completions/min_length": 431.5,
"completions/min_terminated_length": 431.5,
"entropy": 0.23088860977441072,
"epoch": 0.0008888888888888889,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0002444108443410019,
"kl": 0.0005470474952744553,
"learning_rate": 4.998377705063407e-07,
"loss": 0.0,
"num_tokens": 323689.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1031.0,
"completions/max_terminated_length": 1031.0,
"completions/mean_length": 529.09375,
"completions/mean_terminated_length": 529.09375,
"completions/min_length": 231.0,
"completions/min_terminated_length": 231.0,
"entropy": 0.22655892837792635,
"epoch": 0.0009777777777777777,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0007743500195651646,
"kl": 0.0006699018595099915,
"learning_rate": 4.996820632091536e-07,
"loss": 0.0,
"num_tokens": 345444.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 1943.5,
"completions/max_terminated_length": 1827.0,
"completions/mean_length": 1000.34375,
"completions/mean_terminated_length": 648.9517211914062,
"completions/min_length": 242.5,
"completions/min_terminated_length": 242.5,
"entropy": 0.2282683216035366,
"epoch": 0.0010666666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.001017224455435619,
"kl": 0.0007511216499551665,
"learning_rate": 4.994745037837194e-07,
"loss": 0.0,
"num_tokens": 382311.0,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2529.0,
"completions/mean_length": 2014.625,
"completions/mean_terminated_length": 1184.2785949707031,
"completions/min_length": 604.0,
"completions/min_terminated_length": 604.0,
"entropy": 0.27375217340886593,
"epoch": 0.0011555555555555555,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0008308273702719758,
"kl": 0.0006844787167210598,
"learning_rate": 4.992151353344481e-07,
"loss": 0.0,
"num_tokens": 451651.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.4375,
"rewards/equation_reward_func/std": 0.5081988871097565,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 700.0,
"completions/max_terminated_length": 700.0,
"completions/mean_length": 453.9375,
"completions/mean_terminated_length": 453.9375,
"completions/min_length": 254.0,
"completions/min_terminated_length": 254.0,
"entropy": 0.23610753938555717,
"epoch": 0.0012444444444444445,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0008685447557227168,
"kl": 0.0007076808778947452,
"learning_rate": 4.989040117250646e-07,
"loss": 0.0,
"num_tokens": 471057.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2293.0,
"completions/mean_length": 1640.375,
"completions/mean_terminated_length": 526.888916015625,
"completions/min_length": 243.0,
"completions/min_terminated_length": 243.0,
"entropy": 0.23777490202337503,
"epoch": 0.0013333333333333333,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6274088265459657,
"kl": 0.0006857724620203953,
"learning_rate": 4.985411975674243e-07,
"loss": 0.0,
"num_tokens": 528445.0,
"reward": 0.53125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.53125,
"rewards/equation_reward_func/std": 0.5143726766109467,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2285.5,
"completions/max_terminated_length": 1055.5,
"completions/mean_length": 1233.78125,
"completions/mean_terminated_length": 601.4375,
"completions/min_length": 342.0,
"completions/min_terminated_length": 342.0,
"entropy": 0.24325580801814795,
"epoch": 0.0014222222222222223,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.000655337400503361,
"kl": 0.0007609008825966157,
"learning_rate": 4.981267682080939e-07,
"loss": 0.0,
"num_tokens": 572822.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2631.5,
"completions/mean_length": 1312.71875,
"completions/mean_terminated_length": 913.6030578613281,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"entropy": 0.23677545227110386,
"epoch": 0.001511111111111111,
"frac_reward_zero_std": 0.5,
"grad_norm": 10.794765092710524,
"kl": 0.000754591055738274,
"learning_rate": 4.976608097127043e-07,
"loss": 0.0,
"num_tokens": 619725.0,
"reward": 0.8125,
"reward_std": 0.2177756354212761,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.36435678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2004.0,
"completions/mean_length": 1334.03125,
"completions/mean_terminated_length": 739.835693359375,
"completions/min_length": 296.5,
"completions/min_terminated_length": 296.5,
"entropy": 0.26550517696887255,
"epoch": 0.0016,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6416140180865778,
"kl": 0.0007265451895364095,
"learning_rate": 4.97143418848077e-07,
"loss": 0.0,
"num_tokens": 667294.0,
"reward": 0.75,
"reward_std": 0.2587745785713196,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.44091323018074036,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 1726.5,
"completions/max_terminated_length": 1495.0,
"completions/mean_length": 878.875,
"completions/mean_terminated_length": 694.1370239257812,
"completions/min_length": 368.5,
"completions/min_terminated_length": 368.5,
"entropy": 0.22233000118285418,
"epoch": 0.0016888888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7522096940055117,
"kl": 0.0009702143179310951,
"learning_rate": 4.965747030621286e-07,
"loss": 0.0,
"num_tokens": 700266.0,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.90625,
"rewards/equation_reward_func/std": 0.20155644416809082,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1813.5,
"completions/max_terminated_length": 1028.0,
"completions/mean_length": 949.8125,
"completions/mean_terminated_length": 372.1666717529297,
"completions/min_length": 232.5,
"completions/min_terminated_length": 232.5,
"entropy": 0.2340000979602337,
"epoch": 0.0017777777777777779,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7589222336313214,
"kl": 0.0008960766499512829,
"learning_rate": 4.959547804615562e-07,
"loss": 0.0,
"num_tokens": 735476.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 1754.5,
"completions/max_terminated_length": 1719.5,
"completions/mean_length": 1365.96875,
"completions/mean_terminated_length": 838.2083740234375,
"completions/min_length": 378.0,
"completions/min_terminated_length": 378.0,
"entropy": 0.26818372309207916,
"epoch": 0.0018666666666666666,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0008688646141334615,
"kl": 0.0008683820233272854,
"learning_rate": 4.952837797873106e-07,
"loss": 0.0,
"num_tokens": 784019.0,
"reward": 0.6875,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.6875,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2572.0,
"completions/max_terminated_length": 2570.5,
"completions/mean_length": 1079.21875,
"completions/mean_terminated_length": 873.141845703125,
"completions/min_length": 297.5,
"completions/min_terminated_length": 297.5,
"entropy": 0.2423563925549388,
"epoch": 0.0019555555555555554,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7020501913144348,
"kl": 0.0009873026720015332,
"learning_rate": 4.9456184038786e-07,
"loss": 0.0,
"num_tokens": 823410.0,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1686.5,
"completions/mean_length": 1232.625,
"completions/mean_terminated_length": 719.4333801269531,
"completions/min_length": 300.5,
"completions/min_terminated_length": 300.5,
"entropy": 0.25999774504452944,
"epoch": 0.0020444444444444447,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0005542284098518003,
"kl": 0.001003881188808009,
"learning_rate": 4.937891121902508e-07,
"loss": 0.0,
"num_tokens": 867702.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.38319888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1775.5,
"completions/mean_length": 1823.84375,
"completions/mean_terminated_length": 1082.8750610351562,
"completions/min_length": 487.0,
"completions/min_terminated_length": 487.0,
"entropy": 0.31278051622211933,
"epoch": 0.0021333333333333334,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.9767853720165567,
"kl": 0.0008782508339209016,
"learning_rate": 4.929657556689726e-07,
"loss": 0.0,
"num_tokens": 930961.0,
"reward": 0.6875,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.6875,
"rewards/equation_reward_func/std": 0.42898140847682953,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1420.0,
"completions/max_terminated_length": 1420.0,
"completions/mean_length": 628.59375,
"completions/mean_terminated_length": 628.59375,
"completions/min_length": 332.0,
"completions/min_terminated_length": 332.0,
"entropy": 0.23106891848146915,
"epoch": 0.0022222222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0008790964938972672,
"kl": 0.0011074417707277462,
"learning_rate": 4.920919418126312e-07,
"loss": 0.0,
"num_tokens": 955956.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1836.0,
"completions/max_terminated_length": 1599.0,
"completions/mean_length": 1298.78125,
"completions/mean_terminated_length": 968.2257080078125,
"completions/min_length": 341.0,
"completions/min_terminated_length": 341.0,
"entropy": 0.23789576161652803,
"epoch": 0.002311111111111111,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0015075554431588705,
"kl": 0.0011714456777554005,
"learning_rate": 4.911678520884398e-07,
"loss": 0.0,
"num_tokens": 1002381.0,
"reward": 0.78125,
"reward_std": 0.2630178928375244,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 1992.5,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 1640.78125,
"completions/mean_terminated_length": 1111.7083740234375,
"completions/min_length": 332.0,
"completions/min_terminated_length": 332.0,
"entropy": 0.26475983764976263,
"epoch": 0.0024,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.5139488912304642,
"kl": 0.0012290262056922074,
"learning_rate": 4.901936784045324e-07,
"loss": 0.0,
"num_tokens": 1059734.0,
"reward": 0.5625,
"reward_std": 0.2177756354212761,
"rewards/equation_reward_func/mean": 0.5625,
"rewards/equation_reward_func/std": 0.3265564441680908,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2003.5,
"completions/max_terminated_length": 739.5,
"completions/mean_length": 1126.65625,
"completions/mean_terminated_length": 461.5625,
"completions/min_length": 268.0,
"completions/min_terminated_length": 268.0,
"entropy": 0.2474461616948247,
"epoch": 0.002488888888888889,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0012532198813457072,
"kl": 0.0015458752604899928,
"learning_rate": 4.891696230701103e-07,
"loss": 0.0,
"num_tokens": 1100603.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1490.5,
"completions/max_terminated_length": 1490.5,
"completions/mean_length": 529.59375,
"completions/mean_terminated_length": 529.59375,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"entropy": 0.24761256389319897,
"epoch": 0.002577777777777778,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0013617152892329606,
"kl": 0.0018745916750049219,
"learning_rate": 4.880958987534282e-07,
"loss": 0.0,
"num_tokens": 1122326.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 1818.0,
"completions/max_terminated_length": 860.0,
"completions/mean_length": 781.25,
"completions/mean_terminated_length": 365.4545440673828,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"entropy": 0.26473226584494114,
"epoch": 0.0026666666666666666,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0029793463244017996,
"kl": 0.002165006830182392,
"learning_rate": 4.869727284376277e-07,
"loss": 0.0,
"num_tokens": 1152150.0,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2081.0,
"completions/max_terminated_length": 1650.5,
"completions/mean_length": 921.78125,
"completions/mean_terminated_length": 636.7291870117188,
"completions/min_length": 239.5,
"completions/min_terminated_length": 239.5,
"entropy": 0.25862254202365875,
"epoch": 0.0027555555555555554,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7791104007321142,
"kl": 0.0020706086870632134,
"learning_rate": 4.858003453744314e-07,
"loss": 0.0,
"num_tokens": 1186463.0,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1726.5,
"completions/max_terminated_length": 1477.0,
"completions/mean_length": 979.03125,
"completions/mean_terminated_length": 442.2430725097656,
"completions/min_length": 233.5,
"completions/min_terminated_length": 233.5,
"entropy": 0.2315123314037919,
"epoch": 0.0028444444444444446,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.5664425570824678,
"kl": 0.0022492043499369174,
"learning_rate": 4.845789930357016e-07,
"loss": 0.0,
"num_tokens": 1222616.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2144.5,
"completions/max_terminated_length": 1759.0,
"completions/mean_length": 916.84375,
"completions/mean_terminated_length": 718.3966674804688,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"entropy": 0.2546883439645171,
"epoch": 0.0029333333333333334,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0018449164314454892,
"kl": 0.0023287632793653756,
"learning_rate": 4.833089250628786e-07,
"loss": 0.0,
"num_tokens": 1256779.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2946.0,
"completions/mean_length": 1932.3125,
"completions/mean_terminated_length": 1132.9555969238281,
"completions/min_length": 373.5,
"completions/min_terminated_length": 373.5,
"entropy": 0.2742412742227316,
"epoch": 0.003022222222222222,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.0264964672382928,
"kl": 0.0020347888057585806,
"learning_rate": 4.819904052143058e-07,
"loss": 0.0,
"num_tokens": 1323517.0,
"reward": 0.59375,
"reward_std": 0.3377464786171913,
"rewards/equation_reward_func/mean": 0.59375,
"rewards/equation_reward_func/std": 0.5061737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1004.0,
"completions/max_terminated_length": 1004.0,
"completions/mean_length": 432.4375,
"completions/mean_terminated_length": 432.4375,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"entropy": 0.24551625549793243,
"epoch": 0.003111111111111111,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0034440763335736547,
"kl": 0.004841565096285194,
"learning_rate": 4.806237073104548e-07,
"loss": 0.0,
"num_tokens": 1342163.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 1816.5,
"completions/max_terminated_length": 1451.0,
"completions/mean_length": 742.375,
"completions/mean_terminated_length": 606.3526916503906,
"completions/min_length": 237.0,
"completions/min_terminated_length": 237.0,
"entropy": 0.2519435351714492,
"epoch": 0.0032,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.032328735168464,
"kl": 0.0035425843234406784,
"learning_rate": 4.792091151770602e-07,
"loss": 0.0,
"num_tokens": 1370711.0,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1044.5,
"completions/mean_length": 1702.125,
"completions/mean_terminated_length": 484.8958435058594,
"completions/min_length": 306.5,
"completions/min_terminated_length": 306.5,
"entropy": 0.24912292044609785,
"epoch": 0.003288888888888889,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0017951204099864006,
"kl": 0.0034496651496738195,
"learning_rate": 4.777469225861765e-07,
"loss": 0.0,
"num_tokens": 1430067.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.5,
"rewards/equation_reward_func/std": 0.5163977742195129,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 914.5,
"completions/mean_length": 2169.8125,
"completions/mean_terminated_length": 333.0833435058594,
"completions/min_length": 1697.0,
"completions/min_terminated_length": 161.0,
"entropy": 0.25269814021885395,
"epoch": 0.0033777777777777777,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7495021801377318,
"kl": 0.0026823820953723043,
"learning_rate": 4.762374331951703e-07,
"loss": 0.0,
"num_tokens": 1504389.0,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.375,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 1744.5,
"completions/max_terminated_length": 672.0,
"completions/mean_length": 1152.46875,
"completions/mean_terminated_length": 474.77679443359375,
"completions/min_length": 300.5,
"completions/min_terminated_length": 300.5,
"entropy": 0.2373593281954527,
"epoch": 0.0034666666666666665,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0032166698633040733,
"kl": 0.0036603061016649008,
"learning_rate": 4.7468096048365814e-07,
"loss": 0.0,
"num_tokens": 1546092.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2471.5,
"completions/max_terminated_length": 2177.5,
"completions/mean_length": 1190.25,
"completions/mean_terminated_length": 821.2301330566406,
"completions/min_length": 291.5,
"completions/min_terminated_length": 291.5,
"entropy": 0.26422596722841263,
"epoch": 0.0035555555555555557,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.001992271528938085,
"kl": 0.004376799814053811,
"learning_rate": 4.730778276884061e-07,
"loss": 0.0,
"num_tokens": 1588988.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2363.0,
"completions/mean_length": 1536.46875,
"completions/mean_terminated_length": 701.8055725097656,
"completions/min_length": 234.0,
"completions/min_terminated_length": 234.0,
"entropy": 0.28139131516218185,
"epoch": 0.0036444444444444445,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7357119490341348,
"kl": 0.003145620590657927,
"learning_rate": 4.7142836773620227e-07,
"loss": 0.0,
"num_tokens": 1643003.0,
"reward": 0.59375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.59375,
"rewards/equation_reward_func/std": 0.497555673122406,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2767.5,
"completions/mean_length": 2251.09375,
"completions/mean_terminated_length": 1607.1363830566406,
"completions/min_length": 1232.5,
"completions/min_terminated_length": 1232.5,
"entropy": 0.27658269740641117,
"epoch": 0.0037333333333333333,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0015005186459580411,
"kl": 0.0028125419485149905,
"learning_rate": 4.6973292317471635e-07,
"loss": 0.0,
"num_tokens": 1719918.0,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.3125,
"rewards/equation_reward_func/std": 0.42898140847682953,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2763.5,
"completions/mean_length": 2389.09375,
"completions/mean_terminated_length": 1365.8055725097656,
"completions/min_length": 718.0,
"completions/min_terminated_length": 718.0,
"entropy": 0.27335013449192047,
"epoch": 0.003822222222222222,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.5794927896037598,
"kl": 0.0027770966989919543,
"learning_rate": 4.679918461013627e-07,
"loss": 0.0,
"num_tokens": 1801257.0,
"reward": 0.3125,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.3125,
"rewards/equation_reward_func/std": 0.3811737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 513.0625,
"completions/mean_terminated_length": 513.0625,
"completions/min_length": 294.5,
"completions/min_terminated_length": 294.5,
"entropy": 0.23741111066192389,
"epoch": 0.003911111111111111,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0017861260868517065,
"kl": 0.0039054618537193164,
"learning_rate": 4.6620549809017885e-07,
"loss": 0.0,
"num_tokens": 1822475.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2744.0,
"completions/max_terminated_length": 1381.0,
"completions/mean_length": 1148.5625,
"completions/mean_terminated_length": 445.875,
"completions/min_length": 207.5,
"completions/min_terminated_length": 207.5,
"entropy": 0.2619064189493656,
"epoch": 0.004,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7985547598733307,
"kl": 0.003670994978165254,
"learning_rate": 4.643742501167366e-07,
"loss": 0.0,
"num_tokens": 1864045.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.38319888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1842.0,
"completions/max_terminated_length": 1402.0,
"completions/mean_length": 724.15625,
"completions/mean_terminated_length": 657.5062866210938,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"entropy": 0.2663749074563384,
"epoch": 0.004088888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6611432476993873,
"kl": 0.004101754348084796,
"learning_rate": 4.624984824811006e-07,
"loss": 0.0,
"num_tokens": 1892010.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2289.5,
"completions/mean_length": 1697.5,
"completions/mean_terminated_length": 1072.7273254394531,
"completions/min_length": 314.0,
"completions/min_terminated_length": 314.0,
"entropy": 0.2442193143069744,
"epoch": 0.004177777777777778,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.9689055012957616,
"kl": 0.003427549614571035,
"learning_rate": 4.605785847288502e-07,
"loss": 0.0,
"num_tokens": 1951210.0,
"reward": 0.65625,
"reward_std": 0.3471629321575165,
"rewards/equation_reward_func/mean": 0.65625,
"rewards/equation_reward_func/std": 0.48935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2042.0,
"completions/max_terminated_length": 1845.0,
"completions/mean_length": 885.15625,
"completions/mean_terminated_length": 675.1778869628906,
"completions/min_length": 258.0,
"completions/min_terminated_length": 258.0,
"entropy": 0.25679061096161604,
"epoch": 0.004266666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7668184777361081,
"kl": 0.005221109124249779,
"learning_rate": 4.5861495557018206e-07,
"loss": 0.0,
"num_tokens": 1984351.0,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.90625,
"rewards/equation_reward_func/std": 0.20155644416809082,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2058.0,
"completions/max_terminated_length": 2058.0,
"completions/mean_length": 713.0625,
"completions/mean_terminated_length": 713.0625,
"completions/min_length": 280.0,
"completions/min_terminated_length": 280.0,
"entropy": 0.25331663712859154,
"epoch": 0.004355555555555555,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.00194731319056398,
"kl": 0.004550013778498396,
"learning_rate": 4.566080027971082e-07,
"loss": 0.0,
"num_tokens": 2012001.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1068.0,
"completions/mean_length": 1476.90625,
"completions/mean_terminated_length": 550.0494689941406,
"completions/min_length": 289.5,
"completions/min_terminated_length": 289.5,
"entropy": 0.27285506669431925,
"epoch": 0.0044444444444444444,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.8264287211470831,
"kl": 0.003749549054191448,
"learning_rate": 4.545581431987694e-07,
"loss": 0.0,
"num_tokens": 2064150.0,
"reward": 0.625,
"reward_std": 0.2177756354212761,
"rewards/equation_reward_func/mean": 0.625,
"rewards/equation_reward_func/std": 0.457730233669281,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 530.5,
"completions/max_terminated_length": 530.5,
"completions/mean_length": 319.5625,
"completions/mean_terminated_length": 319.5625,
"completions/min_length": 200.5,
"completions/min_terminated_length": 200.5,
"entropy": 0.24814098048955202,
"epoch": 0.004533333333333334,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.00292471693097075,
"kl": 0.005984036062727682,
"learning_rate": 4.5246580247487933e-07,
"loss": 0.0,
"num_tokens": 2079152.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2575.0,
"completions/max_terminated_length": 2125.0,
"completions/mean_length": 1022.125,
"completions/mean_terminated_length": 832.829345703125,
"completions/min_length": 418.0,
"completions/min_terminated_length": 418.0,
"entropy": 0.2612212775275111,
"epoch": 0.004622222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7744589581145311,
"kl": 0.004202435855404474,
"learning_rate": 4.5033141514731786e-07,
"loss": 0.0,
"num_tokens": 2116748.0,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.90625,
"rewards/equation_reward_func/std": 0.20155644416809082,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2367.5,
"completions/mean_length": 1707.96875,
"completions/mean_terminated_length": 1121.2916870117188,
"completions/min_length": 421.5,
"completions/min_terminated_length": 421.5,
"entropy": 0.2849529664963484,
"epoch": 0.004711111111111111,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.8055922294032352,
"kl": 0.0038093408074928448,
"learning_rate": 4.4815542446989373e-07,
"loss": 0.0,
"num_tokens": 2176243.0,
"reward": 0.6875,
"reward_std": 0.249358132481575,
"rewards/equation_reward_func/mean": 0.6875,
"rewards/equation_reward_func/std": 0.47360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 952.5,
"completions/max_terminated_length": 952.5,
"completions/mean_length": 322.4375,
"completions/mean_terminated_length": 322.4375,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"entropy": 0.2402123035863042,
"epoch": 0.0048,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0026144022076216205,
"kl": 0.00510270893573761,
"learning_rate": 4.4593828233629214e-07,
"loss": 0.0,
"num_tokens": 2191353.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 905.5,
"completions/mean_length": 1221.875,
"completions/mean_terminated_length": 486.4125061035156,
"completions/min_length": 256.5,
"completions/min_terminated_length": 256.5,
"entropy": 0.26087356358766556,
"epoch": 0.004888888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8632217331595174,
"kl": 0.00461755899596028,
"learning_rate": 4.4368044918622893e-07,
"loss": 0.0,
"num_tokens": 2235285.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.38319888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2083.5,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 1154.28125,
"completions/mean_terminated_length": 640.7534790039062,
"completions/min_length": 305.5,
"completions/min_terminated_length": 305.5,
"entropy": 0.245620877481997,
"epoch": 0.004977777777777778,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0014729909165334624,
"kl": 0.0038956179923843592,
"learning_rate": 4.4138239390983e-07,
"loss": 0.0,
"num_tokens": 2277102.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1987.5,
"completions/max_terminated_length": 1762.5,
"completions/mean_length": 829.34375,
"completions/mean_terminated_length": 543.4895935058594,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"entropy": 0.2480917638167739,
"epoch": 0.005066666666666666,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7305596650806964,
"kl": 0.005493182397913188,
"learning_rate": 4.390445937502557e-07,
"loss": 0.0,
"num_tokens": 2308457.0,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1659.5,
"completions/max_terminated_length": 1659.5,
"completions/mean_length": 595.0625,
"completions/mean_terminated_length": 595.0625,
"completions/min_length": 295.0,
"completions/min_terminated_length": 295.0,
"entropy": 0.2502336846664548,
"epoch": 0.005155555555555556,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.001567908068070067,
"kl": 0.0051197968859924,
"learning_rate": 4.3666753420459023e-07,
"loss": 0.0,
"num_tokens": 2332387.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2570.5,
"completions/mean_length": 2503.6875,
"completions/mean_terminated_length": 1840.6875,
"completions/min_length": 1507.5,
"completions/min_terminated_length": 1507.5,
"entropy": 0.2708489568904042,
"epoch": 0.005244444444444445,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5058277210442557,
"kl": 0.003387732562259771,
"learning_rate": 4.3425170892301764e-07,
"loss": 0.0,
"num_tokens": 2417425.0,
"reward": 0.3125,
"reward_std": 0.4355512708425522,
"rewards/equation_reward_func/mean": 0.3125,
"rewards/equation_reward_func/std": 0.42898140847682953,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2686.5,
"completions/mean_length": 2226.1875,
"completions/mean_terminated_length": 1788.4166870117188,
"completions/min_length": 1331.5,
"completions/min_terminated_length": 1331.5,
"entropy": 0.2982914987951517,
"epoch": 0.005333333333333333,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6845989637013021,
"kl": 0.003966670599766076,
"learning_rate": 4.3179761960630357e-07,
"loss": 0.0,
"num_tokens": 2493511.0,
"reward": 0.34375,
"reward_std": 0.2041158601641655,
"rewards/equation_reward_func/mean": 0.34375,
"rewards/equation_reward_func/std": 0.42695631086826324,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2067.5,
"completions/max_terminated_length": 847.0,
"completions/mean_length": 1148.78125,
"completions/mean_terminated_length": 482.71875,
"completions/min_length": 305.5,
"completions/min_terminated_length": 305.5,
"entropy": 0.26174278277903795,
"epoch": 0.005422222222222222,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0016187374368629333,
"kl": 0.00422936627001036,
"learning_rate": 4.293057759016063e-07,
"loss": 0.0,
"num_tokens": 2535120.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1857.5,
"completions/max_terminated_length": 1041.0,
"completions/mean_length": 583.75,
"completions/mean_terminated_length": 507.26458740234375,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"entropy": 0.25596251245588064,
"epoch": 0.005511111111111111,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6322511630069138,
"kl": 0.005413578634033911,
"learning_rate": 4.2677669529663686e-07,
"loss": 0.0,
"num_tokens": 2558648.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 1878.5,
"completions/max_terminated_length": 1508.0,
"completions/mean_length": 908.8125,
"completions/mean_terminated_length": 537.2784118652344,
"completions/min_length": 247.0,
"completions/min_terminated_length": 247.0,
"entropy": 0.2507998961955309,
"epoch": 0.0056,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7030376954792115,
"kl": 0.00650369533104822,
"learning_rate": 4.2421090301219077e-07,
"loss": 0.0,
"num_tokens": 2592586.0,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1994.0,
"completions/max_terminated_length": 817.0,
"completions/mean_length": 1162.65625,
"completions/mean_terminated_length": 494.8125,
"completions/min_length": 329.0,
"completions/min_terminated_length": 329.0,
"entropy": 0.2358728414401412,
"epoch": 0.005688888888888889,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0014426004757466125,
"kl": 0.004484620047151111,
"learning_rate": 4.216089318930741e-07,
"loss": 0.0,
"num_tokens": 2634703.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1812.0,
"completions/mean_length": 1705.46875,
"completions/mean_terminated_length": 735.9261474609375,
"completions/min_length": 333.0,
"completions/min_terminated_length": 333.0,
"entropy": 0.27035616524517536,
"epoch": 0.0057777777777777775,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.605428435592805,
"kl": 0.005202807413297705,
"learning_rate": 4.189713222974466e-07,
"loss": 0.0,
"num_tokens": 2694166.0,
"reward": 0.59375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.59375,
"rewards/equation_reward_func/std": 0.497555673122406,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1858.5,
"completions/max_terminated_length": 1547.0,
"completions/mean_length": 928.84375,
"completions/mean_terminated_length": 468.0812683105469,
"completions/min_length": 224.5,
"completions/min_terminated_length": 224.5,
"entropy": 0.2494833106175065,
"epoch": 0.005866666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0032201336301060854,
"kl": 0.0067589654645416886,
"learning_rate": 4.162986219846037e-07,
"loss": 0.0,
"num_tokens": 2728713.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1941.0,
"completions/max_terminated_length": 1046.0,
"completions/mean_length": 1116.59375,
"completions/mean_terminated_length": 614.3923645019531,
"completions/min_length": 370.0,
"completions/min_terminated_length": 370.0,
"entropy": 0.2735691536217928,
"epoch": 0.005955555555555556,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.7950120981284405,
"kl": 0.005726380710257217,
"learning_rate": 4.135913860012219e-07,
"loss": 0.0,
"num_tokens": 2769260.0,
"reward": 0.75,
"reward_std": 0.1767766922712326,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.3811737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1792.5,
"completions/max_terminated_length": 1133.0,
"completions/mean_length": 1138.1875,
"completions/mean_terminated_length": 565.8125,
"completions/min_length": 289.0,
"completions/min_terminated_length": 289.0,
"entropy": 0.2514611016958952,
"epoch": 0.006044444444444444,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0018004806044451867,
"kl": 0.0059647281595971435,
"learning_rate": 4.10850176566091e-07,
"loss": 0.0,
"num_tokens": 2810538.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2790.5,
"completions/max_terminated_length": 2470.5,
"completions/mean_length": 1291.46875,
"completions/mean_terminated_length": 1078.6041870117188,
"completions/min_length": 469.5,
"completions/min_terminated_length": 469.5,
"entropy": 0.27105455938726664,
"epoch": 0.0061333333333333335,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6507725934734265,
"kl": 0.004865752707701176,
"learning_rate": 4.080755629533566e-07,
"loss": 0.0,
"num_tokens": 2856697.0,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1858.0,
"completions/mean_length": 1279.09375,
"completions/mean_terminated_length": 676.388916015625,
"completions/min_length": 355.5,
"completions/min_terminated_length": 355.5,
"entropy": 0.2739897835999727,
"epoch": 0.006222222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7611818739738083,
"kl": 0.006271678488701582,
"learning_rate": 4.052681213742971e-07,
"loss": 0.0,
"num_tokens": 2902516.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.38319888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1764.5,
"completions/max_terminated_length": 1272.5,
"completions/mean_length": 680.40625,
"completions/mean_terminated_length": 613.7375183105469,
"completions/min_length": 218.5,
"completions/min_terminated_length": 218.5,
"entropy": 0.2625574329867959,
"epoch": 0.006311111111111111,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.641499811439406,
"kl": 0.006318398402072489,
"learning_rate": 4.024284348576611e-07,
"loss": 0.0,
"num_tokens": 2929105.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 825.0,
"completions/max_terminated_length": 825.0,
"completions/mean_length": 357.28125,
"completions/mean_terminated_length": 357.28125,
"completions/min_length": 194.5,
"completions/min_terminated_length": 194.5,
"entropy": 0.25569348596036434,
"epoch": 0.0064,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002224877364676125,
"kl": 0.007314969494473189,
"learning_rate": 3.9955709312858744e-07,
"loss": 0.0,
"num_tokens": 2945338.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2148.5,
"completions/max_terminated_length": 2137.5,
"completions/mean_length": 813.75,
"completions/mean_terminated_length": 749.6458740234375,
"completions/min_length": 257.0,
"completions/min_terminated_length": 257.0,
"entropy": 0.26268062368035316,
"epoch": 0.006488888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0020979525232143028,
"kl": 0.006386323366314173,
"learning_rate": 3.9665469248613616e-07,
"loss": 0.0,
"num_tokens": 2976250.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1002.0,
"completions/max_terminated_length": 1002.0,
"completions/mean_length": 431.71875,
"completions/mean_terminated_length": 431.71875,
"completions/min_length": 238.5,
"completions/min_terminated_length": 238.5,
"entropy": 0.26534419134259224,
"epoch": 0.006577777777777778,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0023952318007118166,
"kl": 0.00811207268270664,
"learning_rate": 3.9372183567945314e-07,
"loss": 0.0,
"num_tokens": 2994913.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2330.0,
"completions/max_terminated_length": 2274.0,
"completions/mean_length": 933.59375,
"completions/mean_terminated_length": 796.6964416503906,
"completions/min_length": 304.5,
"completions/min_terminated_length": 304.5,
"entropy": 0.24762420449405909,
"epoch": 0.006666666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0018580747675341872,
"kl": 0.006500544230220839,
"learning_rate": 3.907591317825956e-07,
"loss": 0.0,
"num_tokens": 3029636.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1187.0,
"completions/mean_length": 1713.03125,
"completions/mean_terminated_length": 499.8055725097656,
"completions/min_length": 253.0,
"completions/min_terminated_length": 253.0,
"entropy": 0.27569348085671663,
"epoch": 0.0067555555555555554,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0021576179813316406,
"kl": 0.007041995238978416,
"learning_rate": 3.877671960680443e-07,
"loss": 0.0,
"num_tokens": 3089309.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.5,
"rewards/equation_reward_func/std": 0.5163977742195129,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1730.5,
"completions/max_terminated_length": 1377.0,
"completions/mean_length": 913.4375,
"completions/mean_terminated_length": 442.98126220703125,
"completions/min_length": 237.5,
"completions/min_terminated_length": 237.5,
"entropy": 0.24052318930625916,
"epoch": 0.006844444444444445,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0037383834531385493,
"kl": 0.008585786272305995,
"learning_rate": 3.847466498789282e-07,
"loss": 0.0,
"num_tokens": 3123355.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1781.5,
"completions/mean_length": 1636.40625,
"completions/mean_terminated_length": 516.0249938964844,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"entropy": 0.28433873131871223,
"epoch": 0.006933333333333333,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0026989103207334733,
"kl": 0.006938680307939649,
"learning_rate": 3.816981204999882e-07,
"loss": 0.0,
"num_tokens": 3180568.0,
"reward": 0.5625,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.5625,
"rewards/equation_reward_func/std": 0.5081988871097565,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1877.0,
"completions/max_terminated_length": 1717.5,
"completions/mean_length": 1203.28125,
"completions/mean_terminated_length": 857.4750366210938,
"completions/min_length": 471.5,
"completions/min_terminated_length": 471.5,
"entropy": 0.26476599369198084,
"epoch": 0.007022222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6434589123374275,
"kl": 0.00672344581107609,
"learning_rate": 3.786222410273078e-07,
"loss": 0.0,
"num_tokens": 3223945.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 3006.5,
"completions/max_terminated_length": 2928.5,
"completions/mean_length": 1742.5625,
"completions/mean_terminated_length": 1458.9403686523438,
"completions/min_length": 514.0,
"completions/min_terminated_length": 514.0,
"entropy": 0.2928483448922634,
"epoch": 0.0071111111111111115,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0017971739857914673,
"kl": 0.006342295295326039,
"learning_rate": 3.755196502368361e-07,
"loss": 0.0,
"num_tokens": 3284571.0,
"reward": 0.84375,
"reward_std": 0.22201895713806152,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1899.0,
"completions/max_terminated_length": 896.5,
"completions/mean_length": 1095.1875,
"completions/mean_terminated_length": 473.375,
"completions/min_length": 336.0,
"completions/min_terminated_length": 336.0,
"entropy": 0.25843985099345446,
"epoch": 0.0072,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0031502250054640376,
"kl": 0.007663089840207249,
"learning_rate": 3.723909924517314e-07,
"loss": 0.0,
"num_tokens": 3324441.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2010.5,
"completions/max_terminated_length": 1225.5,
"completions/mean_length": 693.65625,
"completions/mean_terminated_length": 541.5401916503906,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"entropy": 0.2597637241706252,
"epoch": 0.007288888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7702693435320899,
"kl": 0.008306328963954002,
"learning_rate": 3.692369174085534e-07,
"loss": 0.0,
"num_tokens": 3351494.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 1958.0,
"completions/max_terminated_length": 1272.0,
"completions/mean_length": 1699.03125,
"completions/mean_terminated_length": 1055.90625,
"completions/min_length": 982.5,
"completions/min_terminated_length": 982.5,
"entropy": 0.2800039369612932,
"epoch": 0.007377777777777777,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0027883239668124267,
"kl": 0.00744300993392244,
"learning_rate": 3.6605808012233004e-07,
"loss": 0.0,
"num_tokens": 3410759.0,
"reward": 0.53125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.53125,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2152.0,
"completions/max_terminated_length": 2147.0,
"completions/mean_length": 1513.59375,
"completions/mean_terminated_length": 1326.09375,
"completions/min_length": 522.5,
"completions/min_terminated_length": 522.5,
"entropy": 0.26186632737517357,
"epoch": 0.007466666666666667,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.9198608744208149,
"kl": 0.006979152763960883,
"learning_rate": 3.628551407505292e-07,
"loss": 0.0,
"num_tokens": 3464082.0,
"reward": 0.8125,
"reward_std": 0.2587745785713196,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 692.0,
"completions/max_terminated_length": 692.0,
"completions/mean_length": 430.03125,
"completions/mean_terminated_length": 430.03125,
"completions/min_length": 273.5,
"completions/min_terminated_length": 273.5,
"entropy": 0.23170531447976828,
"epoch": 0.007555555555555556,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002541713449292013,
"kl": 0.008697336044861004,
"learning_rate": 3.5962876445596224e-07,
"loss": 0.0,
"num_tokens": 3482691.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1732.0,
"completions/mean_length": 1689.3125,
"completions/mean_terminated_length": 716.3295593261719,
"completions/min_length": 363.0,
"completions/min_terminated_length": 363.0,
"entropy": 0.2743770433589816,
"epoch": 0.007644444444444444,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6462770111570193,
"kl": 0.00819350325036794,
"learning_rate": 3.563796212686475e-07,
"loss": 0.0,
"num_tokens": 3541653.0,
"reward": 0.59375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.59375,
"rewards/equation_reward_func/std": 0.497555673122406,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2642.5,
"completions/max_terminated_length": 1885.5,
"completions/mean_length": 1243.96875,
"completions/mean_terminated_length": 603.65625,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"entropy": 0.2883901707828045,
"epoch": 0.007733333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0018479443351030222,
"kl": 0.007616912451339886,
"learning_rate": 3.531083859466635e-07,
"loss": 0.0,
"num_tokens": 3586348.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2715.0,
"completions/max_terminated_length": 2480.0,
"completions/mean_length": 1095.03125,
"completions/mean_terminated_length": 626.0375061035156,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"entropy": 0.2639634981751442,
"epoch": 0.007822222222222222,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006148680335338723,
"kl": 0.008246932789916173,
"learning_rate": 3.498157378360204e-07,
"loss": 0.0,
"num_tokens": 3626229.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2908.5,
"completions/max_terminated_length": 2566.5,
"completions/mean_length": 986.8125,
"completions/mean_terminated_length": 844.5803833007812,
"completions/min_length": 244.5,
"completions/min_terminated_length": 244.5,
"entropy": 0.26056686975061893,
"epoch": 0.007911111111111112,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6854510293518095,
"kl": 0.009517568425508216,
"learning_rate": 3.465023607295784e-07,
"loss": 0.0,
"num_tokens": 3662655.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1047.5,
"completions/max_terminated_length": 1047.5,
"completions/mean_length": 464.34375,
"completions/mean_terminated_length": 464.34375,
"completions/min_length": 237.5,
"completions/min_terminated_length": 237.5,
"entropy": 0.27534451708197594,
"epoch": 0.008,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.2111208036108543,
"kl": 0.010279384237946942,
"learning_rate": 3.4316894272504225e-07,
"loss": 0.0,
"num_tokens": 3682330.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2305.5,
"completions/mean_length": 1692.4375,
"completions/mean_terminated_length": 1048.8803405761719,
"completions/min_length": 294.5,
"completions/min_terminated_length": 294.5,
"entropy": 0.2586697665974498,
"epoch": 0.008088888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6255903792292131,
"kl": 0.008654644683701918,
"learning_rate": 3.398161760820628e-07,
"loss": 0.0,
"num_tokens": 3741408.0,
"reward": 0.65625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.65625,
"rewards/equation_reward_func/std": 0.4597553312778473,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2933.5,
"completions/max_terminated_length": 2582.5,
"completions/mean_length": 1454.15625,
"completions/mean_terminated_length": 1066.7437438964844,
"completions/min_length": 516.5,
"completions/min_terminated_length": 516.5,
"entropy": 0.3029740732163191,
"epoch": 0.008177777777777779,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0022089392124720916,
"kl": 0.008055398386204615,
"learning_rate": 3.364447570784731e-07,
"loss": 0.0,
"num_tokens": 3792829.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2534.5,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 1022.34375,
"completions/mean_terminated_length": 726.3333435058594,
"completions/min_length": 272.5,
"completions/min_terminated_length": 272.5,
"entropy": 0.25912539288401604,
"epoch": 0.008266666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8681055932592283,
"kl": 0.00938627275172621,
"learning_rate": 3.3305538586569116e-07,
"loss": 0.0,
"num_tokens": 3830440.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2095.5,
"completions/max_terminated_length": 1841.5,
"completions/mean_length": 944.5,
"completions/mean_terminated_length": 740.860595703125,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"entropy": 0.251262541860342,
"epoch": 0.008355555555555555,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6350731005119447,
"kl": 0.010139893798623234,
"learning_rate": 3.296487663233168e-07,
"loss": 0.0,
"num_tokens": 3865512.0,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.90625,
"rewards/equation_reward_func/std": 0.20155644416809082,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 462.5,
"completions/max_terminated_length": 462.5,
"completions/mean_length": 306.125,
"completions/mean_terminated_length": 306.125,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"entropy": 0.22235615644603968,
"epoch": 0.008444444444444444,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0031605861188310326,
"kl": 0.01150068684364669,
"learning_rate": 3.2622560591295606e-07,
"loss": 0.0,
"num_tokens": 3880108.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1820.0,
"completions/max_terminated_length": 797.5,
"completions/mean_length": 1109.0625,
"completions/mean_terminated_length": 513.5,
"completions/min_length": 306.0,
"completions/min_terminated_length": 306.0,
"entropy": 0.2766123227775097,
"epoch": 0.008533333333333334,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002693976312764259,
"kl": 0.009999056375818327,
"learning_rate": 3.227866155313002e-07,
"loss": 0.0,
"num_tokens": 3920438.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2416.5,
"completions/mean_length": 1054.34375,
"completions/mean_terminated_length": 680.9667053222656,
"completions/min_length": 280.5,
"completions/min_terminated_length": 280.5,
"entropy": 0.29231622349470854,
"epoch": 0.008622222222222222,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.8717361985224948,
"kl": 0.009583401377312839,
"learning_rate": 3.1933250936249213e-07,
"loss": 0.0,
"num_tokens": 3959009.0,
"reward": 0.84375,
"reward_std": 0.22201896458864212,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.34860680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1548.5,
"completions/max_terminated_length": 1548.5,
"completions/mean_length": 621.3125,
"completions/mean_terminated_length": 621.3125,
"completions/min_length": 273.0,
"completions/min_terminated_length": 273.0,
"entropy": 0.26802380103617907,
"epoch": 0.00871111111111111,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002640439329048923,
"kl": 0.010297555243596435,
"learning_rate": 3.158640047298098e-07,
"loss": 0.0,
"num_tokens": 3983691.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2787.0,
"completions/max_terminated_length": 2773.5,
"completions/mean_length": 894.3125,
"completions/mean_terminated_length": 825.9396057128906,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"entropy": 0.260735678486526,
"epoch": 0.0088,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6876160757154334,
"kl": 0.009146726137259975,
"learning_rate": 3.123818219466981e-07,
"loss": 0.0,
"num_tokens": 4017157.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1940.5,
"completions/max_terminated_length": 1940.5,
"completions/mean_length": 574.5,
"completions/mean_terminated_length": 574.5,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"entropy": 0.24010000098496675,
"epoch": 0.008888888888888889,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.005430605739914178,
"kl": 0.00986800153623335,
"learning_rate": 3.088866841671789e-07,
"loss": 0.0,
"num_tokens": 4040381.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2159.5,
"completions/mean_length": 1421.6875,
"completions/mean_terminated_length": 817.5857696533203,
"completions/min_length": 277.5,
"completions/min_terminated_length": 277.5,
"entropy": 0.2729024589061737,
"epoch": 0.008977777777777777,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.8441320423470235,
"kl": 0.008688551635714248,
"learning_rate": 3.0537931723567253e-07,
"loss": 0.0,
"num_tokens": 4090771.0,
"reward": 0.75,
"reward_std": 0.2314550280570984,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.42078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2166.0,
"completions/max_terminated_length": 1986.5,
"completions/mean_length": 1150.1875,
"completions/mean_terminated_length": 619.1840209960938,
"completions/min_length": 263.5,
"completions/min_terminated_length": 263.5,
"entropy": 0.24514910019934177,
"epoch": 0.009066666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.00302042576058633,
"kl": 0.010161265439819545,
"learning_rate": 3.01860449536259e-07,
"loss": 0.0,
"num_tokens": 4132473.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 1779.0,
"completions/max_terminated_length": 1766.0,
"completions/mean_length": 1532.96875,
"completions/mean_terminated_length": 1267.1875,
"completions/min_length": 781.5,
"completions/min_terminated_length": 781.5,
"entropy": 0.2567154373973608,
"epoch": 0.009155555555555556,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.003281200405250708,
"kl": 0.008494329609675333,
"learning_rate": 2.983308118414131e-07,
"loss": 0.0,
"num_tokens": 4186376.0,
"reward": 0.65625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.65625,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 928.0,
"completions/max_terminated_length": 928.0,
"completions/mean_length": 469.5,
"completions/mean_terminated_length": 469.5,
"completions/min_length": 262.5,
"completions/min_terminated_length": 262.5,
"entropy": 0.2405125731602311,
"epoch": 0.009244444444444444,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0032161521894528843,
"kl": 0.008923913293983787,
"learning_rate": 2.9479113716024275e-07,
"loss": 0.0,
"num_tokens": 4206216.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2232.5,
"completions/max_terminated_length": 1459.0,
"completions/mean_length": 877.40625,
"completions/mean_terminated_length": 638.1778869628906,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"entropy": 0.24397206585854292,
"epoch": 0.009333333333333334,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.9097697460274851,
"kl": 0.010490661457879469,
"learning_rate": 2.912421605862632e-07,
"loss": 0.0,
"num_tokens": 4239101.0,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.90625,
"rewards/equation_reward_func/std": 0.20155644416809082,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2090.0,
"completions/mean_length": 1876.65625,
"completions/mean_terminated_length": 927.8375244140625,
"completions/min_length": 418.5,
"completions/min_terminated_length": 418.5,
"entropy": 0.25669932272285223,
"epoch": 0.009422222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.00271911961262999,
"kl": 0.008709978050319478,
"learning_rate": 2.8768461914473794e-07,
"loss": 0.0,
"num_tokens": 4304026.0,
"reward": 0.5625,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.5625,
"rewards/equation_reward_func/std": 0.5081988871097565,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 908.0,
"completions/max_terminated_length": 908.0,
"completions/mean_length": 414.3125,
"completions/mean_terminated_length": 414.3125,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"entropy": 0.25329437758773565,
"epoch": 0.00951111111111111,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003267497300157984,
"kl": 0.01166740502230823,
"learning_rate": 2.8411925163961926e-07,
"loss": 0.0,
"num_tokens": 4322108.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1865.5,
"completions/max_terminated_length": 1799.5,
"completions/mean_length": 883.4375,
"completions/mean_terminated_length": 608.4166870117188,
"completions/min_length": 250.5,
"completions/min_terminated_length": 250.5,
"entropy": 0.27052732463926077,
"epoch": 0.0096,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.006071708906705738,
"kl": 0.010494905174709857,
"learning_rate": 2.8054679850011825e-07,
"loss": 0.0,
"num_tokens": 4355202.0,
"reward": 0.84375,
"reward_std": 0.22201895713806152,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1109.5,
"completions/max_terminated_length": 1109.5,
"completions/mean_length": 530.875,
"completions/mean_terminated_length": 530.875,
"completions/min_length": 265.5,
"completions/min_terminated_length": 265.5,
"entropy": 0.21879185363650322,
"epoch": 0.00968888888888889,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0024693973812315844,
"kl": 0.010472121328348294,
"learning_rate": 2.769680016269385e-07,
"loss": 0.0,
"num_tokens": 4377110.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 218
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 1888.5,
"completions/max_terminated_length": 1553.0,
"completions/mean_length": 795.8125,
"completions/mean_terminated_length": 662.4866333007812,
"completions/min_length": 247.0,
"completions/min_terminated_length": 247.0,
"entropy": 0.2686642771586776,
"epoch": 0.009777777777777778,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6128253340962238,
"kl": 0.011848014197312295,
"learning_rate": 2.7338360423820327e-07,
"loss": 0.0,
"num_tokens": 4407376.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 964.5,
"completions/max_terminated_length": 964.5,
"completions/mean_length": 345.875,
"completions/mean_terminated_length": 345.875,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"entropy": 0.252207750454545,
"epoch": 0.009866666666666666,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002946676892555618,
"kl": 0.012307984463404864,
"learning_rate": 2.6979435071510956e-07,
"loss": 0.0,
"num_tokens": 4423260.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2080.5,
"completions/max_terminated_length": 2070.0,
"completions/mean_length": 966.1875,
"completions/mean_terminated_length": 777.90869140625,
"completions/min_length": 295.5,
"completions/min_terminated_length": 295.5,
"entropy": 0.2508456828072667,
"epoch": 0.009955555555555556,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.6106400201424906,
"kl": 0.012187447311589494,
"learning_rate": 2.662009864473406e-07,
"loss": 0.0,
"num_tokens": 4459066.0,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.90625,
"rewards/equation_reward_func/std": 0.20155644416809082,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1453.5,
"completions/max_terminated_length": 1453.5,
"completions/mean_length": 601.1875,
"completions/mean_terminated_length": 601.1875,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"entropy": 0.26608254946768284,
"epoch": 0.010044444444444444,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0024192669923231076,
"kl": 0.012214825808769092,
"learning_rate": 2.626042576782687e-07,
"loss": 0.0,
"num_tokens": 4483160.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2300.5,
"completions/max_terminated_length": 1112.0,
"completions/mean_length": 1102.59375,
"completions/mean_terminated_length": 413.6875,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"entropy": 0.2653536135330796,
"epoch": 0.010133333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003141380751052536,
"kl": 0.011402795265894383,
"learning_rate": 2.590049113499809e-07,
"loss": 0.0,
"num_tokens": 4523275.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1841.5,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 1348.21875,
"completions/mean_terminated_length": 969.0,
"completions/min_length": 446.5,
"completions/min_terminated_length": 446.5,
"entropy": 0.2285262243822217,
"epoch": 0.010222222222222223,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.002841794206765057,
"kl": 0.011010473070200533,
"learning_rate": 2.5540369494815966e-07,
"loss": 0.0,
"num_tokens": 4571314.0,
"reward": 0.75,
"reward_std": 0.2587745785713196,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2224.5,
"completions/mean_length": 1477.28125,
"completions/mean_terminated_length": 817.2619323730469,
"completions/min_length": 238.5,
"completions/min_terminated_length": 238.5,
"entropy": 0.2778350468724966,
"epoch": 0.010311111111111111,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6689780153086349,
"kl": 0.010737988399341702,
"learning_rate": 2.5180135634685064e-07,
"loss": 0.0,
"num_tokens": 4623483.0,
"reward": 0.6875,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.6875,
"rewards/equation_reward_func/std": 0.42898140847682953,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2414.5,
"completions/mean_length": 1444.4375,
"completions/mean_terminated_length": 795.5530700683594,
"completions/min_length": 239.5,
"completions/min_terminated_length": 239.5,
"entropy": 0.2818821109831333,
"epoch": 0.0104,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.9879929334941168,
"kl": 0.011097497888840735,
"learning_rate": 2.4819864365314934e-07,
"loss": 0.0,
"num_tokens": 4674561.0,
"reward": 0.6875,
"reward_std": 0.2587745785713196,
"rewards/equation_reward_func/mean": 0.6875,
"rewards/equation_reward_func/std": 0.4787135720252991,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 234
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 1913.0,
"completions/max_terminated_length": 1698.0,
"completions/mean_length": 1022.125,
"completions/mean_terminated_length": 678.53125,
"completions/min_length": 270.5,
"completions/min_terminated_length": 270.5,
"entropy": 0.2688545901328325,
"epoch": 0.01048888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6315402563299277,
"kl": 0.01124519360018894,
"learning_rate": 2.445963050518403e-07,
"loss": 0.0,
"num_tokens": 4712149.0,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1819.5,
"completions/max_terminated_length": 505.5,
"completions/mean_length": 995.8125,
"completions/mean_terminated_length": 311.28125,
"completions/min_length": 228.0,
"completions/min_terminated_length": 228.0,
"entropy": 0.27648669946938753,
"epoch": 0.010577777777777778,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0023729104405685922,
"kl": 0.012640737142646685,
"learning_rate": 2.4099508865001914e-07,
"loss": 0.0,
"num_tokens": 4748831.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 238
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2162.5,
"completions/max_terminated_length": 2162.5,
"completions/mean_length": 693.84375,
"completions/mean_terminated_length": 693.84375,
"completions/min_length": 272.5,
"completions/min_terminated_length": 272.5,
"entropy": 0.2688616942614317,
"epoch": 0.010666666666666666,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004133074831770589,
"kl": 0.01435062033124268,
"learning_rate": 2.3739574232173134e-07,
"loss": 0.0,
"num_tokens": 4775898.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 651.0,
"completions/max_terminated_length": 651.0,
"completions/mean_length": 408.5,
"completions/mean_terminated_length": 408.5,
"completions/min_length": 281.5,
"completions/min_terminated_length": 281.5,
"entropy": 0.23651507124304771,
"epoch": 0.010755555555555556,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0029914986702280033,
"kl": 0.014157850993797183,
"learning_rate": 2.3379901355265936e-07,
"loss": 0.0,
"num_tokens": 4793826.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 242
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2794.5,
"completions/max_terminated_length": 2252.5,
"completions/mean_length": 1407.5,
"completions/mean_terminated_length": 984.7250061035156,
"completions/min_length": 348.0,
"completions/min_terminated_length": 348.0,
"entropy": 0.2798434291034937,
"epoch": 0.010844444444444445,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8787443536732897,
"kl": 0.012309474579524249,
"learning_rate": 2.3020564928489041e-07,
"loss": 0.0,
"num_tokens": 4843714.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 1603.625,
"completions/mean_terminated_length": 596.8500061035156,
"completions/min_length": 298.5,
"completions/min_terminated_length": 298.5,
"entropy": 0.2638617567718029,
"epoch": 0.010933333333333333,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7666509207211404,
"kl": 0.011061000754125416,
"learning_rate": 2.2661639576179676e-07,
"loss": 0.0,
"num_tokens": 4899894.0,
"reward": 0.59375,
"reward_std": 0.2041158601641655,
"rewards/equation_reward_func/mean": 0.59375,
"rewards/equation_reward_func/std": 0.5061737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 246
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 1990.0,
"completions/max_terminated_length": 1649.0,
"completions/mean_length": 1623.96875,
"completions/mean_terminated_length": 1129.4375,
"completions/min_length": 704.5,
"completions/min_terminated_length": 704.5,
"entropy": 0.2599087553098798,
"epoch": 0.011022222222222221,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0028186538614616394,
"kl": 0.011108465492725372,
"learning_rate": 2.2303199837306153e-07,
"loss": 0.0,
"num_tokens": 4956749.0,
"reward": 0.625,
"reward_std": 0.2177756428718567,
"rewards/equation_reward_func/mean": 0.625,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2025.0,
"completions/max_terminated_length": 1847.5,
"completions/mean_length": 1289.53125,
"completions/mean_terminated_length": 1127.0729370117188,
"completions/min_length": 547.5,
"completions/min_terminated_length": 547.5,
"entropy": 0.2873380035161972,
"epoch": 0.011111111111111112,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.1290088171475576,
"kl": 0.012026124983094633,
"learning_rate": 2.194532014998817e-07,
"loss": 0.0,
"num_tokens": 5002854.0,
"reward": 0.875,
"reward_std": 0.2177756428718567,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1747.0,
"completions/max_terminated_length": 417.5,
"completions/mean_length": 1000.25,
"completions/mean_terminated_length": 308.5,
"completions/min_length": 246.5,
"completions/min_terminated_length": 246.5,
"entropy": 0.2919534966349602,
"epoch": 0.0112,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0025806344488486584,
"kl": 0.013488087366567925,
"learning_rate": 2.1588074836038071e-07,
"loss": 0.0,
"num_tokens": 5039686.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 252
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1922.5,
"completions/max_terminated_length": 1082.5,
"completions/mean_length": 1145.96875,
"completions/mean_terminated_length": 525.375,
"completions/min_length": 287.5,
"completions/min_terminated_length": 287.5,
"entropy": 0.2488407287746668,
"epoch": 0.011288888888888888,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.498668519279294,
"kl": 0.014777076430618763,
"learning_rate": 2.1231538085526204e-07,
"loss": 0.0,
"num_tokens": 5081189.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.38319888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 254
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1722.0,
"completions/mean_length": 656.6875,
"completions/mean_terminated_length": 507.6294860839844,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"entropy": 0.25390581879764795,
"epoch": 0.011377777777777778,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8200840056513704,
"kl": 0.013640801305882633,
"learning_rate": 2.0875783941373686e-07,
"loss": 0.0,
"num_tokens": 5107027.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 583.0,
"completions/max_terminated_length": 583.0,
"completions/mean_length": 335.8125,
"completions/mean_terminated_length": 335.8125,
"completions/min_length": 205.5,
"completions/min_terminated_length": 205.5,
"entropy": 0.2633366733789444,
"epoch": 0.011466666666666667,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0032217440107119217,
"kl": 0.013921725505497307,
"learning_rate": 2.052088628397572e-07,
"loss": 0.0,
"num_tokens": 5122565.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 258
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1858.0,
"completions/max_terminated_length": 1650.0,
"completions/mean_length": 846.8125,
"completions/mean_terminated_length": 789.0354614257812,
"completions/min_length": 302.0,
"completions/min_terminated_length": 302.0,
"entropy": 0.25422646198421717,
"epoch": 0.011555555555555555,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6192911531673088,
"kl": 0.0131575966370292,
"learning_rate": 2.0166918815858688e-07,
"loss": 0.0,
"num_tokens": 5154495.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1515.5,
"completions/max_terminated_length": 1515.5,
"completions/mean_length": 593.78125,
"completions/mean_terminated_length": 593.78125,
"completions/min_length": 263.5,
"completions/min_terminated_length": 263.5,
"entropy": 0.2436890648677945,
"epoch": 0.011644444444444445,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.00379025728998696,
"kl": 0.013373794557992369,
"learning_rate": 1.9813955046374102e-07,
"loss": 0.0,
"num_tokens": 5178360.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 262
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1448.0,
"completions/max_terminated_length": 1448.0,
"completions/mean_length": 659.90625,
"completions/mean_terminated_length": 659.90625,
"completions/min_length": 318.0,
"completions/min_terminated_length": 318.0,
"entropy": 0.27821108512580395,
"epoch": 0.011733333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002902300626200292,
"kl": 0.011531222582561895,
"learning_rate": 1.946206827643275e-07,
"loss": 0.0,
"num_tokens": 5204293.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 264
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 1972.5,
"completions/max_terminated_length": 1638.5,
"completions/mean_length": 1453.71875,
"completions/mean_terminated_length": 982.8333740234375,
"completions/min_length": 579.5,
"completions/min_terminated_length": 579.5,
"entropy": 0.28553890995681286,
"epoch": 0.011822222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7618530543419798,
"kl": 0.012410368886776268,
"learning_rate": 1.9111331583282103e-07,
"loss": 0.0,
"num_tokens": 5255644.0,
"reward": 0.6875,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.6875,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 266
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1782.5,
"completions/max_terminated_length": 1336.0,
"completions/mean_length": 811.6875,
"completions/mean_terminated_length": 521.4375,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"entropy": 0.27573296427726746,
"epoch": 0.011911111111111112,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.6613583754850176,
"kl": 0.01382189046125859,
"learning_rate": 1.8761817805330195e-07,
"loss": 0.0,
"num_tokens": 5286434.0,
"reward": 0.8125,
"reward_std": 0.249358132481575,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 268
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 833.0,
"completions/max_terminated_length": 833.0,
"completions/mean_length": 389.71875,
"completions/mean_terminated_length": 389.71875,
"completions/min_length": 225.5,
"completions/min_terminated_length": 225.5,
"entropy": 0.2621759483590722,
"epoch": 0.012,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.007007814627563382,
"kl": 0.014533480803947896,
"learning_rate": 1.8413599527019018e-07,
"loss": 0.0,
"num_tokens": 5303721.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2546.5,
"completions/mean_length": 1690.15625,
"completions/mean_terminated_length": 1133.9886474609375,
"completions/min_length": 433.5,
"completions/min_terminated_length": 433.5,
"entropy": 0.26791701279580593,
"epoch": 0.012088888888888889,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.0865010274613978,
"kl": 0.011928621912375093,
"learning_rate": 1.806674906375079e-07,
"loss": 0.0,
"num_tokens": 5362734.0,
"reward": 0.71875,
"reward_std": 0.3471629321575165,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.46296359598636627,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1968.0,
"completions/max_terminated_length": 1741.0,
"completions/mean_length": 1352.90625,
"completions/mean_terminated_length": 1023.9305419921875,
"completions/min_length": 596.0,
"completions/min_terminated_length": 596.0,
"entropy": 0.2802997101098299,
"epoch": 0.012177777777777777,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0028160156857841676,
"kl": 0.011874206480570138,
"learning_rate": 1.7721338446869976e-07,
"loss": 0.0,
"num_tokens": 5410915.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 274
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 940.0,
"completions/max_terminated_length": 940.0,
"completions/mean_length": 401.34375,
"completions/mean_terminated_length": 401.34375,
"completions/min_length": 269.5,
"completions/min_terminated_length": 269.5,
"entropy": 0.25402139965444803,
"epoch": 0.012266666666666667,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003056421139936211,
"kl": 0.012792080786311999,
"learning_rate": 1.7377439408704392e-07,
"loss": 0.0,
"num_tokens": 5428606.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 276
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2440.5,
"completions/max_terminated_length": 2271.5,
"completions/mean_length": 1083.34375,
"completions/mean_terminated_length": 794.0104370117188,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"entropy": 0.28792093601077795,
"epoch": 0.012355555555555555,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8645122139887629,
"kl": 0.011338822660036385,
"learning_rate": 1.7035123367668323e-07,
"loss": 0.0,
"num_tokens": 5468121.0,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 278
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1582.5,
"completions/max_terminated_length": 1582.5,
"completions/mean_length": 617.75,
"completions/mean_terminated_length": 617.75,
"completions/min_length": 314.0,
"completions/min_terminated_length": 314.0,
"entropy": 0.26330281142145395,
"epoch": 0.012444444444444444,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003203365010013785,
"kl": 0.01245744532207027,
"learning_rate": 1.6694461413430893e-07,
"loss": 0.0,
"num_tokens": 5492769.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2827.0,
"completions/mean_length": 1535.59375,
"completions/mean_terminated_length": 1023.4583435058594,
"completions/min_length": 277.5,
"completions/min_terminated_length": 277.5,
"entropy": 0.2922232113778591,
"epoch": 0.012533333333333334,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6984483514858193,
"kl": 0.012172643968369812,
"learning_rate": 1.6355524292152684e-07,
"loss": 0.0,
"num_tokens": 5546756.0,
"reward": 0.71875,
"reward_std": 0.2630179077386856,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.46296359598636627,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 282
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1831.5,
"completions/max_terminated_length": 1012.5,
"completions/mean_length": 1000.125,
"completions/mean_terminated_length": 435.96875,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"entropy": 0.2578113954514265,
"epoch": 0.012622222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7839498455938455,
"kl": 0.011324702645651996,
"learning_rate": 1.6018382391793722e-07,
"loss": 0.0,
"num_tokens": 5583616.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 284
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 960.5,
"completions/max_terminated_length": 960.5,
"completions/mean_length": 461.9375,
"completions/mean_terminated_length": 461.9375,
"completions/min_length": 290.5,
"completions/min_terminated_length": 290.5,
"entropy": 0.2691855514422059,
"epoch": 0.01271111111111111,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003558811604246946,
"kl": 0.012289426987990737,
"learning_rate": 1.5683105727495778e-07,
"loss": 0.0,
"num_tokens": 5603238.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 286
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 1837.0,
"completions/max_terminated_length": 1507.0,
"completions/mean_length": 641.28125,
"completions/mean_terminated_length": 487.58038330078125,
"completions/min_length": 268.5,
"completions/min_terminated_length": 268.5,
"entropy": 0.2638643169775605,
"epoch": 0.0128,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.00331423594586153,
"kl": 0.013494796236045659,
"learning_rate": 1.5349763927042168e-07,
"loss": 0.0,
"num_tokens": 5628583.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 288
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1766.5,
"completions/max_terminated_length": 1478.5,
"completions/mean_length": 854.34375,
"completions/mean_terminated_length": 577.59375,
"completions/min_length": 258.5,
"completions/min_terminated_length": 258.5,
"entropy": 0.25728738587349653,
"epoch": 0.012888888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0032389845802218683,
"kl": 0.013152709056157619,
"learning_rate": 1.501842621639796e-07,
"loss": 0.0,
"num_tokens": 5660738.0,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1958.0,
"completions/max_terminated_length": 1507.5,
"completions/mean_length": 1080.875,
"completions/mean_terminated_length": 558.1840209960938,
"completions/min_length": 266.0,
"completions/min_terminated_length": 266.0,
"entropy": 0.2784293610602617,
"epoch": 0.012977777777777777,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6177267044875747,
"kl": 0.012857195571996272,
"learning_rate": 1.4689161405333652e-07,
"loss": 0.0,
"num_tokens": 5700190.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 292
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1906.5,
"completions/mean_length": 2204.4375,
"completions/mean_terminated_length": 1260.9166717529297,
"completions/min_length": 714.5,
"completions/min_terminated_length": 714.5,
"entropy": 0.27723210770636797,
"epoch": 0.013066666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7415118165249563,
"kl": 0.011066248407587409,
"learning_rate": 1.4362037873135255e-07,
"loss": 0.0,
"num_tokens": 5775620.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.28125,
"rewards/equation_reward_func/std": 0.38319888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 294
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2425.0,
"completions/max_terminated_length": 2049.0,
"completions/mean_length": 1567.46875,
"completions/mean_terminated_length": 1257.46875,
"completions/min_length": 493.0,
"completions/min_terminated_length": 493.0,
"entropy": 0.27383615262806416,
"epoch": 0.013155555555555556,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.5944976991687992,
"kl": 0.011643942736554891,
"learning_rate": 1.403712355440378e-07,
"loss": 0.0,
"num_tokens": 5830659.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 296
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1815.5,
"completions/max_terminated_length": 1656.5,
"completions/mean_length": 897.5625,
"completions/mean_terminated_length": 427.38751220703125,
"completions/min_length": 231.5,
"completions/min_terminated_length": 231.5,
"entropy": 0.2819562489166856,
"epoch": 0.013244444444444444,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.004012844388284618,
"kl": 0.01389238511910662,
"learning_rate": 1.371448592494707e-07,
"loss": 0.0,
"num_tokens": 5864173.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 298
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1141.0,
"completions/max_terminated_length": 1141.0,
"completions/mean_length": 413.8125,
"completions/mean_terminated_length": 413.8125,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.255883046425879,
"epoch": 0.013333333333333334,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003103920043368309,
"kl": 0.011319157638354227,
"learning_rate": 1.3394191987766996e-07,
"loss": 0.0,
"num_tokens": 5882255.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1775.0,
"completions/max_terminated_length": 1775.0,
"completions/mean_length": 624.5625,
"completions/mean_terminated_length": 624.5625,
"completions/min_length": 262.5,
"completions/min_terminated_length": 262.5,
"entropy": 0.2605795245617628,
"epoch": 0.013422222222222223,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.00298458314438669,
"kl": 0.013170374266337603,
"learning_rate": 1.3076308259144652e-07,
"loss": 0.0,
"num_tokens": 5907073.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 302
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 828.5,
"completions/max_terminated_length": 828.5,
"completions/mean_length": 405.78125,
"completions/mean_terminated_length": 405.78125,
"completions/min_length": 200.5,
"completions/min_terminated_length": 200.5,
"entropy": 0.2499817917123437,
"epoch": 0.013511111111111111,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0038414119612760764,
"kl": 0.013066469924524426,
"learning_rate": 1.2760900754826858e-07,
"loss": 0.0,
"num_tokens": 5924850.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 304
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1068.5,
"completions/mean_length": 1562.1875,
"completions/mean_terminated_length": 495.4886474609375,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"entropy": 0.27882583532482386,
"epoch": 0.0136,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6649376114013511,
"kl": 0.013456764572765678,
"learning_rate": 1.2448034976316394e-07,
"loss": 0.0,
"num_tokens": 5979680.0,
"reward": 0.59375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.59375,
"rewards/equation_reward_func/std": 0.497555673122406,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 306
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 1758.9375,
"completions/mean_terminated_length": 1347.0,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"entropy": 0.28104583360254765,
"epoch": 0.01368888888888889,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6774759505887689,
"kl": 0.011262226558756083,
"learning_rate": 1.213777589726922e-07,
"loss": 0.0,
"num_tokens": 6040846.0,
"reward": 0.625,
"reward_std": 0.2177756354212761,
"rewards/equation_reward_func/mean": 0.625,
"rewards/equation_reward_func/std": 0.4955305755138397,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 308
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1761.0,
"completions/max_terminated_length": 1554.5,
"completions/mean_length": 648.5,
"completions/mean_terminated_length": 579.08544921875,
"completions/min_length": 213.5,
"completions/min_terminated_length": 213.5,
"entropy": 0.26855653896927834,
"epoch": 0.013777777777777778,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0036323414110195915,
"kl": 0.013169703772291541,
"learning_rate": 1.183018795000118e-07,
"loss": 0.0,
"num_tokens": 6066398.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2624.5,
"completions/max_terminated_length": 2549.5,
"completions/mean_length": 1283.78125,
"completions/mean_terminated_length": 907.1875,
"completions/min_length": 379.0,
"completions/min_terminated_length": 379.0,
"entropy": 0.27201704028993845,
"epoch": 0.013866666666666666,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.002257188322713532,
"kl": 0.011060904245823622,
"learning_rate": 1.1525335012107188e-07,
"loss": 0.0,
"num_tokens": 6112335.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 312
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2512.0,
"completions/max_terminated_length": 1334.0,
"completions/mean_length": 1107.65625,
"completions/mean_terminated_length": 527.8958282470703,
"completions/min_length": 273.5,
"completions/min_terminated_length": 273.5,
"entropy": 0.2640869989991188,
"epoch": 0.013955555555555556,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0025896482570957236,
"kl": 0.012067800795193762,
"learning_rate": 1.1223280393195566e-07,
"loss": 0.0,
"num_tokens": 6152588.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 314
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 588.5,
"completions/max_terminated_length": 588.5,
"completions/mean_length": 321.96875,
"completions/mean_terminated_length": 321.96875,
"completions/min_length": 226.5,
"completions/min_terminated_length": 226.5,
"entropy": 0.25738913659006357,
"epoch": 0.014044444444444444,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0028261410702532233,
"kl": 0.010619041451718658,
"learning_rate": 1.0924086821740436e-07,
"loss": 0.0,
"num_tokens": 6167683.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 316
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1067.0,
"completions/mean_length": 1269.25,
"completions/mean_terminated_length": 529.7416687011719,
"completions/min_length": 304.5,
"completions/min_terminated_length": 304.5,
"entropy": 0.2710387809202075,
"epoch": 0.014133333333333333,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7163321341525302,
"kl": 0.011698946240358055,
"learning_rate": 1.0627816432054689e-07,
"loss": 0.0,
"num_tokens": 6213211.0,
"reward": 0.71875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.71875,
"rewards/equation_reward_func/std": 0.38319888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 318
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2626.0,
"completions/max_terminated_length": 2082.0,
"completions/mean_length": 1737.0625,
"completions/mean_terminated_length": 1189.125,
"completions/min_length": 716.0,
"completions/min_terminated_length": 716.0,
"entropy": 0.30040886998176575,
"epoch": 0.014222222222222223,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.002674510389367636,
"kl": 0.010718741046730429,
"learning_rate": 1.0334530751386386e-07,
"loss": 0.0,
"num_tokens": 6273685.0,
"reward": 0.625,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.625,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 1868.0,
"completions/max_terminated_length": 1647.5,
"completions/mean_length": 1601.4375,
"completions/mean_terminated_length": 1256.625,
"completions/min_length": 943.5,
"completions/min_terminated_length": 943.5,
"entropy": 0.2695089690387249,
"epoch": 0.014311111111111111,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.8756831429213762,
"kl": 0.012713596923276782,
"learning_rate": 1.0044290687141255e-07,
"loss": 0.0,
"num_tokens": 6329771.0,
"reward": 0.625,
"reward_std": 0.2177756428718567,
"rewards/equation_reward_func/mean": 0.625,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 322
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2595.0,
"completions/max_terminated_length": 1631.5,
"completions/mean_length": 926.0625,
"completions/mean_terminated_length": 625.90625,
"completions/min_length": 293.5,
"completions/min_terminated_length": 293.5,
"entropy": 0.26283825282007456,
"epoch": 0.0144,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.140546805041934,
"kl": 0.012503173289587721,
"learning_rate": 9.757156514233892e-08,
"loss": 0.0,
"num_tokens": 6364277.0,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 324
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2831.5,
"completions/max_terminated_length": 1721.5,
"completions/mean_length": 1427.34375,
"completions/mean_terminated_length": 810.53125,
"completions/min_length": 366.5,
"completions/min_terminated_length": 366.5,
"entropy": 0.28494337759912014,
"epoch": 0.01448888888888889,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0033609260382285277,
"kl": 0.012984584202058613,
"learning_rate": 9.473187862570289e-08,
"loss": 0.0,
"num_tokens": 6414800.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 326
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2215.5,
"completions/max_terminated_length": 2132.5,
"completions/mean_length": 1407.46875,
"completions/mean_terminated_length": 1155.7875366210938,
"completions/min_length": 375.0,
"completions/min_terminated_length": 375.0,
"entropy": 0.2730356818065047,
"epoch": 0.014577777777777778,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.0014872222355606,
"kl": 0.010669022798538208,
"learning_rate": 9.192443704664344e-08,
"loss": 0.0,
"num_tokens": 6464727.0,
"reward": 0.8125,
"reward_std": 0.2587745785713196,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 328
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2444.0,
"completions/max_terminated_length": 2240.5,
"completions/mean_length": 1488.28125,
"completions/mean_terminated_length": 1031.9375,
"completions/min_length": 493.0,
"completions/min_terminated_length": 493.0,
"entropy": 0.28267885465174913,
"epoch": 0.014666666666666666,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0028641374969204676,
"kl": 0.013437119021546096,
"learning_rate": 8.914982343390895e-08,
"loss": 0.0,
"num_tokens": 6517240.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 330
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1373.0,
"completions/max_terminated_length": 1373.0,
"completions/mean_length": 464.25,
"completions/mean_terminated_length": 464.25,
"completions/min_length": 226.0,
"completions/min_terminated_length": 226.0,
"entropy": 0.27360291965305805,
"epoch": 0.014755555555555555,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0031912292292720448,
"kl": 0.011018336488632485,
"learning_rate": 8.640861399877805e-08,
"loss": 0.0,
"num_tokens": 6536952.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 332
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2387.0,
"completions/max_terminated_length": 1884.5,
"completions/mean_length": 622.5625,
"completions/mean_terminated_length": 544.7062683105469,
"completions/min_length": 297.0,
"completions/min_terminated_length": 297.0,
"entropy": 0.270102976821363,
"epoch": 0.014844444444444445,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7085645926713029,
"kl": 0.013982411939650774,
"learning_rate": 8.370137801539634e-08,
"loss": 0.0,
"num_tokens": 6561706.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 334
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 670.5,
"completions/max_terminated_length": 670.5,
"completions/mean_length": 347.03125,
"completions/mean_terminated_length": 347.03125,
"completions/min_length": 238.5,
"completions/min_terminated_length": 238.5,
"entropy": 0.2622284069657326,
"epoch": 0.014933333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002958625037089681,
"kl": 0.012627416843315586,
"learning_rate": 8.102867770255337e-08,
"loss": 0.0,
"num_tokens": 6577603.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2189.0,
"completions/mean_length": 2281.78125,
"completions/mean_terminated_length": 1114.7222595214844,
"completions/min_length": 488.0,
"completions/min_terminated_length": 488.0,
"entropy": 0.2811094503849745,
"epoch": 0.015022222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.632578356189579,
"kl": 0.012064321548677981,
"learning_rate": 7.839106810692589e-08,
"loss": 0.0,
"num_tokens": 6655484.0,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.3125,
"rewards/equation_reward_func/std": 0.42898140847682953,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 338
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2268.0,
"completions/max_terminated_length": 912.5,
"completions/mean_length": 1051.40625,
"completions/mean_terminated_length": 356.5,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 0.26706848200410604,
"epoch": 0.015111111111111112,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0028140986587067572,
"kl": 0.013069912674836814,
"learning_rate": 7.57890969878093e-08,
"loss": 0.0,
"num_tokens": 6693953.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 839.0,
"completions/max_terminated_length": 839.0,
"completions/mean_length": 403.90625,
"completions/mean_terminated_length": 403.90625,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 0.24486979842185974,
"epoch": 0.0152,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004798301159813211,
"kl": 0.013417759502772242,
"learning_rate": 7.322330470336313e-08,
"loss": 0.0,
"num_tokens": 6711686.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 342
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2934.0,
"completions/max_terminated_length": 2934.0,
"completions/mean_length": 634.4375,
"completions/mean_terminated_length": 634.4375,
"completions/min_length": 230.5,
"completions/min_terminated_length": 230.5,
"entropy": 0.25119878351688385,
"epoch": 0.015288888888888888,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0033714467626650446,
"kl": 0.013649999862536788,
"learning_rate": 7.069422409839363e-08,
"loss": 0.0,
"num_tokens": 6736836.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 344
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2214.5,
"completions/max_terminated_length": 1771.5,
"completions/mean_length": 1115.0,
"completions/mean_terminated_length": 599.6007080078125,
"completions/min_length": 261.5,
"completions/min_terminated_length": 261.5,
"entropy": 0.25783967413008213,
"epoch": 0.015377777777777778,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0036622729451236693,
"kl": 0.01107509626308456,
"learning_rate": 6.820238039369647e-08,
"loss": 0.0,
"num_tokens": 6777332.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 346
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 1394.3125,
"completions/mean_terminated_length": 829.5874633789062,
"completions/min_length": 227.5,
"completions/min_terminated_length": 227.5,
"entropy": 0.274350737221539,
"epoch": 0.015466666666666667,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6737440602776203,
"kl": 0.01305691129527986,
"learning_rate": 6.574829107698238e-08,
"loss": 0.0,
"num_tokens": 6826806.0,
"reward": 0.75,
"reward_std": 0.2587745785713196,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.44091323018074036,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 348
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1940.5,
"completions/max_terminated_length": 1940.5,
"completions/mean_length": 704.875,
"completions/mean_terminated_length": 704.875,
"completions/min_length": 318.5,
"completions/min_terminated_length": 318.5,
"entropy": 0.26101984456181526,
"epoch": 0.015555555555555555,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0023647808678910177,
"kl": 0.012259377690497786,
"learning_rate": 6.333246579540971e-08,
"loss": 0.0,
"num_tokens": 6854226.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 1845.0,
"completions/max_terminated_length": 1726.5,
"completions/mean_length": 845.90625,
"completions/mean_terminated_length": 646.5505065917969,
"completions/min_length": 236.5,
"completions/min_terminated_length": 236.5,
"entropy": 0.24565229751169682,
"epoch": 0.015644444444444443,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7264834384366701,
"kl": 0.012497128162067384,
"learning_rate": 6.095540624974435e-08,
"loss": 0.0,
"num_tokens": 6886135.0,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.90625,
"rewards/equation_reward_func/std": 0.20155644416809082,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 352
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1843.5,
"completions/max_terminated_length": 749.0,
"completions/mean_length": 499.5,
"completions/mean_terminated_length": 417.7875061035156,
"completions/min_length": 233.0,
"completions/min_terminated_length": 233.0,
"entropy": 0.2693540593609214,
"epoch": 0.015733333333333332,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0028934413742537947,
"kl": 0.011744226852897555,
"learning_rate": 5.861760609017002e-08,
"loss": 0.0,
"num_tokens": 6906975.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 354
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1108.5,
"completions/mean_length": 1465.5,
"completions/mean_terminated_length": 465.375,
"completions/min_length": 206.5,
"completions/min_terminated_length": 206.5,
"entropy": 0.2571452846750617,
"epoch": 0.015822222222222224,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.002685926338613922,
"kl": 0.012025278876535594,
"learning_rate": 5.63195508137711e-08,
"loss": 0.0,
"num_tokens": 6958719.0,
"reward": 0.625,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.625,
"rewards/equation_reward_func/std": 0.4818056970834732,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 356
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 390.5,
"completions/max_terminated_length": 390.5,
"completions/mean_length": 270.09375,
"completions/mean_terminated_length": 270.09375,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"entropy": 0.2511430708691478,
"epoch": 0.015911111111111112,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0031781465152464555,
"kl": 0.012126730172894895,
"learning_rate": 5.4061717663707843e-08,
"loss": 0.0,
"num_tokens": 6972154.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 358
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2784.0,
"completions/max_terminated_length": 2210.5,
"completions/mean_length": 1096.84375,
"completions/mean_terminated_length": 723.633544921875,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"entropy": 0.2540301540866494,
"epoch": 0.016,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.002946265639527776,
"kl": 0.01303106703562662,
"learning_rate": 5.1844575530106265e-08,
"loss": 0.0,
"num_tokens": 7012133.0,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1216.0,
"completions/max_terminated_length": 1216.0,
"completions/mean_length": 568.03125,
"completions/mean_terminated_length": 568.03125,
"completions/min_length": 254.0,
"completions/min_terminated_length": 254.0,
"entropy": 0.275842048227787,
"epoch": 0.01608888888888889,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003142009163240758,
"kl": 0.012856011278927326,
"learning_rate": 4.9668584852682134e-08,
"loss": 0.0,
"num_tokens": 7035126.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 362
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1867.5,
"completions/max_terminated_length": 1599.0,
"completions/mean_length": 858.125,
"completions/mean_terminated_length": 567.03125,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"entropy": 0.26426226925104856,
"epoch": 0.016177777777777777,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.8508128064563231,
"kl": 0.012756991724018008,
"learning_rate": 4.753419752512072e-08,
"loss": 0.0,
"num_tokens": 7067402.0,
"reward": 0.84375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.84375,
"rewards/equation_reward_func/std": 0.23935678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 364
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 892.0,
"completions/max_terminated_length": 892.0,
"completions/mean_length": 380.28125,
"completions/mean_terminated_length": 380.28125,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"entropy": 0.2523947898298502,
"epoch": 0.016266666666666665,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0031902645325912102,
"kl": 0.011268698493950069,
"learning_rate": 4.5441856801230525e-08,
"loss": 0.0,
"num_tokens": 7084363.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 366
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1857.5,
"completions/mean_length": 1984.6875,
"completions/mean_terminated_length": 1133.2500610351562,
"completions/min_length": 629.0,
"completions/min_terminated_length": 629.0,
"entropy": 0.26920368149876595,
"epoch": 0.016355555555555557,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8209724135025811,
"kl": 0.011945405742153525,
"learning_rate": 4.3391997202891825e-08,
"loss": 0.0,
"num_tokens": 7152753.0,
"reward": 0.53125,
"reward_std": 0.3608423173427582,
"rewards/equation_reward_func/mean": 0.53125,
"rewards/equation_reward_func/std": 0.5061737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 368
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2582.5,
"completions/mean_length": 1663.5,
"completions/mean_terminated_length": 924.6394348144531,
"completions/min_length": 361.5,
"completions/min_terminated_length": 361.5,
"entropy": 0.2724431995302439,
"epoch": 0.016444444444444446,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.002375931629879322,
"kl": 0.011462729802588001,
"learning_rate": 4.1385044429817966e-08,
"loss": 0.0,
"num_tokens": 7210897.0,
"reward": 0.59375,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.59375,
"rewards/equation_reward_func/std": 0.497555673122406,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 370
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2543.5,
"completions/max_terminated_length": 2048.5,
"completions/mean_length": 1225.5,
"completions/mean_terminated_length": 771.75,
"completions/min_length": 356.5,
"completions/min_terminated_length": 356.5,
"entropy": 0.2872797902673483,
"epoch": 0.016533333333333334,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0036013988233853434,
"kl": 0.013051826565060765,
"learning_rate": 3.942141527114978e-08,
"loss": 0.0,
"num_tokens": 7254921.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 372
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1786.5,
"completions/max_terminated_length": 1704.5,
"completions/mean_length": 676.3125,
"completions/mean_terminated_length": 607.3291931152344,
"completions/min_length": 258.0,
"completions/min_terminated_length": 258.0,
"entropy": 0.2516833422705531,
"epoch": 0.016622222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0032409740755153012,
"kl": 0.013011956005357206,
"learning_rate": 3.7501517518899486e-08,
"loss": 0.0,
"num_tokens": 7281419.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 374
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2043.5,
"completions/mean_length": 1477.40625,
"completions/mean_terminated_length": 752.5909118652344,
"completions/min_length": 290.5,
"completions/min_terminated_length": 290.5,
"entropy": 0.26651648059487343,
"epoch": 0.01671111111111111,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7197186929191686,
"kl": 0.013189784425776452,
"learning_rate": 3.562574988326342e-08,
"loss": 0.0,
"num_tokens": 7333592.0,
"reward": 0.6875,
"reward_std": 0.2587745785713196,
"rewards/equation_reward_func/mean": 0.6875,
"rewards/equation_reward_func/std": 0.4787135720252991,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 376
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1807.5,
"completions/max_terminated_length": 543.5,
"completions/mean_length": 1050.15625,
"completions/mean_terminated_length": 384.28125,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"entropy": 0.28364898823201656,
"epoch": 0.0168,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002229579900740854,
"kl": 0.010572923871222883,
"learning_rate": 3.379450190982114e-08,
"loss": 0.0,
"num_tokens": 7372061.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 378
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1590.0,
"completions/max_terminated_length": 1590.0,
"completions/mean_length": 460.46875,
"completions/mean_terminated_length": 460.46875,
"completions/min_length": 288.0,
"completions/min_terminated_length": 288.0,
"entropy": 0.27252288814634085,
"epoch": 0.016888888888888887,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0024168024524627665,
"kl": 0.01104135494097136,
"learning_rate": 3.2008153898637255e-08,
"loss": 0.0,
"num_tokens": 7391644.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 587.5,
"completions/max_terminated_length": 587.5,
"completions/mean_length": 321.25,
"completions/mean_terminated_length": 321.25,
"completions/min_length": 195.5,
"completions/min_terminated_length": 195.5,
"entropy": 0.26600412372499704,
"epoch": 0.01697777777777778,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.5467716903361135,
"kl": 0.013095528644043952,
"learning_rate": 3.026707682528365e-08,
"loss": 0.0,
"num_tokens": 7406716.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 382
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1082.5,
"completions/mean_length": 1442.59375,
"completions/mean_terminated_length": 554.8702087402344,
"completions/min_length": 234.5,
"completions/min_terminated_length": 234.5,
"entropy": 0.2811947613954544,
"epoch": 0.017066666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0023612765147716713,
"kl": 0.012181783735286444,
"learning_rate": 2.8571632263797745e-08,
"loss": 0.0,
"num_tokens": 7457711.0,
"reward": 0.65625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.65625,
"rewards/equation_reward_func/std": 0.4597553312778473,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 384
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2230.5,
"completions/max_terminated_length": 2110.5,
"completions/mean_length": 863.5625,
"completions/mean_terminated_length": 798.5021057128906,
"completions/min_length": 211.5,
"completions/min_terminated_length": 211.5,
"entropy": 0.25374304968863726,
"epoch": 0.017155555555555556,
"frac_reward_zero_std": 0.75,
"grad_norm": 2.300851451670029,
"kl": 0.013115475769154727,
"learning_rate": 2.6922172311593884e-08,
"loss": 0.0,
"num_tokens": 7490137.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 386
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 536.0,
"completions/max_terminated_length": 536.0,
"completions/mean_length": 329.40625,
"completions/mean_terminated_length": 329.40625,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"entropy": 0.2687660912051797,
"epoch": 0.017244444444444444,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003236188807490885,
"kl": 0.013288187910802662,
"learning_rate": 2.5319039516341844e-08,
"loss": 0.0,
"num_tokens": 7505510.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 388
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2899.5,
"completions/max_terminated_length": 2529.0,
"completions/mean_length": 2049.125,
"completions/mean_terminated_length": 1644.6687622070312,
"completions/min_length": 784.0,
"completions/min_terminated_length": 784.0,
"entropy": 0.2617297563701868,
"epoch": 0.017333333333333333,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.7057223749958403,
"kl": 0.01200129883363843,
"learning_rate": 2.3762566804829742e-08,
"loss": 0.0,
"num_tokens": 7575994.0,
"reward": 0.625,
"reward_std": 0.2177756354212761,
"rewards/equation_reward_func/mean": 0.625,
"rewards/equation_reward_func/std": 0.36435678601264954,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 390
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2027.0,
"completions/max_terminated_length": 1576.0,
"completions/mean_length": 882.0625,
"completions/mean_terminated_length": 762.65625,
"completions/min_length": 260.0,
"completions/min_terminated_length": 260.0,
"entropy": 0.2711814185604453,
"epoch": 0.01742222222222222,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.002678885528367909,
"kl": 0.01261142164003104,
"learning_rate": 2.2253077413823458e-08,
"loss": 0.0,
"num_tokens": 7609036.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 392
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1152.0,
"completions/max_terminated_length": 1152.0,
"completions/mean_length": 482.75,
"completions/mean_terminated_length": 482.75,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"entropy": 0.27470015175640583,
"epoch": 0.017511111111111113,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0034936681657812877,
"kl": 0.011282922176178545,
"learning_rate": 2.0790884822939836e-08,
"loss": 0.0,
"num_tokens": 7629316.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 394
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 583.5,
"completions/max_terminated_length": 583.5,
"completions/mean_length": 353.03125,
"completions/mean_terminated_length": 353.03125,
"completions/min_length": 218.5,
"completions/min_terminated_length": 218.5,
"entropy": 0.2574036065489054,
"epoch": 0.0176,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0034114881771838503,
"kl": 0.01028229494113475,
"learning_rate": 1.9376292689545158e-08,
"loss": 0.0,
"num_tokens": 7645429.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 396
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2915.5,
"completions/mean_length": 2265.375,
"completions/mean_terminated_length": 1267.5556030273438,
"completions/min_length": 499.0,
"completions/min_terminated_length": 499.0,
"entropy": 0.28383847046643496,
"epoch": 0.01768888888888889,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.1894129573830192,
"kl": 0.01126208424102515,
"learning_rate": 1.800959478569422e-08,
"loss": 0.0,
"num_tokens": 7722809.0,
"reward": 0.34375,
"reward_std": 0.2041158676147461,
"rewards/equation_reward_func/mean": 0.34375,
"rewards/equation_reward_func/std": 0.4597553312778473,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 398
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1687.5,
"completions/mean_length": 1554.78125,
"completions/mean_terminated_length": 674.7259521484375,
"completions/min_length": 325.5,
"completions/min_terminated_length": 325.5,
"entropy": 0.29043113626539707,
"epoch": 0.017777777777777778,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.9429187670404626,
"kl": 0.012492028530687094,
"learning_rate": 1.6691074937121407e-08,
"loss": 0.0,
"num_tokens": 7777394.0,
"reward": 0.65625,
"reward_std": 0.1293872892856598,
"rewards/equation_reward_func/mean": 0.65625,
"rewards/equation_reward_func/std": 0.4597553312778473,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1728.0,
"completions/max_terminated_length": 383.5,
"completions/mean_length": 983.375,
"completions/mean_terminated_length": 290.9375,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"entropy": 0.26652571372687817,
"epoch": 0.017866666666666666,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.00423530355082865,
"kl": 0.013721562922000885,
"learning_rate": 1.5421006964298377e-08,
"loss": 0.0,
"num_tokens": 7813678.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 402
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2300.0,
"completions/mean_length": 1527.1875,
"completions/mean_terminated_length": 1120.86669921875,
"completions/min_length": 399.0,
"completions/min_terminated_length": 399.0,
"entropy": 0.27129553351551294,
"epoch": 0.017955555555555554,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.832976146674529,
"kl": 0.010713240539189428,
"learning_rate": 1.4199654625568575e-08,
"loss": 0.0,
"num_tokens": 7867468.0,
"reward": 0.75,
"reward_std": 0.3514062389731407,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.3811737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 404
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1719.0,
"completions/max_terminated_length": 1388.5,
"completions/mean_length": 1190.53125,
"completions/mean_terminated_length": 902.3624877929688,
"completions/min_length": 531.5,
"completions/min_terminated_length": 531.5,
"entropy": 0.26110014878213406,
"epoch": 0.018044444444444443,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.9105665973454747,
"kl": 0.012254673813004047,
"learning_rate": 1.302727156237224e-08,
"loss": 0.0,
"num_tokens": 7910397.0,
"reward": 0.8125,
"reward_std": 0.249358132481575,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 406
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 1700.84375,
"completions/mean_terminated_length": 626.5249938964844,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"entropy": 0.26668123714625835,
"epoch": 0.018133333333333335,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0029355418790128285,
"kl": 0.013148852216545492,
"learning_rate": 1.1904101246571874e-08,
"loss": 0.0,
"num_tokens": 7969656.0,
"reward": 0.5625,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.5625,
"rewards/equation_reward_func/std": 0.5081988871097565,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 408
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2229.0,
"completions/max_terminated_length": 2178.5,
"completions/mean_length": 1085.9375,
"completions/mean_terminated_length": 977.763427734375,
"completions/min_length": 255.0,
"completions/min_terminated_length": 255.0,
"entropy": 0.2600689213722944,
"epoch": 0.018222222222222223,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.5613127220073695,
"kl": 0.01210535824066028,
"learning_rate": 1.0830376929889612e-08,
"loss": 0.0,
"num_tokens": 8009222.0,
"reward": 0.9375,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.9375,
"rewards/equation_reward_func/std": 0.17078252136707306,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 410
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1955.0,
"completions/max_terminated_length": 1225.0,
"completions/mean_length": 1088.0625,
"completions/mean_terminated_length": 514.1111145019531,
"completions/min_length": 269.0,
"completions/min_terminated_length": 269.0,
"entropy": 0.2509449180215597,
"epoch": 0.01831111111111111,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002559214424379042,
"kl": 0.012904529517982155,
"learning_rate": 9.806321595467598e-09,
"loss": 0.0,
"num_tokens": 8048896.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 412
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2563.0,
"completions/mean_length": 2233.84375,
"completions/mean_terminated_length": 1334.3055572509766,
"completions/min_length": 761.0,
"completions/min_terminated_length": 761.0,
"entropy": 0.2827332355082035,
"epoch": 0.0184,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7520508296235054,
"kl": 0.011386299796868116,
"learning_rate": 8.832147911560173e-09,
"loss": 0.0,
"num_tokens": 8125267.0,
"reward": 0.375,
"reward_std": 0.2177756354212761,
"rewards/equation_reward_func/mean": 0.375,
"rewards/equation_reward_func/std": 0.457730233669281,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 414
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2780.0,
"completions/max_terminated_length": 2169.0,
"completions/mean_length": 1257.53125,
"completions/mean_terminated_length": 1037.7291870117188,
"completions/min_length": 378.0,
"completions/min_terminated_length": 378.0,
"entropy": 0.2801125952973962,
"epoch": 0.018488888888888888,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7066233355374787,
"kl": 0.012001977243926376,
"learning_rate": 7.908058187368726e-09,
"loss": 0.0,
"num_tokens": 8170364.0,
"reward": 0.875,
"reward_std": 0.13363061845302582,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 416
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1795.5,
"completions/max_terminated_length": 1576.5,
"completions/mean_length": 998.3125,
"completions/mean_terminated_length": 767.65625,
"completions/min_length": 270.0,
"completions/min_terminated_length": 270.0,
"entropy": 0.2634145403280854,
"epoch": 0.018577777777777776,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0032855917738229988,
"kl": 0.01313964050495997,
"learning_rate": 7.0342443310273665e-09,
"loss": 0.0,
"num_tokens": 8207150.0,
"reward": 0.875,
"reward_std": 0.2314550280570984,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.22360680997371674,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 418
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2799.0,
"completions/max_terminated_length": 2799.0,
"completions/mean_length": 983.03125,
"completions/mean_terminated_length": 983.03125,
"completions/min_length": 244.5,
"completions/min_terminated_length": 244.5,
"entropy": 0.27339703403413296,
"epoch": 0.018666666666666668,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003186100415722168,
"kl": 0.010260674695018679,
"learning_rate": 6.210887809749099e-09,
"loss": 0.0,
"num_tokens": 8243487.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 420
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1876.5,
"completions/max_terminated_length": 1254.5,
"completions/mean_length": 997.0,
"completions/mean_terminated_length": 453.1388854980469,
"completions/min_length": 257.5,
"completions/min_terminated_length": 257.5,
"entropy": 0.25085126888006926,
"epoch": 0.018755555555555557,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.003471742651114961,
"kl": 0.011486861592857167,
"learning_rate": 5.4381596121399476e-09,
"loss": 0.0,
"num_tokens": 8280215.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 422
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1553.0,
"completions/mean_length": 1449.65625,
"completions/mean_terminated_length": 612.8303833007812,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"entropy": 0.2599188946187496,
"epoch": 0.018844444444444445,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0021310158987967853,
"kl": 0.010764601000119,
"learning_rate": 4.716220212689332e-09,
"loss": 0.0,
"num_tokens": 8331460.0,
"reward": 0.6875,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.6875,
"rewards/equation_reward_func/std": 0.42898140847682953,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 424
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1283.0,
"completions/max_terminated_length": 1283.0,
"completions/mean_length": 482.65625,
"completions/mean_terminated_length": 482.65625,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"entropy": 0.2782833958044648,
"epoch": 0.018933333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0030275165486116904,
"kl": 0.013003175088670105,
"learning_rate": 4.045219538443778e-09,
"loss": 0.0,
"num_tokens": 8351729.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 426
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2238.5,
"completions/max_terminated_length": 927.5,
"completions/mean_length": 1134.78125,
"completions/mean_terminated_length": 433.71875,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"entropy": 0.2606539400294423,
"epoch": 0.01902222222222222,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0025799788396094247,
"kl": 0.012180730293039232,
"learning_rate": 3.4252969378714134e-09,
"loss": 0.0,
"num_tokens": 8392898.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 428
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2623.5,
"completions/max_terminated_length": 1766.0,
"completions/mean_length": 1437.03125,
"completions/mean_terminated_length": 993.0659790039062,
"completions/min_length": 503.5,
"completions/min_terminated_length": 503.5,
"entropy": 0.2920260410755873,
"epoch": 0.01911111111111111,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.1090578401338118,
"kl": 0.011862305458635092,
"learning_rate": 2.856581151922943e-09,
"loss": 0.0,
"num_tokens": 8443763.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 430
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 1990.8125,
"completions/mean_terminated_length": 1052.0625,
"completions/min_length": 680.5,
"completions/min_terminated_length": 680.5,
"entropy": 0.2943479251116514,
"epoch": 0.0192,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.1633775097626482,
"kl": 0.010395180608611554,
"learning_rate": 2.339190287295678e-09,
"loss": 0.0,
"num_tokens": 8512357.0,
"reward": 0.53125,
"reward_std": 0.35564958304166794,
"rewards/equation_reward_func/mean": 0.53125,
"rewards/equation_reward_func/std": 0.5143726766109467,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 432
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1847.5,
"completions/max_terminated_length": 1103.5,
"completions/mean_length": 923.84375,
"completions/mean_terminated_length": 471.8937683105469,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"entropy": 0.25271155778318644,
"epoch": 0.01928888888888889,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.003008452213961997,
"kl": 0.012103472923627123,
"learning_rate": 1.8732317919060715e-09,
"loss": 0.0,
"num_tokens": 8546760.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 434
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2304.5,
"completions/mean_length": 1056.8125,
"completions/mean_terminated_length": 770.851318359375,
"completions/min_length": 248.0,
"completions/min_terminated_length": 248.0,
"entropy": 0.2532212445512414,
"epoch": 0.01937777777777778,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6794073285446092,
"kl": 0.01284879056038335,
"learning_rate": 1.4588024325756788e-09,
"loss": 0.0,
"num_tokens": 8585450.0,
"reward": 0.875,
"reward_std": 0.2177756354212761,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.3265564441680908,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 436
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2099.5,
"completions/max_terminated_length": 1801.0,
"completions/mean_length": 1114.53125,
"completions/mean_terminated_length": 680.0187683105469,
"completions/min_length": 322.0,
"completions/min_terminated_length": 322.0,
"entropy": 0.26436334289610386,
"epoch": 0.019466666666666667,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6266343864555947,
"kl": 0.011212630255613476,
"learning_rate": 1.0959882749354277e-09,
"loss": 0.0,
"num_tokens": 8625971.0,
"reward": 0.8125,
"reward_std": 0.1157275140285492,
"rewards/equation_reward_func/mean": 0.8125,
"rewards/equation_reward_func/std": 0.25,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 438
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1318.0,
"completions/max_terminated_length": 1318.0,
"completions/mean_length": 444.9375,
"completions/mean_terminated_length": 444.9375,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"entropy": 0.2647299263626337,
"epoch": 0.019555555555555555,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0035374009167824442,
"kl": 0.012473908253014088,
"learning_rate": 7.848646655519986e-10,
"loss": 0.0,
"num_tokens": 8644993.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 1.0,
"rewards/equation_reward_func/std": 0.0,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 440
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2224.5,
"completions/max_terminated_length": 846.0,
"completions/mean_length": 1158.34375,
"completions/mean_terminated_length": 457.5625,
"completions/min_length": 250.5,
"completions/min_terminated_length": 250.5,
"entropy": 0.28921834006905556,
"epoch": 0.019644444444444444,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0027064951159668604,
"kl": 0.012806050304789096,
"learning_rate": 5.254962162804799e-10,
"loss": 0.0,
"num_tokens": 8686924.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 442
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2069.5,
"completions/max_terminated_length": 682.0,
"completions/mean_length": 1083.59375,
"completions/mean_terminated_length": 380.3125,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"entropy": 0.2702910928055644,
"epoch": 0.019733333333333332,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0027219806874225795,
"kl": 0.011863501626066864,
"learning_rate": 3.1793679084632375e-10,
"loss": 0.0,
"num_tokens": 8726423.0,
"reward": 0.75,
"reward_std": 0.0,
"rewards/equation_reward_func/mean": 0.75,
"rewards/equation_reward_func/std": 0.25819888710975647,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 444
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 1817.5,
"completions/max_terminated_length": 1292.0,
"completions/mean_length": 990.65625,
"completions/mean_terminated_length": 457.5138854980469,
"completions/min_length": 284.5,
"completions/min_terminated_length": 284.5,
"entropy": 0.25941435527056456,
"epoch": 0.019822222222222224,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.00358339852977573,
"kl": 0.01235124742379412,
"learning_rate": 1.6222949365926608e-10,
"loss": 0.0,
"num_tokens": 8762980.0,
"reward": 0.78125,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.78125,
"rewards/equation_reward_func/std": 0.2561737895011902,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 446
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1766.5,
"completions/max_terminated_length": 1612.0,
"completions/mean_length": 730.78125,
"completions/mean_terminated_length": 667.4208374023438,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"entropy": 0.27090085577219725,
"epoch": 0.019911111111111112,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6441581681704772,
"kl": 0.013042959035374224,
"learning_rate": 5.84066608615985e-11,
"loss": 0.0,
"num_tokens": 8791165.0,
"reward": 0.96875,
"reward_std": 0.0883883461356163,
"rewards/equation_reward_func/mean": 0.96875,
"rewards/equation_reward_func/std": 0.125,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 448
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1963.0,
"completions/mean_length": 1123.75,
"completions/mean_terminated_length": 845.4285888671875,
"completions/min_length": 334.0,
"completions/min_terminated_length": 334.0,
"entropy": 0.24659618083387613,
"epoch": 0.02,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6981668381022528,
"kl": 0.014014697342645377,
"learning_rate": 6.489853613067531e-12,
"loss": 0.0,
"num_tokens": 8831965.0,
"reward": 0.875,
"reward_std": 0.2314550280570984,
"rewards/equation_reward_func/mean": 0.875,
"rewards/equation_reward_func/std": 0.3415650427341461,
"rewards/format_reward_func/mean": 0.0,
"rewards/format_reward_func/std": 0.0,
"step": 450
},
{
"epoch": 0.02,
"step": 450,
"total_flos": 0.0,
"train_loss": 8.708548251913978e-06,
"train_runtime": 15573.3321,
"train_samples_per_second": 0.462,
"train_steps_per_second": 0.029
}
],
"logging_steps": 2,
"max_steps": 450,
"num_input_tokens_seen": 8831965,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}