| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.17142857142857143, | |
| "eval_steps": 500, | |
| "global_step": 150, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.671875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1734.0, | |
| "completions/mean_length": 1702.03125, | |
| "completions/mean_terminated_length": 993.6190795898438, | |
| "completions/min_length": 483.0, | |
| "completions/min_terminated_length": 483.0, | |
| "epoch": 0.001142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25444135069847107, | |
| "learning_rate": 0.0, | |
| "loss": -0.0, | |
| "num_tokens": 118418.0, | |
| "reward": 0.17899775505065918, | |
| "reward_std": 0.7650213241577148, | |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, | |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, | |
| "rewards/format_reward/mean": 0.375, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1894.0, | |
| "completions/mean_length": 1738.90625, | |
| "completions/mean_terminated_length": 949.0, | |
| "completions/min_length": 435.0, | |
| "completions/min_terminated_length": 435.0, | |
| "epoch": 0.002285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24364067614078522, | |
| "learning_rate": 5e-08, | |
| "loss": -0.0, | |
| "num_tokens": 239748.0, | |
| "reward": 0.3848632574081421, | |
| "reward_std": 0.9111153483390808, | |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, | |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, | |
| "rewards/format_reward/mean": 0.34375, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.890625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1405.0, | |
| "completions/mean_length": 1930.609375, | |
| "completions/mean_terminated_length": 974.71435546875, | |
| "completions/min_length": 477.0, | |
| "completions/min_terminated_length": 477.0, | |
| "epoch": 0.0034285714285714284, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27834266424179077, | |
| "learning_rate": 1e-07, | |
| "loss": -0.0, | |
| "num_tokens": 373779.0, | |
| "reward": -0.3227587938308716, | |
| "reward_std": 0.45940712094306946, | |
| "rewards/cosine_scaled_reward/mean": -0.2160668969154358, | |
| "rewards/cosine_scaled_reward/std": 0.21890601515769958, | |
| "rewards/format_reward/mean": 0.109375, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.515625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2039.0, | |
| "completions/mean_length": 1596.75, | |
| "completions/mean_terminated_length": 1116.3870849609375, | |
| "completions/min_length": 474.0, | |
| "completions/min_terminated_length": 474.0, | |
| "epoch": 0.004571428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2799243628978729, | |
| "learning_rate": 1.5e-07, | |
| "loss": -0.0, | |
| "num_tokens": 485779.0, | |
| "reward": 0.27003082633018494, | |
| "reward_std": 0.7608597874641418, | |
| "rewards/cosine_scaled_reward/mean": -0.11498458683490753, | |
| "rewards/cosine_scaled_reward/std": 0.36645373702049255, | |
| "rewards/format_reward/mean": 0.5, | |
| "rewards/format_reward/std": 0.5039526224136353, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.96875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1755.0, | |
| "completions/mean_length": 2035.46875, | |
| "completions/mean_terminated_length": 1647.0, | |
| "completions/min_length": 1539.0, | |
| "completions/min_terminated_length": 1539.0, | |
| "epoch": 0.005714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24311119318008423, | |
| "learning_rate": 2e-07, | |
| "loss": -0.0, | |
| "num_tokens": 626865.0, | |
| "reward": -0.4839385151863098, | |
| "reward_std": 0.34498828649520874, | |
| "rewards/cosine_scaled_reward/mean": -0.2732192277908325, | |
| "rewards/cosine_scaled_reward/std": 0.18402352929115295, | |
| "rewards/format_reward/mean": 0.0625, | |
| "rewards/format_reward/std": 0.24397502839565277, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.859375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1862.0, | |
| "completions/mean_length": 1884.109375, | |
| "completions/mean_terminated_length": 882.5555419921875, | |
| "completions/min_length": 524.0, | |
| "completions/min_terminated_length": 524.0, | |
| "epoch": 0.006857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2741600275039673, | |
| "learning_rate": 2.5e-07, | |
| "loss": -0.0, | |
| "num_tokens": 759096.0, | |
| "reward": -0.2049689143896103, | |
| "reward_std": 0.639178991317749, | |
| "rewards/cosine_scaled_reward/mean": -0.18060946464538574, | |
| "rewards/cosine_scaled_reward/std": 0.2599981129169464, | |
| "rewards/format_reward/mean": 0.15625, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1795.0, | |
| "completions/mean_length": 1959.84375, | |
| "completions/mean_terminated_length": 1342.75, | |
| "completions/min_length": 974.0, | |
| "completions/min_terminated_length": 974.0, | |
| "epoch": 0.008, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21986258029937744, | |
| "learning_rate": 3e-07, | |
| "loss": -0.0, | |
| "num_tokens": 894934.0, | |
| "reward": -0.11210991442203522, | |
| "reward_std": 0.6349427103996277, | |
| "rewards/cosine_scaled_reward/mean": -0.14199243485927582, | |
| "rewards/cosine_scaled_reward/std": 0.3749195337295532, | |
| "rewards/format_reward/mean": 0.171875, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1895.0, | |
| "completions/mean_length": 1717.78125, | |
| "completions/mean_terminated_length": 873.888916015625, | |
| "completions/min_length": 342.0, | |
| "completions/min_terminated_length": 342.0, | |
| "epoch": 0.009142857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.23102505505084991, | |
| "learning_rate": 3.5e-07, | |
| "loss": -0.0, | |
| "num_tokens": 1015288.0, | |
| "reward": 0.12653985619544983, | |
| "reward_std": 0.4742490351200104, | |
| "rewards/cosine_scaled_reward/mean": -0.09298005700111389, | |
| "rewards/cosine_scaled_reward/std": 0.39157670736312866, | |
| "rewards/format_reward/mean": 0.3125, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.828125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1815.0, | |
| "completions/mean_length": 1928.53125, | |
| "completions/mean_terminated_length": 1352.9091796875, | |
| "completions/min_length": 999.0, | |
| "completions/min_terminated_length": 999.0, | |
| "epoch": 0.010285714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2600339353084564, | |
| "learning_rate": 4e-07, | |
| "loss": -0.0, | |
| "num_tokens": 1150170.0, | |
| "reward": -0.14216071367263794, | |
| "reward_std": 0.702994704246521, | |
| "rewards/cosine_scaled_reward/mean": -0.17264285683631897, | |
| "rewards/cosine_scaled_reward/std": 0.33145979046821594, | |
| "rewards/format_reward/mean": 0.203125, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1432.0, | |
| "completions/mean_length": 1699.84375, | |
| "completions/mean_terminated_length": 810.1111450195312, | |
| "completions/min_length": 337.0, | |
| "completions/min_terminated_length": 337.0, | |
| "epoch": 0.011428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2749471366405487, | |
| "learning_rate": 4.5e-07, | |
| "loss": -0.0, | |
| "num_tokens": 1269792.0, | |
| "reward": -0.13922849297523499, | |
| "reward_std": 0.4937349855899811, | |
| "rewards/cosine_scaled_reward/mean": -0.2102392464876175, | |
| "rewards/cosine_scaled_reward/std": 0.30274781584739685, | |
| "rewards/format_reward/mean": 0.28125, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.9375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1614.0, | |
| "completions/mean_length": 1994.453125, | |
| "completions/mean_terminated_length": 1191.25, | |
| "completions/min_length": 916.0, | |
| "completions/min_terminated_length": 916.0, | |
| "epoch": 0.012571428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2305486500263214, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1409109.0, | |
| "reward": -0.39525067806243896, | |
| "reward_std": 0.3650783896446228, | |
| "rewards/cosine_scaled_reward/mean": -0.2288753092288971, | |
| "rewards/cosine_scaled_reward/std": 0.22182811796665192, | |
| "rewards/format_reward/mean": 0.0625, | |
| "rewards/format_reward/std": 0.24397502839565277, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.609375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2024.0, | |
| "completions/mean_length": 1706.6875, | |
| "completions/mean_terminated_length": 1174.239990234375, | |
| "completions/min_length": 319.0, | |
| "completions/min_terminated_length": 319.0, | |
| "epoch": 0.013714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2918066382408142, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1529281.0, | |
| "reward": 0.08787664026021957, | |
| "reward_std": 0.7579531073570251, | |
| "rewards/cosine_scaled_reward/mean": -0.18262416124343872, | |
| "rewards/cosine_scaled_reward/std": 0.37901216745376587, | |
| "rewards/format_reward/mean": 0.453125, | |
| "rewards/format_reward/std": 0.501733124256134, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.765625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1943.0, | |
| "completions/mean_length": 1820.828125, | |
| "completions/mean_terminated_length": 1078.7333984375, | |
| "completions/min_length": 527.0, | |
| "completions/min_terminated_length": 527.0, | |
| "epoch": 0.014857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27849823236465454, | |
| "learning_rate": 6e-07, | |
| "loss": -0.0, | |
| "num_tokens": 1656854.0, | |
| "reward": 0.03077489137649536, | |
| "reward_std": 0.6479229927062988, | |
| "rewards/cosine_scaled_reward/mean": -0.12523755431175232, | |
| "rewards/cosine_scaled_reward/std": 0.34234777092933655, | |
| "rewards/format_reward/mean": 0.28125, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1972.0, | |
| "completions/mean_length": 1772.296875, | |
| "completions/mean_terminated_length": 1165.75, | |
| "completions/min_length": 605.0, | |
| "completions/min_terminated_length": 605.0, | |
| "epoch": 0.016, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25037428736686707, | |
| "learning_rate": 6.5e-07, | |
| "loss": 0.0, | |
| "num_tokens": 1780889.0, | |
| "reward": 0.3261271119117737, | |
| "reward_std": 0.6276673078536987, | |
| "rewards/cosine_scaled_reward/mean": -0.008811453357338905, | |
| "rewards/cosine_scaled_reward/std": 0.46767035126686096, | |
| "rewards/format_reward/mean": 0.34375, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.703125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1831.0, | |
| "completions/mean_length": 1715.5625, | |
| "completions/mean_terminated_length": 928.2105102539062, | |
| "completions/min_length": 413.0, | |
| "completions/min_terminated_length": 413.0, | |
| "epoch": 0.017142857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26902371644973755, | |
| "learning_rate": 7e-07, | |
| "loss": -0.0, | |
| "num_tokens": 1901605.0, | |
| "reward": 0.3007117211818695, | |
| "reward_std": 0.3918319642543793, | |
| "rewards/cosine_scaled_reward/mean": -0.005894124507904053, | |
| "rewards/cosine_scaled_reward/std": 0.4677385091781616, | |
| "rewards/format_reward/mean": 0.3125, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.984375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 757.0, | |
| "completions/mean_length": 2027.828125, | |
| "completions/mean_terminated_length": 757.0, | |
| "completions/min_length": 757.0, | |
| "completions/min_terminated_length": 757.0, | |
| "epoch": 0.018285714285714287, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25064370036125183, | |
| "learning_rate": 7.5e-07, | |
| "loss": -0.0, | |
| "num_tokens": 2041826.0, | |
| "reward": -0.499896764755249, | |
| "reward_std": 0.34189552068710327, | |
| "rewards/cosine_scaled_reward/mean": -0.2577608823776245, | |
| "rewards/cosine_scaled_reward/std": 0.18115636706352234, | |
| "rewards/format_reward/mean": 0.015625, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.546875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1816.0, | |
| "completions/mean_length": 1530.796875, | |
| "completions/mean_terminated_length": 906.586181640625, | |
| "completions/min_length": 378.0, | |
| "completions/min_terminated_length": 378.0, | |
| "epoch": 0.019428571428571427, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3018316924571991, | |
| "learning_rate": 8e-07, | |
| "loss": -0.0, | |
| "num_tokens": 2150317.0, | |
| "reward": 0.23110359907150269, | |
| "reward_std": 0.6260336637496948, | |
| "rewards/cosine_scaled_reward/mean": -0.12663568556308746, | |
| "rewards/cosine_scaled_reward/std": 0.39377179741859436, | |
| "rewards/format_reward/mean": 0.484375, | |
| "rewards/format_reward/std": 0.5037065148353577, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.796875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1689.0, | |
| "completions/mean_length": 1813.671875, | |
| "completions/mean_terminated_length": 894.3846435546875, | |
| "completions/min_length": 505.0, | |
| "completions/min_terminated_length": 505.0, | |
| "epoch": 0.02057142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.23236438632011414, | |
| "learning_rate": 8.499999999999999e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2276768.0, | |
| "reward": -0.10029121488332748, | |
| "reward_std": 0.7172800302505493, | |
| "rewards/cosine_scaled_reward/mean": -0.18295811116695404, | |
| "rewards/cosine_scaled_reward/std": 0.3038564622402191, | |
| "rewards/format_reward/mean": 0.265625, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.78125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1697.0, | |
| "completions/mean_length": 1843.15625, | |
| "completions/mean_terminated_length": 1111.571533203125, | |
| "completions/min_length": 484.0, | |
| "completions/min_terminated_length": 484.0, | |
| "epoch": 0.021714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2313074916601181, | |
| "learning_rate": 9e-07, | |
| "loss": -0.0, | |
| "num_tokens": 2405986.0, | |
| "reward": 0.09310440719127655, | |
| "reward_std": 0.7020131349563599, | |
| "rewards/cosine_scaled_reward/mean": -0.08626029640436172, | |
| "rewards/cosine_scaled_reward/std": 0.44063708186149597, | |
| "rewards/format_reward/mean": 0.265625, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.578125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1676.0, | |
| "completions/mean_length": 1523.03125, | |
| "completions/mean_terminated_length": 803.629638671875, | |
| "completions/min_length": 395.0, | |
| "completions/min_terminated_length": 395.0, | |
| "epoch": 0.022857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.295642226934433, | |
| "learning_rate": 9.499999999999999e-07, | |
| "loss": -0.0, | |
| "num_tokens": 2514812.0, | |
| "reward": 0.3644811511039734, | |
| "reward_std": 0.7943294048309326, | |
| "rewards/cosine_scaled_reward/mean": -0.03650941699743271, | |
| "rewards/cosine_scaled_reward/std": 0.44610291719436646, | |
| "rewards/format_reward/mean": 0.4375, | |
| "rewards/format_reward/std": 0.5, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.703125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2002.0, | |
| "completions/mean_length": 1793.328125, | |
| "completions/mean_terminated_length": 1190.157958984375, | |
| "completions/min_length": 455.0, | |
| "completions/min_terminated_length": 455.0, | |
| "epoch": 0.024, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2961376905441284, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0, | |
| "num_tokens": 2640393.0, | |
| "reward": 0.06134350597858429, | |
| "reward_std": 0.6498202085494995, | |
| "rewards/cosine_scaled_reward/mean": -0.14120325446128845, | |
| "rewards/cosine_scaled_reward/std": 0.3548509478569031, | |
| "rewards/format_reward/mean": 0.34375, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1925.0, | |
| "completions/mean_length": 1386.75, | |
| "completions/mean_terminated_length": 872.4444580078125, | |
| "completions/min_length": 271.0, | |
| "completions/min_terminated_length": 271.0, | |
| "epoch": 0.025142857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.34918317198753357, | |
| "learning_rate": 9.99931462820376e-07, | |
| "loss": 0.0, | |
| "num_tokens": 2738161.0, | |
| "reward": 0.5064569711685181, | |
| "reward_std": 0.7104054689407349, | |
| "rewards/cosine_scaled_reward/mean": -0.035834040492773056, | |
| "rewards/cosine_scaled_reward/std": 0.4265843331813812, | |
| "rewards/format_reward/mean": 0.578125, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.59375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1986.0, | |
| "completions/mean_length": 1656.59375, | |
| "completions/mean_terminated_length": 1084.5384521484375, | |
| "completions/min_length": 364.0, | |
| "completions/min_terminated_length": 364.0, | |
| "epoch": 0.026285714285714287, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26697081327438354, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": -0.0, | |
| "num_tokens": 2854975.0, | |
| "reward": 0.2737857699394226, | |
| "reward_std": 0.6956006288528442, | |
| "rewards/cosine_scaled_reward/mean": -0.0896696150302887, | |
| "rewards/cosine_scaled_reward/std": 0.3913433253765106, | |
| "rewards/format_reward/mean": 0.453125, | |
| "rewards/format_reward/std": 0.501733124256134, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.6875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2004.0, | |
| "completions/mean_length": 1790.421875, | |
| "completions/mean_terminated_length": 1223.75, | |
| "completions/min_length": 421.0, | |
| "completions/min_terminated_length": 421.0, | |
| "epoch": 0.027428571428571427, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24950818717479706, | |
| "learning_rate": 9.993832906395582e-07, | |
| "loss": -0.0, | |
| "num_tokens": 2980490.0, | |
| "reward": -0.08990197628736496, | |
| "reward_std": 0.7724581956863403, | |
| "rewards/cosine_scaled_reward/mean": -0.21682599186897278, | |
| "rewards/cosine_scaled_reward/std": 0.35711658000946045, | |
| "rewards/format_reward/mean": 0.34375, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.703125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1985.0, | |
| "completions/mean_length": 1703.953125, | |
| "completions/mean_terminated_length": 889.1052856445312, | |
| "completions/min_length": 427.0, | |
| "completions/min_terminated_length": 427.0, | |
| "epoch": 0.02857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28078693151474, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3099839.0, | |
| "reward": -0.12643180787563324, | |
| "reward_std": 0.6687923669815063, | |
| "rewards/cosine_scaled_reward/mean": -0.21946589648723602, | |
| "rewards/cosine_scaled_reward/std": 0.30431970953941345, | |
| "rewards/format_reward/mean": 0.3125, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.75, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1997.0, | |
| "completions/mean_length": 1938.078125, | |
| "completions/mean_terminated_length": 1608.3125, | |
| "completions/min_length": 1087.0, | |
| "completions/min_terminated_length": 1087.0, | |
| "epoch": 0.029714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21486051380634308, | |
| "learning_rate": 9.982876141412855e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3234508.0, | |
| "reward": 0.05503671616315842, | |
| "reward_std": 0.6532000303268433, | |
| "rewards/cosine_scaled_reward/mean": -0.1287316530942917, | |
| "rewards/cosine_scaled_reward/std": 0.36068078875541687, | |
| "rewards/format_reward/mean": 0.3125, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.765625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2037.0, | |
| "completions/mean_length": 1896.375, | |
| "completions/mean_terminated_length": 1401.0667724609375, | |
| "completions/min_length": 568.0, | |
| "completions/min_terminated_length": 568.0, | |
| "epoch": 0.030857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2675936222076416, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": -0.0, | |
| "num_tokens": 3366164.0, | |
| "reward": -0.02987336739897728, | |
| "reward_std": 0.5919089913368225, | |
| "rewards/cosine_scaled_reward/mean": -0.1633741855621338, | |
| "rewards/cosine_scaled_reward/std": 0.3508918881416321, | |
| "rewards/format_reward/mean": 0.296875, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1951.0, | |
| "completions/mean_length": 1832.96875, | |
| "completions/mean_terminated_length": 901.1666870117188, | |
| "completions/min_length": 450.0, | |
| "completions/min_terminated_length": 450.0, | |
| "epoch": 0.032, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2518174946308136, | |
| "learning_rate": 9.96645768238595e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3493810.0, | |
| "reward": 0.08577289432287216, | |
| "reward_std": 0.6993601322174072, | |
| "rewards/cosine_scaled_reward/mean": -0.08211354911327362, | |
| "rewards/cosine_scaled_reward/std": 0.45168522000312805, | |
| "rewards/format_reward/mean": 0.25, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.859375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1779.0, | |
| "completions/mean_length": 1921.1875, | |
| "completions/mean_terminated_length": 1146.2222900390625, | |
| "completions/min_length": 710.0, | |
| "completions/min_terminated_length": 710.0, | |
| "epoch": 0.03314285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25027790665626526, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": -0.0, | |
| "num_tokens": 3627238.0, | |
| "reward": -0.3098237216472626, | |
| "reward_std": 0.4339829087257385, | |
| "rewards/cosine_scaled_reward/mean": -0.2330368608236313, | |
| "rewards/cosine_scaled_reward/std": 0.17332859337329865, | |
| "rewards/format_reward/mean": 0.15625, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2048.0, | |
| "completions/mean_length": 1891.109375, | |
| "completions/mean_terminated_length": 1490.1666259765625, | |
| "completions/min_length": 401.0, | |
| "completions/min_terminated_length": 401.0, | |
| "epoch": 0.03428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24189673364162445, | |
| "learning_rate": 9.944597532678119e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3758805.0, | |
| "reward": -0.08874380588531494, | |
| "reward_std": 0.5923835635185242, | |
| "rewards/cosine_scaled_reward/mean": -0.18499691784381866, | |
| "rewards/cosine_scaled_reward/std": 0.27955111861228943, | |
| "rewards/format_reward/mean": 0.28125, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1467.0, | |
| "completions/mean_length": 1818.8125, | |
| "completions/mean_terminated_length": 825.6666870117188, | |
| "completions/min_length": 444.0, | |
| "completions/min_terminated_length": 444.0, | |
| "epoch": 0.03542857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24893531203269958, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0, | |
| "num_tokens": 3885705.0, | |
| "reward": -0.18628405034542084, | |
| "reward_std": 0.5522075891494751, | |
| "rewards/cosine_scaled_reward/mean": -0.20251703262329102, | |
| "rewards/cosine_scaled_reward/std": 0.37591472268104553, | |
| "rewards/format_reward/mean": 0.21875, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.796875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1867.0, | |
| "completions/mean_length": 1878.140625, | |
| "completions/mean_terminated_length": 1211.769287109375, | |
| "completions/min_length": 654.0, | |
| "completions/min_terminated_length": 654.0, | |
| "epoch": 0.036571428571428574, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25341352820396423, | |
| "learning_rate": 9.917322325514487e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4016258.0, | |
| "reward": -0.14861394464969635, | |
| "reward_std": 0.5451517105102539, | |
| "rewards/cosine_scaled_reward/mean": -0.19149449467658997, | |
| "rewards/cosine_scaled_reward/std": 0.3489256203174591, | |
| "rewards/format_reward/mean": 0.234375, | |
| "rewards/format_reward/std": 0.42695629596710205, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.890625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1823.0, | |
| "completions/mean_length": 1969.03125, | |
| "completions/mean_terminated_length": 1326.0, | |
| "completions/min_length": 998.0, | |
| "completions/min_terminated_length": 998.0, | |
| "epoch": 0.037714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.23557375371456146, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": -0.0, | |
| "num_tokens": 4153492.0, | |
| "reward": -0.3634287118911743, | |
| "reward_std": 0.5462046265602112, | |
| "rewards/cosine_scaled_reward/mean": -0.25983935594558716, | |
| "rewards/cosine_scaled_reward/std": 0.3271723687648773, | |
| "rewards/format_reward/mean": 0.15625, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.53125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1998.0, | |
| "completions/mean_length": 1568.296875, | |
| "completions/mean_terminated_length": 1024.6334228515625, | |
| "completions/min_length": 506.0, | |
| "completions/min_terminated_length": 506.0, | |
| "epoch": 0.038857142857142854, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2892495095729828, | |
| "learning_rate": 9.88466529153356e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4263415.0, | |
| "reward": 0.575156569480896, | |
| "reward_std": 0.8866004347801208, | |
| "rewards/cosine_scaled_reward/mean": 0.045390784740448, | |
| "rewards/cosine_scaled_reward/std": 0.5505619645118713, | |
| "rewards/format_reward/mean": 0.484375, | |
| "rewards/format_reward/std": 0.5037065148353577, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.84375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1834.0, | |
| "completions/mean_length": 1830.65625, | |
| "completions/mean_terminated_length": 657.0, | |
| "completions/min_length": 371.0, | |
| "completions/min_terminated_length": 371.0, | |
| "epoch": 0.04, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28274399042129517, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": -0.0, | |
| "num_tokens": 4392073.0, | |
| "reward": -0.1704331934452057, | |
| "reward_std": 0.7666259407997131, | |
| "rewards/cosine_scaled_reward/mean": -0.18677911162376404, | |
| "rewards/cosine_scaled_reward/std": 0.36125659942626953, | |
| "rewards/format_reward/mean": 0.203125, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.890625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1962.0, | |
| "completions/mean_length": 1950.671875, | |
| "completions/mean_terminated_length": 1158.1429443359375, | |
| "completions/min_length": 669.0, | |
| "completions/min_terminated_length": 669.0, | |
| "epoch": 0.04114285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2504905164241791, | |
| "learning_rate": 9.846666218300807e-07, | |
| "loss": -0.0, | |
| "num_tokens": 4528028.0, | |
| "reward": -0.49544650316238403, | |
| "reward_std": 0.3493530750274658, | |
| "rewards/cosine_scaled_reward/mean": -0.302410751581192, | |
| "rewards/cosine_scaled_reward/std": 0.17342224717140198, | |
| "rewards/format_reward/mean": 0.109375, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.859375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1879.0, | |
| "completions/mean_length": 1956.546875, | |
| "completions/mean_terminated_length": 1397.6666259765625, | |
| "completions/min_length": 789.0, | |
| "completions/min_terminated_length": 789.0, | |
| "epoch": 0.04228571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24223695695400238, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4664271.0, | |
| "reward": -0.2983526587486267, | |
| "reward_std": 0.45510220527648926, | |
| "rewards/cosine_scaled_reward/mean": -0.22730132937431335, | |
| "rewards/cosine_scaled_reward/std": 0.21532759070396423, | |
| "rewards/format_reward/mean": 0.15625, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.8125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1844.0, | |
| "completions/mean_length": 1872.0625, | |
| "completions/mean_terminated_length": 1109.666748046875, | |
| "completions/min_length": 799.0, | |
| "completions/min_terminated_length": 799.0, | |
| "epoch": 0.04342857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22518785297870636, | |
| "learning_rate": 9.80337140183366e-07, | |
| "loss": 0.0, | |
| "num_tokens": 4795795.0, | |
| "reward": -0.0591111034154892, | |
| "reward_std": 0.38858330249786377, | |
| "rewards/cosine_scaled_reward/mean": -0.1311180591583252, | |
| "rewards/cosine_scaled_reward/std": 0.32316854596138, | |
| "rewards/format_reward/mean": 0.203125, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.71875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1982.0, | |
| "completions/mean_length": 1674.8125, | |
| "completions/mean_terminated_length": 721.1111450195312, | |
| "completions/min_length": 358.0, | |
| "completions/min_terminated_length": 358.0, | |
| "epoch": 0.044571428571428574, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26911357045173645, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": -0.0, | |
| "num_tokens": 4913767.0, | |
| "reward": 0.14183415472507477, | |
| "reward_std": 0.6081592440605164, | |
| "rewards/cosine_scaled_reward/mean": -0.09314543008804321, | |
| "rewards/cosine_scaled_reward/std": 0.3410241901874542, | |
| "rewards/format_reward/mean": 0.328125, | |
| "rewards/format_reward/std": 0.4732423722743988, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2018.0, | |
| "completions/mean_length": 1549.328125, | |
| "completions/mean_terminated_length": 908.1785888671875, | |
| "completions/min_length": 204.0, | |
| "completions/min_terminated_length": 204.0, | |
| "epoch": 0.045714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2770562767982483, | |
| "learning_rate": 9.754833590196926e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5022996.0, | |
| "reward": 0.3034515678882599, | |
| "reward_std": 0.5147567987442017, | |
| "rewards/cosine_scaled_reward/mean": -0.09827423095703125, | |
| "rewards/cosine_scaled_reward/std": 0.39993754029273987, | |
| "rewards/format_reward/mean": 0.5, | |
| "rewards/format_reward/std": 0.5039526224136353, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1819.0, | |
| "completions/mean_length": 1768.609375, | |
| "completions/mean_terminated_length": 1302.9583740234375, | |
| "completions/min_length": 584.0, | |
| "completions/min_terminated_length": 584.0, | |
| "epoch": 0.046857142857142854, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.23544111847877502, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5147339.0, | |
| "reward": 0.05204566568136215, | |
| "reward_std": 0.7308298349380493, | |
| "rewards/cosine_scaled_reward/mean": -0.18491466343402863, | |
| "rewards/cosine_scaled_reward/std": 0.3467314541339874, | |
| "rewards/format_reward/mean": 0.421875, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.703125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1894.0, | |
| "completions/mean_length": 1662.234375, | |
| "completions/mean_terminated_length": 748.5789794921875, | |
| "completions/min_length": 164.0, | |
| "completions/min_terminated_length": 164.0, | |
| "epoch": 0.048, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3946230709552765, | |
| "learning_rate": 9.701111919237408e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5264082.0, | |
| "reward": -0.1084136962890625, | |
| "reward_std": 0.35625624656677246, | |
| "rewards/cosine_scaled_reward/mean": -0.21045684814453125, | |
| "rewards/cosine_scaled_reward/std": 0.17068159580230713, | |
| "rewards/format_reward/mean": 0.3125, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.609375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2044.0, | |
| "completions/mean_length": 1628.796875, | |
| "completions/mean_terminated_length": 974.8399658203125, | |
| "completions/min_length": 387.0, | |
| "completions/min_terminated_length": 387.0, | |
| "epoch": 0.04914285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2622542679309845, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": -0.0, | |
| "num_tokens": 5379941.0, | |
| "reward": 0.24864289164543152, | |
| "reward_std": 0.622364342212677, | |
| "rewards/cosine_scaled_reward/mean": -0.08661604672670364, | |
| "rewards/cosine_scaled_reward/std": 0.3968709111213684, | |
| "rewards/format_reward/mean": 0.421875, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1813.0, | |
| "completions/mean_length": 1598.265625, | |
| "completions/mean_terminated_length": 848.7083740234375, | |
| "completions/min_length": 233.0, | |
| "completions/min_terminated_length": 233.0, | |
| "epoch": 0.05028571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3861124813556671, | |
| "learning_rate": 9.64227184053598e-07, | |
| "loss": -0.0, | |
| "num_tokens": 5492926.0, | |
| "reward": 0.17736095190048218, | |
| "reward_std": 0.5736653804779053, | |
| "rewards/cosine_scaled_reward/mean": -0.09881951659917831, | |
| "rewards/cosine_scaled_reward/std": 0.4637540578842163, | |
| "rewards/format_reward/mean": 0.375, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1934.0, | |
| "completions/mean_length": 1945.546875, | |
| "completions/mean_terminated_length": 1228.375, | |
| "completions/min_length": 909.0, | |
| "completions/min_terminated_length": 909.0, | |
| "epoch": 0.05142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2586025893688202, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": -0.0, | |
| "num_tokens": 5629097.0, | |
| "reward": -0.2874904274940491, | |
| "reward_std": 0.4528215825557709, | |
| "rewards/cosine_scaled_reward/mean": -0.21405771374702454, | |
| "rewards/cosine_scaled_reward/std": 0.3033171594142914, | |
| "rewards/format_reward/mean": 0.140625, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.765625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1514.0, | |
| "completions/mean_length": 1772.890625, | |
| "completions/mean_terminated_length": 874.2000732421875, | |
| "completions/min_length": 597.0, | |
| "completions/min_terminated_length": 597.0, | |
| "epoch": 0.052571428571428575, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27347299456596375, | |
| "learning_rate": 9.578385041664925e-07, | |
| "loss": -0.0, | |
| "num_tokens": 5753730.0, | |
| "reward": -0.0957992672920227, | |
| "reward_std": 0.4836219251155853, | |
| "rewards/cosine_scaled_reward/mean": -0.17289963364601135, | |
| "rewards/cosine_scaled_reward/std": 0.3050842881202698, | |
| "rewards/format_reward/mean": 0.25, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.609375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2020.0, | |
| "completions/mean_length": 1595.734375, | |
| "completions/mean_terminated_length": 890.199951171875, | |
| "completions/min_length": 379.0, | |
| "completions/min_terminated_length": 379.0, | |
| "epoch": 0.053714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3536407947540283, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0, | |
| "num_tokens": 5866257.0, | |
| "reward": -0.01777055859565735, | |
| "reward_std": 0.6523094177246094, | |
| "rewards/cosine_scaled_reward/mean": -0.22763527929782867, | |
| "rewards/cosine_scaled_reward/std": 0.3455982208251953, | |
| "rewards/format_reward/mean": 0.4375, | |
| "rewards/format_reward/std": 0.5, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.546875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1792.0, | |
| "completions/mean_length": 1581.84375, | |
| "completions/mean_terminated_length": 1019.2413940429688, | |
| "completions/min_length": 397.0, | |
| "completions/min_terminated_length": 397.0, | |
| "epoch": 0.054857142857142854, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26218104362487793, | |
| "learning_rate": 9.509529358847654e-07, | |
| "loss": -0.0, | |
| "num_tokens": 5978039.0, | |
| "reward": 0.36145922541618347, | |
| "reward_std": 0.8229352235794067, | |
| "rewards/cosine_scaled_reward/mean": -0.06145789101719856, | |
| "rewards/cosine_scaled_reward/std": 0.4491077661514282, | |
| "rewards/format_reward/mean": 0.484375, | |
| "rewards/format_reward/std": 0.5037065148353577, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.484375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1525.0, | |
| "completions/mean_length": 1404.46875, | |
| "completions/mean_terminated_length": 799.9393920898438, | |
| "completions/min_length": 276.0, | |
| "completions/min_terminated_length": 276.0, | |
| "epoch": 0.056, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3138538897037506, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6077989.0, | |
| "reward": 0.23753327131271362, | |
| "reward_std": 0.6856037378311157, | |
| "rewards/cosine_scaled_reward/mean": -0.1468583643436432, | |
| "rewards/cosine_scaled_reward/std": 0.36308491230010986, | |
| "rewards/format_reward/mean": 0.53125, | |
| "rewards/format_reward/std": 0.5029674172401428, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.65625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1834.0, | |
| "completions/mean_length": 1668.46875, | |
| "completions/mean_terminated_length": 943.9091186523438, | |
| "completions/min_length": 327.0, | |
| "completions/min_terminated_length": 327.0, | |
| "epoch": 0.05714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2541959285736084, | |
| "learning_rate": 9.43578868212728e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6195587.0, | |
| "reward": 0.2079824060201645, | |
| "reward_std": 0.6563009023666382, | |
| "rewards/cosine_scaled_reward/mean": -0.09132131934165955, | |
| "rewards/cosine_scaled_reward/std": 0.39781448245048523, | |
| "rewards/format_reward/mean": 0.390625, | |
| "rewards/format_reward/std": 0.4917473793029785, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.421875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1850.0, | |
| "completions/mean_length": 1368.90625, | |
| "completions/mean_terminated_length": 873.3513793945312, | |
| "completions/min_length": 432.0, | |
| "completions/min_terminated_length": 432.0, | |
| "epoch": 0.05828571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28964340686798096, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": -0.0, | |
| "num_tokens": 6293405.0, | |
| "reward": 0.3547493815422058, | |
| "reward_std": 0.702359139919281, | |
| "rewards/cosine_scaled_reward/mean": -0.1116877943277359, | |
| "rewards/cosine_scaled_reward/std": 0.37401553988456726, | |
| "rewards/format_reward/mean": 0.578125, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.65625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1647.0, | |
| "completions/mean_length": 1649.34375, | |
| "completions/mean_terminated_length": 888.2727661132812, | |
| "completions/min_length": 327.0, | |
| "completions/min_terminated_length": 327.0, | |
| "epoch": 0.05942857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.33357536792755127, | |
| "learning_rate": 9.357252853159505e-07, | |
| "loss": -0.0, | |
| "num_tokens": 6410315.0, | |
| "reward": 0.257318913936615, | |
| "reward_std": 1.0037888288497925, | |
| "rewards/cosine_scaled_reward/mean": -0.08227802813053131, | |
| "rewards/cosine_scaled_reward/std": 0.48496147990226746, | |
| "rewards/format_reward/mean": 0.421875, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.546875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1957.0, | |
| "completions/mean_length": 1505.484375, | |
| "completions/mean_terminated_length": 850.72412109375, | |
| "completions/min_length": 448.0, | |
| "completions/min_terminated_length": 448.0, | |
| "epoch": 0.060571428571428575, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28791311383247375, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6517498.0, | |
| "reward": 0.10961895436048508, | |
| "reward_std": 0.6692662239074707, | |
| "rewards/cosine_scaled_reward/mean": -0.19519051909446716, | |
| "rewards/cosine_scaled_reward/std": 0.31183505058288574, | |
| "rewards/format_reward/mean": 0.5, | |
| "rewards/format_reward/std": 0.5039526224136353, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.390625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2047.0, | |
| "completions/mean_length": 1480.296875, | |
| "completions/mean_terminated_length": 1116.3846435546875, | |
| "completions/min_length": 421.0, | |
| "completions/min_terminated_length": 421.0, | |
| "epoch": 0.061714285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.25811222195625305, | |
| "learning_rate": 9.274017555754407e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6623381.0, | |
| "reward": 0.7679780721664429, | |
| "reward_std": 0.8107975125312805, | |
| "rewards/cosine_scaled_reward/mean": 0.040239036083221436, | |
| "rewards/cosine_scaled_reward/std": 0.535083532333374, | |
| "rewards/format_reward/mean": 0.6875, | |
| "rewards/format_reward/std": 0.467176616191864, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.453125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1734.0, | |
| "completions/mean_length": 1411.125, | |
| "completions/mean_terminated_length": 883.4285888671875, | |
| "completions/min_length": 383.0, | |
| "completions/min_terminated_length": 383.0, | |
| "epoch": 0.06285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2835226356983185, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": -0.0, | |
| "num_tokens": 6723981.0, | |
| "reward": 0.2917740046977997, | |
| "reward_std": 0.7892479300498962, | |
| "rewards/cosine_scaled_reward/mean": -0.13536299765110016, | |
| "rewards/cosine_scaled_reward/std": 0.3841571509838104, | |
| "rewards/format_reward/mean": 0.5625, | |
| "rewards/format_reward/std": 0.5, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2032.0, | |
| "completions/mean_length": 1539.265625, | |
| "completions/mean_terminated_length": 1030.53125, | |
| "completions/min_length": 432.0, | |
| "completions/min_terminated_length": 432.0, | |
| "epoch": 0.064, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27514079213142395, | |
| "learning_rate": 9.186184199300463e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6833454.0, | |
| "reward": 0.518336832523346, | |
| "reward_std": 0.5821805000305176, | |
| "rewards/cosine_scaled_reward/mean": -0.045519083738327026, | |
| "rewards/cosine_scaled_reward/std": 0.47304341197013855, | |
| "rewards/format_reward/mean": 0.609375, | |
| "rewards/format_reward/std": 0.4917473793029785, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.453125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2043.0, | |
| "completions/mean_length": 1580.421875, | |
| "completions/mean_terminated_length": 1193.0, | |
| "completions/min_length": 709.0, | |
| "completions/min_terminated_length": 709.0, | |
| "epoch": 0.06514285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24258998036384583, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0, | |
| "num_tokens": 6946169.0, | |
| "reward": 0.06029004976153374, | |
| "reward_std": 0.5583463311195374, | |
| "rewards/cosine_scaled_reward/mean": -0.2667299807071686, | |
| "rewards/cosine_scaled_reward/std": 0.29048436880111694, | |
| "rewards/format_reward/mean": 0.59375, | |
| "rewards/format_reward/std": 0.49501484632492065, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.390625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1777.0, | |
| "completions/mean_length": 1361.828125, | |
| "completions/mean_terminated_length": 921.974365234375, | |
| "completions/min_length": 480.0, | |
| "completions/min_terminated_length": 480.0, | |
| "epoch": 0.06628571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2641505300998688, | |
| "learning_rate": 9.093859795212817e-07, | |
| "loss": 0.0, | |
| "num_tokens": 7043422.0, | |
| "reward": 0.5511020421981812, | |
| "reward_std": 0.7235630750656128, | |
| "rewards/cosine_scaled_reward/mean": -0.036948978900909424, | |
| "rewards/cosine_scaled_reward/std": 0.4425795376300812, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2008.0, | |
| "completions/mean_length": 1535.578125, | |
| "completions/mean_terminated_length": 1184.9736328125, | |
| "completions/min_length": 549.0, | |
| "completions/min_terminated_length": 549.0, | |
| "epoch": 0.06742857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2389107048511505, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": -0.0, | |
| "num_tokens": 7152163.0, | |
| "reward": 0.40797895193099976, | |
| "reward_std": 0.6782904863357544, | |
| "rewards/cosine_scaled_reward/mean": -0.11632302403450012, | |
| "rewards/cosine_scaled_reward/std": 0.4052051305770874, | |
| "rewards/format_reward/mean": 0.640625, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2038.0, | |
| "completions/mean_length": 1534.46875, | |
| "completions/mean_terminated_length": 1020.9375, | |
| "completions/min_length": 277.0, | |
| "completions/min_terminated_length": 277.0, | |
| "epoch": 0.06857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2644842863082886, | |
| "learning_rate": 8.997156826556369e-07, | |
| "loss": 0.0, | |
| "num_tokens": 7261257.0, | |
| "reward": 0.09485618025064468, | |
| "reward_std": 0.690287709236145, | |
| "rewards/cosine_scaled_reward/mean": -0.21038439869880676, | |
| "rewards/cosine_scaled_reward/std": 0.3277226686477661, | |
| "rewards/format_reward/mean": 0.515625, | |
| "rewards/format_reward/std": 0.5037065148353577, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.578125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2007.0, | |
| "completions/mean_length": 1678.84375, | |
| "completions/mean_terminated_length": 1172.9630126953125, | |
| "completions/min_length": 434.0, | |
| "completions/min_terminated_length": 434.0, | |
| "epoch": 0.06971428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2128395438194275, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0, | |
| "num_tokens": 7380223.0, | |
| "reward": 0.3876636028289795, | |
| "reward_std": 0.8163598775863647, | |
| "rewards/cosine_scaled_reward/mean": -0.06398070603609085, | |
| "rewards/cosine_scaled_reward/std": 0.37083569169044495, | |
| "rewards/format_reward/mean": 0.515625, | |
| "rewards/format_reward/std": 0.5037065148353577, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1620.0, | |
| "completions/mean_length": 1104.90625, | |
| "completions/mean_terminated_length": 909.1697998046875, | |
| "completions/min_length": 387.0, | |
| "completions/min_terminated_length": 387.0, | |
| "epoch": 0.07085714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2858783006668091, | |
| "learning_rate": 8.896193111002475e-07, | |
| "loss": -0.0, | |
| "num_tokens": 7461169.0, | |
| "reward": 1.0647192001342773, | |
| "reward_std": 0.7218182682991028, | |
| "rewards/cosine_scaled_reward/mean": 0.11048462986946106, | |
| "rewards/cosine_scaled_reward/std": 0.5039199590682983, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1842.0, | |
| "completions/mean_length": 1272.703125, | |
| "completions/mean_terminated_length": 866.5952758789062, | |
| "completions/min_length": 311.0, | |
| "completions/min_terminated_length": 311.0, | |
| "epoch": 0.072, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.29019492864608765, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": -0.0, | |
| "num_tokens": 7552878.0, | |
| "reward": 0.7738356590270996, | |
| "reward_std": 0.6520147323608398, | |
| "rewards/cosine_scaled_reward/mean": 0.050980325788259506, | |
| "rewards/cosine_scaled_reward/std": 0.4395767152309418, | |
| "rewards/format_reward/mean": 0.671875, | |
| "rewards/format_reward/std": 0.4732423722743988, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1929.0, | |
| "completions/mean_length": 1234.015625, | |
| "completions/mean_terminated_length": 1026.5294189453125, | |
| "completions/min_length": 425.0, | |
| "completions/min_terminated_length": 425.0, | |
| "epoch": 0.07314285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2868574261665344, | |
| "learning_rate": 8.791091657286267e-07, | |
| "loss": -0.0, | |
| "num_tokens": 7642807.0, | |
| "reward": 0.7110692858695984, | |
| "reward_std": 0.5455821752548218, | |
| "rewards/cosine_scaled_reward/mean": -0.07415284961462021, | |
| "rewards/cosine_scaled_reward/std": 0.3914482891559601, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2030.0, | |
| "completions/mean_length": 1297.984375, | |
| "completions/mean_terminated_length": 957.0682373046875, | |
| "completions/min_length": 348.0, | |
| "completions/min_terminated_length": 348.0, | |
| "epoch": 0.07428571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27404776215553284, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0, | |
| "num_tokens": 7736582.0, | |
| "reward": 0.3309648334980011, | |
| "reward_std": 0.5653569102287292, | |
| "rewards/cosine_scaled_reward/mean": -0.18608009815216064, | |
| "rewards/cosine_scaled_reward/std": 0.3975105583667755, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1778.0, | |
| "completions/mean_length": 940.03125, | |
| "completions/mean_terminated_length": 781.7500610351562, | |
| "completions/min_length": 265.0, | |
| "completions/min_terminated_length": 265.0, | |
| "epoch": 0.07542857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3140449821949005, | |
| "learning_rate": 8.681980515339463e-07, | |
| "loss": 0.0, | |
| "num_tokens": 7806976.0, | |
| "reward": 0.9573196172714233, | |
| "reward_std": 0.7096561193466187, | |
| "rewards/cosine_scaled_reward/mean": 0.04115980118513107, | |
| "rewards/cosine_scaled_reward/std": 0.4971291422843933, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.640625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1989.0, | |
| "completions/mean_length": 1740.34375, | |
| "completions/mean_terminated_length": 1191.9130859375, | |
| "completions/min_length": 769.0, | |
| "completions/min_terminated_length": 769.0, | |
| "epoch": 0.07657142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22468577325344086, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": -0.0, | |
| "num_tokens": 7929094.0, | |
| "reward": -0.05777654051780701, | |
| "reward_std": 0.4052577018737793, | |
| "rewards/cosine_scaled_reward/mean": -0.2242007553577423, | |
| "rewards/cosine_scaled_reward/std": 0.19833898544311523, | |
| "rewards/format_reward/mean": 0.390625, | |
| "rewards/format_reward/std": 0.4917473793029785, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1562.0, | |
| "completions/mean_length": 818.421875, | |
| "completions/mean_terminated_length": 691.22412109375, | |
| "completions/min_length": 296.0, | |
| "completions/min_terminated_length": 296.0, | |
| "epoch": 0.07771428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.32118088006973267, | |
| "learning_rate": 8.568992620281243e-07, | |
| "loss": -0.0, | |
| "num_tokens": 7990729.0, | |
| "reward": 0.9250792264938354, | |
| "reward_std": 0.7536466717720032, | |
| "rewards/cosine_scaled_reward/mean": 0.0016020983457565308, | |
| "rewards/cosine_scaled_reward/std": 0.4650251567363739, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1998.0, | |
| "completions/mean_length": 1106.140625, | |
| "completions/mean_terminated_length": 888.7885131835938, | |
| "completions/min_length": 369.0, | |
| "completions/min_terminated_length": 369.0, | |
| "epoch": 0.07885714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.34258174896240234, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0, | |
| "num_tokens": 8071866.0, | |
| "reward": 0.43910637497901917, | |
| "reward_std": 0.5676280856132507, | |
| "rewards/cosine_scaled_reward/mean": -0.19450931251049042, | |
| "rewards/cosine_scaled_reward/std": 0.277770459651947, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1964.0, | |
| "completions/mean_length": 1519.0625, | |
| "completions/mean_terminated_length": 1157.157958984375, | |
| "completions/min_length": 479.0, | |
| "completions/min_terminated_length": 479.0, | |
| "epoch": 0.08, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2579463720321655, | |
| "learning_rate": 8.452265630457282e-07, | |
| "loss": 0.0, | |
| "num_tokens": 8180542.0, | |
| "reward": 0.38829973340034485, | |
| "reward_std": 0.7944818735122681, | |
| "rewards/cosine_scaled_reward/mean": -0.12616263329982758, | |
| "rewards/cosine_scaled_reward/std": 0.42241016030311584, | |
| "rewards/format_reward/mean": 0.640625, | |
| "rewards/format_reward/std": 0.4836103618144989, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1999.0, | |
| "completions/mean_length": 1468.515625, | |
| "completions/mean_terminated_length": 1017.8055419921875, | |
| "completions/min_length": 316.0, | |
| "completions/min_terminated_length": 316.0, | |
| "epoch": 0.08114285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.29125189781188965, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0, | |
| "num_tokens": 8285247.0, | |
| "reward": 0.5433856248855591, | |
| "reward_std": 0.7695709466934204, | |
| "rewards/cosine_scaled_reward/mean": -0.017369696870446205, | |
| "rewards/cosine_scaled_reward/std": 0.49587228894233704, | |
| "rewards/format_reward/mean": 0.578125, | |
| "rewards/format_reward/std": 0.49776285886764526, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2039.0, | |
| "completions/mean_length": 1293.078125, | |
| "completions/mean_terminated_length": 949.9318237304688, | |
| "completions/min_length": 264.0, | |
| "completions/min_terminated_length": 264.0, | |
| "epoch": 0.08228571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3353990316390991, | |
| "learning_rate": 8.331941759724268e-07, | |
| "loss": -0.0, | |
| "num_tokens": 8378356.0, | |
| "reward": 0.42883288860321045, | |
| "reward_std": 0.6259180307388306, | |
| "rewards/cosine_scaled_reward/mean": -0.14495855569839478, | |
| "rewards/cosine_scaled_reward/std": 0.29958412051200867, | |
| "rewards/format_reward/mean": 0.71875, | |
| "rewards/format_reward/std": 0.4531635046005249, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.40625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2041.0, | |
| "completions/mean_length": 1505.328125, | |
| "completions/mean_terminated_length": 1134.0263671875, | |
| "completions/min_length": 590.0, | |
| "completions/min_terminated_length": 590.0, | |
| "epoch": 0.08342857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24657614529132843, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0, | |
| "num_tokens": 8485425.0, | |
| "reward": 0.2927630543708801, | |
| "reward_std": 0.5202052593231201, | |
| "rewards/cosine_scaled_reward/mean": -0.15830597281455994, | |
| "rewards/cosine_scaled_reward/std": 0.3964028060436249, | |
| "rewards/format_reward/mean": 0.609375, | |
| "rewards/format_reward/std": 0.4917473793029785, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1677.0, | |
| "completions/mean_length": 1350.203125, | |
| "completions/mean_terminated_length": 931.5250244140625, | |
| "completions/min_length": 416.0, | |
| "completions/min_terminated_length": 416.0, | |
| "epoch": 0.08457142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2827599048614502, | |
| "learning_rate": 8.208167604184217e-07, | |
| "loss": -0.0, | |
| "num_tokens": 8581766.0, | |
| "reward": 0.599705159664154, | |
| "reward_std": 0.6438653469085693, | |
| "rewards/cosine_scaled_reward/mean": -0.012647412717342377, | |
| "rewards/cosine_scaled_reward/std": 0.49363547563552856, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.296875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2029.0, | |
| "completions/mean_length": 1410.484375, | |
| "completions/mean_terminated_length": 1141.3111572265625, | |
| "completions/min_length": 334.0, | |
| "completions/min_terminated_length": 334.0, | |
| "epoch": 0.08571428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2516263425350189, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": -0.0, | |
| "num_tokens": 8682997.0, | |
| "reward": 0.7366052269935608, | |
| "reward_std": 0.5748400092124939, | |
| "rewards/cosine_scaled_reward/mean": -0.030134890228509903, | |
| "rewards/cosine_scaled_reward/std": 0.4929082691669464, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.296875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1604.0, | |
| "completions/mean_length": 1206.453125, | |
| "completions/mean_terminated_length": 851.1333618164062, | |
| "completions/min_length": 342.0, | |
| "completions/min_terminated_length": 342.0, | |
| "epoch": 0.08685714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.33788394927978516, | |
| "learning_rate": 8.081093963579707e-07, | |
| "loss": 0.0, | |
| "num_tokens": 8770610.0, | |
| "reward": 0.2925173044204712, | |
| "reward_std": 0.543351948261261, | |
| "rewards/cosine_scaled_reward/mean": -0.2053038477897644, | |
| "rewards/cosine_scaled_reward/std": 0.35098204016685486, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2001.0, | |
| "completions/mean_length": 1203.796875, | |
| "completions/mean_terminated_length": 988.6078491210938, | |
| "completions/min_length": 459.0, | |
| "completions/min_terminated_length": 459.0, | |
| "epoch": 0.088, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2513405680656433, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": -0.0, | |
| "num_tokens": 8859229.0, | |
| "reward": 0.5237706899642944, | |
| "reward_std": 0.6414985060691833, | |
| "rewards/cosine_scaled_reward/mean": -0.15217715501785278, | |
| "rewards/cosine_scaled_reward/std": 0.3733552396297455, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.46875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1883.0, | |
| "completions/mean_length": 1434.796875, | |
| "completions/mean_terminated_length": 893.7352905273438, | |
| "completions/min_length": 381.0, | |
| "completions/min_terminated_length": 381.0, | |
| "epoch": 0.08914285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28274238109588623, | |
| "learning_rate": 7.950875657567621e-07, | |
| "loss": -0.0, | |
| "num_tokens": 8961776.0, | |
| "reward": 0.3842669129371643, | |
| "reward_std": 0.7946954369544983, | |
| "rewards/cosine_scaled_reward/mean": -0.08911655098199844, | |
| "rewards/cosine_scaled_reward/std": 0.4567674696445465, | |
| "rewards/format_reward/mean": 0.5625, | |
| "rewards/format_reward/std": 0.5, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1759.0, | |
| "completions/mean_length": 958.921875, | |
| "completions/mean_terminated_length": 757.24072265625, | |
| "completions/min_length": 234.0, | |
| "completions/min_terminated_length": 234.0, | |
| "epoch": 0.09028571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3380838930606842, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0, | |
| "num_tokens": 9033243.0, | |
| "reward": 0.5171822309494019, | |
| "reward_std": 0.4722011089324951, | |
| "rewards/cosine_scaled_reward/mean": -0.16328388452529907, | |
| "rewards/cosine_scaled_reward/std": 0.371114581823349, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.296875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2001.0, | |
| "completions/mean_length": 1309.25, | |
| "completions/mean_terminated_length": 997.3333740234375, | |
| "completions/min_length": 234.0, | |
| "completions/min_terminated_length": 234.0, | |
| "epoch": 0.09142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28732550144195557, | |
| "learning_rate": 7.817671337095244e-07, | |
| "loss": -0.0, | |
| "num_tokens": 9127427.0, | |
| "reward": 0.42246782779693604, | |
| "reward_std": 0.5261572003364563, | |
| "rewards/cosine_scaled_reward/mean": -0.1559535712003708, | |
| "rewards/cosine_scaled_reward/std": 0.37669748067855835, | |
| "rewards/format_reward/mean": 0.734375, | |
| "rewards/format_reward/std": 0.44515693187713623, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1912.0, | |
| "completions/mean_length": 1156.71875, | |
| "completions/mean_terminated_length": 991.6666870117188, | |
| "completions/min_length": 251.0, | |
| "completions/min_terminated_length": 251.0, | |
| "epoch": 0.09257142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.36872920393943787, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0, | |
| "num_tokens": 9212729.0, | |
| "reward": 0.6148654222488403, | |
| "reward_std": 0.6330965161323547, | |
| "rewards/cosine_scaled_reward/mean": -0.13006731867790222, | |
| "rewards/cosine_scaled_reward/std": 0.36434388160705566, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1952.0, | |
| "completions/mean_length": 1259.765625, | |
| "completions/mean_terminated_length": 997.0208740234375, | |
| "completions/min_length": 441.0, | |
| "completions/min_terminated_length": 441.0, | |
| "epoch": 0.09371428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2519986927509308, | |
| "learning_rate": 7.681643291108517e-07, | |
| "loss": 0.0, | |
| "num_tokens": 9303682.0, | |
| "reward": 0.8954258561134338, | |
| "reward_std": 0.4955286383628845, | |
| "rewards/cosine_scaled_reward/mean": 0.05708790570497513, | |
| "rewards/cosine_scaled_reward/std": 0.548876941204071, | |
| "rewards/format_reward/mean": 0.78125, | |
| "rewards/format_reward/std": 0.4166666865348816, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1823.0, | |
| "completions/mean_length": 1342.71875, | |
| "completions/mean_terminated_length": 973.2857055664062, | |
| "completions/min_length": 66.0, | |
| "completions/min_terminated_length": 66.0, | |
| "epoch": 0.09485714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5289212465286255, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": -0.0, | |
| "num_tokens": 9400704.0, | |
| "reward": 0.5462230443954468, | |
| "reward_std": 0.6437035799026489, | |
| "rewards/cosine_scaled_reward/mean": -0.05501346290111542, | |
| "rewards/cosine_scaled_reward/std": 0.3811412453651428, | |
| "rewards/format_reward/mean": 0.65625, | |
| "rewards/format_reward/std": 0.4787135720252991, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1603.0, | |
| "completions/mean_length": 1094.421875, | |
| "completions/mean_terminated_length": 874.3654174804688, | |
| "completions/min_length": 77.0, | |
| "completions/min_terminated_length": 77.0, | |
| "epoch": 0.096, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3851650059223175, | |
| "learning_rate": 7.54295724882796e-07, | |
| "loss": -0.0, | |
| "num_tokens": 9481443.0, | |
| "reward": 0.42046594619750977, | |
| "reward_std": 0.5146702527999878, | |
| "rewards/cosine_scaled_reward/mean": -0.18820451200008392, | |
| "rewards/cosine_scaled_reward/std": 0.3148095905780792, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1912.0, | |
| "completions/mean_length": 1293.734375, | |
| "completions/mean_terminated_length": 1101.4705810546875, | |
| "completions/min_length": 634.0, | |
| "completions/min_terminated_length": 634.0, | |
| "epoch": 0.09714285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.21673259139060974, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": -0.0, | |
| "num_tokens": 9574562.0, | |
| "reward": 0.5379458665847778, | |
| "reward_std": 0.5725549459457397, | |
| "rewards/cosine_scaled_reward/mean": -0.1529020518064499, | |
| "rewards/cosine_scaled_reward/std": 0.34331607818603516, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1936.0, | |
| "completions/mean_length": 1286.953125, | |
| "completions/mean_terminated_length": 1111.326904296875, | |
| "completions/min_length": 457.0, | |
| "completions/min_terminated_length": 457.0, | |
| "epoch": 0.09828571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.29995283484458923, | |
| "learning_rate": 7.401782177833147e-07, | |
| "loss": -0.0, | |
| "num_tokens": 9667783.0, | |
| "reward": 0.7049737572669983, | |
| "reward_std": 0.5882902145385742, | |
| "rewards/cosine_scaled_reward/mean": -0.06938813626766205, | |
| "rewards/cosine_scaled_reward/std": 0.35692107677459717, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1650.0, | |
| "completions/mean_length": 953.484375, | |
| "completions/mean_terminated_length": 797.1250610351562, | |
| "completions/min_length": 39.0, | |
| "completions/min_terminated_length": 39.0, | |
| "epoch": 0.09942857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.8409918546676636, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": -0.0, | |
| "num_tokens": 9738926.0, | |
| "reward": 0.5864202976226807, | |
| "reward_std": 0.5318285822868347, | |
| "rewards/cosine_scaled_reward/mean": -0.13647735118865967, | |
| "rewards/cosine_scaled_reward/std": 0.3167019188404083, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1729.0, | |
| "completions/max_terminated_length": 1729.0, | |
| "completions/mean_length": 886.546875, | |
| "completions/mean_terminated_length": 886.546875, | |
| "completions/min_length": 126.0, | |
| "completions/min_terminated_length": 126.0, | |
| "epoch": 0.10057142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.35503509640693665, | |
| "learning_rate": 7.258290078201731e-07, | |
| "loss": -0.0, | |
| "num_tokens": 9806177.0, | |
| "reward": 1.2680045366287231, | |
| "reward_std": 0.6174743175506592, | |
| "rewards/cosine_scaled_reward/mean": 0.14181479811668396, | |
| "rewards/cosine_scaled_reward/std": 0.5076755881309509, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2004.0, | |
| "completions/mean_length": 1268.875, | |
| "completions/mean_terminated_length": 1124.5926513671875, | |
| "completions/min_length": 496.0, | |
| "completions/min_terminated_length": 496.0, | |
| "epoch": 0.10171428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2577503025531769, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0, | |
| "num_tokens": 9898385.0, | |
| "reward": 0.7599090337753296, | |
| "reward_std": 0.8181240558624268, | |
| "rewards/cosine_scaled_reward/mean": -0.073170505464077, | |
| "rewards/cosine_scaled_reward/std": 0.43178030848503113, | |
| "rewards/format_reward/mean": 0.90625, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1870.0, | |
| "completions/mean_length": 1033.5, | |
| "completions/mean_terminated_length": 888.5714721679688, | |
| "completions/min_length": 160.0, | |
| "completions/min_terminated_length": 160.0, | |
| "epoch": 0.10285714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.39826640486717224, | |
| "learning_rate": 7.11265577295385e-07, | |
| "loss": -0.0, | |
| "num_tokens": 9974529.0, | |
| "reward": 0.4363415837287903, | |
| "reward_std": 0.48137861490249634, | |
| "rewards/cosine_scaled_reward/mean": -0.21932920813560486, | |
| "rewards/cosine_scaled_reward/std": 0.31020957231521606, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.28125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1911.0, | |
| "completions/mean_length": 1406.75, | |
| "completions/mean_terminated_length": 1155.8260498046875, | |
| "completions/min_length": 288.0, | |
| "completions/min_terminated_length": 288.0, | |
| "epoch": 0.104, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2828798294067383, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0, | |
| "num_tokens": 10075129.0, | |
| "reward": 0.6231826543807983, | |
| "reward_std": 0.8636409044265747, | |
| "rewards/cosine_scaled_reward/mean": -0.06340868771076202, | |
| "rewards/cosine_scaled_reward/std": 0.49163660407066345, | |
| "rewards/format_reward/mean": 0.75, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1904.0, | |
| "completions/mean_length": 1240.375, | |
| "completions/mean_terminated_length": 1054.0, | |
| "completions/min_length": 294.0, | |
| "completions/min_terminated_length": 294.0, | |
| "epoch": 0.10514285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28899839520454407, | |
| "learning_rate": 6.965056695057204e-07, | |
| "loss": -0.0, | |
| "num_tokens": 10164505.0, | |
| "reward": 0.5402791500091553, | |
| "reward_std": 0.6010072827339172, | |
| "rewards/cosine_scaled_reward/mean": -0.18298542499542236, | |
| "rewards/cosine_scaled_reward/std": 0.3275497853755951, | |
| "rewards/format_reward/mean": 0.90625, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.328125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2002.0, | |
| "completions/mean_length": 1452.328125, | |
| "completions/mean_terminated_length": 1161.4185791015625, | |
| "completions/min_length": 488.0, | |
| "completions/min_terminated_length": 488.0, | |
| "epoch": 0.10628571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.29217013716697693, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0, | |
| "num_tokens": 10268798.0, | |
| "reward": 0.3901035785675049, | |
| "reward_std": 0.5545454025268555, | |
| "rewards/cosine_scaled_reward/mean": -0.20338571071624756, | |
| "rewards/cosine_scaled_reward/std": 0.30444955825805664, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.296875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1862.0, | |
| "completions/mean_length": 1260.515625, | |
| "completions/mean_terminated_length": 928.022216796875, | |
| "completions/min_length": 432.0, | |
| "completions/min_terminated_length": 432.0, | |
| "epoch": 0.10742857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3268873393535614, | |
| "learning_rate": 6.815672671252315e-07, | |
| "loss": 0.0, | |
| "num_tokens": 10359239.0, | |
| "reward": 0.7386243343353271, | |
| "reward_std": 0.7403403520584106, | |
| "rewards/cosine_scaled_reward/mean": 0.017749670892953873, | |
| "rewards/cosine_scaled_reward/std": 0.48611870408058167, | |
| "rewards/format_reward/mean": 0.703125, | |
| "rewards/format_reward/std": 0.4604927599430084, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1586.0, | |
| "completions/mean_length": 1262.921875, | |
| "completions/mean_terminated_length": 1099.981201171875, | |
| "completions/min_length": 494.0, | |
| "completions/min_terminated_length": 494.0, | |
| "epoch": 0.10857142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2648283839225769, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": -0.0, | |
| "num_tokens": 10450594.0, | |
| "reward": 0.2988126277923584, | |
| "reward_std": 0.47151660919189453, | |
| "rewards/cosine_scaled_reward/mean": -0.2646561861038208, | |
| "rewards/cosine_scaled_reward/std": 0.2507747411727905, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1733.0, | |
| "completions/mean_length": 1106.09375, | |
| "completions/mean_terminated_length": 931.6666870117188, | |
| "completions/min_length": 352.0, | |
| "completions/min_terminated_length": 352.0, | |
| "epoch": 0.10971428571428571, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.29469895362854004, | |
| "learning_rate": 6.664685702961344e-07, | |
| "loss": -0.0, | |
| "num_tokens": 10532520.0, | |
| "reward": 0.810766339302063, | |
| "reward_std": 0.48911383748054504, | |
| "rewards/cosine_scaled_reward/mean": -0.02430431731045246, | |
| "rewards/cosine_scaled_reward/std": 0.47827479243278503, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2032.0, | |
| "completions/mean_length": 1183.0625, | |
| "completions/mean_terminated_length": 1041.5272216796875, | |
| "completions/min_length": 414.0, | |
| "completions/min_terminated_length": 414.0, | |
| "epoch": 0.11085714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2775411605834961, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": -0.0, | |
| "num_tokens": 10618500.0, | |
| "reward": 0.711927056312561, | |
| "reward_std": 0.7540339231491089, | |
| "rewards/cosine_scaled_reward/mean": -0.08934895694255829, | |
| "rewards/cosine_scaled_reward/std": 0.39428383111953735, | |
| "rewards/format_reward/mean": 0.890625, | |
| "rewards/format_reward/std": 0.3145764470100403, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1922.0, | |
| "completions/mean_length": 1130.078125, | |
| "completions/mean_terminated_length": 998.9464721679688, | |
| "completions/min_length": 335.0, | |
| "completions/min_terminated_length": 335.0, | |
| "epoch": 0.112, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2643944323062897, | |
| "learning_rate": 6.512279744547392e-07, | |
| "loss": -0.0, | |
| "num_tokens": 10701889.0, | |
| "reward": 0.9012003540992737, | |
| "reward_std": 0.6251660585403442, | |
| "rewards/cosine_scaled_reward/mean": -0.0025248080492019653, | |
| "rewards/cosine_scaled_reward/std": 0.48845264315605164, | |
| "rewards/format_reward/mean": 0.90625, | |
| "rewards/format_reward/std": 0.29378482699394226, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2047.0, | |
| "completions/mean_length": 1079.828125, | |
| "completions/mean_terminated_length": 941.5178833007812, | |
| "completions/min_length": 324.0, | |
| "completions/min_terminated_length": 324.0, | |
| "epoch": 0.11314285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.31110212206840515, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": -0.0, | |
| "num_tokens": 10782166.0, | |
| "reward": 0.9453647136688232, | |
| "reward_std": 0.5895795822143555, | |
| "rewards/cosine_scaled_reward/mean": -0.011692702770233154, | |
| "rewards/cosine_scaled_reward/std": 0.49524030089378357, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1817.0, | |
| "completions/mean_length": 1099.609375, | |
| "completions/mean_terminated_length": 964.1250610351562, | |
| "completions/min_length": 394.0, | |
| "completions/min_terminated_length": 394.0, | |
| "epoch": 0.11428571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28003421425819397, | |
| "learning_rate": 6.358640479194451e-07, | |
| "loss": 0.0, | |
| "num_tokens": 10862253.0, | |
| "reward": 1.0164594650268555, | |
| "reward_std": 0.8200770616531372, | |
| "rewards/cosine_scaled_reward/mean": 0.047292180359363556, | |
| "rewards/cosine_scaled_reward/std": 0.5198192000389099, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1940.0, | |
| "completions/mean_length": 1095.8125, | |
| "completions/mean_terminated_length": 1048.9835205078125, | |
| "completions/min_length": 518.0, | |
| "completions/min_terminated_length": 518.0, | |
| "epoch": 0.11542857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.31470584869384766, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": -0.0, | |
| "num_tokens": 10943817.0, | |
| "reward": 0.7535556554794312, | |
| "reward_std": 0.5378469228744507, | |
| "rewards/cosine_scaled_reward/mean": -0.11540969461202621, | |
| "rewards/cosine_scaled_reward/std": 0.37915751338005066, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1952.0, | |
| "completions/mean_length": 1071.109375, | |
| "completions/mean_terminated_length": 931.5535888671875, | |
| "completions/min_length": 415.0, | |
| "completions/min_terminated_length": 415.0, | |
| "epoch": 0.11657142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3006651699542999, | |
| "learning_rate": 6.203955092681039e-07, | |
| "loss": 0.0, | |
| "num_tokens": 11022520.0, | |
| "reward": 0.6289626359939575, | |
| "reward_std": 0.5818617939949036, | |
| "rewards/cosine_scaled_reward/mean": -0.12301868200302124, | |
| "rewards/cosine_scaled_reward/std": 0.3668438792228699, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2021.0, | |
| "completions/mean_length": 1084.203125, | |
| "completions/mean_terminated_length": 965.8421020507812, | |
| "completions/min_length": 222.0, | |
| "completions/min_terminated_length": 222.0, | |
| "epoch": 0.11771428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.32252946496009827, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": -0.0, | |
| "num_tokens": 11102261.0, | |
| "reward": 0.6010515689849854, | |
| "reward_std": 0.7124715447425842, | |
| "rewards/cosine_scaled_reward/mean": -0.1682242453098297, | |
| "rewards/cosine_scaled_reward/std": 0.34618714451789856, | |
| "rewards/format_reward/mean": 0.9375, | |
| "rewards/format_reward/std": 0.24397502839565277, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1817.0, | |
| "completions/mean_length": 1172.25, | |
| "completions/mean_terminated_length": 990.4906005859375, | |
| "completions/min_length": 337.0, | |
| "completions/min_terminated_length": 337.0, | |
| "epoch": 0.11885714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.30141475796699524, | |
| "learning_rate": 6.048412045323164e-07, | |
| "loss": 0.0, | |
| "num_tokens": 11187749.0, | |
| "reward": 0.5190969705581665, | |
| "reward_std": 0.4445875287055969, | |
| "rewards/cosine_scaled_reward/mean": -0.16232651472091675, | |
| "rewards/cosine_scaled_reward/std": 0.2870725095272064, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2006.0, | |
| "completions/mean_length": 1155.9375, | |
| "completions/mean_terminated_length": 882.8571166992188, | |
| "completions/min_length": 492.0, | |
| "completions/min_terminated_length": 492.0, | |
| "epoch": 0.12, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3128022253513336, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0, | |
| "num_tokens": 11273017.0, | |
| "reward": 0.844305157661438, | |
| "reward_std": 0.8654354810714722, | |
| "rewards/cosine_scaled_reward/mean": 0.0080900639295578, | |
| "rewards/cosine_scaled_reward/std": 0.5000066757202148, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1920.0, | |
| "completions/mean_length": 982.828125, | |
| "completions/mean_terminated_length": 911.8167114257812, | |
| "completions/min_length": 271.0, | |
| "completions/min_terminated_length": 271.0, | |
| "epoch": 0.12114285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.287850946187973, | |
| "learning_rate": 5.892200842364462e-07, | |
| "loss": -0.0, | |
| "num_tokens": 11346750.0, | |
| "reward": 0.9764542579650879, | |
| "reward_std": 0.523072361946106, | |
| "rewards/cosine_scaled_reward/mean": 0.019477128982543945, | |
| "rewards/cosine_scaled_reward/std": 0.44723302125930786, | |
| "rewards/format_reward/mean": 0.9375, | |
| "rewards/format_reward/std": 0.24397502839565277, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1898.0, | |
| "completions/mean_length": 1099.75, | |
| "completions/mean_terminated_length": 944.581787109375, | |
| "completions/min_length": 356.0, | |
| "completions/min_terminated_length": 356.0, | |
| "epoch": 0.12228571428571429, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2976638078689575, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": -0.0, | |
| "num_tokens": 11428286.0, | |
| "reward": 0.727447509765625, | |
| "reward_std": 0.630887508392334, | |
| "rewards/cosine_scaled_reward/mean": -0.0737762451171875, | |
| "rewards/cosine_scaled_reward/std": 0.4439302980899811, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1688.0, | |
| "completions/mean_length": 1213.25, | |
| "completions/mean_terminated_length": 979.5199584960938, | |
| "completions/min_length": 472.0, | |
| "completions/min_terminated_length": 472.0, | |
| "epoch": 0.12342857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2800474762916565, | |
| "learning_rate": 5.735511803093248e-07, | |
| "loss": -0.0, | |
| "num_tokens": 11516294.0, | |
| "reward": 0.5832531452178955, | |
| "reward_std": 0.73647540807724, | |
| "rewards/cosine_scaled_reward/mean": -0.11462344229221344, | |
| "rewards/cosine_scaled_reward/std": 0.37341246008872986, | |
| "rewards/format_reward/mean": 0.8125, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1828.0, | |
| "completions/mean_length": 1253.625, | |
| "completions/mean_terminated_length": 1010.448974609375, | |
| "completions/min_length": 316.0, | |
| "completions/min_terminated_length": 316.0, | |
| "epoch": 0.12457142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2575153708457947, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0, | |
| "num_tokens": 11607774.0, | |
| "reward": 0.5941106677055359, | |
| "reward_std": 0.6006972789764404, | |
| "rewards/cosine_scaled_reward/mean": -0.10138219594955444, | |
| "rewards/cosine_scaled_reward/std": 0.4213758409023285, | |
| "rewards/format_reward/mean": 0.796875, | |
| "rewards/format_reward/std": 0.40550529956817627, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2005.0, | |
| "completions/mean_length": 1212.171875, | |
| "completions/mean_terminated_length": 978.1399536132812, | |
| "completions/min_length": 128.0, | |
| "completions/min_terminated_length": 128.0, | |
| "epoch": 0.12571428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3692750930786133, | |
| "learning_rate": 5.578535828967777e-07, | |
| "loss": 0.0, | |
| "num_tokens": 11696129.0, | |
| "reward": 0.5244784355163574, | |
| "reward_std": 0.5243270397186279, | |
| "rewards/cosine_scaled_reward/mean": -0.15963581204414368, | |
| "rewards/cosine_scaled_reward/std": 0.31791090965270996, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1957.0, | |
| "completions/mean_length": 925.640625, | |
| "completions/mean_terminated_length": 907.825439453125, | |
| "completions/min_length": 422.0, | |
| "completions/min_terminated_length": 422.0, | |
| "epoch": 0.12685714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2943163514137268, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0, | |
| "num_tokens": 11765490.0, | |
| "reward": 1.0681891441345215, | |
| "reward_std": 0.666343092918396, | |
| "rewards/cosine_scaled_reward/mean": 0.041907064616680145, | |
| "rewards/cosine_scaled_reward/std": 0.4317578375339508, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.234375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2033.0, | |
| "completions/mean_length": 1297.265625, | |
| "completions/mean_terminated_length": 1067.448974609375, | |
| "completions/min_length": 536.0, | |
| "completions/min_terminated_length": 536.0, | |
| "epoch": 0.128, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2623840868473053, | |
| "learning_rate": 5.421464171032224e-07, | |
| "loss": -0.0, | |
| "num_tokens": 11859611.0, | |
| "reward": 0.5628055930137634, | |
| "reward_std": 0.664225697517395, | |
| "rewards/cosine_scaled_reward/mean": -0.09359719604253769, | |
| "rewards/cosine_scaled_reward/std": 0.4235653281211853, | |
| "rewards/format_reward/mean": 0.75, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1643.0, | |
| "completions/mean_length": 905.21875, | |
| "completions/mean_terminated_length": 887.0794067382812, | |
| "completions/min_length": 276.0, | |
| "completions/min_terminated_length": 276.0, | |
| "epoch": 0.12914285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3688279390335083, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0, | |
| "num_tokens": 11927841.0, | |
| "reward": 1.0120556354522705, | |
| "reward_std": 0.7051924467086792, | |
| "rewards/cosine_scaled_reward/mean": 0.006027787923812866, | |
| "rewards/cosine_scaled_reward/std": 0.4939332902431488, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2006.0, | |
| "completions/mean_length": 984.96875, | |
| "completions/mean_terminated_length": 914.1000366210938, | |
| "completions/min_length": 267.0, | |
| "completions/min_terminated_length": 267.0, | |
| "epoch": 0.13028571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27232953906059265, | |
| "learning_rate": 5.264488196906752e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12000727.0, | |
| "reward": 0.5494575500488281, | |
| "reward_std": 0.5694750547409058, | |
| "rewards/cosine_scaled_reward/mean": -0.21745873987674713, | |
| "rewards/cosine_scaled_reward/std": 0.3295048475265503, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2002.0, | |
| "completions/mean_length": 1129.015625, | |
| "completions/mean_terminated_length": 958.8333129882812, | |
| "completions/min_length": 311.0, | |
| "completions/min_terminated_length": 311.0, | |
| "epoch": 0.13142857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.31655776500701904, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12084200.0, | |
| "reward": 0.6148514747619629, | |
| "reward_std": 0.592422604560852, | |
| "rewards/cosine_scaled_reward/mean": -0.12226178497076035, | |
| "rewards/cosine_scaled_reward/std": 0.44683361053466797, | |
| "rewards/format_reward/mean": 0.859375, | |
| "rewards/format_reward/std": 0.3503824472427368, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1829.0, | |
| "completions/mean_length": 1261.96875, | |
| "completions/mean_terminated_length": 1098.8302001953125, | |
| "completions/min_length": 344.0, | |
| "completions/min_terminated_length": 344.0, | |
| "epoch": 0.13257142857142856, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2751889228820801, | |
| "learning_rate": 5.107799157635538e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12176350.0, | |
| "reward": 0.7403342723846436, | |
| "reward_std": 0.6721117496490479, | |
| "rewards/cosine_scaled_reward/mean": -0.05170784145593643, | |
| "rewards/cosine_scaled_reward/std": 0.43005797266960144, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1906.0, | |
| "completions/mean_length": 1156.421875, | |
| "completions/mean_terminated_length": 1080.8643798828125, | |
| "completions/min_length": 510.0, | |
| "completions/min_terminated_length": 510.0, | |
| "epoch": 0.1337142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2688439190387726, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12261385.0, | |
| "reward": 0.8138295412063599, | |
| "reward_std": 0.5513401031494141, | |
| "rewards/cosine_scaled_reward/mean": -0.09308521449565887, | |
| "rewards/cosine_scaled_reward/std": 0.3840063810348511, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.21875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1838.0, | |
| "completions/mean_length": 1200.328125, | |
| "completions/mean_terminated_length": 962.97998046875, | |
| "completions/min_length": 460.0, | |
| "completions/min_terminated_length": 460.0, | |
| "epoch": 0.13485714285714287, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27440333366394043, | |
| "learning_rate": 4.951587954676837e-07, | |
| "loss": -0.0, | |
| "num_tokens": 12348742.0, | |
| "reward": 0.7029136419296265, | |
| "reward_std": 0.819955587387085, | |
| "rewards/cosine_scaled_reward/mean": -0.05479319393634796, | |
| "rewards/cosine_scaled_reward/std": 0.4745423495769501, | |
| "rewards/format_reward/mean": 0.8125, | |
| "rewards/format_reward/std": 0.39339789748191833, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1933.0, | |
| "completions/mean_length": 1073.125, | |
| "completions/mean_terminated_length": 933.857177734375, | |
| "completions/min_length": 276.0, | |
| "completions/min_terminated_length": 276.0, | |
| "epoch": 0.136, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.4175036549568176, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": -0.0, | |
| "num_tokens": 12427830.0, | |
| "reward": 0.9872255325317383, | |
| "reward_std": 0.5814859867095947, | |
| "rewards/cosine_scaled_reward/mean": 0.056112758815288544, | |
| "rewards/cosine_scaled_reward/std": 0.5062689781188965, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1674.0, | |
| "completions/mean_length": 865.609375, | |
| "completions/mean_terminated_length": 827.4677124023438, | |
| "completions/min_length": 280.0, | |
| "completions/min_terminated_length": 280.0, | |
| "epoch": 0.13714285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3479575216770172, | |
| "learning_rate": 4.79604490731896e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12493685.0, | |
| "reward": 0.8312849998474121, | |
| "reward_std": 0.6629652976989746, | |
| "rewards/cosine_scaled_reward/mean": -0.07654500752687454, | |
| "rewards/cosine_scaled_reward/std": 0.4900154173374176, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1877.0, | |
| "completions/mean_length": 754.1875, | |
| "completions/mean_terminated_length": 733.6508178710938, | |
| "completions/min_length": 315.0, | |
| "completions/min_terminated_length": 315.0, | |
| "epoch": 0.1382857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3520006239414215, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12551649.0, | |
| "reward": 1.1358025074005127, | |
| "reward_std": 0.6475541591644287, | |
| "rewards/cosine_scaled_reward/mean": 0.06790120899677277, | |
| "rewards/cosine_scaled_reward/std": 0.5241734385490417, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1933.0, | |
| "completions/mean_length": 1191.53125, | |
| "completions/mean_terminated_length": 1069.1785888671875, | |
| "completions/min_length": 339.0, | |
| "completions/min_terminated_length": 339.0, | |
| "epoch": 0.13942857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.29668495059013367, | |
| "learning_rate": 4.641359520805548e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12639467.0, | |
| "reward": 0.9448899030685425, | |
| "reward_std": 0.731184184551239, | |
| "rewards/cosine_scaled_reward/mean": -0.01193002238869667, | |
| "rewards/cosine_scaled_reward/std": 0.4401930868625641, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2002.0, | |
| "completions/mean_length": 1111.390625, | |
| "completions/mean_terminated_length": 996.368408203125, | |
| "completions/min_length": 356.0, | |
| "completions/min_terminated_length": 356.0, | |
| "epoch": 0.14057142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.33428606390953064, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12720828.0, | |
| "reward": 0.605126678943634, | |
| "reward_std": 0.5388374328613281, | |
| "rewards/cosine_scaled_reward/mean": -0.16618669033050537, | |
| "rewards/cosine_scaled_reward/std": 0.36216598749160767, | |
| "rewards/format_reward/mean": 0.9375, | |
| "rewards/format_reward/std": 0.24397502839565277, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1889.0, | |
| "completions/mean_length": 1088.375, | |
| "completions/mean_terminated_length": 1007.0508422851562, | |
| "completions/min_length": 395.0, | |
| "completions/min_terminated_length": 395.0, | |
| "epoch": 0.1417142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27362972497940063, | |
| "learning_rate": 4.4877202554526084e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12801436.0, | |
| "reward": 0.8875737190246582, | |
| "reward_std": 0.7504779100418091, | |
| "rewards/cosine_scaled_reward/mean": -0.04058811068534851, | |
| "rewards/cosine_scaled_reward/std": 0.41639918088912964, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1941.0, | |
| "completions/mean_length": 1045.59375, | |
| "completions/mean_terminated_length": 941.8965454101562, | |
| "completions/min_length": 445.0, | |
| "completions/min_terminated_length": 445.0, | |
| "epoch": 0.14285714285714285, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26206985116004944, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12879282.0, | |
| "reward": 0.9140812754631042, | |
| "reward_std": 0.5521372556686401, | |
| "rewards/cosine_scaled_reward/mean": -0.011709354817867279, | |
| "rewards/cosine_scaled_reward/std": 0.478300005197525, | |
| "rewards/format_reward/mean": 0.9375, | |
| "rewards/format_reward/std": 0.24397502839565277, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1726.0, | |
| "completions/mean_length": 1053.3125, | |
| "completions/mean_terminated_length": 950.413818359375, | |
| "completions/min_length": 555.0, | |
| "completions/min_terminated_length": 555.0, | |
| "epoch": 0.144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2631860077381134, | |
| "learning_rate": 4.3353142970386557e-07, | |
| "loss": 0.0, | |
| "num_tokens": 12957990.0, | |
| "reward": 0.9084943532943726, | |
| "reward_std": 0.7618498802185059, | |
| "rewards/cosine_scaled_reward/mean": -0.006690334528684616, | |
| "rewards/cosine_scaled_reward/std": 0.4652135372161865, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.09375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2040.0, | |
| "completions/mean_length": 1078.5625, | |
| "completions/mean_terminated_length": 978.27587890625, | |
| "completions/min_length": 286.0, | |
| "completions/min_terminated_length": 286.0, | |
| "epoch": 0.14514285714285713, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.32610929012298584, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13038114.0, | |
| "reward": 0.6247783303260803, | |
| "reward_std": 0.6168485879898071, | |
| "rewards/cosine_scaled_reward/mean": -0.16417336463928223, | |
| "rewards/cosine_scaled_reward/std": 0.3348706066608429, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1959.0, | |
| "completions/mean_length": 1018.53125, | |
| "completions/mean_terminated_length": 949.9000244140625, | |
| "completions/min_length": 392.0, | |
| "completions/min_terminated_length": 392.0, | |
| "epoch": 0.1462857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2964915633201599, | |
| "learning_rate": 4.1843273287476854e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13113652.0, | |
| "reward": 1.0467863082885742, | |
| "reward_std": 0.696172833442688, | |
| "rewards/cosine_scaled_reward/mean": 0.031205661594867706, | |
| "rewards/cosine_scaled_reward/std": 0.5108028650283813, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.203125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1711.0, | |
| "completions/mean_length": 1295.765625, | |
| "completions/mean_terminated_length": 1104.0196533203125, | |
| "completions/min_length": 493.0, | |
| "completions/min_terminated_length": 493.0, | |
| "epoch": 0.14742857142857144, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24947936832904816, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": -0.0, | |
| "num_tokens": 13207285.0, | |
| "reward": 0.6936242580413818, | |
| "reward_std": 0.7887886762619019, | |
| "rewards/cosine_scaled_reward/mean": -0.07506285607814789, | |
| "rewards/cosine_scaled_reward/std": 0.42623400688171387, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1796.0, | |
| "completions/mean_length": 1126.21875, | |
| "completions/mean_terminated_length": 913.5000610351562, | |
| "completions/min_length": 359.0, | |
| "completions/min_terminated_length": 359.0, | |
| "epoch": 0.14857142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2999359369277954, | |
| "learning_rate": 4.034943304942796e-07, | |
| "loss": -0.0, | |
| "num_tokens": 13289867.0, | |
| "reward": 0.43217456340789795, | |
| "reward_std": 0.5134756565093994, | |
| "rewards/cosine_scaled_reward/mean": -0.19797520339488983, | |
| "rewards/cosine_scaled_reward/std": 0.26216205954551697, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1773.0, | |
| "completions/mean_length": 1085.21875, | |
| "completions/mean_terminated_length": 885.396240234375, | |
| "completions/min_length": 315.0, | |
| "completions/min_terminated_length": 315.0, | |
| "epoch": 0.14971428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.336990088224411, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13369057.0, | |
| "reward": 0.9999659061431885, | |
| "reward_std": 0.5346121788024902, | |
| "rewards/cosine_scaled_reward/mean": 0.06248297542333603, | |
| "rewards/cosine_scaled_reward/std": 0.48766252398490906, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.3333333432674408, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.390625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1847.0, | |
| "completions/mean_length": 1286.34375, | |
| "completions/mean_terminated_length": 798.1026000976562, | |
| "completions/min_length": 356.0, | |
| "completions/min_terminated_length": 356.0, | |
| "epoch": 0.15085714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.29184556007385254, | |
| "learning_rate": 3.8873442270461485e-07, | |
| "loss": -0.0, | |
| "num_tokens": 13462447.0, | |
| "reward": 0.6418495178222656, | |
| "reward_std": 0.674288809299469, | |
| "rewards/cosine_scaled_reward/mean": 0.008424755185842514, | |
| "rewards/cosine_scaled_reward/std": 0.4481044411659241, | |
| "rewards/format_reward/mean": 0.625, | |
| "rewards/format_reward/std": 0.48795005679130554, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1703.0, | |
| "completions/mean_length": 1147.875, | |
| "completions/mean_terminated_length": 940.1538696289062, | |
| "completions/min_length": 462.0, | |
| "completions/min_terminated_length": 462.0, | |
| "epoch": 0.152, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2999313175678253, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13547135.0, | |
| "reward": 0.5941276550292969, | |
| "reward_std": 0.6214425563812256, | |
| "rewards/cosine_scaled_reward/mean": -0.11699868738651276, | |
| "rewards/cosine_scaled_reward/std": 0.32567545771598816, | |
| "rewards/format_reward/mean": 0.828125, | |
| "rewards/format_reward/std": 0.38025420904159546, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1888.0, | |
| "completions/mean_length": 952.796875, | |
| "completions/mean_terminated_length": 935.4127807617188, | |
| "completions/min_length": 314.0, | |
| "completions/min_terminated_length": 314.0, | |
| "epoch": 0.15314285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3382585942745209, | |
| "learning_rate": 3.7417099217982686e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13618682.0, | |
| "reward": 1.1113653182983398, | |
| "reward_std": 0.6422195434570312, | |
| "rewards/cosine_scaled_reward/mean": 0.07130765169858932, | |
| "rewards/cosine_scaled_reward/std": 0.5419203042984009, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1522.0, | |
| "completions/mean_length": 749.625, | |
| "completions/mean_terminated_length": 685.7704467773438, | |
| "completions/min_length": 113.0, | |
| "completions/min_terminated_length": 113.0, | |
| "epoch": 0.15428571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.45106765627861023, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13677130.0, | |
| "reward": 1.3096649646759033, | |
| "reward_std": 0.8325010538101196, | |
| "rewards/cosine_scaled_reward/mean": 0.17826992273330688, | |
| "rewards/cosine_scaled_reward/std": 0.49473828077316284, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1694.0, | |
| "completions/mean_length": 1074.609375, | |
| "completions/mean_terminated_length": 992.11865234375, | |
| "completions/min_length": 432.0, | |
| "completions/min_terminated_length": 432.0, | |
| "epoch": 0.15542857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3076488673686981, | |
| "learning_rate": 3.5982178221668533e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13756577.0, | |
| "reward": 1.0143537521362305, | |
| "reward_std": 0.645289957523346, | |
| "rewards/cosine_scaled_reward/mean": 0.01498936116695404, | |
| "rewards/cosine_scaled_reward/std": 0.4602736830711365, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1831.0, | |
| "completions/mean_length": 1153.1875, | |
| "completions/mean_terminated_length": 1077.35595703125, | |
| "completions/min_length": 464.0, | |
| "completions/min_terminated_length": 464.0, | |
| "epoch": 0.15657142857142858, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26375824213027954, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13840989.0, | |
| "reward": 0.7302319407463074, | |
| "reward_std": 0.6595839262008667, | |
| "rewards/cosine_scaled_reward/mean": -0.11144650727510452, | |
| "rewards/cosine_scaled_reward/std": 0.36661940813064575, | |
| "rewards/format_reward/mean": 0.953125, | |
| "rewards/format_reward/std": 0.21304203569889069, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1999.0, | |
| "completions/mean_length": 1057.84375, | |
| "completions/mean_terminated_length": 1025.9031982421875, | |
| "completions/min_length": 318.0, | |
| "completions/min_terminated_length": 318.0, | |
| "epoch": 0.15771428571428572, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.28375834226608276, | |
| "learning_rate": 3.45704275117204e-07, | |
| "loss": 0.0, | |
| "num_tokens": 13920003.0, | |
| "reward": 0.9056442975997925, | |
| "reward_std": 0.5362595319747925, | |
| "rewards/cosine_scaled_reward/mean": -0.047177836298942566, | |
| "rewards/cosine_scaled_reward/std": 0.46028000116348267, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1936.0, | |
| "completions/mean_length": 1131.96875, | |
| "completions/mean_terminated_length": 1001.107177734375, | |
| "completions/min_length": 278.0, | |
| "completions/min_terminated_length": 278.0, | |
| "epoch": 0.15885714285714286, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3277582824230194, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": -0.0, | |
| "num_tokens": 14003089.0, | |
| "reward": 0.6589163541793823, | |
| "reward_std": 0.4049028754234314, | |
| "rewards/cosine_scaled_reward/mean": -0.17054180800914764, | |
| "rewards/cosine_scaled_reward/std": 0.36281341314315796, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2043.0, | |
| "completions/mean_length": 1074.421875, | |
| "completions/mean_terminated_length": 1009.5167236328125, | |
| "completions/min_length": 414.0, | |
| "completions/min_terminated_length": 414.0, | |
| "epoch": 0.16, | |
| "frac_reward_zero_std": 0.125, | |
| "grad_norm": 0.2817178964614868, | |
| "learning_rate": 3.3183567088914833e-07, | |
| "loss": -0.0, | |
| "num_tokens": 14082076.0, | |
| "reward": 0.9182517528533936, | |
| "reward_std": 0.4315429925918579, | |
| "rewards/cosine_scaled_reward/mean": -0.033061616122722626, | |
| "rewards/cosine_scaled_reward/std": 0.483820378780365, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1960.0, | |
| "completions/mean_length": 1011.375, | |
| "completions/mean_terminated_length": 942.2667236328125, | |
| "completions/min_length": 348.0, | |
| "completions/min_terminated_length": 348.0, | |
| "epoch": 0.16114285714285714, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2890068590641022, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": -0.0, | |
| "num_tokens": 14157364.0, | |
| "reward": 0.9973877668380737, | |
| "reward_std": 0.4701315760612488, | |
| "rewards/cosine_scaled_reward/mean": -0.0013061091303825378, | |
| "rewards/cosine_scaled_reward/std": 0.47546684741973877, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.15625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1878.0, | |
| "completions/mean_length": 1245.0625, | |
| "completions/mean_terminated_length": 1096.370361328125, | |
| "completions/min_length": 226.0, | |
| "completions/min_terminated_length": 226.0, | |
| "epoch": 0.16228571428571428, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3200085163116455, | |
| "learning_rate": 3.182328662904756e-07, | |
| "loss": 0.0, | |
| "num_tokens": 14247552.0, | |
| "reward": 0.5833260416984558, | |
| "reward_std": 0.6680535674095154, | |
| "rewards/cosine_scaled_reward/mean": -0.1692744940519333, | |
| "rewards/cosine_scaled_reward/std": 0.3119296729564667, | |
| "rewards/format_reward/mean": 0.921875, | |
| "rewards/format_reward/std": 0.27048972249031067, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1817.0, | |
| "completions/mean_length": 953.15625, | |
| "completions/mean_terminated_length": 935.77783203125, | |
| "completions/min_length": 412.0, | |
| "completions/min_terminated_length": 412.0, | |
| "epoch": 0.16342857142857142, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3159373104572296, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": -0.0, | |
| "num_tokens": 14319370.0, | |
| "reward": 0.7112681269645691, | |
| "reward_std": 0.5509551167488098, | |
| "rewards/cosine_scaled_reward/mean": -0.14436593651771545, | |
| "rewards/cosine_scaled_reward/std": 0.3024492859840393, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1857.0, | |
| "completions/mean_length": 1130.140625, | |
| "completions/mean_terminated_length": 939.6415405273438, | |
| "completions/min_length": 185.0, | |
| "completions/min_terminated_length": 185.0, | |
| "epoch": 0.16457142857142856, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.366451621055603, | |
| "learning_rate": 3.0491243424323783e-07, | |
| "loss": 0.0, | |
| "num_tokens": 14403187.0, | |
| "reward": 0.8458279967308044, | |
| "reward_std": 0.48854437470436096, | |
| "rewards/cosine_scaled_reward/mean": 0.0010389834642410278, | |
| "rewards/cosine_scaled_reward/std": 0.49104803800582886, | |
| "rewards/format_reward/mean": 0.84375, | |
| "rewards/format_reward/std": 0.36596253514289856, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1815.0, | |
| "completions/mean_length": 901.28125, | |
| "completions/mean_terminated_length": 883.0794067382812, | |
| "completions/min_length": 216.0, | |
| "completions/min_terminated_length": 216.0, | |
| "epoch": 0.1657142857142857, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3450138568878174, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": -0.0, | |
| "num_tokens": 14470813.0, | |
| "reward": 1.018430471420288, | |
| "reward_std": 0.7617365717887878, | |
| "rewards/cosine_scaled_reward/mean": 0.009215235710144043, | |
| "rewards/cosine_scaled_reward/std": 0.5040040016174316, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1837.0, | |
| "completions/mean_length": 1048.78125, | |
| "completions/mean_terminated_length": 964.1016845703125, | |
| "completions/min_length": 368.0, | |
| "completions/min_terminated_length": 368.0, | |
| "epoch": 0.16685714285714287, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2851085364818573, | |
| "learning_rate": 2.918906036420294e-07, | |
| "loss": 0.0, | |
| "num_tokens": 14548551.0, | |
| "reward": 0.7197650671005249, | |
| "reward_std": 0.6007095575332642, | |
| "rewards/cosine_scaled_reward/mean": -0.12449245154857635, | |
| "rewards/cosine_scaled_reward/std": 0.3838319182395935, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1901.0, | |
| "completions/mean_length": 1127.25, | |
| "completions/mean_terminated_length": 1065.86669921875, | |
| "completions/min_length": 522.0, | |
| "completions/min_terminated_length": 522.0, | |
| "epoch": 0.168, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2944687306880951, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": -0.0, | |
| "num_tokens": 14631479.0, | |
| "reward": 0.8103795051574707, | |
| "reward_std": 0.4803225100040436, | |
| "rewards/cosine_scaled_reward/mean": -0.08699773252010345, | |
| "rewards/cosine_scaled_reward/std": 0.4309064447879791, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.109375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1860.0, | |
| "completions/mean_length": 985.265625, | |
| "completions/mean_terminated_length": 854.75439453125, | |
| "completions/min_length": 383.0, | |
| "completions/min_terminated_length": 383.0, | |
| "epoch": 0.16914285714285715, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.316753625869751, | |
| "learning_rate": 2.791832395815782e-07, | |
| "loss": -0.0, | |
| "num_tokens": 14705720.0, | |
| "reward": 0.8408422470092773, | |
| "reward_std": 0.47007906436920166, | |
| "rewards/cosine_scaled_reward/mean": -0.06395385414361954, | |
| "rewards/cosine_scaled_reward/std": 0.39589446783065796, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.17536810040473938, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.34375, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 2036.0, | |
| "completions/mean_length": 1468.953125, | |
| "completions/mean_terminated_length": 1165.642822265625, | |
| "completions/min_length": 446.0, | |
| "completions/min_terminated_length": 446.0, | |
| "epoch": 0.1702857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2663600444793701, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0, | |
| "num_tokens": 14811629.0, | |
| "reward": 0.49039748311042786, | |
| "reward_std": 0.7405215501785278, | |
| "rewards/cosine_scaled_reward/mean": -0.12980125844478607, | |
| "rewards/cosine_scaled_reward/std": 0.38436219096183777, | |
| "rewards/format_reward/mean": 0.75, | |
| "rewards/format_reward/std": 0.4364357888698578, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.046875, | |
| "completions/max_length": 2048.0, | |
| "completions/max_terminated_length": 1840.0, | |
| "completions/mean_length": 1005.96875, | |
| "completions/mean_terminated_length": 954.7212524414062, | |
| "completions/min_length": 368.0, | |
| "completions/min_terminated_length": 368.0, | |
| "epoch": 0.17142857142857143, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27496519684791565, | |
| "learning_rate": 2.6680582402757324e-07, | |
| "loss": -0.0, | |
| "num_tokens": 14886515.0, | |
| "reward": 0.8553354740142822, | |
| "reward_std": 0.7562883496284485, | |
| "rewards/cosine_scaled_reward/mean": -0.06451976299285889, | |
| "rewards/cosine_scaled_reward/std": 0.43835195899009705, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.125, | |
| "step": 150 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 200, | |
| "num_input_tokens_seen": 14886515, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |