{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.22857142857142856,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1734.0,
      "completions/mean_length": 1702.03125,
      "completions/mean_terminated_length": 993.6190795898438,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 0.001142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25444135069847107,
      "learning_rate": 0.0,
      "loss": -0.0,
      "num_tokens": 118418.0,
      "reward": 0.17899775505065918,
      "reward_std": 0.7650213241577148,
      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1738.90625,
      "completions/mean_terminated_length": 949.0,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.002285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24364067614078522,
      "learning_rate": 5e-08,
      "loss": -0.0,
      "num_tokens": 239748.0,
      "reward": 0.3848632574081421,
      "reward_std": 0.9111153483390808,
      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1405.0,
      "completions/mean_length": 1930.609375,
      "completions/mean_terminated_length": 974.71435546875,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 0.0034285714285714284,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27834266424179077,
      "learning_rate": 1e-07,
      "loss": -0.0,
      "num_tokens": 373779.0,
      "reward": -0.3227587938308716,
      "reward_std": 0.45940712094306946,
      "rewards/cosine_scaled_reward/mean": -0.2160668969154358,
      "rewards/cosine_scaled_reward/std": 0.21890601515769958,
      "rewards/format_reward/mean": 0.109375,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1596.75,
      "completions/mean_terminated_length": 1116.3870849609375,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 0.004571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2799243628978729,
      "learning_rate": 1.5e-07,
      "loss": -0.0,
      "num_tokens": 485779.0,
      "reward": 0.27003082633018494,
      "reward_std": 0.7608597874641418,
      "rewards/cosine_scaled_reward/mean": -0.11498458683490753,
      "rewards/cosine_scaled_reward/std": 0.36645373702049255,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5039526224136353,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1755.0,
      "completions/mean_length": 2035.46875,
      "completions/mean_terminated_length": 1647.0,
      "completions/min_length": 1539.0,
      "completions/min_terminated_length": 1539.0,
      "epoch": 0.005714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24311119318008423,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 626865.0,
      "reward": -0.4839385151863098,
      "reward_std": 0.34498828649520874,
      "rewards/cosine_scaled_reward/mean": -0.2732192277908325,
      "rewards/cosine_scaled_reward/std": 0.18402352929115295,
      "rewards/format_reward/mean": 0.0625,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1862.0,
      "completions/mean_length": 1884.109375,
      "completions/mean_terminated_length": 882.5555419921875,
      "completions/min_length": 524.0,
      "completions/min_terminated_length": 524.0,
      "epoch": 0.006857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2741600275039673,
      "learning_rate": 2.5e-07,
      "loss": -0.0,
      "num_tokens": 759096.0,
      "reward": -0.2049689143896103,
      "reward_std": 0.639178991317749,
      "rewards/cosine_scaled_reward/mean": -0.18060946464538574,
      "rewards/cosine_scaled_reward/std": 0.2599981129169464,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1795.0,
      "completions/mean_length": 1959.84375,
      "completions/mean_terminated_length": 1342.75,
      "completions/min_length": 974.0,
      "completions/min_terminated_length": 974.0,
      "epoch": 0.008,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21986258029937744,
      "learning_rate": 3e-07,
      "loss": -0.0,
      "num_tokens": 894934.0,
      "reward": -0.11210991442203522,
      "reward_std": 0.6349427103996277,
      "rewards/cosine_scaled_reward/mean": -0.14199243485927582,
      "rewards/cosine_scaled_reward/std": 0.3749195337295532,
      "rewards/format_reward/mean": 0.171875,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 1717.78125,
      "completions/mean_terminated_length": 873.888916015625,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.009142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23102505505084991,
      "learning_rate": 3.5e-07,
      "loss": -0.0,
      "num_tokens": 1015288.0,
      "reward": 0.12653985619544983,
      "reward_std": 0.4742490351200104,
      "rewards/cosine_scaled_reward/mean": -0.09298005700111389,
      "rewards/cosine_scaled_reward/std": 0.39157670736312866,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1815.0,
      "completions/mean_length": 1928.53125,
      "completions/mean_terminated_length": 1352.9091796875,
      "completions/min_length": 999.0,
      "completions/min_terminated_length": 999.0,
      "epoch": 0.010285714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2600339353084564,
      "learning_rate": 4e-07,
      "loss": -0.0,
      "num_tokens": 1150170.0,
      "reward": -0.14216071367263794,
      "reward_std": 0.702994704246521,
      "rewards/cosine_scaled_reward/mean": -0.17264285683631897,
      "rewards/cosine_scaled_reward/std": 0.33145979046821594,
      "rewards/format_reward/mean": 0.203125,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1432.0,
      "completions/mean_length": 1699.84375,
      "completions/mean_terminated_length": 810.1111450195312,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.011428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2749471366405487,
      "learning_rate": 4.5e-07,
      "loss": -0.0,
      "num_tokens": 1269792.0,
      "reward": -0.13922849297523499,
      "reward_std": 0.4937349855899811,
      "rewards/cosine_scaled_reward/mean": -0.2102392464876175,
      "rewards/cosine_scaled_reward/std": 0.30274781584739685,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1614.0,
      "completions/mean_length": 1994.453125,
      "completions/mean_terminated_length": 1191.25,
      "completions/min_length": 916.0,
      "completions/min_terminated_length": 916.0,
      "epoch": 0.012571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2305486500263214,
      "learning_rate": 5e-07,
      "loss": 0.0,
      "num_tokens": 1409109.0,
      "reward": -0.39525067806243896,
      "reward_std": 0.3650783896446228,
      "rewards/cosine_scaled_reward/mean": -0.2288753092288971,
      "rewards/cosine_scaled_reward/std": 0.22182811796665192,
      "rewards/format_reward/mean": 0.0625,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1706.6875,
      "completions/mean_terminated_length": 1174.239990234375,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.013714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2918066382408142,
      "learning_rate": 5.5e-07,
      "loss": 0.0,
      "num_tokens": 1529281.0,
      "reward": 0.08787664026021957,
      "reward_std": 0.7579531073570251,
      "rewards/cosine_scaled_reward/mean": -0.18262416124343872,
      "rewards/cosine_scaled_reward/std": 0.37901216745376587,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1820.828125,
      "completions/mean_terminated_length": 1078.7333984375,
      "completions/min_length": 527.0,
      "completions/min_terminated_length": 527.0,
      "epoch": 0.014857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27849823236465454,
      "learning_rate": 6e-07,
      "loss": -0.0,
      "num_tokens": 1656854.0,
      "reward": 0.03077489137649536,
      "reward_std": 0.6479229927062988,
      "rewards/cosine_scaled_reward/mean": -0.12523755431175232,
      "rewards/cosine_scaled_reward/std": 0.34234777092933655,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1772.296875,
      "completions/mean_terminated_length": 1165.75,
      "completions/min_length": 605.0,
      "completions/min_terminated_length": 605.0,
      "epoch": 0.016,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25037428736686707,
      "learning_rate": 6.5e-07,
      "loss": 0.0,
      "num_tokens": 1780889.0,
      "reward": 0.3261271119117737,
      "reward_std": 0.6276673078536987,
      "rewards/cosine_scaled_reward/mean": -0.008811453357338905,
      "rewards/cosine_scaled_reward/std": 0.46767035126686096,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 1715.5625,
      "completions/mean_terminated_length": 928.2105102539062,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 0.017142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26902371644973755,
      "learning_rate": 7e-07,
      "loss": -0.0,
      "num_tokens": 1901605.0,
      "reward": 0.3007117211818695,
      "reward_std": 0.3918319642543793,
      "rewards/cosine_scaled_reward/mean": -0.005894124507904053,
      "rewards/cosine_scaled_reward/std": 0.4677385091781616,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 757.0,
      "completions/mean_length": 2027.828125,
      "completions/mean_terminated_length": 757.0,
      "completions/min_length": 757.0,
      "completions/min_terminated_length": 757.0,
      "epoch": 0.018285714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25064370036125183,
      "learning_rate": 7.5e-07,
      "loss": -0.0,
      "num_tokens": 2041826.0,
      "reward": -0.499896764755249,
      "reward_std": 0.34189552068710327,
      "rewards/cosine_scaled_reward/mean": -0.2577608823776245,
      "rewards/cosine_scaled_reward/std": 0.18115636706352234,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.125,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 1530.796875,
      "completions/mean_terminated_length": 906.586181640625,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 0.019428571428571427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3018316924571991,
      "learning_rate": 8e-07,
      "loss": -0.0,
      "num_tokens": 2150317.0,
      "reward": 0.23110359907150269,
      "reward_std": 0.6260336637496948,
      "rewards/cosine_scaled_reward/mean": -0.12663568556308746,
      "rewards/cosine_scaled_reward/std": 0.39377179741859436,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1689.0,
      "completions/mean_length": 1813.671875,
      "completions/mean_terminated_length": 894.3846435546875,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 0.02057142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23236438632011414,
      "learning_rate": 8.499999999999999e-07,
      "loss": 0.0,
      "num_tokens": 2276768.0,
      "reward": -0.10029121488332748,
      "reward_std": 0.7172800302505493,
      "rewards/cosine_scaled_reward/mean": -0.18295811116695404,
      "rewards/cosine_scaled_reward/std": 0.3038564622402191,
      "rewards/format_reward/mean": 0.265625,
      "rewards/format_reward/std": 0.44515693187713623,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1697.0,
      "completions/mean_length": 1843.15625,
      "completions/mean_terminated_length": 1111.571533203125,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 0.021714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2313074916601181,
      "learning_rate": 9e-07,
      "loss": -0.0,
      "num_tokens": 2405986.0,
      "reward": 0.09310440719127655,
      "reward_std": 0.7020131349563599,
      "rewards/cosine_scaled_reward/mean": -0.08626029640436172,
      "rewards/cosine_scaled_reward/std": 0.44063708186149597,
      "rewards/format_reward/mean": 0.265625,
      "rewards/format_reward/std": 0.44515693187713623,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1676.0,
      "completions/mean_length": 1523.03125,
      "completions/mean_terminated_length": 803.629638671875,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.022857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.295642226934433,
      "learning_rate": 9.499999999999999e-07,
      "loss": -0.0,
      "num_tokens": 2514812.0,
      "reward": 0.3644811511039734,
      "reward_std": 0.7943294048309326,
      "rewards/cosine_scaled_reward/mean": -0.03650941699743271,
      "rewards/cosine_scaled_reward/std": 0.44610291719436646,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.5,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1793.328125,
      "completions/mean_terminated_length": 1190.157958984375,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 0.024,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2961376905441284,
      "learning_rate": 1e-06,
      "loss": -0.0,
      "num_tokens": 2640393.0,
      "reward": 0.06134350597858429,
      "reward_std": 0.6498202085494995,
      "rewards/cosine_scaled_reward/mean": -0.14120325446128845,
      "rewards/cosine_scaled_reward/std": 0.3548509478569031,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1386.75,
      "completions/mean_terminated_length": 872.4444580078125,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 0.025142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.34918317198753357,
      "learning_rate": 9.99931462820376e-07,
      "loss": 0.0,
      "num_tokens": 2738161.0,
      "reward": 0.5064569711685181,
      "reward_std": 0.7104054689407349,
      "rewards/cosine_scaled_reward/mean": -0.035834040492773056,
      "rewards/cosine_scaled_reward/std": 0.4265843331813812,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1656.59375,
      "completions/mean_terminated_length": 1084.5384521484375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.026285714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26697081327438354,
      "learning_rate": 9.997258721585931e-07,
      "loss": -0.0,
      "num_tokens": 2854975.0,
      "reward": 0.2737857699394226,
      "reward_std": 0.6956006288528442,
      "rewards/cosine_scaled_reward/mean": -0.0896696150302887,
      "rewards/cosine_scaled_reward/std": 0.3913433253765106,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1790.421875,
      "completions/mean_terminated_length": 1223.75,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 0.027428571428571427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24950818717479706,
      "learning_rate": 9.993832906395582e-07,
      "loss": -0.0,
      "num_tokens": 2980490.0,
      "reward": -0.08990197628736496,
      "reward_std": 0.7724581956863403,
      "rewards/cosine_scaled_reward/mean": -0.21682599186897278,
      "rewards/cosine_scaled_reward/std": 0.35711658000946045,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1703.953125,
      "completions/mean_terminated_length": 889.1052856445312,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 0.02857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28078693151474,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.0,
      "num_tokens": 3099839.0,
      "reward": -0.12643180787563324,
      "reward_std": 0.6687923669815063,
      "rewards/cosine_scaled_reward/mean": -0.21946589648723602,
      "rewards/cosine_scaled_reward/std": 0.30431970953941345,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1938.078125,
      "completions/mean_terminated_length": 1608.3125,
      "completions/min_length": 1087.0,
      "completions/min_terminated_length": 1087.0,
      "epoch": 0.029714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21486051380634308,
      "learning_rate": 9.982876141412855e-07,
      "loss": 0.0,
      "num_tokens": 3234508.0,
      "reward": 0.05503671616315842,
      "reward_std": 0.6532000303268433,
      "rewards/cosine_scaled_reward/mean": -0.1287316530942917,
      "rewards/cosine_scaled_reward/std": 0.36068078875541687,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1896.375,
      "completions/mean_terminated_length": 1401.0667724609375,
      "completions/min_length": 568.0,
      "completions/min_terminated_length": 568.0,
      "epoch": 0.030857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2675936222076416,
      "learning_rate": 9.975348529157229e-07,
      "loss": -0.0,
      "num_tokens": 3366164.0,
      "reward": -0.02987336739897728,
      "reward_std": 0.5919089913368225,
      "rewards/cosine_scaled_reward/mean": -0.1633741855621338,
      "rewards/cosine_scaled_reward/std": 0.3508918881416321,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.4604927599430084,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1832.96875,
      "completions/mean_terminated_length": 901.1666870117188,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 0.032,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2518174946308136,
      "learning_rate": 9.96645768238595e-07,
      "loss": 0.0,
      "num_tokens": 3493810.0,
      "reward": 0.08577289432287216,
      "reward_std": 0.6993601322174072,
      "rewards/cosine_scaled_reward/mean": -0.08211354911327362,
      "rewards/cosine_scaled_reward/std": 0.45168522000312805,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1779.0,
      "completions/mean_length": 1921.1875,
      "completions/mean_terminated_length": 1146.2222900390625,
      "completions/min_length": 710.0,
      "completions/min_terminated_length": 710.0,
      "epoch": 0.03314285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25027790665626526,
      "learning_rate": 9.956206309337066e-07,
      "loss": -0.0,
      "num_tokens": 3627238.0,
      "reward": -0.3098237216472626,
      "reward_std": 0.4339829087257385,
      "rewards/cosine_scaled_reward/mean": -0.2330368608236313,
      "rewards/cosine_scaled_reward/std": 0.17332859337329865,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1891.109375,
      "completions/mean_terminated_length": 1490.1666259765625,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.03428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24189673364162445,
      "learning_rate": 9.944597532678119e-07,
      "loss": 0.0,
      "num_tokens": 3758805.0,
      "reward": -0.08874380588531494,
      "reward_std": 0.5923835635185242,
      "rewards/cosine_scaled_reward/mean": -0.18499691784381866,
      "rewards/cosine_scaled_reward/std": 0.27955111861228943,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1467.0,
      "completions/mean_length": 1818.8125,
      "completions/mean_terminated_length": 825.6666870117188,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 0.03542857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24893531203269958,
      "learning_rate": 9.931634888554935e-07,
      "loss": 0.0,
      "num_tokens": 3885705.0,
      "reward": -0.18628405034542084,
      "reward_std": 0.5522075891494751,
      "rewards/cosine_scaled_reward/mean": -0.20251703262329102,
      "rewards/cosine_scaled_reward/std": 0.37591472268104553,
      "rewards/format_reward/mean": 0.21875,
      "rewards/format_reward/std": 0.4166666865348816,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1867.0,
      "completions/mean_length": 1878.140625,
      "completions/mean_terminated_length": 1211.769287109375,
      "completions/min_length": 654.0,
      "completions/min_terminated_length": 654.0,
      "epoch": 0.036571428571428574,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25341352820396423,
      "learning_rate": 9.917322325514487e-07,
      "loss": 0.0,
      "num_tokens": 4016258.0,
      "reward": -0.14861394464969635,
      "reward_std": 0.5451517105102539,
      "rewards/cosine_scaled_reward/mean": -0.19149449467658997,
      "rewards/cosine_scaled_reward/std": 0.3489256203174591,
      "rewards/format_reward/mean": 0.234375,
      "rewards/format_reward/std": 0.42695629596710205,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1969.03125,
      "completions/mean_terminated_length": 1326.0,
      "completions/min_length": 998.0,
      "completions/min_terminated_length": 998.0,
      "epoch": 0.037714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23557375371456146,
      "learning_rate": 9.901664203302124e-07,
      "loss": -0.0,
      "num_tokens": 4153492.0,
      "reward": -0.3634287118911743,
      "reward_std": 0.5462046265602112,
      "rewards/cosine_scaled_reward/mean": -0.25983935594558716,
      "rewards/cosine_scaled_reward/std": 0.3271723687648773,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1568.296875,
      "completions/mean_terminated_length": 1024.6334228515625,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 0.038857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2892495095729828,
      "learning_rate": 9.88466529153356e-07,
      "loss": 0.0,
      "num_tokens": 4263415.0,
      "reward": 0.575156569480896,
      "reward_std": 0.8866004347801208,
      "rewards/cosine_scaled_reward/mean": 0.045390784740448,
      "rewards/cosine_scaled_reward/std": 0.5505619645118713,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1834.0,
      "completions/mean_length": 1830.65625,
      "completions/mean_terminated_length": 657.0,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 0.04,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28274399042129517,
      "learning_rate": 9.866330768241983e-07,
      "loss": -0.0,
      "num_tokens": 4392073.0,
      "reward": -0.1704331934452057,
      "reward_std": 0.7666259407997131,
      "rewards/cosine_scaled_reward/mean": -0.18677911162376404,
      "rewards/cosine_scaled_reward/std": 0.36125659942626953,
      "rewards/format_reward/mean": 0.203125,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1950.671875,
      "completions/mean_terminated_length": 1158.1429443359375,
      "completions/min_length": 669.0,
      "completions/min_terminated_length": 669.0,
      "epoch": 0.04114285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2504905164241791,
      "learning_rate": 9.846666218300807e-07,
      "loss": -0.0,
      "num_tokens": 4528028.0,
      "reward": -0.49544650316238403,
      "reward_std": 0.3493530750274658,
      "rewards/cosine_scaled_reward/mean": -0.302410751581192,
      "rewards/cosine_scaled_reward/std": 0.17342224717140198,
      "rewards/format_reward/mean": 0.109375,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1879.0,
      "completions/mean_length": 1956.546875,
      "completions/mean_terminated_length": 1397.6666259765625,
      "completions/min_length": 789.0,
      "completions/min_terminated_length": 789.0,
      "epoch": 0.04228571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24223695695400238,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.0,
      "num_tokens": 4664271.0,
      "reward": -0.2983526587486267,
      "reward_std": 0.45510220527648926,
      "rewards/cosine_scaled_reward/mean": -0.22730132937431335,
      "rewards/cosine_scaled_reward/std": 0.21532759070396423,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1844.0,
      "completions/mean_length": 1872.0625,
      "completions/mean_terminated_length": 1109.666748046875,
      "completions/min_length": 799.0,
      "completions/min_terminated_length": 799.0,
      "epoch": 0.04342857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22518785297870636,
      "learning_rate": 9.80337140183366e-07,
      "loss": 0.0,
      "num_tokens": 4795795.0,
      "reward": -0.0591111034154892,
      "reward_std": 0.38858330249786377,
      "rewards/cosine_scaled_reward/mean": -0.1311180591583252,
      "rewards/cosine_scaled_reward/std": 0.32316854596138,
      "rewards/format_reward/mean": 0.203125,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1674.8125,
      "completions/mean_terminated_length": 721.1111450195312,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 0.044571428571428574,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26911357045173645,
      "learning_rate": 9.779754323328192e-07,
      "loss": -0.0,
      "num_tokens": 4913767.0,
      "reward": 0.14183415472507477,
      "reward_std": 0.6081592440605164,
      "rewards/cosine_scaled_reward/mean": -0.09314543008804321,
      "rewards/cosine_scaled_reward/std": 0.3410241901874542,
      "rewards/format_reward/mean": 0.328125,
      "rewards/format_reward/std": 0.4732423722743988,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1549.328125,
      "completions/mean_terminated_length": 908.1785888671875,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.045714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2770562767982483,
      "learning_rate": 9.754833590196926e-07,
      "loss": 0.0,
      "num_tokens": 5022996.0,
      "reward": 0.3034515678882599,
      "reward_std": 0.5147567987442017,
      "rewards/cosine_scaled_reward/mean": -0.09827423095703125,
      "rewards/cosine_scaled_reward/std": 0.39993754029273987,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5039526224136353,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1819.0,
      "completions/mean_length": 1768.609375,
      "completions/mean_terminated_length": 1302.9583740234375,
      "completions/min_length": 584.0,
      "completions/min_terminated_length": 584.0,
      "epoch": 0.046857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23544111847877502,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.0,
      "num_tokens": 5147339.0,
      "reward": 0.05204566568136215,
      "reward_std": 0.7308298349380493,
      "rewards/cosine_scaled_reward/mean": -0.18491466343402863,
      "rewards/cosine_scaled_reward/std": 0.3467314541339874,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1662.234375,
      "completions/mean_terminated_length": 748.5789794921875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.048,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3946230709552765,
      "learning_rate": 9.701111919237408e-07,
      "loss": 0.0,
      "num_tokens": 5264082.0,
      "reward": -0.1084136962890625,
      "reward_std": 0.35625624656677246,
      "rewards/cosine_scaled_reward/mean": -0.21045684814453125,
      "rewards/cosine_scaled_reward/std": 0.17068159580230713,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1628.796875,
      "completions/mean_terminated_length": 974.8399658203125,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 0.04914285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2622542679309845,
      "learning_rate": 9.672327345550543e-07,
      "loss": -0.0,
      "num_tokens": 5379941.0,
      "reward": 0.24864289164543152,
      "reward_std": 0.622364342212677,
      "rewards/cosine_scaled_reward/mean": -0.08661604672670364,
      "rewards/cosine_scaled_reward/std": 0.3968709111213684,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1813.0,
      "completions/mean_length": 1598.265625,
      "completions/mean_terminated_length": 848.7083740234375,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.05028571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3861124813556671,
      "learning_rate": 9.64227184053598e-07,
      "loss": -0.0,
      "num_tokens": 5492926.0,
      "reward": 0.17736095190048218,
      "reward_std": 0.5736653804779053,
      "rewards/cosine_scaled_reward/mean": -0.09881951659917831,
      "rewards/cosine_scaled_reward/std": 0.4637540578842163,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1934.0,
      "completions/mean_length": 1945.546875,
      "completions/mean_terminated_length": 1228.375,
      "completions/min_length": 909.0,
      "completions/min_terminated_length": 909.0,
      "epoch": 0.05142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2586025893688202,
      "learning_rate": 9.610954559391704e-07,
      "loss": -0.0,
      "num_tokens": 5629097.0,
      "reward": -0.2874904274940491,
      "reward_std": 0.4528215825557709,
      "rewards/cosine_scaled_reward/mean": -0.21405771374702454,
      "rewards/cosine_scaled_reward/std": 0.3033171594142914,
      "rewards/format_reward/mean": 0.140625,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1514.0,
      "completions/mean_length": 1772.890625,
      "completions/mean_terminated_length": 874.2000732421875,
      "completions/min_length": 597.0,
      "completions/min_terminated_length": 597.0,
      "epoch": 0.052571428571428575,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27347299456596375,
      "learning_rate": 9.578385041664925e-07,
      "loss": -0.0,
      "num_tokens": 5753730.0,
      "reward": -0.0957992672920227,
      "reward_std": 0.4836219251155853,
      "rewards/cosine_scaled_reward/mean": -0.17289963364601135,
      "rewards/cosine_scaled_reward/std": 0.3050842881202698,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1595.734375,
      "completions/mean_terminated_length": 890.199951171875,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.053714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3536407947540283,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.0,
      "num_tokens": 5866257.0,
      "reward": -0.01777055859565735,
      "reward_std": 0.6523094177246094,
      "rewards/cosine_scaled_reward/mean": -0.22763527929782867,
      "rewards/cosine_scaled_reward/std": 0.3455982208251953,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.5,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1792.0,
      "completions/mean_length": 1581.84375,
      "completions/mean_terminated_length": 1019.2413940429688,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 0.054857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26218104362487793,
      "learning_rate": 9.509529358847654e-07,
      "loss": -0.0,
      "num_tokens": 5978039.0,
      "reward": 0.36145922541618347,
      "reward_std": 0.8229352235794067,
      "rewards/cosine_scaled_reward/mean": -0.06145789101719856,
      "rewards/cosine_scaled_reward/std": 0.4491077661514282,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1525.0,
      "completions/mean_length": 1404.46875,
      "completions/mean_terminated_length": 799.9393920898438,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 0.056,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3138538897037506,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.0,
      "num_tokens": 6077989.0,
      "reward": 0.23753327131271362,
      "reward_std": 0.6856037378311157,
      "rewards/cosine_scaled_reward/mean": -0.1468583643436432,
      "rewards/cosine_scaled_reward/std": 0.36308491230010986,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5029674172401428,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1834.0,
      "completions/mean_length": 1668.46875,
      "completions/mean_terminated_length": 943.9091186523438,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.05714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2541959285736084,
      "learning_rate": 9.43578868212728e-07,
      "loss": 0.0,
      "num_tokens": 6195587.0,
      "reward": 0.2079824060201645,
      "reward_std": 0.6563009023666382,
      "rewards/cosine_scaled_reward/mean": -0.09132131934165955,
      "rewards/cosine_scaled_reward/std": 0.39781448245048523,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.421875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1850.0,
      "completions/mean_length": 1368.90625,
      "completions/mean_terminated_length": 873.3513793945312,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 0.05828571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28964340686798096,
      "learning_rate": 9.397114317029974e-07,
      "loss": -0.0,
      "num_tokens": 6293405.0,
      "reward": 0.3547493815422058,
      "reward_std": 0.702359139919281,
      "rewards/cosine_scaled_reward/mean": -0.1116877943277359,
      "rewards/cosine_scaled_reward/std": 0.37401553988456726,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1647.0,
      "completions/mean_length": 1649.34375,
      "completions/mean_terminated_length": 888.2727661132812,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.05942857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.33357536792755127,
      "learning_rate": 9.357252853159505e-07,
      "loss": -0.0,
      "num_tokens": 6410315.0,
      "reward": 0.257318913936615,
      "reward_std": 1.0037888288497925,
      "rewards/cosine_scaled_reward/mean": -0.08227802813053131,
      "rewards/cosine_scaled_reward/std": 0.48496147990226746,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 1505.484375,
      "completions/mean_terminated_length": 850.72412109375,
      "completions/min_length": 448.0,
      "completions/min_terminated_length": 448.0,
      "epoch": 0.060571428571428575,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28791311383247375,
      "learning_rate": 9.316216432703916e-07,
      "loss": 0.0,
      "num_tokens": 6517498.0,
      "reward": 0.10961895436048508,
      "reward_std": 0.6692662239074707,
      "rewards/cosine_scaled_reward/mean": -0.19519051909446716,
      "rewards/cosine_scaled_reward/std": 0.31183505058288574,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5039526224136353,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1480.296875,
      "completions/mean_terminated_length": 1116.3846435546875,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 0.061714285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25811222195625305,
      "learning_rate": 9.274017555754407e-07,
      "loss": 0.0,
      "num_tokens": 6623381.0,
      "reward": 0.7679780721664429,
      "reward_std": 0.8107975125312805,
      "rewards/cosine_scaled_reward/mean": 0.040239036083221436,
      "rewards/cosine_scaled_reward/std": 0.535083532333374,
      "rewards/format_reward/mean": 0.6875,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1734.0,
      "completions/mean_length": 1411.125,
      "completions/mean_terminated_length": 883.4285888671875,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 0.06285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2835226356983185,
      "learning_rate": 9.230669076497687e-07,
      "loss": -0.0,
      "num_tokens": 6723981.0,
      "reward": 0.2917740046977997,
      "reward_std": 0.7892479300498962,
      "rewards/cosine_scaled_reward/mean": -0.13536299765110016,
      "rewards/cosine_scaled_reward/std": 0.3841571509838104,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.5,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1539.265625,
      "completions/mean_terminated_length": 1030.53125,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 0.064,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27514079213142395,
      "learning_rate": 9.186184199300463e-07,
      "loss": 0.0,
      "num_tokens": 6833454.0,
      "reward": 0.518336832523346,
      "reward_std": 0.5821805000305176,
      "rewards/cosine_scaled_reward/mean": -0.045519083738327026,
      "rewards/cosine_scaled_reward/std": 0.47304341197013855,
      "rewards/format_reward/mean": 0.609375,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.453125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1580.421875,
      "completions/mean_terminated_length": 1193.0,
      "completions/min_length": 709.0,
      "completions/min_terminated_length": 709.0,
      "epoch": 0.06514285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24258998036384583,
      "learning_rate": 9.140576474687263e-07,
      "loss": 0.0,
      "num_tokens": 6946169.0,
      "reward": 0.06029004976153374,
      "reward_std": 0.5583463311195374,
      "rewards/cosine_scaled_reward/mean": -0.2667299807071686,
      "rewards/cosine_scaled_reward/std": 0.29048436880111694,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.49501484632492065,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1777.0,
      "completions/mean_length": 1361.828125,
      "completions/mean_terminated_length": 921.974365234375,
      "completions/min_length": 480.0,
      "completions/min_terminated_length": 480.0,
      "epoch": 0.06628571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2641505300998688,
      "learning_rate": 9.093859795212817e-07,
      "loss": 0.0,
      "num_tokens": 7043422.0,
      "reward": 0.5511020421981812,
      "reward_std": 0.7235630750656128,
      "rewards/cosine_scaled_reward/mean": -0.036948978900909424,
      "rewards/cosine_scaled_reward/std": 0.4425795376300812,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 1535.578125,
      "completions/mean_terminated_length": 1184.9736328125,
      "completions/min_length": 549.0,
      "completions/min_terminated_length": 549.0,
      "epoch": 0.06742857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2389107048511505,
      "learning_rate": 9.046048391230247e-07,
      "loss": -0.0,
      "num_tokens": 7152163.0,
      "reward": 0.40797895193099976,
      "reward_std": 0.6782904863357544,
      "rewards/cosine_scaled_reward/mean": -0.11632302403450012,
      "rewards/cosine_scaled_reward/std": 0.4052051305770874,
      "rewards/format_reward/mean": 0.640625,
      "rewards/format_reward/std": 0.4836103618144989,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 1534.46875,
      "completions/mean_terminated_length": 1020.9375,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.06857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2644842863082886,
      "learning_rate": 8.997156826556369e-07,
      "loss": 0.0,
      "num_tokens": 7261257.0,
      "reward": 0.09485618025064468,
      "reward_std": 0.690287709236145,
      "rewards/cosine_scaled_reward/mean": -0.21038439869880676,
      "rewards/cosine_scaled_reward/std": 0.3277226686477661,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 1678.84375,
      "completions/mean_terminated_length": 1172.9630126953125,
      "completions/min_length": 434.0,
      "completions/min_terminated_length": 434.0,
      "epoch": 0.06971428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2128395438194275,
      "learning_rate": 8.9471999940354e-07,
      "loss": 0.0,
      "num_tokens": 7380223.0,
      "reward": 0.3876636028289795,
      "reward_std": 0.8163598775863647,
      "rewards/cosine_scaled_reward/mean": -0.06398070603609085,
      "rewards/cosine_scaled_reward/std": 0.37083569169044495,
      "rewards/format_reward/mean": 0.515625,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1620.0,
      "completions/mean_length": 1104.90625,
      "completions/mean_terminated_length": 909.1697998046875,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 0.07085714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2858783006668091,
      "learning_rate": 8.896193111002475e-07,
      "loss": -0.0,
      "num_tokens": 7461169.0,
      "reward": 1.0647192001342773,
      "reward_std": 0.7218182682991028,
      "rewards/cosine_scaled_reward/mean": 0.11048462986946106,
      "rewards/cosine_scaled_reward/std": 0.5039199590682983,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1842.0,
      "completions/mean_length": 1272.703125,
      "completions/mean_terminated_length": 866.5952758789062,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 0.072,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29019492864608765,
      "learning_rate": 8.844151714648274e-07,
      "loss": -0.0,
      "num_tokens": 7552878.0,
      "reward": 0.7738356590270996,
      "reward_std": 0.6520147323608398,
      "rewards/cosine_scaled_reward/mean": 0.050980325788259506,
      "rewards/cosine_scaled_reward/std": 0.4395767152309418,
      "rewards/format_reward/mean": 0.671875,
      "rewards/format_reward/std": 0.4732423722743988,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1929.0,
      "completions/mean_length": 1234.015625,
      "completions/mean_terminated_length": 1026.5294189453125,
      "completions/min_length": 425.0,
      "completions/min_terminated_length": 425.0,
      "epoch": 0.07314285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2868574261665344,
      "learning_rate": 8.791091657286267e-07,
      "loss": -0.0,
      "num_tokens": 7642807.0,
      "reward": 0.7110692858695984,
      "reward_std": 0.5455821752548218,
      "rewards/cosine_scaled_reward/mean": -0.07415284961462021,
      "rewards/cosine_scaled_reward/std": 0.3914482891559601,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 1297.984375,
      "completions/mean_terminated_length": 957.0682373046875,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 0.07428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27404776215553284,
      "learning_rate": 8.737029101523929e-07,
      "loss": 0.0,
      "num_tokens": 7736582.0,
      "reward": 0.3309648334980011,
      "reward_std": 0.5653569102287292,
      "rewards/cosine_scaled_reward/mean": -0.18608009815216064,
      "rewards/cosine_scaled_reward/std": 0.3975105583667755,
      "rewards/format_reward/mean": 0.703125,
      "rewards/format_reward/std": 0.4604927599430084,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1778.0,
      "completions/mean_length": 940.03125,
      "completions/mean_terminated_length": 781.7500610351562,
      "completions/min_length": 265.0,
      "completions/min_terminated_length": 265.0,
      "epoch": 0.07542857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3140449821949005,
      "learning_rate": 8.681980515339463e-07,
      "loss": 0.0,
      "num_tokens": 7806976.0,
      "reward": 0.9573196172714233,
      "reward_std": 0.7096561193466187,
      "rewards/cosine_scaled_reward/mean": 0.04115980118513107,
      "rewards/cosine_scaled_reward/std": 0.4971291422843933,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1740.34375,
      "completions/mean_terminated_length": 1191.9130859375,
      "completions/min_length": 769.0,
      "completions/min_terminated_length": 769.0,
      "epoch": 0.07657142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22468577325344086,
      "learning_rate": 8.625962667065487e-07,
      "loss": -0.0,
      "num_tokens": 7929094.0,
      "reward": -0.05777654051780701,
      "reward_std": 0.4052577018737793,
      "rewards/cosine_scaled_reward/mean": -0.2242007553577423,
      "rewards/cosine_scaled_reward/std": 0.19833898544311523,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1562.0,
      "completions/mean_length": 818.421875,
      "completions/mean_terminated_length": 691.22412109375,
      "completions/min_length": 296.0,
      "completions/min_terminated_length": 296.0,
      "epoch": 0.07771428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32118088006973267,
      "learning_rate": 8.568992620281243e-07,
      "loss": -0.0,
      "num_tokens": 7990729.0,
      "reward": 0.9250792264938354,
      "reward_std": 0.7536466717720032,
      "rewards/cosine_scaled_reward/mean": 0.0016020983457565308,
      "rewards/cosine_scaled_reward/std": 0.4650251567363739,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1106.140625,
      "completions/mean_terminated_length": 888.7885131835938,
      "completions/min_length": 369.0,
      "completions/min_terminated_length": 369.0,
      "epoch": 0.07885714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.34258174896240234,
      "learning_rate": 8.511087728614862e-07,
      "loss": 0.0,
      "num_tokens": 8071866.0,
      "reward": 0.43910637497901917,
      "reward_std": 0.5676280856132507,
      "rewards/cosine_scaled_reward/mean": -0.19450931251049042,
      "rewards/cosine_scaled_reward/std": 0.277770459651947,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 1519.0625,
      "completions/mean_terminated_length": 1157.157958984375,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 0.08,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2579463720321655,
      "learning_rate": 8.452265630457282e-07,
      "loss": 0.0,
      "num_tokens": 8180542.0,
      "reward": 0.38829973340034485,
      "reward_std": 0.7944818735122681,
      "rewards/cosine_scaled_reward/mean": -0.12616263329982758,
      "rewards/cosine_scaled_reward/std": 0.42241016030311584,
      "rewards/format_reward/mean": 0.640625,
      "rewards/format_reward/std": 0.4836103618144989,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1468.515625,
      "completions/mean_terminated_length": 1017.8055419921875,
      "completions/min_length": 316.0,
      "completions/min_terminated_length": 316.0,
      "epoch": 0.08114285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29125189781188965,
      "learning_rate": 8.392544243589427e-07,
      "loss": 0.0,
      "num_tokens": 8285247.0,
      "reward": 0.5433856248855591,
      "reward_std": 0.7695709466934204,
      "rewards/cosine_scaled_reward/mean": -0.017369696870446205,
      "rewards/cosine_scaled_reward/std": 0.49587228894233704,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1293.078125,
      "completions/mean_terminated_length": 949.9318237304688,
      "completions/min_length": 264.0,
      "completions/min_terminated_length": 264.0,
      "epoch": 0.08228571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3353990316390991,
      "learning_rate": 8.331941759724268e-07,
      "loss": -0.0,
      "num_tokens": 8378356.0,
      "reward": 0.42883288860321045,
      "reward_std": 0.6259180307388306,
      "rewards/cosine_scaled_reward/mean": -0.14495855569839478,
      "rewards/cosine_scaled_reward/std": 0.29958412051200867,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.40625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1505.328125,
      "completions/mean_terminated_length": 1134.0263671875,
      "completions/min_length": 590.0,
      "completions/min_terminated_length": 590.0,
      "epoch": 0.08342857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24657614529132843,
      "learning_rate": 8.270476638965461e-07,
      "loss": 0.0,
      "num_tokens": 8485425.0,
      "reward": 0.2927630543708801,
      "reward_std": 0.5202052593231201,
      "rewards/cosine_scaled_reward/mean": -0.15830597281455994,
      "rewards/cosine_scaled_reward/std": 0.3964028060436249,
      "rewards/format_reward/mean": 0.609375,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1677.0,
      "completions/mean_length": 1350.203125,
      "completions/mean_terminated_length": 931.5250244140625,
      "completions/min_length": 416.0,
      "completions/min_terminated_length": 416.0,
      "epoch": 0.08457142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2827599048614502,
      "learning_rate": 8.208167604184217e-07,
      "loss": -0.0,
      "num_tokens": 8581766.0,
      "reward": 0.599705159664154,
      "reward_std": 0.6438653469085693,
      "rewards/cosine_scaled_reward/mean": -0.012647412717342377,
      "rewards/cosine_scaled_reward/std": 0.49363547563552856,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1410.484375,
      "completions/mean_terminated_length": 1141.3111572265625,
      "completions/min_length": 334.0,
      "completions/min_terminated_length": 334.0,
      "epoch": 0.08571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2516263425350189,
      "learning_rate": 8.145033635316128e-07,
      "loss": -0.0,
      "num_tokens": 8682997.0,
      "reward": 0.7366052269935608,
      "reward_std": 0.5748400092124939,
      "rewards/cosine_scaled_reward/mean": -0.030134890228509903,
      "rewards/cosine_scaled_reward/std": 0.4929082691669464,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1604.0,
      "completions/mean_length": 1206.453125,
      "completions/mean_terminated_length": 851.1333618164062,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.08685714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.33788394927978516,
      "learning_rate": 8.081093963579707e-07,
      "loss": 0.0,
      "num_tokens": 8770610.0,
      "reward": 0.2925173044204712,
      "reward_std": 0.543351948261261,
      "rewards/cosine_scaled_reward/mean": -0.2053038477897644,
      "rewards/cosine_scaled_reward/std": 0.35098204016685486,
      "rewards/format_reward/mean": 0.703125,
      "rewards/format_reward/std": 0.4604927599430084,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1203.796875,
      "completions/mean_terminated_length": 988.6078491210938,
      "completions/min_length": 459.0,
      "completions/min_terminated_length": 459.0,
      "epoch": 0.088,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2513405680656433,
      "learning_rate": 8.01636806561836e-07,
      "loss": -0.0,
      "num_tokens": 8859229.0,
      "reward": 0.5237706899642944,
      "reward_std": 0.6414985060691833,
      "rewards/cosine_scaled_reward/mean": -0.15217715501785278,
      "rewards/cosine_scaled_reward/std": 0.3733552396297455,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 1434.796875,
      "completions/mean_terminated_length": 893.7352905273438,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.08914285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28274238109588623,
      "learning_rate": 7.950875657567621e-07,
      "loss": -0.0,
      "num_tokens": 8961776.0,
      "reward": 0.3842669129371643,
      "reward_std": 0.7946954369544983,
      "rewards/cosine_scaled_reward/mean": -0.08911655098199844,
      "rewards/cosine_scaled_reward/std": 0.4567674696445465,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.5,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1759.0,
      "completions/mean_length": 958.921875,
      "completions/mean_terminated_length": 757.24072265625,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.09028571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3380838930606842,
      "learning_rate": 7.884636689049422e-07,
      "loss": 0.0,
      "num_tokens": 9033243.0,
      "reward": 0.5171822309494019,
      "reward_std": 0.4722011089324951,
      "rewards/cosine_scaled_reward/mean": -0.16328388452529907,
      "rewards/cosine_scaled_reward/std": 0.371114581823349,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1309.25,
      "completions/mean_terminated_length": 997.3333740234375,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.09142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28732550144195557,
      "learning_rate": 7.817671337095244e-07,
      "loss": -0.0,
      "num_tokens": 9127427.0,
      "reward": 0.42246782779693604,
      "reward_std": 0.5261572003364563,
      "rewards/cosine_scaled_reward/mean": -0.1559535712003708,
      "rewards/cosine_scaled_reward/std": 0.37669748067855835,
      "rewards/format_reward/mean": 0.734375,
      "rewards/format_reward/std": 0.44515693187713623,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1156.71875,
      "completions/mean_terminated_length": 991.6666870117188,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 0.09257142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.36872920393943787,
      "learning_rate": 7.75e-07,
      "loss": 0.0,
      "num_tokens": 9212729.0,
      "reward": 0.6148654222488403,
      "reward_std": 0.6330965161323547,
      "rewards/cosine_scaled_reward/mean": -0.13006731867790222,
      "rewards/cosine_scaled_reward/std": 0.36434388160705566,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1259.765625,
      "completions/mean_terminated_length": 997.0208740234375,
      "completions/min_length": 441.0,
      "completions/min_terminated_length": 441.0,
      "epoch": 0.09371428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2519986927509308,
      "learning_rate": 7.681643291108517e-07,
      "loss": 0.0,
      "num_tokens": 9303682.0,
      "reward": 0.8954258561134338,
      "reward_std": 0.4955286383628845,
      "rewards/cosine_scaled_reward/mean": 0.05708790570497513,
      "rewards/cosine_scaled_reward/std": 0.548876941204071,
      "rewards/format_reward/mean": 0.78125,
      "rewards/format_reward/std": 0.4166666865348816,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1342.71875,
      "completions/mean_terminated_length": 973.2857055664062,
      "completions/min_length": 66.0,
      "completions/min_terminated_length": 66.0,
      "epoch": 0.09485714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.5289212465286255,
      "learning_rate": 7.612622032536507e-07,
      "loss": -0.0,
      "num_tokens": 9400704.0,
      "reward": 0.5462230443954468,
      "reward_std": 0.6437035799026489,
      "rewards/cosine_scaled_reward/mean": -0.05501346290111542,
      "rewards/cosine_scaled_reward/std": 0.3811412453651428,
      "rewards/format_reward/mean": 0.65625,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1603.0,
      "completions/mean_length": 1094.421875,
      "completions/mean_terminated_length": 874.3654174804688,
      "completions/min_length": 77.0,
      "completions/min_terminated_length": 77.0,
      "epoch": 0.096,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3851650059223175,
      "learning_rate": 7.54295724882796e-07,
      "loss": -0.0,
      "num_tokens": 9481443.0,
      "reward": 0.42046594619750977,
      "reward_std": 0.5146702527999878,
      "rewards/cosine_scaled_reward/mean": -0.18820451200008392,
      "rewards/cosine_scaled_reward/std": 0.3148095905780792,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 1293.734375,
      "completions/mean_terminated_length": 1101.4705810546875,
      "completions/min_length": 634.0,
      "completions/min_terminated_length": 634.0,
      "epoch": 0.09714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21673259139060974,
      "learning_rate": 7.472670160550848e-07,
      "loss": -0.0,
      "num_tokens": 9574562.0,
      "reward": 0.5379458665847778,
      "reward_std": 0.5725549459457397,
      "rewards/cosine_scaled_reward/mean": -0.1529020518064499,
      "rewards/cosine_scaled_reward/std": 0.34331607818603516,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1286.953125,
      "completions/mean_terminated_length": 1111.326904296875,
      "completions/min_length": 457.0,
      "completions/min_terminated_length": 457.0,
      "epoch": 0.09828571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29995283484458923,
      "learning_rate": 7.401782177833147e-07,
      "loss": -0.0,
      "num_tokens": 9667783.0,
      "reward": 0.7049737572669983,
      "reward_std": 0.5882902145385742,
      "rewards/cosine_scaled_reward/mean": -0.06938813626766205,
      "rewards/cosine_scaled_reward/std": 0.35692107677459717,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1650.0,
      "completions/mean_length": 953.484375,
      "completions/mean_terminated_length": 797.1250610351562,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.09942857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.8409918546676636,
      "learning_rate": 7.330314893841101e-07,
      "loss": -0.0,
      "num_tokens": 9738926.0,
      "reward": 0.5864202976226807,
      "reward_std": 0.5318285822868347,
      "rewards/cosine_scaled_reward/mean": -0.13647735118865967,
      "rewards/cosine_scaled_reward/std": 0.3167019188404083,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1729.0,
      "completions/max_terminated_length": 1729.0,
      "completions/mean_length": 886.546875,
      "completions/mean_terminated_length": 886.546875,
      "completions/min_length": 126.0,
      "completions/min_terminated_length": 126.0,
      "epoch": 0.10057142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.35503509640693665,
      "learning_rate": 7.258290078201731e-07,
      "loss": -0.0,
      "num_tokens": 9806177.0,
      "reward": 1.2680045366287231,
      "reward_std": 0.6174743175506592,
      "rewards/cosine_scaled_reward/mean": 0.14181479811668396,
      "rewards/cosine_scaled_reward/std": 0.5076755881309509,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1268.875,
      "completions/mean_terminated_length": 1124.5926513671875,
      "completions/min_length": 496.0,
      "completions/min_terminated_length": 496.0,
      "epoch": 0.10171428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2577503025531769,
      "learning_rate": 7.185729670371604e-07,
      "loss": 0.0,
      "num_tokens": 9898385.0,
      "reward": 0.7599090337753296,
      "reward_std": 0.8181240558624268,
      "rewards/cosine_scaled_reward/mean": -0.073170505464077,
      "rewards/cosine_scaled_reward/std": 0.43178030848503113,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29378482699394226,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1870.0,
      "completions/mean_length": 1033.5,
      "completions/mean_terminated_length": 888.5714721679688,
      "completions/min_length": 160.0,
      "completions/min_terminated_length": 160.0,
      "epoch": 0.10285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.39826640486717224,
      "learning_rate": 7.11265577295385e-07,
      "loss": -0.0,
      "num_tokens": 9974529.0,
      "reward": 0.4363415837287903,
      "reward_std": 0.48137861490249634,
      "rewards/cosine_scaled_reward/mean": -0.21932920813560486,
      "rewards/cosine_scaled_reward/std": 0.31020957231521606,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1911.0,
      "completions/mean_length": 1406.75,
      "completions/mean_terminated_length": 1155.8260498046875,
      "completions/min_length": 288.0,
      "completions/min_terminated_length": 288.0,
      "epoch": 0.104,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2828798294067383,
      "learning_rate": 7.039090644965509e-07,
      "loss": 0.0,
      "num_tokens": 10075129.0,
      "reward": 0.6231826543807983,
      "reward_std": 0.8636409044265747,
      "rewards/cosine_scaled_reward/mean": -0.06340868771076202,
      "rewards/cosine_scaled_reward/std": 0.49163660407066345,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1240.375,
      "completions/mean_terminated_length": 1054.0,
      "completions/min_length": 294.0,
      "completions/min_terminated_length": 294.0,
      "epoch": 0.10514285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28899839520454407,
      "learning_rate": 6.965056695057204e-07,
      "loss": -0.0,
      "num_tokens": 10164505.0,
      "reward": 0.5402791500091553,
      "reward_std": 0.6010072827339172,
      "rewards/cosine_scaled_reward/mean": -0.18298542499542236,
      "rewards/cosine_scaled_reward/std": 0.3275497853755951,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29378482699394226,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.328125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1452.328125,
      "completions/mean_terminated_length": 1161.4185791015625,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 0.10628571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29217013716697693,
      "learning_rate": 6.890576474687263e-07,
      "loss": 0.0,
      "num_tokens": 10268798.0,
      "reward": 0.3901035785675049,
      "reward_std": 0.5545454025268555,
      "rewards/cosine_scaled_reward/mean": -0.20338571071624756,
      "rewards/cosine_scaled_reward/std": 0.30444955825805664,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1862.0,
      "completions/mean_length": 1260.515625,
      "completions/mean_terminated_length": 928.022216796875,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 0.10742857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3268873393535614,
      "learning_rate": 6.815672671252315e-07,
      "loss": 0.0,
      "num_tokens": 10359239.0,
      "reward": 0.7386243343353271,
      "reward_std": 0.7403403520584106,
      "rewards/cosine_scaled_reward/mean": 0.017749670892953873,
      "rewards/cosine_scaled_reward/std": 0.48611870408058167,
      "rewards/format_reward/mean": 0.703125,
      "rewards/format_reward/std": 0.4604927599430084,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1586.0,
      "completions/mean_length": 1262.921875,
      "completions/mean_terminated_length": 1099.981201171875,
      "completions/min_length": 494.0,
      "completions/min_terminated_length": 494.0,
      "epoch": 0.10857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2648283839225769,
      "learning_rate": 6.740368101176495e-07,
      "loss": -0.0,
      "num_tokens": 10450594.0,
      "reward": 0.2988126277923584,
      "reward_std": 0.47151660919189453,
      "rewards/cosine_scaled_reward/mean": -0.2646561861038208,
      "rewards/cosine_scaled_reward/std": 0.2507747411727905,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1733.0,
      "completions/mean_length": 1106.09375,
      "completions/mean_terminated_length": 931.6666870117188,
      "completions/min_length": 352.0,
      "completions/min_terminated_length": 352.0,
      "epoch": 0.10971428571428571,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29469895362854004,
      "learning_rate": 6.664685702961344e-07,
      "loss": -0.0,
      "num_tokens": 10532520.0,
      "reward": 0.810766339302063,
      "reward_std": 0.48911383748054504,
      "rewards/cosine_scaled_reward/mean": -0.02430431731045246,
      "rewards/cosine_scaled_reward/std": 0.47827479243278503,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 1183.0625,
      "completions/mean_terminated_length": 1041.5272216796875,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 0.11085714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2775411605834961,
      "learning_rate": 6.588648530198504e-07,
      "loss": -0.0,
      "num_tokens": 10618500.0,
      "reward": 0.711927056312561,
      "reward_std": 0.7540339231491089,
      "rewards/cosine_scaled_reward/mean": -0.08934895694255829,
      "rewards/cosine_scaled_reward/std": 0.39428383111953735,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 1130.078125,
      "completions/mean_terminated_length": 998.9464721679688,
      "completions/min_length": 335.0,
      "completions/min_terminated_length": 335.0,
      "epoch": 0.112,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2643944323062897,
      "learning_rate": 6.512279744547392e-07,
      "loss": -0.0,
      "num_tokens": 10701889.0,
      "reward": 0.9012003540992737,
      "reward_std": 0.6251660585403442,
      "rewards/cosine_scaled_reward/mean": -0.0025248080492019653,
      "rewards/cosine_scaled_reward/std": 0.48845264315605164,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29378482699394226,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 1079.828125,
      "completions/mean_terminated_length": 941.5178833007812,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 0.11314285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.31110212206840515,
      "learning_rate": 6.435602608679916e-07,
      "loss": -0.0,
      "num_tokens": 10782166.0,
      "reward": 0.9453647136688232,
      "reward_std": 0.5895795822143555,
      "rewards/cosine_scaled_reward/mean": -0.011692702770233154,
      "rewards/cosine_scaled_reward/std": 0.49524030089378357,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 1099.609375,
      "completions/mean_terminated_length": 964.1250610351562,
      "completions/min_length": 394.0,
      "completions/min_terminated_length": 394.0,
      "epoch": 0.11428571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28003421425819397,
      "learning_rate": 6.358640479194451e-07,
      "loss": 0.0,
      "num_tokens": 10862253.0,
      "reward": 1.0164594650268555,
      "reward_std": 0.8200770616531372,
      "rewards/cosine_scaled_reward/mean": 0.047292180359363556,
      "rewards/cosine_scaled_reward/std": 0.5198192000389099,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1940.0,
      "completions/mean_length": 1095.8125,
      "completions/mean_terminated_length": 1048.9835205078125,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 0.11542857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.31470584869384766,
      "learning_rate": 6.281416799501187e-07,
      "loss": -0.0,
      "num_tokens": 10943817.0,
      "reward": 0.7535556554794312,
      "reward_std": 0.5378469228744507,
      "rewards/cosine_scaled_reward/mean": -0.11540969461202621,
      "rewards/cosine_scaled_reward/std": 0.37915751338005066,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1952.0,
      "completions/mean_length": 1071.109375,
      "completions/mean_terminated_length": 931.5535888671875,
      "completions/min_length": 415.0,
      "completions/min_terminated_length": 415.0,
      "epoch": 0.11657142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3006651699542999,
      "learning_rate": 6.203955092681039e-07,
      "loss": 0.0,
      "num_tokens": 11022520.0,
      "reward": 0.6289626359939575,
      "reward_std": 0.5818617939949036,
      "rewards/cosine_scaled_reward/mean": -0.12301868200302124,
      "rewards/cosine_scaled_reward/std": 0.3668438792228699,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 1084.203125,
      "completions/mean_terminated_length": 965.8421020507812,
      "completions/min_length": 222.0,
      "completions/min_terminated_length": 222.0,
      "epoch": 0.11771428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32252946496009827,
      "learning_rate": 6.126278954320294e-07,
      "loss": -0.0,
      "num_tokens": 11102261.0,
      "reward": 0.6010515689849854,
      "reward_std": 0.7124715447425842,
      "rewards/cosine_scaled_reward/mean": -0.1682242453098297,
      "rewards/cosine_scaled_reward/std": 0.34618714451789856,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 1172.25,
      "completions/mean_terminated_length": 990.4906005859375,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.11885714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30141475796699524,
      "learning_rate": 6.048412045323164e-07,
      "loss": 0.0,
      "num_tokens": 11187749.0,
      "reward": 0.5190969705581665,
      "reward_std": 0.4445875287055969,
      "rewards/cosine_scaled_reward/mean": -0.16232651472091675,
      "rewards/cosine_scaled_reward/std": 0.2870725095272064,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 1155.9375,
      "completions/mean_terminated_length": 882.8571166992188,
      "completions/min_length": 492.0,
      "completions/min_terminated_length": 492.0,
      "epoch": 0.12,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3128022253513336,
      "learning_rate": 5.97037808470444e-07,
      "loss": 0.0,
      "num_tokens": 11273017.0,
      "reward": 0.844305157661438,
      "reward_std": 0.8654354810714722,
      "rewards/cosine_scaled_reward/mean": 0.0080900639295578,
      "rewards/cosine_scaled_reward/std": 0.5000066757202148,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1920.0,
      "completions/mean_length": 982.828125,
      "completions/mean_terminated_length": 911.8167114257812,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 0.12114285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.287850946187973,
      "learning_rate": 5.892200842364462e-07,
      "loss": -0.0,
      "num_tokens": 11346750.0,
      "reward": 0.9764542579650879,
      "reward_std": 0.523072361946106,
      "rewards/cosine_scaled_reward/mean": 0.019477128982543945,
      "rewards/cosine_scaled_reward/std": 0.44723302125930786,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1898.0,
      "completions/mean_length": 1099.75,
      "completions/mean_terminated_length": 944.581787109375,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 0.12228571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2976638078689575,
      "learning_rate": 5.813904131848564e-07,
      "loss": -0.0,
      "num_tokens": 11428286.0,
      "reward": 0.727447509765625,
      "reward_std": 0.630887508392334,
      "rewards/cosine_scaled_reward/mean": -0.0737762451171875,
      "rewards/cosine_scaled_reward/std": 0.4439302980899811,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1688.0,
      "completions/mean_length": 1213.25,
      "completions/mean_terminated_length": 979.5199584960938,
      "completions/min_length": 472.0,
      "completions/min_terminated_length": 472.0,
      "epoch": 0.12342857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2800474762916565,
      "learning_rate": 5.735511803093248e-07,
      "loss": -0.0,
      "num_tokens": 11516294.0,
      "reward": 0.5832531452178955,
      "reward_std": 0.73647540807724,
      "rewards/cosine_scaled_reward/mean": -0.11462344229221344,
      "rewards/cosine_scaled_reward/std": 0.37341246008872986,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.39339789748191833,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1828.0,
      "completions/mean_length": 1253.625,
      "completions/mean_terminated_length": 1010.448974609375,
      "completions/min_length": 316.0,
      "completions/min_terminated_length": 316.0,
      "epoch": 0.12457142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2575153708457947,
      "learning_rate": 5.657047735161255e-07,
      "loss": 0.0,
      "num_tokens": 11607774.0,
      "reward": 0.5941106677055359,
      "reward_std": 0.6006972789764404,
      "rewards/cosine_scaled_reward/mean": -0.10138219594955444,
      "rewards/cosine_scaled_reward/std": 0.4213758409023285,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 1212.171875,
      "completions/mean_terminated_length": 978.1399536132812,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.12571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3692750930786133,
      "learning_rate": 5.578535828967777e-07,
      "loss": 0.0,
      "num_tokens": 11696129.0,
      "reward": 0.5244784355163574,
      "reward_std": 0.5243270397186279,
      "rewards/cosine_scaled_reward/mean": -0.15963581204414368,
      "rewards/cosine_scaled_reward/std": 0.31791090965270996,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1957.0,
      "completions/mean_length": 925.640625,
      "completions/mean_terminated_length": 907.825439453125,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 0.12685714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2943163514137268,
      "learning_rate": 5.5e-07,
      "loss": 0.0,
      "num_tokens": 11765490.0,
      "reward": 1.0681891441345215,
      "reward_std": 0.666343092918396,
      "rewards/cosine_scaled_reward/mean": 0.041907064616680145,
      "rewards/cosine_scaled_reward/std": 0.4317578375339508,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 1297.265625,
      "completions/mean_terminated_length": 1067.448974609375,
      "completions/min_length": 536.0,
      "completions/min_terminated_length": 536.0,
      "epoch": 0.128,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2623840868473053,
      "learning_rate": 5.421464171032224e-07,
      "loss": -0.0,
      "num_tokens": 11859611.0,
      "reward": 0.5628055930137634,
      "reward_std": 0.664225697517395,
      "rewards/cosine_scaled_reward/mean": -0.09359719604253769,
      "rewards/cosine_scaled_reward/std": 0.4235653281211853,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1643.0,
      "completions/mean_length": 905.21875,
      "completions/mean_terminated_length": 887.0794067382812,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 0.12914285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3688279390335083,
      "learning_rate": 5.342952264838747e-07,
      "loss": 0.0,
      "num_tokens": 11927841.0,
      "reward": 1.0120556354522705,
      "reward_std": 0.7051924467086792,
      "rewards/cosine_scaled_reward/mean": 0.006027787923812866,
      "rewards/cosine_scaled_reward/std": 0.4939332902431488,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 984.96875,
      "completions/mean_terminated_length": 914.1000366210938,
      "completions/min_length": 267.0,
      "completions/min_terminated_length": 267.0,
      "epoch": 0.13028571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27232953906059265,
      "learning_rate": 5.264488196906752e-07,
      "loss": 0.0,
      "num_tokens": 12000727.0,
      "reward": 0.5494575500488281,
      "reward_std": 0.5694750547409058,
      "rewards/cosine_scaled_reward/mean": -0.21745873987674713,
      "rewards/cosine_scaled_reward/std": 0.3295048475265503,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1129.015625,
      "completions/mean_terminated_length": 958.8333129882812,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 0.13142857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.31655776500701904,
      "learning_rate": 5.186095868151436e-07,
      "loss": 0.0,
      "num_tokens": 12084200.0,
      "reward": 0.6148514747619629,
      "reward_std": 0.592422604560852,
      "rewards/cosine_scaled_reward/mean": -0.12226178497076035,
      "rewards/cosine_scaled_reward/std": 0.44683361053466797,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1829.0,
      "completions/mean_length": 1261.96875,
      "completions/mean_terminated_length": 1098.8302001953125,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.13257142857142856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2751889228820801,
      "learning_rate": 5.107799157635538e-07,
      "loss": 0.0,
      "num_tokens": 12176350.0,
      "reward": 0.7403342723846436,
      "reward_std": 0.6721117496490479,
      "rewards/cosine_scaled_reward/mean": -0.05170784145593643,
      "rewards/cosine_scaled_reward/std": 0.43005797266960144,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1906.0,
      "completions/mean_length": 1156.421875,
      "completions/mean_terminated_length": 1080.8643798828125,
      "completions/min_length": 510.0,
      "completions/min_terminated_length": 510.0,
      "epoch": 0.1337142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2688439190387726,
      "learning_rate": 5.02962191529556e-07,
      "loss": 0.0,
      "num_tokens": 12261385.0,
      "reward": 0.8138295412063599,
      "reward_std": 0.5513401031494141,
      "rewards/cosine_scaled_reward/mean": -0.09308521449565887,
      "rewards/cosine_scaled_reward/std": 0.3840063810348511,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1838.0,
      "completions/mean_length": 1200.328125,
      "completions/mean_terminated_length": 962.97998046875,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 0.13485714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27440333366394043,
      "learning_rate": 4.951587954676837e-07,
      "loss": -0.0,
      "num_tokens": 12348742.0,
      "reward": 0.7029136419296265,
      "reward_std": 0.819955587387085,
      "rewards/cosine_scaled_reward/mean": -0.05479319393634796,
      "rewards/cosine_scaled_reward/std": 0.4745423495769501,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.39339789748191833,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 1073.125,
      "completions/mean_terminated_length": 933.857177734375,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 0.136,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4175036549568176,
      "learning_rate": 4.873721045679706e-07,
      "loss": -0.0,
      "num_tokens": 12427830.0,
      "reward": 0.9872255325317383,
      "reward_std": 0.5814859867095947,
      "rewards/cosine_scaled_reward/mean": 0.056112758815288544,
      "rewards/cosine_scaled_reward/std": 0.5062689781188965,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1674.0,
      "completions/mean_length": 865.609375,
      "completions/mean_terminated_length": 827.4677124023438,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 0.13714285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3479575216770172,
      "learning_rate": 4.79604490731896e-07,
      "loss": 0.0,
      "num_tokens": 12493685.0,
      "reward": 0.8312849998474121,
      "reward_std": 0.6629652976989746,
      "rewards/cosine_scaled_reward/mean": -0.07654500752687454,
      "rewards/cosine_scaled_reward/std": 0.4900154173374176,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1877.0,
      "completions/mean_length": 754.1875,
      "completions/mean_terminated_length": 733.6508178710938,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.1382857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3520006239414215,
      "learning_rate": 4.7185832004988133e-07,
      "loss": 0.0,
      "num_tokens": 12551649.0,
      "reward": 1.1358025074005127,
      "reward_std": 0.6475541591644287,
      "rewards/cosine_scaled_reward/mean": 0.06790120899677277,
      "rewards/cosine_scaled_reward/std": 0.5241734385490417,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 1191.53125,
      "completions/mean_terminated_length": 1069.1785888671875,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.13942857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29668495059013367,
      "learning_rate": 4.641359520805548e-07,
      "loss": 0.0,
      "num_tokens": 12639467.0,
      "reward": 0.9448899030685425,
      "reward_std": 0.731184184551239,
      "rewards/cosine_scaled_reward/mean": -0.01193002238869667,
      "rewards/cosine_scaled_reward/std": 0.4401930868625641,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1111.390625,
      "completions/mean_terminated_length": 996.368408203125,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 0.14057142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.33428606390953064,
      "learning_rate": 4.5643973913200837e-07,
      "loss": 0.0,
      "num_tokens": 12720828.0,
      "reward": 0.605126678943634,
      "reward_std": 0.5388374328613281,
      "rewards/cosine_scaled_reward/mean": -0.16618669033050537,
      "rewards/cosine_scaled_reward/std": 0.36216598749160767,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1889.0,
      "completions/mean_length": 1088.375,
      "completions/mean_terminated_length": 1007.0508422851562,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.1417142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27362972497940063,
      "learning_rate": 4.4877202554526084e-07,
      "loss": 0.0,
      "num_tokens": 12801436.0,
      "reward": 0.8875737190246582,
      "reward_std": 0.7504779100418091,
      "rewards/cosine_scaled_reward/mean": -0.04058811068534851,
      "rewards/cosine_scaled_reward/std": 0.41639918088912964,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 1045.59375,
      "completions/mean_terminated_length": 941.8965454101562,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 0.14285714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26206985116004944,
      "learning_rate": 4.4113514698014953e-07,
      "loss": 0.0,
      "num_tokens": 12879282.0,
      "reward": 0.9140812754631042,
      "reward_std": 0.5521372556686401,
      "rewards/cosine_scaled_reward/mean": -0.011709354817867279,
      "rewards/cosine_scaled_reward/std": 0.478300005197525,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1726.0,
      "completions/mean_length": 1053.3125,
      "completions/mean_terminated_length": 950.413818359375,
      "completions/min_length": 555.0,
      "completions/min_terminated_length": 555.0,
      "epoch": 0.144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2631860077381134,
      "learning_rate": 4.3353142970386557e-07,
      "loss": 0.0,
      "num_tokens": 12957990.0,
      "reward": 0.9084943532943726,
      "reward_std": 0.7618498802185059,
      "rewards/cosine_scaled_reward/mean": -0.006690334528684616,
      "rewards/cosine_scaled_reward/std": 0.4652135372161865,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1078.5625,
      "completions/mean_terminated_length": 978.27587890625,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.14514285714285713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32610929012298584,
      "learning_rate": 4.2596318988235037e-07,
      "loss": 0.0,
      "num_tokens": 13038114.0,
      "reward": 0.6247783303260803,
      "reward_std": 0.6168485879898071,
      "rewards/cosine_scaled_reward/mean": -0.16417336463928223,
      "rewards/cosine_scaled_reward/std": 0.3348706066608429,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1018.53125,
      "completions/mean_terminated_length": 949.9000244140625,
      "completions/min_length": 392.0,
      "completions/min_terminated_length": 392.0,
      "epoch": 0.1462857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2964915633201599,
      "learning_rate": 4.1843273287476854e-07,
      "loss": 0.0,
      "num_tokens": 13113652.0,
      "reward": 1.0467863082885742,
      "reward_std": 0.696172833442688,
      "rewards/cosine_scaled_reward/mean": 0.031205661594867706,
      "rewards/cosine_scaled_reward/std": 0.5108028650283813,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1711.0,
      "completions/mean_length": 1295.765625,
      "completions/mean_terminated_length": 1104.0196533203125,
      "completions/min_length": 493.0,
      "completions/min_terminated_length": 493.0,
      "epoch": 0.14742857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24947936832904816,
      "learning_rate": 4.1094235253127374e-07,
      "loss": -0.0,
      "num_tokens": 13207285.0,
      "reward": 0.6936242580413818,
      "reward_std": 0.7887886762619019,
      "rewards/cosine_scaled_reward/mean": -0.07506285607814789,
      "rewards/cosine_scaled_reward/std": 0.42623400688171387,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1796.0,
      "completions/mean_length": 1126.21875,
      "completions/mean_terminated_length": 913.5000610351562,
      "completions/min_length": 359.0,
      "completions/min_terminated_length": 359.0,
      "epoch": 0.14857142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2999359369277954,
      "learning_rate": 4.034943304942796e-07,
      "loss": -0.0,
      "num_tokens": 13289867.0,
      "reward": 0.43217456340789795,
      "reward_std": 0.5134756565093994,
      "rewards/cosine_scaled_reward/mean": -0.19797520339488983,
      "rewards/cosine_scaled_reward/std": 0.26216205954551697,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1773.0,
      "completions/mean_length": 1085.21875,
      "completions/mean_terminated_length": 885.396240234375,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.14971428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.336990088224411,
      "learning_rate": 3.9609093550344907e-07,
      "loss": 0.0,
      "num_tokens": 13369057.0,
      "reward": 0.9999659061431885,
      "reward_std": 0.5346121788024902,
      "rewards/cosine_scaled_reward/mean": 0.06248297542333603,
      "rewards/cosine_scaled_reward/std": 0.48766252398490906,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1847.0,
      "completions/mean_length": 1286.34375,
      "completions/mean_terminated_length": 798.1026000976562,
      "completions/min_length": 356.0,
      "completions/min_terminated_length": 356.0,
      "epoch": 0.15085714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29184556007385254,
      "learning_rate": 3.8873442270461485e-07,
      "loss": -0.0,
      "num_tokens": 13462447.0,
      "reward": 0.6418495178222656,
      "reward_std": 0.674288809299469,
      "rewards/cosine_scaled_reward/mean": 0.008424755185842514,
      "rewards/cosine_scaled_reward/std": 0.4481044411659241,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1703.0,
      "completions/mean_length": 1147.875,
      "completions/mean_terminated_length": 940.1538696289062,
      "completions/min_length": 462.0,
      "completions/min_terminated_length": 462.0,
      "epoch": 0.152,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2999313175678253,
      "learning_rate": 3.8142703296283953e-07,
      "loss": 0.0,
      "num_tokens": 13547135.0,
      "reward": 0.5941276550292969,
      "reward_std": 0.6214425563812256,
      "rewards/cosine_scaled_reward/mean": -0.11699868738651276,
      "rewards/cosine_scaled_reward/std": 0.32567545771598816,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1888.0,
      "completions/mean_length": 952.796875,
      "completions/mean_terminated_length": 935.4127807617188,
      "completions/min_length": 314.0,
      "completions/min_terminated_length": 314.0,
      "epoch": 0.15314285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3382585942745209,
      "learning_rate": 3.7417099217982686e-07,
      "loss": 0.0,
      "num_tokens": 13618682.0,
      "reward": 1.1113653182983398,
      "reward_std": 0.6422195434570312,
      "rewards/cosine_scaled_reward/mean": 0.07130765169858932,
      "rewards/cosine_scaled_reward/std": 0.5419203042984009,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1522.0,
      "completions/mean_length": 749.625,
      "completions/mean_terminated_length": 685.7704467773438,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.15428571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.45106765627861023,
      "learning_rate": 3.6696851061588994e-07,
      "loss": 0.0,
      "num_tokens": 13677130.0,
      "reward": 1.3096649646759033,
      "reward_std": 0.8325010538101196,
      "rewards/cosine_scaled_reward/mean": 0.17826992273330688,
      "rewards/cosine_scaled_reward/std": 0.49473828077316284,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1694.0,
      "completions/mean_length": 1074.609375,
      "completions/mean_terminated_length": 992.11865234375,
      "completions/min_length": 432.0,
      "completions/min_terminated_length": 432.0,
      "epoch": 0.15542857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3076488673686981,
      "learning_rate": 3.5982178221668533e-07,
      "loss": 0.0,
      "num_tokens": 13756577.0,
      "reward": 1.0143537521362305,
      "reward_std": 0.645289957523346,
      "rewards/cosine_scaled_reward/mean": 0.01498936116695404,
      "rewards/cosine_scaled_reward/std": 0.4602736830711365,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 1153.1875,
      "completions/mean_terminated_length": 1077.35595703125,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 0.15657142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26375824213027954,
      "learning_rate": 3.5273298394491515e-07,
      "loss": 0.0,
      "num_tokens": 13840989.0,
      "reward": 0.7302319407463074,
      "reward_std": 0.6595839262008667,
      "rewards/cosine_scaled_reward/mean": -0.11144650727510452,
      "rewards/cosine_scaled_reward/std": 0.36661940813064575,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1057.84375,
      "completions/mean_terminated_length": 1025.9031982421875,
      "completions/min_length": 318.0,
      "completions/min_terminated_length": 318.0,
      "epoch": 0.15771428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28375834226608276,
      "learning_rate": 3.45704275117204e-07,
      "loss": 0.0,
      "num_tokens": 13920003.0,
      "reward": 0.9056442975997925,
      "reward_std": 0.5362595319747925,
      "rewards/cosine_scaled_reward/mean": -0.047177836298942566,
      "rewards/cosine_scaled_reward/std": 0.46028000116348267,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1131.96875,
      "completions/mean_terminated_length": 1001.107177734375,
      "completions/min_length": 278.0,
      "completions/min_terminated_length": 278.0,
      "epoch": 0.15885714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3277582824230194,
      "learning_rate": 3.387377967463493e-07,
      "loss": -0.0,
      "num_tokens": 14003089.0,
      "reward": 0.6589163541793823,
      "reward_std": 0.4049028754234314,
      "rewards/cosine_scaled_reward/mean": -0.17054180800914764,
      "rewards/cosine_scaled_reward/std": 0.36281341314315796,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 1074.421875,
      "completions/mean_terminated_length": 1009.5167236328125,
      "completions/min_length": 414.0,
      "completions/min_terminated_length": 414.0,
      "epoch": 0.16,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 0.2817178964614868,
      "learning_rate": 3.3183567088914833e-07,
      "loss": -0.0,
      "num_tokens": 14082076.0,
      "reward": 0.9182517528533936,
      "reward_std": 0.4315429925918579,
      "rewards/cosine_scaled_reward/mean": -0.033061616122722626,
      "rewards/cosine_scaled_reward/std": 0.483820378780365,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1011.375,
      "completions/mean_terminated_length": 942.2667236328125,
      "completions/min_length": 348.0,
      "completions/min_terminated_length": 348.0,
      "epoch": 0.16114285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2890068590641022,
      "learning_rate": 3.250000000000001e-07,
      "loss": -0.0,
      "num_tokens": 14157364.0,
      "reward": 0.9973877668380737,
      "reward_std": 0.4701315760612488,
      "rewards/cosine_scaled_reward/mean": -0.0013061091303825378,
      "rewards/cosine_scaled_reward/std": 0.47546684741973877,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1878.0,
      "completions/mean_length": 1245.0625,
      "completions/mean_terminated_length": 1096.370361328125,
      "completions/min_length": 226.0,
      "completions/min_terminated_length": 226.0,
      "epoch": 0.16228571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3200085163116455,
      "learning_rate": 3.182328662904756e-07,
      "loss": 0.0,
      "num_tokens": 14247552.0,
      "reward": 0.5833260416984558,
      "reward_std": 0.6680535674095154,
      "rewards/cosine_scaled_reward/mean": -0.1692744940519333,
      "rewards/cosine_scaled_reward/std": 0.3119296729564667,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 953.15625,
      "completions/mean_terminated_length": 935.77783203125,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 0.16342857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3159373104572296,
      "learning_rate": 3.115363310950578e-07,
      "loss": -0.0,
      "num_tokens": 14319370.0,
      "reward": 0.7112681269645691,
      "reward_std": 0.5509551167488098,
      "rewards/cosine_scaled_reward/mean": -0.14436593651771545,
      "rewards/cosine_scaled_reward/std": 0.3024492859840393,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1857.0,
      "completions/mean_length": 1130.140625,
      "completions/mean_terminated_length": 939.6415405273438,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.16457142857142856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.366451621055603,
      "learning_rate": 3.0491243424323783e-07,
      "loss": 0.0,
      "num_tokens": 14403187.0,
      "reward": 0.8458279967308044,
      "reward_std": 0.48854437470436096,
      "rewards/cosine_scaled_reward/mean": 0.0010389834642410278,
      "rewards/cosine_scaled_reward/std": 0.49104803800582886,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1815.0,
      "completions/mean_length": 901.28125,
      "completions/mean_terminated_length": 883.0794067382812,
      "completions/min_length": 216.0,
      "completions/min_terminated_length": 216.0,
      "epoch": 0.1657142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3450138568878174,
      "learning_rate": 2.9836319343816397e-07,
      "loss": -0.0,
      "num_tokens": 14470813.0,
      "reward": 1.018430471420288,
      "reward_std": 0.7617365717887878,
      "rewards/cosine_scaled_reward/mean": 0.009215235710144043,
      "rewards/cosine_scaled_reward/std": 0.5040040016174316,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1837.0,
      "completions/mean_length": 1048.78125,
      "completions/mean_terminated_length": 964.1016845703125,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.16685714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2851085364818573,
      "learning_rate": 2.918906036420294e-07,
      "loss": 0.0,
      "num_tokens": 14548551.0,
      "reward": 0.7197650671005249,
      "reward_std": 0.6007095575332642,
      "rewards/cosine_scaled_reward/mean": -0.12449245154857635,
      "rewards/cosine_scaled_reward/std": 0.3838319182395935,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 1127.25,
      "completions/mean_terminated_length": 1065.86669921875,
      "completions/min_length": 522.0,
      "completions/min_terminated_length": 522.0,
      "epoch": 0.168,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2944687306880951,
      "learning_rate": 2.854966364683872e-07,
      "loss": -0.0,
      "num_tokens": 14631479.0,
      "reward": 0.8103795051574707,
      "reward_std": 0.4803225100040436,
      "rewards/cosine_scaled_reward/mean": -0.08699773252010345,
      "rewards/cosine_scaled_reward/std": 0.4309064447879791,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1860.0,
      "completions/mean_length": 985.265625,
      "completions/mean_terminated_length": 854.75439453125,
      "completions/min_length": 383.0,
      "completions/min_terminated_length": 383.0,
      "epoch": 0.16914285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.316753625869751,
      "learning_rate": 2.791832395815782e-07,
      "loss": -0.0,
      "num_tokens": 14705720.0,
      "reward": 0.8408422470092773,
      "reward_std": 0.47007906436920166,
      "rewards/cosine_scaled_reward/mean": -0.06395385414361954,
      "rewards/cosine_scaled_reward/std": 0.39589446783065796,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1468.953125,
      "completions/mean_terminated_length": 1165.642822265625,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 0.1702857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2663600444793701,
      "learning_rate": 2.729523361034538e-07,
      "loss": 0.0,
      "num_tokens": 14811629.0,
      "reward": 0.49039748311042786,
      "reward_std": 0.7405215501785278,
      "rewards/cosine_scaled_reward/mean": -0.12980125844478607,
      "rewards/cosine_scaled_reward/std": 0.38436219096183777,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1840.0,
      "completions/mean_length": 1005.96875,
      "completions/mean_terminated_length": 954.7212524414062,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.17142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27496519684791565,
      "learning_rate": 2.6680582402757324e-07,
      "loss": -0.0,
      "num_tokens": 14886515.0,
      "reward": 0.8553354740142822,
      "reward_std": 0.7562883496284485,
      "rewards/cosine_scaled_reward/mean": -0.06451976299285889,
      "rewards/cosine_scaled_reward/std": 0.43835195899009705,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 1021.234375,
      "completions/mean_terminated_length": 853.2181396484375,
      "completions/min_length": 232.0,
      "completions/min_terminated_length": 232.0,
      "epoch": 0.17257142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3529910445213318,
      "learning_rate": 2.6074557564105724e-07,
      "loss": -0.0,
      "num_tokens": 14962554.0,
      "reward": 1.2106778621673584,
      "reward_std": 0.8269187211990356,
      "rewards/cosine_scaled_reward/mean": 0.1444014459848404,
      "rewards/cosine_scaled_reward/std": 0.5135605931282043,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 1046.015625,
      "completions/mean_terminated_length": 942.362060546875,
      "completions/min_length": 186.0,
      "completions/min_terminated_length": 186.0,
      "epoch": 0.1737142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3202276825904846,
      "learning_rate": 2.547734369542718e-07,
      "loss": -0.0,
      "num_tokens": 15040163.0,
      "reward": 0.6286852955818176,
      "reward_std": 0.6429262161254883,
      "rewards/cosine_scaled_reward/mean": -0.14659486711025238,
      "rewards/cosine_scaled_reward/std": 0.376168429851532,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1984.0,
      "completions/mean_length": 1092.078125,
      "completions/mean_terminated_length": 1045.0655517578125,
      "completions/min_length": 328.0,
      "completions/min_terminated_length": 328.0,
      "epoch": 0.17485714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30404749512672424,
      "learning_rate": 2.488912271385139e-07,
      "loss": -0.0,
      "num_tokens": 15121200.0,
      "reward": 0.489207923412323,
      "reward_std": 0.3634736239910126,
      "rewards/cosine_scaled_reward/mean": -0.2319585382938385,
      "rewards/cosine_scaled_reward/std": 0.1667877584695816,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1203.296875,
      "completions/mean_terminated_length": 1027.981201171875,
      "completions/min_length": 540.0,
      "completions/min_terminated_length": 540.0,
      "epoch": 0.176,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2990516126155853,
      "learning_rate": 2.4310073797187573e-07,
      "loss": -0.0,
      "num_tokens": 15209275.0,
      "reward": 0.6597298383712769,
      "reward_std": 0.6371828317642212,
      "rewards/cosine_scaled_reward/mean": -0.08419756591320038,
      "rewards/cosine_scaled_reward/std": 0.36733028292655945,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1794.0,
      "completions/mean_length": 949.9375,
      "completions/mean_terminated_length": 770.2545166015625,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.17714285714285713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.357664555311203,
      "learning_rate": 2.374037332934512e-07,
      "loss": 0.0,
      "num_tokens": 15280695.0,
      "reward": 0.5976670980453491,
      "reward_std": 0.5265668630599976,
      "rewards/cosine_scaled_reward/mean": -0.16991643607616425,
      "rewards/cosine_scaled_reward/std": 0.3645385801792145,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1218.578125,
      "completions/mean_terminated_length": 942.1041870117188,
      "completions/min_length": 475.0,
      "completions/min_terminated_length": 475.0,
      "epoch": 0.1782857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2834795415401459,
      "learning_rate": 2.3180194846605364e-07,
      "loss": 0.0,
      "num_tokens": 15368788.0,
      "reward": 0.6993736028671265,
      "reward_std": 0.7138185501098633,
      "rewards/cosine_scaled_reward/mean": -0.05656319856643677,
      "rewards/cosine_scaled_reward/std": 0.4233575463294983,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.39339789748191833,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1904.0,
      "completions/mean_length": 1047.828125,
      "completions/mean_terminated_length": 981.1500244140625,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 0.17942857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3224254250526428,
      "learning_rate": 2.2629708984760706e-07,
      "loss": -0.0,
      "num_tokens": 15445809.0,
      "reward": 0.5333588719367981,
      "reward_std": 0.6562705039978027,
      "rewards/cosine_scaled_reward/mean": -0.20207059383392334,
      "rewards/cosine_scaled_reward/std": 0.31774869561195374,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1920.0,
      "completions/mean_length": 929.453125,
      "completions/mean_terminated_length": 893.3709106445312,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "epoch": 0.18057142857142858,
      "frac_reward_zero_std": 0.125,
      "grad_norm": 0.2938927114009857,
      "learning_rate": 2.2089083427137329e-07,
      "loss": -0.0,
      "num_tokens": 15515414.0,
      "reward": 1.0217353105545044,
      "reward_std": 0.3362354636192322,
      "rewards/cosine_scaled_reward/mean": 0.010867662727832794,
      "rewards/cosine_scaled_reward/std": 0.4613310992717743,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 1150.15625,
      "completions/mean_terminated_length": 1090.300048828125,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 0.18171428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2798176407814026,
      "learning_rate": 2.1558482853517253e-07,
      "loss": -0.0,
      "num_tokens": 15600704.0,
      "reward": 0.6661320924758911,
      "reward_std": 0.6566643714904785,
      "rewards/cosine_scaled_reward/mean": -0.14349643886089325,
      "rewards/cosine_scaled_reward/std": 0.35907885432243347,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1877.0,
      "completions/mean_length": 964.6875,
      "completions/mean_terminated_length": 892.4667358398438,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 0.18285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.34724488854408264,
      "learning_rate": 2.1038068889975259e-07,
      "loss": -0.0,
      "num_tokens": 15674028.0,
      "reward": 1.087422490119934,
      "reward_std": 0.7433469295501709,
      "rewards/cosine_scaled_reward/mean": 0.059336259961128235,
      "rewards/cosine_scaled_reward/std": 0.4811782240867615,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1739.0,
      "completions/mean_length": 1075.484375,
      "completions/mean_terminated_length": 993.0678100585938,
      "completions/min_length": 311.0,
      "completions/min_terminated_length": 311.0,
      "epoch": 0.184,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.33432650566101074,
      "learning_rate": 2.0528000059645995e-07,
      "loss": 0.0,
      "num_tokens": 15752851.0,
      "reward": 0.9870826601982117,
      "reward_std": 0.8721657395362854,
      "rewards/cosine_scaled_reward/mean": 0.009166322648525238,
      "rewards/cosine_scaled_reward/std": 0.4949844777584076,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1593.0,
      "completions/mean_length": 1048.421875,
      "completions/mean_terminated_length": 963.7118530273438,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 0.18514285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3334272503852844,
      "learning_rate": 2.0028431734436308e-07,
      "loss": 0.0,
      "num_tokens": 15830254.0,
      "reward": 0.6991279721260071,
      "reward_std": 0.6646199822425842,
      "rewards/cosine_scaled_reward/mean": -0.11918601393699646,
      "rewards/cosine_scaled_reward/std": 0.336944580078125,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 1092.78125,
      "completions/mean_terminated_length": 872.34619140625,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 0.18628571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30620095133781433,
      "learning_rate": 1.9539516087697517e-07,
      "loss": -0.0,
      "num_tokens": 15911376.0,
      "reward": 1.2059270143508911,
      "reward_std": 0.7845113277435303,
      "rewards/cosine_scaled_reward/mean": 0.19671350717544556,
      "rewards/cosine_scaled_reward/std": 0.524649441242218,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.39339789748191833,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 941.140625,
      "completions/mean_terminated_length": 867.3500366210938,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 0.18742857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3235553801059723,
      "learning_rate": 1.9061402047871833e-07,
      "loss": -0.0,
      "num_tokens": 15982137.0,
      "reward": 0.8660029172897339,
      "reward_std": 0.4971213936805725,
      "rewards/cosine_scaled_reward/mean": -0.04356100410223007,
      "rewards/cosine_scaled_reward/std": 0.4514644742012024,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 1073.859375,
      "completions/mean_terminated_length": 973.086181640625,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 0.18857142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32722005248069763,
      "learning_rate": 1.8594235253127372e-07,
      "loss": -0.0,
      "num_tokens": 16062280.0,
      "reward": 1.0658413171768188,
      "reward_std": 0.37287360429763794,
      "rewards/cosine_scaled_reward/mean": 0.04854566603899002,
      "rewards/cosine_scaled_reward/std": 0.532474160194397,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1632.0,
      "completions/mean_length": 1209.6875,
      "completions/mean_terminated_length": 974.9599609375,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 0.18971428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2876857817173004,
      "learning_rate": 1.8138158006995363e-07,
      "loss": 0.0,
      "num_tokens": 16151276.0,
      "reward": 0.5080017447471619,
      "reward_std": 0.5052056312561035,
      "rewards/cosine_scaled_reward/mean": -0.14443661272525787,
      "rewards/cosine_scaled_reward/std": 0.3009922206401825,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1636.0,
      "completions/mean_length": 1010.515625,
      "completions/mean_terminated_length": 922.5932006835938,
      "completions/min_length": 518.0,
      "completions/min_terminated_length": 518.0,
      "epoch": 0.19085714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24922896921634674,
      "learning_rate": 1.7693309235023127e-07,
      "loss": 0.0,
      "num_tokens": 16227493.0,
      "reward": 0.6374216079711914,
      "reward_std": 0.5959868431091309,
      "rewards/cosine_scaled_reward/mean": -0.1500391960144043,
      "rewards/cosine_scaled_reward/std": 0.3088480234146118,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 991.953125,
      "completions/mean_terminated_length": 975.1905517578125,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 0.192,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28734132647514343,
      "learning_rate": 1.7259824442455923e-07,
      "loss": -0.0,
      "num_tokens": 16301818.0,
      "reward": 0.979953408241272,
      "reward_std": 0.515397310256958,
      "rewards/cosine_scaled_reward/mean": -0.010023288428783417,
      "rewards/cosine_scaled_reward/std": 0.4792404770851135,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1570.0,
      "completions/mean_length": 947.0,
      "completions/mean_terminated_length": 911.4838256835938,
      "completions/min_length": 529.0,
      "completions/min_terminated_length": 529.0,
      "epoch": 0.19314285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2832062244415283,
      "learning_rate": 1.6837835672960831e-07,
      "loss": -0.0,
      "num_tokens": 16373050.0,
      "reward": 1.5313034057617188,
      "reward_std": 0.6551711559295654,
      "rewards/cosine_scaled_reward/mean": 0.26565176248550415,
      "rewards/cosine_scaled_reward/std": 0.5679692625999451,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1002.859375,
      "completions/mean_terminated_length": 853.5535888671875,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.19428571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3436959385871887,
      "learning_rate": 1.6427471468404952e-07,
      "loss": -0.0,
      "num_tokens": 16447017.0,
      "reward": 0.7409926056861877,
      "reward_std": 0.45915085077285767,
      "rewards/cosine_scaled_reward/mean": -0.06700369715690613,
      "rewards/cosine_scaled_reward/std": 0.45885714888572693,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1571.0,
      "completions/mean_length": 1204.078125,
      "completions/mean_terminated_length": 873.8478393554688,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 0.19542857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.267551064491272,
      "learning_rate": 1.6028856829700258e-07,
      "loss": 0.0,
      "num_tokens": 16534470.0,
      "reward": 0.7415462732315063,
      "reward_std": 0.5762134790420532,
      "rewards/cosine_scaled_reward/mean": -0.004226889461278915,
      "rewards/cosine_scaled_reward/std": 0.4231807589530945,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 1047.71875,
      "completions/mean_terminated_length": 998.5245361328125,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 0.19657142857142856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3394479751586914,
      "learning_rate": 1.5642113178727193e-07,
      "loss": 0.0,
      "num_tokens": 16612140.0,
      "reward": 1.3958909511566162,
      "reward_std": 0.5264730453491211,
      "rewards/cosine_scaled_reward/mean": 0.2057579606771469,
      "rewards/cosine_scaled_reward/std": 0.5397146940231323,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1870.0,
      "completions/mean_length": 884.90625,
      "completions/mean_terminated_length": 807.36669921875,
      "completions/min_length": 292.0,
      "completions/min_terminated_length": 292.0,
      "epoch": 0.1977142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3885309100151062,
      "learning_rate": 1.5267358321348285e-07,
      "loss": -0.0,
      "num_tokens": 16680086.0,
      "reward": 0.8123934268951416,
      "reward_std": 0.6085149049758911,
      "rewards/cosine_scaled_reward/mean": -0.0781782865524292,
      "rewards/cosine_scaled_reward/std": 0.40098583698272705,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 998.46875,
      "completions/mean_terminated_length": 946.8524169921875,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 0.19885714285714284,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3092297315597534,
      "learning_rate": 1.4904706411523448e-07,
      "loss": 0.0,
      "num_tokens": 16754324.0,
      "reward": 1.0309419631958008,
      "reward_std": 0.710152804851532,
      "rewards/cosine_scaled_reward/mean": 0.023283500224351883,
      "rewards/cosine_scaled_reward/std": 0.48326122760772705,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 988.515625,
      "completions/mean_terminated_length": 936.4097900390625,
      "completions/min_length": 381.0,
      "completions/min_terminated_length": 381.0,
      "epoch": 0.2,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2942121922969818,
      "learning_rate": 1.4554267916537495e-07,
      "loss": -0.0,
      "num_tokens": 16828413.0,
      "reward": 0.9853310585021973,
      "reward_std": 0.4881608486175537,
      "rewards/cosine_scaled_reward/mean": 0.008290551602840424,
      "rewards/cosine_scaled_reward/std": 0.4611859917640686,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 1040.375,
      "completions/mean_terminated_length": 916.631591796875,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.20114285714285715,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3189985752105713,
      "learning_rate": 1.4216149583350755e-07,
      "loss": -0.0,
      "num_tokens": 16906165.0,
      "reward": 0.715010404586792,
      "reward_std": 0.6412252187728882,
      "rewards/cosine_scaled_reward/mean": -0.10343232750892639,
      "rewards/cosine_scaled_reward/std": 0.37575584650039673,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27048972249031067,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1705.0,
      "completions/mean_length": 942.484375,
      "completions/mean_terminated_length": 924.9365844726562,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 0.2022857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.36539387702941895,
      "learning_rate": 1.3890454406082956e-07,
      "loss": -0.0,
      "num_tokens": 16976708.0,
      "reward": 1.1415762901306152,
      "reward_std": 0.6199163198471069,
      "rewards/cosine_scaled_reward/mean": 0.07860065996646881,
      "rewards/cosine_scaled_reward/std": 0.48708510398864746,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 1029.21875,
      "completions/mean_terminated_length": 961.300048828125,
      "completions/min_length": 407.0,
      "completions/min_terminated_length": 407.0,
      "epoch": 0.20342857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30215874314308167,
      "learning_rate": 1.3577281594640182e-07,
      "loss": -0.0,
      "num_tokens": 17054114.0,
      "reward": 1.04861319065094,
      "reward_std": 0.6202192306518555,
      "rewards/cosine_scaled_reward/mean": 0.04774410277605057,
      "rewards/cosine_scaled_reward/std": 0.5067017674446106,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.21304203569889069,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1255.328125,
      "completions/mean_terminated_length": 1033.3800048828125,
      "completions/min_length": 514.0,
      "completions/min_terminated_length": 514.0,
      "epoch": 0.20457142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2957271933555603,
      "learning_rate": 1.3276726544494571e-07,
      "loss": -0.0,
      "num_tokens": 17145287.0,
      "reward": 0.5679646730422974,
      "reward_std": 0.5812042951583862,
      "rewards/cosine_scaled_reward/mean": -0.12226766347885132,
      "rewards/cosine_scaled_reward/std": 0.30705568194389343,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.39339789748191833,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1005.6875,
      "completions/mean_terminated_length": 897.862060546875,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 0.2057142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32567262649536133,
      "learning_rate": 1.2988880807625927e-07,
      "loss": -0.0,
      "num_tokens": 17220827.0,
      "reward": 1.4265122413635254,
      "reward_std": 0.8014136552810669,
      "rewards/cosine_scaled_reward/mean": 0.22888115048408508,
      "rewards/cosine_scaled_reward/std": 0.5195574164390564,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1843.0,
      "completions/mean_length": 1158.75,
      "completions/mean_terminated_length": 994.0740966796875,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 0.20685714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2997998297214508,
      "learning_rate": 1.2713832064634125e-07,
      "loss": -0.0,
      "num_tokens": 17306459.0,
      "reward": 0.7811408042907715,
      "reward_std": 0.5742530822753906,
      "rewards/cosine_scaled_reward/mean": -0.04692957177758217,
      "rewards/cosine_scaled_reward/std": 0.47856900095939636,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 1189.953125,
      "completions/mean_terminated_length": 903.9375,
      "completions/min_length": 297.0,
      "completions/min_terminated_length": 297.0,
      "epoch": 0.208,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27360063791275024,
      "learning_rate": 1.2451664098030743e-07,
      "loss": -0.0,
      "num_tokens": 17392408.0,
      "reward": 0.4934787154197693,
      "reward_std": 0.638634204864502,
      "rewards/cosine_scaled_reward/mean": -0.15169814229011536,
      "rewards/cosine_scaled_reward/std": 0.3459075689315796,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 977.84375,
      "completions/mean_terminated_length": 867.137939453125,
      "completions/min_length": 185.0,
      "completions/min_terminated_length": 185.0,
      "epoch": 0.20914285714285713,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.31348717212677,
      "learning_rate": 1.220245676671809e-07,
      "loss": 0.0,
      "num_tokens": 17465606.0,
      "reward": 1.1325675249099731,
      "reward_std": 0.7940603494644165,
      "rewards/cosine_scaled_reward/mean": 0.07409626245498657,
      "rewards/cosine_scaled_reward/std": 0.48065999150276184,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1845.0,
      "completions/mean_length": 1064.8125,
      "completions/mean_terminated_length": 963.1034545898438,
      "completions/min_length": 123.0,
      "completions/min_terminated_length": 123.0,
      "epoch": 0.2102857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4040955603122711,
      "learning_rate": 1.1966285981663407e-07,
      "loss": 0.0,
      "num_tokens": 17545106.0,
      "reward": 0.7675222754478455,
      "reward_std": 0.5715835094451904,
      "rewards/cosine_scaled_reward/mean": -0.10842636972665787,
      "rewards/cosine_scaled_reward/std": 0.3608429729938507,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 1129.375,
      "completions/mean_terminated_length": 938.7169799804688,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.21142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3105817437171936,
      "learning_rate": 1.1743223682775649e-07,
      "loss": -0.0,
      "num_tokens": 17627570.0,
      "reward": 0.7331613302230835,
      "reward_std": 0.7032245993614197,
      "rewards/cosine_scaled_reward/mean": -0.07873181998729706,
      "rewards/cosine_scaled_reward/std": 0.4445188641548157,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1989.0,
      "completions/mean_length": 1137.9375,
      "completions/mean_terminated_length": 969.4074096679688,
      "completions/min_length": 538.0,
      "completions/min_terminated_length": 538.0,
      "epoch": 0.21257142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2835804224014282,
      "learning_rate": 1.1533337816991931e-07,
      "loss": 0.0,
      "num_tokens": 17711094.0,
      "reward": 0.9186009764671326,
      "reward_std": 0.6486971974372864,
      "rewards/cosine_scaled_reward/mean": 0.029613006860017776,
      "rewards/cosine_scaled_reward/std": 0.5038316249847412,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 1070.71875,
      "completions/mean_terminated_length": 987.8983154296875,
      "completions/min_length": 251.0,
      "completions/min_terminated_length": 251.0,
      "epoch": 0.21371428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3467414379119873,
      "learning_rate": 1.1336692317580158e-07,
      "loss": -0.0,
      "num_tokens": 17790044.0,
      "reward": 0.7328593730926514,
      "reward_std": 0.37134072184562683,
      "rewards/cosine_scaled_reward/mean": -0.12575779855251312,
      "rewards/cosine_scaled_reward/std": 0.4253794848918915,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1122.578125,
      "completions/mean_terminated_length": 1107.888916015625,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 0.21485714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2971196472644806,
      "learning_rate": 1.1153347084664419e-07,
      "loss": 0.0,
      "num_tokens": 17873689.0,
      "reward": 0.5169162750244141,
      "reward_std": 0.4035249650478363,
      "rewards/cosine_scaled_reward/mean": -0.23372937738895416,
      "rewards/cosine_scaled_reward/std": 0.2780001759529114,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 901.78125,
      "completions/mean_terminated_length": 804.64404296875,
      "completions/min_length": 119.0,
      "completions/min_terminated_length": 119.0,
      "epoch": 0.216,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.4040164351463318,
      "learning_rate": 1.0983357966978745e-07,
      "loss": -0.0,
      "num_tokens": 17940691.0,
      "reward": 0.6845261454582214,
      "reward_std": 0.575655460357666,
      "rewards/cosine_scaled_reward/mean": -0.14992442727088928,
      "rewards/cosine_scaled_reward/std": 0.402413934469223,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1892.0,
      "completions/mean_length": 985.53125,
      "completions/mean_terminated_length": 933.2786254882812,
      "completions/min_length": 466.0,
      "completions/min_terminated_length": 466.0,
      "epoch": 0.21714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2768203318119049,
      "learning_rate": 1.0826776744855121e-07,
      "loss": -0.0,
      "num_tokens": 18013533.0,
      "reward": 1.1712180376052856,
      "reward_std": 0.7703711986541748,
      "rewards/cosine_scaled_reward/mean": 0.09342151135206223,
      "rewards/cosine_scaled_reward/std": 0.5197325944900513,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 941.796875,
      "completions/mean_terminated_length": 924.2381591796875,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 0.21828571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.32142648100852966,
      "learning_rate": 1.068365111445064e-07,
      "loss": 0.0,
      "num_tokens": 18084096.0,
      "reward": 1.1099364757537842,
      "reward_std": 0.8239980936050415,
      "rewards/cosine_scaled_reward/mean": 0.06278076767921448,
      "rewards/cosine_scaled_reward/std": 0.504546046257019,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.125,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 1316.078125,
      "completions/mean_terminated_length": 1164.1697998046875,
      "completions/min_length": 490.0,
      "completions/min_terminated_length": 490.0,
      "epoch": 0.21942857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2582016587257385,
      "learning_rate": 1.0554024673218806e-07,
      "loss": -0.0,
      "num_tokens": 18179621.0,
      "reward": 0.5557677745819092,
      "reward_std": 0.47908520698547363,
      "rewards/cosine_scaled_reward/mean": -0.1596161425113678,
      "rewards/cosine_scaled_reward/std": 0.34955671429634094,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3333333432674408,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 1283.421875,
      "completions/mean_terminated_length": 1106.9808349609375,
      "completions/min_length": 523.0,
      "completions/min_terminated_length": 523.0,
      "epoch": 0.22057142857142858,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24153046309947968,
      "learning_rate": 1.0437936906629334e-07,
      "loss": 0.0,
      "num_tokens": 18272904.0,
      "reward": 0.858095109462738,
      "reward_std": 0.6822292804718018,
      "rewards/cosine_scaled_reward/mean": 0.014985032379627228,
      "rewards/cosine_scaled_reward/std": 0.4240723252296448,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 1341.828125,
      "completions/mean_terminated_length": 1065.5,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 0.22171428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2911985516548157,
      "learning_rate": 1.0335423176140511e-07,
      "loss": 0.0,
      "num_tokens": 18370141.0,
      "reward": 0.8254599571228027,
      "reward_std": 0.6828211545944214,
      "rewards/cosine_scaled_reward/mean": 0.01429247111082077,
      "rewards/cosine_scaled_reward/std": 0.43286019563674927,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1125.015625,
      "completions/mean_terminated_length": 1046.796630859375,
      "completions/min_length": 460.0,
      "completions/min_terminated_length": 460.0,
      "epoch": 0.22285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2628892660140991,
      "learning_rate": 1.0246514708427701e-07,
      "loss": 0.0,
      "num_tokens": 18453286.0,
      "reward": 0.9133278131484985,
      "reward_std": 0.7113606929779053,
      "rewards/cosine_scaled_reward/mean": -0.012086104601621628,
      "rewards/cosine_scaled_reward/std": 0.48524293303489685,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1865.0,
      "completions/mean_length": 1190.984375,
      "completions/mean_terminated_length": 1102.32763671875,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 0.224,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.29825979471206665,
      "learning_rate": 1.017123858587145e-07,
      "loss": 0.0,
      "num_tokens": 18541101.0,
      "reward": 0.7271528244018555,
      "reward_std": 0.7205462455749512,
      "rewards/cosine_scaled_reward/mean": -0.08173607289791107,
      "rewards/cosine_scaled_reward/std": 0.42802518606185913,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 906.96875,
      "completions/mean_terminated_length": 888.857177734375,
      "completions/min_length": 386.0,
      "completions/min_terminated_length": 386.0,
      "epoch": 0.22514285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3493105173110962,
      "learning_rate": 1.0109617738307911e-07,
      "loss": -0.0,
      "num_tokens": 18609515.0,
      "reward": 1.2718065977096558,
      "reward_std": 0.7709304094314575,
      "rewards/cosine_scaled_reward/mean": 0.13590331375598907,
      "rewards/cosine_scaled_reward/std": 0.5550388097763062,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1956.0,
      "completions/mean_length": 1087.234375,
      "completions/mean_terminated_length": 930.0181274414062,
      "completions/min_length": 306.0,
      "completions/min_terminated_length": 306.0,
      "epoch": 0.22628571428571428,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.30146288871765137,
      "learning_rate": 1.0061670936044178e-07,
      "loss": 0.0,
      "num_tokens": 18690602.0,
      "reward": 0.7452408075332642,
      "reward_std": 0.5873350501060486,
      "rewards/cosine_scaled_reward/mean": -0.11175459623336792,
      "rewards/cosine_scaled_reward/std": 0.3918561339378357,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1893.0,
      "completions/mean_length": 1234.953125,
      "completions/mean_terminated_length": 1135.105224609375,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 0.22742857142857142,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.267063170671463,
      "learning_rate": 1.002741278414069e-07,
      "loss": -0.0,
      "num_tokens": 18781143.0,
      "reward": 0.5316354036331177,
      "reward_std": 0.5563629269599915,
      "rewards/cosine_scaled_reward/mean": -0.20293226838111877,
      "rewards/cosine_scaled_reward/std": 0.2751743197441101,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 1032.4375,
      "completions/mean_terminated_length": 946.3728637695312,
      "completions/min_length": 464.0,
      "completions/min_terminated_length": 464.0,
      "epoch": 0.22857142857142856,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28624024987220764,
      "learning_rate": 1.0006853717962393e-07,
      "loss": -0.0,
      "num_tokens": 18857339.0,
      "reward": 1.3084194660186768,
      "reward_std": 0.7574798464775085,
      "rewards/cosine_scaled_reward/mean": 0.16983476281166077,
      "rewards/cosine_scaled_reward/std": 0.5382829308509827,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.17536810040473938,
      "step": 200
    },
    {
      "epoch": 0.22857142857142856,
      "step": 200,
      "total_flos": 0.0,
      "train_loss": -9.033828973770141e-10,
      "train_runtime": 10084.0613,
      "train_samples_per_second": 1.269,
      "train_steps_per_second": 0.02
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 18857339,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}