{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.05714285714285714,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1734.0,
      "completions/mean_length": 1702.03125,
      "completions/mean_terminated_length": 993.6190795898438,
      "completions/min_length": 483.0,
      "completions/min_terminated_length": 483.0,
      "epoch": 0.001142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25444135069847107,
      "learning_rate": 0.0,
      "loss": -0.0,
      "num_tokens": 118418.0,
      "reward": 0.17899775505065918,
      "reward_std": 0.7650213241577148,
      "rewards/cosine_scaled_reward/mean": -0.09800112992525101,
      "rewards/cosine_scaled_reward/std": 0.37953105568885803,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1738.90625,
      "completions/mean_terminated_length": 949.0,
      "completions/min_length": 435.0,
      "completions/min_terminated_length": 435.0,
      "epoch": 0.002285714285714286,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24364067614078522,
      "learning_rate": 5e-08,
      "loss": -0.0,
      "num_tokens": 239748.0,
      "reward": 0.3848632574081421,
      "reward_std": 0.9111153483390808,
      "rewards/cosine_scaled_reward/mean": 0.020556632429361343,
      "rewards/cosine_scaled_reward/std": 0.4492928683757782,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1405.0,
      "completions/mean_length": 1930.609375,
      "completions/mean_terminated_length": 974.71435546875,
      "completions/min_length": 477.0,
      "completions/min_terminated_length": 477.0,
      "epoch": 0.0034285714285714284,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27834266424179077,
      "learning_rate": 1e-07,
      "loss": -0.0,
      "num_tokens": 373779.0,
      "reward": -0.3227587938308716,
      "reward_std": 0.45940712094306946,
      "rewards/cosine_scaled_reward/mean": -0.2160668969154358,
      "rewards/cosine_scaled_reward/std": 0.21890601515769958,
      "rewards/format_reward/mean": 0.109375,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.515625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 1596.75,
      "completions/mean_terminated_length": 1116.3870849609375,
      "completions/min_length": 474.0,
      "completions/min_terminated_length": 474.0,
      "epoch": 0.004571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2799243628978729,
      "learning_rate": 1.5e-07,
      "loss": -0.0,
      "num_tokens": 485779.0,
      "reward": 0.27003082633018494,
      "reward_std": 0.7608597874641418,
      "rewards/cosine_scaled_reward/mean": -0.11498458683490753,
      "rewards/cosine_scaled_reward/std": 0.36645373702049255,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5039526224136353,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.96875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1755.0,
      "completions/mean_length": 2035.46875,
      "completions/mean_terminated_length": 1647.0,
      "completions/min_length": 1539.0,
      "completions/min_terminated_length": 1539.0,
      "epoch": 0.005714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24311119318008423,
      "learning_rate": 2e-07,
      "loss": -0.0,
      "num_tokens": 626865.0,
      "reward": -0.4839385151863098,
      "reward_std": 0.34498828649520874,
      "rewards/cosine_scaled_reward/mean": -0.2732192277908325,
      "rewards/cosine_scaled_reward/std": 0.18402352929115295,
      "rewards/format_reward/mean": 0.0625,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1862.0,
      "completions/mean_length": 1884.109375,
      "completions/mean_terminated_length": 882.5555419921875,
      "completions/min_length": 524.0,
      "completions/min_terminated_length": 524.0,
      "epoch": 0.006857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2741600275039673,
      "learning_rate": 2.5e-07,
      "loss": -0.0,
      "num_tokens": 759096.0,
      "reward": -0.2049689143896103,
      "reward_std": 0.639178991317749,
      "rewards/cosine_scaled_reward/mean": -0.18060946464538574,
      "rewards/cosine_scaled_reward/std": 0.2599981129169464,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1795.0,
      "completions/mean_length": 1959.84375,
      "completions/mean_terminated_length": 1342.75,
      "completions/min_length": 974.0,
      "completions/min_terminated_length": 974.0,
      "epoch": 0.008,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21986258029937744,
      "learning_rate": 3e-07,
      "loss": -0.0,
      "num_tokens": 894934.0,
      "reward": -0.11210991442203522,
      "reward_std": 0.6349427103996277,
      "rewards/cosine_scaled_reward/mean": -0.14199243485927582,
      "rewards/cosine_scaled_reward/std": 0.3749195337295532,
      "rewards/format_reward/mean": 0.171875,
      "rewards/format_reward/std": 0.38025420904159546,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1895.0,
      "completions/mean_length": 1717.78125,
      "completions/mean_terminated_length": 873.888916015625,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.009142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23102505505084991,
      "learning_rate": 3.5e-07,
      "loss": -0.0,
      "num_tokens": 1015288.0,
      "reward": 0.12653985619544983,
      "reward_std": 0.4742490351200104,
      "rewards/cosine_scaled_reward/mean": -0.09298005700111389,
      "rewards/cosine_scaled_reward/std": 0.39157670736312866,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.828125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1815.0,
      "completions/mean_length": 1928.53125,
      "completions/mean_terminated_length": 1352.9091796875,
      "completions/min_length": 999.0,
      "completions/min_terminated_length": 999.0,
      "epoch": 0.010285714285714285,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2600339353084564,
      "learning_rate": 4e-07,
      "loss": -0.0,
      "num_tokens": 1150170.0,
      "reward": -0.14216071367263794,
      "reward_std": 0.702994704246521,
      "rewards/cosine_scaled_reward/mean": -0.17264285683631897,
      "rewards/cosine_scaled_reward/std": 0.33145979046821594,
      "rewards/format_reward/mean": 0.203125,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1432.0,
      "completions/mean_length": 1699.84375,
      "completions/mean_terminated_length": 810.1111450195312,
      "completions/min_length": 337.0,
      "completions/min_terminated_length": 337.0,
      "epoch": 0.011428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2749471366405487,
      "learning_rate": 4.5e-07,
      "loss": -0.0,
      "num_tokens": 1269792.0,
      "reward": -0.13922849297523499,
      "reward_std": 0.4937349855899811,
      "rewards/cosine_scaled_reward/mean": -0.2102392464876175,
      "rewards/cosine_scaled_reward/std": 0.30274781584739685,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1614.0,
      "completions/mean_length": 1994.453125,
      "completions/mean_terminated_length": 1191.25,
      "completions/min_length": 916.0,
      "completions/min_terminated_length": 916.0,
      "epoch": 0.012571428571428572,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2305486500263214,
      "learning_rate": 5e-07,
      "loss": 0.0,
      "num_tokens": 1409109.0,
      "reward": -0.39525067806243896,
      "reward_std": 0.3650783896446228,
      "rewards/cosine_scaled_reward/mean": -0.2288753092288971,
      "rewards/cosine_scaled_reward/std": 0.22182811796665192,
      "rewards/format_reward/mean": 0.0625,
      "rewards/format_reward/std": 0.24397502839565277,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 1706.6875,
      "completions/mean_terminated_length": 1174.239990234375,
      "completions/min_length": 319.0,
      "completions/min_terminated_length": 319.0,
      "epoch": 0.013714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2918066382408142,
      "learning_rate": 5.5e-07,
      "loss": 0.0,
      "num_tokens": 1529281.0,
      "reward": 0.08787664026021957,
      "reward_std": 0.7579531073570251,
      "rewards/cosine_scaled_reward/mean": -0.18262416124343872,
      "rewards/cosine_scaled_reward/std": 0.37901216745376587,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 1820.828125,
      "completions/mean_terminated_length": 1078.7333984375,
      "completions/min_length": 527.0,
      "completions/min_terminated_length": 527.0,
      "epoch": 0.014857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27849823236465454,
      "learning_rate": 6e-07,
      "loss": -0.0,
      "num_tokens": 1656854.0,
      "reward": 0.03077489137649536,
      "reward_std": 0.6479229927062988,
      "rewards/cosine_scaled_reward/mean": -0.12523755431175232,
      "rewards/cosine_scaled_reward/std": 0.34234777092933655,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 1772.296875,
      "completions/mean_terminated_length": 1165.75,
      "completions/min_length": 605.0,
      "completions/min_terminated_length": 605.0,
      "epoch": 0.016,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25037428736686707,
      "learning_rate": 6.5e-07,
      "loss": 0.0,
      "num_tokens": 1780889.0,
      "reward": 0.3261271119117737,
      "reward_std": 0.6276673078536987,
      "rewards/cosine_scaled_reward/mean": -0.008811453357338905,
      "rewards/cosine_scaled_reward/std": 0.46767035126686096,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 1715.5625,
      "completions/mean_terminated_length": 928.2105102539062,
      "completions/min_length": 413.0,
      "completions/min_terminated_length": 413.0,
      "epoch": 0.017142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26902371644973755,
      "learning_rate": 7e-07,
      "loss": -0.0,
      "num_tokens": 1901605.0,
      "reward": 0.3007117211818695,
      "reward_std": 0.3918319642543793,
      "rewards/cosine_scaled_reward/mean": -0.005894124507904053,
      "rewards/cosine_scaled_reward/std": 0.4677385091781616,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.984375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 757.0,
      "completions/mean_length": 2027.828125,
      "completions/mean_terminated_length": 757.0,
      "completions/min_length": 757.0,
      "completions/min_terminated_length": 757.0,
      "epoch": 0.018285714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25064370036125183,
      "learning_rate": 7.5e-07,
      "loss": -0.0,
      "num_tokens": 2041826.0,
      "reward": -0.499896764755249,
      "reward_std": 0.34189552068710327,
      "rewards/cosine_scaled_reward/mean": -0.2577608823776245,
      "rewards/cosine_scaled_reward/std": 0.18115636706352234,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.125,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1816.0,
      "completions/mean_length": 1530.796875,
      "completions/mean_terminated_length": 906.586181640625,
      "completions/min_length": 378.0,
      "completions/min_terminated_length": 378.0,
      "epoch": 0.019428571428571427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3018316924571991,
      "learning_rate": 8e-07,
      "loss": -0.0,
      "num_tokens": 2150317.0,
      "reward": 0.23110359907150269,
      "reward_std": 0.6260336637496948,
      "rewards/cosine_scaled_reward/mean": -0.12663568556308746,
      "rewards/cosine_scaled_reward/std": 0.39377179741859436,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1689.0,
      "completions/mean_length": 1813.671875,
      "completions/mean_terminated_length": 894.3846435546875,
      "completions/min_length": 505.0,
      "completions/min_terminated_length": 505.0,
      "epoch": 0.02057142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23236438632011414,
      "learning_rate": 8.499999999999999e-07,
      "loss": 0.0,
      "num_tokens": 2276768.0,
      "reward": -0.10029121488332748,
      "reward_std": 0.7172800302505493,
      "rewards/cosine_scaled_reward/mean": -0.18295811116695404,
      "rewards/cosine_scaled_reward/std": 0.3038564622402191,
      "rewards/format_reward/mean": 0.265625,
      "rewards/format_reward/std": 0.44515693187713623,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.78125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1697.0,
      "completions/mean_length": 1843.15625,
      "completions/mean_terminated_length": 1111.571533203125,
      "completions/min_length": 484.0,
      "completions/min_terminated_length": 484.0,
      "epoch": 0.021714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2313074916601181,
      "learning_rate": 9e-07,
      "loss": -0.0,
      "num_tokens": 2405986.0,
      "reward": 0.09310440719127655,
      "reward_std": 0.7020131349563599,
      "rewards/cosine_scaled_reward/mean": -0.08626029640436172,
      "rewards/cosine_scaled_reward/std": 0.44063708186149597,
      "rewards/format_reward/mean": 0.265625,
      "rewards/format_reward/std": 0.44515693187713623,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.578125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1676.0,
      "completions/mean_length": 1523.03125,
      "completions/mean_terminated_length": 803.629638671875,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.022857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.295642226934433,
      "learning_rate": 9.499999999999999e-07,
      "loss": -0.0,
      "num_tokens": 2514812.0,
      "reward": 0.3644811511039734,
      "reward_std": 0.7943294048309326,
      "rewards/cosine_scaled_reward/mean": -0.03650941699743271,
      "rewards/cosine_scaled_reward/std": 0.44610291719436646,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.5,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 1793.328125,
      "completions/mean_terminated_length": 1190.157958984375,
      "completions/min_length": 455.0,
      "completions/min_terminated_length": 455.0,
      "epoch": 0.024,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2961376905441284,
      "learning_rate": 1e-06,
      "loss": -0.0,
      "num_tokens": 2640393.0,
      "reward": 0.06134350597858429,
      "reward_std": 0.6498202085494995,
      "rewards/cosine_scaled_reward/mean": -0.14120325446128845,
      "rewards/cosine_scaled_reward/std": 0.3548509478569031,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1925.0,
      "completions/mean_length": 1386.75,
      "completions/mean_terminated_length": 872.4444580078125,
      "completions/min_length": 271.0,
      "completions/min_terminated_length": 271.0,
      "epoch": 0.025142857142857144,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.34918317198753357,
      "learning_rate": 9.99931462820376e-07,
      "loss": 0.0,
      "num_tokens": 2738161.0,
      "reward": 0.5064569711685181,
      "reward_std": 0.7104054689407349,
      "rewards/cosine_scaled_reward/mean": -0.035834040492773056,
      "rewards/cosine_scaled_reward/std": 0.4265843331813812,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.59375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1986.0,
      "completions/mean_length": 1656.59375,
      "completions/mean_terminated_length": 1084.5384521484375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.026285714285714287,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26697081327438354,
      "learning_rate": 9.997258721585931e-07,
      "loss": -0.0,
      "num_tokens": 2854975.0,
      "reward": 0.2737857699394226,
      "reward_std": 0.6956006288528442,
      "rewards/cosine_scaled_reward/mean": -0.0896696150302887,
      "rewards/cosine_scaled_reward/std": 0.3913433253765106,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.501733124256134,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 1790.421875,
      "completions/mean_terminated_length": 1223.75,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 0.027428571428571427,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24950818717479706,
      "learning_rate": 9.993832906395582e-07,
      "loss": -0.0,
      "num_tokens": 2980490.0,
      "reward": -0.08990197628736496,
      "reward_std": 0.7724581956863403,
      "rewards/cosine_scaled_reward/mean": -0.21682599186897278,
      "rewards/cosine_scaled_reward/std": 0.35711658000946045,
      "rewards/format_reward/mean": 0.34375,
      "rewards/format_reward/std": 0.4787135720252991,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 1703.953125,
      "completions/mean_terminated_length": 889.1052856445312,
      "completions/min_length": 427.0,
      "completions/min_terminated_length": 427.0,
      "epoch": 0.02857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28078693151474,
      "learning_rate": 9.989038226169207e-07,
      "loss": 0.0,
      "num_tokens": 3099839.0,
      "reward": -0.12643180787563324,
      "reward_std": 0.6687923669815063,
      "rewards/cosine_scaled_reward/mean": -0.21946589648723602,
      "rewards/cosine_scaled_reward/std": 0.30431970953941345,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 1938.078125,
      "completions/mean_terminated_length": 1608.3125,
      "completions/min_length": 1087.0,
      "completions/min_terminated_length": 1087.0,
      "epoch": 0.029714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.21486051380634308,
      "learning_rate": 9.982876141412855e-07,
      "loss": 0.0,
      "num_tokens": 3234508.0,
      "reward": 0.05503671616315842,
      "reward_std": 0.6532000303268433,
      "rewards/cosine_scaled_reward/mean": -0.1287316530942917,
      "rewards/cosine_scaled_reward/std": 0.36068078875541687,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1896.375,
      "completions/mean_terminated_length": 1401.0667724609375,
      "completions/min_length": 568.0,
      "completions/min_terminated_length": 568.0,
      "epoch": 0.030857142857142857,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2675936222076416,
      "learning_rate": 9.975348529157229e-07,
      "loss": -0.0,
      "num_tokens": 3366164.0,
      "reward": -0.02987336739897728,
      "reward_std": 0.5919089913368225,
      "rewards/cosine_scaled_reward/mean": -0.1633741855621338,
      "rewards/cosine_scaled_reward/std": 0.3508918881416321,
      "rewards/format_reward/mean": 0.296875,
      "rewards/format_reward/std": 0.4604927599430084,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 1832.96875,
      "completions/mean_terminated_length": 901.1666870117188,
      "completions/min_length": 450.0,
      "completions/min_terminated_length": 450.0,
      "epoch": 0.032,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2518174946308136,
      "learning_rate": 9.96645768238595e-07,
      "loss": 0.0,
      "num_tokens": 3493810.0,
      "reward": 0.08577289432287216,
      "reward_std": 0.6993601322174072,
      "rewards/cosine_scaled_reward/mean": -0.08211354911327362,
      "rewards/cosine_scaled_reward/std": 0.45168522000312805,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1779.0,
      "completions/mean_length": 1921.1875,
      "completions/mean_terminated_length": 1146.2222900390625,
      "completions/min_length": 710.0,
      "completions/min_terminated_length": 710.0,
      "epoch": 0.03314285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25027790665626526,
      "learning_rate": 9.956206309337066e-07,
      "loss": -0.0,
      "num_tokens": 3627238.0,
      "reward": -0.3098237216472626,
      "reward_std": 0.4339829087257385,
      "rewards/cosine_scaled_reward/mean": -0.2330368608236313,
      "rewards/cosine_scaled_reward/std": 0.17332859337329865,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 1891.109375,
      "completions/mean_terminated_length": 1490.1666259765625,
      "completions/min_length": 401.0,
      "completions/min_terminated_length": 401.0,
      "epoch": 0.03428571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24189673364162445,
      "learning_rate": 9.944597532678119e-07,
      "loss": 0.0,
      "num_tokens": 3758805.0,
      "reward": -0.08874380588531494,
      "reward_std": 0.5923835635185242,
      "rewards/cosine_scaled_reward/mean": -0.18499691784381866,
      "rewards/cosine_scaled_reward/std": 0.27955111861228943,
      "rewards/format_reward/mean": 0.28125,
      "rewards/format_reward/std": 0.4531635046005249,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1467.0,
      "completions/mean_length": 1818.8125,
      "completions/mean_terminated_length": 825.6666870117188,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 0.03542857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24893531203269958,
      "learning_rate": 9.931634888554935e-07,
      "loss": 0.0,
      "num_tokens": 3885705.0,
      "reward": -0.18628405034542084,
      "reward_std": 0.5522075891494751,
      "rewards/cosine_scaled_reward/mean": -0.20251703262329102,
      "rewards/cosine_scaled_reward/std": 0.37591472268104553,
      "rewards/format_reward/mean": 0.21875,
      "rewards/format_reward/std": 0.4166666865348816,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1867.0,
      "completions/mean_length": 1878.140625,
      "completions/mean_terminated_length": 1211.769287109375,
      "completions/min_length": 654.0,
      "completions/min_terminated_length": 654.0,
      "epoch": 0.036571428571428574,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.25341352820396423,
      "learning_rate": 9.917322325514487e-07,
      "loss": 0.0,
      "num_tokens": 4016258.0,
      "reward": -0.14861394464969635,
      "reward_std": 0.5451517105102539,
      "rewards/cosine_scaled_reward/mean": -0.19149449467658997,
      "rewards/cosine_scaled_reward/std": 0.3489256203174591,
      "rewards/format_reward/mean": 0.234375,
      "rewards/format_reward/std": 0.42695629596710205,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1823.0,
      "completions/mean_length": 1969.03125,
      "completions/mean_terminated_length": 1326.0,
      "completions/min_length": 998.0,
      "completions/min_terminated_length": 998.0,
      "epoch": 0.037714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23557375371456146,
      "learning_rate": 9.901664203302124e-07,
      "loss": -0.0,
      "num_tokens": 4153492.0,
      "reward": -0.3634287118911743,
      "reward_std": 0.5462046265602112,
      "rewards/cosine_scaled_reward/mean": -0.25983935594558716,
      "rewards/cosine_scaled_reward/std": 0.3271723687648773,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1998.0,
      "completions/mean_length": 1568.296875,
      "completions/mean_terminated_length": 1024.6334228515625,
      "completions/min_length": 506.0,
      "completions/min_terminated_length": 506.0,
      "epoch": 0.038857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2892495095729828,
      "learning_rate": 9.88466529153356e-07,
      "loss": 0.0,
      "num_tokens": 4263415.0,
      "reward": 0.575156569480896,
      "reward_std": 0.8866004347801208,
      "rewards/cosine_scaled_reward/mean": 0.045390784740448,
      "rewards/cosine_scaled_reward/std": 0.5505619645118713,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.84375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1834.0,
      "completions/mean_length": 1830.65625,
      "completions/mean_terminated_length": 657.0,
      "completions/min_length": 371.0,
      "completions/min_terminated_length": 371.0,
      "epoch": 0.04,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.28274399042129517,
      "learning_rate": 9.866330768241983e-07,
      "loss": -0.0,
      "num_tokens": 4392073.0,
      "reward": -0.1704331934452057,
      "reward_std": 0.7666259407997131,
      "rewards/cosine_scaled_reward/mean": -0.18677911162376404,
      "rewards/cosine_scaled_reward/std": 0.36125659942626953,
      "rewards/format_reward/mean": 0.203125,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1950.671875,
      "completions/mean_terminated_length": 1158.1429443359375,
      "completions/min_length": 669.0,
      "completions/min_terminated_length": 669.0,
      "epoch": 0.04114285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2504905164241791,
      "learning_rate": 9.846666218300807e-07,
      "loss": -0.0,
      "num_tokens": 4528028.0,
      "reward": -0.49544650316238403,
      "reward_std": 0.3493530750274658,
      "rewards/cosine_scaled_reward/mean": -0.302410751581192,
      "rewards/cosine_scaled_reward/std": 0.17342224717140198,
      "rewards/format_reward/mean": 0.109375,
      "rewards/format_reward/std": 0.3145764470100403,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1879.0,
      "completions/mean_length": 1956.546875,
      "completions/mean_terminated_length": 1397.6666259765625,
      "completions/min_length": 789.0,
      "completions/min_terminated_length": 789.0,
      "epoch": 0.04228571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.24223695695400238,
      "learning_rate": 9.825677631722435e-07,
      "loss": 0.0,
      "num_tokens": 4664271.0,
      "reward": -0.2983526587486267,
      "reward_std": 0.45510220527648926,
      "rewards/cosine_scaled_reward/mean": -0.22730132937431335,
      "rewards/cosine_scaled_reward/std": 0.21532759070396423,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.36596253514289856,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1844.0,
      "completions/mean_length": 1872.0625,
      "completions/mean_terminated_length": 1109.666748046875,
      "completions/min_length": 799.0,
      "completions/min_terminated_length": 799.0,
      "epoch": 0.04342857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.22518785297870636,
      "learning_rate": 9.80337140183366e-07,
      "loss": 0.0,
      "num_tokens": 4795795.0,
      "reward": -0.0591111034154892,
      "reward_std": 0.38858330249786377,
      "rewards/cosine_scaled_reward/mean": -0.1311180591583252,
      "rewards/cosine_scaled_reward/std": 0.32316854596138,
      "rewards/format_reward/mean": 0.203125,
      "rewards/format_reward/std": 0.40550529956817627,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1982.0,
      "completions/mean_length": 1674.8125,
      "completions/mean_terminated_length": 721.1111450195312,
      "completions/min_length": 358.0,
      "completions/min_terminated_length": 358.0,
      "epoch": 0.044571428571428574,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26911357045173645,
      "learning_rate": 9.779754323328192e-07,
      "loss": -0.0,
      "num_tokens": 4913767.0,
      "reward": 0.14183415472507477,
      "reward_std": 0.6081592440605164,
      "rewards/cosine_scaled_reward/mean": -0.09314543008804321,
      "rewards/cosine_scaled_reward/std": 0.3410241901874542,
      "rewards/format_reward/mean": 0.328125,
      "rewards/format_reward/std": 0.4732423722743988,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 1549.328125,
      "completions/mean_terminated_length": 908.1785888671875,
      "completions/min_length": 204.0,
      "completions/min_terminated_length": 204.0,
      "epoch": 0.045714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2770562767982483,
      "learning_rate": 9.754833590196926e-07,
      "loss": 0.0,
      "num_tokens": 5022996.0,
      "reward": 0.3034515678882599,
      "reward_std": 0.5147567987442017,
      "rewards/cosine_scaled_reward/mean": -0.09827423095703125,
      "rewards/cosine_scaled_reward/std": 0.39993754029273987,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.5039526224136353,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1819.0,
      "completions/mean_length": 1768.609375,
      "completions/mean_terminated_length": 1302.9583740234375,
      "completions/min_length": 584.0,
      "completions/min_terminated_length": 584.0,
      "epoch": 0.046857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.23544111847877502,
      "learning_rate": 9.728616793536587e-07,
      "loss": 0.0,
      "num_tokens": 5147339.0,
      "reward": 0.05204566568136215,
      "reward_std": 0.7308298349380493,
      "rewards/cosine_scaled_reward/mean": -0.18491466343402863,
      "rewards/cosine_scaled_reward/std": 0.3467314541339874,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1894.0,
      "completions/mean_length": 1662.234375,
      "completions/mean_terminated_length": 748.5789794921875,
      "completions/min_length": 164.0,
      "completions/min_terminated_length": 164.0,
      "epoch": 0.048,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3946230709552765,
      "learning_rate": 9.701111919237408e-07,
      "loss": 0.0,
      "num_tokens": 5264082.0,
      "reward": -0.1084136962890625,
      "reward_std": 0.35625624656677246,
      "rewards/cosine_scaled_reward/mean": -0.21045684814453125,
      "rewards/cosine_scaled_reward/std": 0.17068159580230713,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.467176616191864,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 1628.796875,
      "completions/mean_terminated_length": 974.8399658203125,
      "completions/min_length": 387.0,
      "completions/min_terminated_length": 387.0,
      "epoch": 0.04914285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2622542679309845,
      "learning_rate": 9.672327345550543e-07,
      "loss": -0.0,
      "num_tokens": 5379941.0,
      "reward": 0.24864289164543152,
      "reward_std": 0.622364342212677,
      "rewards/cosine_scaled_reward/mean": -0.08661604672670364,
      "rewards/cosine_scaled_reward/std": 0.3968709111213684,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.49776285886764526,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1813.0,
      "completions/mean_length": 1598.265625,
      "completions/mean_terminated_length": 848.7083740234375,
      "completions/min_length": 233.0,
      "completions/min_terminated_length": 233.0,
      "epoch": 0.05028571428571429,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3861124813556671,
      "learning_rate": 9.64227184053598e-07,
      "loss": -0.0,
      "num_tokens": 5492926.0,
      "reward": 0.17736095190048218,
      "reward_std": 0.5736653804779053,
      "rewards/cosine_scaled_reward/mean": -0.09881951659917831,
      "rewards/cosine_scaled_reward/std": 0.4637540578842163,
      "rewards/format_reward/mean": 0.375,
      "rewards/format_reward/std": 0.48795005679130554,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1934.0,
      "completions/mean_length": 1945.546875,
      "completions/mean_terminated_length": 1228.375,
      "completions/min_length": 909.0,
      "completions/min_terminated_length": 909.0,
      "epoch": 0.05142857142857143,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2586025893688202,
      "learning_rate": 9.610954559391704e-07,
      "loss": -0.0,
      "num_tokens": 5629097.0,
      "reward": -0.2874904274940491,
      "reward_std": 0.4528215825557709,
      "rewards/cosine_scaled_reward/mean": -0.21405771374702454,
      "rewards/cosine_scaled_reward/std": 0.3033171594142914,
      "rewards/format_reward/mean": 0.140625,
      "rewards/format_reward/std": 0.3503824472427368,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1514.0,
      "completions/mean_length": 1772.890625,
      "completions/mean_terminated_length": 874.2000732421875,
      "completions/min_length": 597.0,
      "completions/min_terminated_length": 597.0,
      "epoch": 0.052571428571428575,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.27347299456596375,
      "learning_rate": 9.578385041664925e-07,
      "loss": -0.0,
      "num_tokens": 5753730.0,
      "reward": -0.0957992672920227,
      "reward_std": 0.4836219251155853,
      "rewards/cosine_scaled_reward/mean": -0.17289963364601135,
      "rewards/cosine_scaled_reward/std": 0.3050842881202698,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.4364357888698578,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.609375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 1595.734375,
      "completions/mean_terminated_length": 890.199951171875,
      "completions/min_length": 379.0,
      "completions/min_terminated_length": 379.0,
      "epoch": 0.053714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3536407947540283,
      "learning_rate": 9.54457320834625e-07,
      "loss": 0.0,
      "num_tokens": 5866257.0,
      "reward": -0.01777055859565735,
      "reward_std": 0.6523094177246094,
      "rewards/cosine_scaled_reward/mean": -0.22763527929782867,
      "rewards/cosine_scaled_reward/std": 0.3455982208251953,
      "rewards/format_reward/mean": 0.4375,
      "rewards/format_reward/std": 0.5,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1792.0,
      "completions/mean_length": 1581.84375,
      "completions/mean_terminated_length": 1019.2413940429688,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 0.054857142857142854,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.26218104362487793,
      "learning_rate": 9.509529358847654e-07,
      "loss": -0.0,
      "num_tokens": 5978039.0,
      "reward": 0.36145922541618347,
      "reward_std": 0.8229352235794067,
      "rewards/cosine_scaled_reward/mean": -0.06145789101719856,
      "rewards/cosine_scaled_reward/std": 0.4491077661514282,
      "rewards/format_reward/mean": 0.484375,
      "rewards/format_reward/std": 0.5037065148353577,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.484375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1525.0,
      "completions/mean_length": 1404.46875,
      "completions/mean_terminated_length": 799.9393920898438,
      "completions/min_length": 276.0,
      "completions/min_terminated_length": 276.0,
      "epoch": 0.056,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.3138538897037506,
      "learning_rate": 9.473264167865171e-07,
      "loss": 0.0,
      "num_tokens": 6077989.0,
      "reward": 0.23753327131271362,
      "reward_std": 0.6856037378311157,
      "rewards/cosine_scaled_reward/mean": -0.1468583643436432,
      "rewards/cosine_scaled_reward/std": 0.36308491230010986,
      "rewards/format_reward/mean": 0.53125,
      "rewards/format_reward/std": 0.5029674172401428,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1834.0,
      "completions/mean_length": 1668.46875,
      "completions/mean_terminated_length": 943.9091186523438,
      "completions/min_length": 327.0,
      "completions/min_terminated_length": 327.0,
      "epoch": 0.05714285714285714,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.2541959285736084,
      "learning_rate": 9.43578868212728e-07,
      "loss": 0.0,
      "num_tokens": 6195587.0,
      "reward": 0.2079824060201645,
      "reward_std": 0.6563009023666382,
      "rewards/cosine_scaled_reward/mean": -0.09132131934165955,
      "rewards/cosine_scaled_reward/std": 0.39781448245048523,
      "rewards/format_reward/mean": 0.390625,
      "rewards/format_reward/std": 0.4917473793029785,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 6195587,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}