diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31284 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 162.1875, + "completions/mean_terminated_length": 112.21429443359375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.0008, + "grad_norm": 4.733877182006836, + "kl": 0.0004642009735107422, + "learning_rate": 0.0, + "loss": -0.0409, + "num_tokens": 12286.0, + "reward": -14.972412109375, + "reward_std": 7.30580997467041, + "rewards/rm_reward_func/mean": -14.972412109375, + "rewards/rm_reward_func/std": 9.738199234008789, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 306.0625, + "completions/mean_terminated_length": 212.45455932617188, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.0016, + "grad_norm": 2.107487678527832, + "kl": 0.0006613731384277344, + "learning_rate": 1.5873015873015872e-08, + "loss": -0.2166, + "num_tokens": 27240.0, + "reward": -18.076171875, + "reward_std": 5.312369346618652, + "rewards/rm_reward_func/mean": -18.076171875, + "rewards/rm_reward_func/std": 5.666329383850098, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 287.6875, + "completions/mean_terminated_length": 153.10000610351562, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.0024, + "grad_norm": 2.3611650466918945, + "kl": 0.0004291534423828125, + "learning_rate": 3.1746031746031744e-08, + "loss": 0.0915, + "num_tokens": 46014.0, + "reward": -20.93310546875, + "reward_std": 6.643990993499756, + "rewards/rm_reward_func/mean": -20.93310546875, + "rewards/rm_reward_func/std": 8.887117385864258, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 471.09375, + "completions/mean_terminated_length": 366.5555725097656, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.0032, + "grad_norm": 1.5670565366744995, + "kl": 0.000698089599609375, + "learning_rate": 4.7619047619047613e-08, + "loss": -0.0638, + "num_tokens": 67489.0, + "reward": -12.26611328125, + "reward_std": 6.560182571411133, + "rewards/rm_reward_func/mean": -12.26611328125, + "rewards/rm_reward_func/std": 8.791576385498047, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 272.75, + "completions/mean_terminated_length": 179.13043212890625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.004, + "grad_norm": 4.25062370300293, + "kl": 0.0007371902465820312, + "learning_rate": 6.349206349206349e-08, + "loss": 0.1245, + "num_tokens": 78937.0, + "reward": -18.7626953125, + "reward_std": 5.579219818115234, + "rewards/rm_reward_func/mean": -18.7626953125, + "rewards/rm_reward_func/std": 6.908312797546387, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 163.0, + "completions/mean_terminated_length": 98.37036895751953, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.0048, + "grad_norm": 4.417694568634033, + "kl": 0.0005817413330078125, + "learning_rate": 7.936507936507936e-08, + "loss": -0.4422, + "num_tokens": 89649.0, + "reward": -20.189697265625, + "reward_std": 8.252399444580078, + "rewards/rm_reward_func/mean": -20.189697265625, + "rewards/rm_reward_func/std": 10.306472778320312, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 242.25, + "completions/mean_terminated_length": 166.72000122070312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0056, + "grad_norm": 4.025533676147461, + "kl": 0.0005292892456054688, + "learning_rate": 9.523809523809523e-08, + "loss": 0.0103, + "num_tokens": 103017.0, + "reward": -12.515022277832031, + "reward_std": 7.5117340087890625, + "rewards/rm_reward_func/mean": -12.515022277832031, + "rewards/rm_reward_func/std": 7.902947902679443, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 244.25, + "completions/mean_terminated_length": 182.4615478515625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.0064, + "grad_norm": 2.775796413421631, + "kl": 0.000759124755859375, + "learning_rate": 1.111111111111111e-07, + "loss": -0.0003, + "num_tokens": 115817.0, + "reward": -13.3485107421875, + "reward_std": 7.309122085571289, + "rewards/rm_reward_func/mean": -13.3485107421875, + "rewards/rm_reward_func/std": 8.070342063903809, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 328.03125, + "completions/mean_terminated_length": 202.15789794921875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0072, + "grad_norm": 2.073791265487671, + "kl": 0.0005431175231933594, + "learning_rate": 1.2698412698412698e-07, + "loss": -0.0042, + "num_tokens": 133738.0, + "reward": -19.9541015625, + "reward_std": 7.462824821472168, + "rewards/rm_reward_func/mean": -19.9541015625, + "rewards/rm_reward_func/std": 8.75332260131836, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 135.15625, + "completions/mean_terminated_length": 123.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.008, + "grad_norm": 3.537003755569458, + "kl": 0.0005927085876464844, + "learning_rate": 1.4285714285714285e-07, + "loss": -0.0045, + "num_tokens": 140407.0, + "reward": -17.105224609375, + "reward_std": 6.6533403396606445, + "rewards/rm_reward_func/mean": -17.105224609375, + "rewards/rm_reward_func/std": 7.8900837898254395, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 236.90625, + "completions/mean_terminated_length": 129.26087951660156, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0088, + "grad_norm": 2.1652469635009766, + "kl": 0.00041961669921875, + "learning_rate": 1.5873015873015872e-07, + "loss": 0.4337, + "num_tokens": 151652.0, + "reward": -16.7042236328125, + "reward_std": 6.598738670349121, + "rewards/rm_reward_func/mean": -16.7042236328125, + "rewards/rm_reward_func/std": 6.982585906982422, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 286.3125, + "completions/mean_terminated_length": 168.09524536132812, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.0096, + "grad_norm": 2.148557186126709, + "kl": 0.0005078315734863281, + "learning_rate": 1.7460317460317458e-07, + "loss": 0.1579, + "num_tokens": 169870.0, + "reward": -16.03668212890625, + "reward_std": 7.722983360290527, + "rewards/rm_reward_func/mean": -16.03668212890625, + "rewards/rm_reward_func/std": 8.73274040222168, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 204.34375, + "completions/mean_terminated_length": 118.19999694824219, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.0104, + "grad_norm": 2.9024100303649902, + "kl": 0.0005464553833007812, + "learning_rate": 1.9047619047619045e-07, + "loss": -0.1424, + "num_tokens": 182489.0, + "reward": -19.439453125, + "reward_std": 6.397669315338135, + "rewards/rm_reward_func/mean": -19.439453125, + "rewards/rm_reward_func/std": 8.439228057861328, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 250.25, + "completions/mean_terminated_length": 147.8260955810547, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0112, + "grad_norm": 2.709969997406006, + "kl": 0.0005555152893066406, + "learning_rate": 2.0634920634920632e-07, + "loss": -0.1489, + "num_tokens": 194233.0, + "reward": -18.40234375, + "reward_std": 5.450201988220215, + "rewards/rm_reward_func/mean": -18.40234375, + "rewards/rm_reward_func/std": 7.459920883178711, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 213.0625, + "completions/mean_terminated_length": 129.36000061035156, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.012, + "grad_norm": 2.7814769744873047, + "kl": 0.0005245208740234375, + "learning_rate": 2.222222222222222e-07, + "loss": -0.0593, + "num_tokens": 206339.0, + "reward": -14.456787109375, + "reward_std": 7.301092624664307, + "rewards/rm_reward_func/mean": -14.456787109375, + "rewards/rm_reward_func/std": 9.415141105651855, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 271.21875, + "completions/mean_terminated_length": 236.82144165039062, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.0128, + "grad_norm": 2.045577049255371, + "kl": 0.0004906654357910156, + "learning_rate": 2.3809523809523806e-07, + "loss": -0.0992, + "num_tokens": 218890.0, + "reward": -18.230010986328125, + "reward_std": 6.69823694229126, + "rewards/rm_reward_func/mean": -18.230010986328125, + "rewards/rm_reward_func/std": 8.588906288146973, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 377.875, + "completions/mean_terminated_length": 286.1052551269531, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.0136, + "grad_norm": 1.7612684965133667, + "kl": 0.0006260871887207031, + "learning_rate": 2.5396825396825396e-07, + "loss": -0.122, + "num_tokens": 233942.0, + "reward": -10.974349975585938, + "reward_std": 9.46108627319336, + "rewards/rm_reward_func/mean": -10.974349975585938, + "rewards/rm_reward_func/std": 11.269489288330078, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 308.9375, + "completions/mean_terminated_length": 216.63636779785156, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0144, + "grad_norm": 3.043867588043213, + "kl": 0.0006608963012695312, + "learning_rate": 2.698412698412698e-07, + "loss": -0.0415, + "num_tokens": 246140.0, + "reward": -15.01416015625, + "reward_std": 6.831326007843018, + "rewards/rm_reward_func/mean": -15.01416015625, + "rewards/rm_reward_func/std": 7.092841148376465, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 306.6875, + "completions/mean_terminated_length": 183.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.0152, + "grad_norm": 1.7109549045562744, + "kl": 0.0004494190216064453, + "learning_rate": 2.857142857142857e-07, + "loss": 0.0434, + "num_tokens": 260202.0, + "reward": -18.6416015625, + "reward_std": 7.036252021789551, + "rewards/rm_reward_func/mean": -18.6416015625, + "rewards/rm_reward_func/std": 8.478830337524414, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 293.96875, + "completions/mean_terminated_length": 208.6521759033203, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.016, + "grad_norm": 2.787404775619507, + "kl": 0.000705718994140625, + "learning_rate": 3.0158730158730156e-07, + "loss": -0.223, + "num_tokens": 274521.0, + "reward": -17.173828125, + "reward_std": 5.238741874694824, + "rewards/rm_reward_func/mean": -17.173828125, + "rewards/rm_reward_func/std": 5.566880702972412, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 187.1875, + "completions/mean_terminated_length": 112.23077392578125, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.0168, + "grad_norm": 2.882970094680786, + "kl": 0.0005102157592773438, + "learning_rate": 3.1746031746031743e-07, + "loss": -0.029, + "num_tokens": 283359.0, + "reward": -17.470703125, + "reward_std": 5.557958602905273, + "rewards/rm_reward_func/mean": -17.470703125, + "rewards/rm_reward_func/std": 5.913144111633301, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 293.84375, + "completions/mean_terminated_length": 194.68182373046875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.0176, + "grad_norm": 2.53083872795105, + "kl": 0.0007300376892089844, + "learning_rate": 3.333333333333333e-07, + "loss": -0.3244, + "num_tokens": 300082.0, + "reward": -17.8603515625, + "reward_std": 6.352341175079346, + "rewards/rm_reward_func/mean": -17.8603515625, + "rewards/rm_reward_func/std": 7.445318698883057, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 334.125, + "completions/mean_terminated_length": 177.1764678955078, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.0184, + "grad_norm": 2.098026752471924, + "kl": 0.00046825408935546875, + "learning_rate": 3.4920634920634917e-07, + "loss": 0.0348, + "num_tokens": 318878.0, + "reward": -20.6982421875, + "reward_std": 4.5884552001953125, + "rewards/rm_reward_func/mean": -20.6982421875, + "rewards/rm_reward_func/std": 5.56207275390625, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 237.28125, + "completions/mean_terminated_length": 145.70834350585938, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.0192, + "grad_norm": 2.4218556880950928, + "kl": 0.0005183219909667969, + "learning_rate": 3.6507936507936504e-07, + "loss": 0.1568, + "num_tokens": 332287.0, + "reward": -17.60882568359375, + "reward_std": 5.957723617553711, + "rewards/rm_reward_func/mean": -17.60882568359375, + "rewards/rm_reward_func/std": 7.467422962188721, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 234.96875, + "completions/mean_terminated_length": 126.56521606445312, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.02, + "grad_norm": 3.0351226329803467, + "kl": 0.0005309581756591797, + "learning_rate": 3.809523809523809e-07, + "loss": 0.0694, + "num_tokens": 348614.0, + "reward": -20.34765625, + "reward_std": 7.445685386657715, + "rewards/rm_reward_func/mean": -20.34765625, + "rewards/rm_reward_func/std": 8.580655097961426, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 300.78125, + "completions/mean_terminated_length": 204.77273559570312, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.0208, + "grad_norm": 1.7689965963363647, + "kl": 0.0005326271057128906, + "learning_rate": 3.968253968253968e-07, + "loss": -0.1311, + "num_tokens": 360399.0, + "reward": -14.226348876953125, + "reward_std": 6.583255767822266, + "rewards/rm_reward_func/mean": -14.226348876953125, + "rewards/rm_reward_func/std": 7.479687213897705, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 260.25, + "completions/mean_terminated_length": 176.33334350585938, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0216, + "grad_norm": 2.9027605056762695, + "kl": 0.00046634674072265625, + "learning_rate": 4.1269841269841265e-07, + "loss": -0.0346, + "num_tokens": 372815.0, + "reward": -17.73095703125, + "reward_std": 5.46522331237793, + "rewards/rm_reward_func/mean": -17.73095703125, + "rewards/rm_reward_func/std": 7.74613618850708, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 310.96875, + "completions/mean_terminated_length": 190.35000610351562, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.0224, + "grad_norm": 2.343336582183838, + "kl": 0.0006885528564453125, + "learning_rate": 4.285714285714285e-07, + "loss": -0.0275, + "num_tokens": 385278.0, + "reward": -12.96875, + "reward_std": 5.088245868682861, + "rewards/rm_reward_func/mean": -12.96875, + "rewards/rm_reward_func/std": 5.312559127807617, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 241.28125, + "completions/mean_terminated_length": 151.0416717529297, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0232, + "grad_norm": 2.9956672191619873, + "kl": 0.0004820823669433594, + "learning_rate": 4.444444444444444e-07, + "loss": 0.0743, + "num_tokens": 405735.0, + "reward": -14.2308349609375, + "reward_std": 11.751058578491211, + "rewards/rm_reward_func/mean": -14.2308349609375, + "rewards/rm_reward_func/std": 12.032173156738281, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 217.125, + "completions/mean_terminated_length": 118.83333587646484, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.024, + "grad_norm": 3.1685726642608643, + "kl": 0.0006208419799804688, + "learning_rate": 4.6031746031746025e-07, + "loss": -0.0612, + "num_tokens": 417427.0, + "reward": -15.2958984375, + "reward_std": 9.689838409423828, + "rewards/rm_reward_func/mean": -15.2958984375, + "rewards/rm_reward_func/std": 10.436555862426758, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 188.15625, + "completions/mean_terminated_length": 113.42308044433594, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.0248, + "grad_norm": 3.6047468185424805, + "kl": 0.0006389617919921875, + "learning_rate": 4.761904761904761e-07, + "loss": 0.1663, + "num_tokens": 430200.0, + "reward": -12.5240478515625, + "reward_std": 6.992089748382568, + "rewards/rm_reward_func/mean": -12.5240478515625, + "rewards/rm_reward_func/std": 10.005572319030762, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 210.875, + "completions/mean_terminated_length": 155.11111450195312, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.0256, + "grad_norm": 2.584559679031372, + "kl": 0.0007905960083007812, + "learning_rate": 4.92063492063492e-07, + "loss": -0.0784, + "num_tokens": 442068.0, + "reward": -13.05859375, + "reward_std": 5.2848219871521, + "rewards/rm_reward_func/mean": -13.05859375, + "rewards/rm_reward_func/std": 11.056248664855957, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 274.84375, + "completions/mean_terminated_length": 150.61904907226562, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0264, + "grad_norm": 2.4749457836151123, + "kl": 0.0006674528121948242, + "learning_rate": 5.079365079365079e-07, + "loss": 0.1203, + "num_tokens": 456303.0, + "reward": -19.4091796875, + "reward_std": 5.808443546295166, + "rewards/rm_reward_func/mean": -19.4091796875, + "rewards/rm_reward_func/std": 8.455757141113281, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 232.125, + "completions/mean_terminated_length": 153.75999450683594, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0272, + "grad_norm": 3.137270927429199, + "kl": 0.0005202293395996094, + "learning_rate": 5.238095238095238e-07, + "loss": 0.534, + "num_tokens": 470035.0, + "reward": -12.41497802734375, + "reward_std": 5.886228561401367, + "rewards/rm_reward_func/mean": -12.41497802734375, + "rewards/rm_reward_func/std": 10.136944770812988, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 245.0, + "completions/mean_terminated_length": 140.52174377441406, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.028, + "grad_norm": 6.771448135375977, + "kl": 0.0007982254028320312, + "learning_rate": 5.396825396825396e-07, + "loss": 0.0315, + "num_tokens": 487523.0, + "reward": -11.912017822265625, + "reward_std": 12.166071891784668, + "rewards/rm_reward_func/mean": -11.912017822265625, + "rewards/rm_reward_func/std": 12.91728401184082, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 247.28125, + "completions/mean_terminated_length": 143.69564819335938, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0288, + "grad_norm": 3.114492416381836, + "kl": 0.0005984306335449219, + "learning_rate": 5.555555555555555e-07, + "loss": -0.1486, + "num_tokens": 500604.0, + "reward": -20.0234375, + "reward_std": 5.6200761795043945, + "rewards/rm_reward_func/mean": -20.0234375, + "rewards/rm_reward_func/std": 7.285369396209717, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 279.75, + "completions/mean_terminated_length": 140.40000915527344, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.0296, + "grad_norm": 1.9965054988861084, + "kl": 0.0004763603210449219, + "learning_rate": 5.714285714285714e-07, + "loss": -0.1308, + "num_tokens": 518156.0, + "reward": -16.847412109375, + "reward_std": 8.691463470458984, + "rewards/rm_reward_func/mean": -16.847412109375, + "rewards/rm_reward_func/std": 9.688831329345703, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 276.65625, + "completions/mean_terminated_length": 210.75999450683594, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0304, + "grad_norm": 2.355628252029419, + "kl": 0.000652313232421875, + "learning_rate": 5.873015873015873e-07, + "loss": 0.0537, + "num_tokens": 532513.0, + "reward": -15.91558837890625, + "reward_std": 6.340006351470947, + "rewards/rm_reward_func/mean": -15.91558837890625, + "rewards/rm_reward_func/std": 6.9672441482543945, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 166.25, + "completions/mean_terminated_length": 130.48275756835938, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.0312, + "grad_norm": 3.0702366828918457, + "kl": 0.0012187957763671875, + "learning_rate": 6.031746031746031e-07, + "loss": 0.0736, + "num_tokens": 544713.0, + "reward": -18.3759765625, + "reward_std": 9.790019989013672, + "rewards/rm_reward_func/mean": -18.3759765625, + "rewards/rm_reward_func/std": 10.507294654846191, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 271.71875, + "completions/mean_terminated_length": 204.44000244140625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.032, + "grad_norm": 2.496678352355957, + "kl": 0.000782012939453125, + "learning_rate": 6.19047619047619e-07, + "loss": 0.0032, + "num_tokens": 561752.0, + "reward": -11.7308349609375, + "reward_std": 7.016323089599609, + "rewards/rm_reward_func/mean": -11.7308349609375, + "rewards/rm_reward_func/std": 10.941299438476562, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 191.625, + "completions/mean_terminated_length": 145.85714721679688, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.0328, + "grad_norm": 3.0735666751861572, + "kl": 0.0010395050048828125, + "learning_rate": 6.349206349206349e-07, + "loss": 0.0091, + "num_tokens": 569108.0, + "reward": -9.7904052734375, + "reward_std": 8.391822814941406, + "rewards/rm_reward_func/mean": -9.7904052734375, + "rewards/rm_reward_func/std": 9.721639633178711, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 227.34375, + "completions/mean_terminated_length": 174.62962341308594, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.0336, + "grad_norm": 2.8612797260284424, + "kl": 0.0013446807861328125, + "learning_rate": 6.507936507936507e-07, + "loss": -0.2049, + "num_tokens": 580255.0, + "reward": -12.479393005371094, + "reward_std": 7.963648319244385, + "rewards/rm_reward_func/mean": -12.479393005371094, + "rewards/rm_reward_func/std": 9.471086502075195, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 285.8125, + "completions/mean_terminated_length": 150.10000610351562, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.0344, + "grad_norm": 2.278257131576538, + "kl": 0.0007457733154296875, + "learning_rate": 6.666666666666666e-07, + "loss": -0.1891, + "num_tokens": 598673.0, + "reward": -17.751708984375, + "reward_std": 7.0432562828063965, + "rewards/rm_reward_func/mean": -17.751708984375, + "rewards/rm_reward_func/std": 9.096941947937012, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 301.78125, + "completions/mean_terminated_length": 191.6666717529297, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.0352, + "grad_norm": 2.1108381748199463, + "kl": 0.0022859573364257812, + "learning_rate": 6.825396825396826e-07, + "loss": 0.1122, + "num_tokens": 613434.0, + "reward": -15.1778564453125, + "reward_std": 7.961122512817383, + "rewards/rm_reward_func/mean": -15.1778564453125, + "rewards/rm_reward_func/std": 10.789715766906738, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 264.21875, + "completions/mean_terminated_length": 194.83999633789062, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "epoch": 0.036, + "grad_norm": 2.695690870285034, + "kl": 0.0011796951293945312, + "learning_rate": 6.984126984126983e-07, + "loss": -0.1674, + "num_tokens": 625529.0, + "reward": -16.33392333984375, + "reward_std": 6.300891399383545, + "rewards/rm_reward_func/mean": -16.33392333984375, + "rewards/rm_reward_func/std": 10.230180740356445, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 240.84375, + "completions/mean_terminated_length": 178.2692413330078, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.0368, + "grad_norm": 2.2775719165802, + "kl": 0.0014019012451171875, + "learning_rate": 7.142857142857143e-07, + "loss": 0.2219, + "num_tokens": 635692.0, + "reward": -16.74169921875, + "reward_std": 6.463481903076172, + "rewards/rm_reward_func/mean": -16.74169921875, + "rewards/rm_reward_func/std": 7.286113262176514, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 240.375, + "completions/mean_terminated_length": 134.0869598388672, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.0376, + "grad_norm": 3.1222875118255615, + "kl": 0.003017425537109375, + "learning_rate": 7.301587301587301e-07, + "loss": 0.1266, + "num_tokens": 652696.0, + "reward": -16.237548828125, + "reward_std": 10.400668144226074, + "rewards/rm_reward_func/mean": -16.237548828125, + "rewards/rm_reward_func/std": 11.441701889038086, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 334.0, + "completions/mean_terminated_length": 308.5714416503906, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.0384, + "grad_norm": 1.5257015228271484, + "kl": 0.0009555816650390625, + "learning_rate": 7.46031746031746e-07, + "loss": -0.0477, + "num_tokens": 666904.0, + "reward": -10.626953125, + "reward_std": 9.35859489440918, + "rewards/rm_reward_func/mean": -10.626953125, + "rewards/rm_reward_func/std": 10.240569114685059, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 207.03125, + "completions/mean_terminated_length": 136.6538543701172, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.0392, + "grad_norm": 2.7735135555267334, + "kl": 0.0024394989013671875, + "learning_rate": 7.619047619047618e-07, + "loss": 0.3804, + "num_tokens": 680593.0, + "reward": -15.4420166015625, + "reward_std": 5.318691730499268, + "rewards/rm_reward_func/mean": -15.4420166015625, + "rewards/rm_reward_func/std": 7.727688789367676, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 352.96875, + "completions/mean_terminated_length": 290.7391357421875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.04, + "grad_norm": 1.7596663236618042, + "kl": 0.0008020401000976562, + "learning_rate": 7.777777777777778e-07, + "loss": 0.0049, + "num_tokens": 694408.0, + "reward": -13.23162841796875, + "reward_std": 5.9329681396484375, + "rewards/rm_reward_func/mean": -13.23162841796875, + "rewards/rm_reward_func/std": 6.714731693267822, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 257.5652160644531, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.0408, + "grad_norm": 1.781779170036316, + "kl": 0.0025453567504882812, + "learning_rate": 7.936507936507936e-07, + "loss": 0.0057, + "num_tokens": 710436.0, + "reward": -15.145263671875, + "reward_std": 8.059885025024414, + "rewards/rm_reward_func/mean": -15.145263671875, + "rewards/rm_reward_func/std": 9.787510871887207, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 277.3125, + "completions/mean_terminated_length": 185.478271484375, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.0416, + "grad_norm": 2.2292799949645996, + "kl": 0.00193023681640625, + "learning_rate": 8.095238095238095e-07, + "loss": -0.1962, + "num_tokens": 722526.0, + "reward": -15.7451171875, + "reward_std": 5.1294169425964355, + "rewards/rm_reward_func/mean": -15.7451171875, + "rewards/rm_reward_func/std": 5.931656360626221, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 237.4375, + "completions/mean_terminated_length": 145.9166717529297, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.0424, + "grad_norm": 2.5888214111328125, + "kl": 0.005604267120361328, + "learning_rate": 8.253968253968253e-07, + "loss": -0.1283, + "num_tokens": 735076.0, + "reward": -19.05859375, + "reward_std": 6.23822546005249, + "rewards/rm_reward_func/mean": -19.05859375, + "rewards/rm_reward_func/std": 7.2186808586120605, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 296.78125, + "completions/mean_terminated_length": 247.11538696289062, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.0432, + "grad_norm": 1.7540053129196167, + "kl": 0.001697540283203125, + "learning_rate": 8.412698412698413e-07, + "loss": -0.0297, + "num_tokens": 748837.0, + "reward": -14.128173828125, + "reward_std": 8.965902328491211, + "rewards/rm_reward_func/mean": -14.128173828125, + "rewards/rm_reward_func/std": 9.960022926330566, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 254.625, + "completions/mean_terminated_length": 182.55999755859375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.044, + "grad_norm": 2.7344412803649902, + "kl": 0.0054569244384765625, + "learning_rate": 8.57142857142857e-07, + "loss": -0.1131, + "num_tokens": 760737.0, + "reward": -12.757759094238281, + "reward_std": 8.074252128601074, + "rewards/rm_reward_func/mean": -12.757759094238281, + "rewards/rm_reward_func/std": 11.715660095214844, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 257.25, + "completions/mean_terminated_length": 230.89654541015625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.0448, + "grad_norm": 2.5666260719299316, + "kl": 0.0040874481201171875, + "learning_rate": 8.73015873015873e-07, + "loss": -0.2572, + "num_tokens": 774361.0, + "reward": -9.128143310546875, + "reward_std": 11.717550277709961, + "rewards/rm_reward_func/mean": -9.128143310546875, + "rewards/rm_reward_func/std": 13.220327377319336, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 274.03125, + "completions/mean_terminated_length": 207.39999389648438, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.0456, + "grad_norm": 2.327119827270508, + "kl": 0.002613067626953125, + "learning_rate": 8.888888888888888e-07, + "loss": -0.1845, + "num_tokens": 786210.0, + "reward": -15.12841796875, + "reward_std": 5.033686637878418, + "rewards/rm_reward_func/mean": -15.12841796875, + "rewards/rm_reward_func/std": 6.702640056610107, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 382.59375, + "completions/mean_terminated_length": 253.1875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.0464, + "grad_norm": 1.6900207996368408, + "kl": 0.0014858245849609375, + "learning_rate": 9.047619047619047e-07, + "loss": -0.0369, + "num_tokens": 803325.0, + "reward": -10.68212890625, + "reward_std": 7.683623313903809, + "rewards/rm_reward_func/mean": -10.68212890625, + "rewards/rm_reward_func/std": 10.182430267333984, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 392.71875, + "completions/mean_terminated_length": 273.4375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.0472, + "grad_norm": 1.482863426208496, + "kl": 0.0018873214721679688, + "learning_rate": 9.206349206349205e-07, + "loss": -0.1385, + "num_tokens": 821476.0, + "reward": -16.1591796875, + "reward_std": 6.018152236938477, + "rewards/rm_reward_func/mean": -16.1591796875, + "rewards/rm_reward_func/std": 7.973750114440918, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 197.40625, + "completions/mean_terminated_length": 139.1481475830078, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.048, + "grad_norm": 4.355982780456543, + "kl": 0.00453948974609375, + "learning_rate": 9.365079365079365e-07, + "loss": 0.1145, + "num_tokens": 832001.0, + "reward": -11.62872314453125, + "reward_std": 6.873530864715576, + "rewards/rm_reward_func/mean": -11.62872314453125, + "rewards/rm_reward_func/std": 7.132143497467041, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 318.53125, + "completions/mean_terminated_length": 217.1904754638672, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.0488, + "grad_norm": 1.7052397727966309, + "kl": 0.002231597900390625, + "learning_rate": 9.523809523809522e-07, + "loss": -0.1447, + "num_tokens": 845858.0, + "reward": -15.4871826171875, + "reward_std": 6.991832256317139, + "rewards/rm_reward_func/mean": -15.4871826171875, + "rewards/rm_reward_func/std": 8.580527305603027, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 298.625, + "completions/mean_terminated_length": 186.85714721679688, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.0496, + "grad_norm": 2.1558306217193604, + "kl": 0.0029659271240234375, + "learning_rate": 9.682539682539682e-07, + "loss": -0.0523, + "num_tokens": 857934.0, + "reward": -13.6885986328125, + "reward_std": 6.235533237457275, + "rewards/rm_reward_func/mean": -13.6885986328125, + "rewards/rm_reward_func/std": 7.642468452453613, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 184.53125, + "completions/mean_terminated_length": 173.96774291992188, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.0504, + "grad_norm": 2.6776647567749023, + "kl": 0.0102691650390625, + "learning_rate": 9.84126984126984e-07, + "loss": -0.0268, + "num_tokens": 870287.0, + "reward": -6.808349609375, + "reward_std": 4.451569557189941, + "rewards/rm_reward_func/mean": -6.808349609375, + "rewards/rm_reward_func/std": 9.845132827758789, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 370.84375, + "completions/mean_terminated_length": 286.1499938964844, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.0512, + "grad_norm": 1.7668260335922241, + "kl": 0.0019769668579101562, + "learning_rate": 1e-06, + "loss": -0.2167, + "num_tokens": 884882.0, + "reward": -12.884033203125, + "reward_std": 7.52500581741333, + "rewards/rm_reward_func/mean": -12.884033203125, + "rewards/rm_reward_func/std": 10.193265914916992, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 147.8125, + "completions/mean_terminated_length": 136.06451416015625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.052, + "grad_norm": 4.068594455718994, + "kl": 0.006591796875, + "learning_rate": 1e-06, + "loss": -0.2182, + "num_tokens": 894716.0, + "reward": -12.27734375, + "reward_std": 8.41262435913086, + "rewards/rm_reward_func/mean": -12.27734375, + "rewards/rm_reward_func/std": 9.8477144241333, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 304.875, + "completions/mean_terminated_length": 257.0769348144531, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.0528, + "grad_norm": 1.8289958238601685, + "kl": 0.007732391357421875, + "learning_rate": 1e-06, + "loss": -0.1021, + "num_tokens": 908728.0, + "reward": -11.79150390625, + "reward_std": 7.932310104370117, + "rewards/rm_reward_func/mean": -11.79150390625, + "rewards/rm_reward_func/std": 8.658713340759277, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 246.15625, + "completions/mean_terminated_length": 184.8076934814453, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.0536, + "grad_norm": 3.4050021171569824, + "kl": 0.007976531982421875, + "learning_rate": 1e-06, + "loss": -0.1934, + "num_tokens": 923405.0, + "reward": -15.1171875, + "reward_std": 5.055709362030029, + "rewards/rm_reward_func/mean": -15.1171875, + "rewards/rm_reward_func/std": 5.867112636566162, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 463.375, + "completions/mean_terminated_length": 123.0, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.0544, + "grad_norm": 1.4013042449951172, + "kl": 0.0021820068359375, + "learning_rate": 1e-06, + "loss": 0.0404, + "num_tokens": 941137.0, + "reward": -16.410730361938477, + "reward_std": 4.626013278961182, + "rewards/rm_reward_func/mean": -16.410730361938477, + "rewards/rm_reward_func/std": 8.236431121826172, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 231.5625, + "completions/mean_terminated_length": 231.5625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.0552, + "grad_norm": 2.294609308242798, + "kl": 0.0108642578125, + "learning_rate": 1e-06, + "loss": -0.1575, + "num_tokens": 955731.0, + "reward": -9.2740478515625, + "reward_std": 7.378659725189209, + "rewards/rm_reward_func/mean": -9.2740478515625, + "rewards/rm_reward_func/std": 7.7272562980651855, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 333.875, + "completions/mean_terminated_length": 252.9091033935547, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.056, + "grad_norm": 2.191699266433716, + "kl": 0.004730224609375, + "learning_rate": 1e-06, + "loss": -0.0719, + "num_tokens": 967767.0, + "reward": -15.390228271484375, + "reward_std": 7.572562217712402, + "rewards/rm_reward_func/mean": -15.390228271484375, + "rewards/rm_reward_func/std": 8.859856605529785, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 245.34375, + "completions/mean_terminated_length": 207.25001525878906, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.0568, + "grad_norm": 2.4537527561187744, + "kl": 0.00547027587890625, + "learning_rate": 1e-06, + "loss": -0.0316, + "num_tokens": 984962.0, + "reward": -12.403583526611328, + "reward_std": 9.502147674560547, + "rewards/rm_reward_func/mean": -12.403583526611328, + "rewards/rm_reward_func/std": 12.837342262268066, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 266.46875, + "completions/mean_terminated_length": 119.1500015258789, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.0576, + "grad_norm": 3.0884881019592285, + "kl": 0.0132598876953125, + "learning_rate": 1e-06, + "loss": 0.2709, + "num_tokens": 997041.0, + "reward": -16.363525390625, + "reward_std": 7.187673568725586, + "rewards/rm_reward_func/mean": -16.363525390625, + "rewards/rm_reward_func/std": 8.429330825805664, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 315.25, + "completions/mean_terminated_length": 249.6666717529297, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.0584, + "grad_norm": 1.6936599016189575, + "kl": 0.003826141357421875, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 1009209.0, + "reward": -12.302734375, + "reward_std": 8.93604850769043, + "rewards/rm_reward_func/mean": -12.302734375, + "rewards/rm_reward_func/std": 11.811920166015625, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 338.53125, + "completions/mean_terminated_length": 259.68182373046875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.0592, + "grad_norm": 1.9878442287445068, + "kl": 0.0085601806640625, + "learning_rate": 1e-06, + "loss": -0.0482, + "num_tokens": 1026410.0, + "reward": -9.265411376953125, + "reward_std": 4.766518592834473, + "rewards/rm_reward_func/mean": -9.265411376953125, + "rewards/rm_reward_func/std": 7.621801376342773, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 269.8125, + "completions/mean_terminated_length": 189.08334350585938, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.06, + "grad_norm": 2.5788185596466064, + "kl": 0.00743865966796875, + "learning_rate": 1e-06, + "loss": -0.1126, + "num_tokens": 1037548.0, + "reward": -12.520263671875, + "reward_std": 6.719825744628906, + "rewards/rm_reward_func/mean": -12.520263671875, + "rewards/rm_reward_func/std": 9.429011344909668, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 309.0, + "completions/mean_terminated_length": 252.1599884033203, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 0.0608, + "grad_norm": 2.8929193019866943, + "kl": 0.01413726806640625, + "learning_rate": 1e-06, + "loss": 0.1065, + "num_tokens": 1052708.0, + "reward": -8.053512573242188, + "reward_std": 9.236981391906738, + "rewards/rm_reward_func/mean": -8.053512573242188, + "rewards/rm_reward_func/std": 10.180198669433594, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 233.46875, + "completions/mean_terminated_length": 181.88888549804688, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.0616, + "grad_norm": 3.8004934787750244, + "kl": 0.00760650634765625, + "learning_rate": 1e-06, + "loss": -0.1501, + "num_tokens": 1063803.0, + "reward": -16.579833984375, + "reward_std": 5.787054538726807, + "rewards/rm_reward_func/mean": -16.579833984375, + "rewards/rm_reward_func/std": 6.621433258056641, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 183.3125, + "completions/mean_terminated_length": 136.35714721679688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0624, + "grad_norm": 3.4723711013793945, + "kl": 0.02323150634765625, + "learning_rate": 1e-06, + "loss": 0.1118, + "num_tokens": 1077989.0, + "reward": -12.782829284667969, + "reward_std": 8.029681205749512, + "rewards/rm_reward_func/mean": -12.782829284667969, + "rewards/rm_reward_func/std": 11.106629371643066, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 361.09375, + "completions/mean_terminated_length": 257.84210205078125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.0632, + "grad_norm": 1.8364566564559937, + "kl": 0.004528045654296875, + "learning_rate": 1e-06, + "loss": -0.0565, + "num_tokens": 1093512.0, + "reward": -16.1376953125, + "reward_std": 4.166171550750732, + "rewards/rm_reward_func/mean": -16.1376953125, + "rewards/rm_reward_func/std": 7.928711414337158, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 291.15625, + "completions/mean_terminated_length": 250.25926208496094, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.064, + "grad_norm": 2.220452308654785, + "kl": 0.015720367431640625, + "learning_rate": 1e-06, + "loss": -0.0419, + "num_tokens": 1108173.0, + "reward": -10.775238037109375, + "reward_std": 4.5652265548706055, + "rewards/rm_reward_func/mean": -10.775238037109375, + "rewards/rm_reward_func/std": 7.609045505523682, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 329.1875, + "completions/mean_terminated_length": 204.1052703857422, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.0648, + "grad_norm": 2.3170006275177, + "kl": 0.01618194580078125, + "learning_rate": 1e-06, + "loss": 0.0386, + "num_tokens": 1121739.0, + "reward": -17.7099609375, + "reward_std": 4.739996910095215, + "rewards/rm_reward_func/mean": -17.7099609375, + "rewards/rm_reward_func/std": 6.945987224578857, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 270.96875, + "completions/mean_terminated_length": 161.40908813476562, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.0656, + "grad_norm": 3.323392629623413, + "kl": 0.008464813232421875, + "learning_rate": 1e-06, + "loss": 0.2266, + "num_tokens": 1133146.0, + "reward": -12.679931640625, + "reward_std": 4.928397178649902, + "rewards/rm_reward_func/mean": -12.679931640625, + "rewards/rm_reward_func/std": 6.924253940582275, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 248.875, + "completions/mean_terminated_length": 200.1481475830078, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.0664, + "grad_norm": 2.6900339126586914, + "kl": 0.01018524169921875, + "learning_rate": 1e-06, + "loss": -0.0528, + "num_tokens": 1149814.0, + "reward": -7.25341796875, + "reward_std": 6.038924217224121, + "rewards/rm_reward_func/mean": -7.25341796875, + "rewards/rm_reward_func/std": 10.74600601196289, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 178.34375, + "completions/mean_terminated_length": 101.34616088867188, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.0672, + "grad_norm": 4.004127025604248, + "kl": 0.02178192138671875, + "learning_rate": 1e-06, + "loss": 0.0598, + "num_tokens": 1160177.0, + "reward": -16.62255859375, + "reward_std": 4.248178958892822, + "rewards/rm_reward_func/mean": -16.62255859375, + "rewards/rm_reward_func/std": 7.380358695983887, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 312.84375, + "completions/mean_terminated_length": 137.11764526367188, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.068, + "grad_norm": 2.8040761947631836, + "kl": 0.00982666015625, + "learning_rate": 1e-06, + "loss": 0.1415, + "num_tokens": 1175764.0, + "reward": -17.01824951171875, + "reward_std": 7.747839450836182, + "rewards/rm_reward_func/mean": -17.01824951171875, + "rewards/rm_reward_func/std": 9.256941795349121, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 215.71875, + "completions/mean_terminated_length": 185.0689697265625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.0688, + "grad_norm": 2.5673210620880127, + "kl": 0.009735107421875, + "learning_rate": 1e-06, + "loss": -0.0762, + "num_tokens": 1184987.0, + "reward": -15.926788330078125, + "reward_std": 5.4856109619140625, + "rewards/rm_reward_func/mean": -15.926788330078125, + "rewards/rm_reward_func/std": 11.880026817321777, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 316.46875, + "completions/mean_terminated_length": 182.68421936035156, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0696, + "grad_norm": 3.4945647716522217, + "kl": 0.0219879150390625, + "learning_rate": 1e-06, + "loss": -0.1125, + "num_tokens": 1203586.0, + "reward": -14.038330078125, + "reward_std": 5.664522171020508, + "rewards/rm_reward_func/mean": -14.038330078125, + "rewards/rm_reward_func/std": 6.30980920791626, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 329.8125, + "completions/mean_terminated_length": 205.15789794921875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.0704, + "grad_norm": 2.4489378929138184, + "kl": 0.0096435546875, + "learning_rate": 1e-06, + "loss": -0.0648, + "num_tokens": 1218636.0, + "reward": -12.71484375, + "reward_std": 5.568541526794434, + "rewards/rm_reward_func/mean": -12.71484375, + "rewards/rm_reward_func/std": 7.223500728607178, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 261.65625, + "completions/mean_terminated_length": 178.20834350585938, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.0712, + "grad_norm": 2.9438974857330322, + "kl": 0.009120941162109375, + "learning_rate": 1e-06, + "loss": -0.3272, + "num_tokens": 1233817.0, + "reward": -15.4970703125, + "reward_std": 7.003983974456787, + "rewards/rm_reward_func/mean": -15.4970703125, + "rewards/rm_reward_func/std": 7.536901950836182, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 380.03125, + "completions/mean_terminated_length": 230.4666748046875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.072, + "grad_norm": 1.8864916563034058, + "kl": 0.004978179931640625, + "learning_rate": 1e-06, + "loss": -0.0408, + "num_tokens": 1251186.0, + "reward": -11.05682373046875, + "reward_std": 7.857794761657715, + "rewards/rm_reward_func/mean": -11.05682373046875, + "rewards/rm_reward_func/std": 10.446527481079102, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 299.84375, + "completions/mean_terminated_length": 250.88462829589844, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.0728, + "grad_norm": 2.5616235733032227, + "kl": 0.016010284423828125, + "learning_rate": 1e-06, + "loss": 0.0582, + "num_tokens": 1266789.0, + "reward": -10.15771484375, + "reward_std": 5.755458831787109, + "rewards/rm_reward_func/mean": -10.15771484375, + "rewards/rm_reward_func/std": 8.519619941711426, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 239.96875, + "completions/mean_terminated_length": 189.59259033203125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.0736, + "grad_norm": 2.7049736976623535, + "kl": 0.01027679443359375, + "learning_rate": 1e-06, + "loss": -0.1509, + "num_tokens": 1278604.0, + "reward": -9.600372314453125, + "reward_std": 9.936087608337402, + "rewards/rm_reward_func/mean": -9.600372314453125, + "rewards/rm_reward_func/std": 12.005492210388184, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 187.125, + "completions/mean_terminated_length": 165.4666748046875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.0744, + "grad_norm": 2.5620620250701904, + "kl": 0.0337066650390625, + "learning_rate": 1e-06, + "loss": 0.1496, + "num_tokens": 1291784.0, + "reward": -9.21728515625, + "reward_std": 6.912812232971191, + "rewards/rm_reward_func/mean": -9.21728515625, + "rewards/rm_reward_func/std": 12.330233573913574, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 294.90625, + "completions/mean_terminated_length": 209.95652770996094, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.0752, + "grad_norm": 2.3015153408050537, + "kl": 0.00948333740234375, + "learning_rate": 1e-06, + "loss": -0.0876, + "num_tokens": 1305133.0, + "reward": -12.93408203125, + "reward_std": 6.502342700958252, + "rewards/rm_reward_func/mean": -12.93408203125, + "rewards/rm_reward_func/std": 7.836559772491455, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 266.375, + "completions/mean_terminated_length": 220.88888549804688, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.076, + "grad_norm": 2.146843433380127, + "kl": 0.01415252685546875, + "learning_rate": 1e-06, + "loss": -0.158, + "num_tokens": 1319017.0, + "reward": -13.2066650390625, + "reward_std": 6.863822937011719, + "rewards/rm_reward_func/mean": -13.2066650390625, + "rewards/rm_reward_func/std": 8.038741111755371, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 314.0625, + "completions/mean_terminated_length": 195.3000030517578, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.0768, + "grad_norm": 1.9424676895141602, + "kl": 0.00811004638671875, + "learning_rate": 1e-06, + "loss": 0.0104, + "num_tokens": 1332283.0, + "reward": -13.4290771484375, + "reward_std": 4.132850170135498, + "rewards/rm_reward_func/mean": -13.4290771484375, + "rewards/rm_reward_func/std": 7.634493350982666, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 391.96875, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0776, + "grad_norm": 1.7791144847869873, + "kl": 0.007869720458984375, + "learning_rate": 1e-06, + "loss": -0.1288, + "num_tokens": 1349730.0, + "reward": -15.774658203125, + "reward_std": 3.6583547592163086, + "rewards/rm_reward_func/mean": -15.774658203125, + "rewards/rm_reward_func/std": 10.711609840393066, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 320.53125, + "completions/mean_terminated_length": 205.65000915527344, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.0784, + "grad_norm": 2.1269357204437256, + "kl": 0.00975799560546875, + "learning_rate": 1e-06, + "loss": -0.0606, + "num_tokens": 1365091.0, + "reward": -16.1484375, + "reward_std": 6.60704231262207, + "rewards/rm_reward_func/mean": -16.1484375, + "rewards/rm_reward_func/std": 7.830225944519043, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 367.9375, + "completions/mean_terminated_length": 255.88888549804688, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.0792, + "grad_norm": 1.9840925931930542, + "kl": 0.005859375, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 1379489.0, + "reward": -14.4749755859375, + "reward_std": 4.2872114181518555, + "rewards/rm_reward_func/mean": -14.4749755859375, + "rewards/rm_reward_func/std": 9.158607482910156, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 159.68182373046875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.08, + "grad_norm": 4.316776275634766, + "kl": 0.01677703857421875, + "learning_rate": 1e-06, + "loss": -0.1475, + "num_tokens": 1394266.0, + "reward": -10.733642578125, + "reward_std": 5.6294755935668945, + "rewards/rm_reward_func/mean": -10.733642578125, + "rewards/rm_reward_func/std": 7.425699710845947, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 335.75, + "completions/mean_terminated_length": 286.3999938964844, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.0808, + "grad_norm": 1.7881006002426147, + "kl": 0.0097503662109375, + "learning_rate": 1e-06, + "loss": -0.096, + "num_tokens": 1409818.0, + "reward": -7.52093505859375, + "reward_std": 5.593201637268066, + "rewards/rm_reward_func/mean": -7.52093505859375, + "rewards/rm_reward_func/std": 7.504574775695801, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 312.625, + "completions/mean_terminated_length": 234.60870361328125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.0816, + "grad_norm": 2.146157741546631, + "kl": 0.006103515625, + "learning_rate": 1e-06, + "loss": -0.1829, + "num_tokens": 1423006.0, + "reward": -16.43505859375, + "reward_std": 9.570515632629395, + "rewards/rm_reward_func/mean": -16.43505859375, + "rewards/rm_reward_func/std": 10.863560676574707, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 248.96875, + "completions/mean_terminated_length": 200.25926208496094, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.0824, + "grad_norm": 2.147773265838623, + "kl": 0.01856231689453125, + "learning_rate": 1e-06, + "loss": -0.0924, + "num_tokens": 1438149.0, + "reward": -12.839599609375, + "reward_std": 7.8220062255859375, + "rewards/rm_reward_func/mean": -12.839599609375, + "rewards/rm_reward_func/std": 11.196205139160156, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 286.5625, + "completions/mean_terminated_length": 198.3478240966797, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.0832, + "grad_norm": 3.4694175720214844, + "kl": 0.01615142822265625, + "learning_rate": 1e-06, + "loss": 0.1468, + "num_tokens": 1451719.0, + "reward": -6.701904296875, + "reward_std": 4.378237724304199, + "rewards/rm_reward_func/mean": -6.701904296875, + "rewards/rm_reward_func/std": 5.643975257873535, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 325.90625, + "completions/mean_terminated_length": 198.57894897460938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.084, + "grad_norm": 4.255980491638184, + "kl": 0.01369476318359375, + "learning_rate": 1e-06, + "loss": 0.1302, + "num_tokens": 1464212.0, + "reward": -9.76995849609375, + "reward_std": 4.328599452972412, + "rewards/rm_reward_func/mean": -9.76995849609375, + "rewards/rm_reward_func/std": 7.710804462432861, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 397.65625, + "completions/mean_terminated_length": 230.53846740722656, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.0848, + "grad_norm": 1.7567524909973145, + "kl": 0.00659942626953125, + "learning_rate": 1e-06, + "loss": 0.0349, + "num_tokens": 1482321.0, + "reward": -7.1119384765625, + "reward_std": 4.052712917327881, + "rewards/rm_reward_func/mean": -7.1119384765625, + "rewards/rm_reward_func/std": 9.184833526611328, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 195.46875, + "completions/mean_terminated_length": 174.36666870117188, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.0856, + "grad_norm": 3.2434842586517334, + "kl": 0.023193359375, + "learning_rate": 1e-06, + "loss": 0.0347, + "num_tokens": 1491216.0, + "reward": -14.646728515625, + "reward_std": 6.7649335861206055, + "rewards/rm_reward_func/mean": -14.646728515625, + "rewards/rm_reward_func/std": 8.358160972595215, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 302.3125, + "completions/mean_terminated_length": 220.26087951660156, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.0864, + "grad_norm": 2.6229352951049805, + "kl": 0.01679229736328125, + "learning_rate": 1e-06, + "loss": 0.1256, + "num_tokens": 1507866.0, + "reward": -9.199478149414062, + "reward_std": 5.150927543640137, + "rewards/rm_reward_func/mean": -9.199478149414062, + "rewards/rm_reward_func/std": 9.794849395751953, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 161.59375, + "completions/mean_terminated_length": 150.29031372070312, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.0872, + "grad_norm": 4.822059631347656, + "kl": 0.02056884765625, + "learning_rate": 1e-06, + "loss": 0.0711, + "num_tokens": 1517229.0, + "reward": -4.12725830078125, + "reward_std": 5.21287202835083, + "rewards/rm_reward_func/mean": -4.12725830078125, + "rewards/rm_reward_func/std": 12.665528297424316, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 173.28125, + "completions/mean_terminated_length": 162.35482788085938, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.088, + "grad_norm": 3.1718881130218506, + "kl": 0.0259857177734375, + "learning_rate": 1e-06, + "loss": -0.0479, + "num_tokens": 1527830.0, + "reward": -1.9501953125, + "reward_std": 7.088983058929443, + "rewards/rm_reward_func/mean": -1.9501953125, + "rewards/rm_reward_func/std": 13.075862884521484, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 314.3125, + "completions/mean_terminated_length": 195.6999969482422, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.0888, + "grad_norm": 2.9083220958709717, + "kl": 0.00763702392578125, + "learning_rate": 1e-06, + "loss": -0.1952, + "num_tokens": 1539832.0, + "reward": -14.7685546875, + "reward_std": 5.08135986328125, + "rewards/rm_reward_func/mean": -14.7685546875, + "rewards/rm_reward_func/std": 6.53047513961792, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 275.3125, + "completions/mean_terminated_length": 220.69232177734375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.0896, + "grad_norm": 2.274761915206909, + "kl": 0.0136260986328125, + "learning_rate": 1e-06, + "loss": 0.0725, + "num_tokens": 1551002.0, + "reward": -13.492034912109375, + "reward_std": 6.166081428527832, + "rewards/rm_reward_func/mean": -13.492034912109375, + "rewards/rm_reward_func/std": 7.975113391876221, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 231.03125, + "completions/mean_terminated_length": 190.8928680419922, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.0904, + "grad_norm": 2.6368567943573, + "kl": 0.0188446044921875, + "learning_rate": 1e-06, + "loss": 0.1135, + "num_tokens": 1562987.0, + "reward": -10.3765869140625, + "reward_std": 7.1247758865356445, + "rewards/rm_reward_func/mean": -10.3765869140625, + "rewards/rm_reward_func/std": 10.783613204956055, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 338.59375, + "completions/mean_terminated_length": 259.7727355957031, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.0912, + "grad_norm": 1.9818639755249023, + "kl": 0.008087158203125, + "learning_rate": 1e-06, + "loss": -0.1639, + "num_tokens": 1577526.0, + "reward": -13.02783203125, + "reward_std": 5.1511993408203125, + "rewards/rm_reward_func/mean": -13.02783203125, + "rewards/rm_reward_func/std": 8.721351623535156, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 411.25, + "completions/mean_terminated_length": 281.71429443359375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.092, + "grad_norm": 1.7151145935058594, + "kl": 0.00791168212890625, + "learning_rate": 1e-06, + "loss": -0.0992, + "num_tokens": 1593094.0, + "reward": -16.25634765625, + "reward_std": 2.8228321075439453, + "rewards/rm_reward_func/mean": -16.25634765625, + "rewards/rm_reward_func/std": 9.191113471984863, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 292.84375, + "completions/mean_terminated_length": 99.47058868408203, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.0928, + "grad_norm": 5.23122501373291, + "kl": 0.05783843994140625, + "learning_rate": 1e-06, + "loss": -0.12, + "num_tokens": 1607281.0, + "reward": -14.076904296875, + "reward_std": 7.827744483947754, + "rewards/rm_reward_func/mean": -14.076904296875, + "rewards/rm_reward_func/std": 9.644954681396484, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 230.75, + "completions/mean_terminated_length": 190.57144165039062, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.0936, + "grad_norm": 3.4280595779418945, + "kl": 0.0171356201171875, + "learning_rate": 1e-06, + "loss": -0.2185, + "num_tokens": 1619497.0, + "reward": -11.8673095703125, + "reward_std": 6.000461101531982, + "rewards/rm_reward_func/mean": -11.8673095703125, + "rewards/rm_reward_func/std": 9.544291496276855, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 316.65625, + "completions/mean_terminated_length": 227.8636474609375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.0944, + "grad_norm": 2.618666172027588, + "kl": 0.0256805419921875, + "learning_rate": 1e-06, + "loss": 0.0873, + "num_tokens": 1638966.0, + "reward": -0.8642578125, + "reward_std": 4.0193023681640625, + "rewards/rm_reward_func/mean": -0.8642578125, + "rewards/rm_reward_func/std": 9.615918159484863, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 335.78125, + "completions/mean_terminated_length": 198.72222900390625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.0952, + "grad_norm": 2.747579574584961, + "kl": 0.0086669921875, + "learning_rate": 1e-06, + "loss": 0.0335, + "num_tokens": 1654991.0, + "reward": -10.556396484375, + "reward_std": 4.236153602600098, + "rewards/rm_reward_func/mean": -10.556396484375, + "rewards/rm_reward_func/std": 4.934238910675049, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 386.65625, + "completions/mean_terminated_length": 300.8947448730469, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.096, + "grad_norm": 1.5169398784637451, + "kl": 0.00820159912109375, + "learning_rate": 1e-06, + "loss": -0.0771, + "num_tokens": 1674180.0, + "reward": -7.51812744140625, + "reward_std": 8.49379825592041, + "rewards/rm_reward_func/mean": -7.51812744140625, + "rewards/rm_reward_func/std": 16.740442276000977, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 443.46875, + "completions/mean_terminated_length": 374.9375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.0968, + "grad_norm": 1.625799536705017, + "kl": 0.005413055419921875, + "learning_rate": 1e-06, + "loss": -0.0228, + "num_tokens": 1691907.0, + "reward": -7.287109375, + "reward_std": 5.570217132568359, + "rewards/rm_reward_func/mean": -7.287109375, + "rewards/rm_reward_func/std": 7.364434719085693, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 471.34375, + "completions/mean_terminated_length": 411.923095703125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.0976, + "grad_norm": 1.3326512575149536, + "kl": 0.00777435302734375, + "learning_rate": 1e-06, + "loss": -0.0402, + "num_tokens": 1711094.0, + "reward": -10.123046875, + "reward_std": 4.932340621948242, + "rewards/rm_reward_func/mean": -10.123046875, + "rewards/rm_reward_func/std": 11.740287780761719, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 255.875, + "completions/mean_terminated_length": 155.6521759033203, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.0984, + "grad_norm": 2.9619929790496826, + "kl": 0.0210113525390625, + "learning_rate": 1e-06, + "loss": -0.098, + "num_tokens": 1724690.0, + "reward": -16.21990966796875, + "reward_std": 5.218927383422852, + "rewards/rm_reward_func/mean": -16.21990966796875, + "rewards/rm_reward_func/std": 12.341767311096191, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 301.90625, + "completions/mean_terminated_length": 206.4091033935547, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.0992, + "grad_norm": 2.8766839504241943, + "kl": 0.033477783203125, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 1741615.0, + "reward": -11.9681396484375, + "reward_std": 6.536713600158691, + "rewards/rm_reward_func/mean": -11.9681396484375, + "rewards/rm_reward_func/std": 7.9947509765625, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 302.0, + "completions/mean_terminated_length": 116.70587921142578, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.1, + "grad_norm": 2.215245008468628, + "kl": 0.019420623779296875, + "learning_rate": 1e-06, + "loss": 0.3556, + "num_tokens": 1756991.0, + "reward": -16.614990234375, + "reward_std": 6.899796485900879, + "rewards/rm_reward_func/mean": -16.614990234375, + "rewards/rm_reward_func/std": 8.868526458740234, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 393.40625, + "completions/mean_terminated_length": 240.9285888671875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.1008, + "grad_norm": 2.0964515209198, + "kl": 0.020904541015625, + "learning_rate": 1e-06, + "loss": -0.1621, + "num_tokens": 1775780.0, + "reward": -14.6025390625, + "reward_std": 3.903238534927368, + "rewards/rm_reward_func/mean": -14.6025390625, + "rewards/rm_reward_func/std": 5.502429008483887, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 351.28125, + "completions/mean_terminated_length": 288.39129638671875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1016, + "grad_norm": 1.759840965270996, + "kl": 0.00807952880859375, + "learning_rate": 1e-06, + "loss": 0.0616, + "num_tokens": 1794061.0, + "reward": -1.0806732177734375, + "reward_std": 6.107678413391113, + "rewards/rm_reward_func/mean": -1.0806732177734375, + "rewards/rm_reward_func/std": 16.35316276550293, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 204.96875, + "completions/mean_terminated_length": 195.06451416015625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.1024, + "grad_norm": 2.658406972885132, + "kl": 0.05316162109375, + "learning_rate": 1e-06, + "loss": -0.2107, + "num_tokens": 1809396.0, + "reward": -3.645416259765625, + "reward_std": 8.27816390991211, + "rewards/rm_reward_func/mean": -3.645416259765625, + "rewards/rm_reward_func/std": 8.458089828491211, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 500.78125, + "completions/mean_terminated_length": 472.1111145019531, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.1032, + "grad_norm": 1.5516091585159302, + "kl": 0.0069122314453125, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 1828085.0, + "reward": -15.40673828125, + "reward_std": 3.3135409355163574, + "rewards/rm_reward_func/mean": -15.40673828125, + "rewards/rm_reward_func/std": 6.789217472076416, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 274.6875, + "completions/mean_terminated_length": 208.239990234375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.104, + "grad_norm": 2.1336140632629395, + "kl": 0.024105072021484375, + "learning_rate": 1e-06, + "loss": -0.0874, + "num_tokens": 1841803.0, + "reward": -10.004974365234375, + "reward_std": 6.054762840270996, + "rewards/rm_reward_func/mean": -10.004974365234375, + "rewards/rm_reward_func/std": 7.15830135345459, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 276.34375, + "completions/mean_terminated_length": 221.9615478515625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1048, + "grad_norm": 1.7219667434692383, + "kl": 0.011444091796875, + "learning_rate": 1e-06, + "loss": 0.1228, + "num_tokens": 1855510.0, + "reward": -10.95461654663086, + "reward_std": 6.540355205535889, + "rewards/rm_reward_func/mean": -10.95461654663086, + "rewards/rm_reward_func/std": 8.099757194519043, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 240.75, + "completions/mean_terminated_length": 178.1538543701172, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.1056, + "grad_norm": 2.4584615230560303, + "kl": 0.03453826904296875, + "learning_rate": 1e-06, + "loss": -0.1541, + "num_tokens": 1874118.0, + "reward": 4.7135009765625, + "reward_std": 6.0199971199035645, + "rewards/rm_reward_func/mean": 4.7135009765625, + "rewards/rm_reward_func/std": 14.076603889465332, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 330.03125, + "completions/mean_terminated_length": 234.71429443359375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1064, + "grad_norm": 2.0738425254821777, + "kl": 0.0190277099609375, + "learning_rate": 1e-06, + "loss": 0.1551, + "num_tokens": 1889815.0, + "reward": -8.329833984375, + "reward_std": 4.831732273101807, + "rewards/rm_reward_func/mean": -8.329833984375, + "rewards/rm_reward_func/std": 10.595080375671387, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 246.9375, + "completions/mean_terminated_length": 172.72000122070312, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1072, + "grad_norm": 2.7756316661834717, + "kl": 0.03842926025390625, + "learning_rate": 1e-06, + "loss": -0.0472, + "num_tokens": 1905173.0, + "reward": -4.789794921875, + "reward_std": 7.391261577606201, + "rewards/rm_reward_func/mean": -4.789794921875, + "rewards/rm_reward_func/std": 9.671632766723633, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 286.90625, + "completions/mean_terminated_length": 169.0, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.108, + "grad_norm": 2.302950859069824, + "kl": 0.01079559326171875, + "learning_rate": 1e-06, + "loss": 0.1128, + "num_tokens": 1916434.0, + "reward": -13.244140625, + "reward_std": 3.072427272796631, + "rewards/rm_reward_func/mean": -13.244140625, + "rewards/rm_reward_func/std": 5.928183078765869, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 272.1875, + "completions/mean_terminated_length": 178.3478240966797, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.1088, + "grad_norm": 2.4461910724639893, + "kl": 0.02993011474609375, + "learning_rate": 1e-06, + "loss": 0.1909, + "num_tokens": 1934320.0, + "reward": -7.178558349609375, + "reward_std": 8.683235168457031, + "rewards/rm_reward_func/mean": -7.178558349609375, + "rewards/rm_reward_func/std": 11.522482872009277, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 316.59375, + "completions/mean_terminated_length": 95.13333892822266, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.1096, + "grad_norm": 2.7466909885406494, + "kl": 0.01624298095703125, + "learning_rate": 1e-06, + "loss": 0.049, + "num_tokens": 1951635.0, + "reward": -13.73193359375, + "reward_std": 5.608217716217041, + "rewards/rm_reward_func/mean": -13.73193359375, + "rewards/rm_reward_func/std": 8.120978355407715, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 234.40625, + "completions/mean_terminated_length": 205.6896514892578, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.1104, + "grad_norm": 1.869767189025879, + "kl": 0.0456085205078125, + "learning_rate": 1e-06, + "loss": -0.0197, + "num_tokens": 1966680.0, + "reward": -4.6903076171875, + "reward_std": 5.595372200012207, + "rewards/rm_reward_func/mean": -4.6903076171875, + "rewards/rm_reward_func/std": 11.459237098693848, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 315.65625, + "completions/mean_terminated_length": 226.4091033935547, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.1112, + "grad_norm": 2.032897710800171, + "kl": 0.016193389892578125, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 1979893.0, + "reward": -10.984375, + "reward_std": 4.737551689147949, + "rewards/rm_reward_func/mean": -10.984375, + "rewards/rm_reward_func/std": 9.891593933105469, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 179.59375, + "completions/mean_terminated_length": 168.87095642089844, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.112, + "grad_norm": 3.2677183151245117, + "kl": 0.037811279296875, + "learning_rate": 1e-06, + "loss": -0.0647, + "num_tokens": 1990632.0, + "reward": -5.863128662109375, + "reward_std": 5.620069980621338, + "rewards/rm_reward_func/mean": -5.863128662109375, + "rewards/rm_reward_func/std": 9.011246681213379, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 298.0625, + "completions/mean_terminated_length": 200.8181915283203, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.1128, + "grad_norm": 2.2187294960021973, + "kl": 0.0217132568359375, + "learning_rate": 1e-06, + "loss": -0.0544, + "num_tokens": 2005762.0, + "reward": -6.532073974609375, + "reward_std": 6.6942291259765625, + "rewards/rm_reward_func/mean": -6.532073974609375, + "rewards/rm_reward_func/std": 8.323713302612305, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 261.5, + "completions/mean_terminated_length": 111.20000457763672, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1136, + "grad_norm": 2.119244337081909, + "kl": 0.025115966796875, + "learning_rate": 1e-06, + "loss": 0.0755, + "num_tokens": 2019826.0, + "reward": -15.26123046875, + "reward_std": 4.2206244468688965, + "rewards/rm_reward_func/mean": -15.26123046875, + "rewards/rm_reward_func/std": 10.204392433166504, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 342.09375, + "completions/mean_terminated_length": 240.15000915527344, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.1144, + "grad_norm": 2.050989866256714, + "kl": 0.029693603515625, + "learning_rate": 1e-06, + "loss": -0.0631, + "num_tokens": 2034901.0, + "reward": -12.080078125, + "reward_std": 5.899226188659668, + "rewards/rm_reward_func/mean": -12.080078125, + "rewards/rm_reward_func/std": 18.376056671142578, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 176.90625, + "completions/mean_terminated_length": 166.09677124023438, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1152, + "grad_norm": 4.184224605560303, + "kl": 0.0889129638671875, + "learning_rate": 1e-06, + "loss": 0.1725, + "num_tokens": 2049802.0, + "reward": -6.920013427734375, + "reward_std": 7.062530994415283, + "rewards/rm_reward_func/mean": -6.920013427734375, + "rewards/rm_reward_func/std": 8.924982070922852, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 306.5, + "completions/mean_terminated_length": 165.89474487304688, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.116, + "grad_norm": 2.105541944503784, + "kl": 0.01551055908203125, + "learning_rate": 1e-06, + "loss": 0.0679, + "num_tokens": 2062522.0, + "reward": -11.2633056640625, + "reward_std": 6.433701992034912, + "rewards/rm_reward_func/mean": -11.2633056640625, + "rewards/rm_reward_func/std": 7.466246128082275, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 209.90625, + "completions/mean_terminated_length": 109.20833587646484, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.1168, + "grad_norm": 3.3518717288970947, + "kl": 0.03389739990234375, + "learning_rate": 1e-06, + "loss": -0.0395, + "num_tokens": 2080159.0, + "reward": -5.5408935546875, + "reward_std": 5.426172256469727, + "rewards/rm_reward_func/mean": -5.5408935546875, + "rewards/rm_reward_func/std": 17.110858917236328, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 187.0, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1176, + "grad_norm": 3.0106844902038574, + "kl": 0.03423309326171875, + "learning_rate": 1e-06, + "loss": -0.0581, + "num_tokens": 2095719.0, + "reward": -7.80078125, + "reward_std": 4.077527046203613, + "rewards/rm_reward_func/mean": -7.80078125, + "rewards/rm_reward_func/std": 14.844619750976562, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 370.25, + "completions/mean_terminated_length": 330.55999755859375, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.1184, + "grad_norm": 1.8441935777664185, + "kl": 0.017578125, + "learning_rate": 1e-06, + "loss": 0.0872, + "num_tokens": 2113479.0, + "reward": -6.53564453125, + "reward_std": 6.958893775939941, + "rewards/rm_reward_func/mean": -6.53564453125, + "rewards/rm_reward_func/std": 10.368814468383789, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 286.09375, + "completions/mean_terminated_length": 197.69566345214844, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.1192, + "grad_norm": 2.4818575382232666, + "kl": 0.0302276611328125, + "learning_rate": 1e-06, + "loss": 0.0591, + "num_tokens": 2126994.0, + "reward": -12.82244873046875, + "reward_std": 4.638556957244873, + "rewards/rm_reward_func/mean": -12.82244873046875, + "rewards/rm_reward_func/std": 10.2630615234375, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 294.03125, + "completions/mean_terminated_length": 253.6666717529297, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.12, + "grad_norm": 2.569693088531494, + "kl": 0.0395355224609375, + "learning_rate": 1e-06, + "loss": -0.046, + "num_tokens": 2139211.0, + "reward": -11.6160888671875, + "reward_std": 6.76583194732666, + "rewards/rm_reward_func/mean": -11.6160888671875, + "rewards/rm_reward_func/std": 11.445720672607422, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 189.125, + "completions/mean_terminated_length": 155.72413635253906, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1208, + "grad_norm": 2.781977891921997, + "kl": 0.0484619140625, + "learning_rate": 1e-06, + "loss": 0.2609, + "num_tokens": 2153239.0, + "reward": -6.052490234375, + "reward_std": 6.7315874099731445, + "rewards/rm_reward_func/mean": -6.052490234375, + "rewards/rm_reward_func/std": 10.542217254638672, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 222.21875, + "completions/mean_terminated_length": 180.82144165039062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1216, + "grad_norm": 2.72622013092041, + "kl": 0.0226287841796875, + "learning_rate": 1e-06, + "loss": -0.0736, + "num_tokens": 2165806.0, + "reward": -6.59765625, + "reward_std": 4.983492374420166, + "rewards/rm_reward_func/mean": -6.59765625, + "rewards/rm_reward_func/std": 5.629057884216309, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 277.34375, + "completions/mean_terminated_length": 269.7742004394531, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.1224, + "grad_norm": 2.2262492179870605, + "kl": 0.042388916015625, + "learning_rate": 1e-06, + "loss": 0.012, + "num_tokens": 2183569.0, + "reward": 5.511383056640625, + "reward_std": 7.687152862548828, + "rewards/rm_reward_func/mean": 5.511383056640625, + "rewards/rm_reward_func/std": 13.129677772521973, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 246.5, + "completions/mean_terminated_length": 197.3333282470703, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.1232, + "grad_norm": 2.253307580947876, + "kl": 0.044769287109375, + "learning_rate": 1e-06, + "loss": 0.0174, + "num_tokens": 2196713.0, + "reward": -11.32904052734375, + "reward_std": 6.906871795654297, + "rewards/rm_reward_func/mean": -11.32904052734375, + "rewards/rm_reward_func/std": 9.20895767211914, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 310.65625, + "completions/mean_terminated_length": 243.5416717529297, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.124, + "grad_norm": 2.193121910095215, + "kl": 0.031219482421875, + "learning_rate": 1e-06, + "loss": 0.0718, + "num_tokens": 2213398.0, + "reward": -12.6551513671875, + "reward_std": 5.4895477294921875, + "rewards/rm_reward_func/mean": -12.6551513671875, + "rewards/rm_reward_func/std": 6.482039928436279, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 260.53125, + "completions/mean_terminated_length": 176.70834350585938, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.1248, + "grad_norm": 2.8202707767486572, + "kl": 0.046875, + "learning_rate": 1e-06, + "loss": 0.3188, + "num_tokens": 2229287.0, + "reward": -11.495651245117188, + "reward_std": 6.794215679168701, + "rewards/rm_reward_func/mean": -11.495651245117188, + "rewards/rm_reward_func/std": 11.453935623168945, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 383.4375, + "completions/mean_terminated_length": 306.3000183105469, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.1256, + "grad_norm": 1.8607921600341797, + "kl": 0.01166534423828125, + "learning_rate": 1e-06, + "loss": -0.0752, + "num_tokens": 2243397.0, + "reward": -6.07568359375, + "reward_std": 5.728580951690674, + "rewards/rm_reward_func/mean": -6.07568359375, + "rewards/rm_reward_func/std": 6.02807092666626, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 267.0625, + "completions/mean_terminated_length": 185.4166717529297, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.1264, + "grad_norm": 2.1332223415374756, + "kl": 0.0546417236328125, + "learning_rate": 1e-06, + "loss": 0.1248, + "num_tokens": 2263279.0, + "reward": -14.532852172851562, + "reward_std": 5.553691387176514, + "rewards/rm_reward_func/mean": -14.532852172851562, + "rewards/rm_reward_func/std": 7.50604772567749, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 359.21875, + "completions/mean_terminated_length": 289.7727355957031, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.1272, + "grad_norm": 1.9590290784835815, + "kl": 0.019989013671875, + "learning_rate": 1e-06, + "loss": 0.0572, + "num_tokens": 2281774.0, + "reward": -11.6298828125, + "reward_std": 3.71504545211792, + "rewards/rm_reward_func/mean": -11.6298828125, + "rewards/rm_reward_func/std": 14.31386661529541, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 191.09375, + "completions/mean_terminated_length": 145.25, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.128, + "grad_norm": 3.691321849822998, + "kl": 0.0278167724609375, + "learning_rate": 1e-06, + "loss": -0.1393, + "num_tokens": 2296249.0, + "reward": -10.544189453125, + "reward_std": 6.927615642547607, + "rewards/rm_reward_func/mean": -10.544189453125, + "rewards/rm_reward_func/std": 10.522303581237793, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 392.78125, + "completions/mean_terminated_length": 287.5882263183594, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.1288, + "grad_norm": 1.7210333347320557, + "kl": 0.0132598876953125, + "learning_rate": 1e-06, + "loss": 0.0496, + "num_tokens": 2313514.0, + "reward": -5.873046875, + "reward_std": 6.888821601867676, + "rewards/rm_reward_func/mean": -5.873046875, + "rewards/rm_reward_func/std": 12.062917709350586, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 349.5, + "completions/mean_terminated_length": 285.9130554199219, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.1296, + "grad_norm": 2.139791250228882, + "kl": 0.0163726806640625, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 2328362.0, + "reward": -11.768905639648438, + "reward_std": 3.496058464050293, + "rewards/rm_reward_func/mean": -11.768905639648438, + "rewards/rm_reward_func/std": 8.156883239746094, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 216.25, + "completions/mean_terminated_length": 196.53334045410156, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.1304, + "grad_norm": 3.753201961517334, + "kl": 0.0533599853515625, + "learning_rate": 1e-06, + "loss": 0.044, + "num_tokens": 2341090.0, + "reward": -5.4664306640625, + "reward_std": 4.309957504272461, + "rewards/rm_reward_func/mean": -5.4664306640625, + "rewards/rm_reward_func/std": 8.21058177947998, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 325.65625, + "completions/mean_terminated_length": 291.1481628417969, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.1312, + "grad_norm": 1.735277771949768, + "kl": 0.00875091552734375, + "learning_rate": 1e-06, + "loss": 0.0537, + "num_tokens": 2354151.0, + "reward": -12.783935546875, + "reward_std": 6.066422462463379, + "rewards/rm_reward_func/mean": -12.783935546875, + "rewards/rm_reward_func/std": 7.095264911651611, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 248.96875, + "completions/mean_terminated_length": 248.96875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.132, + "grad_norm": 2.5273311138153076, + "kl": 0.0210723876953125, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 2363486.0, + "reward": -13.744873046875, + "reward_std": 4.9974470138549805, + "rewards/rm_reward_func/mean": -13.744873046875, + "rewards/rm_reward_func/std": 7.1511549949646, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 186.90625, + "completions/mean_terminated_length": 176.41934204101562, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1328, + "grad_norm": 2.773197889328003, + "kl": 0.0198974609375, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 2372755.0, + "reward": -16.49951171875, + "reward_std": 4.601117134094238, + "rewards/rm_reward_func/mean": -16.49951171875, + "rewards/rm_reward_func/std": 5.705376625061035, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 191.03125, + "completions/mean_terminated_length": 169.6333465576172, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.1336, + "grad_norm": 2.672168016433716, + "kl": 0.041656494140625, + "learning_rate": 1e-06, + "loss": 0.3662, + "num_tokens": 2386572.0, + "reward": -11.91943359375, + "reward_std": 5.190067768096924, + "rewards/rm_reward_func/mean": -11.91943359375, + "rewards/rm_reward_func/std": 5.921402454376221, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 199.875, + "completions/mean_terminated_length": 199.875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.1344, + "grad_norm": 2.2194015979766846, + "kl": 0.038055419921875, + "learning_rate": 1e-06, + "loss": -0.0667, + "num_tokens": 2403512.0, + "reward": -7.9786376953125, + "reward_std": 7.445808410644531, + "rewards/rm_reward_func/mean": -7.9786376953125, + "rewards/rm_reward_func/std": 12.615328788757324, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 196.46875, + "completions/mean_terminated_length": 175.433349609375, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.1352, + "grad_norm": 3.7039167881011963, + "kl": 0.0190277099609375, + "learning_rate": 1e-06, + "loss": -0.1102, + "num_tokens": 2412351.0, + "reward": -16.447265625, + "reward_std": 5.503678321838379, + "rewards/rm_reward_func/mean": -16.447265625, + "rewards/rm_reward_func/std": 6.798795700073242, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 329.46875, + "completions/mean_terminated_length": 146.9375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.136, + "grad_norm": 2.3713736534118652, + "kl": 0.031978607177734375, + "learning_rate": 1e-06, + "loss": 0.247, + "num_tokens": 2427542.0, + "reward": -12.30126953125, + "reward_std": 5.643823623657227, + "rewards/rm_reward_func/mean": -12.30126953125, + "rewards/rm_reward_func/std": 7.329387187957764, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 405.09375, + "completions/mean_terminated_length": 226.9166717529297, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1368, + "grad_norm": 1.409492015838623, + "kl": 0.009002685546875, + "learning_rate": 1e-06, + "loss": -0.0526, + "num_tokens": 2444233.0, + "reward": -13.084083557128906, + "reward_std": 7.102007865905762, + "rewards/rm_reward_func/mean": -13.084083557128906, + "rewards/rm_reward_func/std": 10.233674049377441, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 303.25, + "completions/mean_terminated_length": 94.5, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.1376, + "grad_norm": 4.074381351470947, + "kl": 0.028717041015625, + "learning_rate": 1e-06, + "loss": -0.1125, + "num_tokens": 2458313.0, + "reward": -20.6953125, + "reward_std": 2.867544651031494, + "rewards/rm_reward_func/mean": -20.6953125, + "rewards/rm_reward_func/std": 4.610558032989502, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 314.0, + "completions/mean_terminated_length": 258.55999755859375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.1384, + "grad_norm": 2.307784080505371, + "kl": 0.017913818359375, + "learning_rate": 1e-06, + "loss": -0.1953, + "num_tokens": 2473577.0, + "reward": -8.60296630859375, + "reward_std": 10.378360748291016, + "rewards/rm_reward_func/mean": -8.60296630859375, + "rewards/rm_reward_func/std": 14.28746509552002, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 215.21875, + "completions/mean_terminated_length": 195.433349609375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.1392, + "grad_norm": 2.6569936275482178, + "kl": 0.030517578125, + "learning_rate": 1e-06, + "loss": -0.1743, + "num_tokens": 2487280.0, + "reward": -7.1887969970703125, + "reward_std": 6.780439376831055, + "rewards/rm_reward_func/mean": -7.1887969970703125, + "rewards/rm_reward_func/std": 12.155976295471191, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 408.75, + "completions/mean_terminated_length": 291.73333740234375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.14, + "grad_norm": 1.5760732889175415, + "kl": 0.0154266357421875, + "learning_rate": 1e-06, + "loss": 0.0539, + "num_tokens": 2509080.0, + "reward": -14.091552734375, + "reward_std": 6.694252967834473, + "rewards/rm_reward_func/mean": -14.091552734375, + "rewards/rm_reward_func/std": 9.908426284790039, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 266.375, + "completions/mean_terminated_length": 220.88888549804688, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1408, + "grad_norm": 2.321152687072754, + "kl": 0.0243072509765625, + "learning_rate": 1e-06, + "loss": -0.1175, + "num_tokens": 2521332.0, + "reward": -7.186553955078125, + "reward_std": 6.190032005310059, + "rewards/rm_reward_func/mean": -7.186553955078125, + "rewards/rm_reward_func/std": 7.74464225769043, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 154.90625, + "completions/mean_terminated_length": 143.3870849609375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1416, + "grad_norm": 2.6954550743103027, + "kl": 0.021881103515625, + "learning_rate": 1e-06, + "loss": 0.3363, + "num_tokens": 2530113.0, + "reward": -6.23687744140625, + "reward_std": 4.889453887939453, + "rewards/rm_reward_func/mean": -6.23687744140625, + "rewards/rm_reward_func/std": 9.479598045349121, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 134.09375, + "completions/mean_terminated_length": 108.90000915527344, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.1424, + "grad_norm": 2.9601638317108154, + "kl": 0.039764404296875, + "learning_rate": 1e-06, + "loss": 0.0863, + "num_tokens": 2542532.0, + "reward": -3.060302734375, + "reward_std": 4.652748107910156, + "rewards/rm_reward_func/mean": -3.060302734375, + "rewards/rm_reward_func/std": 13.32276439666748, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 238.8125, + "completions/mean_terminated_length": 147.75, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.1432, + "grad_norm": 2.9207701683044434, + "kl": 0.0250091552734375, + "learning_rate": 1e-06, + "loss": -0.1142, + "num_tokens": 2556566.0, + "reward": -8.1273193359375, + "reward_std": 4.876923561096191, + "rewards/rm_reward_func/mean": -8.1273193359375, + "rewards/rm_reward_func/std": 14.315848350524902, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 218.3125, + "completions/mean_terminated_length": 176.35714721679688, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.144, + "grad_norm": 3.184692859649658, + "kl": 0.034332275390625, + "learning_rate": 1e-06, + "loss": 0.0914, + "num_tokens": 2568968.0, + "reward": -6.57763671875, + "reward_std": 6.7350006103515625, + "rewards/rm_reward_func/mean": -6.57763671875, + "rewards/rm_reward_func/std": 9.54586410522461, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 334.15625, + "completions/mean_terminated_length": 253.3181915283203, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.1448, + "grad_norm": 2.127804756164551, + "kl": 0.0266265869140625, + "learning_rate": 1e-06, + "loss": -0.1381, + "num_tokens": 2585293.0, + "reward": -12.908086776733398, + "reward_std": 3.882758140563965, + "rewards/rm_reward_func/mean": -12.908086776733398, + "rewards/rm_reward_func/std": 7.697563648223877, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 307.5625, + "completions/mean_terminated_length": 148.55555725097656, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.1456, + "grad_norm": 2.737328290939331, + "kl": 0.0284576416015625, + "learning_rate": 1e-06, + "loss": 0.1092, + "num_tokens": 2602639.0, + "reward": -2.4775390625, + "reward_std": 7.57058048248291, + "rewards/rm_reward_func/mean": -2.4775390625, + "rewards/rm_reward_func/std": 15.709602355957031, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 247.40625, + "completions/mean_terminated_length": 186.34616088867188, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.1464, + "grad_norm": 2.0840070247650146, + "kl": 0.017913818359375, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 2616756.0, + "reward": -11.5029296875, + "reward_std": 6.805416584014893, + "rewards/rm_reward_func/mean": -11.5029296875, + "rewards/rm_reward_func/std": 10.402040481567383, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 220.59375, + "completions/mean_terminated_length": 211.19354248046875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1472, + "grad_norm": 2.4085540771484375, + "kl": 0.020263671875, + "learning_rate": 1e-06, + "loss": 0.3972, + "num_tokens": 2625399.0, + "reward": -10.47216796875, + "reward_std": 5.67173433303833, + "rewards/rm_reward_func/mean": -10.47216796875, + "rewards/rm_reward_func/std": 7.354183197021484, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 334.125, + "completions/mean_terminated_length": 284.32000732421875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.148, + "grad_norm": 1.5594758987426758, + "kl": 0.019866943359375, + "learning_rate": 1e-06, + "loss": -0.0565, + "num_tokens": 2640363.0, + "reward": -3.104736328125, + "reward_std": 12.37482738494873, + "rewards/rm_reward_func/mean": -3.104736328125, + "rewards/rm_reward_func/std": 17.64823341369629, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 345.71875, + "completions/mean_terminated_length": 231.94737243652344, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.1488, + "grad_norm": 1.8342353105545044, + "kl": 0.036224365234375, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 2659930.0, + "reward": -14.759765625, + "reward_std": 4.9378662109375, + "rewards/rm_reward_func/mean": -14.759765625, + "rewards/rm_reward_func/std": 8.639665603637695, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 387.78125, + "completions/mean_terminated_length": 331.3182067871094, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.1496, + "grad_norm": 1.8258551359176636, + "kl": 0.0152435302734375, + "learning_rate": 1e-06, + "loss": -0.0234, + "num_tokens": 2677363.0, + "reward": -9.47021484375, + "reward_std": 4.15437126159668, + "rewards/rm_reward_func/mean": -9.47021484375, + "rewards/rm_reward_func/std": 4.541082382202148, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 411.40625, + "completions/mean_terminated_length": 297.4000244140625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.1504, + "grad_norm": 1.693354845046997, + "kl": 0.01218414306640625, + "learning_rate": 1e-06, + "loss": -0.1033, + "num_tokens": 2694344.0, + "reward": -10.7547607421875, + "reward_std": 4.365148544311523, + "rewards/rm_reward_func/mean": -10.7547607421875, + "rewards/rm_reward_func/std": 11.428670883178711, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 254.0, + "completions/mean_terminated_length": 227.3103485107422, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.1512, + "grad_norm": 1.9936155080795288, + "kl": 0.049560546875, + "learning_rate": 1e-06, + "loss": -0.0708, + "num_tokens": 2712184.0, + "reward": 1.21685791015625, + "reward_std": 9.429040908813477, + "rewards/rm_reward_func/mean": 1.21685791015625, + "rewards/rm_reward_func/std": 12.698073387145996, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 257.78125, + "completions/mean_terminated_length": 210.70370483398438, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.152, + "grad_norm": 2.3250443935394287, + "kl": 0.0221099853515625, + "learning_rate": 1e-06, + "loss": -0.213, + "num_tokens": 2722497.0, + "reward": -15.48681640625, + "reward_std": 6.284116744995117, + "rewards/rm_reward_func/mean": -15.48681640625, + "rewards/rm_reward_func/std": 6.530057430267334, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 340.65625, + "completions/mean_terminated_length": 250.90476989746094, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.1528, + "grad_norm": 3.0360898971557617, + "kl": 0.0372772216796875, + "learning_rate": 1e-06, + "loss": 0.12, + "num_tokens": 2735822.0, + "reward": -10.4608154296875, + "reward_std": 5.204061508178711, + "rewards/rm_reward_func/mean": -10.4608154296875, + "rewards/rm_reward_func/std": 13.650362968444824, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 278.0, + "completions/mean_terminated_length": 186.43478393554688, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.1536, + "grad_norm": 3.131574869155884, + "kl": 0.02890777587890625, + "learning_rate": 1e-06, + "loss": -0.0877, + "num_tokens": 2752006.0, + "reward": -7.861541748046875, + "reward_std": 5.182844161987305, + "rewards/rm_reward_func/mean": -7.861541748046875, + "rewards/rm_reward_func/std": 14.873505592346191, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 359.40625, + "completions/mean_terminated_length": 299.6956481933594, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.1544, + "grad_norm": 1.8317567110061646, + "kl": 0.0149993896484375, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 2767147.0, + "reward": -2.938201904296875, + "reward_std": 6.259884357452393, + "rewards/rm_reward_func/mean": -2.938201904296875, + "rewards/rm_reward_func/std": 9.629374504089355, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 361.375, + "completions/mean_terminated_length": 282.4761962890625, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.1552, + "grad_norm": 2.187994956970215, + "kl": 0.0270233154296875, + "learning_rate": 1e-06, + "loss": -0.0489, + "num_tokens": 2784991.0, + "reward": -6.2962646484375, + "reward_std": 4.145152568817139, + "rewards/rm_reward_func/mean": -6.2962646484375, + "rewards/rm_reward_func/std": 6.6450910568237305, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 365.84375, + "completions/mean_terminated_length": 200.20001220703125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.156, + "grad_norm": 1.7896136045455933, + "kl": 0.0216827392578125, + "learning_rate": 1e-06, + "loss": -0.0193, + "num_tokens": 2801498.0, + "reward": -12.621337890625, + "reward_std": 3.6846072673797607, + "rewards/rm_reward_func/mean": -12.621337890625, + "rewards/rm_reward_func/std": 13.76012134552002, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 360.125, + "completions/mean_terminated_length": 280.5714416503906, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1568, + "grad_norm": 2.624439001083374, + "kl": 0.0124359130859375, + "learning_rate": 1e-06, + "loss": -0.1602, + "num_tokens": 2816070.0, + "reward": -7.5452880859375, + "reward_std": 5.356627464294434, + "rewards/rm_reward_func/mean": -7.5452880859375, + "rewards/rm_reward_func/std": 10.857898712158203, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 462.78125, + "completions/mean_terminated_length": 315.125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.1576, + "grad_norm": 1.4865249395370483, + "kl": 0.01401519775390625, + "learning_rate": 1e-06, + "loss": 0.0446, + "num_tokens": 2836343.0, + "reward": -17.361328125, + "reward_std": 2.944314956665039, + "rewards/rm_reward_func/mean": -17.361328125, + "rewards/rm_reward_func/std": 4.919506549835205, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 323.40625, + "completions/mean_terminated_length": 194.36842346191406, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.1584, + "grad_norm": 2.7904696464538574, + "kl": 0.02349853515625, + "learning_rate": 1e-06, + "loss": 0.054, + "num_tokens": 2850596.0, + "reward": -7.79931640625, + "reward_std": 4.384397506713867, + "rewards/rm_reward_func/mean": -7.79931640625, + "rewards/rm_reward_func/std": 5.952577590942383, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 356.09375, + "completions/mean_terminated_length": 285.227294921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 0.1592, + "grad_norm": 4.847655773162842, + "kl": 0.0278778076171875, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 2865799.0, + "reward": -9.28839111328125, + "reward_std": 5.713525772094727, + "rewards/rm_reward_func/mean": -9.28839111328125, + "rewards/rm_reward_func/std": 6.336117744445801, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 380.09375, + "completions/mean_terminated_length": 277.5, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.16, + "grad_norm": 1.70150625705719, + "kl": 0.0133056640625, + "learning_rate": 1e-06, + "loss": -0.019, + "num_tokens": 2881554.0, + "reward": -9.2530517578125, + "reward_std": 4.449319362640381, + "rewards/rm_reward_func/mean": -9.2530517578125, + "rewards/rm_reward_func/std": 12.349023818969727, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 287.0, + "completions/mean_terminated_length": 184.72727966308594, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.1608, + "grad_norm": 2.765225887298584, + "kl": 0.0222930908203125, + "learning_rate": 1e-06, + "loss": -0.0387, + "num_tokens": 2893602.0, + "reward": -12.2939453125, + "reward_std": 5.316097259521484, + "rewards/rm_reward_func/mean": -12.2939453125, + "rewards/rm_reward_func/std": 6.76662015914917, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 171.6875, + "completions/mean_terminated_length": 160.7096710205078, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1616, + "grad_norm": 3.027146339416504, + "kl": 0.085205078125, + "learning_rate": 1e-06, + "loss": -0.0947, + "num_tokens": 2908896.0, + "reward": 2.9041290283203125, + "reward_std": 6.956555366516113, + "rewards/rm_reward_func/mean": 2.9041290283203125, + "rewards/rm_reward_func/std": 9.795485496520996, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 306.65625, + "completions/mean_terminated_length": 277.3214416503906, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.1624, + "grad_norm": 2.1454830169677734, + "kl": 0.02978515625, + "learning_rate": 1e-06, + "loss": -0.0813, + "num_tokens": 2925037.0, + "reward": -14.658203125, + "reward_std": 3.523207664489746, + "rewards/rm_reward_func/mean": -14.658203125, + "rewards/rm_reward_func/std": 6.658777236938477, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 200.875, + "completions/mean_terminated_length": 180.1333465576172, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.1632, + "grad_norm": 2.497971296310425, + "kl": 0.0396728515625, + "learning_rate": 1e-06, + "loss": -0.1702, + "num_tokens": 2939729.0, + "reward": 2.259765625, + "reward_std": 4.65114688873291, + "rewards/rm_reward_func/mean": 2.259765625, + "rewards/rm_reward_func/std": 13.810540199279785, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 368.375, + "completions/mean_terminated_length": 293.1428527832031, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.164, + "grad_norm": 1.8702443838119507, + "kl": 0.0179443359375, + "learning_rate": 1e-06, + "loss": -0.0335, + "num_tokens": 2954261.0, + "reward": -4.393974304199219, + "reward_std": 6.49305534362793, + "rewards/rm_reward_func/mean": -4.393974304199219, + "rewards/rm_reward_func/std": 12.690206527709961, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 284.0625, + "completions/mean_terminated_length": 231.4615478515625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.1648, + "grad_norm": 2.3136606216430664, + "kl": 0.0377349853515625, + "learning_rate": 1e-06, + "loss": -0.0303, + "num_tokens": 2967591.0, + "reward": -5.62640380859375, + "reward_std": 6.597134113311768, + "rewards/rm_reward_func/mean": -5.62640380859375, + "rewards/rm_reward_func/std": 11.036033630371094, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 341.6875, + "completions/mean_terminated_length": 284.91668701171875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.1656, + "grad_norm": 1.844236969947815, + "kl": 0.0184478759765625, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 2980677.0, + "reward": -10.12890625, + "reward_std": 6.0780534744262695, + "rewards/rm_reward_func/mean": -10.12890625, + "rewards/rm_reward_func/std": 9.617057800292969, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 317.25, + "completions/mean_terminated_length": 272.3077087402344, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.1664, + "grad_norm": 1.9264299869537354, + "kl": 0.0269317626953125, + "learning_rate": 1e-06, + "loss": 0.074, + "num_tokens": 2996325.0, + "reward": -10.135498046875, + "reward_std": 6.745743274688721, + "rewards/rm_reward_func/mean": -10.135498046875, + "rewards/rm_reward_func/std": 7.226227760314941, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 282.1875, + "completions/mean_terminated_length": 258.4137878417969, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.1672, + "grad_norm": 2.8933424949645996, + "kl": 0.0302581787109375, + "learning_rate": 1e-06, + "loss": 0.1133, + "num_tokens": 3009603.0, + "reward": -2.920623779296875, + "reward_std": 5.987672805786133, + "rewards/rm_reward_func/mean": -2.920623779296875, + "rewards/rm_reward_func/std": 8.611761093139648, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 335.25, + "completions/mean_terminated_length": 214.3157958984375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.168, + "grad_norm": 1.9518332481384277, + "kl": 0.02947235107421875, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 3028139.0, + "reward": -13.851043701171875, + "reward_std": 4.232006549835205, + "rewards/rm_reward_func/mean": -13.851043701171875, + "rewards/rm_reward_func/std": 8.888598442077637, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 348.25, + "completions/mean_terminated_length": 262.4761962890625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.1688, + "grad_norm": 2.0551650524139404, + "kl": 0.0159454345703125, + "learning_rate": 1e-06, + "loss": -0.1365, + "num_tokens": 3043139.0, + "reward": -19.849609375, + "reward_std": 4.200689315795898, + "rewards/rm_reward_func/mean": -19.849609375, + "rewards/rm_reward_func/std": 4.785938262939453, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 233.71875, + "completions/mean_terminated_length": 224.74192810058594, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.1696, + "grad_norm": 2.2041149139404297, + "kl": 0.0253448486328125, + "learning_rate": 1e-06, + "loss": -0.0295, + "num_tokens": 3052634.0, + "reward": -8.964599609375, + "reward_std": 5.494900703430176, + "rewards/rm_reward_func/mean": -8.964599609375, + "rewards/rm_reward_func/std": 8.429585456848145, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 344.8125, + "completions/mean_terminated_length": 177.625, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1704, + "grad_norm": 1.9921987056732178, + "kl": 0.03647613525390625, + "learning_rate": 1e-06, + "loss": 0.0642, + "num_tokens": 3069956.0, + "reward": -10.4140625, + "reward_std": 4.13259744644165, + "rewards/rm_reward_func/mean": -10.4140625, + "rewards/rm_reward_func/std": 14.571422576904297, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 239.09375, + "completions/mean_terminated_length": 210.86207580566406, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.1712, + "grad_norm": 2.77146053314209, + "kl": 0.044525146484375, + "learning_rate": 1e-06, + "loss": 0.1287, + "num_tokens": 3082231.0, + "reward": -13.8243408203125, + "reward_std": 4.768612384796143, + "rewards/rm_reward_func/mean": -13.8243408203125, + "rewards/rm_reward_func/std": 8.846455574035645, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 337.875, + "completions/mean_terminated_length": 184.23529052734375, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.172, + "grad_norm": 2.6639955043792725, + "kl": 0.0299835205078125, + "learning_rate": 1e-06, + "loss": 0.0256, + "num_tokens": 3096803.0, + "reward": -13.962890625, + "reward_std": 3.681051254272461, + "rewards/rm_reward_func/mean": -13.962890625, + "rewards/rm_reward_func/std": 6.399148464202881, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 407.0625, + "completions/mean_terminated_length": 372.0833435058594, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.1728, + "grad_norm": 1.8944278955459595, + "kl": 0.0150146484375, + "learning_rate": 1e-06, + "loss": -0.0914, + "num_tokens": 3111989.0, + "reward": -4.1859130859375, + "reward_std": 5.547937393188477, + "rewards/rm_reward_func/mean": -4.1859130859375, + "rewards/rm_reward_func/std": 7.873926162719727, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 309.375, + "completions/mean_terminated_length": 230.0869598388672, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.1736, + "grad_norm": 1.9232604503631592, + "kl": 0.0459136962890625, + "learning_rate": 1e-06, + "loss": -0.0354, + "num_tokens": 3128521.0, + "reward": -4.9200439453125, + "reward_std": 7.50829553604126, + "rewards/rm_reward_func/mean": -4.9200439453125, + "rewards/rm_reward_func/std": 19.806100845336914, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 218.90625, + "completions/mean_terminated_length": 188.58621215820312, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.1744, + "grad_norm": 2.4838643074035645, + "kl": 0.063140869140625, + "learning_rate": 1e-06, + "loss": -0.0556, + "num_tokens": 3141662.0, + "reward": -5.502475738525391, + "reward_std": 8.156133651733398, + "rewards/rm_reward_func/mean": -5.502475738525391, + "rewards/rm_reward_func/std": 8.874342918395996, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 404.84375, + "completions/mean_terminated_length": 310.29412841796875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.1752, + "grad_norm": 1.6384692192077637, + "kl": 0.030029296875, + "learning_rate": 1e-06, + "loss": -0.0346, + "num_tokens": 3160969.0, + "reward": -12.671340942382812, + "reward_std": 4.365384578704834, + "rewards/rm_reward_func/mean": -12.671340942382812, + "rewards/rm_reward_func/std": 5.337032318115234, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 312.28125, + "completions/mean_terminated_length": 156.94444274902344, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.176, + "grad_norm": 3.26279616355896, + "kl": 0.02972412109375, + "learning_rate": 1e-06, + "loss": 0.6511, + "num_tokens": 3172522.0, + "reward": -9.061149597167969, + "reward_std": 5.750113487243652, + "rewards/rm_reward_func/mean": -9.061149597167969, + "rewards/rm_reward_func/std": 10.637591361999512, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 287.90625, + "completions/mean_terminated_length": 225.1599884033203, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.1768, + "grad_norm": 2.2829983234405518, + "kl": 0.0205841064453125, + "learning_rate": 1e-06, + "loss": 0.0837, + "num_tokens": 3186431.0, + "reward": -14.95361328125, + "reward_std": 5.428106307983398, + "rewards/rm_reward_func/mean": -14.95361328125, + "rewards/rm_reward_func/std": 6.8203444480896, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 318.84375, + "completions/mean_terminated_length": 148.41175842285156, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.1776, + "grad_norm": 2.4689595699310303, + "kl": 0.05229949951171875, + "learning_rate": 1e-06, + "loss": -0.1073, + "num_tokens": 3203282.0, + "reward": -9.2550048828125, + "reward_std": 4.454341411590576, + "rewards/rm_reward_func/mean": -9.2550048828125, + "rewards/rm_reward_func/std": 12.4971923828125, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 225.5, + "completions/mean_terminated_length": 206.40000915527344, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.1784, + "grad_norm": 2.553861141204834, + "kl": 0.0411834716796875, + "learning_rate": 1e-06, + "loss": 0.2421, + "num_tokens": 3221778.0, + "reward": -5.6124267578125, + "reward_std": 5.90608549118042, + "rewards/rm_reward_func/mean": -5.6124267578125, + "rewards/rm_reward_func/std": 11.847006797790527, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 256.5, + "completions/mean_terminated_length": 184.95999145507812, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.1792, + "grad_norm": 2.3087923526763916, + "kl": 0.035736083984375, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 3236194.0, + "reward": 9.390869140625, + "reward_std": 7.845202445983887, + "rewards/rm_reward_func/mean": 9.390869140625, + "rewards/rm_reward_func/std": 17.698915481567383, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 319.5, + "completions/mean_terminated_length": 169.7777862548828, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.18, + "grad_norm": 2.6516273021698, + "kl": 0.028228759765625, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 3248626.0, + "reward": -14.34375, + "reward_std": 4.134519577026367, + "rewards/rm_reward_func/mean": -14.34375, + "rewards/rm_reward_func/std": 5.796331882476807, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 288.6875, + "completions/mean_terminated_length": 237.1538543701172, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.1808, + "grad_norm": 2.208421230316162, + "kl": 0.021697998046875, + "learning_rate": 1e-06, + "loss": -0.0535, + "num_tokens": 3261608.0, + "reward": -7.2805328369140625, + "reward_std": 5.929561614990234, + "rewards/rm_reward_func/mean": -7.2805328369140625, + "rewards/rm_reward_func/std": 11.348499298095703, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 427.125, + "completions/mean_terminated_length": 318.0, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.1816, + "grad_norm": 1.6376084089279175, + "kl": 0.01723480224609375, + "learning_rate": 1e-06, + "loss": -0.0495, + "num_tokens": 3278964.0, + "reward": -10.26123046875, + "reward_std": 5.910520076751709, + "rewards/rm_reward_func/mean": -10.26123046875, + "rewards/rm_reward_func/std": 6.3919758796691895, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 286.46875, + "completions/mean_terminated_length": 263.137939453125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.1824, + "grad_norm": 1.7282941341400146, + "kl": 0.024383544921875, + "learning_rate": 1e-06, + "loss": 0.0332, + "num_tokens": 3295739.0, + "reward": -2.4454345703125, + "reward_std": 8.56084156036377, + "rewards/rm_reward_func/mean": -2.4454345703125, + "rewards/rm_reward_func/std": 11.857633590698242, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 258.71875, + "completions/mean_terminated_length": 250.5483856201172, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.1832, + "grad_norm": 2.416971206665039, + "kl": 0.028839111328125, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 3311298.0, + "reward": 0.365478515625, + "reward_std": 5.829031467437744, + "rewards/rm_reward_func/mean": 0.365478515625, + "rewards/rm_reward_func/std": 14.736551284790039, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 407.34375, + "completions/mean_terminated_length": 302.6875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.184, + "grad_norm": 1.7603254318237305, + "kl": 0.0258331298828125, + "learning_rate": 1e-06, + "loss": -0.075, + "num_tokens": 3330245.0, + "reward": -7.0024261474609375, + "reward_std": 5.309955596923828, + "rewards/rm_reward_func/mean": -7.0024261474609375, + "rewards/rm_reward_func/std": 11.73162841796875, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 346.375, + "completions/mean_terminated_length": 180.75, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.1848, + "grad_norm": 1.8091561794281006, + "kl": 0.0192108154296875, + "learning_rate": 1e-06, + "loss": -0.0868, + "num_tokens": 3343289.0, + "reward": -11.54052734375, + "reward_std": 3.743626832962036, + "rewards/rm_reward_func/mean": -11.54052734375, + "rewards/rm_reward_func/std": 8.75136661529541, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 337.6875, + "completions/mean_terminated_length": 305.40740966796875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.1856, + "grad_norm": 1.8760968446731567, + "kl": 0.02587890625, + "learning_rate": 1e-06, + "loss": -0.0908, + "num_tokens": 3359031.0, + "reward": -5.57666015625, + "reward_std": 5.031200408935547, + "rewards/rm_reward_func/mean": -5.57666015625, + "rewards/rm_reward_func/std": 14.13427448272705, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 200.6875, + "completions/mean_terminated_length": 143.0370330810547, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.1864, + "grad_norm": 2.716813325881958, + "kl": 0.0333251953125, + "learning_rate": 1e-06, + "loss": 0.464, + "num_tokens": 3368797.0, + "reward": -4.90576171875, + "reward_std": 10.027959823608398, + "rewards/rm_reward_func/mean": -4.90576171875, + "rewards/rm_reward_func/std": 9.909440994262695, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 359.03125, + "completions/mean_terminated_length": 278.9047546386719, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.1872, + "grad_norm": 1.756624698638916, + "kl": 0.01898193359375, + "learning_rate": 1e-06, + "loss": 0.0578, + "num_tokens": 3384686.0, + "reward": -4.2568359375, + "reward_std": 6.863399982452393, + "rewards/rm_reward_func/mean": -4.2568359375, + "rewards/rm_reward_func/std": 9.429292678833008, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 267.25, + "completions/mean_terminated_length": 210.7692413330078, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.188, + "grad_norm": 2.7444419860839844, + "kl": 0.0295867919921875, + "learning_rate": 1e-06, + "loss": 0.1159, + "num_tokens": 3397262.0, + "reward": -10.483909606933594, + "reward_std": 4.836560249328613, + "rewards/rm_reward_func/mean": -10.483909606933594, + "rewards/rm_reward_func/std": 6.31536340713501, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 188.53125, + "completions/mean_terminated_length": 178.09677124023438, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.1888, + "grad_norm": 2.5071353912353516, + "kl": 0.05291748046875, + "learning_rate": 1e-06, + "loss": -0.1413, + "num_tokens": 3410503.0, + "reward": -8.572799682617188, + "reward_std": 4.778408050537109, + "rewards/rm_reward_func/mean": -8.572799682617188, + "rewards/rm_reward_func/std": 10.283060073852539, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 390.5625, + "completions/mean_terminated_length": 307.47369384765625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.1896, + "grad_norm": 2.5968477725982666, + "kl": 0.0594329833984375, + "learning_rate": 1e-06, + "loss": 0.0351, + "num_tokens": 3426585.0, + "reward": -13.81103515625, + "reward_std": 3.1418185234069824, + "rewards/rm_reward_func/mean": -13.81103515625, + "rewards/rm_reward_func/std": 6.806953430175781, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 230.0625, + "completions/mean_terminated_length": 230.0625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.1904, + "grad_norm": 2.2781054973602295, + "kl": 0.0489501953125, + "learning_rate": 1e-06, + "loss": 0.0887, + "num_tokens": 3439547.0, + "reward": -0.3016376495361328, + "reward_std": 5.685807228088379, + "rewards/rm_reward_func/mean": -0.3016376495361328, + "rewards/rm_reward_func/std": 13.500185012817383, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 411.875, + "completions/mean_terminated_length": 323.5294189453125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.1912, + "grad_norm": 1.8714083433151245, + "kl": 0.0715484619140625, + "learning_rate": 1e-06, + "loss": 0.0688, + "num_tokens": 3456111.0, + "reward": -4.5411376953125, + "reward_std": 5.111152648925781, + "rewards/rm_reward_func/mean": -4.5411376953125, + "rewards/rm_reward_func/std": 11.221564292907715, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 460.46875, + "completions/mean_terminated_length": 305.875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.192, + "grad_norm": 1.794485330581665, + "kl": 0.0208587646484375, + "learning_rate": 1e-06, + "loss": -0.0451, + "num_tokens": 3474950.0, + "reward": -13.3818359375, + "reward_std": 3.603726387023926, + "rewards/rm_reward_func/mean": -13.3818359375, + "rewards/rm_reward_func/std": 5.3652544021606445, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 283.78125, + "completions/mean_terminated_length": 219.87998962402344, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.1928, + "grad_norm": 2.5906386375427246, + "kl": 0.040985107421875, + "learning_rate": 1e-06, + "loss": 0.0191, + "num_tokens": 3491007.0, + "reward": -3.46533203125, + "reward_std": 5.687483787536621, + "rewards/rm_reward_func/mean": -3.46533203125, + "rewards/rm_reward_func/std": 16.856298446655273, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 230.5625, + "completions/mean_terminated_length": 136.75, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.1936, + "grad_norm": 3.2524259090423584, + "kl": 0.0953216552734375, + "learning_rate": 1e-06, + "loss": 0.0633, + "num_tokens": 3505113.0, + "reward": -5.0096893310546875, + "reward_std": 6.2869133949279785, + "rewards/rm_reward_func/mean": -5.0096893310546875, + "rewards/rm_reward_func/std": 14.090009689331055, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 335.84375, + "completions/mean_terminated_length": 136.20001220703125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.1944, + "grad_norm": 2.56569766998291, + "kl": 0.0410614013671875, + "learning_rate": 1e-06, + "loss": 0.1776, + "num_tokens": 3519148.0, + "reward": -9.60137939453125, + "reward_std": 4.221039772033691, + "rewards/rm_reward_func/mean": -9.60137939453125, + "rewards/rm_reward_func/std": 12.294721603393555, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 364.90625, + "completions/mean_terminated_length": 315.875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.1952, + "grad_norm": 2.0789108276367188, + "kl": 0.027130126953125, + "learning_rate": 1e-06, + "loss": -0.0602, + "num_tokens": 3536865.0, + "reward": -8.1817626953125, + "reward_std": 6.716730117797852, + "rewards/rm_reward_func/mean": -8.1817626953125, + "rewards/rm_reward_func/std": 8.459527969360352, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 338.4375, + "completions/mean_terminated_length": 259.54547119140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.196, + "grad_norm": 1.9831430912017822, + "kl": 0.023406982421875, + "learning_rate": 1e-06, + "loss": -0.0289, + "num_tokens": 3554175.0, + "reward": -5.8529052734375, + "reward_std": 4.307180404663086, + "rewards/rm_reward_func/mean": -5.8529052734375, + "rewards/rm_reward_func/std": 15.775003433227539, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 342.9375, + "completions/mean_terminated_length": 254.38095092773438, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.1968, + "grad_norm": 1.9376928806304932, + "kl": 0.0331573486328125, + "learning_rate": 1e-06, + "loss": -0.0539, + "num_tokens": 3572589.0, + "reward": -3.322021484375, + "reward_std": 5.675527572631836, + "rewards/rm_reward_func/mean": -3.322021484375, + "rewards/rm_reward_func/std": 12.340668678283691, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 352.84375, + "completions/mean_terminated_length": 269.4761962890625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.1976, + "grad_norm": 1.5846179723739624, + "kl": 0.02777099609375, + "learning_rate": 1e-06, + "loss": 0.1087, + "num_tokens": 3589008.0, + "reward": 1.33056640625, + "reward_std": 4.936090469360352, + "rewards/rm_reward_func/mean": 1.33056640625, + "rewards/rm_reward_func/std": 14.16551685333252, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 326.03125, + "completions/mean_terminated_length": 241.5, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.1984, + "grad_norm": 1.9523719549179077, + "kl": 0.0448760986328125, + "learning_rate": 1e-06, + "loss": -0.0978, + "num_tokens": 3606897.0, + "reward": 0.3890380859375, + "reward_std": 4.791427135467529, + "rewards/rm_reward_func/mean": 0.3890380859375, + "rewards/rm_reward_func/std": 10.650434494018555, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 288.375, + "completions/mean_terminated_length": 200.86956787109375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.1992, + "grad_norm": 1.8975037336349487, + "kl": 0.05194091796875, + "learning_rate": 1e-06, + "loss": -0.015, + "num_tokens": 3624381.0, + "reward": -10.30517578125, + "reward_std": 4.711612701416016, + "rewards/rm_reward_func/mean": -10.30517578125, + "rewards/rm_reward_func/std": 10.91821002960205, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 339.9375, + "completions/mean_terminated_length": 236.6999969482422, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.2, + "grad_norm": 2.049720525741577, + "kl": 0.02801513671875, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 3642531.0, + "reward": -9.1484375, + "reward_std": 6.937699794769287, + "rewards/rm_reward_func/mean": -9.1484375, + "rewards/rm_reward_func/std": 8.857656478881836, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 381.5, + "completions/mean_terminated_length": 251.0, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.2008, + "grad_norm": 1.7274067401885986, + "kl": 0.0271759033203125, + "learning_rate": 1e-06, + "loss": -0.0136, + "num_tokens": 3658259.0, + "reward": -5.727935791015625, + "reward_std": 5.033329010009766, + "rewards/rm_reward_func/mean": -5.727935791015625, + "rewards/rm_reward_func/std": 9.802799224853516, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 280.9375, + "completions/mean_terminated_length": 216.239990234375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.2016, + "grad_norm": 2.5147905349731445, + "kl": 0.034881591796875, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 3670417.0, + "reward": -13.61163330078125, + "reward_std": 5.531689643859863, + "rewards/rm_reward_func/mean": -13.61163330078125, + "rewards/rm_reward_func/std": 8.323686599731445, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 338.8125, + "completions/mean_terminated_length": 220.3157958984375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.2024, + "grad_norm": 2.53934907913208, + "kl": 0.0482330322265625, + "learning_rate": 1e-06, + "loss": 0.0738, + "num_tokens": 3689347.0, + "reward": -5.8792724609375, + "reward_std": 6.232890605926514, + "rewards/rm_reward_func/mean": -5.8792724609375, + "rewards/rm_reward_func/std": 10.38478946685791, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 308.28125, + "completions/mean_terminated_length": 240.375, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.2032, + "grad_norm": 2.3067352771759033, + "kl": 0.0487823486328125, + "learning_rate": 1e-06, + "loss": 0.0398, + "num_tokens": 3704556.0, + "reward": -5.58111572265625, + "reward_std": 6.81657600402832, + "rewards/rm_reward_func/mean": -5.58111572265625, + "rewards/rm_reward_func/std": 9.89129638671875, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 409.3125, + "completions/mean_terminated_length": 329.4444580078125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.204, + "grad_norm": 1.8927600383758545, + "kl": 0.020660400390625, + "learning_rate": 1e-06, + "loss": -0.0231, + "num_tokens": 3719918.0, + "reward": -7.13653564453125, + "reward_std": 5.616809368133545, + "rewards/rm_reward_func/mean": -7.13653564453125, + "rewards/rm_reward_func/std": 7.415529251098633, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 315.71875, + "completions/mean_terminated_length": 302.63336181640625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.2048, + "grad_norm": 2.0776658058166504, + "kl": 0.0266265869140625, + "learning_rate": 1e-06, + "loss": -0.1051, + "num_tokens": 3732605.0, + "reward": -8.834228515625, + "reward_std": 5.082633972167969, + "rewards/rm_reward_func/mean": -8.834228515625, + "rewards/rm_reward_func/std": 6.943499565124512, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 442.6875, + "completions/mean_terminated_length": 373.375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.2056, + "grad_norm": 1.6974042654037476, + "kl": 0.0186309814453125, + "learning_rate": 1e-06, + "loss": -0.0301, + "num_tokens": 3749083.0, + "reward": -3.372802734375, + "reward_std": 5.8216400146484375, + "rewards/rm_reward_func/mean": -3.372802734375, + "rewards/rm_reward_func/std": 11.382164001464844, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 290.75, + "completions/mean_terminated_length": 228.79998779296875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.2064, + "grad_norm": 2.4395079612731934, + "kl": 0.0274658203125, + "learning_rate": 1e-06, + "loss": -0.0288, + "num_tokens": 3761507.0, + "reward": -15.692626953125, + "reward_std": 4.800331115722656, + "rewards/rm_reward_func/mean": -15.692626953125, + "rewards/rm_reward_func/std": 6.407756328582764, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 235.90625, + "completions/mean_terminated_length": 143.875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.2072, + "grad_norm": 2.621724843978882, + "kl": 0.043609619140625, + "learning_rate": 1e-06, + "loss": -0.0366, + "num_tokens": 3776160.0, + "reward": -6.2373046875, + "reward_std": 4.902165412902832, + "rewards/rm_reward_func/mean": -6.2373046875, + "rewards/rm_reward_func/std": 9.572051048278809, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 332.71875, + "completions/mean_terminated_length": 225.15000915527344, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.208, + "grad_norm": 3.0389907360076904, + "kl": 0.087677001953125, + "learning_rate": 1e-06, + "loss": 0.2951, + "num_tokens": 3791295.0, + "reward": -2.5626220703125, + "reward_std": 6.700636863708496, + "rewards/rm_reward_func/mean": -2.5626220703125, + "rewards/rm_reward_func/std": 15.121737480163574, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 444.0625, + "completions/mean_terminated_length": 376.125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.2088, + "grad_norm": 1.5149556398391724, + "kl": 0.018341064453125, + "learning_rate": 1e-06, + "loss": 0.0463, + "num_tokens": 3807761.0, + "reward": -12.09765625, + "reward_std": 7.1959099769592285, + "rewards/rm_reward_func/mean": -12.09765625, + "rewards/rm_reward_func/std": 12.532217979431152, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 317.09375, + "completions/mean_terminated_length": 262.5199890136719, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.2096, + "grad_norm": 1.9040241241455078, + "kl": 0.0269927978515625, + "learning_rate": 1e-06, + "loss": -0.0267, + "num_tokens": 3821012.0, + "reward": -8.36376953125, + "reward_std": 5.254627227783203, + "rewards/rm_reward_func/mean": -8.36376953125, + "rewards/rm_reward_func/std": 11.884527206420898, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 343.25, + "completions/mean_terminated_length": 212.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.2104, + "grad_norm": 1.7037732601165771, + "kl": 0.036346435546875, + "learning_rate": 1e-06, + "loss": 0.1254, + "num_tokens": 3838236.0, + "reward": 2.8682861328125, + "reward_std": 7.202817916870117, + "rewards/rm_reward_func/mean": 2.8682861328125, + "rewards/rm_reward_func/std": 12.06040096282959, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 273.46875, + "completions/mean_terminated_length": 239.3928680419922, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.2112, + "grad_norm": 2.5100741386413574, + "kl": 0.0361175537109375, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 3850723.0, + "reward": -5.05352783203125, + "reward_std": 7.889523983001709, + "rewards/rm_reward_func/mean": -5.05352783203125, + "rewards/rm_reward_func/std": 9.206114768981934, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 383.1875, + "completions/mean_terminated_length": 324.6363830566406, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.212, + "grad_norm": 2.1479246616363525, + "kl": 0.03021240234375, + "learning_rate": 1e-06, + "loss": 0.124, + "num_tokens": 3867449.0, + "reward": -8.210693359375, + "reward_std": 5.56413459777832, + "rewards/rm_reward_func/mean": -8.210693359375, + "rewards/rm_reward_func/std": 12.148035049438477, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 397.65625, + "completions/mean_terminated_length": 337.76190185546875, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.2128, + "grad_norm": 1.5911149978637695, + "kl": 0.025665283203125, + "learning_rate": 1e-06, + "loss": -0.0311, + "num_tokens": 3886934.0, + "reward": -7.0245361328125, + "reward_std": 4.548398494720459, + "rewards/rm_reward_func/mean": -7.0245361328125, + "rewards/rm_reward_func/std": 11.464239120483398, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 333.71875, + "completions/mean_terminated_length": 263.9565124511719, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.2136, + "grad_norm": 1.9456273317337036, + "kl": 0.04315185546875, + "learning_rate": 1e-06, + "loss": -0.0492, + "num_tokens": 3903413.0, + "reward": -3.105712890625, + "reward_std": 5.184563636779785, + "rewards/rm_reward_func/mean": -3.105712890625, + "rewards/rm_reward_func/std": 12.23321533203125, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 279.96875, + "completions/mean_terminated_length": 189.17391967773438, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.2144, + "grad_norm": 2.681539535522461, + "kl": 0.0362091064453125, + "learning_rate": 1e-06, + "loss": 0.1062, + "num_tokens": 3916380.0, + "reward": -7.007049560546875, + "reward_std": 5.494674205780029, + "rewards/rm_reward_func/mean": -7.007049560546875, + "rewards/rm_reward_func/std": 9.55606746673584, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 345.40625, + "completions/mean_terminated_length": 306.9615478515625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.2152, + "grad_norm": 1.6299346685409546, + "kl": 0.0472869873046875, + "learning_rate": 1e-06, + "loss": -0.0677, + "num_tokens": 3936073.0, + "reward": -1.64794921875, + "reward_std": 9.052248001098633, + "rewards/rm_reward_func/mean": -1.64794921875, + "rewards/rm_reward_func/std": 15.8290376663208, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 391.375, + "completions/mean_terminated_length": 270.75, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.216, + "grad_norm": 3.4785752296447754, + "kl": 0.03466796875, + "learning_rate": 1e-06, + "loss": 0.3205, + "num_tokens": 3955997.0, + "reward": -3.32977294921875, + "reward_std": 6.84357213973999, + "rewards/rm_reward_func/mean": -3.32977294921875, + "rewards/rm_reward_func/std": 8.706695556640625, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 297.1875, + "completions/mean_terminated_length": 247.61538696289062, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.2168, + "grad_norm": 1.9957735538482666, + "kl": 0.045166015625, + "learning_rate": 1e-06, + "loss": -0.0316, + "num_tokens": 3973427.0, + "reward": -2.9058837890625, + "reward_std": 5.755159378051758, + "rewards/rm_reward_func/mean": -2.9058837890625, + "rewards/rm_reward_func/std": 14.275411605834961, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 378.875, + "completions/mean_terminated_length": 287.78948974609375, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.2176, + "grad_norm": 1.6302871704101562, + "kl": 0.0281524658203125, + "learning_rate": 1e-06, + "loss": -0.0668, + "num_tokens": 3990847.0, + "reward": -0.719207763671875, + "reward_std": 3.5893776416778564, + "rewards/rm_reward_func/mean": -0.719207763671875, + "rewards/rm_reward_func/std": 12.773297309875488, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 419.46875, + "completions/mean_terminated_length": 347.5, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.2184, + "grad_norm": 1.5056275129318237, + "kl": 0.011810302734375, + "learning_rate": 1e-06, + "loss": -0.1226, + "num_tokens": 4006462.0, + "reward": -12.791015625, + "reward_std": 5.466350555419922, + "rewards/rm_reward_func/mean": -12.791015625, + "rewards/rm_reward_func/std": 12.433148384094238, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 334.9375, + "completions/mean_terminated_length": 265.6521911621094, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.2192, + "grad_norm": 2.69622802734375, + "kl": 0.02227783203125, + "learning_rate": 1e-06, + "loss": -0.0272, + "num_tokens": 4020428.0, + "reward": -4.096305847167969, + "reward_std": 6.064450263977051, + "rewards/rm_reward_func/mean": -4.096305847167969, + "rewards/rm_reward_func/std": 7.449769020080566, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 415.5625, + "completions/mean_terminated_length": 330.4705810546875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.22, + "grad_norm": 1.9108415842056274, + "kl": 0.02203369140625, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 4039358.0, + "reward": -17.158203125, + "reward_std": 3.0149383544921875, + "rewards/rm_reward_func/mean": -17.158203125, + "rewards/rm_reward_func/std": 6.455570220947266, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 188.84375, + "completions/mean_terminated_length": 98.36000061035156, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2208, + "grad_norm": 5.601883888244629, + "kl": 0.06268310546875, + "learning_rate": 1e-06, + "loss": -0.104, + "num_tokens": 4052561.0, + "reward": -5.098052978515625, + "reward_std": 3.7989611625671387, + "rewards/rm_reward_func/mean": -5.098052978515625, + "rewards/rm_reward_func/std": 7.928662300109863, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 299.15625, + "completions/mean_terminated_length": 187.6666717529297, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.2216, + "grad_norm": 2.2623674869537354, + "kl": 0.02234649658203125, + "learning_rate": 1e-06, + "loss": -0.1164, + "num_tokens": 4065254.0, + "reward": -13.227783203125, + "reward_std": 4.641175746917725, + "rewards/rm_reward_func/mean": -13.227783203125, + "rewards/rm_reward_func/std": 5.642585277557373, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 395.4375, + "completions/mean_terminated_length": 325.5, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.2224, + "grad_norm": 1.7105231285095215, + "kl": 0.0183563232421875, + "learning_rate": 1e-06, + "loss": -0.0711, + "num_tokens": 4080020.0, + "reward": -15.16748046875, + "reward_std": 4.37330436706543, + "rewards/rm_reward_func/mean": -15.16748046875, + "rewards/rm_reward_func/std": 7.067616939544678, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 399.59375, + "completions/mean_terminated_length": 312.1666564941406, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.2232, + "grad_norm": 1.892221212387085, + "kl": 0.024383544921875, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 4098079.0, + "reward": -15.8828125, + "reward_std": 3.171330213546753, + "rewards/rm_reward_func/mean": -15.8828125, + "rewards/rm_reward_func/std": 5.500910758972168, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 359.625, + "completions/mean_terminated_length": 290.3636474609375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.224, + "grad_norm": 1.8023781776428223, + "kl": 0.0357208251953125, + "learning_rate": 1e-06, + "loss": 0.087, + "num_tokens": 4114099.0, + "reward": -5.289306640625, + "reward_std": 5.588537693023682, + "rewards/rm_reward_func/mean": -5.289306640625, + "rewards/rm_reward_func/std": 9.471607208251953, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 433.625, + "completions/mean_terminated_length": 372.6666564941406, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.2248, + "grad_norm": 1.2361620664596558, + "kl": 0.0216827392578125, + "learning_rate": 1e-06, + "loss": 0.0598, + "num_tokens": 4134175.0, + "reward": -4.2342987060546875, + "reward_std": 6.207003593444824, + "rewards/rm_reward_func/mean": -4.2342987060546875, + "rewards/rm_reward_func/std": 8.529419898986816, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 348.03125, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.2256, + "grad_norm": 2.294182777404785, + "kl": 0.046630859375, + "learning_rate": 1e-06, + "loss": 0.1388, + "num_tokens": 4155616.0, + "reward": -10.0469970703125, + "reward_std": 3.794982433319092, + "rewards/rm_reward_func/mean": -10.0469970703125, + "rewards/rm_reward_func/std": 12.319479942321777, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 253.75, + "completions/mean_terminated_length": 236.53334045410156, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.2264, + "grad_norm": 2.3744797706604004, + "kl": 0.0567169189453125, + "learning_rate": 1e-06, + "loss": -0.1747, + "num_tokens": 4169240.0, + "reward": -9.008575439453125, + "reward_std": 6.606152057647705, + "rewards/rm_reward_func/mean": -9.008575439453125, + "rewards/rm_reward_func/std": 8.55732536315918, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 406.0, + "completions/mean_terminated_length": 285.8666687011719, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.2272, + "grad_norm": 1.5771548748016357, + "kl": 0.02734375, + "learning_rate": 1e-06, + "loss": 0.0267, + "num_tokens": 4188248.0, + "reward": -7.66046142578125, + "reward_std": 4.248453140258789, + "rewards/rm_reward_func/mean": -7.66046142578125, + "rewards/rm_reward_func/std": 13.414899826049805, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 285.125, + "completions/mean_terminated_length": 182.0, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.228, + "grad_norm": 2.4854447841644287, + "kl": 0.079833984375, + "learning_rate": 1e-06, + "loss": -0.063, + "num_tokens": 4205276.0, + "reward": 3.4002685546875, + "reward_std": 6.687047004699707, + "rewards/rm_reward_func/mean": 3.4002685546875, + "rewards/rm_reward_func/std": 9.661017417907715, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 340.875, + "completions/mean_terminated_length": 223.7894744873047, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.2288, + "grad_norm": 1.8065805435180664, + "kl": 0.0284423828125, + "learning_rate": 1e-06, + "loss": 0.2362, + "num_tokens": 4221600.0, + "reward": 1.372802734375, + "reward_std": 6.996541500091553, + "rewards/rm_reward_func/mean": 1.372802734375, + "rewards/rm_reward_func/std": 12.117879867553711, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 272.65625, + "completions/mean_terminated_length": 147.2857208251953, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.2296, + "grad_norm": 7.6442131996154785, + "kl": 0.06048583984375, + "learning_rate": 1e-06, + "loss": -0.0536, + "num_tokens": 4233365.0, + "reward": -9.55126953125, + "reward_std": 3.7911038398742676, + "rewards/rm_reward_func/mean": -9.55126953125, + "rewards/rm_reward_func/std": 5.26282262802124, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 290.1875, + "completions/mean_terminated_length": 275.4000244140625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.2304, + "grad_norm": 1.9716819524765015, + "kl": 0.0238189697265625, + "learning_rate": 1e-06, + "loss": -0.0509, + "num_tokens": 4247795.0, + "reward": -4.84552001953125, + "reward_std": 6.16115140914917, + "rewards/rm_reward_func/mean": -4.84552001953125, + "rewards/rm_reward_func/std": 10.725358963012695, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 327.34375, + "completions/mean_terminated_length": 265.79168701171875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.2312, + "grad_norm": 2.1234254837036133, + "kl": 0.0257720947265625, + "learning_rate": 1e-06, + "loss": -0.1404, + "num_tokens": 4260686.0, + "reward": -11.875, + "reward_std": 3.504284381866455, + "rewards/rm_reward_func/mean": -11.875, + "rewards/rm_reward_func/std": 4.181146144866943, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 300.5, + "completions/mean_terminated_length": 286.4000244140625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.232, + "grad_norm": 2.0691609382629395, + "kl": 0.0880126953125, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 4281198.0, + "reward": -0.46728515625, + "reward_std": 6.135943412780762, + "rewards/rm_reward_func/mean": -0.46728515625, + "rewards/rm_reward_func/std": 11.502102851867676, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 367.0625, + "completions/mean_terminated_length": 267.8947448730469, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.2328, + "grad_norm": 1.7965786457061768, + "kl": 0.0315399169921875, + "learning_rate": 1e-06, + "loss": -0.1021, + "num_tokens": 4297688.0, + "reward": -2.347442626953125, + "reward_std": 5.834395408630371, + "rewards/rm_reward_func/mean": -2.347442626953125, + "rewards/rm_reward_func/std": 12.373488426208496, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 313.0, + "completions/mean_terminated_length": 284.5714416503906, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.2336, + "grad_norm": 1.658935546875, + "kl": 0.0229034423828125, + "learning_rate": 1e-06, + "loss": -0.075, + "num_tokens": 4311752.0, + "reward": -9.09576416015625, + "reward_std": 6.807614803314209, + "rewards/rm_reward_func/mean": -9.09576416015625, + "rewards/rm_reward_func/std": 10.425131797790527, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 420.5, + "completions/mean_terminated_length": 329.0, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.2344, + "grad_norm": 1.7930186986923218, + "kl": 0.0128173828125, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 4327392.0, + "reward": -8.46075439453125, + "reward_std": 4.663958549499512, + "rewards/rm_reward_func/mean": -8.46075439453125, + "rewards/rm_reward_func/std": 8.817876815795898, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 221.15625, + "completions/mean_terminated_length": 221.15625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2352, + "grad_norm": 2.0271503925323486, + "kl": 0.08636474609375, + "learning_rate": 1e-06, + "loss": -0.0616, + "num_tokens": 4342949.0, + "reward": 13.44140625, + "reward_std": 6.08550500869751, + "rewards/rm_reward_func/mean": 13.44140625, + "rewards/rm_reward_func/std": 10.562538146972656, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 468.0, + "completions/mean_terminated_length": 371.20001220703125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.236, + "grad_norm": 1.4713637828826904, + "kl": 0.015411376953125, + "learning_rate": 1e-06, + "loss": 0.0529, + "num_tokens": 4363325.0, + "reward": -2.3758544921875, + "reward_std": 6.289039134979248, + "rewards/rm_reward_func/mean": -2.3758544921875, + "rewards/rm_reward_func/std": 13.755448341369629, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 450.625, + "completions/mean_terminated_length": 333.4545593261719, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.2368, + "grad_norm": 1.6358439922332764, + "kl": 0.0178070068359375, + "learning_rate": 1e-06, + "loss": 0.0577, + "num_tokens": 4381129.0, + "reward": -11.5068359375, + "reward_std": 5.03669548034668, + "rewards/rm_reward_func/mean": -11.5068359375, + "rewards/rm_reward_func/std": 7.384062767028809, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 407.90625, + "completions/mean_terminated_length": 289.933349609375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2376, + "grad_norm": 1.7263087034225464, + "kl": 0.014862060546875, + "learning_rate": 1e-06, + "loss": -0.1594, + "num_tokens": 4396014.0, + "reward": -11.924072265625, + "reward_std": 5.428852558135986, + "rewards/rm_reward_func/mean": -11.924072265625, + "rewards/rm_reward_func/std": 6.060250759124756, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 287.4375, + "completions/mean_terminated_length": 287.4375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.2384, + "grad_norm": 1.992435336112976, + "kl": 0.03497314453125, + "learning_rate": 1e-06, + "loss": -0.0988, + "num_tokens": 4411468.0, + "reward": -1.6038665771484375, + "reward_std": 6.636127471923828, + "rewards/rm_reward_func/mean": -1.6038665771484375, + "rewards/rm_reward_func/std": 17.53786277770996, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 261.15625, + "completions/mean_terminated_length": 253.06451416015625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.2392, + "grad_norm": 2.253258466720581, + "kl": 0.0723724365234375, + "learning_rate": 1e-06, + "loss": 0.0828, + "num_tokens": 4429025.0, + "reward": 4.983894348144531, + "reward_std": 6.524765968322754, + "rewards/rm_reward_func/mean": 4.983894348144531, + "rewards/rm_reward_func/std": 9.147462844848633, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 401.21875, + "completions/mean_terminated_length": 357.86956787109375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.24, + "grad_norm": 1.6379445791244507, + "kl": 0.0186004638671875, + "learning_rate": 1e-06, + "loss": -0.1623, + "num_tokens": 4447152.0, + "reward": -12.0419921875, + "reward_std": 5.342292308807373, + "rewards/rm_reward_func/mean": -12.0419921875, + "rewards/rm_reward_func/std": 8.954245567321777, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 400.71875, + "completions/mean_terminated_length": 302.5294189453125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.2408, + "grad_norm": 1.8253401517868042, + "kl": 0.017791748046875, + "learning_rate": 1e-06, + "loss": 0.1733, + "num_tokens": 4463711.0, + "reward": -11.329345703125, + "reward_std": 3.783949851989746, + "rewards/rm_reward_func/mean": -11.329345703125, + "rewards/rm_reward_func/std": 5.177454471588135, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 347.59375, + "completions/mean_terminated_length": 272.8636474609375, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.2416, + "grad_norm": 1.710831880569458, + "kl": 0.0245361328125, + "learning_rate": 1e-06, + "loss": 0.0238, + "num_tokens": 4481586.0, + "reward": -8.01708984375, + "reward_std": 4.78322172164917, + "rewards/rm_reward_func/mean": -8.01708984375, + "rewards/rm_reward_func/std": 12.38097858428955, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 334.5625, + "completions/mean_terminated_length": 157.125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.2424, + "grad_norm": 2.6372230052948, + "kl": 0.0394134521484375, + "learning_rate": 1e-06, + "loss": 0.1496, + "num_tokens": 4497308.0, + "reward": -11.478271484375, + "reward_std": 6.022144317626953, + "rewards/rm_reward_func/mean": -11.478271484375, + "rewards/rm_reward_func/std": 13.830948829650879, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 300.8125, + "completions/mean_terminated_length": 261.7037048339844, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.2432, + "grad_norm": 1.9959298372268677, + "kl": 0.06793212890625, + "learning_rate": 1e-06, + "loss": -0.0407, + "num_tokens": 4516718.0, + "reward": -1.5045166015625, + "reward_std": 5.3790717124938965, + "rewards/rm_reward_func/mean": -1.5045166015625, + "rewards/rm_reward_func/std": 10.15602970123291, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 399.03125, + "completions/mean_terminated_length": 331.25, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.244, + "grad_norm": 1.810784101486206, + "kl": 0.032501220703125, + "learning_rate": 1e-06, + "loss": 0.0418, + "num_tokens": 4535287.0, + "reward": -4.7049560546875, + "reward_std": 5.622498035430908, + "rewards/rm_reward_func/mean": -4.7049560546875, + "rewards/rm_reward_func/std": 13.821171760559082, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 303.125, + "completions/mean_terminated_length": 273.2857360839844, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.2448, + "grad_norm": 1.9075818061828613, + "kl": 0.0207061767578125, + "learning_rate": 1e-06, + "loss": -0.1245, + "num_tokens": 4548051.0, + "reward": -10.3795166015625, + "reward_std": 6.449045658111572, + "rewards/rm_reward_func/mean": -10.3795166015625, + "rewards/rm_reward_func/std": 7.897385597229004, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 241.71875, + "completions/mean_terminated_length": 166.0399932861328, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.2456, + "grad_norm": 2.4077939987182617, + "kl": 0.052154541015625, + "learning_rate": 1e-06, + "loss": 0.3574, + "num_tokens": 4561546.0, + "reward": -2.5845947265625, + "reward_std": 7.590365409851074, + "rewards/rm_reward_func/mean": -2.5845947265625, + "rewards/rm_reward_func/std": 14.81585693359375, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 333.5625, + "completions/mean_terminated_length": 263.7391357421875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2464, + "grad_norm": 1.9114305973052979, + "kl": 0.035064697265625, + "learning_rate": 1e-06, + "loss": 0.04, + "num_tokens": 4576964.0, + "reward": -9.07763671875, + "reward_std": 7.5965046882629395, + "rewards/rm_reward_func/mean": -9.07763671875, + "rewards/rm_reward_func/std": 13.071847915649414, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 368.6875, + "completions/mean_terminated_length": 293.6190490722656, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.2472, + "grad_norm": 1.6189061403274536, + "kl": 0.0187225341796875, + "learning_rate": 1e-06, + "loss": -0.039, + "num_tokens": 4594018.0, + "reward": -11.057861328125, + "reward_std": 4.950892448425293, + "rewards/rm_reward_func/mean": -11.057861328125, + "rewards/rm_reward_func/std": 5.277069091796875, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 336.875, + "completions/mean_terminated_length": 331.2257995605469, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.248, + "grad_norm": 1.7306835651397705, + "kl": 0.02288818359375, + "learning_rate": 1e-06, + "loss": -0.1504, + "num_tokens": 4608766.0, + "reward": -2.810791015625, + "reward_std": 8.265456199645996, + "rewards/rm_reward_func/mean": -2.810791015625, + "rewards/rm_reward_func/std": 11.374640464782715, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 347.5625, + "completions/mean_terminated_length": 261.4285888671875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.2488, + "grad_norm": 2.0870978832244873, + "kl": 0.031982421875, + "learning_rate": 1e-06, + "loss": 0.1205, + "num_tokens": 4623152.0, + "reward": -11.062347412109375, + "reward_std": 4.841803550720215, + "rewards/rm_reward_func/mean": -11.062347412109375, + "rewards/rm_reward_func/std": 8.273815155029297, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 291.15625, + "completions/mean_terminated_length": 291.15625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.2496, + "grad_norm": 1.9850847721099854, + "kl": 0.0645751953125, + "learning_rate": 1e-06, + "loss": -0.0201, + "num_tokens": 4639357.0, + "reward": 1.6715087890625, + "reward_std": 7.272180557250977, + "rewards/rm_reward_func/mean": 1.6715087890625, + "rewards/rm_reward_func/std": 10.000642776489258, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 412.75, + "completions/mean_terminated_length": 344.84210205078125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.2504, + "grad_norm": 1.7677154541015625, + "kl": 0.021148681640625, + "learning_rate": 1e-06, + "loss": -0.0264, + "num_tokens": 4657661.0, + "reward": -8.342529296875, + "reward_std": 5.402894496917725, + "rewards/rm_reward_func/mean": -8.342529296875, + "rewards/rm_reward_func/std": 8.33918285369873, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 220.4375, + "completions/mean_terminated_length": 211.03225708007812, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2512, + "grad_norm": 2.3048784732818604, + "kl": 0.039642333984375, + "learning_rate": 1e-06, + "loss": 0.1729, + "num_tokens": 4667747.0, + "reward": -13.045974731445312, + "reward_std": 5.921314239501953, + "rewards/rm_reward_func/mean": -13.045974731445312, + "rewards/rm_reward_func/std": 6.762684345245361, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 368.125, + "completions/mean_terminated_length": 302.727294921875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.252, + "grad_norm": 1.733473539352417, + "kl": 0.031646728515625, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 4685351.0, + "reward": 1.50848388671875, + "reward_std": 5.111229419708252, + "rewards/rm_reward_func/mean": 1.50848388671875, + "rewards/rm_reward_func/std": 9.598787307739258, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 331.40625, + "completions/mean_terminated_length": 271.2083435058594, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2528, + "grad_norm": 2.494640588760376, + "kl": 0.04351806640625, + "learning_rate": 1e-06, + "loss": -0.1987, + "num_tokens": 4702764.0, + "reward": -13.033536911010742, + "reward_std": 7.699631690979004, + "rewards/rm_reward_func/mean": -13.033536911010742, + "rewards/rm_reward_func/std": 11.280793190002441, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 392.1875, + "completions/mean_terminated_length": 299.0, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.2536, + "grad_norm": 1.9197250604629517, + "kl": 0.02630615234375, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 4718386.0, + "reward": -10.430419921875, + "reward_std": 4.848438739776611, + "rewards/rm_reward_func/mean": -10.430419921875, + "rewards/rm_reward_func/std": 11.871861457824707, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 357.0625, + "completions/mean_terminated_length": 286.6363830566406, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.2544, + "grad_norm": 1.9128575325012207, + "kl": 0.025848388671875, + "learning_rate": 1e-06, + "loss": 0.1769, + "num_tokens": 4731604.0, + "reward": -9.52398681640625, + "reward_std": 5.165442943572998, + "rewards/rm_reward_func/mean": -9.52398681640625, + "rewards/rm_reward_func/std": 9.442614555358887, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 349.03125, + "completions/mean_terminated_length": 294.7083435058594, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.2552, + "grad_norm": 1.7569587230682373, + "kl": 0.03021240234375, + "learning_rate": 1e-06, + "loss": -0.0501, + "num_tokens": 4747461.0, + "reward": -0.45166015625, + "reward_std": 6.613267421722412, + "rewards/rm_reward_func/mean": -0.45166015625, + "rewards/rm_reward_func/std": 17.83439064025879, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 351.34375, + "completions/mean_terminated_length": 288.478271484375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.256, + "grad_norm": 1.7636747360229492, + "kl": 0.0303497314453125, + "learning_rate": 1e-06, + "loss": -0.0623, + "num_tokens": 4761880.0, + "reward": -11.116455078125, + "reward_std": 5.632594108581543, + "rewards/rm_reward_func/mean": -11.116455078125, + "rewards/rm_reward_func/std": 8.949335098266602, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 351.53125, + "completions/mean_terminated_length": 255.25, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.2568, + "grad_norm": 1.9710261821746826, + "kl": 0.028839111328125, + "learning_rate": 1e-06, + "loss": 0.0336, + "num_tokens": 4776817.0, + "reward": -12.9344482421875, + "reward_std": 4.549389839172363, + "rewards/rm_reward_func/mean": -12.9344482421875, + "rewards/rm_reward_func/std": 7.669496536254883, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 319.15625, + "completions/mean_terminated_length": 243.69566345214844, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.2576, + "grad_norm": 1.8862268924713135, + "kl": 0.0443878173828125, + "learning_rate": 1e-06, + "loss": -0.0775, + "num_tokens": 4792326.0, + "reward": 0.488311767578125, + "reward_std": 3.7697737216949463, + "rewards/rm_reward_func/mean": 0.488311767578125, + "rewards/rm_reward_func/std": 13.558219909667969, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 375.8125, + "completions/mean_terminated_length": 304.4761962890625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.2584, + "grad_norm": 1.7897722721099854, + "kl": 0.0289764404296875, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 4811936.0, + "reward": -3.474853515625, + "reward_std": 7.083941459655762, + "rewards/rm_reward_func/mean": -3.474853515625, + "rewards/rm_reward_func/std": 13.624695777893066, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 222.875, + "completions/mean_terminated_length": 203.60000610351562, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.2592, + "grad_norm": 2.401153087615967, + "kl": 0.077850341796875, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 4825716.0, + "reward": 8.217742919921875, + "reward_std": 4.947168350219727, + "rewards/rm_reward_func/mean": 8.217742919921875, + "rewards/rm_reward_func/std": 16.79034996032715, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 361.75, + "completions/mean_terminated_length": 319.67999267578125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.26, + "grad_norm": 1.607993245124817, + "kl": 0.0171356201171875, + "learning_rate": 1e-06, + "loss": -0.0276, + "num_tokens": 4839860.0, + "reward": -13.188232421875, + "reward_std": 6.050688743591309, + "rewards/rm_reward_func/mean": -13.188232421875, + "rewards/rm_reward_func/std": 9.066386222839355, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 333.3125, + "completions/mean_terminated_length": 252.09091186523438, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.2608, + "grad_norm": 1.8725932836532593, + "kl": 0.043548583984375, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 4859878.0, + "reward": -2.271453857421875, + "reward_std": 5.920629501342773, + "rewards/rm_reward_func/mean": -2.271453857421875, + "rewards/rm_reward_func/std": 12.933178901672363, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 343.53125, + "completions/mean_terminated_length": 255.2857208251953, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.2616, + "grad_norm": 2.7248876094818115, + "kl": 0.0300140380859375, + "learning_rate": 1e-06, + "loss": -0.0188, + "num_tokens": 4873927.0, + "reward": -9.418548583984375, + "reward_std": 6.114006996154785, + "rewards/rm_reward_func/mean": -9.418548583984375, + "rewards/rm_reward_func/std": 7.356963634490967, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 178.65625, + "completions/mean_terminated_length": 144.1724090576172, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.2624, + "grad_norm": 2.708681583404541, + "kl": 0.06121826171875, + "learning_rate": 1e-06, + "loss": -0.0095, + "num_tokens": 4885604.0, + "reward": -7.2904052734375, + "reward_std": 8.234804153442383, + "rewards/rm_reward_func/mean": -7.2904052734375, + "rewards/rm_reward_func/std": 8.977691650390625, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 295.34375, + "completions/mean_terminated_length": 245.34616088867188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.2632, + "grad_norm": 2.5953567028045654, + "kl": 0.0465087890625, + "learning_rate": 1e-06, + "loss": -0.1946, + "num_tokens": 4900319.0, + "reward": 3.05426025390625, + "reward_std": 8.720661163330078, + "rewards/rm_reward_func/mean": 3.05426025390625, + "rewards/rm_reward_func/std": 16.918840408325195, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 322.3125, + "completions/mean_terminated_length": 174.7777862548828, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.264, + "grad_norm": 3.3830807209014893, + "kl": 0.03387451171875, + "learning_rate": 1e-06, + "loss": 0.2846, + "num_tokens": 4913377.0, + "reward": -11.0482177734375, + "reward_std": 5.540251731872559, + "rewards/rm_reward_func/mean": -11.0482177734375, + "rewards/rm_reward_func/std": 6.693453788757324, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 251.625, + "completions/mean_terminated_length": 243.22579956054688, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.2648, + "grad_norm": 2.1989102363586426, + "kl": 0.0380859375, + "learning_rate": 1e-06, + "loss": 0.0575, + "num_tokens": 4924901.0, + "reward": -7.898895263671875, + "reward_std": 5.975490570068359, + "rewards/rm_reward_func/mean": -7.898895263671875, + "rewards/rm_reward_func/std": 10.212793350219727, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 388.40625, + "completions/mean_terminated_length": 353.79998779296875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.2656, + "grad_norm": 1.8516799211502075, + "kl": 0.06949615478515625, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 4942034.0, + "reward": -3.15252685546875, + "reward_std": 6.84132194519043, + "rewards/rm_reward_func/mean": -3.15252685546875, + "rewards/rm_reward_func/std": 15.043277740478516, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 299.75, + "completions/mean_terminated_length": 277.7930908203125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.2664, + "grad_norm": 2.41034197807312, + "kl": 0.0587158203125, + "learning_rate": 1e-06, + "loss": -0.0787, + "num_tokens": 4956562.0, + "reward": 1.786224365234375, + "reward_std": 5.918888092041016, + "rewards/rm_reward_func/mean": 1.786224365234375, + "rewards/rm_reward_func/std": 17.39568328857422, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 391.375, + "completions/mean_terminated_length": 336.54547119140625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.2672, + "grad_norm": 1.5766760110855103, + "kl": 0.0235748291015625, + "learning_rate": 1e-06, + "loss": -0.0252, + "num_tokens": 4974246.0, + "reward": 2.0384979248046875, + "reward_std": 4.191490173339844, + "rewards/rm_reward_func/mean": 2.0384979248046875, + "rewards/rm_reward_func/std": 14.296323776245117, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 170.9375, + "completions/mean_terminated_length": 170.9375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.268, + "grad_norm": 3.0106358528137207, + "kl": 0.071319580078125, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 4985428.0, + "reward": -4.9619140625, + "reward_std": 5.420950412750244, + "rewards/rm_reward_func/mean": -4.9619140625, + "rewards/rm_reward_func/std": 18.715456008911133, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 256.21875, + "completions/mean_terminated_length": 247.9677276611328, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.2688, + "grad_norm": 2.012521266937256, + "kl": 0.061553955078125, + "learning_rate": 1e-06, + "loss": 0.0486, + "num_tokens": 5002027.0, + "reward": -3.966064453125, + "reward_std": 6.985547065734863, + "rewards/rm_reward_func/mean": -3.966064453125, + "rewards/rm_reward_func/std": 19.603078842163086, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 390.90625, + "completions/mean_terminated_length": 357.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.2696, + "grad_norm": 1.628297209739685, + "kl": 0.0216217041015625, + "learning_rate": 1e-06, + "loss": -0.0761, + "num_tokens": 5017824.0, + "reward": -8.81033706665039, + "reward_std": 7.6989922523498535, + "rewards/rm_reward_func/mean": -8.81033706665039, + "rewards/rm_reward_func/std": 9.597116470336914, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 291.5625, + "completions/mean_terminated_length": 268.75860595703125, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.2704, + "grad_norm": 2.2480905055999756, + "kl": 0.030853271484375, + "learning_rate": 1e-06, + "loss": -0.0634, + "num_tokens": 5030514.0, + "reward": -3.6298828125, + "reward_std": 6.74791955947876, + "rewards/rm_reward_func/mean": -3.6298828125, + "rewards/rm_reward_func/std": 15.210151672363281, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 220.75, + "completions/mean_terminated_length": 220.75, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.2712, + "grad_norm": 3.0822904109954834, + "kl": 0.04705810546875, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 5039834.0, + "reward": -8.640167236328125, + "reward_std": 5.175829887390137, + "rewards/rm_reward_func/mean": -8.640167236328125, + "rewards/rm_reward_func/std": 9.299836158752441, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 479.0, + "completions/mean_terminated_length": 361.14288330078125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.272, + "grad_norm": 1.4823734760284424, + "kl": 0.0165557861328125, + "learning_rate": 1e-06, + "loss": -0.0675, + "num_tokens": 5058018.0, + "reward": -12.299896240234375, + "reward_std": 4.636070251464844, + "rewards/rm_reward_func/mean": -12.299896240234375, + "rewards/rm_reward_func/std": 11.526834487915039, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 395.25, + "completions/mean_terminated_length": 315.368408203125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.2728, + "grad_norm": 1.805376648902893, + "kl": 0.02496337890625, + "learning_rate": 1e-06, + "loss": 0.0437, + "num_tokens": 5073338.0, + "reward": -0.086181640625, + "reward_std": 7.662697792053223, + "rewards/rm_reward_func/mean": -0.086181640625, + "rewards/rm_reward_func/std": 13.884127616882324, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 349.28125, + "completions/mean_terminated_length": 326.0357360839844, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.2736, + "grad_norm": 1.7540146112442017, + "kl": 0.046905517578125, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 5092387.0, + "reward": 4.705078125, + "reward_std": 7.491105079650879, + "rewards/rm_reward_func/mean": 4.705078125, + "rewards/rm_reward_func/std": 16.02699089050293, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 456.65625, + "completions/mean_terminated_length": 393.933349609375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.2744, + "grad_norm": 1.596224308013916, + "kl": 0.0178375244140625, + "learning_rate": 1e-06, + "loss": -0.033, + "num_tokens": 5108936.0, + "reward": -11.037109375, + "reward_std": 4.30076265335083, + "rewards/rm_reward_func/mean": -11.037109375, + "rewards/rm_reward_func/std": 6.300858497619629, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 402.03125, + "completions/mean_terminated_length": 359.0, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.2752, + "grad_norm": 1.6954522132873535, + "kl": 0.0232696533203125, + "learning_rate": 1e-06, + "loss": 0.0552, + "num_tokens": 5127001.0, + "reward": 6.864501953125, + "reward_std": 6.955081939697266, + "rewards/rm_reward_func/mean": 6.864501953125, + "rewards/rm_reward_func/std": 23.170753479003906, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 371.28125, + "completions/mean_terminated_length": 261.8333435058594, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.276, + "grad_norm": 1.9409406185150146, + "kl": 0.024749755859375, + "learning_rate": 1e-06, + "loss": 0.0925, + "num_tokens": 5141346.0, + "reward": -3.85028076171875, + "reward_std": 6.2533369064331055, + "rewards/rm_reward_func/mean": -3.85028076171875, + "rewards/rm_reward_func/std": 12.705477714538574, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 335.59375, + "completions/mean_terminated_length": 266.5652160644531, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.2768, + "grad_norm": 1.9261739253997803, + "kl": 0.052581787109375, + "learning_rate": 1e-06, + "loss": -0.0241, + "num_tokens": 5160381.0, + "reward": 1.224853515625, + "reward_std": 8.395392417907715, + "rewards/rm_reward_func/mean": 1.224853515625, + "rewards/rm_reward_func/std": 16.938411712646484, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 258.625, + "completions/mean_terminated_length": 241.7333526611328, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.2776, + "grad_norm": 2.2349298000335693, + "kl": 0.0374755859375, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 5173585.0, + "reward": -0.70086669921875, + "reward_std": 5.016613483428955, + "rewards/rm_reward_func/mean": -0.70086669921875, + "rewards/rm_reward_func/std": 10.745451927185059, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 299.53125, + "completions/mean_terminated_length": 260.1851806640625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.2784, + "grad_norm": 1.9352341890335083, + "kl": 0.05029296875, + "learning_rate": 1e-06, + "loss": 0.0547, + "num_tokens": 5188706.0, + "reward": -7.8873291015625, + "reward_std": 6.4391865730285645, + "rewards/rm_reward_func/mean": -7.8873291015625, + "rewards/rm_reward_func/std": 12.683124542236328, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 324.40625, + "completions/mean_terminated_length": 251.0, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.2792, + "grad_norm": 2.1771438121795654, + "kl": 0.036468505859375, + "learning_rate": 1e-06, + "loss": 0.0328, + "num_tokens": 5203655.0, + "reward": -13.24169921875, + "reward_std": 5.286601543426514, + "rewards/rm_reward_func/mean": -13.24169921875, + "rewards/rm_reward_func/std": 6.826824188232422, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 370.90625, + "completions/mean_terminated_length": 306.7727355957031, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.28, + "grad_norm": 2.0083401203155518, + "kl": 0.01971435546875, + "learning_rate": 1e-06, + "loss": -0.0278, + "num_tokens": 5217252.0, + "reward": -13.737640380859375, + "reward_std": 4.453140735626221, + "rewards/rm_reward_func/mean": -13.737640380859375, + "rewards/rm_reward_func/std": 8.085177421569824, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 350.375, + "completions/mean_terminated_length": 265.71429443359375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.2808, + "grad_norm": 1.8093756437301636, + "kl": 0.05267333984375, + "learning_rate": 1e-06, + "loss": 0.0633, + "num_tokens": 5240048.0, + "reward": 2.6619110107421875, + "reward_std": 6.197319984436035, + "rewards/rm_reward_func/mean": 2.6619110107421875, + "rewards/rm_reward_func/std": 22.322683334350586, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 193.3125, + "completions/mean_terminated_length": 183.03225708007812, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.2816, + "grad_norm": 2.6316277980804443, + "kl": 0.06842041015625, + "learning_rate": 1e-06, + "loss": 0.1929, + "num_tokens": 5252978.0, + "reward": -3.892364501953125, + "reward_std": 6.66155481338501, + "rewards/rm_reward_func/mean": -3.892364501953125, + "rewards/rm_reward_func/std": 12.852538108825684, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 360.125, + "completions/mean_terminated_length": 309.5, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.2824, + "grad_norm": 1.7261713743209839, + "kl": 0.029632568359375, + "learning_rate": 1e-06, + "loss": -0.0563, + "num_tokens": 5268494.0, + "reward": -2.4473037719726562, + "reward_std": 3.8644378185272217, + "rewards/rm_reward_func/mean": -2.4473037719726562, + "rewards/rm_reward_func/std": 18.2664852142334, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 328.59375, + "completions/mean_terminated_length": 309.6206970214844, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.2832, + "grad_norm": 1.8559916019439697, + "kl": 0.037078857421875, + "learning_rate": 1e-06, + "loss": 0.0477, + "num_tokens": 5284033.0, + "reward": 1.348388671875, + "reward_std": 5.176307678222656, + "rewards/rm_reward_func/mean": 1.348388671875, + "rewards/rm_reward_func/std": 17.202606201171875, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 294.03125, + "completions/mean_terminated_length": 271.4827575683594, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.284, + "grad_norm": 1.989271640777588, + "kl": 0.0599365234375, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 5301298.0, + "reward": -1.9483642578125, + "reward_std": 7.640567779541016, + "rewards/rm_reward_func/mean": -1.9483642578125, + "rewards/rm_reward_func/std": 8.990771293640137, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 289.5, + "completions/mean_terminated_length": 227.1999969482422, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.2848, + "grad_norm": 2.1532859802246094, + "kl": 0.08905029296875, + "learning_rate": 1e-06, + "loss": -0.032, + "num_tokens": 5318114.0, + "reward": -3.396728515625, + "reward_std": 4.209635257720947, + "rewards/rm_reward_func/mean": -3.396728515625, + "rewards/rm_reward_func/std": 13.515035629272461, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 298.75, + "completions/mean_terminated_length": 284.5333557128906, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.2856, + "grad_norm": 2.084589719772339, + "kl": 0.05072021484375, + "learning_rate": 1e-06, + "loss": -0.0287, + "num_tokens": 5332250.0, + "reward": -5.32440185546875, + "reward_std": 5.6072797775268555, + "rewards/rm_reward_func/mean": -5.32440185546875, + "rewards/rm_reward_func/std": 6.242136001586914, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 430.875, + "completions/mean_terminated_length": 326.5714416503906, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.2864, + "grad_norm": 1.4034979343414307, + "kl": 0.02532958984375, + "learning_rate": 1e-06, + "loss": -0.0278, + "num_tokens": 5353942.0, + "reward": -8.100069046020508, + "reward_std": 3.8967063426971436, + "rewards/rm_reward_func/mean": -8.100069046020508, + "rewards/rm_reward_func/std": 6.735255718231201, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 381.5625, + "completions/mean_terminated_length": 322.2727355957031, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.2872, + "grad_norm": 1.8018348217010498, + "kl": 0.041107177734375, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 5374488.0, + "reward": -2.639892578125, + "reward_std": 5.309588432312012, + "rewards/rm_reward_func/mean": -2.639892578125, + "rewards/rm_reward_func/std": 15.092827796936035, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 241.65625, + "completions/mean_terminated_length": 151.5416717529297, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.288, + "grad_norm": 4.3712968826293945, + "kl": 0.0679931640625, + "learning_rate": 1e-06, + "loss": -0.0242, + "num_tokens": 5384165.0, + "reward": -1.04290771484375, + "reward_std": 5.808795928955078, + "rewards/rm_reward_func/mean": -1.04290771484375, + "rewards/rm_reward_func/std": 9.838920593261719, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 388.84375, + "completions/mean_terminated_length": 293.0555725097656, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.2888, + "grad_norm": 2.020167350769043, + "kl": 0.031707763671875, + "learning_rate": 1e-06, + "loss": -0.073, + "num_tokens": 5402712.0, + "reward": -20.28466796875, + "reward_std": 3.61586594581604, + "rewards/rm_reward_func/mean": -20.28466796875, + "rewards/rm_reward_func/std": 9.34034252166748, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 374.3125, + "completions/mean_terminated_length": 302.19049072265625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.2896, + "grad_norm": 1.8239446878433228, + "kl": 0.029296875, + "learning_rate": 1e-06, + "loss": 0.0324, + "num_tokens": 5416666.0, + "reward": -14.373291015625, + "reward_std": 3.5013060569763184, + "rewards/rm_reward_func/mean": -14.373291015625, + "rewards/rm_reward_func/std": 8.156317710876465, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 437.1875, + "completions/mean_terminated_length": 386.0, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.2904, + "grad_norm": 1.5334134101867676, + "kl": 0.01849365234375, + "learning_rate": 1e-06, + "loss": 0.032, + "num_tokens": 5433840.0, + "reward": -8.109619140625, + "reward_std": 6.245476722717285, + "rewards/rm_reward_func/mean": -8.109619140625, + "rewards/rm_reward_func/std": 11.481464385986328, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 295.53125, + "completions/mean_terminated_length": 245.57693481445312, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.2912, + "grad_norm": 2.8151254653930664, + "kl": 0.042633056640625, + "learning_rate": 1e-06, + "loss": 0.1598, + "num_tokens": 5448513.0, + "reward": -10.741943359375, + "reward_std": 3.273339033126831, + "rewards/rm_reward_func/mean": -10.741943359375, + "rewards/rm_reward_func/std": 8.218607902526855, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 246.6875, + "completions/mean_terminated_length": 246.6875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.292, + "grad_norm": 2.0407087802886963, + "kl": 0.081298828125, + "learning_rate": 1e-06, + "loss": 0.0467, + "num_tokens": 5465983.0, + "reward": 13.4329833984375, + "reward_std": 5.441981315612793, + "rewards/rm_reward_func/mean": 13.4329833984375, + "rewards/rm_reward_func/std": 9.746720314025879, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 359.15625, + "completions/mean_terminated_length": 267.45001220703125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.2928, + "grad_norm": 2.3283233642578125, + "kl": 0.02703857421875, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 5479492.0, + "reward": -13.741943359375, + "reward_std": 6.479031562805176, + "rewards/rm_reward_func/mean": -13.741943359375, + "rewards/rm_reward_func/std": 7.022981643676758, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 265.53125, + "completions/mean_terminated_length": 169.0869598388672, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.2936, + "grad_norm": 2.344831943511963, + "kl": 0.058349609375, + "learning_rate": 1e-06, + "loss": 0.116, + "num_tokens": 5493733.0, + "reward": -4.61865234375, + "reward_std": 4.233585357666016, + "rewards/rm_reward_func/mean": -4.61865234375, + "rewards/rm_reward_func/std": 10.193796157836914, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 462.25, + "completions/mean_terminated_length": 418.3529357910156, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.2944, + "grad_norm": 1.607344627380371, + "kl": 0.020294189453125, + "learning_rate": 1e-06, + "loss": 0.0257, + "num_tokens": 5512061.0, + "reward": -9.101043701171875, + "reward_std": 4.552679061889648, + "rewards/rm_reward_func/mean": -9.101043701171875, + "rewards/rm_reward_func/std": 4.896659851074219, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 369.0, + "completions/mean_terminated_length": 313.0434875488281, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.2952, + "grad_norm": 1.6382696628570557, + "kl": 0.036224365234375, + "learning_rate": 1e-06, + "loss": -0.0807, + "num_tokens": 5526021.0, + "reward": 6.380889892578125, + "reward_std": 9.070781707763672, + "rewards/rm_reward_func/mean": 6.380889892578125, + "rewards/rm_reward_func/std": 11.414023399353027, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 223.1875, + "completions/mean_terminated_length": 203.933349609375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.296, + "grad_norm": 2.6846063137054443, + "kl": 0.059051513671875, + "learning_rate": 1e-06, + "loss": 0.1782, + "num_tokens": 5538603.0, + "reward": 4.91796875, + "reward_std": 6.754969596862793, + "rewards/rm_reward_func/mean": 4.91796875, + "rewards/rm_reward_func/std": 11.670215606689453, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 474.9375, + "completions/mean_terminated_length": 393.3999938964844, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.2968, + "grad_norm": 1.4546644687652588, + "kl": 0.018157958984375, + "learning_rate": 1e-06, + "loss": 0.0707, + "num_tokens": 5556417.0, + "reward": -5.75927734375, + "reward_std": 5.358532905578613, + "rewards/rm_reward_func/mean": -5.75927734375, + "rewards/rm_reward_func/std": 15.64819622039795, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 272.6875, + "completions/mean_terminated_length": 272.6875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.2976, + "grad_norm": 2.2273054122924805, + "kl": 0.0421142578125, + "learning_rate": 1e-06, + "loss": -0.072, + "num_tokens": 5568959.0, + "reward": 0.921630859375, + "reward_std": 5.4181108474731445, + "rewards/rm_reward_func/mean": 0.921630859375, + "rewards/rm_reward_func/std": 17.089065551757812, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 284.40625, + "completions/mean_terminated_length": 260.862060546875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.2984, + "grad_norm": 1.8863608837127686, + "kl": 0.070709228515625, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 5587652.0, + "reward": 5.859762668609619, + "reward_std": 6.691329479217529, + "rewards/rm_reward_func/mean": 5.859762668609619, + "rewards/rm_reward_func/std": 15.565380096435547, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 365.875, + "completions/mean_terminated_length": 338.8148193359375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.2992, + "grad_norm": 1.7884395122528076, + "kl": 0.0263671875, + "learning_rate": 1e-06, + "loss": -0.0189, + "num_tokens": 5604088.0, + "reward": -5.9788818359375, + "reward_std": 5.031845569610596, + "rewards/rm_reward_func/mean": -5.9788818359375, + "rewards/rm_reward_func/std": 10.35934066772461, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 358.96875, + "completions/mean_terminated_length": 299.08697509765625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3, + "grad_norm": 1.8511404991149902, + "kl": 0.05419921875, + "learning_rate": 1e-06, + "loss": 0.0233, + "num_tokens": 5621687.0, + "reward": -3.8203125, + "reward_std": 7.369033336639404, + "rewards/rm_reward_func/mean": -3.8203125, + "rewards/rm_reward_func/std": 10.927128791809082, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 393.28125, + "completions/mean_terminated_length": 331.0952453613281, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.3008, + "grad_norm": 1.8309043645858765, + "kl": 0.039520263671875, + "learning_rate": 1e-06, + "loss": 0.0464, + "num_tokens": 5638464.0, + "reward": -6.15789794921875, + "reward_std": 4.764569282531738, + "rewards/rm_reward_func/mean": -6.15789794921875, + "rewards/rm_reward_func/std": 13.8583984375, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 414.78125, + "completions/mean_terminated_length": 339.1666564941406, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.3016, + "grad_norm": 1.8031203746795654, + "kl": 0.0245819091796875, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 5655065.0, + "reward": -5.0400390625, + "reward_std": 5.560011863708496, + "rewards/rm_reward_func/mean": -5.0400390625, + "rewards/rm_reward_func/std": 8.760733604431152, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 340.15625, + "completions/mean_terminated_length": 282.875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3024, + "grad_norm": 2.0854198932647705, + "kl": 0.02978515625, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 5667870.0, + "reward": -7.791015625, + "reward_std": 4.953996181488037, + "rewards/rm_reward_func/mean": -7.791015625, + "rewards/rm_reward_func/std": 8.143828392028809, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 266.53125, + "completions/mean_terminated_length": 170.478271484375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3032, + "grad_norm": 4.312221050262451, + "kl": 0.064727783203125, + "learning_rate": 1e-06, + "loss": 0.0302, + "num_tokens": 5679439.0, + "reward": -12.36181640625, + "reward_std": 5.124608516693115, + "rewards/rm_reward_func/mean": -12.36181640625, + "rewards/rm_reward_func/std": 5.709782123565674, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 312.4375, + "completions/mean_terminated_length": 275.4814758300781, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.304, + "grad_norm": 2.233138084411621, + "kl": 0.061248779296875, + "learning_rate": 1e-06, + "loss": 0.1093, + "num_tokens": 5694141.0, + "reward": -0.572265625, + "reward_std": 7.462414264678955, + "rewards/rm_reward_func/mean": -0.572265625, + "rewards/rm_reward_func/std": 17.70692253112793, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 353.4375, + "completions/mean_terminated_length": 230.11111450195312, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3048, + "grad_norm": 3.909087657928467, + "kl": 0.0380401611328125, + "learning_rate": 1e-06, + "loss": 0.1637, + "num_tokens": 5713523.0, + "reward": -7.41650390625, + "reward_std": 3.9527621269226074, + "rewards/rm_reward_func/mean": -7.41650390625, + "rewards/rm_reward_func/std": 4.340976715087891, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 327.96875, + "completions/mean_terminated_length": 276.44000244140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.3056, + "grad_norm": 1.762288212776184, + "kl": 0.086151123046875, + "learning_rate": 1e-06, + "loss": -0.0227, + "num_tokens": 5731066.0, + "reward": -7.09259033203125, + "reward_std": 7.640285968780518, + "rewards/rm_reward_func/mean": -7.09259033203125, + "rewards/rm_reward_func/std": 9.754437446594238, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 381.8125, + "completions/mean_terminated_length": 351.7692565917969, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.3064, + "grad_norm": 1.9935667514801025, + "kl": 0.038238525390625, + "learning_rate": 1e-06, + "loss": -0.0211, + "num_tokens": 5745788.0, + "reward": -2.145538330078125, + "reward_std": 5.055974960327148, + "rewards/rm_reward_func/mean": -2.145538330078125, + "rewards/rm_reward_func/std": 16.788618087768555, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 307.40625, + "completions/mean_terminated_length": 239.20834350585938, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.3072, + "grad_norm": 2.00789475440979, + "kl": 0.059600830078125, + "learning_rate": 1e-06, + "loss": 0.0343, + "num_tokens": 5764921.0, + "reward": 3.28466796875, + "reward_std": 3.6567811965942383, + "rewards/rm_reward_func/mean": 3.28466796875, + "rewards/rm_reward_func/std": 19.416015625, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 210.65625, + "completions/mean_terminated_length": 210.65625, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.308, + "grad_norm": 2.3222641944885254, + "kl": 0.093994140625, + "learning_rate": 1e-06, + "loss": 0.0369, + "num_tokens": 5780222.0, + "reward": -0.32086181640625, + "reward_std": 6.337305068969727, + "rewards/rm_reward_func/mean": -0.32086181640625, + "rewards/rm_reward_func/std": 10.747522354125977, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 444.625, + "completions/mean_terminated_length": 414.0, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.3088, + "grad_norm": 1.7412981986999512, + "kl": 0.022613525390625, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 5796866.0, + "reward": -6.87158203125, + "reward_std": 7.653716564178467, + "rewards/rm_reward_func/mean": -6.87158203125, + "rewards/rm_reward_func/std": 9.586761474609375, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 394.40625, + "completions/mean_terminated_length": 313.9473571777344, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.3096, + "grad_norm": 2.0770926475524902, + "kl": 0.055328369140625, + "learning_rate": 1e-06, + "loss": 0.0292, + "num_tokens": 5816607.0, + "reward": -8.527099609375, + "reward_std": 5.560388088226318, + "rewards/rm_reward_func/mean": -8.527099609375, + "rewards/rm_reward_func/std": 7.789054870605469, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 233.3125, + "completions/mean_terminated_length": 233.3125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.3104, + "grad_norm": 2.134597063064575, + "kl": 0.049163818359375, + "learning_rate": 1e-06, + "loss": 0.0533, + "num_tokens": 5829273.0, + "reward": -8.308349609375, + "reward_std": 3.111523389816284, + "rewards/rm_reward_func/mean": -8.308349609375, + "rewards/rm_reward_func/std": 7.5683465003967285, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 308.25, + "completions/mean_terminated_length": 251.1999969482422, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.3112, + "grad_norm": 1.9076645374298096, + "kl": 0.076416015625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 5850465.0, + "reward": 1.86328125, + "reward_std": 7.757295608520508, + "rewards/rm_reward_func/mean": 1.86328125, + "rewards/rm_reward_func/std": 14.142688751220703, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 445.34375, + "completions/mean_terminated_length": 386.5294189453125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.312, + "grad_norm": 1.6727628707885742, + "kl": 0.0352783203125, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 5869204.0, + "reward": -3.986175537109375, + "reward_std": 7.1820149421691895, + "rewards/rm_reward_func/mean": -3.986175537109375, + "rewards/rm_reward_func/std": 13.791895866394043, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 439.53125, + "completions/mean_terminated_length": 346.3571472167969, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.3128, + "grad_norm": 1.7423282861709595, + "kl": 0.021331787109375, + "learning_rate": 1e-06, + "loss": -0.1166, + "num_tokens": 5885845.0, + "reward": -11.200927734375, + "reward_std": 2.616241455078125, + "rewards/rm_reward_func/mean": -11.200927734375, + "rewards/rm_reward_func/std": 7.977281093597412, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 396.6875, + "completions/mean_terminated_length": 351.5652160644531, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.3136, + "grad_norm": 1.7295548915863037, + "kl": 0.03594970703125, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 5906531.0, + "reward": -4.1290740966796875, + "reward_std": 6.89487361907959, + "rewards/rm_reward_func/mean": -4.1290740966796875, + "rewards/rm_reward_func/std": 20.883413314819336, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 436.96875, + "completions/mean_terminated_length": 370.76470947265625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.3144, + "grad_norm": 1.5077844858169556, + "kl": 0.0215911865234375, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 5923466.0, + "reward": -12.6561279296875, + "reward_std": 7.444554328918457, + "rewards/rm_reward_func/mean": -12.6561279296875, + "rewards/rm_reward_func/std": 11.132777214050293, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 329.625, + "completions/mean_terminated_length": 258.2608642578125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.3152, + "grad_norm": 2.140815019607544, + "kl": 0.06842041015625, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 5940158.0, + "reward": -5.93109130859375, + "reward_std": 5.369607925415039, + "rewards/rm_reward_func/mean": -5.93109130859375, + "rewards/rm_reward_func/std": 14.757482528686523, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 305.875, + "completions/mean_terminated_length": 237.1666717529297, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.316, + "grad_norm": 2.237273931503296, + "kl": 0.049835205078125, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 5954058.0, + "reward": -5.15557861328125, + "reward_std": 4.165969371795654, + "rewards/rm_reward_func/mean": -5.15557861328125, + "rewards/rm_reward_func/std": 8.919110298156738, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 409.71875, + "completions/mean_terminated_length": 356.1428527832031, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.3168, + "grad_norm": 1.6388943195343018, + "kl": 0.0223846435546875, + "learning_rate": 1e-06, + "loss": -0.0923, + "num_tokens": 5969801.0, + "reward": -2.6805419921875, + "reward_std": 7.366881370544434, + "rewards/rm_reward_func/mean": -2.6805419921875, + "rewards/rm_reward_func/std": 9.875097274780273, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 313.125, + "completions/mean_terminated_length": 222.72727966308594, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.3176, + "grad_norm": 2.1666622161865234, + "kl": 0.042724609375, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 5981677.0, + "reward": -9.8662109375, + "reward_std": 5.74767541885376, + "rewards/rm_reward_func/mean": -9.8662109375, + "rewards/rm_reward_func/std": 9.001533508300781, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 305.53125, + "completions/mean_terminated_length": 257.8846130371094, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.3184, + "grad_norm": 1.9383690357208252, + "kl": 0.049835205078125, + "learning_rate": 1e-06, + "loss": -0.0213, + "num_tokens": 5996806.0, + "reward": -0.748779296875, + "reward_std": 6.850553512573242, + "rewards/rm_reward_func/mean": -0.748779296875, + "rewards/rm_reward_func/std": 13.030008316040039, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 440.125, + "completions/mean_terminated_length": 384.22222900390625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.3192, + "grad_norm": 1.5667747259140015, + "kl": 0.0299530029296875, + "learning_rate": 1e-06, + "loss": -0.04, + "num_tokens": 6014370.0, + "reward": -12.9090576171875, + "reward_std": 7.616760730743408, + "rewards/rm_reward_func/mean": -12.9090576171875, + "rewards/rm_reward_func/std": 9.199716567993164, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 225.0625, + "completions/mean_terminated_length": 215.8064422607422, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.32, + "grad_norm": 2.1162643432617188, + "kl": 0.09149169921875, + "learning_rate": 1e-06, + "loss": -0.1137, + "num_tokens": 6032108.0, + "reward": 2.60638427734375, + "reward_std": 6.005467414855957, + "rewards/rm_reward_func/mean": 2.60638427734375, + "rewards/rm_reward_func/std": 13.77566146850586, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 265.15625, + "completions/mean_terminated_length": 219.44444274902344, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.3208, + "grad_norm": 2.059943199157715, + "kl": 0.0811767578125, + "learning_rate": 1e-06, + "loss": 0.0925, + "num_tokens": 6048705.0, + "reward": 7.7172698974609375, + "reward_std": 6.769311904907227, + "rewards/rm_reward_func/mean": 7.7172698974609375, + "rewards/rm_reward_func/std": 14.219295501708984, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 357.09375, + "completions/mean_terminated_length": 296.478271484375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.3216, + "grad_norm": 1.8876936435699463, + "kl": 0.05877685546875, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 6066140.0, + "reward": -5.876953125, + "reward_std": 3.376842498779297, + "rewards/rm_reward_func/mean": -5.876953125, + "rewards/rm_reward_func/std": 17.701467514038086, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 107.0, + "completions/mean_length": 389.46875, + "completions/mean_terminated_length": 76.33333587646484, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.3224, + "grad_norm": 2.607593059539795, + "kl": 0.032470703125, + "learning_rate": 1e-06, + "loss": 0.0791, + "num_tokens": 6080843.0, + "reward": -18.32568359375, + "reward_std": 5.9816083908081055, + "rewards/rm_reward_func/mean": -18.32568359375, + "rewards/rm_reward_func/std": 10.676798820495605, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 268.75, + "completions/mean_terminated_length": 243.58621215820312, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.3232, + "grad_norm": 4.719945430755615, + "kl": 0.0610809326171875, + "learning_rate": 1e-06, + "loss": 0.0841, + "num_tokens": 6094939.0, + "reward": 3.931671142578125, + "reward_std": 6.294184684753418, + "rewards/rm_reward_func/mean": 3.931671142578125, + "rewards/rm_reward_func/std": 11.474594116210938, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 198.34483337402344, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.324, + "grad_norm": 2.23734450340271, + "kl": 0.045806884765625, + "learning_rate": 1e-06, + "loss": 0.0176, + "num_tokens": 6106755.0, + "reward": -6.274169921875, + "reward_std": 5.485233306884766, + "rewards/rm_reward_func/mean": -6.274169921875, + "rewards/rm_reward_func/std": 16.541460037231445, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 253.28125, + "completions/mean_terminated_length": 236.03334045410156, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.3248, + "grad_norm": 1.9540272951126099, + "kl": 0.06414794921875, + "learning_rate": 1e-06, + "loss": 0.073, + "num_tokens": 6123484.0, + "reward": -1.925079345703125, + "reward_std": 4.655886650085449, + "rewards/rm_reward_func/mean": -1.925079345703125, + "rewards/rm_reward_func/std": 16.971372604370117, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 414.46875, + "completions/mean_terminated_length": 338.6111145019531, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.3256, + "grad_norm": 1.9807032346725464, + "kl": 0.0399169921875, + "learning_rate": 1e-06, + "loss": 0.036, + "num_tokens": 6140875.0, + "reward": -10.509796142578125, + "reward_std": 3.916581153869629, + "rewards/rm_reward_func/mean": -10.509796142578125, + "rewards/rm_reward_func/std": 7.326880931854248, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 331.0625, + "completions/mean_terminated_length": 312.3448181152344, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3264, + "grad_norm": 2.1229090690612793, + "kl": 0.0673828125, + "learning_rate": 1e-06, + "loss": -0.103, + "num_tokens": 6161261.0, + "reward": 1.81964111328125, + "reward_std": 6.121976852416992, + "rewards/rm_reward_func/mean": 1.81964111328125, + "rewards/rm_reward_func/std": 17.086164474487305, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 355.4375, + "completions/mean_terminated_length": 273.4285888671875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3272, + "grad_norm": 2.1034395694732666, + "kl": 0.0484466552734375, + "learning_rate": 1e-06, + "loss": -0.1113, + "num_tokens": 6178307.0, + "reward": -1.4040412902832031, + "reward_std": 6.1752119064331055, + "rewards/rm_reward_func/mean": -1.4040412902832031, + "rewards/rm_reward_func/std": 8.065441131591797, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 349.9375, + "completions/mean_terminated_length": 286.5217590332031, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.328, + "grad_norm": 2.1528029441833496, + "kl": 0.0430908203125, + "learning_rate": 1e-06, + "loss": -0.0194, + "num_tokens": 6195553.0, + "reward": -8.544677734375, + "reward_std": 3.9103150367736816, + "rewards/rm_reward_func/mean": -8.544677734375, + "rewards/rm_reward_func/std": 4.936274528503418, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 457.625, + "completions/mean_terminated_length": 318.6666564941406, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.3288, + "grad_norm": 1.6279497146606445, + "kl": 0.0218658447265625, + "learning_rate": 1e-06, + "loss": -0.0601, + "num_tokens": 6213261.0, + "reward": -14.61700439453125, + "reward_std": 2.316563129425049, + "rewards/rm_reward_func/mean": -14.61700439453125, + "rewards/rm_reward_func/std": 10.445907592773438, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 372.46875, + "completions/mean_terminated_length": 299.3809509277344, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.3296, + "grad_norm": 1.6539037227630615, + "kl": 0.05810546875, + "learning_rate": 1e-06, + "loss": -0.0196, + "num_tokens": 6233092.0, + "reward": -0.601226806640625, + "reward_std": 6.572290420532227, + "rewards/rm_reward_func/mean": -0.601226806640625, + "rewards/rm_reward_func/std": 13.360589981079102, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 391.03125, + "completions/mean_terminated_length": 327.66668701171875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.3304, + "grad_norm": 2.0171449184417725, + "kl": 0.038330078125, + "learning_rate": 1e-06, + "loss": -0.0259, + "num_tokens": 6251781.0, + "reward": -3.72802734375, + "reward_std": 3.2654664516448975, + "rewards/rm_reward_func/mean": -3.72802734375, + "rewards/rm_reward_func/std": 18.840932846069336, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 512.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 447.75, + "completions/mean_terminated_length": 255.0, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.3312, + "grad_norm": 1.609928846359253, + "kl": 0.0330657958984375, + "learning_rate": 1e-06, + "loss": -0.0353, + "num_tokens": 6276029.0, + "reward": -6.253570556640625, + "reward_std": 4.647231578826904, + "rewards/rm_reward_func/mean": -6.253570556640625, + "rewards/rm_reward_func/std": 12.78857135772705, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 403.09375, + "completions/mean_terminated_length": 307.0, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.332, + "grad_norm": 1.5196458101272583, + "kl": 0.022216796875, + "learning_rate": 1e-06, + "loss": 0.1536, + "num_tokens": 6292984.0, + "reward": -16.8687744140625, + "reward_std": 5.411165237426758, + "rewards/rm_reward_func/mean": -16.8687744140625, + "rewards/rm_reward_func/std": 8.016997337341309, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 267.625, + "completions/mean_terminated_length": 199.1999969482422, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3328, + "grad_norm": 4.476416110992432, + "kl": 0.0919189453125, + "learning_rate": 1e-06, + "loss": 0.0993, + "num_tokens": 6311812.0, + "reward": 3.409423828125, + "reward_std": 7.78582239151001, + "rewards/rm_reward_func/mean": 3.409423828125, + "rewards/rm_reward_func/std": 13.981962203979492, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 356.625, + "completions/mean_terminated_length": 263.3999938964844, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.3336, + "grad_norm": 1.4714343547821045, + "kl": 0.064117431640625, + "learning_rate": 1e-06, + "loss": 0.0166, + "num_tokens": 6334784.0, + "reward": 5.2001953125, + "reward_std": 6.571274757385254, + "rewards/rm_reward_func/mean": 5.2001953125, + "rewards/rm_reward_func/std": 20.831417083740234, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 408.1875, + "completions/mean_terminated_length": 373.5833435058594, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.3344, + "grad_norm": 1.7061028480529785, + "kl": 0.0244293212890625, + "learning_rate": 1e-06, + "loss": -0.0624, + "num_tokens": 6353462.0, + "reward": -3.072998046875, + "reward_std": 6.107587814331055, + "rewards/rm_reward_func/mean": -3.072998046875, + "rewards/rm_reward_func/std": 9.590071678161621, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 297.5625, + "completions/mean_terminated_length": 237.51998901367188, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.3352, + "grad_norm": 2.0976195335388184, + "kl": 0.030426025390625, + "learning_rate": 1e-06, + "loss": 0.025, + "num_tokens": 6368312.0, + "reward": -2.597900390625, + "reward_std": 7.013198375701904, + "rewards/rm_reward_func/mean": -2.597900390625, + "rewards/rm_reward_func/std": 11.319692611694336, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 269.625, + "completions/mean_terminated_length": 261.80645751953125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.336, + "grad_norm": 2.5582680702209473, + "kl": 0.049713134765625, + "learning_rate": 1e-06, + "loss": -0.0939, + "num_tokens": 6382996.0, + "reward": -1.8638916015625, + "reward_std": 6.8398332595825195, + "rewards/rm_reward_func/mean": -1.8638916015625, + "rewards/rm_reward_func/std": 9.822707176208496, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 308.4375, + "completions/mean_terminated_length": 215.9091033935547, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.3368, + "grad_norm": 3.3659751415252686, + "kl": 0.05645751953125, + "learning_rate": 1e-06, + "loss": 0.1791, + "num_tokens": 6401330.0, + "reward": -4.572998046875, + "reward_std": 6.079010963439941, + "rewards/rm_reward_func/mean": -4.572998046875, + "rewards/rm_reward_func/std": 7.835475921630859, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 318.625, + "completions/mean_terminated_length": 298.6206970214844, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.3376, + "grad_norm": 1.8582417964935303, + "kl": 0.027679443359375, + "learning_rate": 1e-06, + "loss": 0.0283, + "num_tokens": 6418062.0, + "reward": 5.303489685058594, + "reward_std": 6.764530181884766, + "rewards/rm_reward_func/mean": 5.303489685058594, + "rewards/rm_reward_func/std": 17.123449325561523, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 359.75, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.3384, + "grad_norm": 1.7756037712097168, + "kl": 0.04156494140625, + "learning_rate": 1e-06, + "loss": -0.0339, + "num_tokens": 6435270.0, + "reward": 3.2314453125, + "reward_std": 4.19941520690918, + "rewards/rm_reward_func/mean": 3.2314453125, + "rewards/rm_reward_func/std": 18.16136932373047, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 325.5, + "completions/mean_terminated_length": 273.2799987792969, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.3392, + "grad_norm": 1.9198381900787354, + "kl": 0.08453369140625, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 6451550.0, + "reward": -13.7490234375, + "reward_std": 4.622889518737793, + "rewards/rm_reward_func/mean": -13.7490234375, + "rewards/rm_reward_func/std": 7.576443672180176, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 330.03125, + "completions/mean_terminated_length": 324.1612854003906, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.34, + "grad_norm": 1.9938381910324097, + "kl": 0.027008056640625, + "learning_rate": 1e-06, + "loss": -0.028, + "num_tokens": 6466631.0, + "reward": -3.390380859375, + "reward_std": 5.43696928024292, + "rewards/rm_reward_func/mean": -3.390380859375, + "rewards/rm_reward_func/std": 11.02437686920166, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 406.53125, + "completions/mean_terminated_length": 351.2857360839844, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.3408, + "grad_norm": 2.015669345855713, + "kl": 0.022064208984375, + "learning_rate": 1e-06, + "loss": -0.099, + "num_tokens": 6482240.0, + "reward": -8.669967651367188, + "reward_std": 4.4157891273498535, + "rewards/rm_reward_func/mean": -8.669967651367188, + "rewards/rm_reward_func/std": 9.632221221923828, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 443.375, + "completions/mean_terminated_length": 390.0, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3416, + "grad_norm": 1.4734236001968384, + "kl": 0.016082763671875, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 6503196.0, + "reward": -4.57012939453125, + "reward_std": 8.454141616821289, + "rewards/rm_reward_func/mean": -4.57012939453125, + "rewards/rm_reward_func/std": 14.237653732299805, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 381.75, + "completions/mean_terminated_length": 280.4444580078125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.3424, + "grad_norm": 1.9539014101028442, + "kl": 0.027099609375, + "learning_rate": 1e-06, + "loss": -0.0878, + "num_tokens": 6520484.0, + "reward": -9.939960479736328, + "reward_std": 3.6802191734313965, + "rewards/rm_reward_func/mean": -9.939960479736328, + "rewards/rm_reward_func/std": 7.452787399291992, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 355.9375, + "completions/mean_terminated_length": 294.86956787109375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.3432, + "grad_norm": 1.582332968711853, + "kl": 0.0252532958984375, + "learning_rate": 1e-06, + "loss": 0.0506, + "num_tokens": 6538882.0, + "reward": -9.00830078125, + "reward_std": 5.846151828765869, + "rewards/rm_reward_func/mean": -9.00830078125, + "rewards/rm_reward_func/std": 7.6094279289245605, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 376.71875, + "completions/mean_terminated_length": 323.7826232910156, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.344, + "grad_norm": 1.7253316640853882, + "kl": 0.034088134765625, + "learning_rate": 1e-06, + "loss": 0.0517, + "num_tokens": 6555505.0, + "reward": -13.0115966796875, + "reward_std": 5.544178009033203, + "rewards/rm_reward_func/mean": -13.0115966796875, + "rewards/rm_reward_func/std": 6.809865951538086, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 293.46875, + "completions/mean_terminated_length": 243.03846740722656, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.3448, + "grad_norm": 2.1489760875701904, + "kl": 0.044586181640625, + "learning_rate": 1e-06, + "loss": -0.0291, + "num_tokens": 6570104.0, + "reward": -2.102783203125, + "reward_std": 4.935668468475342, + "rewards/rm_reward_func/mean": -2.102783203125, + "rewards/rm_reward_func/std": 16.327524185180664, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 344.21875, + "completions/mean_terminated_length": 288.29168701171875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.3456, + "grad_norm": 1.763724684715271, + "kl": 0.051727294921875, + "learning_rate": 1e-06, + "loss": 0.0343, + "num_tokens": 6588991.0, + "reward": 1.265380859375, + "reward_std": 7.057753086090088, + "rewards/rm_reward_func/mean": 1.265380859375, + "rewards/rm_reward_func/std": 13.014850616455078, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 430.8125, + "completions/mean_terminated_length": 295.5, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.3464, + "grad_norm": 1.5262986421585083, + "kl": 0.02630615234375, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 6608857.0, + "reward": -5.1100006103515625, + "reward_std": 4.372901916503906, + "rewards/rm_reward_func/mean": -5.1100006103515625, + "rewards/rm_reward_func/std": 12.572975158691406, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 446.78125, + "completions/mean_terminated_length": 351.4615478515625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.3472, + "grad_norm": 1.3079746961593628, + "kl": 0.0168914794921875, + "learning_rate": 1e-06, + "loss": 0.1265, + "num_tokens": 6626450.0, + "reward": -11.6146240234375, + "reward_std": 9.118956565856934, + "rewards/rm_reward_func/mean": -11.6146240234375, + "rewards/rm_reward_func/std": 14.253708839416504, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 419.78125, + "completions/mean_terminated_length": 364.45001220703125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.348, + "grad_norm": 1.660057783126831, + "kl": 0.0158843994140625, + "learning_rate": 1e-06, + "loss": -0.0878, + "num_tokens": 6644955.0, + "reward": -7.88311767578125, + "reward_std": 4.425982475280762, + "rewards/rm_reward_func/mean": -7.88311767578125, + "rewards/rm_reward_func/std": 8.020565032958984, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 328.25, + "completions/mean_terminated_length": 316.0000305175781, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.3488, + "grad_norm": 2.085163116455078, + "kl": 0.03985595703125, + "learning_rate": 1e-06, + "loss": 0.0972, + "num_tokens": 6658883.0, + "reward": -8.8162841796875, + "reward_std": 5.3973588943481445, + "rewards/rm_reward_func/mean": -8.8162841796875, + "rewards/rm_reward_func/std": 7.560286045074463, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 213.21739196777344, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3496, + "grad_norm": 2.533470869064331, + "kl": 0.0467529296875, + "learning_rate": 1e-06, + "loss": 0.0225, + "num_tokens": 6672747.0, + "reward": -4.641326904296875, + "reward_std": 5.287704944610596, + "rewards/rm_reward_func/mean": -4.641326904296875, + "rewards/rm_reward_func/std": 8.1183500289917, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 305.96875, + "completions/mean_terminated_length": 248.27999877929688, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.3504, + "grad_norm": 1.9119107723236084, + "kl": 0.05694580078125, + "learning_rate": 1e-06, + "loss": 0.0584, + "num_tokens": 6690394.0, + "reward": -4.938934326171875, + "reward_std": 6.149142265319824, + "rewards/rm_reward_func/mean": -4.938934326171875, + "rewards/rm_reward_func/std": 9.777031898498535, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 431.46875, + "completions/mean_terminated_length": 383.1499938964844, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.3512, + "grad_norm": 1.6924997568130493, + "kl": 0.01708984375, + "learning_rate": 1e-06, + "loss": -0.0395, + "num_tokens": 6706457.0, + "reward": -11.1624755859375, + "reward_std": 5.690855979919434, + "rewards/rm_reward_func/mean": -11.1624755859375, + "rewards/rm_reward_func/std": 11.857269287109375, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 408.5, + "completions/mean_terminated_length": 305.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.352, + "grad_norm": 1.6351063251495361, + "kl": 0.0347900390625, + "learning_rate": 1e-06, + "loss": 0.1134, + "num_tokens": 6724185.0, + "reward": -7.58599853515625, + "reward_std": 4.485294818878174, + "rewards/rm_reward_func/mean": -7.58599853515625, + "rewards/rm_reward_func/std": 16.259464263916016, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 395.375, + "completions/mean_terminated_length": 315.5789489746094, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.3528, + "grad_norm": 1.696117877960205, + "kl": 0.05206298828125, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 6743301.0, + "reward": -7.4130859375, + "reward_std": 4.897246837615967, + "rewards/rm_reward_func/mean": -7.4130859375, + "rewards/rm_reward_func/std": 9.967143058776855, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 318.53125, + "completions/mean_terminated_length": 254.0416717529297, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.3536, + "grad_norm": 1.8118313550949097, + "kl": 0.049346923828125, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 6760486.0, + "reward": -2.759063720703125, + "reward_std": 4.567935466766357, + "rewards/rm_reward_func/mean": -2.759063720703125, + "rewards/rm_reward_func/std": 15.637845993041992, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 345.0625, + "completions/mean_terminated_length": 279.7391357421875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.3544, + "grad_norm": 1.9427210092544556, + "kl": 0.047637939453125, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 6776872.0, + "reward": -0.4622802734375, + "reward_std": 4.253865718841553, + "rewards/rm_reward_func/mean": -0.4622802734375, + "rewards/rm_reward_func/std": 12.960844039916992, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 345.5, + "completions/mean_terminated_length": 269.81817626953125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.3552, + "grad_norm": 1.6731483936309814, + "kl": 0.0322265625, + "learning_rate": 1e-06, + "loss": -0.1116, + "num_tokens": 6792864.0, + "reward": -2.85986328125, + "reward_std": 5.278141975402832, + "rewards/rm_reward_func/mean": -2.85986328125, + "rewards/rm_reward_func/std": 13.660578727722168, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 277.25, + "completions/mean_terminated_length": 243.71429443359375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.356, + "grad_norm": 1.8227163553237915, + "kl": 0.0535888671875, + "learning_rate": 1e-06, + "loss": 0.1128, + "num_tokens": 6805304.0, + "reward": -4.6063385009765625, + "reward_std": 7.752448081970215, + "rewards/rm_reward_func/mean": -4.6063385009765625, + "rewards/rm_reward_func/std": 16.555973052978516, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 329.34375, + "completions/mean_terminated_length": 303.25, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3568, + "grad_norm": 2.7460734844207764, + "kl": 0.042724609375, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 6817835.0, + "reward": -8.437347412109375, + "reward_std": 6.30595588684082, + "rewards/rm_reward_func/mean": -8.437347412109375, + "rewards/rm_reward_func/std": 7.3206257820129395, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 441.96875, + "completions/mean_terminated_length": 410.1363830566406, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.3576, + "grad_norm": 1.6162619590759277, + "kl": 0.0192108154296875, + "learning_rate": 1e-06, + "loss": -0.063, + "num_tokens": 6835234.0, + "reward": -9.977783203125, + "reward_std": 3.615157127380371, + "rewards/rm_reward_func/mean": -9.977783203125, + "rewards/rm_reward_func/std": 6.0941057205200195, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 370.65625, + "completions/mean_terminated_length": 331.0799865722656, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.3584, + "grad_norm": 1.6218841075897217, + "kl": 0.0239105224609375, + "learning_rate": 1e-06, + "loss": 0.0943, + "num_tokens": 6851151.0, + "reward": -12.0640869140625, + "reward_std": 5.769428253173828, + "rewards/rm_reward_func/mean": -12.0640869140625, + "rewards/rm_reward_func/std": 7.268880367279053, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 425.71875, + "completions/mean_terminated_length": 380.5238037109375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.3592, + "grad_norm": 1.5768471956253052, + "kl": 0.01971435546875, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 6869942.0, + "reward": -11.62109375, + "reward_std": 3.5084919929504395, + "rewards/rm_reward_func/mean": -11.62109375, + "rewards/rm_reward_func/std": 7.041207790374756, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 377.34375, + "completions/mean_terminated_length": 296.5500183105469, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.36, + "grad_norm": 10.41253662109375, + "kl": 0.28369140625, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 6890017.0, + "reward": -2.74957275390625, + "reward_std": 6.988457679748535, + "rewards/rm_reward_func/mean": -2.74957275390625, + "rewards/rm_reward_func/std": 21.13848114013672, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 383.25, + "completions/mean_terminated_length": 369.9310302734375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.3608, + "grad_norm": 1.716001033782959, + "kl": 0.023468017578125, + "learning_rate": 1e-06, + "loss": -0.053, + "num_tokens": 6905073.0, + "reward": -4.01519775390625, + "reward_std": 4.076905727386475, + "rewards/rm_reward_func/mean": -4.01519775390625, + "rewards/rm_reward_func/std": 6.577391624450684, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 266.09375, + "completions/mean_terminated_length": 249.70001220703125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.3616, + "grad_norm": 2.0330448150634766, + "kl": 0.054962158203125, + "learning_rate": 1e-06, + "loss": -0.0542, + "num_tokens": 6919164.0, + "reward": 6.8297119140625, + "reward_std": 5.125823497772217, + "rewards/rm_reward_func/mean": 6.8297119140625, + "rewards/rm_reward_func/std": 13.47778606414795, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 392.71875, + "completions/mean_terminated_length": 311.1052551269531, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.3624, + "grad_norm": 1.8544645309448242, + "kl": 0.02447509765625, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 6934627.0, + "reward": -2.57177734375, + "reward_std": 4.244129180908203, + "rewards/rm_reward_func/mean": -2.57177734375, + "rewards/rm_reward_func/std": 8.67985725402832, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 282.0, + "completions/mean_terminated_length": 249.1428680419922, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.3632, + "grad_norm": 2.5911881923675537, + "kl": 0.029937744140625, + "learning_rate": 1e-06, + "loss": 0.0436, + "num_tokens": 6946707.0, + "reward": -4.53515625, + "reward_std": 6.382939338684082, + "rewards/rm_reward_func/mean": -4.53515625, + "rewards/rm_reward_func/std": 6.9819655418396, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 284.9375, + "completions/mean_terminated_length": 209.25, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.364, + "grad_norm": 2.1765058040618896, + "kl": 0.083892822265625, + "learning_rate": 1e-06, + "loss": -0.042, + "num_tokens": 6963633.0, + "reward": -4.5068359375, + "reward_std": 6.619154453277588, + "rewards/rm_reward_func/mean": -4.5068359375, + "rewards/rm_reward_func/std": 15.061408996582031, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 345.65625, + "completions/mean_terminated_length": 270.04547119140625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.3648, + "grad_norm": 3.204291343688965, + "kl": 0.060577392578125, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 6977350.0, + "reward": -7.983001708984375, + "reward_std": 4.367372989654541, + "rewards/rm_reward_func/mean": -7.983001708984375, + "rewards/rm_reward_func/std": 13.260966300964355, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 242.59375, + "completions/mean_terminated_length": 233.90321350097656, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.3656, + "grad_norm": 2.6048195362091064, + "kl": 0.0396728515625, + "learning_rate": 1e-06, + "loss": -0.0209, + "num_tokens": 6988545.0, + "reward": -1.818115234375, + "reward_std": 4.7883830070495605, + "rewards/rm_reward_func/mean": -1.818115234375, + "rewards/rm_reward_func/std": 16.03240203857422, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 361.90625, + "completions/mean_terminated_length": 283.28570556640625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.3664, + "grad_norm": 1.9588838815689087, + "kl": 0.057769775390625, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 7005350.0, + "reward": -2.230224609375, + "reward_std": 4.8073410987854, + "rewards/rm_reward_func/mean": -2.230224609375, + "rewards/rm_reward_func/std": 11.667881965637207, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 325.5, + "completions/mean_terminated_length": 306.2069091796875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.3672, + "grad_norm": 1.517564296722412, + "kl": 0.0651702880859375, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 7025382.0, + "reward": 10.4552001953125, + "reward_std": 8.374692916870117, + "rewards/rm_reward_func/mean": 10.4552001953125, + "rewards/rm_reward_func/std": 10.493587493896484, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 373.71875, + "completions/mean_terminated_length": 310.8636474609375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.368, + "grad_norm": 1.6146050691604614, + "kl": 0.04046630859375, + "learning_rate": 1e-06, + "loss": -0.0404, + "num_tokens": 7044197.0, + "reward": -5.0064697265625, + "reward_std": 3.5761313438415527, + "rewards/rm_reward_func/mean": -5.0064697265625, + "rewards/rm_reward_func/std": 18.095197677612305, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 386.15625, + "completions/mean_terminated_length": 310.6499938964844, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.3688, + "grad_norm": 1.713719129562378, + "kl": 0.0227508544921875, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 7059930.0, + "reward": -10.64794921875, + "reward_std": 4.460062026977539, + "rewards/rm_reward_func/mean": -10.64794921875, + "rewards/rm_reward_func/std": 5.889519214630127, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 347.4375, + "completions/mean_terminated_length": 316.96295166015625, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.3696, + "grad_norm": 1.8239960670471191, + "kl": 0.033050537109375, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 7073672.0, + "reward": -8.138671875, + "reward_std": 3.7073869705200195, + "rewards/rm_reward_func/mean": -8.138671875, + "rewards/rm_reward_func/std": 13.883122444152832, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 378.4375, + "completions/mean_terminated_length": 244.875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.3704, + "grad_norm": 1.9283287525177002, + "kl": 0.072601318359375, + "learning_rate": 1e-06, + "loss": 0.1211, + "num_tokens": 7093094.0, + "reward": -8.3095703125, + "reward_std": 7.634549140930176, + "rewards/rm_reward_func/mean": -8.3095703125, + "rewards/rm_reward_func/std": 21.24001121520996, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 355.71875, + "completions/mean_terminated_length": 284.68182373046875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.3712, + "grad_norm": 1.7455298900604248, + "kl": 0.054290771484375, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 7111045.0, + "reward": -5.1982421875, + "reward_std": 5.924152374267578, + "rewards/rm_reward_func/mean": -5.1982421875, + "rewards/rm_reward_func/std": 21.179086685180664, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 272.34375, + "completions/mean_terminated_length": 227.9629669189453, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.372, + "grad_norm": 2.281312942504883, + "kl": 0.0316314697265625, + "learning_rate": 1e-06, + "loss": -0.0227, + "num_tokens": 7123008.0, + "reward": -10.941818237304688, + "reward_std": 5.427191734313965, + "rewards/rm_reward_func/mean": -10.941818237304688, + "rewards/rm_reward_func/std": 8.099947929382324, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 254.1875, + "completions/mean_terminated_length": 237.00001525878906, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.3728, + "grad_norm": 2.468573570251465, + "kl": 0.045013427734375, + "learning_rate": 1e-06, + "loss": 0.0286, + "num_tokens": 7136206.0, + "reward": -1.58892822265625, + "reward_std": 4.972779273986816, + "rewards/rm_reward_func/mean": -1.58892822265625, + "rewards/rm_reward_func/std": 15.116301536560059, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 258.1875, + "completions/mean_terminated_length": 250.0, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.3736, + "grad_norm": 1.8227242231369019, + "kl": 0.095947265625, + "learning_rate": 1e-06, + "loss": 0.0526, + "num_tokens": 7158108.0, + "reward": 10.17816162109375, + "reward_std": 5.324087619781494, + "rewards/rm_reward_func/mean": 10.17816162109375, + "rewards/rm_reward_func/std": 15.198423385620117, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 280.03125, + "completions/mean_terminated_length": 246.8928680419922, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.3744, + "grad_norm": 1.9935191869735718, + "kl": 0.05120849609375, + "learning_rate": 1e-06, + "loss": 0.0238, + "num_tokens": 7171533.0, + "reward": 0.7098388671875, + "reward_std": 6.869742393493652, + "rewards/rm_reward_func/mean": 0.7098388671875, + "rewards/rm_reward_func/std": 16.380746841430664, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 291.4375, + "completions/mean_terminated_length": 217.9166717529297, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3752, + "grad_norm": 2.5600199699401855, + "kl": 0.065093994140625, + "learning_rate": 1e-06, + "loss": -0.0479, + "num_tokens": 7184707.0, + "reward": -11.051223754882812, + "reward_std": 4.0888285636901855, + "rewards/rm_reward_func/mean": -11.051223754882812, + "rewards/rm_reward_func/std": 11.321426391601562, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 422.90625, + "completions/mean_terminated_length": 353.6111145019531, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.376, + "grad_norm": 1.5506393909454346, + "kl": 0.020843505859375, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 7201912.0, + "reward": -4.878662109375, + "reward_std": 8.765281677246094, + "rewards/rm_reward_func/mean": -4.878662109375, + "rewards/rm_reward_func/std": 10.220052719116211, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 303.5625, + "completions/mean_terminated_length": 296.8387145996094, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.3768, + "grad_norm": 2.094174385070801, + "kl": 0.0538330078125, + "learning_rate": 1e-06, + "loss": 0.1022, + "num_tokens": 7216394.0, + "reward": -7.294506072998047, + "reward_std": 5.134894371032715, + "rewards/rm_reward_func/mean": -7.294506072998047, + "rewards/rm_reward_func/std": 7.337850093841553, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 401.90625, + "completions/mean_terminated_length": 326.5789489746094, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.3776, + "grad_norm": 1.6700878143310547, + "kl": 0.0236358642578125, + "learning_rate": 1e-06, + "loss": -0.0708, + "num_tokens": 7233919.0, + "reward": -16.5185546875, + "reward_std": 3.444326877593994, + "rewards/rm_reward_func/mean": -16.5185546875, + "rewards/rm_reward_func/std": 7.095970630645752, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 364.96875, + "completions/mean_terminated_length": 315.9583435058594, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.3784, + "grad_norm": 1.8330386877059937, + "kl": 0.02752685546875, + "learning_rate": 1e-06, + "loss": -0.0254, + "num_tokens": 7249174.0, + "reward": -6.5049591064453125, + "reward_std": 3.9670650959014893, + "rewards/rm_reward_func/mean": -6.5049591064453125, + "rewards/rm_reward_func/std": 5.86607027053833, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 193.15625, + "completions/mean_terminated_length": 193.15625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.3792, + "grad_norm": 2.903164863586426, + "kl": 0.063934326171875, + "learning_rate": 1e-06, + "loss": -0.0509, + "num_tokens": 7258027.0, + "reward": -15.498779296875, + "reward_std": 4.604251861572266, + "rewards/rm_reward_func/mean": -15.498779296875, + "rewards/rm_reward_func/std": 8.81122875213623, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 355.78125, + "completions/mean_terminated_length": 294.6521911621094, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.38, + "grad_norm": 1.9176656007766724, + "kl": 0.03521728515625, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 7274964.0, + "reward": 3.0035400390625, + "reward_std": 7.521487236022949, + "rewards/rm_reward_func/mean": 3.0035400390625, + "rewards/rm_reward_func/std": 15.206151008605957, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 306.59375, + "completions/mean_terminated_length": 226.21739196777344, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.3808, + "grad_norm": 1.9174607992172241, + "kl": 0.0559234619140625, + "learning_rate": 1e-06, + "loss": -0.1395, + "num_tokens": 7290687.0, + "reward": -1.0838623046875, + "reward_std": 6.355273723602295, + "rewards/rm_reward_func/mean": -1.0838623046875, + "rewards/rm_reward_func/std": 12.367974281311035, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 338.8125, + "completions/mean_terminated_length": 248.09524536132812, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.3816, + "grad_norm": 1.6493277549743652, + "kl": 0.054351806640625, + "learning_rate": 1e-06, + "loss": 0.0972, + "num_tokens": 7307073.0, + "reward": 2.8267822265625, + "reward_std": 5.62716817855835, + "rewards/rm_reward_func/mean": 2.8267822265625, + "rewards/rm_reward_func/std": 14.384576797485352, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 295.84375, + "completions/mean_terminated_length": 288.8709716796875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.3824, + "grad_norm": 2.118684768676758, + "kl": 0.085845947265625, + "learning_rate": 1e-06, + "loss": -0.0097, + "num_tokens": 7324020.0, + "reward": 0.23003387451171875, + "reward_std": 5.897096633911133, + "rewards/rm_reward_func/mean": 0.23003387451171875, + "rewards/rm_reward_func/std": 10.607458114624023, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 203.0, + "completions/mean_terminated_length": 171.03448486328125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.3832, + "grad_norm": 3.0059311389923096, + "kl": 0.056060791015625, + "learning_rate": 1e-06, + "loss": 0.2264, + "num_tokens": 7334580.0, + "reward": -6.35906982421875, + "reward_std": 6.42362642288208, + "rewards/rm_reward_func/mean": -6.35906982421875, + "rewards/rm_reward_func/std": 9.905282974243164, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 262.4375, + "completions/mean_terminated_length": 262.4375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.384, + "grad_norm": 2.111828565597534, + "kl": 0.05828857421875, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 7351066.0, + "reward": 0.506378173828125, + "reward_std": 6.634819507598877, + "rewards/rm_reward_func/mean": 0.506378173828125, + "rewards/rm_reward_func/std": 8.402663230895996, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 379.8125, + "completions/mean_terminated_length": 349.3077087402344, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.3848, + "grad_norm": 1.7632617950439453, + "kl": 0.0442657470703125, + "learning_rate": 1e-06, + "loss": -0.0376, + "num_tokens": 7369084.0, + "reward": -3.5157470703125, + "reward_std": 6.1227874755859375, + "rewards/rm_reward_func/mean": -3.5157470703125, + "rewards/rm_reward_func/std": 9.15819263458252, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 310.03125, + "completions/mean_terminated_length": 263.423095703125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3856, + "grad_norm": 2.2979800701141357, + "kl": 0.0377197265625, + "learning_rate": 1e-06, + "loss": 0.035, + "num_tokens": 7386365.0, + "reward": -9.618408203125, + "reward_std": 7.2102742195129395, + "rewards/rm_reward_func/mean": -9.618408203125, + "rewards/rm_reward_func/std": 12.255056381225586, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 329.65625, + "completions/mean_terminated_length": 287.5769348144531, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.3864, + "grad_norm": 2.012070894241333, + "kl": 0.071197509765625, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 7407266.0, + "reward": 1.418060302734375, + "reward_std": 4.475214004516602, + "rewards/rm_reward_func/mean": 1.418060302734375, + "rewards/rm_reward_func/std": 9.48045539855957, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 340.1875, + "completions/mean_terminated_length": 315.64288330078125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.3872, + "grad_norm": 1.7281482219696045, + "kl": 0.0606689453125, + "learning_rate": 1e-06, + "loss": -0.0371, + "num_tokens": 7426488.0, + "reward": 3.58563232421875, + "reward_std": 5.264001846313477, + "rewards/rm_reward_func/mean": 3.58563232421875, + "rewards/rm_reward_func/std": 17.291501998901367, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 322.28125, + "completions/mean_terminated_length": 269.1600036621094, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.388, + "grad_norm": 2.1548638343811035, + "kl": 0.0372314453125, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 7441321.0, + "reward": -8.5115966796875, + "reward_std": 3.5897645950317383, + "rewards/rm_reward_func/mean": -8.5115966796875, + "rewards/rm_reward_func/std": 7.601719856262207, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 355.46875, + "completions/mean_terminated_length": 339.2758483886719, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.3888, + "grad_norm": 1.555582880973816, + "kl": 0.02935791015625, + "learning_rate": 1e-06, + "loss": 0.027, + "num_tokens": 7459536.0, + "reward": -5.0325927734375, + "reward_std": 7.168083190917969, + "rewards/rm_reward_func/mean": -5.0325927734375, + "rewards/rm_reward_func/std": 8.641008377075195, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 409.4375, + "completions/mean_terminated_length": 355.71429443359375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.3896, + "grad_norm": 1.7587456703186035, + "kl": 0.0215606689453125, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 7475478.0, + "reward": -6.15850830078125, + "reward_std": 3.4490017890930176, + "rewards/rm_reward_func/mean": -6.15850830078125, + "rewards/rm_reward_func/std": 10.474088668823242, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 379.03125, + "completions/mean_terminated_length": 309.3809509277344, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.3904, + "grad_norm": 1.8008023500442505, + "kl": 0.0450439453125, + "learning_rate": 1e-06, + "loss": 0.0306, + "num_tokens": 7491159.0, + "reward": -7.962890625, + "reward_std": 4.423135757446289, + "rewards/rm_reward_func/mean": -7.962890625, + "rewards/rm_reward_func/std": 8.374658584594727, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 382.96875, + "completions/mean_terminated_length": 294.6842041015625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.3912, + "grad_norm": 1.8386781215667725, + "kl": 0.04473876953125, + "learning_rate": 1e-06, + "loss": 0.0419, + "num_tokens": 7509158.0, + "reward": -2.9027099609375, + "reward_std": 4.522671699523926, + "rewards/rm_reward_func/mean": -2.9027099609375, + "rewards/rm_reward_func/std": 14.98139476776123, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 336.21875, + "completions/mean_terminated_length": 244.1428680419922, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.392, + "grad_norm": 2.0334396362304688, + "kl": 0.04815673828125, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 7526381.0, + "reward": -3.4616827964782715, + "reward_std": 4.1777262687683105, + "rewards/rm_reward_func/mean": -3.4616827964782715, + "rewards/rm_reward_func/std": 5.332122325897217, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 428.96875, + "completions/mean_terminated_length": 379.1499938964844, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.3928, + "grad_norm": 1.709693193435669, + "kl": 0.029937744140625, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 7544492.0, + "reward": -8.491455078125, + "reward_std": 4.969239234924316, + "rewards/rm_reward_func/mean": -8.491455078125, + "rewards/rm_reward_func/std": 8.360377311706543, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 418.375, + "completions/mean_terminated_length": 345.5555725097656, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.3936, + "grad_norm": 1.5140773057937622, + "kl": 0.024444580078125, + "learning_rate": 1e-06, + "loss": 0.0378, + "num_tokens": 7561496.0, + "reward": -6.728515625, + "reward_std": 5.0016865730285645, + "rewards/rm_reward_func/mean": -6.728515625, + "rewards/rm_reward_func/std": 9.732870101928711, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 378.625, + "completions/mean_terminated_length": 245.25, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.3944, + "grad_norm": 1.9988981485366821, + "kl": 0.0345306396484375, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 7577020.0, + "reward": -13.9371337890625, + "reward_std": 3.9132349491119385, + "rewards/rm_reward_func/mean": -13.9371337890625, + "rewards/rm_reward_func/std": 7.015035152435303, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 345.28125, + "completions/mean_terminated_length": 257.952392578125, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.3952, + "grad_norm": 2.311077356338501, + "kl": 0.030914306640625, + "learning_rate": 1e-06, + "loss": -0.1454, + "num_tokens": 7591453.0, + "reward": -3.0592727661132812, + "reward_std": 6.1845903396606445, + "rewards/rm_reward_func/mean": -3.0592727661132812, + "rewards/rm_reward_func/std": 13.736233711242676, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 403.25, + "completions/mean_terminated_length": 338.0, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.396, + "grad_norm": 1.5926947593688965, + "kl": 0.029205322265625, + "learning_rate": 1e-06, + "loss": -0.0936, + "num_tokens": 7608421.0, + "reward": -12.805755615234375, + "reward_std": 5.332446575164795, + "rewards/rm_reward_func/mean": -12.805755615234375, + "rewards/rm_reward_func/std": 7.457422733306885, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 312.75, + "completions/mean_terminated_length": 256.9599914550781, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.3968, + "grad_norm": 1.6806563138961792, + "kl": 0.055999755859375, + "learning_rate": 1e-06, + "loss": -0.0385, + "num_tokens": 7626349.0, + "reward": 3.3096923828125, + "reward_std": 5.034409523010254, + "rewards/rm_reward_func/mean": 3.3096923828125, + "rewards/rm_reward_func/std": 16.401277542114258, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 324.96875, + "completions/mean_terminated_length": 272.6000061035156, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.3976, + "grad_norm": 1.9360291957855225, + "kl": 0.0340576171875, + "learning_rate": 1e-06, + "loss": 0.0897, + "num_tokens": 7640652.0, + "reward": -5.7716064453125, + "reward_std": 5.161009788513184, + "rewards/rm_reward_func/mean": -5.7716064453125, + "rewards/rm_reward_func/std": 10.960410118103027, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 288.78125, + "completions/mean_terminated_length": 281.58062744140625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.3984, + "grad_norm": 2.0398542881011963, + "kl": 0.0787353515625, + "learning_rate": 1e-06, + "loss": -0.1467, + "num_tokens": 7655829.0, + "reward": -4.9204559326171875, + "reward_std": 8.535689353942871, + "rewards/rm_reward_func/mean": -4.9204559326171875, + "rewards/rm_reward_func/std": 9.429495811462402, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 283.96875, + "completions/mean_terminated_length": 276.6128845214844, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.3992, + "grad_norm": 1.8778729438781738, + "kl": 0.0782470703125, + "learning_rate": 1e-06, + "loss": -0.0247, + "num_tokens": 7675948.0, + "reward": 16.57366943359375, + "reward_std": 4.19862174987793, + "rewards/rm_reward_func/mean": 16.57366943359375, + "rewards/rm_reward_func/std": 10.593950271606445, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 388.1875, + "completions/mean_terminated_length": 346.91668701171875, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.4, + "grad_norm": 1.5485178232192993, + "kl": 0.036865234375, + "learning_rate": 1e-06, + "loss": -0.1606, + "num_tokens": 7693674.0, + "reward": -3.6925048828125, + "reward_std": 7.665305137634277, + "rewards/rm_reward_func/mean": -3.6925048828125, + "rewards/rm_reward_func/std": 7.908268451690674, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 346.75, + "completions/mean_terminated_length": 282.08697509765625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.4008, + "grad_norm": 1.9377690553665161, + "kl": 0.0292510986328125, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 7707658.0, + "reward": 2.3031005859375, + "reward_std": 3.9825987815856934, + "rewards/rm_reward_func/mean": 2.3031005859375, + "rewards/rm_reward_func/std": 14.883623123168945, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 439.09375, + "completions/mean_terminated_length": 356.4666748046875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.4016, + "grad_norm": 1.4154679775238037, + "kl": 0.0272216796875, + "learning_rate": 1e-06, + "loss": -0.0213, + "num_tokens": 7726933.0, + "reward": -3.1108627319335938, + "reward_std": 9.156184196472168, + "rewards/rm_reward_func/mean": -3.1108627319335938, + "rewards/rm_reward_func/std": 14.908904075622559, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 318.0625, + "completions/mean_terminated_length": 242.17391967773438, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.4024, + "grad_norm": 2.11954402923584, + "kl": 0.051177978515625, + "learning_rate": 1e-06, + "loss": -0.0356, + "num_tokens": 7742175.0, + "reward": -4.77392578125, + "reward_std": 4.379672050476074, + "rewards/rm_reward_func/mean": -4.77392578125, + "rewards/rm_reward_func/std": 14.566995620727539, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 429.125, + "completions/mean_terminated_length": 346.25, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.4032, + "grad_norm": 1.7557576894760132, + "kl": 0.02764892578125, + "learning_rate": 1e-06, + "loss": -0.0167, + "num_tokens": 7759403.0, + "reward": -10.5, + "reward_std": 5.095137596130371, + "rewards/rm_reward_func/mean": -10.5, + "rewards/rm_reward_func/std": 7.820406913757324, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 336.5625, + "completions/mean_terminated_length": 324.8666687011719, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.404, + "grad_norm": 1.819419503211975, + "kl": 0.0533447265625, + "learning_rate": 1e-06, + "loss": -0.0195, + "num_tokens": 7778077.0, + "reward": 1.3951187133789062, + "reward_std": 5.574717044830322, + "rewards/rm_reward_func/mean": 1.3951187133789062, + "rewards/rm_reward_func/std": 8.477134704589844, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 419.53125, + "completions/mean_terminated_length": 314.73333740234375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.4048, + "grad_norm": 1.8295176029205322, + "kl": 0.055328369140625, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 7796518.0, + "reward": -8.6416015625, + "reward_std": 4.800066947937012, + "rewards/rm_reward_func/mean": -8.6416015625, + "rewards/rm_reward_func/std": 12.13750171661377, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 266.15625, + "completions/mean_terminated_length": 197.3199920654297, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.4056, + "grad_norm": 2.0645644664764404, + "kl": 0.053436279296875, + "learning_rate": 1e-06, + "loss": -0.2064, + "num_tokens": 7808859.0, + "reward": 1.672637939453125, + "reward_std": 5.09727668762207, + "rewards/rm_reward_func/mean": 1.672637939453125, + "rewards/rm_reward_func/std": 8.604231834411621, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 383.46875, + "completions/mean_terminated_length": 295.52630615234375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4064, + "grad_norm": 1.6251455545425415, + "kl": 0.0598297119140625, + "learning_rate": 1e-06, + "loss": -0.0484, + "num_tokens": 7827210.0, + "reward": -10.4619140625, + "reward_std": 4.327136039733887, + "rewards/rm_reward_func/mean": -10.4619140625, + "rewards/rm_reward_func/std": 5.655404567718506, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 379.6875, + "completions/mean_terminated_length": 262.9411926269531, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.4072, + "grad_norm": 1.5721628665924072, + "kl": 0.03826141357421875, + "learning_rate": 1e-06, + "loss": 0.0571, + "num_tokens": 7844400.0, + "reward": -8.17578125, + "reward_std": 4.743236064910889, + "rewards/rm_reward_func/mean": -8.17578125, + "rewards/rm_reward_func/std": 13.666367530822754, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 276.0, + "completions/mean_terminated_length": 260.2666931152344, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.408, + "grad_norm": 1.8655906915664673, + "kl": 0.0728759765625, + "learning_rate": 1e-06, + "loss": -0.0193, + "num_tokens": 7863784.0, + "reward": 3.91015625, + "reward_std": 5.72806453704834, + "rewards/rm_reward_func/mean": 3.91015625, + "rewards/rm_reward_func/std": 15.470877647399902, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 347.03125, + "completions/mean_terminated_length": 300.8399963378906, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.4088, + "grad_norm": 1.6363317966461182, + "kl": 0.055267333984375, + "learning_rate": 1e-06, + "loss": -0.0523, + "num_tokens": 7881377.0, + "reward": 7.588897705078125, + "reward_std": 6.77081823348999, + "rewards/rm_reward_func/mean": 7.588897705078125, + "rewards/rm_reward_func/std": 12.342753410339355, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 133.5, + "completions/mean_terminated_length": 121.29032135009766, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4096, + "grad_norm": 3.7751121520996094, + "kl": 0.14886474609375, + "learning_rate": 1e-06, + "loss": 0.0778, + "num_tokens": 7890737.0, + "reward": -6.1551513671875, + "reward_std": 4.558640480041504, + "rewards/rm_reward_func/mean": -6.1551513671875, + "rewards/rm_reward_func/std": 14.303430557250977, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 300.65625, + "completions/mean_terminated_length": 278.7930908203125, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.4104, + "grad_norm": 2.204343795776367, + "kl": 0.0499267578125, + "learning_rate": 1e-06, + "loss": 0.0794, + "num_tokens": 7902766.0, + "reward": -4.17291259765625, + "reward_std": 4.973928928375244, + "rewards/rm_reward_func/mean": -4.17291259765625, + "rewards/rm_reward_func/std": 6.387589454650879, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 320.03125, + "completions/mean_terminated_length": 300.17242431640625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.4112, + "grad_norm": 1.8514363765716553, + "kl": 0.0416259765625, + "learning_rate": 1e-06, + "loss": 0.0282, + "num_tokens": 7915255.0, + "reward": -1.3759765625, + "reward_std": 7.35073709487915, + "rewards/rm_reward_func/mean": -1.3759765625, + "rewards/rm_reward_func/std": 12.549677848815918, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 285.0, + "completions/mean_terminated_length": 148.8000030517578, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.412, + "grad_norm": 2.9163084030151367, + "kl": 0.079803466796875, + "learning_rate": 1e-06, + "loss": -0.0515, + "num_tokens": 7926583.0, + "reward": -10.772430419921875, + "reward_std": 3.568636417388916, + "rewards/rm_reward_func/mean": -10.772430419921875, + "rewards/rm_reward_func/std": 7.702577114105225, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 369.78125, + "completions/mean_terminated_length": 227.5625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4128, + "grad_norm": 3.725050449371338, + "kl": 0.037567138671875, + "learning_rate": 1e-06, + "loss": 0.2386, + "num_tokens": 7940936.0, + "reward": -10.381890296936035, + "reward_std": 4.116662979125977, + "rewards/rm_reward_func/mean": -10.381890296936035, + "rewards/rm_reward_func/std": 9.508091926574707, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 225.6875, + "completions/mean_terminated_length": 225.6875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4136, + "grad_norm": 2.1360368728637695, + "kl": 0.06878662109375, + "learning_rate": 1e-06, + "loss": -0.0902, + "num_tokens": 7955878.0, + "reward": 3.21148681640625, + "reward_std": 7.656820297241211, + "rewards/rm_reward_func/mean": 3.21148681640625, + "rewards/rm_reward_func/std": 18.270893096923828, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 391.6875, + "completions/mean_terminated_length": 344.60870361328125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.4144, + "grad_norm": 1.5391716957092285, + "kl": 0.042266845703125, + "learning_rate": 1e-06, + "loss": 0.0714, + "num_tokens": 7975204.0, + "reward": 5.716121673583984, + "reward_std": 9.05794620513916, + "rewards/rm_reward_func/mean": 5.716121673583984, + "rewards/rm_reward_func/std": 18.638763427734375, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 420.78125, + "completions/mean_terminated_length": 390.375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.4152, + "grad_norm": 1.7760664224624634, + "kl": 0.048370361328125, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 7991629.0, + "reward": 2.9716796875, + "reward_std": 5.100446701049805, + "rewards/rm_reward_func/mean": 2.9716796875, + "rewards/rm_reward_func/std": 14.36515998840332, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 384.875, + "completions/mean_terminated_length": 327.0909118652344, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.416, + "grad_norm": 1.469740867614746, + "kl": 0.05181884765625, + "learning_rate": 1e-06, + "loss": -0.0191, + "num_tokens": 8011417.0, + "reward": -2.9090576171875, + "reward_std": 7.1865057945251465, + "rewards/rm_reward_func/mean": -2.9090576171875, + "rewards/rm_reward_func/std": 17.995819091796875, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 404.3125, + "completions/mean_terminated_length": 339.70001220703125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.4168, + "grad_norm": 1.4723907709121704, + "kl": 0.03729248046875, + "learning_rate": 1e-06, + "loss": -0.1503, + "num_tokens": 8029227.0, + "reward": -6.587646484375, + "reward_std": 5.71742582321167, + "rewards/rm_reward_func/mean": -6.587646484375, + "rewards/rm_reward_func/std": 9.572473526000977, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 230.40625, + "completions/mean_terminated_length": 221.32257080078125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.4176, + "grad_norm": 2.3524017333984375, + "kl": 0.096527099609375, + "learning_rate": 1e-06, + "loss": -0.0719, + "num_tokens": 8044368.0, + "reward": 0.323974609375, + "reward_std": 7.459438323974609, + "rewards/rm_reward_func/mean": 0.323974609375, + "rewards/rm_reward_func/std": 10.246789932250977, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 268.25, + "completions/mean_terminated_length": 252.00001525878906, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.4184, + "grad_norm": 2.635575771331787, + "kl": 0.064483642578125, + "learning_rate": 1e-06, + "loss": 0.0647, + "num_tokens": 8056712.0, + "reward": -4.01123046875, + "reward_std": 6.8884053230285645, + "rewards/rm_reward_func/mean": -4.01123046875, + "rewards/rm_reward_func/std": 11.376402854919434, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 295.53125, + "completions/mean_terminated_length": 223.375, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.4192, + "grad_norm": 2.853746175765991, + "kl": 0.066070556640625, + "learning_rate": 1e-06, + "loss": 0.0501, + "num_tokens": 8071161.0, + "reward": 5.61767578125, + "reward_std": 3.9746556282043457, + "rewards/rm_reward_func/mean": 5.61767578125, + "rewards/rm_reward_func/std": 21.108842849731445, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 466.53125, + "completions/mean_terminated_length": 421.0625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.42, + "grad_norm": 1.528450846672058, + "kl": 0.027069091796875, + "learning_rate": 1e-06, + "loss": 0.0311, + "num_tokens": 8092146.0, + "reward": -0.02880859375, + "reward_std": 6.017075538635254, + "rewards/rm_reward_func/mean": -0.02880859375, + "rewards/rm_reward_func/std": 13.508607864379883, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 433.65625, + "completions/mean_terminated_length": 284.0909118652344, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.4208, + "grad_norm": 1.4769070148468018, + "kl": 0.0782470703125, + "learning_rate": 1e-06, + "loss": -0.0715, + "num_tokens": 8112983.0, + "reward": -11.803955078125, + "reward_std": 3.314962387084961, + "rewards/rm_reward_func/mean": -11.803955078125, + "rewards/rm_reward_func/std": 5.535274982452393, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 281.90625, + "completions/mean_terminated_length": 228.8076934814453, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.4216, + "grad_norm": 1.731550693511963, + "kl": 0.064117431640625, + "learning_rate": 1e-06, + "loss": -0.0974, + "num_tokens": 8131852.0, + "reward": -1.089111328125, + "reward_std": 6.1585869789123535, + "rewards/rm_reward_func/mean": -1.089111328125, + "rewards/rm_reward_func/std": 16.522184371948242, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 281.5, + "completions/mean_terminated_length": 228.3076934814453, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.4224, + "grad_norm": 2.2216453552246094, + "kl": 0.08673095703125, + "learning_rate": 1e-06, + "loss": -0.0074, + "num_tokens": 8148868.0, + "reward": -2.177978515625, + "reward_std": 7.805302619934082, + "rewards/rm_reward_func/mean": -2.177978515625, + "rewards/rm_reward_func/std": 15.631145477294922, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 264.625, + "completions/mean_terminated_length": 248.1333465576172, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4232, + "grad_norm": 2.806413412094116, + "kl": 0.0775146484375, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 8163120.0, + "reward": -1.0673828125, + "reward_std": 5.224954128265381, + "rewards/rm_reward_func/mean": -1.0673828125, + "rewards/rm_reward_func/std": 14.245368003845215, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 348.625, + "completions/mean_terminated_length": 263.0476379394531, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.424, + "grad_norm": 2.294907808303833, + "kl": 0.03643798828125, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 8178596.0, + "reward": -13.69073486328125, + "reward_std": 2.9994964599609375, + "rewards/rm_reward_func/mean": -13.69073486328125, + "rewards/rm_reward_func/std": 9.472757339477539, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 287.0625, + "completions/mean_terminated_length": 245.40740966796875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.4248, + "grad_norm": 2.5811612606048584, + "kl": 0.0753173828125, + "learning_rate": 1e-06, + "loss": 0.1233, + "num_tokens": 8196118.0, + "reward": 13.948974609375, + "reward_std": 9.174501419067383, + "rewards/rm_reward_func/mean": 13.948974609375, + "rewards/rm_reward_func/std": 17.62830924987793, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 403.34375, + "completions/mean_terminated_length": 263.64288330078125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.4256, + "grad_norm": 2.1055965423583984, + "kl": 0.037872314453125, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 8214873.0, + "reward": -0.8842620849609375, + "reward_std": 6.876520156860352, + "rewards/rm_reward_func/mean": -0.8842620849609375, + "rewards/rm_reward_func/std": 7.569698810577393, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 244.875, + "completions/mean_terminated_length": 236.258056640625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.4264, + "grad_norm": 2.6911871433258057, + "kl": 0.064483642578125, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 8224909.0, + "reward": -9.286865234375, + "reward_std": 6.211329936981201, + "rewards/rm_reward_func/mean": -9.286865234375, + "rewards/rm_reward_func/std": 11.387242317199707, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 234.90625, + "completions/mean_terminated_length": 225.9677276611328, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.4272, + "grad_norm": 2.632063627243042, + "kl": 0.0777740478515625, + "learning_rate": 1e-06, + "loss": -0.0857, + "num_tokens": 8237522.0, + "reward": -3.9537768363952637, + "reward_std": 5.757923126220703, + "rewards/rm_reward_func/mean": -3.9537768363952637, + "rewards/rm_reward_func/std": 18.29018211364746, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 415.6875, + "completions/mean_terminated_length": 306.5333557128906, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.428, + "grad_norm": 1.6487957239151, + "kl": 0.02459716796875, + "learning_rate": 1e-06, + "loss": -0.071, + "num_tokens": 8255504.0, + "reward": -8.12353515625, + "reward_std": 3.6630172729492188, + "rewards/rm_reward_func/mean": -8.12353515625, + "rewards/rm_reward_func/std": 8.109033584594727, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 347.71875, + "completions/mean_terminated_length": 219.94444274902344, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.4288, + "grad_norm": 2.449939250946045, + "kl": 0.0662841796875, + "learning_rate": 1e-06, + "loss": -0.0702, + "num_tokens": 8275407.0, + "reward": -3.1197509765625, + "reward_std": 4.398626327514648, + "rewards/rm_reward_func/mean": -3.1197509765625, + "rewards/rm_reward_func/std": 13.390921592712402, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 372.59375, + "completions/mean_terminated_length": 326.125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.4296, + "grad_norm": 2.149513006210327, + "kl": 0.071044921875, + "learning_rate": 1e-06, + "loss": 0.0131, + "num_tokens": 8296362.0, + "reward": 2.45361328125, + "reward_std": 3.5364184379577637, + "rewards/rm_reward_func/mean": 2.45361328125, + "rewards/rm_reward_func/std": 16.90559959411621, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 341.8125, + "completions/mean_terminated_length": 264.4545593261719, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4304, + "grad_norm": 1.7294725179672241, + "kl": 0.038726806640625, + "learning_rate": 1e-06, + "loss": 0.1683, + "num_tokens": 8313724.0, + "reward": -0.88623046875, + "reward_std": 5.482547283172607, + "rewards/rm_reward_func/mean": -0.88623046875, + "rewards/rm_reward_func/std": 13.207742691040039, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 261.34375, + "completions/mean_terminated_length": 253.258056640625, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.4312, + "grad_norm": 1.7928751707077026, + "kl": 0.04119873046875, + "learning_rate": 1e-06, + "loss": -0.1246, + "num_tokens": 8328511.0, + "reward": -1.8203125, + "reward_std": 5.642034530639648, + "rewards/rm_reward_func/mean": -1.8203125, + "rewards/rm_reward_func/std": 15.959497451782227, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 395.875, + "completions/mean_terminated_length": 350.4347839355469, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.432, + "grad_norm": 1.4482436180114746, + "kl": 0.02685546875, + "learning_rate": 1e-06, + "loss": -0.1199, + "num_tokens": 8342603.0, + "reward": 2.650146484375, + "reward_std": 8.024805068969727, + "rewards/rm_reward_func/mean": 2.650146484375, + "rewards/rm_reward_func/std": 11.297515869140625, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 330.0625, + "completions/mean_terminated_length": 234.76190185546875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.4328, + "grad_norm": 2.329352617263794, + "kl": 0.062255859375, + "learning_rate": 1e-06, + "loss": 0.0891, + "num_tokens": 8359613.0, + "reward": -7.28204345703125, + "reward_std": 4.7732014656066895, + "rewards/rm_reward_func/mean": -7.28204345703125, + "rewards/rm_reward_func/std": 6.598299980163574, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 359.28125, + "completions/mean_terminated_length": 308.375, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 0.4336, + "grad_norm": 1.9231770038604736, + "kl": 0.041412353515625, + "learning_rate": 1e-06, + "loss": -0.1052, + "num_tokens": 8374094.0, + "reward": -2.8352508544921875, + "reward_std": 5.2625274658203125, + "rewards/rm_reward_func/mean": -2.8352508544921875, + "rewards/rm_reward_func/std": 15.089098930358887, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 471.4375, + "completions/mean_terminated_length": 425.4666748046875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.4344, + "grad_norm": 1.6681606769561768, + "kl": 0.0244140625, + "learning_rate": 1e-06, + "loss": 0.0189, + "num_tokens": 8391868.0, + "reward": -3.3772406578063965, + "reward_std": 6.06258487701416, + "rewards/rm_reward_func/mean": -3.3772406578063965, + "rewards/rm_reward_func/std": 10.279738426208496, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 431.78125, + "completions/mean_terminated_length": 361.0, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.4352, + "grad_norm": 1.589442253112793, + "kl": 0.0265045166015625, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 8409661.0, + "reward": -7.08984375, + "reward_std": 6.788657188415527, + "rewards/rm_reward_func/mean": -7.08984375, + "rewards/rm_reward_func/std": 15.756629943847656, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 401.15625, + "completions/mean_terminated_length": 350.7727355957031, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.436, + "grad_norm": 1.7132800817489624, + "kl": 0.03582763671875, + "learning_rate": 1e-06, + "loss": -0.0222, + "num_tokens": 8425482.0, + "reward": -2.134765625, + "reward_std": 2.5514719486236572, + "rewards/rm_reward_func/mean": -2.134765625, + "rewards/rm_reward_func/std": 20.19614028930664, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 300.5625, + "completions/mean_terminated_length": 261.40740966796875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.4368, + "grad_norm": 2.3041093349456787, + "kl": 0.05938720703125, + "learning_rate": 1e-06, + "loss": -0.0074, + "num_tokens": 8437644.0, + "reward": -6.444091796875, + "reward_std": 5.620718479156494, + "rewards/rm_reward_func/mean": -6.444091796875, + "rewards/rm_reward_func/std": 6.755178451538086, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 367.40625, + "completions/mean_terminated_length": 301.68182373046875, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.4376, + "grad_norm": 1.7555508613586426, + "kl": 0.029144287109375, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 8451041.0, + "reward": -9.650146484375, + "reward_std": 4.595182418823242, + "rewards/rm_reward_func/mean": -9.650146484375, + "rewards/rm_reward_func/std": 8.019635200500488, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 381.84375, + "completions/mean_terminated_length": 292.78948974609375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.4384, + "grad_norm": 2.0770134925842285, + "kl": 0.0421142578125, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 8468172.0, + "reward": -3.4725341796875, + "reward_std": 6.244883060455322, + "rewards/rm_reward_func/mean": -3.4725341796875, + "rewards/rm_reward_func/std": 13.048554420471191, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 439.53125, + "completions/mean_terminated_length": 318.75, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.4392, + "grad_norm": 1.448085904121399, + "kl": 0.03973388671875, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 8491357.0, + "reward": -1.749267578125, + "reward_std": 3.098092555999756, + "rewards/rm_reward_func/mean": -1.749267578125, + "rewards/rm_reward_func/std": 15.183819770812988, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 445.5, + "completions/mean_terminated_length": 299.20001220703125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.44, + "grad_norm": 1.66658616065979, + "kl": 0.04742431640625, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 8511557.0, + "reward": -7.1875, + "reward_std": 3.023125410079956, + "rewards/rm_reward_func/mean": -7.1875, + "rewards/rm_reward_func/std": 16.064231872558594, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 338.875, + "completions/mean_terminated_length": 320.96551513671875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.4408, + "grad_norm": 1.7666022777557373, + "kl": 0.035064697265625, + "learning_rate": 1e-06, + "loss": -0.0426, + "num_tokens": 8525929.0, + "reward": -1.1615333557128906, + "reward_std": 5.899777889251709, + "rewards/rm_reward_func/mean": -1.1615333557128906, + "rewards/rm_reward_func/std": 19.450897216796875, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 462.90625, + "completions/mean_terminated_length": 413.8125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.4416, + "grad_norm": 1.5199429988861084, + "kl": 0.031036376953125, + "learning_rate": 1e-06, + "loss": 0.0631, + "num_tokens": 8546550.0, + "reward": -17.064453125, + "reward_std": 5.069859981536865, + "rewards/rm_reward_func/mean": -17.064453125, + "rewards/rm_reward_func/std": 9.07813549041748, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 366.34375, + "completions/mean_terminated_length": 278.95001220703125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.4424, + "grad_norm": 1.8507323265075684, + "kl": 0.059173583984375, + "learning_rate": 1e-06, + "loss": -0.0346, + "num_tokens": 8565201.0, + "reward": -3.9059677124023438, + "reward_std": 5.102289199829102, + "rewards/rm_reward_func/mean": -3.9059677124023438, + "rewards/rm_reward_func/std": 10.643386840820312, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 380.5, + "completions/mean_terminated_length": 343.67999267578125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.4432, + "grad_norm": 2.084139347076416, + "kl": 0.041412353515625, + "learning_rate": 1e-06, + "loss": -0.0717, + "num_tokens": 8580489.0, + "reward": -8.0257568359375, + "reward_std": 6.233272552490234, + "rewards/rm_reward_func/mean": -8.0257568359375, + "rewards/rm_reward_func/std": 10.00146198272705, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 379.71875, + "completions/mean_terminated_length": 319.5909118652344, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.444, + "grad_norm": 2.340545177459717, + "kl": 0.05145263671875, + "learning_rate": 1e-06, + "loss": -0.0599, + "num_tokens": 8595288.0, + "reward": -8.4732666015625, + "reward_std": 4.474703311920166, + "rewards/rm_reward_func/mean": -8.4732666015625, + "rewards/rm_reward_func/std": 10.32524585723877, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 189.5, + "completions/mean_terminated_length": 168.00001525878906, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.4448, + "grad_norm": 4.612342357635498, + "kl": 0.05645751953125, + "learning_rate": 1e-06, + "loss": 0.3368, + "num_tokens": 8602728.0, + "reward": -5.0865478515625, + "reward_std": 5.480595588684082, + "rewards/rm_reward_func/mean": -5.0865478515625, + "rewards/rm_reward_func/std": 6.441463470458984, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 429.40625, + "completions/mean_terminated_length": 323.21429443359375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.4456, + "grad_norm": 1.7789241075515747, + "kl": 0.043365478515625, + "learning_rate": 1e-06, + "loss": -0.0202, + "num_tokens": 8621525.0, + "reward": -4.1884765625, + "reward_std": 5.582745552062988, + "rewards/rm_reward_func/mean": -4.1884765625, + "rewards/rm_reward_func/std": 15.907622337341309, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 269.28125, + "completions/mean_terminated_length": 234.60714721679688, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.4464, + "grad_norm": 2.5340311527252197, + "kl": 0.05902099609375, + "learning_rate": 1e-06, + "loss": -0.0175, + "num_tokens": 8639166.0, + "reward": -6.6932373046875, + "reward_std": 6.566526412963867, + "rewards/rm_reward_func/mean": -6.6932373046875, + "rewards/rm_reward_func/std": 8.136711120605469, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 308.34375, + "completions/mean_terminated_length": 240.45834350585938, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.4472, + "grad_norm": 2.0877692699432373, + "kl": 0.042083740234375, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 8653985.0, + "reward": -5.2181396484375, + "reward_std": 6.234033584594727, + "rewards/rm_reward_func/mean": -5.2181396484375, + "rewards/rm_reward_func/std": 9.214916229248047, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 428.71875, + "completions/mean_terminated_length": 390.8636474609375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.448, + "grad_norm": 1.898612141609192, + "kl": 0.02606201171875, + "learning_rate": 1e-06, + "loss": -0.0392, + "num_tokens": 8672664.0, + "reward": -10.3389892578125, + "reward_std": 3.8391504287719727, + "rewards/rm_reward_func/mean": -10.3389892578125, + "rewards/rm_reward_func/std": 14.046701431274414, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 339.40625, + "completions/mean_terminated_length": 307.4444580078125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.4488, + "grad_norm": 1.6997865438461304, + "kl": 0.042816162109375, + "learning_rate": 1e-06, + "loss": -0.0297, + "num_tokens": 8689197.0, + "reward": 4.0514068603515625, + "reward_std": 6.294465065002441, + "rewards/rm_reward_func/mean": 4.0514068603515625, + "rewards/rm_reward_func/std": 11.245402336120605, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 435.28125, + "completions/mean_terminated_length": 323.15386962890625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.4496, + "grad_norm": 1.861504316329956, + "kl": 0.0372314453125, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 8709022.0, + "reward": -16.774169921875, + "reward_std": 3.5261833667755127, + "rewards/rm_reward_func/mean": -16.774169921875, + "rewards/rm_reward_func/std": 7.6449666023254395, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 396.1875, + "completions/mean_terminated_length": 350.86956787109375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.4504, + "grad_norm": 1.7116611003875732, + "kl": 0.049530029296875, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 8730628.0, + "reward": -4.0126953125, + "reward_std": 6.307745456695557, + "rewards/rm_reward_func/mean": -4.0126953125, + "rewards/rm_reward_func/std": 20.956491470336914, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 270.65625, + "completions/mean_terminated_length": 214.9615478515625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.4512, + "grad_norm": 4.119467735290527, + "kl": 0.066680908203125, + "learning_rate": 1e-06, + "loss": 0.0712, + "num_tokens": 8742201.0, + "reward": -1.2891464233398438, + "reward_std": 6.0930609703063965, + "rewards/rm_reward_func/mean": -1.2891464233398438, + "rewards/rm_reward_func/std": 22.26377296447754, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 379.03125, + "completions/mean_terminated_length": 360.0357360839844, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.452, + "grad_norm": 1.8070342540740967, + "kl": 0.057464599609375, + "learning_rate": 1e-06, + "loss": -0.0241, + "num_tokens": 8761482.0, + "reward": 4.4117431640625, + "reward_std": 5.54026460647583, + "rewards/rm_reward_func/mean": 4.4117431640625, + "rewards/rm_reward_func/std": 16.684171676635742, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 455.03125, + "completions/mean_terminated_length": 410.72222900390625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.4528, + "grad_norm": 1.7584404945373535, + "kl": 0.034942626953125, + "learning_rate": 1e-06, + "loss": 0.0507, + "num_tokens": 8780291.0, + "reward": -8.536376953125, + "reward_std": 3.1325812339782715, + "rewards/rm_reward_func/mean": -8.536376953125, + "rewards/rm_reward_func/std": 9.17507553100586, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 255.125, + "completions/mean_terminated_length": 195.84616088867188, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4536, + "grad_norm": 3.3954875469207764, + "kl": 0.073760986328125, + "learning_rate": 1e-06, + "loss": -0.1078, + "num_tokens": 8792863.0, + "reward": -7.20025634765625, + "reward_std": 3.70098876953125, + "rewards/rm_reward_func/mean": -7.20025634765625, + "rewards/rm_reward_func/std": 8.027015686035156, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 404.4375, + "completions/mean_terminated_length": 374.3199768066406, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.4544, + "grad_norm": 1.650101661682129, + "kl": 0.029541015625, + "learning_rate": 1e-06, + "loss": -0.099, + "num_tokens": 8808037.0, + "reward": -6.333393096923828, + "reward_std": 7.396759986877441, + "rewards/rm_reward_func/mean": -6.333393096923828, + "rewards/rm_reward_func/std": 8.53515625, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 324.0, + "completions/mean_terminated_length": 280.6153869628906, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.4552, + "grad_norm": 1.9937595129013062, + "kl": 0.05224609375, + "learning_rate": 1e-06, + "loss": 0.081, + "num_tokens": 8823405.0, + "reward": 1.67779541015625, + "reward_std": 4.868046760559082, + "rewards/rm_reward_func/mean": 1.67779541015625, + "rewards/rm_reward_func/std": 14.292357444763184, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 442.78125, + "completions/mean_terminated_length": 364.3333435058594, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.456, + "grad_norm": 1.6618449687957764, + "kl": 0.045318603515625, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 8843094.0, + "reward": 0.7568359375, + "reward_std": 6.247023582458496, + "rewards/rm_reward_func/mean": 0.7568359375, + "rewards/rm_reward_func/std": 12.245682716369629, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 393.0625, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.4568, + "grad_norm": 1.9558320045471191, + "kl": 0.050079345703125, + "learning_rate": 1e-06, + "loss": 0.0449, + "num_tokens": 8861872.0, + "reward": -1.83599853515625, + "reward_std": 2.8411426544189453, + "rewards/rm_reward_func/mean": -1.83599853515625, + "rewards/rm_reward_func/std": 13.652066230773926, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 418.1875, + "completions/mean_terminated_length": 396.5384826660156, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.4576, + "grad_norm": 1.5574551820755005, + "kl": 0.0247802734375, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 8878374.0, + "reward": -5.01025390625, + "reward_std": 7.790569305419922, + "rewards/rm_reward_func/mean": -5.01025390625, + "rewards/rm_reward_func/std": 9.710042953491211, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 370.46875, + "completions/mean_terminated_length": 323.29168701171875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.4584, + "grad_norm": 1.6649190187454224, + "kl": 0.04254150390625, + "learning_rate": 1e-06, + "loss": 0.0363, + "num_tokens": 8894477.0, + "reward": 1.81884765625, + "reward_std": 6.395046234130859, + "rewards/rm_reward_func/mean": 1.81884765625, + "rewards/rm_reward_func/std": 13.108199119567871, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 478.40625, + "completions/mean_terminated_length": 404.5, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.4592, + "grad_norm": 1.4605319499969482, + "kl": 0.029205322265625, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 8917178.0, + "reward": -5.9092864990234375, + "reward_std": 4.498154640197754, + "rewards/rm_reward_func/mean": -5.9092864990234375, + "rewards/rm_reward_func/std": 9.532511711120605, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 374.90625, + "completions/mean_terminated_length": 329.2083435058594, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.46, + "grad_norm": 1.7478305101394653, + "kl": 0.04864501953125, + "learning_rate": 1e-06, + "loss": -0.0418, + "num_tokens": 8935271.0, + "reward": -3.3739013671875, + "reward_std": 3.241399049758911, + "rewards/rm_reward_func/mean": -3.3739013671875, + "rewards/rm_reward_func/std": 13.383931159973145, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 277.34375, + "completions/mean_terminated_length": 269.7742004394531, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.4608, + "grad_norm": 2.048452854156494, + "kl": 0.0543212890625, + "learning_rate": 1e-06, + "loss": 0.0104, + "num_tokens": 8949642.0, + "reward": -6.408203125, + "reward_std": 3.612302780151367, + "rewards/rm_reward_func/mean": -6.408203125, + "rewards/rm_reward_func/std": 18.179784774780273, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 426.46875, + "completions/mean_terminated_length": 381.66668701171875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.4616, + "grad_norm": 1.6895877122879028, + "kl": 0.038360595703125, + "learning_rate": 1e-06, + "loss": 0.0467, + "num_tokens": 8966473.0, + "reward": -5.572265625, + "reward_std": 6.944555759429932, + "rewards/rm_reward_func/mean": -5.572265625, + "rewards/rm_reward_func/std": 13.506739616394043, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 288.25, + "completions/mean_terminated_length": 225.59999084472656, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.4624, + "grad_norm": 2.7493460178375244, + "kl": 0.079559326171875, + "learning_rate": 1e-06, + "loss": -0.067, + "num_tokens": 8984105.0, + "reward": 0.946533203125, + "reward_std": 4.869527339935303, + "rewards/rm_reward_func/mean": 0.946533203125, + "rewards/rm_reward_func/std": 10.845769882202148, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 413.875, + "completions/mean_terminated_length": 362.4761962890625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.4632, + "grad_norm": 1.462739109992981, + "kl": 0.034515380859375, + "learning_rate": 1e-06, + "loss": 0.0178, + "num_tokens": 9002733.0, + "reward": 0.7504730224609375, + "reward_std": 5.5196428298950195, + "rewards/rm_reward_func/mean": 0.7504730224609375, + "rewards/rm_reward_func/std": 14.839042663574219, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 471.78125, + "completions/mean_terminated_length": 395.0, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.464, + "grad_norm": 1.2745565176010132, + "kl": 0.0222320556640625, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 9024966.0, + "reward": -4.34912109375, + "reward_std": 6.752235412597656, + "rewards/rm_reward_func/mean": -4.34912109375, + "rewards/rm_reward_func/std": 10.378704071044922, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 486.21875, + "completions/mean_terminated_length": 437.0, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.4648, + "grad_norm": 1.5439950227737427, + "kl": 0.0284423828125, + "learning_rate": 1e-06, + "loss": -0.0142, + "num_tokens": 9044445.0, + "reward": -14.03125, + "reward_std": 3.323228359222412, + "rewards/rm_reward_func/mean": -14.03125, + "rewards/rm_reward_func/std": 7.576478958129883, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 379.90625, + "completions/mean_terminated_length": 328.2174072265625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.4656, + "grad_norm": 1.81442129611969, + "kl": 0.0552978515625, + "learning_rate": 1e-06, + "loss": -0.0186, + "num_tokens": 9061706.0, + "reward": 5.249755859375, + "reward_std": 6.854490756988525, + "rewards/rm_reward_func/mean": 5.249755859375, + "rewards/rm_reward_func/std": 11.908827781677246, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 283.625, + "completions/mean_terminated_length": 207.5, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.4664, + "grad_norm": 2.5863113403320312, + "kl": 0.088226318359375, + "learning_rate": 1e-06, + "loss": 0.025, + "num_tokens": 9078206.0, + "reward": -3.6611328125, + "reward_std": 4.318477153778076, + "rewards/rm_reward_func/mean": -3.6611328125, + "rewards/rm_reward_func/std": 9.88959789276123, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 253.625, + "completions/mean_terminated_length": 236.40000915527344, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.4672, + "grad_norm": 2.1111981868743896, + "kl": 0.0927734375, + "learning_rate": 1e-06, + "loss": 0.0382, + "num_tokens": 9099354.0, + "reward": 2.12542724609375, + "reward_std": 8.13827896118164, + "rewards/rm_reward_func/mean": 2.12542724609375, + "rewards/rm_reward_func/std": 15.183740615844727, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 365.875, + "completions/mean_terminated_length": 332.15386962890625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.468, + "grad_norm": 1.5588277578353882, + "kl": 0.031951904296875, + "learning_rate": 1e-06, + "loss": -0.0702, + "num_tokens": 9115110.0, + "reward": -1.10736083984375, + "reward_std": 7.7266340255737305, + "rewards/rm_reward_func/mean": -1.10736083984375, + "rewards/rm_reward_func/std": 9.949874877929688, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 291.78125, + "completions/mean_terminated_length": 230.1199951171875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.4688, + "grad_norm": 4.776576519012451, + "kl": 0.11102294921875, + "learning_rate": 1e-06, + "loss": -0.0665, + "num_tokens": 9126783.0, + "reward": -0.97686767578125, + "reward_std": 6.303781986236572, + "rewards/rm_reward_func/mean": -0.97686767578125, + "rewards/rm_reward_func/std": 7.537662029266357, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 271.6875, + "completions/mean_terminated_length": 255.66668701171875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.4696, + "grad_norm": 2.1785507202148438, + "kl": 0.0574951171875, + "learning_rate": 1e-06, + "loss": -0.0352, + "num_tokens": 9140509.0, + "reward": -12.054443359375, + "reward_std": 5.0501251220703125, + "rewards/rm_reward_func/mean": -12.054443359375, + "rewards/rm_reward_func/std": 6.112198829650879, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 317.25, + "completions/mean_terminated_length": 272.3077087402344, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.4704, + "grad_norm": 1.684846043586731, + "kl": 0.05352783203125, + "learning_rate": 1e-06, + "loss": -0.0666, + "num_tokens": 9159325.0, + "reward": 1.56640625, + "reward_std": 4.619665145874023, + "rewards/rm_reward_func/mean": 1.56640625, + "rewards/rm_reward_func/std": 20.391523361206055, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 292.59375, + "completions/mean_terminated_length": 285.51611328125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.4712, + "grad_norm": 2.1246981620788574, + "kl": 0.06927490234375, + "learning_rate": 1e-06, + "loss": 0.0792, + "num_tokens": 9174160.0, + "reward": -7.285400390625, + "reward_std": 6.483778953552246, + "rewards/rm_reward_func/mean": -7.285400390625, + "rewards/rm_reward_func/std": 10.082353591918945, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 238.59375, + "completions/mean_terminated_length": 238.59375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.472, + "grad_norm": 2.2388391494750977, + "kl": 0.07855224609375, + "learning_rate": 1e-06, + "loss": -0.0159, + "num_tokens": 9188667.0, + "reward": 0.26898193359375, + "reward_std": 5.77647590637207, + "rewards/rm_reward_func/mean": 0.26898193359375, + "rewards/rm_reward_func/std": 7.5786824226379395, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 379.78125, + "completions/mean_terminated_length": 328.0434875488281, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.4728, + "grad_norm": 1.898703932762146, + "kl": 0.03497314453125, + "learning_rate": 1e-06, + "loss": -0.0413, + "num_tokens": 9205644.0, + "reward": -6.84521484375, + "reward_std": 5.259591102600098, + "rewards/rm_reward_func/mean": -6.84521484375, + "rewards/rm_reward_func/std": 12.33168888092041, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 262.625, + "completions/mean_terminated_length": 254.5806427001953, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.4736, + "grad_norm": 1.9234881401062012, + "kl": 0.0618896484375, + "learning_rate": 1e-06, + "loss": 0.0315, + "num_tokens": 9219504.0, + "reward": -7.0870361328125, + "reward_std": 6.356988430023193, + "rewards/rm_reward_func/mean": -7.0870361328125, + "rewards/rm_reward_func/std": 17.12438201904297, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 231.375, + "completions/mean_terminated_length": 231.375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.4744, + "grad_norm": 2.4398467540740967, + "kl": 0.0859375, + "learning_rate": 1e-06, + "loss": -0.0387, + "num_tokens": 9234444.0, + "reward": 12.42877197265625, + "reward_std": 4.771339416503906, + "rewards/rm_reward_func/mean": 12.42877197265625, + "rewards/rm_reward_func/std": 13.801860809326172, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 393.90625, + "completions/mean_terminated_length": 313.1052551269531, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.4752, + "grad_norm": 1.66500985622406, + "kl": 0.044342041015625, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 9255217.0, + "reward": -8.709545135498047, + "reward_std": 4.2999091148376465, + "rewards/rm_reward_func/mean": -8.709545135498047, + "rewards/rm_reward_func/std": 9.781368255615234, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 384.46875, + "completions/mean_terminated_length": 285.27777099609375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.476, + "grad_norm": 1.6352938413619995, + "kl": 0.044647216796875, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 9275768.0, + "reward": -1.682861328125, + "reward_std": 4.313589096069336, + "rewards/rm_reward_func/mean": -1.682861328125, + "rewards/rm_reward_func/std": 15.943973541259766, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 299.65625, + "completions/mean_terminated_length": 269.3214416503906, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.4768, + "grad_norm": 1.834975242614746, + "kl": 0.05218505859375, + "learning_rate": 1e-06, + "loss": 0.0771, + "num_tokens": 9288565.0, + "reward": 12.650634765625, + "reward_std": 7.1547322273254395, + "rewards/rm_reward_func/mean": 12.650634765625, + "rewards/rm_reward_func/std": 17.42757797241211, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 263.53125, + "completions/mean_terminated_length": 237.8275909423828, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.4776, + "grad_norm": 2.0557568073272705, + "kl": 0.06146240234375, + "learning_rate": 1e-06, + "loss": -0.0179, + "num_tokens": 9306966.0, + "reward": 2.749176025390625, + "reward_std": 5.658071041107178, + "rewards/rm_reward_func/mean": 2.749176025390625, + "rewards/rm_reward_func/std": 15.680941581726074, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 420.4375, + "completions/mean_terminated_length": 245.63636779785156, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.4784, + "grad_norm": 1.680214285850525, + "kl": 0.062255859375, + "learning_rate": 1e-06, + "loss": 0.0611, + "num_tokens": 9328148.0, + "reward": -4.691650390625, + "reward_std": 6.364143371582031, + "rewards/rm_reward_func/mean": -4.691650390625, + "rewards/rm_reward_func/std": 10.662463188171387, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 443.84375, + "completions/mean_terminated_length": 390.8333435058594, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.4792, + "grad_norm": 1.7402617931365967, + "kl": 0.03106689453125, + "learning_rate": 1e-06, + "loss": -0.0185, + "num_tokens": 9347727.0, + "reward": -6.467201232910156, + "reward_std": 4.6516571044921875, + "rewards/rm_reward_func/mean": -6.467201232910156, + "rewards/rm_reward_func/std": 8.010056495666504, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 410.71875, + "completions/mean_terminated_length": 331.9444580078125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.48, + "grad_norm": 1.6535178422927856, + "kl": 0.052520751953125, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 9365662.0, + "reward": -1.771240234375, + "reward_std": 5.220821857452393, + "rewards/rm_reward_func/mean": -1.771240234375, + "rewards/rm_reward_func/std": 14.564139366149902, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 415.40625, + "completions/mean_terminated_length": 305.933349609375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.4808, + "grad_norm": 1.6914423704147339, + "kl": 0.04718017578125, + "learning_rate": 1e-06, + "loss": 0.055, + "num_tokens": 9384427.0, + "reward": -12.693359375, + "reward_std": 5.4263811111450195, + "rewards/rm_reward_func/mean": -12.693359375, + "rewards/rm_reward_func/std": 10.985054016113281, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 367.28125, + "completions/mean_terminated_length": 326.7599792480469, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.4816, + "grad_norm": 1.9667549133300781, + "kl": 0.037353515625, + "learning_rate": 1e-06, + "loss": 0.0493, + "num_tokens": 9398788.0, + "reward": -6.305328369140625, + "reward_std": 5.133925437927246, + "rewards/rm_reward_func/mean": -6.305328369140625, + "rewards/rm_reward_func/std": 9.334168434143066, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 238.1875, + "completions/mean_terminated_length": 219.933349609375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.4824, + "grad_norm": 2.58416748046875, + "kl": 0.06597900390625, + "learning_rate": 1e-06, + "loss": -0.0214, + "num_tokens": 9410746.0, + "reward": -7.6614990234375, + "reward_std": 7.693002700805664, + "rewards/rm_reward_func/mean": -7.6614990234375, + "rewards/rm_reward_func/std": 14.761738777160645, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 235.65625, + "completions/mean_terminated_length": 196.17857360839844, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 0.4832, + "grad_norm": 2.5852348804473877, + "kl": 0.13104248046875, + "learning_rate": 1e-06, + "loss": -0.1143, + "num_tokens": 9421127.0, + "reward": 2.098388671875, + "reward_std": 7.974844932556152, + "rewards/rm_reward_func/mean": 2.098388671875, + "rewards/rm_reward_func/std": 9.957184791564941, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 325.4375, + "completions/mean_terminated_length": 282.3846130371094, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.484, + "grad_norm": 1.7157469987869263, + "kl": 0.05224609375, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 9439773.0, + "reward": 0.78857421875, + "reward_std": 5.723252773284912, + "rewards/rm_reward_func/mean": 0.78857421875, + "rewards/rm_reward_func/std": 16.11743927001953, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 325.96875, + "completions/mean_terminated_length": 319.9677429199219, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.4848, + "grad_norm": 1.998733639717102, + "kl": 0.035552978515625, + "learning_rate": 1e-06, + "loss": -0.0857, + "num_tokens": 9453172.0, + "reward": 1.727783203125, + "reward_std": 5.9384870529174805, + "rewards/rm_reward_func/mean": 1.727783203125, + "rewards/rm_reward_func/std": 10.705312728881836, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 281.46875, + "completions/mean_terminated_length": 266.1000061035156, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.4856, + "grad_norm": 1.7531453371047974, + "kl": 0.04669189453125, + "learning_rate": 1e-06, + "loss": -0.0411, + "num_tokens": 9466747.0, + "reward": -5.81231689453125, + "reward_std": 7.3530473709106445, + "rewards/rm_reward_func/mean": -5.81231689453125, + "rewards/rm_reward_func/std": 12.287769317626953, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 416.03125, + "completions/mean_terminated_length": 393.8846435546875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.4864, + "grad_norm": 1.66568922996521, + "kl": 0.0294189453125, + "learning_rate": 1e-06, + "loss": -0.009, + "num_tokens": 9482012.0, + "reward": -9.846038818359375, + "reward_std": 4.556893825531006, + "rewards/rm_reward_func/mean": -9.846038818359375, + "rewards/rm_reward_func/std": 9.061426162719727, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 403.0625, + "completions/mean_terminated_length": 353.54547119140625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.4872, + "grad_norm": 1.5169655084609985, + "kl": 0.0379638671875, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 9501286.0, + "reward": -8.5224609375, + "reward_std": 3.9122314453125, + "rewards/rm_reward_func/mean": -8.5224609375, + "rewards/rm_reward_func/std": 7.418508529663086, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 351.125, + "completions/mean_terminated_length": 306.0799865722656, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.488, + "grad_norm": 2.2387771606445312, + "kl": 0.06402587890625, + "learning_rate": 1e-06, + "loss": 0.0565, + "num_tokens": 9515282.0, + "reward": -9.9456787109375, + "reward_std": 3.9989800453186035, + "rewards/rm_reward_func/mean": -9.9456787109375, + "rewards/rm_reward_func/std": 5.2820024490356445, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 324.71875, + "completions/mean_terminated_length": 281.5, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.4888, + "grad_norm": 4.628117561340332, + "kl": 0.0728759765625, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 9529217.0, + "reward": -5.538818359375, + "reward_std": 6.819372177124023, + "rewards/rm_reward_func/mean": -5.538818359375, + "rewards/rm_reward_func/std": 13.196995735168457, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 366.84375, + "completions/mean_terminated_length": 326.1999816894531, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.4896, + "grad_norm": 1.801947832107544, + "kl": 0.053436279296875, + "learning_rate": 1e-06, + "loss": 0.0553, + "num_tokens": 9547500.0, + "reward": 6.10186767578125, + "reward_std": 4.505708694458008, + "rewards/rm_reward_func/mean": 6.10186767578125, + "rewards/rm_reward_func/std": 9.225835800170898, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 298.90625, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.4904, + "grad_norm": 2.222768545150757, + "kl": 0.09490966796875, + "learning_rate": 1e-06, + "loss": 0.0954, + "num_tokens": 9563457.0, + "reward": -3.5010986328125, + "reward_std": 5.8613433837890625, + "rewards/rm_reward_func/mean": -3.5010986328125, + "rewards/rm_reward_func/std": 8.318079948425293, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 295.1875, + "completions/mean_terminated_length": 264.21429443359375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.4912, + "grad_norm": 2.4827163219451904, + "kl": 0.064544677734375, + "learning_rate": 1e-06, + "loss": -0.0408, + "num_tokens": 9578143.0, + "reward": -3.1361923217773438, + "reward_std": 6.2348527908325195, + "rewards/rm_reward_func/mean": -3.1361923217773438, + "rewards/rm_reward_func/std": 11.596470832824707, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 251.375, + "completions/mean_terminated_length": 234.00001525878906, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.492, + "grad_norm": 2.847848415374756, + "kl": 0.09564208984375, + "learning_rate": 1e-06, + "loss": 0.0503, + "num_tokens": 9592859.0, + "reward": 1.0433130264282227, + "reward_std": 7.778831958770752, + "rewards/rm_reward_func/mean": 1.0433130264282227, + "rewards/rm_reward_func/std": 17.03553581237793, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 265.84375, + "completions/mean_terminated_length": 183.7916717529297, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.4928, + "grad_norm": 2.820173740386963, + "kl": 0.058197021484375, + "learning_rate": 1e-06, + "loss": 0.0582, + "num_tokens": 9607222.0, + "reward": 4.380126953125, + "reward_std": 8.631103515625, + "rewards/rm_reward_func/mean": 4.380126953125, + "rewards/rm_reward_func/std": 13.627358436584473, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 334.78125, + "completions/mean_terminated_length": 178.41175842285156, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.4936, + "grad_norm": 2.0903537273406982, + "kl": 0.0706787109375, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 9625343.0, + "reward": -1.5030517578125, + "reward_std": 6.87320613861084, + "rewards/rm_reward_func/mean": -1.5030517578125, + "rewards/rm_reward_func/std": 10.338603019714355, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 415.28125, + "completions/mean_terminated_length": 318.5625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.4944, + "grad_norm": 1.6368532180786133, + "kl": 0.037841796875, + "learning_rate": 1e-06, + "loss": -0.0313, + "num_tokens": 9640840.0, + "reward": -0.279541015625, + "reward_std": 7.710910320281982, + "rewards/rm_reward_func/mean": -0.279541015625, + "rewards/rm_reward_func/std": 10.624358177185059, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 296.96875, + "completions/mean_terminated_length": 212.8260955810547, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.4952, + "grad_norm": 2.6116342544555664, + "kl": 0.05682373046875, + "learning_rate": 1e-06, + "loss": -0.0387, + "num_tokens": 9654815.0, + "reward": -2.69921875, + "reward_std": 5.148902416229248, + "rewards/rm_reward_func/mean": -2.69921875, + "rewards/rm_reward_func/std": 8.160242080688477, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 212.4375, + "completions/mean_terminated_length": 202.77418518066406, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.496, + "grad_norm": 3.1982104778289795, + "kl": 0.057373046875, + "learning_rate": 1e-06, + "loss": 0.2864, + "num_tokens": 9668509.0, + "reward": 1.27655029296875, + "reward_std": 5.835885047912598, + "rewards/rm_reward_func/mean": 1.27655029296875, + "rewards/rm_reward_func/std": 10.774589538574219, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 312.4375, + "completions/mean_terminated_length": 266.3846130371094, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.4968, + "grad_norm": 2.5851590633392334, + "kl": 0.0930938720703125, + "learning_rate": 1e-06, + "loss": 0.0605, + "num_tokens": 9682323.0, + "reward": -10.87109375, + "reward_std": 3.5456008911132812, + "rewards/rm_reward_func/mean": -10.87109375, + "rewards/rm_reward_func/std": 8.541075706481934, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 336.71875, + "completions/mean_terminated_length": 296.2692565917969, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.4976, + "grad_norm": 2.3126883506774902, + "kl": 0.0582275390625, + "learning_rate": 1e-06, + "loss": -0.0541, + "num_tokens": 9695466.0, + "reward": -7.81304931640625, + "reward_std": 4.616548538208008, + "rewards/rm_reward_func/mean": -7.81304931640625, + "rewards/rm_reward_func/std": 10.593924522399902, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 225.0, + "completions/mean_terminated_length": 205.86668395996094, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.4984, + "grad_norm": 2.4763219356536865, + "kl": 0.07196044921875, + "learning_rate": 1e-06, + "loss": -0.0575, + "num_tokens": 9705922.0, + "reward": -1.53277587890625, + "reward_std": 6.163647651672363, + "rewards/rm_reward_func/mean": -1.53277587890625, + "rewards/rm_reward_func/std": 6.894399642944336, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 158.5, + "completions/mean_terminated_length": 158.5, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.4992, + "grad_norm": 2.5002763271331787, + "kl": 0.162841796875, + "learning_rate": 1e-06, + "loss": 0.0508, + "num_tokens": 9720290.0, + "reward": 0.99395751953125, + "reward_std": 7.8396711349487305, + "rewards/rm_reward_func/mean": 0.99395751953125, + "rewards/rm_reward_func/std": 8.556918144226074, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 259.6875, + "completions/mean_terminated_length": 233.58621215820312, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.5, + "grad_norm": 4.446507930755615, + "kl": 0.071929931640625, + "learning_rate": 1e-06, + "loss": -0.0511, + "num_tokens": 9731312.0, + "reward": 2.507080078125, + "reward_std": 6.047924518585205, + "rewards/rm_reward_func/mean": 2.507080078125, + "rewards/rm_reward_func/std": 10.99448013305664, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 379.84375, + "completions/mean_terminated_length": 349.3461608886719, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.5008, + "grad_norm": 1.792451024055481, + "kl": 0.032623291015625, + "learning_rate": 1e-06, + "loss": -0.036, + "num_tokens": 9748531.0, + "reward": -5.40380859375, + "reward_std": 5.815441131591797, + "rewards/rm_reward_func/mean": -5.40380859375, + "rewards/rm_reward_func/std": 10.240708351135254, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 302.5, + "completions/mean_terminated_length": 280.82757568359375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.5016, + "grad_norm": 2.6559295654296875, + "kl": 0.05279541015625, + "learning_rate": 1e-06, + "loss": 0.056, + "num_tokens": 9762539.0, + "reward": -8.6014404296875, + "reward_std": 4.403963565826416, + "rewards/rm_reward_func/mean": -8.6014404296875, + "rewards/rm_reward_func/std": 12.047574996948242, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 408.34375, + "completions/mean_terminated_length": 337.4210510253906, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.5024, + "grad_norm": 1.783828854560852, + "kl": 0.034027099609375, + "learning_rate": 1e-06, + "loss": 0.0243, + "num_tokens": 9778182.0, + "reward": -15.5244140625, + "reward_std": 2.8039050102233887, + "rewards/rm_reward_func/mean": -15.5244140625, + "rewards/rm_reward_func/std": 4.276149272918701, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 314.96875, + "completions/mean_terminated_length": 225.4091033935547, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.5032, + "grad_norm": 2.4500131607055664, + "kl": 0.056640625, + "learning_rate": 1e-06, + "loss": 0.0393, + "num_tokens": 9794269.0, + "reward": -13.24908447265625, + "reward_std": 3.2599363327026367, + "rewards/rm_reward_func/mean": -13.24908447265625, + "rewards/rm_reward_func/std": 5.914889812469482, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 387.40625, + "completions/mean_terminated_length": 364.3333435058594, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.504, + "grad_norm": 1.5309455394744873, + "kl": 0.046295166015625, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 9811658.0, + "reward": -1.0367431640625, + "reward_std": 5.9622039794921875, + "rewards/rm_reward_func/mean": -1.0367431640625, + "rewards/rm_reward_func/std": 7.742504596710205, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 415.90625, + "completions/mean_terminated_length": 378.3043518066406, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.5048, + "grad_norm": 1.4947994947433472, + "kl": 0.04400634765625, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 9831607.0, + "reward": -7.2921142578125, + "reward_std": 4.765170574188232, + "rewards/rm_reward_func/mean": -7.2921142578125, + "rewards/rm_reward_func/std": 6.113991737365723, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 355.03125, + "completions/mean_terminated_length": 293.60870361328125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.5056, + "grad_norm": 1.589095950126648, + "kl": 0.0552978515625, + "learning_rate": 1e-06, + "loss": 0.0468, + "num_tokens": 9847840.0, + "reward": -1.291259765625, + "reward_std": 7.207411766052246, + "rewards/rm_reward_func/mean": -1.291259765625, + "rewards/rm_reward_func/std": 10.963984489440918, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 313.40625, + "completions/mean_terminated_length": 292.862060546875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5064, + "grad_norm": 2.0282340049743652, + "kl": 0.0574951171875, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 9864133.0, + "reward": 4.97216796875, + "reward_std": 7.371421813964844, + "rewards/rm_reward_func/mean": 4.97216796875, + "rewards/rm_reward_func/std": 13.95275592803955, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 371.75, + "completions/mean_terminated_length": 308.0, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.5072, + "grad_norm": 1.7619508504867554, + "kl": 0.06695556640625, + "learning_rate": 1e-06, + "loss": 0.1002, + "num_tokens": 9881549.0, + "reward": -5.4586181640625, + "reward_std": 4.504947662353516, + "rewards/rm_reward_func/mean": -5.4586181640625, + "rewards/rm_reward_func/std": 11.516170501708984, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 329.125, + "completions/mean_terminated_length": 323.2257995605469, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.508, + "grad_norm": 1.6464998722076416, + "kl": 0.042755126953125, + "learning_rate": 1e-06, + "loss": -0.0816, + "num_tokens": 9894737.0, + "reward": 12.748550415039062, + "reward_std": 8.391376495361328, + "rewards/rm_reward_func/mean": 12.748550415039062, + "rewards/rm_reward_func/std": 19.06916046142578, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 368.46875, + "completions/mean_terminated_length": 293.28570556640625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.5088, + "grad_norm": 1.850199818611145, + "kl": 0.04595947265625, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 9908624.0, + "reward": -6.959320068359375, + "reward_std": 5.563976287841797, + "rewards/rm_reward_func/mean": -6.959320068359375, + "rewards/rm_reward_func/std": 7.926870822906494, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 398.0625, + "completions/mean_terminated_length": 320.1052551269531, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5096, + "grad_norm": 1.7937188148498535, + "kl": 0.045806884765625, + "learning_rate": 1e-06, + "loss": 0.0534, + "num_tokens": 9925842.0, + "reward": -2.13720703125, + "reward_std": 3.777244806289673, + "rewards/rm_reward_func/mean": -2.13720703125, + "rewards/rm_reward_func/std": 13.738818168640137, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 425.125, + "completions/mean_terminated_length": 373.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.5104, + "grad_norm": 1.5506154298782349, + "kl": 0.03411865234375, + "learning_rate": 1e-06, + "loss": -0.0729, + "num_tokens": 9942966.0, + "reward": -6.9198150634765625, + "reward_std": 3.644956588745117, + "rewards/rm_reward_func/mean": -6.9198150634765625, + "rewards/rm_reward_func/std": 5.957521915435791, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 310.03125, + "completions/mean_terminated_length": 242.70834350585938, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.5112, + "grad_norm": 1.7421283721923828, + "kl": 0.07305908203125, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 9963783.0, + "reward": 7.260498046875, + "reward_std": 6.513481140136719, + "rewards/rm_reward_func/mean": 7.260498046875, + "rewards/rm_reward_func/std": 15.586586952209473, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 264.03125, + "completions/mean_terminated_length": 256.0322570800781, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.512, + "grad_norm": 2.81355619430542, + "kl": 0.086578369140625, + "learning_rate": 1e-06, + "loss": -0.0545, + "num_tokens": 9974304.0, + "reward": -9.13055419921875, + "reward_std": 3.8298912048339844, + "rewards/rm_reward_func/mean": -9.13055419921875, + "rewards/rm_reward_func/std": 9.440561294555664, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 316.3125, + "completions/mean_terminated_length": 303.2666931152344, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.5128, + "grad_norm": 2.1373441219329834, + "kl": 0.04754638671875, + "learning_rate": 1e-06, + "loss": -0.0951, + "num_tokens": 9989930.0, + "reward": -1.6126251220703125, + "reward_std": 4.084563255310059, + "rewards/rm_reward_func/mean": -1.6126251220703125, + "rewards/rm_reward_func/std": 14.366743087768555, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 281.25, + "completions/mean_terminated_length": 265.8666687011719, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.5136, + "grad_norm": 2.004603862762451, + "kl": 0.06787109375, + "learning_rate": 1e-06, + "loss": 0.0411, + "num_tokens": 10009138.0, + "reward": 3.9356231689453125, + "reward_std": 7.002434730529785, + "rewards/rm_reward_func/mean": 3.9356231689453125, + "rewards/rm_reward_func/std": 15.040298461914062, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 259.53125, + "completions/mean_terminated_length": 212.7777862548828, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.5144, + "grad_norm": 2.8500921726226807, + "kl": 0.07763671875, + "learning_rate": 1e-06, + "loss": 0.2831, + "num_tokens": 10025979.0, + "reward": 2.1851806640625, + "reward_std": 6.346950531005859, + "rewards/rm_reward_func/mean": 2.1851806640625, + "rewards/rm_reward_func/std": 12.503535270690918, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 388.15625, + "completions/mean_terminated_length": 303.4210510253906, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.5152, + "grad_norm": 1.5350950956344604, + "kl": 0.032867431640625, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 10046080.0, + "reward": -3.0421142578125, + "reward_std": 4.437884330749512, + "rewards/rm_reward_func/mean": -3.0421142578125, + "rewards/rm_reward_func/std": 10.139527320861816, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 420.46875, + "completions/mean_terminated_length": 399.3461608886719, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.516, + "grad_norm": 1.7513537406921387, + "kl": 0.035552978515625, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 10063983.0, + "reward": 4.243305206298828, + "reward_std": 6.186894416809082, + "rewards/rm_reward_func/mean": 4.243305206298828, + "rewards/rm_reward_func/std": 9.74747085571289, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 337.5, + "completions/mean_terminated_length": 279.3333435058594, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.5168, + "grad_norm": 1.842439889907837, + "kl": 0.060791015625, + "learning_rate": 1e-06, + "loss": -0.0122, + "num_tokens": 10078255.0, + "reward": -10.680419921875, + "reward_std": 4.411944389343262, + "rewards/rm_reward_func/mean": -10.680419921875, + "rewards/rm_reward_func/std": 12.604340553283691, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 466.53125, + "completions/mean_terminated_length": 408.0714416503906, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.5176, + "grad_norm": 1.3872531652450562, + "kl": 0.032196044921875, + "learning_rate": 1e-06, + "loss": 0.0647, + "num_tokens": 10099880.0, + "reward": -2.6961669921875, + "reward_std": 4.499116897583008, + "rewards/rm_reward_func/mean": -2.6961669921875, + "rewards/rm_reward_func/std": 10.117501258850098, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 366.71875, + "completions/mean_terminated_length": 345.96429443359375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.5184, + "grad_norm": 1.91852605342865, + "kl": 0.040557861328125, + "learning_rate": 1e-06, + "loss": -0.0402, + "num_tokens": 10116311.0, + "reward": -5.5899658203125, + "reward_std": 8.005522727966309, + "rewards/rm_reward_func/mean": -5.5899658203125, + "rewards/rm_reward_func/std": 12.845687866210938, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 321.1875, + "completions/mean_terminated_length": 315.0322570800781, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.5192, + "grad_norm": 1.8793799877166748, + "kl": 0.0601806640625, + "learning_rate": 1e-06, + "loss": 0.0312, + "num_tokens": 10134677.0, + "reward": 3.2547149658203125, + "reward_std": 6.556687831878662, + "rewards/rm_reward_func/mean": 3.2547149658203125, + "rewards/rm_reward_func/std": 12.492579460144043, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 258.71875, + "completions/mean_terminated_length": 241.83334350585938, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.52, + "grad_norm": 2.1236674785614014, + "kl": 0.055816650390625, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 10146356.0, + "reward": -3.1171875, + "reward_std": 5.859035491943359, + "rewards/rm_reward_func/mean": -3.1171875, + "rewards/rm_reward_func/std": 7.14331579208374, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 396.46875, + "completions/mean_terminated_length": 357.9583435058594, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.5208, + "grad_norm": 1.4211030006408691, + "kl": 0.039154052734375, + "learning_rate": 1e-06, + "loss": -0.061, + "num_tokens": 10166019.0, + "reward": 5.25341796875, + "reward_std": 10.9521484375, + "rewards/rm_reward_func/mean": 5.25341796875, + "rewards/rm_reward_func/std": 15.721810340881348, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 376.65625, + "completions/mean_terminated_length": 345.423095703125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.5216, + "grad_norm": 1.9456560611724854, + "kl": 0.04608154296875, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 10182192.0, + "reward": -5.745849609375, + "reward_std": 4.124988555908203, + "rewards/rm_reward_func/mean": -5.745849609375, + "rewards/rm_reward_func/std": 11.21120548248291, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 383.46875, + "completions/mean_terminated_length": 374.9000244140625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5224, + "grad_norm": 1.6588629484176636, + "kl": 0.04278564453125, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 10200519.0, + "reward": 5.29150390625, + "reward_std": 4.636484622955322, + "rewards/rm_reward_func/mean": 5.29150390625, + "rewards/rm_reward_func/std": 15.785604476928711, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 288.84375, + "completions/mean_terminated_length": 265.75860595703125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.5232, + "grad_norm": 1.7255975008010864, + "kl": 0.0921630859375, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 10220714.0, + "reward": 6.576171875, + "reward_std": 4.431131839752197, + "rewards/rm_reward_func/mean": 6.576171875, + "rewards/rm_reward_func/std": 19.577335357666016, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 330.28125, + "completions/mean_terminated_length": 296.629638671875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.524, + "grad_norm": 2.8558413982391357, + "kl": 0.0816650390625, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 10237387.0, + "reward": -5.791311264038086, + "reward_std": 4.968636512756348, + "rewards/rm_reward_func/mean": -5.791311264038086, + "rewards/rm_reward_func/std": 7.290288925170898, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 243.34375, + "completions/mean_terminated_length": 243.34375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.5248, + "grad_norm": 2.336859941482544, + "kl": 0.1103515625, + "learning_rate": 1e-06, + "loss": -0.0244, + "num_tokens": 10250534.0, + "reward": 6.526641845703125, + "reward_std": 5.3141584396362305, + "rewards/rm_reward_func/mean": 6.526641845703125, + "rewards/rm_reward_func/std": 14.079063415527344, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 314.8125, + "completions/mean_terminated_length": 225.18182373046875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.5256, + "grad_norm": 2.8797378540039062, + "kl": 0.1182861328125, + "learning_rate": 1e-06, + "loss": 0.0319, + "num_tokens": 10264592.0, + "reward": -3.999755859375, + "reward_std": 3.713993787765503, + "rewards/rm_reward_func/mean": -3.999755859375, + "rewards/rm_reward_func/std": 9.400103569030762, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 290.53125, + "completions/mean_terminated_length": 258.89288330078125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.5264, + "grad_norm": 2.0574896335601807, + "kl": 0.074462890625, + "learning_rate": 1e-06, + "loss": 0.0272, + "num_tokens": 10282777.0, + "reward": 5.42041015625, + "reward_std": 4.476833343505859, + "rewards/rm_reward_func/mean": 5.42041015625, + "rewards/rm_reward_func/std": 19.670827865600586, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 334.15625, + "completions/mean_terminated_length": 308.75, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.5272, + "grad_norm": 1.619626522064209, + "kl": 0.03314208984375, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 10297694.0, + "reward": 11.3914794921875, + "reward_std": 11.132980346679688, + "rewards/rm_reward_func/mean": 11.3914794921875, + "rewards/rm_reward_func/std": 12.282429695129395, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 354.71875, + "completions/mean_terminated_length": 325.59259033203125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.528, + "grad_norm": 2.439786672592163, + "kl": 0.0491180419921875, + "learning_rate": 1e-06, + "loss": 0.0338, + "num_tokens": 10315453.0, + "reward": 1.82305908203125, + "reward_std": 6.995844841003418, + "rewards/rm_reward_func/mean": 1.82305908203125, + "rewards/rm_reward_func/std": 9.528385162353516, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 386.0, + "completions/mean_terminated_length": 336.6956481933594, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.5288, + "grad_norm": 1.8334951400756836, + "kl": 0.05743408203125, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 10333885.0, + "reward": -1.98828125, + "reward_std": 6.946992874145508, + "rewards/rm_reward_func/mean": -1.98828125, + "rewards/rm_reward_func/std": 12.256511688232422, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 429.65625, + "completions/mean_terminated_length": 347.3125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.5296, + "grad_norm": 1.840104103088379, + "kl": 0.0287322998046875, + "learning_rate": 1e-06, + "loss": -0.0355, + "num_tokens": 10351754.0, + "reward": -9.538909912109375, + "reward_std": 4.58635139465332, + "rewards/rm_reward_func/mean": -9.538909912109375, + "rewards/rm_reward_func/std": 12.071836471557617, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 289.28125, + "completions/mean_terminated_length": 289.28125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5304, + "grad_norm": 2.0324652194976807, + "kl": 0.05859375, + "learning_rate": 1e-06, + "loss": -0.0354, + "num_tokens": 10365827.0, + "reward": 1.1844482421875, + "reward_std": 3.9891622066497803, + "rewards/rm_reward_func/mean": 1.1844482421875, + "rewards/rm_reward_func/std": 14.031904220581055, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 346.28125, + "completions/mean_terminated_length": 281.4347839355469, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.5312, + "grad_norm": 1.7831884622573853, + "kl": 0.032440185546875, + "learning_rate": 1e-06, + "loss": -0.0478, + "num_tokens": 10379572.0, + "reward": -2.488250732421875, + "reward_std": 5.685158729553223, + "rewards/rm_reward_func/mean": -2.488250732421875, + "rewards/rm_reward_func/std": 6.3590545654296875, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 272.03125, + "completions/mean_terminated_length": 272.03125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.532, + "grad_norm": 1.8832321166992188, + "kl": 0.06591796875, + "learning_rate": 1e-06, + "loss": 0.0564, + "num_tokens": 10394173.0, + "reward": -6.126922607421875, + "reward_std": 6.068042755126953, + "rewards/rm_reward_func/mean": -6.126922607421875, + "rewards/rm_reward_func/std": 12.041190147399902, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 262.09375, + "completions/mean_terminated_length": 236.2413787841797, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.5328, + "grad_norm": 1.9710813760757446, + "kl": 0.0518798828125, + "learning_rate": 1e-06, + "loss": 0.0308, + "num_tokens": 10407152.0, + "reward": -8.995849609375, + "reward_std": 5.564748764038086, + "rewards/rm_reward_func/mean": -8.995849609375, + "rewards/rm_reward_func/std": 11.433723449707031, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 251.0, + "completions/mean_terminated_length": 202.6666717529297, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.5336, + "grad_norm": 3.284055471420288, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": -0.0242, + "num_tokens": 10417864.0, + "reward": 4.966941833496094, + "reward_std": 4.505078315734863, + "rewards/rm_reward_func/mean": 4.966941833496094, + "rewards/rm_reward_func/std": 9.16561508178711, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 411.3125, + "completions/mean_terminated_length": 371.9130554199219, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.5344, + "grad_norm": 1.6275047063827515, + "kl": 0.04730224609375, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 10437658.0, + "reward": -2.92010498046875, + "reward_std": 3.7449872493743896, + "rewards/rm_reward_func/mean": -2.92010498046875, + "rewards/rm_reward_func/std": 18.138641357421875, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 311.09375, + "completions/mean_terminated_length": 282.39288330078125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.5352, + "grad_norm": 1.9545366764068604, + "kl": 0.050048828125, + "learning_rate": 1e-06, + "loss": -0.0653, + "num_tokens": 10452317.0, + "reward": 2.5281982421875, + "reward_std": 5.642504692077637, + "rewards/rm_reward_func/mean": 2.5281982421875, + "rewards/rm_reward_func/std": 17.278425216674805, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 370.5, + "completions/mean_terminated_length": 355.862060546875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.536, + "grad_norm": 1.5525546073913574, + "kl": 0.033599853515625, + "learning_rate": 1e-06, + "loss": -0.0678, + "num_tokens": 10466829.0, + "reward": 11.3348388671875, + "reward_std": 9.100727081298828, + "rewards/rm_reward_func/mean": 11.3348388671875, + "rewards/rm_reward_func/std": 16.054628372192383, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 241.09375, + "completions/mean_terminated_length": 232.35482788085938, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.5368, + "grad_norm": 2.1375620365142822, + "kl": 0.10784912109375, + "learning_rate": 1e-06, + "loss": -0.1169, + "num_tokens": 10485712.0, + "reward": -1.968780517578125, + "reward_std": 8.304351806640625, + "rewards/rm_reward_func/mean": -1.968780517578125, + "rewards/rm_reward_func/std": 9.414302825927734, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 271.84375, + "completions/mean_terminated_length": 237.5357208251953, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.5376, + "grad_norm": 1.9433166980743408, + "kl": 0.09661865234375, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 10506307.0, + "reward": 12.4996337890625, + "reward_std": 7.820014476776123, + "rewards/rm_reward_func/mean": 12.4996337890625, + "rewards/rm_reward_func/std": 11.681737899780273, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 317.375, + "completions/mean_terminated_length": 200.60000610351562, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.5384, + "grad_norm": 2.130408525466919, + "kl": 0.07281494140625, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 10519543.0, + "reward": -0.3404541015625, + "reward_std": 3.709181070327759, + "rewards/rm_reward_func/mean": -0.3404541015625, + "rewards/rm_reward_func/std": 16.980573654174805, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 462.46875, + "completions/mean_terminated_length": 406.3333435058594, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.5392, + "grad_norm": 1.6774389743804932, + "kl": 0.0355224609375, + "learning_rate": 1e-06, + "loss": 0.0253, + "num_tokens": 10537534.0, + "reward": -7.8924560546875, + "reward_std": 4.415147304534912, + "rewards/rm_reward_func/mean": -7.8924560546875, + "rewards/rm_reward_func/std": 12.546490669250488, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 364.1875, + "completions/mean_terminated_length": 263.0526428222656, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.54, + "grad_norm": 3.784860610961914, + "kl": 0.058868408203125, + "learning_rate": 1e-06, + "loss": 0.2937, + "num_tokens": 10554500.0, + "reward": -5.244312286376953, + "reward_std": 4.874639987945557, + "rewards/rm_reward_func/mean": -5.244312286376953, + "rewards/rm_reward_func/std": 7.0300822257995605, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 420.625, + "completions/mean_terminated_length": 349.5555725097656, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.5408, + "grad_norm": 1.6555469036102295, + "kl": 0.04266357421875, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 10573168.0, + "reward": -8.323974609375, + "reward_std": 4.69626522064209, + "rewards/rm_reward_func/mean": -8.323974609375, + "rewards/rm_reward_func/std": 8.503193855285645, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 321.59375, + "completions/mean_terminated_length": 247.0869598388672, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.5416, + "grad_norm": 2.275675058364868, + "kl": 0.039520263671875, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 10585491.0, + "reward": -8.84619140625, + "reward_std": 3.501187324523926, + "rewards/rm_reward_func/mean": -8.84619140625, + "rewards/rm_reward_func/std": 11.03673267364502, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 324.21875, + "completions/mean_terminated_length": 238.8636474609375, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.5424, + "grad_norm": 5.370597839355469, + "kl": 0.081756591796875, + "learning_rate": 1e-06, + "loss": 0.1498, + "num_tokens": 10598666.0, + "reward": -7.942596435546875, + "reward_std": 5.026332378387451, + "rewards/rm_reward_func/mean": -7.942596435546875, + "rewards/rm_reward_func/std": 7.243937015533447, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 293.9375, + "completions/mean_terminated_length": 221.25, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.5432, + "grad_norm": 2.0717859268188477, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": -0.0554, + "num_tokens": 10616280.0, + "reward": -0.78076171875, + "reward_std": 6.128231525421143, + "rewards/rm_reward_func/mean": -0.78076171875, + "rewards/rm_reward_func/std": 11.496376037597656, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 417.0, + "completions/mean_terminated_length": 174.22222900390625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.544, + "grad_norm": 1.7002750635147095, + "kl": 0.041168212890625, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 10634576.0, + "reward": -11.08489990234375, + "reward_std": 4.592316150665283, + "rewards/rm_reward_func/mean": -11.08489990234375, + "rewards/rm_reward_func/std": 16.18546485900879, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 375.625, + "completions/mean_terminated_length": 356.14288330078125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.5448, + "grad_norm": 1.8196467161178589, + "kl": 0.06439208984375, + "learning_rate": 1e-06, + "loss": 0.0267, + "num_tokens": 10649876.0, + "reward": -6.37109375, + "reward_std": 6.397860527038574, + "rewards/rm_reward_func/mean": -6.37109375, + "rewards/rm_reward_func/std": 11.019240379333496, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 370.03125, + "completions/mean_terminated_length": 295.66668701171875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.5456, + "grad_norm": 1.6078789234161377, + "kl": 0.0586090087890625, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 10669509.0, + "reward": 10.754119873046875, + "reward_std": 7.517634868621826, + "rewards/rm_reward_func/mean": 10.754119873046875, + "rewards/rm_reward_func/std": 17.389326095581055, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 305.625, + "completions/mean_terminated_length": 284.2758483886719, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.5464, + "grad_norm": 1.49837327003479, + "kl": 0.0294189453125, + "learning_rate": 1e-06, + "loss": 0.1081, + "num_tokens": 10685697.0, + "reward": 11.482666015625, + "reward_std": 8.782905578613281, + "rewards/rm_reward_func/mean": 11.482666015625, + "rewards/rm_reward_func/std": 11.745436668395996, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 367.5625, + "completions/mean_terminated_length": 352.6206970214844, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.5472, + "grad_norm": 1.7170090675354004, + "kl": 0.09490966796875, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 10702963.0, + "reward": 15.99676513671875, + "reward_std": 4.8248138427734375, + "rewards/rm_reward_func/mean": 15.99676513671875, + "rewards/rm_reward_func/std": 19.297609329223633, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 333.59375, + "completions/mean_terminated_length": 274.125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.548, + "grad_norm": 2.1568658351898193, + "kl": 0.0845947265625, + "learning_rate": 1e-06, + "loss": -0.0457, + "num_tokens": 10720230.0, + "reward": -4.34033203125, + "reward_std": 4.9244465827941895, + "rewards/rm_reward_func/mean": -4.34033203125, + "rewards/rm_reward_func/std": 8.559109687805176, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 320.9375, + "completions/mean_terminated_length": 257.25, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.5488, + "grad_norm": 1.8681159019470215, + "kl": 0.08856201171875, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 10737508.0, + "reward": -0.158447265625, + "reward_std": 4.81589412689209, + "rewards/rm_reward_func/mean": -0.158447265625, + "rewards/rm_reward_func/std": 18.756925582885742, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 388.59375, + "completions/mean_terminated_length": 347.4583435058594, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.5496, + "grad_norm": 1.6849080324172974, + "kl": 0.0643310546875, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 10758903.0, + "reward": 2.583740234375, + "reward_std": 5.178232192993164, + "rewards/rm_reward_func/mean": 2.583740234375, + "rewards/rm_reward_func/std": 14.70258617401123, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 393.28125, + "completions/mean_terminated_length": 300.9444580078125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.5504, + "grad_norm": 1.757041096687317, + "kl": 0.05181884765625, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 10776864.0, + "reward": 0.264892578125, + "reward_std": 3.916109561920166, + "rewards/rm_reward_func/mean": 0.264892578125, + "rewards/rm_reward_func/std": 18.9810791015625, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 334.8125, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.5512, + "grad_norm": 2.5558958053588867, + "kl": 0.051971435546875, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 10794122.0, + "reward": 3.2089996337890625, + "reward_std": 5.54947566986084, + "rewards/rm_reward_func/mean": 3.2089996337890625, + "rewards/rm_reward_func/std": 15.356097221374512, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 357.125, + "completions/mean_terminated_length": 321.3846130371094, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.552, + "grad_norm": 2.092510223388672, + "kl": 0.0394287109375, + "learning_rate": 1e-06, + "loss": 0.0937, + "num_tokens": 10808670.0, + "reward": -5.185791015625, + "reward_std": 5.963362693786621, + "rewards/rm_reward_func/mean": -5.185791015625, + "rewards/rm_reward_func/std": 6.186548233032227, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 348.3125, + "completions/mean_terminated_length": 331.3793029785156, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5528, + "grad_norm": 1.8329681158065796, + "kl": 0.08404541015625, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 10829104.0, + "reward": 11.92388916015625, + "reward_std": 5.070147514343262, + "rewards/rm_reward_func/mean": 11.92388916015625, + "rewards/rm_reward_func/std": 10.43554401397705, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 361.9375, + "completions/mean_terminated_length": 311.91668701171875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.5536, + "grad_norm": 1.559317708015442, + "kl": 0.046844482421875, + "learning_rate": 1e-06, + "loss": 0.0325, + "num_tokens": 10845462.0, + "reward": 5.54058837890625, + "reward_std": 3.387269973754883, + "rewards/rm_reward_func/mean": 5.54058837890625, + "rewards/rm_reward_func/std": 14.400303840637207, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 479.21875, + "completions/mean_terminated_length": 437.0714416503906, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.5544, + "grad_norm": 1.6343897581100464, + "kl": 0.03253173828125, + "learning_rate": 1e-06, + "loss": -0.0115, + "num_tokens": 10863821.0, + "reward": -2.62115478515625, + "reward_std": 4.382885932922363, + "rewards/rm_reward_func/mean": -2.62115478515625, + "rewards/rm_reward_func/std": 8.437052726745605, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 215.125, + "completions/mean_terminated_length": 215.125, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.5552, + "grad_norm": 2.8981897830963135, + "kl": 0.106201171875, + "learning_rate": 1e-06, + "loss": -0.0996, + "num_tokens": 10876001.0, + "reward": 3.706756591796875, + "reward_std": 5.014890193939209, + "rewards/rm_reward_func/mean": 3.706756591796875, + "rewards/rm_reward_func/std": 9.173577308654785, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 320.09375, + "completions/mean_terminated_length": 292.6785888671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.556, + "grad_norm": 1.8655980825424194, + "kl": 0.05462646484375, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 10891484.0, + "reward": 6.146728515625, + "reward_std": 4.927000045776367, + "rewards/rm_reward_func/mean": 6.146728515625, + "rewards/rm_reward_func/std": 10.616254806518555, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 255.4375, + "completions/mean_terminated_length": 247.16128540039062, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.5568, + "grad_norm": 2.2220993041992188, + "kl": 0.05902099609375, + "learning_rate": 1e-06, + "loss": -0.0128, + "num_tokens": 10904794.0, + "reward": -10.7166748046875, + "reward_std": 4.411683082580566, + "rewards/rm_reward_func/mean": -10.7166748046875, + "rewards/rm_reward_func/std": 13.029986381530762, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 283.03125, + "completions/mean_terminated_length": 283.03125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.5576, + "grad_norm": 1.9172923564910889, + "kl": 0.0772705078125, + "learning_rate": 1e-06, + "loss": -0.062, + "num_tokens": 10918971.0, + "reward": 12.12677001953125, + "reward_std": 6.7252349853515625, + "rewards/rm_reward_func/mean": 12.12677001953125, + "rewards/rm_reward_func/std": 10.66632080078125, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 144.375, + "completions/mean_terminated_length": 144.375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.5584, + "grad_norm": 4.019201278686523, + "kl": 0.2318115234375, + "learning_rate": 1e-06, + "loss": -0.0281, + "num_tokens": 10931639.0, + "reward": 5.988006591796875, + "reward_std": 5.7215895652771, + "rewards/rm_reward_func/mean": 5.988006591796875, + "rewards/rm_reward_func/std": 9.47252368927002, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 431.5, + "completions/mean_terminated_length": 404.66668701171875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.5592, + "grad_norm": 1.6360769271850586, + "kl": 0.0386962890625, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 10949055.0, + "reward": -4.39208984375, + "reward_std": 4.501108169555664, + "rewards/rm_reward_func/mean": -4.39208984375, + "rewards/rm_reward_func/std": 11.40958309173584, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 290.34375, + "completions/mean_terminated_length": 275.5666809082031, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.56, + "grad_norm": 4.067753791809082, + "kl": 0.10693359375, + "learning_rate": 1e-06, + "loss": 0.2284, + "num_tokens": 10962210.0, + "reward": 0.4071044921875, + "reward_std": 5.08738374710083, + "rewards/rm_reward_func/mean": 0.4071044921875, + "rewards/rm_reward_func/std": 17.504343032836914, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 480.90625, + "completions/mean_terminated_length": 449.8125, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.5608, + "grad_norm": 1.3039507865905762, + "kl": 0.03387451171875, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 10987103.0, + "reward": -11.9521484375, + "reward_std": 3.968744993209839, + "rewards/rm_reward_func/mean": -11.9521484375, + "rewards/rm_reward_func/std": 5.491401672363281, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 379.375, + "completions/mean_terminated_length": 335.16668701171875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.5616, + "grad_norm": 1.645443320274353, + "kl": 0.06439208984375, + "learning_rate": 1e-06, + "loss": 0.0412, + "num_tokens": 11005451.0, + "reward": -1.69891357421875, + "reward_std": 6.343710422515869, + "rewards/rm_reward_func/mean": -1.69891357421875, + "rewards/rm_reward_func/std": 16.001075744628906, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 353.9375, + "completions/mean_terminated_length": 317.4615478515625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.5624, + "grad_norm": 1.8242735862731934, + "kl": 0.0655517578125, + "learning_rate": 1e-06, + "loss": -0.0305, + "num_tokens": 11021993.0, + "reward": 13.1923828125, + "reward_std": 4.600231170654297, + "rewards/rm_reward_func/mean": 13.1923828125, + "rewards/rm_reward_func/std": 17.514408111572266, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 274.25, + "completions/mean_terminated_length": 249.65516662597656, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.5632, + "grad_norm": 2.241856336593628, + "kl": 0.0906982421875, + "learning_rate": 1e-06, + "loss": 0.0396, + "num_tokens": 11034577.0, + "reward": 1.3895263671875, + "reward_std": 9.88357162475586, + "rewards/rm_reward_func/mean": 1.3895263671875, + "rewards/rm_reward_func/std": 12.232261657714844, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 334.25, + "completions/mean_terminated_length": 275.0, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.564, + "grad_norm": 1.9395514726638794, + "kl": 0.045684814453125, + "learning_rate": 1e-06, + "loss": 0.0862, + "num_tokens": 11047177.0, + "reward": -1.9337158203125, + "reward_std": 5.662674903869629, + "rewards/rm_reward_func/mean": -1.9337158203125, + "rewards/rm_reward_func/std": 6.434460163116455, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 340.125, + "completions/mean_terminated_length": 292.0, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.5648, + "grad_norm": 1.7742900848388672, + "kl": 0.075836181640625, + "learning_rate": 1e-06, + "loss": 0.0275, + "num_tokens": 11065189.0, + "reward": 4.7626953125, + "reward_std": 4.769928932189941, + "rewards/rm_reward_func/mean": 4.7626953125, + "rewards/rm_reward_func/std": 20.30078125, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 267.0, + "completions/mean_terminated_length": 232.00001525878906, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5656, + "grad_norm": 3.2380051612854004, + "kl": 0.08892822265625, + "learning_rate": 1e-06, + "loss": 0.2856, + "num_tokens": 11076661.0, + "reward": 2.7825469970703125, + "reward_std": 5.321039199829102, + "rewards/rm_reward_func/mean": 2.7825469970703125, + "rewards/rm_reward_func/std": 18.488374710083008, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 474.75, + "completions/mean_terminated_length": 420.3077087402344, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.5664, + "grad_norm": 1.6572221517562866, + "kl": 0.035736083984375, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 11095293.0, + "reward": -6.4884033203125, + "reward_std": 4.526544094085693, + "rewards/rm_reward_func/mean": -6.4884033203125, + "rewards/rm_reward_func/std": 14.073304176330566, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 427.09375, + "completions/mean_terminated_length": 393.86956787109375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.5672, + "grad_norm": 1.7585015296936035, + "kl": 0.03955078125, + "learning_rate": 1e-06, + "loss": 0.0448, + "num_tokens": 11111320.0, + "reward": -5.62213134765625, + "reward_std": 5.79171895980835, + "rewards/rm_reward_func/mean": -5.62213134765625, + "rewards/rm_reward_func/std": 16.12542152404785, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 377.15625, + "completions/mean_terminated_length": 324.39129638671875, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.568, + "grad_norm": 1.9821372032165527, + "kl": 0.0450439453125, + "learning_rate": 1e-06, + "loss": 0.1217, + "num_tokens": 11125349.0, + "reward": -9.55194091796875, + "reward_std": 3.529968738555908, + "rewards/rm_reward_func/mean": -9.55194091796875, + "rewards/rm_reward_func/std": 7.033637046813965, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 403.625, + "completions/mean_terminated_length": 346.8571472167969, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.5688, + "grad_norm": 1.8035898208618164, + "kl": 0.06439208984375, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 11143873.0, + "reward": -5.576171875, + "reward_std": 6.76420259475708, + "rewards/rm_reward_func/mean": -5.576171875, + "rewards/rm_reward_func/std": 12.331089973449707, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 358.28125, + "completions/mean_terminated_length": 253.1052703857422, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.5696, + "grad_norm": 1.855508804321289, + "kl": 0.0621337890625, + "learning_rate": 1e-06, + "loss": -0.0356, + "num_tokens": 11161154.0, + "reward": 6.67767333984375, + "reward_std": 5.646854400634766, + "rewards/rm_reward_func/mean": 6.67767333984375, + "rewards/rm_reward_func/std": 18.676925659179688, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 363.21875, + "completions/mean_terminated_length": 353.3000183105469, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.5704, + "grad_norm": 1.8120245933532715, + "kl": 0.038299560546875, + "learning_rate": 1e-06, + "loss": -0.0258, + "num_tokens": 11174553.0, + "reward": -3.0521240234375, + "reward_std": 5.34282112121582, + "rewards/rm_reward_func/mean": -3.0521240234375, + "rewards/rm_reward_func/std": 7.3457255363464355, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 398.75, + "completions/mean_terminated_length": 339.4285888671875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.5712, + "grad_norm": 1.9382396936416626, + "kl": 0.07861328125, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 11193913.0, + "reward": -3.8599853515625, + "reward_std": 3.715731620788574, + "rewards/rm_reward_func/mean": -3.8599853515625, + "rewards/rm_reward_func/std": 15.207404136657715, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 279.71875, + "completions/mean_terminated_length": 272.2257995605469, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.572, + "grad_norm": 1.9403306245803833, + "kl": 0.0960693359375, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 11208152.0, + "reward": -1.0338287353515625, + "reward_std": 6.118830680847168, + "rewards/rm_reward_func/mean": -1.0338287353515625, + "rewards/rm_reward_func/std": 9.452960968017578, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 415.8125, + "completions/mean_terminated_length": 341.0, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.5728, + "grad_norm": 1.5466837882995605, + "kl": 0.058074951171875, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 11225906.0, + "reward": -3.4312744140625, + "reward_std": 4.028469085693359, + "rewards/rm_reward_func/mean": -3.4312744140625, + "rewards/rm_reward_func/std": 15.68981647491455, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 244.0625, + "completions/mean_terminated_length": 244.0625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.5736, + "grad_norm": 2.488198757171631, + "kl": 0.06854248046875, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 11239196.0, + "reward": -6.377685546875, + "reward_std": 5.443893909454346, + "rewards/rm_reward_func/mean": -6.377685546875, + "rewards/rm_reward_func/std": 11.648910522460938, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 308.125, + "completions/mean_terminated_length": 270.370361328125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.5744, + "grad_norm": 1.94734525680542, + "kl": 0.0572509765625, + "learning_rate": 1e-06, + "loss": 0.0849, + "num_tokens": 11251384.0, + "reward": -9.083984375, + "reward_std": 4.338766574859619, + "rewards/rm_reward_func/mean": -9.083984375, + "rewards/rm_reward_func/std": 15.627335548400879, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 409.6875, + "completions/mean_terminated_length": 319.4117736816406, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.5752, + "grad_norm": 1.957594633102417, + "kl": 0.03814697265625, + "learning_rate": 1e-06, + "loss": 0.0462, + "num_tokens": 11268574.0, + "reward": 1.68914794921875, + "reward_std": 7.686867713928223, + "rewards/rm_reward_func/mean": 1.68914794921875, + "rewards/rm_reward_func/std": 16.86966896057129, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 233.5, + "completions/mean_terminated_length": 233.5, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.576, + "grad_norm": 4.8920159339904785, + "kl": 0.11688232421875, + "learning_rate": 1e-06, + "loss": 0.0883, + "num_tokens": 11282718.0, + "reward": 0.9250030517578125, + "reward_std": 6.628058910369873, + "rewards/rm_reward_func/mean": 0.9250030517578125, + "rewards/rm_reward_func/std": 13.629246711730957, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 374.59375, + "completions/mean_terminated_length": 320.8260803222656, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.5768, + "grad_norm": 1.8501105308532715, + "kl": 0.095672607421875, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 11298337.0, + "reward": -2.7148971557617188, + "reward_std": 3.236755847930908, + "rewards/rm_reward_func/mean": -2.7148971557617188, + "rewards/rm_reward_func/std": 13.256773948669434, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 375.78125, + "completions/mean_terminated_length": 330.375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.5776, + "grad_norm": 1.883704662322998, + "kl": 0.05047607421875, + "learning_rate": 1e-06, + "loss": -0.0406, + "num_tokens": 11312826.0, + "reward": -0.91534423828125, + "reward_std": 8.073993682861328, + "rewards/rm_reward_func/mean": -0.91534423828125, + "rewards/rm_reward_func/std": 13.965790748596191, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 231.65625, + "completions/mean_terminated_length": 202.65516662597656, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.5784, + "grad_norm": 2.755598306655884, + "kl": 0.09375, + "learning_rate": 1e-06, + "loss": -0.0229, + "num_tokens": 11326015.0, + "reward": 9.4757080078125, + "reward_std": 5.148957252502441, + "rewards/rm_reward_func/mean": 9.4757080078125, + "rewards/rm_reward_func/std": 17.220090866088867, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 463.21875, + "completions/mean_terminated_length": 420.1764831542969, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.5792, + "grad_norm": 1.6228283643722534, + "kl": 0.034698486328125, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 11347278.0, + "reward": 2.3042962551116943, + "reward_std": 4.509110450744629, + "rewards/rm_reward_func/mean": 2.3042962551116943, + "rewards/rm_reward_func/std": 5.8462653160095215, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 326.84375, + "completions/mean_terminated_length": 326.84375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.58, + "grad_norm": 1.9439358711242676, + "kl": 0.04693603515625, + "learning_rate": 1e-06, + "loss": -0.0334, + "num_tokens": 11359593.0, + "reward": -5.938232421875, + "reward_std": 5.530063629150391, + "rewards/rm_reward_func/mean": -5.938232421875, + "rewards/rm_reward_func/std": 10.272322654724121, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 322.28125, + "completions/mean_terminated_length": 259.04168701171875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.5808, + "grad_norm": 2.157841920852661, + "kl": 0.083709716796875, + "learning_rate": 1e-06, + "loss": 0.0441, + "num_tokens": 11377930.0, + "reward": 5.68743896484375, + "reward_std": 5.2516770362854, + "rewards/rm_reward_func/mean": 5.68743896484375, + "rewards/rm_reward_func/std": 13.145316123962402, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 422.90625, + "completions/mean_terminated_length": 344.29412841796875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.5816, + "grad_norm": 1.7282661199569702, + "kl": 0.05499267578125, + "learning_rate": 1e-06, + "loss": 0.0362, + "num_tokens": 11396751.0, + "reward": -1.7255859375, + "reward_std": 5.906103134155273, + "rewards/rm_reward_func/mean": -1.7255859375, + "rewards/rm_reward_func/std": 8.331377983093262, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 432.1875, + "completions/mean_terminated_length": 279.81817626953125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.5824, + "grad_norm": 1.590354084968567, + "kl": 0.0254058837890625, + "learning_rate": 1e-06, + "loss": -0.0187, + "num_tokens": 11414421.0, + "reward": -13.15496826171875, + "reward_std": 4.039255619049072, + "rewards/rm_reward_func/mean": -13.15496826171875, + "rewards/rm_reward_func/std": 5.822081089019775, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 235.75, + "completions/mean_terminated_length": 235.75, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.5832, + "grad_norm": 1.8473788499832153, + "kl": 0.1175537109375, + "learning_rate": 1e-06, + "loss": -0.0196, + "num_tokens": 11429917.0, + "reward": 17.54296875, + "reward_std": 5.92159366607666, + "rewards/rm_reward_func/mean": 17.54296875, + "rewards/rm_reward_func/std": 11.708049774169922, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 303.59375, + "completions/mean_terminated_length": 289.70001220703125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.584, + "grad_norm": 2.0560455322265625, + "kl": 0.06414794921875, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 11444072.0, + "reward": -2.415924072265625, + "reward_std": 7.520061016082764, + "rewards/rm_reward_func/mean": -2.415924072265625, + "rewards/rm_reward_func/std": 10.055807113647461, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 273.53125, + "completions/mean_terminated_length": 248.862060546875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.5848, + "grad_norm": 2.338869094848633, + "kl": 0.077484130859375, + "learning_rate": 1e-06, + "loss": -0.0662, + "num_tokens": 11459553.0, + "reward": 5.96075439453125, + "reward_std": 6.575499057769775, + "rewards/rm_reward_func/mean": 5.96075439453125, + "rewards/rm_reward_func/std": 8.43700122833252, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 343.03125, + "completions/mean_terminated_length": 286.7083435058594, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.5856, + "grad_norm": 2.0642449855804443, + "kl": 0.06201171875, + "learning_rate": 1e-06, + "loss": -0.0491, + "num_tokens": 11475530.0, + "reward": 0.683990478515625, + "reward_std": 8.264944076538086, + "rewards/rm_reward_func/mean": 0.683990478515625, + "rewards/rm_reward_func/std": 13.442095756530762, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 306.21875, + "completions/mean_terminated_length": 306.21875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.5864, + "grad_norm": 1.9789447784423828, + "kl": 0.07464599609375, + "learning_rate": 1e-06, + "loss": -0.0853, + "num_tokens": 11492441.0, + "reward": 5.1513671875, + "reward_std": 4.4709792137146, + "rewards/rm_reward_func/mean": 5.1513671875, + "rewards/rm_reward_func/std": 13.890679359436035, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 334.53125, + "completions/mean_terminated_length": 253.8636474609375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.5872, + "grad_norm": 1.9370701313018799, + "kl": 0.048370361328125, + "learning_rate": 1e-06, + "loss": 0.0431, + "num_tokens": 11505418.0, + "reward": -9.562255859375, + "reward_std": 4.457119941711426, + "rewards/rm_reward_func/mean": -9.562255859375, + "rewards/rm_reward_func/std": 13.220370292663574, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 259.40625, + "completions/mean_terminated_length": 259.40625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.588, + "grad_norm": 2.126023769378662, + "kl": 0.0888671875, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 11519775.0, + "reward": 2.12115478515625, + "reward_std": 7.761030673980713, + "rewards/rm_reward_func/mean": 2.12115478515625, + "rewards/rm_reward_func/std": 8.630084991455078, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 352.59375, + "completions/mean_terminated_length": 290.2174072265625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.5888, + "grad_norm": 1.9931689500808716, + "kl": 0.07501220703125, + "learning_rate": 1e-06, + "loss": 0.0719, + "num_tokens": 11537602.0, + "reward": -9.73712158203125, + "reward_std": 4.026195526123047, + "rewards/rm_reward_func/mean": -9.73712158203125, + "rewards/rm_reward_func/std": 7.206016540527344, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 420.28125, + "completions/mean_terminated_length": 372.23809814453125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.5896, + "grad_norm": 1.844557523727417, + "kl": 0.0496826171875, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 11555403.0, + "reward": -9.93096923828125, + "reward_std": 5.591634750366211, + "rewards/rm_reward_func/mean": -9.93096923828125, + "rewards/rm_reward_func/std": 9.549666404724121, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 344.0625, + "completions/mean_terminated_length": 320.0714416503906, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.5904, + "grad_norm": 2.1637046337127686, + "kl": 0.0584716796875, + "learning_rate": 1e-06, + "loss": -0.0546, + "num_tokens": 11568757.0, + "reward": -3.5477294921875, + "reward_std": 5.942305088043213, + "rewards/rm_reward_func/mean": -3.5477294921875, + "rewards/rm_reward_func/std": 6.813287734985352, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 290.125, + "completions/mean_terminated_length": 282.9677429199219, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.5912, + "grad_norm": 1.968424677848816, + "kl": 0.06671142578125, + "learning_rate": 1e-06, + "loss": 0.0266, + "num_tokens": 11585505.0, + "reward": -1.160400390625, + "reward_std": 5.084662914276123, + "rewards/rm_reward_func/mean": -1.160400390625, + "rewards/rm_reward_func/std": 14.539655685424805, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 433.8125, + "completions/mean_terminated_length": 380.3157958984375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.592, + "grad_norm": 1.6739078760147095, + "kl": 0.034454345703125, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 11602739.0, + "reward": -7.1455535888671875, + "reward_std": 3.9881582260131836, + "rewards/rm_reward_func/mean": -7.1455535888671875, + "rewards/rm_reward_func/std": 14.133525848388672, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 365.15625, + "completions/mean_terminated_length": 337.96295166015625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.5928, + "grad_norm": 1.8537671566009521, + "kl": 0.08612060546875, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 11620864.0, + "reward": -2.640869140625, + "reward_std": 3.5159153938293457, + "rewards/rm_reward_func/mean": -2.640869140625, + "rewards/rm_reward_func/std": 18.093698501586914, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 381.96875, + "completions/mean_terminated_length": 214.7857208251953, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.5936, + "grad_norm": 3.9266419410705566, + "kl": 0.068115234375, + "learning_rate": 1e-06, + "loss": 0.0779, + "num_tokens": 11638935.0, + "reward": -10.34033203125, + "reward_std": 6.511270523071289, + "rewards/rm_reward_func/mean": -10.34033203125, + "rewards/rm_reward_func/std": 11.386975288391113, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 405.8125, + "completions/mean_terminated_length": 370.41668701171875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.5944, + "grad_norm": 1.524894118309021, + "kl": 0.05712890625, + "learning_rate": 1e-06, + "loss": -0.0441, + "num_tokens": 11659441.0, + "reward": -2.652099609375, + "reward_std": 4.48708963394165, + "rewards/rm_reward_func/mean": -2.652099609375, + "rewards/rm_reward_func/std": 19.106359481811523, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 339.21875, + "completions/mean_terminated_length": 314.5357360839844, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.5952, + "grad_norm": 1.934404969215393, + "kl": 0.04644775390625, + "learning_rate": 1e-06, + "loss": -0.081, + "num_tokens": 11674360.0, + "reward": -2.7947998046875, + "reward_std": 6.291441917419434, + "rewards/rm_reward_func/mean": -2.7947998046875, + "rewards/rm_reward_func/std": 8.098343849182129, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 359.875, + "completions/mean_terminated_length": 164.2857208251953, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.596, + "grad_norm": 2.268794059753418, + "kl": 0.04498291015625, + "learning_rate": 1e-06, + "loss": 0.0663, + "num_tokens": 11691436.0, + "reward": -7.95465087890625, + "reward_std": 3.1400527954101562, + "rewards/rm_reward_func/mean": -7.95465087890625, + "rewards/rm_reward_func/std": 7.613680839538574, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 289.40625, + "completions/mean_terminated_length": 289.40625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.5968, + "grad_norm": 1.9559932947158813, + "kl": 0.082275390625, + "learning_rate": 1e-06, + "loss": -0.0782, + "num_tokens": 11704529.0, + "reward": 20.318115234375, + "reward_std": 7.018957614898682, + "rewards/rm_reward_func/mean": 20.318115234375, + "rewards/rm_reward_func/std": 22.436296463012695, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 334.6875, + "completions/mean_terminated_length": 275.5833435058594, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.5976, + "grad_norm": 1.891306757926941, + "kl": 0.0604248046875, + "learning_rate": 1e-06, + "loss": -0.0533, + "num_tokens": 11719319.0, + "reward": -6.52178955078125, + "reward_std": 4.2834343910217285, + "rewards/rm_reward_func/mean": -6.52178955078125, + "rewards/rm_reward_func/std": 8.81009578704834, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 304.71875, + "completions/mean_terminated_length": 235.625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.5984, + "grad_norm": 1.5848143100738525, + "kl": 0.055938720703125, + "learning_rate": 1e-06, + "loss": -0.0355, + "num_tokens": 11736710.0, + "reward": 2.39227294921875, + "reward_std": 4.494058609008789, + "rewards/rm_reward_func/mean": 2.39227294921875, + "rewards/rm_reward_func/std": 18.010425567626953, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 338.28125, + "completions/mean_terminated_length": 306.1111145019531, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.5992, + "grad_norm": 1.9604521989822388, + "kl": 0.04913330078125, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 11749719.0, + "reward": -13.73291015625, + "reward_std": 3.1303107738494873, + "rewards/rm_reward_func/mean": -13.73291015625, + "rewards/rm_reward_func/std": 3.9044137001037598, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 364.65625, + "completions/mean_terminated_length": 287.4761962890625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.6, + "grad_norm": 1.7330660820007324, + "kl": 0.08404541015625, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 11769796.0, + "reward": 4.3505859375, + "reward_std": 4.162467002868652, + "rewards/rm_reward_func/mean": 4.3505859375, + "rewards/rm_reward_func/std": 20.066606521606445, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 342.3125, + "completions/mean_terminated_length": 303.15386962890625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.6008, + "grad_norm": 1.8137338161468506, + "kl": 0.068206787109375, + "learning_rate": 1e-06, + "loss": -0.0192, + "num_tokens": 11785966.0, + "reward": 1.564208984375, + "reward_std": 8.127765655517578, + "rewards/rm_reward_func/mean": 1.564208984375, + "rewards/rm_reward_func/std": 13.421592712402344, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 306.0625, + "completions/mean_terminated_length": 306.0625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.6016, + "grad_norm": 1.8806535005569458, + "kl": 0.06890869140625, + "learning_rate": 1e-06, + "loss": -0.0413, + "num_tokens": 11802680.0, + "reward": 12.3533935546875, + "reward_std": 5.532873153686523, + "rewards/rm_reward_func/mean": 12.3533935546875, + "rewards/rm_reward_func/std": 19.8289737701416, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 375.53125, + "completions/mean_terminated_length": 313.5, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.6024, + "grad_norm": 1.8858489990234375, + "kl": 0.053680419921875, + "learning_rate": 1e-06, + "loss": 0.027, + "num_tokens": 11821465.0, + "reward": -2.0836715698242188, + "reward_std": 3.355863571166992, + "rewards/rm_reward_func/mean": -2.0836715698242188, + "rewards/rm_reward_func/std": 18.135364532470703, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 382.71875, + "completions/mean_terminated_length": 282.1666564941406, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.6032, + "grad_norm": 1.997640609741211, + "kl": 0.04864501953125, + "learning_rate": 1e-06, + "loss": 0.1838, + "num_tokens": 11840544.0, + "reward": -13.791015625, + "reward_std": 5.417515754699707, + "rewards/rm_reward_func/mean": -13.791015625, + "rewards/rm_reward_func/std": 7.608242511749268, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 398.3125, + "completions/mean_terminated_length": 298.0, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.604, + "grad_norm": 2.1502175331115723, + "kl": 0.046142578125, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 11854802.0, + "reward": -10.880874633789062, + "reward_std": 6.548459529876709, + "rewards/rm_reward_func/mean": -10.880874633789062, + "rewards/rm_reward_func/std": 10.829429626464844, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 358.53125, + "completions/mean_terminated_length": 205.0625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.6048, + "grad_norm": 2.2914505004882812, + "kl": 0.07757568359375, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 11871507.0, + "reward": -1.798553466796875, + "reward_std": 5.568508148193359, + "rewards/rm_reward_func/mean": -1.798553466796875, + "rewards/rm_reward_func/std": 10.702640533447266, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 322.125, + "completions/mean_terminated_length": 286.96295166015625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.6056, + "grad_norm": 2.2829623222351074, + "kl": 0.08258056640625, + "learning_rate": 1e-06, + "loss": 0.1491, + "num_tokens": 11887023.0, + "reward": -0.588653564453125, + "reward_std": 6.76053524017334, + "rewards/rm_reward_func/mean": -0.588653564453125, + "rewards/rm_reward_func/std": 7.1149115562438965, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 421.8125, + "completions/mean_terminated_length": 380.8182067871094, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.6064, + "grad_norm": 1.6099178791046143, + "kl": 0.03948974609375, + "learning_rate": 1e-06, + "loss": 0.0484, + "num_tokens": 11908393.0, + "reward": 4.95294189453125, + "reward_std": 6.641022205352783, + "rewards/rm_reward_func/mean": 4.95294189453125, + "rewards/rm_reward_func/std": 14.447596549987793, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 391.21875, + "completions/mean_terminated_length": 318.75, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.6072, + "grad_norm": 1.9315482378005981, + "kl": 0.05780029296875, + "learning_rate": 1e-06, + "loss": 0.1275, + "num_tokens": 11927624.0, + "reward": -8.3724365234375, + "reward_std": 4.914219856262207, + "rewards/rm_reward_func/mean": -8.3724365234375, + "rewards/rm_reward_func/std": 8.61403751373291, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 383.625, + "completions/mean_terminated_length": 325.2727355957031, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.608, + "grad_norm": 1.6972637176513672, + "kl": 0.06524658203125, + "learning_rate": 1e-06, + "loss": 0.0518, + "num_tokens": 11947100.0, + "reward": -1.30072021484375, + "reward_std": 5.248788356781006, + "rewards/rm_reward_func/mean": -1.30072021484375, + "rewards/rm_reward_func/std": 17.929773330688477, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 341.75, + "completions/mean_terminated_length": 310.22222900390625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.6088, + "grad_norm": 1.6799328327178955, + "kl": 0.04168701171875, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 11961676.0, + "reward": -5.9814453125, + "reward_std": 8.520503044128418, + "rewards/rm_reward_func/mean": -5.9814453125, + "rewards/rm_reward_func/std": 11.795683860778809, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 308.59375, + "completions/mean_terminated_length": 229.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.6096, + "grad_norm": 2.811432361602783, + "kl": 0.11236572265625, + "learning_rate": 1e-06, + "loss": 0.0997, + "num_tokens": 11977231.0, + "reward": 3.2220458984375, + "reward_std": 4.2552971839904785, + "rewards/rm_reward_func/mean": 3.2220458984375, + "rewards/rm_reward_func/std": 7.499974727630615, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 425.65625, + "completions/mean_terminated_length": 366.5789489746094, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.6104, + "grad_norm": 2.015578269958496, + "kl": 0.046539306640625, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 11992524.0, + "reward": -7.05584716796875, + "reward_std": 5.1823883056640625, + "rewards/rm_reward_func/mean": -7.05584716796875, + "rewards/rm_reward_func/std": 7.240790843963623, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 397.4375, + "completions/mean_terminated_length": 359.25, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.6112, + "grad_norm": 1.821246862411499, + "kl": 0.0413818359375, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 12006898.0, + "reward": -4.796356201171875, + "reward_std": 4.1045050621032715, + "rewards/rm_reward_func/mean": -4.796356201171875, + "rewards/rm_reward_func/std": 6.808689594268799, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 426.28125, + "completions/mean_terminated_length": 367.631591796875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.612, + "grad_norm": 1.9066646099090576, + "kl": 0.0494384765625, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 12022507.0, + "reward": -0.5022773742675781, + "reward_std": 5.278798580169678, + "rewards/rm_reward_func/mean": -0.5022773742675781, + "rewards/rm_reward_func/std": 12.592911720275879, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 274.71875, + "completions/mean_terminated_length": 267.06451416015625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.6128, + "grad_norm": 1.8977649211883545, + "kl": 0.09136962890625, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 12037978.0, + "reward": 12.126953125, + "reward_std": 3.822523593902588, + "rewards/rm_reward_func/mean": 12.126953125, + "rewards/rm_reward_func/std": 18.096817016601562, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 312.71875, + "completions/mean_terminated_length": 246.2916717529297, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.6136, + "grad_norm": 2.0742993354797363, + "kl": 0.0810546875, + "learning_rate": 1e-06, + "loss": -0.0771, + "num_tokens": 12055337.0, + "reward": -4.2760009765625, + "reward_std": 5.665755271911621, + "rewards/rm_reward_func/mean": -4.2760009765625, + "rewards/rm_reward_func/std": 12.590758323669434, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 405.90625, + "completions/mean_terminated_length": 333.3157958984375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.6144, + "grad_norm": 2.0644466876983643, + "kl": 0.06591796875, + "learning_rate": 1e-06, + "loss": 0.0331, + "num_tokens": 12070838.0, + "reward": -12.09429931640625, + "reward_std": 4.735949516296387, + "rewards/rm_reward_func/mean": -12.09429931640625, + "rewards/rm_reward_func/std": 8.851462364196777, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 238.28125, + "completions/mean_terminated_length": 238.28125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.6152, + "grad_norm": 1.7941094636917114, + "kl": 0.105224609375, + "learning_rate": 1e-06, + "loss": -0.0451, + "num_tokens": 12088151.0, + "reward": 6.8297119140625, + "reward_std": 5.0082902908325195, + "rewards/rm_reward_func/mean": 6.8297119140625, + "rewards/rm_reward_func/std": 16.496559143066406, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 277.8125, + "completions/mean_terminated_length": 212.239990234375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.616, + "grad_norm": 3.0189247131347656, + "kl": 0.10015869140625, + "learning_rate": 1e-06, + "loss": 0.0815, + "num_tokens": 12103929.0, + "reward": -3.9696044921875, + "reward_std": 5.077627182006836, + "rewards/rm_reward_func/mean": -3.9696044921875, + "rewards/rm_reward_func/std": 20.89516258239746, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 287.6875, + "completions/mean_terminated_length": 264.4827575683594, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.6168, + "grad_norm": 3.370480537414551, + "kl": 0.10546875, + "learning_rate": 1e-06, + "loss": -0.0845, + "num_tokens": 12123959.0, + "reward": 5.3612060546875, + "reward_std": 4.751652717590332, + "rewards/rm_reward_func/mean": 5.3612060546875, + "rewards/rm_reward_func/std": 14.20404052734375, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 494.625, + "completions/mean_terminated_length": 442.5, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.6176, + "grad_norm": 1.646755337715149, + "kl": 0.03765869140625, + "learning_rate": 1e-06, + "loss": 0.0453, + "num_tokens": 12143299.0, + "reward": -1.1861495971679688, + "reward_std": 4.998536109924316, + "rewards/rm_reward_func/mean": -1.1861495971679688, + "rewards/rm_reward_func/std": 13.400155067443848, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 227.15625, + "completions/mean_terminated_length": 227.15625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.6184, + "grad_norm": 2.396883964538574, + "kl": 0.0836181640625, + "learning_rate": 1e-06, + "loss": -0.0373, + "num_tokens": 12158456.0, + "reward": -0.018768310546875, + "reward_std": 6.481961727142334, + "rewards/rm_reward_func/mean": -0.018768310546875, + "rewards/rm_reward_func/std": 8.474701881408691, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 361.53125, + "completions/mean_terminated_length": 211.0625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.6192, + "grad_norm": 2.1148874759674072, + "kl": 0.0638427734375, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 12172529.0, + "reward": -1.5374755859375, + "reward_std": 4.501974105834961, + "rewards/rm_reward_func/mean": -1.5374755859375, + "rewards/rm_reward_func/std": 8.106304168701172, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 466.59375, + "completions/mean_terminated_length": 439.3500061035156, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.62, + "grad_norm": 1.544356346130371, + "kl": 0.03753662109375, + "learning_rate": 1e-06, + "loss": 0.0159, + "num_tokens": 12190948.0, + "reward": -5.691650390625, + "reward_std": 5.6976447105407715, + "rewards/rm_reward_func/mean": -5.691650390625, + "rewards/rm_reward_func/std": 7.072537422180176, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 415.96875, + "completions/mean_terminated_length": 372.3182067871094, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6208, + "grad_norm": 1.6409132480621338, + "kl": 0.05694580078125, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 12208691.0, + "reward": -0.6231689453125, + "reward_std": 4.408117294311523, + "rewards/rm_reward_func/mean": -0.6231689453125, + "rewards/rm_reward_func/std": 16.75998878479004, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 391.34375, + "completions/mean_terminated_length": 357.55999755859375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.6216, + "grad_norm": 1.5849069356918335, + "kl": 0.038330078125, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 12226158.0, + "reward": 7.8115234375, + "reward_std": 6.073317527770996, + "rewards/rm_reward_func/mean": 7.8115234375, + "rewards/rm_reward_func/std": 13.412633895874023, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 358.28125, + "completions/mean_terminated_length": 336.3214416503906, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.6224, + "grad_norm": 1.9297345876693726, + "kl": 0.073486328125, + "learning_rate": 1e-06, + "loss": 0.0466, + "num_tokens": 12241999.0, + "reward": 1.650665283203125, + "reward_std": 5.406838417053223, + "rewards/rm_reward_func/mean": 1.650665283203125, + "rewards/rm_reward_func/std": 10.137090682983398, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 395.09375, + "completions/mean_terminated_length": 291.9411926269531, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.6232, + "grad_norm": 1.4971743822097778, + "kl": 0.065826416015625, + "learning_rate": 1e-06, + "loss": 0.054, + "num_tokens": 12259298.0, + "reward": -0.95458984375, + "reward_std": 5.878915786743164, + "rewards/rm_reward_func/mean": -0.95458984375, + "rewards/rm_reward_func/std": 9.596146583557129, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 348.09375, + "completions/mean_terminated_length": 324.6785888671875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.624, + "grad_norm": 1.6809792518615723, + "kl": 0.07177734375, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 12276445.0, + "reward": 14.17633056640625, + "reward_std": 4.167543411254883, + "rewards/rm_reward_func/mean": 14.17633056640625, + "rewards/rm_reward_func/std": 18.308982849121094, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 367.71875, + "completions/mean_terminated_length": 327.32000732421875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6248, + "grad_norm": 1.9638782739639282, + "kl": 0.05645751953125, + "learning_rate": 1e-06, + "loss": 0.0835, + "num_tokens": 12295484.0, + "reward": 0.5545654296875, + "reward_std": 7.807260036468506, + "rewards/rm_reward_func/mean": 0.5545654296875, + "rewards/rm_reward_func/std": 14.969010353088379, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 387.28125, + "completions/mean_terminated_length": 338.478271484375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.6256, + "grad_norm": 1.5887730121612549, + "kl": 0.039093017578125, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 12313677.0, + "reward": -4.2950439453125, + "reward_std": 8.64499282836914, + "rewards/rm_reward_func/mean": -4.2950439453125, + "rewards/rm_reward_func/std": 14.152074813842773, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 399.40625, + "completions/mean_terminated_length": 361.875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.6264, + "grad_norm": 1.7321784496307373, + "kl": 0.041015625, + "learning_rate": 1e-06, + "loss": -0.0475, + "num_tokens": 12331858.0, + "reward": -3.74432373046875, + "reward_std": 3.1944923400878906, + "rewards/rm_reward_func/mean": -3.74432373046875, + "rewards/rm_reward_func/std": 7.2627387046813965, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 240.6875, + "completions/mean_terminated_length": 240.6875, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.6272, + "grad_norm": 2.683408498764038, + "kl": 0.0968017578125, + "learning_rate": 1e-06, + "loss": 0.0857, + "num_tokens": 12345144.0, + "reward": 3.4385986328125, + "reward_std": 5.330440521240234, + "rewards/rm_reward_func/mean": 3.4385986328125, + "rewards/rm_reward_func/std": 12.799178123474121, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 320.3125, + "completions/mean_terminated_length": 300.4827575683594, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.628, + "grad_norm": 2.0897364616394043, + "kl": 0.077056884765625, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 12359538.0, + "reward": -8.53704833984375, + "reward_std": 4.623435974121094, + "rewards/rm_reward_func/mean": -8.53704833984375, + "rewards/rm_reward_func/std": 8.657999992370605, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 430.75, + "completions/mean_terminated_length": 375.15789794921875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.6288, + "grad_norm": 1.3711316585540771, + "kl": 0.034210205078125, + "learning_rate": 1e-06, + "loss": 0.085, + "num_tokens": 12381802.0, + "reward": -5.14404296875, + "reward_std": 6.813742637634277, + "rewards/rm_reward_func/mean": -5.14404296875, + "rewards/rm_reward_func/std": 8.147368431091309, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 284.1875, + "completions/mean_terminated_length": 220.39999389648438, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.6296, + "grad_norm": 2.205284833908081, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": -0.0249, + "num_tokens": 12401768.0, + "reward": 5.022705078125, + "reward_std": 5.365182399749756, + "rewards/rm_reward_func/mean": 5.022705078125, + "rewards/rm_reward_func/std": 18.661264419555664, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 322.75, + "completions/mean_terminated_length": 236.72727966308594, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.6304, + "grad_norm": 2.795766830444336, + "kl": 0.09356689453125, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 12418208.0, + "reward": -2.3320236206054688, + "reward_std": 3.7200515270233154, + "rewards/rm_reward_func/mean": -2.3320236206054688, + "rewards/rm_reward_func/std": 16.78501319885254, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 341.8125, + "completions/mean_terminated_length": 275.2174072265625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.6312, + "grad_norm": 1.7020301818847656, + "kl": 0.062255859375, + "learning_rate": 1e-06, + "loss": 0.1408, + "num_tokens": 12436378.0, + "reward": -8.028976440429688, + "reward_std": 5.496880531311035, + "rewards/rm_reward_func/mean": -8.028976440429688, + "rewards/rm_reward_func/std": 15.035484313964844, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 455.21875, + "completions/mean_terminated_length": 372.23077392578125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.632, + "grad_norm": 1.4828109741210938, + "kl": 0.033721923828125, + "learning_rate": 1e-06, + "loss": -0.0187, + "num_tokens": 12457433.0, + "reward": -7.8040771484375, + "reward_std": 3.3573057651519775, + "rewards/rm_reward_func/mean": -7.8040771484375, + "rewards/rm_reward_func/std": 15.83621597290039, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 447.71875, + "completions/mean_terminated_length": 403.7368469238281, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.6328, + "grad_norm": 1.64700448513031, + "kl": 0.040802001953125, + "learning_rate": 1e-06, + "loss": -0.0363, + "num_tokens": 12474320.0, + "reward": -8.177734375, + "reward_std": 3.934264659881592, + "rewards/rm_reward_func/mean": -8.177734375, + "rewards/rm_reward_func/std": 9.670948028564453, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.71875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 491.6875, + "completions/mean_terminated_length": 439.77777099609375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.6336, + "grad_norm": 1.4515008926391602, + "kl": 0.0294189453125, + "learning_rate": 1e-06, + "loss": 0.0439, + "num_tokens": 12494902.0, + "reward": -0.952423095703125, + "reward_std": 4.107771873474121, + "rewards/rm_reward_func/mean": -0.952423095703125, + "rewards/rm_reward_func/std": 7.56387186050415, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 273.375, + "completions/mean_terminated_length": 239.2857208251953, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.6344, + "grad_norm": 1.9918630123138428, + "kl": 0.06610107421875, + "learning_rate": 1e-06, + "loss": 0.0224, + "num_tokens": 12509402.0, + "reward": -0.4385986328125, + "reward_std": 4.984426021575928, + "rewards/rm_reward_func/mean": -0.4385986328125, + "rewards/rm_reward_func/std": 15.97726821899414, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 402.15625, + "completions/mean_terminated_length": 344.6190490722656, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.6352, + "grad_norm": 1.5443204641342163, + "kl": 0.05877685546875, + "learning_rate": 1e-06, + "loss": 0.0596, + "num_tokens": 12530607.0, + "reward": -4.820343017578125, + "reward_std": 7.136542320251465, + "rewards/rm_reward_func/mean": -4.820343017578125, + "rewards/rm_reward_func/std": 16.648509979248047, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 448.15625, + "completions/mean_terminated_length": 366.0714416503906, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.636, + "grad_norm": 1.548814296722412, + "kl": 0.0350494384765625, + "learning_rate": 1e-06, + "loss": -0.0274, + "num_tokens": 12549332.0, + "reward": -8.04266357421875, + "reward_std": 4.036449909210205, + "rewards/rm_reward_func/mean": -8.04266357421875, + "rewards/rm_reward_func/std": 11.5554780960083, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 434.03125, + "completions/mean_terminated_length": 356.0625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.6368, + "grad_norm": 1.6055235862731934, + "kl": 0.043853759765625, + "learning_rate": 1e-06, + "loss": 0.0363, + "num_tokens": 12571789.0, + "reward": -2.919189453125, + "reward_std": 5.587007522583008, + "rewards/rm_reward_func/mean": -2.919189453125, + "rewards/rm_reward_func/std": 17.120798110961914, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 313.59375, + "completions/mean_terminated_length": 276.85186767578125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.6376, + "grad_norm": 2.0004870891571045, + "kl": 0.05963134765625, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 12585328.0, + "reward": 2.49713134765625, + "reward_std": 7.5787811279296875, + "rewards/rm_reward_func/mean": 2.49713134765625, + "rewards/rm_reward_func/std": 9.367626190185547, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 369.625, + "completions/mean_terminated_length": 272.2105407714844, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.6384, + "grad_norm": 2.4848029613494873, + "kl": 0.093994140625, + "learning_rate": 1e-06, + "loss": -0.0331, + "num_tokens": 12599396.0, + "reward": -1.3898849487304688, + "reward_std": 6.054429054260254, + "rewards/rm_reward_func/mean": -1.3898849487304688, + "rewards/rm_reward_func/std": 5.980787754058838, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 358.4375, + "completions/mean_terminated_length": 323.0, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.6392, + "grad_norm": 1.7730456590652466, + "kl": 0.066162109375, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 12617842.0, + "reward": 10.45037841796875, + "reward_std": 5.435694217681885, + "rewards/rm_reward_func/mean": 10.45037841796875, + "rewards/rm_reward_func/std": 14.936053276062012, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 287.84375, + "completions/mean_terminated_length": 287.84375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.64, + "grad_norm": 1.8582780361175537, + "kl": 0.0697021484375, + "learning_rate": 1e-06, + "loss": 0.0378, + "num_tokens": 12632557.0, + "reward": 14.749267578125, + "reward_std": 6.529238224029541, + "rewards/rm_reward_func/mean": 14.749267578125, + "rewards/rm_reward_func/std": 19.619064331054688, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 436.625, + "completions/mean_terminated_length": 378.0, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.6408, + "grad_norm": 1.56951904296875, + "kl": 0.0579833984375, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 12656697.0, + "reward": 4.82232666015625, + "reward_std": 6.457304000854492, + "rewards/rm_reward_func/mean": 4.82232666015625, + "rewards/rm_reward_func/std": 20.790550231933594, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 390.375, + "completions/mean_terminated_length": 326.66668701171875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.6416, + "grad_norm": 1.9639281034469604, + "kl": 0.04437255859375, + "learning_rate": 1e-06, + "loss": -0.0373, + "num_tokens": 12673805.0, + "reward": -8.7774658203125, + "reward_std": 4.181488037109375, + "rewards/rm_reward_func/mean": -8.7774658203125, + "rewards/rm_reward_func/std": 10.957117080688477, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 349.875, + "completions/mean_terminated_length": 319.85186767578125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.6424, + "grad_norm": 1.831272006034851, + "kl": 0.06842041015625, + "learning_rate": 1e-06, + "loss": 0.0416, + "num_tokens": 12690297.0, + "reward": -0.3900146484375, + "reward_std": 4.239322662353516, + "rewards/rm_reward_func/mean": -0.3900146484375, + "rewards/rm_reward_func/std": 15.369624137878418, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 471.21875, + "completions/mean_terminated_length": 452.68182373046875, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.6432, + "grad_norm": 1.6480557918548584, + "kl": 0.0411376953125, + "learning_rate": 1e-06, + "loss": -0.0353, + "num_tokens": 12708024.0, + "reward": -4.619140625, + "reward_std": 4.437232494354248, + "rewards/rm_reward_func/mean": -4.619140625, + "rewards/rm_reward_func/std": 9.676486015319824, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 405.25, + "completions/mean_terminated_length": 341.20001220703125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.644, + "grad_norm": 1.6141996383666992, + "kl": 0.076416015625, + "learning_rate": 1e-06, + "loss": -0.0278, + "num_tokens": 12730712.0, + "reward": -12.363037109375, + "reward_std": 4.492403984069824, + "rewards/rm_reward_func/mean": -12.363037109375, + "rewards/rm_reward_func/std": 8.396431922912598, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 458.4375, + "completions/mean_terminated_length": 411.1764831542969, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.6448, + "grad_norm": 1.5772229433059692, + "kl": 0.04840087890625, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 12752470.0, + "reward": 1.596435546875, + "reward_std": 7.263904094696045, + "rewards/rm_reward_func/mean": 1.596435546875, + "rewards/rm_reward_func/std": 21.033967971801758, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 355.125, + "completions/mean_terminated_length": 293.7391357421875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.6456, + "grad_norm": 1.9470046758651733, + "kl": 0.0462646484375, + "learning_rate": 1e-06, + "loss": 0.0343, + "num_tokens": 12765826.0, + "reward": -14.298828125, + "reward_std": 4.7508625984191895, + "rewards/rm_reward_func/mean": -14.298828125, + "rewards/rm_reward_func/std": 7.956632137298584, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 394.84375, + "completions/mean_terminated_length": 324.5500183105469, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.6464, + "grad_norm": 1.719713568687439, + "kl": 0.056640625, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 12785125.0, + "reward": 4.44091796875, + "reward_std": 4.730961799621582, + "rewards/rm_reward_func/mean": 4.44091796875, + "rewards/rm_reward_func/std": 14.091428756713867, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 266.5714416503906, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.6472, + "grad_norm": 2.153022050857544, + "kl": 0.09246826171875, + "learning_rate": 1e-06, + "loss": 0.0746, + "num_tokens": 12800181.0, + "reward": 1.770751953125, + "reward_std": 7.2877373695373535, + "rewards/rm_reward_func/mean": 1.770751953125, + "rewards/rm_reward_func/std": 16.556913375854492, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 294.5625, + "completions/mean_terminated_length": 222.08334350585938, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.648, + "grad_norm": 2.431635618209839, + "kl": 0.07586669921875, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 12811711.0, + "reward": -3.408447265625, + "reward_std": 6.304819107055664, + "rewards/rm_reward_func/mean": -3.408447265625, + "rewards/rm_reward_func/std": 7.7825493812561035, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 410.5625, + "completions/mean_terminated_length": 341.15789794921875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.6488, + "grad_norm": 1.7024002075195312, + "kl": 0.06878662109375, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 12831137.0, + "reward": 1.285888671875, + "reward_std": 5.4614057540893555, + "rewards/rm_reward_func/mean": 1.285888671875, + "rewards/rm_reward_func/std": 12.91396427154541, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 401.03125, + "completions/mean_terminated_length": 334.45001220703125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.6496, + "grad_norm": 1.7982466220855713, + "kl": 0.047576904296875, + "learning_rate": 1e-06, + "loss": 0.0365, + "num_tokens": 12846410.0, + "reward": -3.569000244140625, + "reward_std": 4.652016639709473, + "rewards/rm_reward_func/mean": -3.569000244140625, + "rewards/rm_reward_func/std": 5.947784423828125, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 262.03125, + "completions/mean_terminated_length": 262.03125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.6504, + "grad_norm": 1.7034178972244263, + "kl": 0.092041015625, + "learning_rate": 1e-06, + "loss": -0.0485, + "num_tokens": 12866459.0, + "reward": 8.5050048828125, + "reward_std": 4.619973659515381, + "rewards/rm_reward_func/mean": 8.5050048828125, + "rewards/rm_reward_func/std": 14.52647876739502, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 397.0, + "completions/mean_terminated_length": 370.4615478515625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.6512, + "grad_norm": 1.6072787046432495, + "kl": 0.0355224609375, + "learning_rate": 1e-06, + "loss": 0.0446, + "num_tokens": 12881219.0, + "reward": -2.746826171875, + "reward_std": 7.377750396728516, + "rewards/rm_reward_func/mean": -2.746826171875, + "rewards/rm_reward_func/std": 8.113428115844727, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 371.8125, + "completions/mean_terminated_length": 316.9565124511719, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.652, + "grad_norm": 1.9066165685653687, + "kl": 0.0594482421875, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 12899589.0, + "reward": -9.53515625, + "reward_std": 4.386843204498291, + "rewards/rm_reward_func/mean": -9.53515625, + "rewards/rm_reward_func/std": 7.112394332885742, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 302.78125, + "completions/mean_terminated_length": 296.0322570800781, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.6528, + "grad_norm": 2.1296980381011963, + "kl": 0.0765380859375, + "learning_rate": 1e-06, + "loss": -0.0655, + "num_tokens": 12914702.0, + "reward": -6.715087890625, + "reward_std": 3.347726345062256, + "rewards/rm_reward_func/mean": -6.715087890625, + "rewards/rm_reward_func/std": 15.396578788757324, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 413.46875, + "completions/mean_terminated_length": 385.8800048828125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.6536, + "grad_norm": 1.7054963111877441, + "kl": 0.03607177734375, + "learning_rate": 1e-06, + "loss": -0.0602, + "num_tokens": 12929837.0, + "reward": 5.032958984375, + "reward_std": 6.780797958374023, + "rewards/rm_reward_func/mean": 5.032958984375, + "rewards/rm_reward_func/std": 9.962952613830566, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 393.59375, + "completions/mean_terminated_length": 339.7727355957031, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.6544, + "grad_norm": 1.6988518238067627, + "kl": 0.078643798828125, + "learning_rate": 1e-06, + "loss": 0.0612, + "num_tokens": 12949736.0, + "reward": 2.55517578125, + "reward_std": 3.879378318786621, + "rewards/rm_reward_func/mean": 2.55517578125, + "rewards/rm_reward_func/std": 16.611738204956055, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 368.09375, + "completions/mean_terminated_length": 334.8846130371094, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.6552, + "grad_norm": 1.9187781810760498, + "kl": 0.0479736328125, + "learning_rate": 1e-06, + "loss": -0.011, + "num_tokens": 12965403.0, + "reward": -2.953369140625, + "reward_std": 3.9390182495117188, + "rewards/rm_reward_func/mean": -2.953369140625, + "rewards/rm_reward_func/std": 7.869694232940674, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 433.6875, + "completions/mean_terminated_length": 364.5882263183594, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.656, + "grad_norm": 1.587661623954773, + "kl": 0.03997802734375, + "learning_rate": 1e-06, + "loss": 0.1318, + "num_tokens": 12981649.0, + "reward": -13.14111328125, + "reward_std": 5.038619041442871, + "rewards/rm_reward_func/mean": -13.14111328125, + "rewards/rm_reward_func/std": 9.344220161437988, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 318.65625, + "completions/mean_terminated_length": 254.20834350585938, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.6568, + "grad_norm": 2.144728899002075, + "kl": 0.0738525390625, + "learning_rate": 1e-06, + "loss": 0.0475, + "num_tokens": 12994494.0, + "reward": -6.48553466796875, + "reward_std": 7.326076507568359, + "rewards/rm_reward_func/mean": -6.48553466796875, + "rewards/rm_reward_func/std": 11.509013175964355, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.65625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 449.0625, + "completions/mean_terminated_length": 328.9090881347656, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.6576, + "grad_norm": 1.703092098236084, + "kl": 0.041839599609375, + "learning_rate": 1e-06, + "loss": -0.0613, + "num_tokens": 13013280.0, + "reward": -11.07568359375, + "reward_std": 4.9624223709106445, + "rewards/rm_reward_func/mean": -11.07568359375, + "rewards/rm_reward_func/std": 6.441583633422852, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 297.84375, + "completions/mean_terminated_length": 283.5666809082031, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.6584, + "grad_norm": 2.1857736110687256, + "kl": 0.06390380859375, + "learning_rate": 1e-06, + "loss": 0.0965, + "num_tokens": 13026371.0, + "reward": -13.30078125, + "reward_std": 2.7921879291534424, + "rewards/rm_reward_func/mean": -13.30078125, + "rewards/rm_reward_func/std": 3.936753749847412, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 332.625, + "completions/mean_terminated_length": 272.8333435058594, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.6592, + "grad_norm": 1.9702630043029785, + "kl": 0.08807373046875, + "learning_rate": 1e-06, + "loss": 0.0405, + "num_tokens": 13045439.0, + "reward": 9.664947509765625, + "reward_std": 4.583461761474609, + "rewards/rm_reward_func/mean": 9.664947509765625, + "rewards/rm_reward_func/std": 17.729219436645508, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 397.28125, + "completions/mean_terminated_length": 328.45001220703125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.66, + "grad_norm": 1.7395524978637695, + "kl": 0.060302734375, + "learning_rate": 1e-06, + "loss": 0.1223, + "num_tokens": 13065040.0, + "reward": 0.3260498046875, + "reward_std": 6.538402557373047, + "rewards/rm_reward_func/mean": 0.3260498046875, + "rewards/rm_reward_func/std": 10.285395622253418, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 395.46875, + "completions/mean_terminated_length": 278.9375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.6608, + "grad_norm": 1.7947407960891724, + "kl": 0.04083251953125, + "learning_rate": 1e-06, + "loss": -0.0432, + "num_tokens": 13083071.0, + "reward": -3.751678466796875, + "reward_std": 5.6214399337768555, + "rewards/rm_reward_func/mean": -3.751678466796875, + "rewards/rm_reward_func/std": 8.282191276550293, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 450.5625, + "completions/mean_terminated_length": 380.933349609375, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.6616, + "grad_norm": 1.5302925109863281, + "kl": 0.052032470703125, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 13103521.0, + "reward": 0.5224609375, + "reward_std": 5.373813152313232, + "rewards/rm_reward_func/mean": 0.5224609375, + "rewards/rm_reward_func/std": 18.18515396118164, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 353.375, + "completions/mean_terminated_length": 348.258056640625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.6624, + "grad_norm": 1.8980263471603394, + "kl": 0.05084228515625, + "learning_rate": 1e-06, + "loss": -0.0529, + "num_tokens": 13121709.0, + "reward": -2.9224853515625, + "reward_std": 4.43738317489624, + "rewards/rm_reward_func/mean": -2.9224853515625, + "rewards/rm_reward_func/std": 7.120941162109375, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 307.4375, + "completions/mean_terminated_length": 293.8000183105469, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.6632, + "grad_norm": 1.7767317295074463, + "kl": 0.07769775390625, + "learning_rate": 1e-06, + "loss": -0.1117, + "num_tokens": 13135547.0, + "reward": -4.795166015625, + "reward_std": 5.211917400360107, + "rewards/rm_reward_func/mean": -4.795166015625, + "rewards/rm_reward_func/std": 11.255773544311523, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 300.9375, + "completions/mean_terminated_length": 286.8666687011719, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.664, + "grad_norm": 3.0181024074554443, + "kl": 0.12225341796875, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 13147057.0, + "reward": -8.66241455078125, + "reward_std": 5.182307243347168, + "rewards/rm_reward_func/mean": -8.66241455078125, + "rewards/rm_reward_func/std": 8.982633590698242, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 386.0625, + "completions/mean_terminated_length": 336.7826232910156, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.6648, + "grad_norm": 1.4912856817245483, + "kl": 0.045501708984375, + "learning_rate": 1e-06, + "loss": -0.0219, + "num_tokens": 13165771.0, + "reward": -6.714019775390625, + "reward_std": 7.642567157745361, + "rewards/rm_reward_func/mean": -6.714019775390625, + "rewards/rm_reward_func/std": 8.166253089904785, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 335.75, + "completions/mean_terminated_length": 303.1111145019531, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.6656, + "grad_norm": 1.8842875957489014, + "kl": 0.058746337890625, + "learning_rate": 1e-06, + "loss": -0.0357, + "num_tokens": 13181659.0, + "reward": 3.322265625, + "reward_std": 5.375673294067383, + "rewards/rm_reward_func/mean": 3.322265625, + "rewards/rm_reward_func/std": 20.12171173095703, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 338.96875, + "completions/mean_terminated_length": 281.29168701171875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.6664, + "grad_norm": 1.8074941635131836, + "kl": 0.045867919921875, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 13198930.0, + "reward": -10.295318603515625, + "reward_std": 4.299862861633301, + "rewards/rm_reward_func/mean": -10.295318603515625, + "rewards/rm_reward_func/std": 11.407203674316406, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 296.3125, + "completions/mean_terminated_length": 289.3548278808594, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.6672, + "grad_norm": 2.067885637283325, + "kl": 0.101806640625, + "learning_rate": 1e-06, + "loss": 0.0701, + "num_tokens": 13216668.0, + "reward": -1.64959716796875, + "reward_std": 5.408228874206543, + "rewards/rm_reward_func/mean": -1.64959716796875, + "rewards/rm_reward_func/std": 12.140084266662598, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 349.625, + "completions/mean_terminated_length": 187.25, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.668, + "grad_norm": 2.1750733852386475, + "kl": 0.0623779296875, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 13233128.0, + "reward": -6.5570068359375, + "reward_std": 3.990337610244751, + "rewards/rm_reward_func/mean": -6.5570068359375, + "rewards/rm_reward_func/std": 5.2844719886779785, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 452.0625, + "completions/mean_terminated_length": 375.0000305175781, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.6688, + "grad_norm": 1.7226381301879883, + "kl": 0.03826904296875, + "learning_rate": 1e-06, + "loss": 0.0365, + "num_tokens": 13253034.0, + "reward": -7.770751953125, + "reward_std": 3.2842230796813965, + "rewards/rm_reward_func/mean": -7.770751953125, + "rewards/rm_reward_func/std": 6.768344879150391, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 378.09375, + "completions/mean_terminated_length": 317.227294921875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.6696, + "grad_norm": 1.8562217950820923, + "kl": 0.04583740234375, + "learning_rate": 1e-06, + "loss": 0.038, + "num_tokens": 13271205.0, + "reward": -11.717750549316406, + "reward_std": 4.083845138549805, + "rewards/rm_reward_func/mean": -11.717750549316406, + "rewards/rm_reward_func/std": 8.03064250946045, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 320.59375, + "completions/mean_terminated_length": 307.8333435058594, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.6704, + "grad_norm": 1.8894712924957275, + "kl": 0.04095458984375, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 13284088.0, + "reward": -0.90777587890625, + "reward_std": 6.5685954093933105, + "rewards/rm_reward_func/mean": -0.90777587890625, + "rewards/rm_reward_func/std": 8.175400733947754, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 410.6875, + "completions/mean_terminated_length": 341.3684387207031, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.6712, + "grad_norm": 1.606052279472351, + "kl": 0.0537109375, + "learning_rate": 1e-06, + "loss": -0.0505, + "num_tokens": 13304406.0, + "reward": -6.742431640625, + "reward_std": 6.445566177368164, + "rewards/rm_reward_func/mean": -6.742431640625, + "rewards/rm_reward_func/std": 12.174559593200684, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 366.28125, + "completions/mean_terminated_length": 266.5789489746094, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.672, + "grad_norm": 1.667305588722229, + "kl": 0.06573486328125, + "learning_rate": 1e-06, + "loss": 0.1347, + "num_tokens": 13323863.0, + "reward": 1.331787109375, + "reward_std": 7.261053085327148, + "rewards/rm_reward_func/mean": 1.331787109375, + "rewards/rm_reward_func/std": 20.560991287231445, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 339.625, + "completions/mean_terminated_length": 328.13336181640625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.6728, + "grad_norm": 1.9302817583084106, + "kl": 0.082275390625, + "learning_rate": 1e-06, + "loss": -0.1506, + "num_tokens": 13339707.0, + "reward": -1.82708740234375, + "reward_std": 4.353664398193359, + "rewards/rm_reward_func/mean": -1.82708740234375, + "rewards/rm_reward_func/std": 10.06405258178711, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 369.375, + "completions/mean_terminated_length": 283.8000183105469, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.6736, + "grad_norm": 1.8525702953338623, + "kl": 0.06060791015625, + "learning_rate": 1e-06, + "loss": 0.0589, + "num_tokens": 13360335.0, + "reward": -6.124969482421875, + "reward_std": 4.792412281036377, + "rewards/rm_reward_func/mean": -6.124969482421875, + "rewards/rm_reward_func/std": 7.881174087524414, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 320.59375, + "completions/mean_terminated_length": 285.1481628417969, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.6744, + "grad_norm": 2.0537571907043457, + "kl": 0.05340576171875, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 13377042.0, + "reward": -5.03082275390625, + "reward_std": 5.442148208618164, + "rewards/rm_reward_func/mean": -5.03082275390625, + "rewards/rm_reward_func/std": 8.021278381347656, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 290.0, + "completions/mean_terminated_length": 282.8387145996094, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.6752, + "grad_norm": 2.19673228263855, + "kl": 0.06903076171875, + "learning_rate": 1e-06, + "loss": -0.0362, + "num_tokens": 13390610.0, + "reward": -3.630950927734375, + "reward_std": 5.155799388885498, + "rewards/rm_reward_func/mean": -3.630950927734375, + "rewards/rm_reward_func/std": 7.567800045013428, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 362.25, + "completions/mean_terminated_length": 245.7777862548828, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.676, + "grad_norm": 1.7250651121139526, + "kl": 0.034210205078125, + "learning_rate": 1e-06, + "loss": 0.0733, + "num_tokens": 13408786.0, + "reward": -7.6973876953125, + "reward_std": 6.877542495727539, + "rewards/rm_reward_func/mean": -7.6973876953125, + "rewards/rm_reward_func/std": 7.466717720031738, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 344.375, + "completions/mean_terminated_length": 305.69232177734375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.6768, + "grad_norm": 1.8355772495269775, + "kl": 0.0640869140625, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 13424590.0, + "reward": -9.62969970703125, + "reward_std": 4.646274566650391, + "rewards/rm_reward_func/mean": -9.62969970703125, + "rewards/rm_reward_func/std": 6.554321765899658, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 183.78125, + "completions/mean_terminated_length": 183.78125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.6776, + "grad_norm": 2.764571189880371, + "kl": 0.1297607421875, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 13434503.0, + "reward": 1.817169189453125, + "reward_std": 6.017488956451416, + "rewards/rm_reward_func/mean": 1.817169189453125, + "rewards/rm_reward_func/std": 11.715951919555664, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 279.375, + "completions/mean_terminated_length": 263.8666687011719, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.6784, + "grad_norm": 1.9904742240905762, + "kl": 0.09442138671875, + "learning_rate": 1e-06, + "loss": -0.0089, + "num_tokens": 13450587.0, + "reward": 5.386962890625, + "reward_std": 6.30614709854126, + "rewards/rm_reward_func/mean": 5.386962890625, + "rewards/rm_reward_func/std": 7.43483304977417, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 353.09375, + "completions/mean_terminated_length": 244.36842346191406, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6792, + "grad_norm": 1.9600452184677124, + "kl": 0.07122802734375, + "learning_rate": 1e-06, + "loss": 0.0317, + "num_tokens": 13469510.0, + "reward": -5.139892578125, + "reward_std": 5.0166778564453125, + "rewards/rm_reward_func/mean": -5.139892578125, + "rewards/rm_reward_func/std": 13.892590522766113, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 398.21875, + "completions/mean_terminated_length": 284.4375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.68, + "grad_norm": 2.0862793922424316, + "kl": 0.06207275390625, + "learning_rate": 1e-06, + "loss": 0.0415, + "num_tokens": 13487485.0, + "reward": -3.314697265625, + "reward_std": 5.655331611633301, + "rewards/rm_reward_func/mean": -3.314697265625, + "rewards/rm_reward_func/std": 12.433382987976074, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 315.15625, + "completions/mean_terminated_length": 225.68182373046875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.6808, + "grad_norm": 3.2112951278686523, + "kl": 0.1060791015625, + "learning_rate": 1e-06, + "loss": 0.0555, + "num_tokens": 13500858.0, + "reward": -3.000732421875, + "reward_std": 5.503466606140137, + "rewards/rm_reward_func/mean": -3.000732421875, + "rewards/rm_reward_func/std": 9.099902153015137, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 378.125, + "completions/mean_terminated_length": 325.7391357421875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.6816, + "grad_norm": 1.7231371402740479, + "kl": 0.046295166015625, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 13516102.0, + "reward": 6.725830078125, + "reward_std": 10.201772689819336, + "rewards/rm_reward_func/mean": 6.725830078125, + "rewards/rm_reward_func/std": 12.631306648254395, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 286.28125, + "completions/mean_terminated_length": 254.0357208251953, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.6824, + "grad_norm": 2.1838865280151367, + "kl": 0.07073974609375, + "learning_rate": 1e-06, + "loss": -0.0189, + "num_tokens": 13527543.0, + "reward": -4.8714599609375, + "reward_std": 6.023524284362793, + "rewards/rm_reward_func/mean": -4.8714599609375, + "rewards/rm_reward_func/std": 9.998571395874023, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 364.28125, + "completions/mean_terminated_length": 330.19232177734375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.6832, + "grad_norm": 1.7600077390670776, + "kl": 0.0538330078125, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 13544616.0, + "reward": 0.74462890625, + "reward_std": 5.775543689727783, + "rewards/rm_reward_func/mean": 0.74462890625, + "rewards/rm_reward_func/std": 7.537502288818359, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 353.0625, + "completions/mean_terminated_length": 280.81817626953125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.684, + "grad_norm": 2.3846139907836914, + "kl": 0.06378173828125, + "learning_rate": 1e-06, + "loss": 0.0305, + "num_tokens": 13560018.0, + "reward": -7.67559814453125, + "reward_std": 3.897425413131714, + "rewards/rm_reward_func/mean": -7.67559814453125, + "rewards/rm_reward_func/std": 6.36590576171875, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 281.46875, + "completions/mean_terminated_length": 204.625, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.6848, + "grad_norm": 2.7427308559417725, + "kl": 0.0587158203125, + "learning_rate": 1e-06, + "loss": 0.087, + "num_tokens": 13571857.0, + "reward": -9.73712158203125, + "reward_std": 5.602230072021484, + "rewards/rm_reward_func/mean": -9.73712158203125, + "rewards/rm_reward_func/std": 11.024609565734863, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 292.90625, + "completions/mean_terminated_length": 292.90625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.6856, + "grad_norm": 2.148329734802246, + "kl": 0.06610107421875, + "learning_rate": 1e-06, + "loss": 0.0325, + "num_tokens": 13583990.0, + "reward": 1.218017578125, + "reward_std": 6.0176849365234375, + "rewards/rm_reward_func/mean": 1.218017578125, + "rewards/rm_reward_func/std": 16.035024642944336, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 320.71875, + "completions/mean_terminated_length": 267.1600036621094, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.6864, + "grad_norm": 1.727174162864685, + "kl": 0.0557861328125, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 13601149.0, + "reward": -7.7918701171875, + "reward_std": 6.956125259399414, + "rewards/rm_reward_func/mean": -7.7918701171875, + "rewards/rm_reward_func/std": 10.238733291625977, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 279.4375, + "completions/mean_terminated_length": 255.37930297851562, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.6872, + "grad_norm": 2.1880764961242676, + "kl": 0.05322265625, + "learning_rate": 1e-06, + "loss": -0.0503, + "num_tokens": 13612083.0, + "reward": -3.659210205078125, + "reward_std": 6.743606090545654, + "rewards/rm_reward_func/mean": -3.659210205078125, + "rewards/rm_reward_func/std": 13.085749626159668, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.75, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 491.15625, + "completions/mean_terminated_length": 428.625, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.688, + "grad_norm": 1.510858178138733, + "kl": 0.03668212890625, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 13631216.0, + "reward": -7.741943359375, + "reward_std": 4.888319969177246, + "rewards/rm_reward_func/mean": -7.741943359375, + "rewards/rm_reward_func/std": 10.60095500946045, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 299.875, + "completions/mean_terminated_length": 285.73333740234375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.6888, + "grad_norm": 1.6817961931228638, + "kl": 0.05609130859375, + "learning_rate": 1e-06, + "loss": 0.0648, + "num_tokens": 13646884.0, + "reward": -1.1103515625, + "reward_std": 7.428096771240234, + "rewards/rm_reward_func/mean": -1.1103515625, + "rewards/rm_reward_func/std": 18.205921173095703, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 255.40625, + "completions/mean_terminated_length": 207.88888549804688, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.6896, + "grad_norm": 2.778743028640747, + "kl": 0.08013916015625, + "learning_rate": 1e-06, + "loss": 0.0801, + "num_tokens": 13659977.0, + "reward": -4.3226470947265625, + "reward_std": 5.485634803771973, + "rewards/rm_reward_func/mean": -4.3226470947265625, + "rewards/rm_reward_func/std": 8.717120170593262, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 260.4375, + "completions/mean_terminated_length": 260.4375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.6904, + "grad_norm": 2.348311185836792, + "kl": 0.04913330078125, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 13669911.0, + "reward": -3.59002685546875, + "reward_std": 7.257650375366211, + "rewards/rm_reward_func/mean": -3.59002685546875, + "rewards/rm_reward_func/std": 9.280685424804688, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 273.1875, + "completions/mean_terminated_length": 218.07693481445312, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.6912, + "grad_norm": 2.6193065643310547, + "kl": 0.07049560546875, + "learning_rate": 1e-06, + "loss": 0.0802, + "num_tokens": 13680229.0, + "reward": -6.74017333984375, + "reward_std": 7.699464797973633, + "rewards/rm_reward_func/mean": -6.74017333984375, + "rewards/rm_reward_func/std": 9.233904838562012, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 260.625, + "completions/mean_terminated_length": 234.6206817626953, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.692, + "grad_norm": 2.5557973384857178, + "kl": 0.0748291015625, + "learning_rate": 1e-06, + "loss": 0.1443, + "num_tokens": 13691185.0, + "reward": -2.5369873046875, + "reward_std": 5.114565849304199, + "rewards/rm_reward_func/mean": -2.5369873046875, + "rewards/rm_reward_func/std": 10.465826034545898, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 341.5625, + "completions/mean_terminated_length": 252.2857208251953, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.6928, + "grad_norm": 1.926889419555664, + "kl": 0.07391357421875, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 13710491.0, + "reward": -4.79638671875, + "reward_std": 8.270464897155762, + "rewards/rm_reward_func/mean": -4.79638671875, + "rewards/rm_reward_func/std": 17.685964584350586, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 255.6333465576172, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.6936, + "grad_norm": 2.030210256576538, + "kl": 0.093505859375, + "learning_rate": 1e-06, + "loss": 0.0212, + "num_tokens": 13722888.0, + "reward": 7.81378173828125, + "reward_std": 6.582851886749268, + "rewards/rm_reward_func/mean": 7.81378173828125, + "rewards/rm_reward_func/std": 8.282805442810059, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 343.1875, + "completions/mean_terminated_length": 295.91998291015625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.6944, + "grad_norm": 1.8266758918762207, + "kl": 0.099945068359375, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 13741942.0, + "reward": 4.015403747558594, + "reward_std": 4.989114761352539, + "rewards/rm_reward_func/mean": 4.015403747558594, + "rewards/rm_reward_func/std": 14.881505012512207, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 302.9375, + "completions/mean_terminated_length": 296.19354248046875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.6952, + "grad_norm": 1.7168601751327515, + "kl": 0.08782958984375, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 13757988.0, + "reward": 9.25439453125, + "reward_std": 5.8297200202941895, + "rewards/rm_reward_func/mean": 9.25439453125, + "rewards/rm_reward_func/std": 15.498250007629395, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 280.53125, + "completions/mean_terminated_length": 256.5862121582031, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.696, + "grad_norm": 2.6560745239257812, + "kl": 0.06671142578125, + "learning_rate": 1e-06, + "loss": 0.0442, + "num_tokens": 13769333.0, + "reward": -10.84674072265625, + "reward_std": 6.4003376960754395, + "rewards/rm_reward_func/mean": -10.84674072265625, + "rewards/rm_reward_func/std": 10.306774139404297, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 360.6875, + "completions/mean_terminated_length": 301.478271484375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.6968, + "grad_norm": 1.9061424732208252, + "kl": 0.077117919921875, + "learning_rate": 1e-06, + "loss": 0.0428, + "num_tokens": 13788235.0, + "reward": 6.83203125, + "reward_std": 4.622474670410156, + "rewards/rm_reward_func/mean": 6.83203125, + "rewards/rm_reward_func/std": 18.445390701293945, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 394.65625, + "completions/mean_terminated_length": 291.1176452636719, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.6976, + "grad_norm": 1.6926039457321167, + "kl": 0.0543212890625, + "learning_rate": 1e-06, + "loss": 0.0549, + "num_tokens": 13804112.0, + "reward": -11.41558837890625, + "reward_std": 6.486227512359619, + "rewards/rm_reward_func/mean": -11.41558837890625, + "rewards/rm_reward_func/std": 10.289817810058594, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 369.90625, + "completions/mean_terminated_length": 272.6842041015625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.6984, + "grad_norm": 2.294339179992676, + "kl": 0.041748046875, + "learning_rate": 1e-06, + "loss": 0.2279, + "num_tokens": 13817765.0, + "reward": -3.1925048828125, + "reward_std": 4.867129325866699, + "rewards/rm_reward_func/mean": -3.1925048828125, + "rewards/rm_reward_func/std": 7.916301727294922, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 332.125, + "completions/mean_terminated_length": 192.22222900390625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.6992, + "grad_norm": 1.796257495880127, + "kl": 0.07879638671875, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 13833577.0, + "reward": -2.02392578125, + "reward_std": 3.1411304473876953, + "rewards/rm_reward_func/mean": -2.02392578125, + "rewards/rm_reward_func/std": 19.060049057006836, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 320.78125, + "completions/mean_terminated_length": 301.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7, + "grad_norm": 1.8812639713287354, + "kl": 0.0665283203125, + "learning_rate": 1e-06, + "loss": 0.0535, + "num_tokens": 13849906.0, + "reward": -8.3310546875, + "reward_std": 6.653361797332764, + "rewards/rm_reward_func/mean": -8.3310546875, + "rewards/rm_reward_func/std": 9.87001895904541, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 320.0625, + "completions/mean_terminated_length": 320.0625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.7008, + "grad_norm": 1.808968424797058, + "kl": 0.08538818359375, + "learning_rate": 1e-06, + "loss": -0.0427, + "num_tokens": 13867244.0, + "reward": 2.5892333984375, + "reward_std": 6.00148868560791, + "rewards/rm_reward_func/mean": 2.5892333984375, + "rewards/rm_reward_func/std": 14.667585372924805, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 366.15625, + "completions/mean_terminated_length": 345.3214416503906, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.7016, + "grad_norm": 1.5374969244003296, + "kl": 0.043243408203125, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 13883937.0, + "reward": 4.79833984375, + "reward_std": 6.680488586425781, + "rewards/rm_reward_func/mean": 4.79833984375, + "rewards/rm_reward_func/std": 21.05741310119629, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 357.5625, + "completions/mean_terminated_length": 237.44444274902344, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.7024, + "grad_norm": 2.062166213989258, + "kl": 0.089202880859375, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 13904939.0, + "reward": 0.6318359375, + "reward_std": 3.2684810161590576, + "rewards/rm_reward_func/mean": 0.6318359375, + "rewards/rm_reward_func/std": 11.752583503723145, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 381.59375, + "completions/mean_terminated_length": 313.28570556640625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.7032, + "grad_norm": 1.6277662515640259, + "kl": 0.074737548828125, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 13929838.0, + "reward": 6.9467010498046875, + "reward_std": 5.498739242553711, + "rewards/rm_reward_func/mean": 6.9467010498046875, + "rewards/rm_reward_func/std": 18.866851806640625, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 318.40625, + "completions/mean_terminated_length": 273.73077392578125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.704, + "grad_norm": 1.6222554445266724, + "kl": 0.05938720703125, + "learning_rate": 1e-06, + "loss": 0.0526, + "num_tokens": 13944571.0, + "reward": 4.17254638671875, + "reward_std": 6.909102439880371, + "rewards/rm_reward_func/mean": 4.17254638671875, + "rewards/rm_reward_func/std": 12.668671607971191, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 429.0625, + "completions/mean_terminated_length": 246.60000610351562, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.7048, + "grad_norm": 1.4840784072875977, + "kl": 0.05224609375, + "learning_rate": 1e-06, + "loss": 0.1112, + "num_tokens": 13963429.0, + "reward": -3.9267578125, + "reward_std": 5.697702407836914, + "rewards/rm_reward_func/mean": -3.9267578125, + "rewards/rm_reward_func/std": 19.052875518798828, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 393.09375, + "completions/mean_terminated_length": 321.75, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.7056, + "grad_norm": 1.365999460220337, + "kl": 0.067138671875, + "learning_rate": 1e-06, + "loss": -0.03, + "num_tokens": 13982824.0, + "reward": 6.844482421875, + "reward_std": 8.401575088500977, + "rewards/rm_reward_func/mean": 6.844482421875, + "rewards/rm_reward_func/std": 18.527742385864258, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 370.53125, + "completions/mean_terminated_length": 344.3333435058594, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.7064, + "grad_norm": 2.056433916091919, + "kl": 0.046630859375, + "learning_rate": 1e-06, + "loss": -0.0606, + "num_tokens": 14000225.0, + "reward": 2.3609619140625, + "reward_std": 5.935639381408691, + "rewards/rm_reward_func/mean": 2.3609619140625, + "rewards/rm_reward_func/std": 13.418625831604004, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 362.59375, + "completions/mean_terminated_length": 284.3333435058594, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.7072, + "grad_norm": 1.7667971849441528, + "kl": 0.053955078125, + "learning_rate": 1e-06, + "loss": 0.1938, + "num_tokens": 14016892.0, + "reward": -8.98370361328125, + "reward_std": 6.923333168029785, + "rewards/rm_reward_func/mean": -8.98370361328125, + "rewards/rm_reward_func/std": 15.502429008483887, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 259.5, + "completions/mean_terminated_length": 242.6666717529297, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.708, + "grad_norm": 3.352975606918335, + "kl": 0.08428955078125, + "learning_rate": 1e-06, + "loss": 0.0446, + "num_tokens": 14030468.0, + "reward": 5.567176818847656, + "reward_std": 6.57534122467041, + "rewards/rm_reward_func/mean": 5.567176818847656, + "rewards/rm_reward_func/std": 13.971413612365723, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 400.5625, + "completions/mean_terminated_length": 333.70001220703125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.7088, + "grad_norm": 1.4829586744308472, + "kl": 0.077056884765625, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 14051990.0, + "reward": 4.32080078125, + "reward_std": 4.578785419464111, + "rewards/rm_reward_func/mean": 4.32080078125, + "rewards/rm_reward_func/std": 17.064523696899414, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 345.0, + "completions/mean_terminated_length": 269.0909118652344, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.7096, + "grad_norm": 2.163200855255127, + "kl": 0.044677734375, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 14065374.0, + "reward": -3.12738037109375, + "reward_std": 5.796885967254639, + "rewards/rm_reward_func/mean": -3.12738037109375, + "rewards/rm_reward_func/std": 6.378081321716309, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 443.625, + "completions/mean_terminated_length": 329.66668701171875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.7104, + "grad_norm": 1.9373884201049805, + "kl": 0.0499267578125, + "learning_rate": 1e-06, + "loss": 0.0664, + "num_tokens": 14085674.0, + "reward": -13.291748046875, + "reward_std": 3.145475387573242, + "rewards/rm_reward_func/mean": -13.291748046875, + "rewards/rm_reward_func/std": 4.229991436004639, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 360.375, + "completions/mean_terminated_length": 309.8333435058594, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.7112, + "grad_norm": 1.7239456176757812, + "kl": 0.0616455078125, + "learning_rate": 1e-06, + "loss": -0.1011, + "num_tokens": 14102950.0, + "reward": -4.160087585449219, + "reward_std": 6.499576568603516, + "rewards/rm_reward_func/mean": -4.160087585449219, + "rewards/rm_reward_func/std": 9.051041603088379, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 286.4375, + "completions/mean_terminated_length": 244.6666717529297, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.712, + "grad_norm": 2.292990207672119, + "kl": 0.078094482421875, + "learning_rate": 1e-06, + "loss": 0.0725, + "num_tokens": 14115404.0, + "reward": -14.580810546875, + "reward_std": 4.15199089050293, + "rewards/rm_reward_func/mean": -14.580810546875, + "rewards/rm_reward_func/std": 8.562573432922363, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 379.0, + "completions/mean_terminated_length": 318.54547119140625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.7128, + "grad_norm": 1.7398550510406494, + "kl": 0.0640869140625, + "learning_rate": 1e-06, + "loss": -0.0144, + "num_tokens": 14132492.0, + "reward": 0.929931640625, + "reward_std": 6.314770698547363, + "rewards/rm_reward_func/mean": 0.929931640625, + "rewards/rm_reward_func/std": 11.463532447814941, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 313.25, + "completions/mean_terminated_length": 257.6000061035156, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.7136, + "grad_norm": 1.9615352153778076, + "kl": 0.049591064453125, + "learning_rate": 1e-06, + "loss": 0.0811, + "num_tokens": 14147764.0, + "reward": -2.8718061447143555, + "reward_std": 6.038548469543457, + "rewards/rm_reward_func/mean": -2.8718061447143555, + "rewards/rm_reward_func/std": 9.447090148925781, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 309.53125, + "completions/mean_terminated_length": 288.5862121582031, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.7144, + "grad_norm": 1.9709076881408691, + "kl": 0.0706787109375, + "learning_rate": 1e-06, + "loss": -0.0488, + "num_tokens": 14163501.0, + "reward": 1.4691162109375, + "reward_std": 5.5178937911987305, + "rewards/rm_reward_func/mean": 1.4691162109375, + "rewards/rm_reward_func/std": 15.25466251373291, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 435.125, + "completions/mean_terminated_length": 348.0000305175781, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.7152, + "grad_norm": 1.5625107288360596, + "kl": 0.044036865234375, + "learning_rate": 1e-06, + "loss": 0.107, + "num_tokens": 14180217.0, + "reward": -12.95513916015625, + "reward_std": 6.332525253295898, + "rewards/rm_reward_func/mean": -12.95513916015625, + "rewards/rm_reward_func/std": 10.219103813171387, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 440.875, + "completions/mean_terminated_length": 398.20001220703125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.716, + "grad_norm": 1.5858193635940552, + "kl": 0.0533447265625, + "learning_rate": 1e-06, + "loss": 0.029, + "num_tokens": 14203045.0, + "reward": -3.6484375, + "reward_std": 4.499842166900635, + "rewards/rm_reward_func/mean": -3.6484375, + "rewards/rm_reward_func/std": 19.35128402709961, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 347.53125, + "completions/mean_terminated_length": 272.7727355957031, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.7168, + "grad_norm": 1.8616636991500854, + "kl": 0.06591796875, + "learning_rate": 1e-06, + "loss": 0.0706, + "num_tokens": 14218430.0, + "reward": 4.1662445068359375, + "reward_std": 3.9518613815307617, + "rewards/rm_reward_func/mean": 4.1662445068359375, + "rewards/rm_reward_func/std": 13.748356819152832, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 266.34375, + "completions/mean_terminated_length": 184.45834350585938, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.7176, + "grad_norm": 3.5012259483337402, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 14234241.0, + "reward": 0.06756591796875, + "reward_std": 3.6605472564697266, + "rewards/rm_reward_func/mean": 0.06756591796875, + "rewards/rm_reward_func/std": 11.851730346679688, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 357.46875, + "completions/mean_terminated_length": 237.2777862548828, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.7184, + "grad_norm": 1.895250916481018, + "kl": 0.09979248046875, + "learning_rate": 1e-06, + "loss": 0.0578, + "num_tokens": 14251616.0, + "reward": 0.0, + "reward_std": 3.7596004009246826, + "rewards/rm_reward_func/mean": 0.0, + "rewards/rm_reward_func/std": 13.46176528930664, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 342.4375, + "completions/mean_terminated_length": 240.6999969482422, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.7192, + "grad_norm": 1.8971123695373535, + "kl": 0.0438232421875, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 14265606.0, + "reward": -12.60205078125, + "reward_std": 8.237014770507812, + "rewards/rm_reward_func/mean": -12.60205078125, + "rewards/rm_reward_func/std": 11.214478492736816, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 329.5, + "completions/mean_terminated_length": 295.7037048339844, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.72, + "grad_norm": 1.7537795305252075, + "kl": 0.057373046875, + "learning_rate": 1e-06, + "loss": 0.1302, + "num_tokens": 14282118.0, + "reward": -0.439453125, + "reward_std": 6.53129768371582, + "rewards/rm_reward_func/mean": -0.439453125, + "rewards/rm_reward_func/std": 17.06511878967285, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.8125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 496.375, + "completions/mean_terminated_length": 428.66668701171875, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.7208, + "grad_norm": 1.5228943824768066, + "kl": 0.0389404296875, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 14300874.0, + "reward": -4.3704833984375, + "reward_std": 4.560824871063232, + "rewards/rm_reward_func/mean": -4.3704833984375, + "rewards/rm_reward_func/std": 11.49904727935791, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 359.71875, + "completions/mean_terminated_length": 308.9583435058594, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.7216, + "grad_norm": 1.954308032989502, + "kl": 0.04937744140625, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 14314449.0, + "reward": -7.7960205078125, + "reward_std": 5.227348327636719, + "rewards/rm_reward_func/mean": -7.7960205078125, + "rewards/rm_reward_func/std": 9.938582420349121, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 332.28125, + "completions/mean_terminated_length": 320.3000183105469, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.7224, + "grad_norm": 1.7841217517852783, + "kl": 0.10369873046875, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 14332002.0, + "reward": 9.311508178710938, + "reward_std": 4.982581615447998, + "rewards/rm_reward_func/mean": 9.311508178710938, + "rewards/rm_reward_func/std": 16.10820770263672, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 385.4375, + "completions/mean_terminated_length": 356.23077392578125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.7232, + "grad_norm": 1.7892329692840576, + "kl": 0.0501708984375, + "learning_rate": 1e-06, + "loss": 0.0664, + "num_tokens": 14347784.0, + "reward": -1.0759696960449219, + "reward_std": 7.9657816886901855, + "rewards/rm_reward_func/mean": -1.0759696960449219, + "rewards/rm_reward_func/std": 11.279064178466797, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 312.625, + "completions/mean_terminated_length": 292.0, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.724, + "grad_norm": 2.216010093688965, + "kl": 0.06109619140625, + "learning_rate": 1e-06, + "loss": 0.0316, + "num_tokens": 14360692.0, + "reward": -7.5439453125, + "reward_std": 3.7846872806549072, + "rewards/rm_reward_func/mean": -7.5439453125, + "rewards/rm_reward_func/std": 8.977649688720703, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 307.15625, + "completions/mean_terminated_length": 277.89288330078125, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.7248, + "grad_norm": 2.1664443016052246, + "kl": 0.04754638671875, + "learning_rate": 1e-06, + "loss": 0.0707, + "num_tokens": 14372401.0, + "reward": 9.5926513671875, + "reward_std": 7.038387298583984, + "rewards/rm_reward_func/mean": 9.5926513671875, + "rewards/rm_reward_func/std": 16.778491973876953, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 428.1875, + "completions/mean_terminated_length": 395.3913269042969, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.7256, + "grad_norm": 1.56720769405365, + "kl": 0.034515380859375, + "learning_rate": 1e-06, + "loss": -0.0259, + "num_tokens": 14391583.0, + "reward": -6.10302734375, + "reward_std": 6.868504047393799, + "rewards/rm_reward_func/mean": -6.10302734375, + "rewards/rm_reward_func/std": 15.576776504516602, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 379.96875, + "completions/mean_terminated_length": 289.631591796875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.7264, + "grad_norm": 1.3595832586288452, + "kl": 0.0509033203125, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 14411510.0, + "reward": 0.319061279296875, + "reward_std": 6.755457878112793, + "rewards/rm_reward_func/mean": 0.319061279296875, + "rewards/rm_reward_func/std": 20.191614151000977, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 329.40625, + "completions/mean_terminated_length": 310.5172424316406, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.7272, + "grad_norm": 1.773484230041504, + "kl": 0.09454345703125, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 14428307.0, + "reward": 4.24127197265625, + "reward_std": 6.082801818847656, + "rewards/rm_reward_func/mean": 4.24127197265625, + "rewards/rm_reward_func/std": 13.788681983947754, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 411.75, + "completions/mean_terminated_length": 351.6000061035156, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.728, + "grad_norm": 1.5417022705078125, + "kl": 0.061187744140625, + "learning_rate": 1e-06, + "loss": 0.066, + "num_tokens": 14450931.0, + "reward": -6.403564453125, + "reward_std": 7.179168701171875, + "rewards/rm_reward_func/mean": -6.403564453125, + "rewards/rm_reward_func/std": 11.500345230102539, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 293.6875, + "completions/mean_terminated_length": 286.6451416015625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.7288, + "grad_norm": 2.0456080436706543, + "kl": 0.06768798828125, + "learning_rate": 1e-06, + "loss": 0.0102, + "num_tokens": 14465385.0, + "reward": -5.870635986328125, + "reward_std": 5.733155250549316, + "rewards/rm_reward_func/mean": -5.870635986328125, + "rewards/rm_reward_func/std": 10.037235260009766, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 374.96875, + "completions/mean_terminated_length": 336.6000061035156, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.7296, + "grad_norm": 1.6597861051559448, + "kl": 0.040863037109375, + "learning_rate": 1e-06, + "loss": -0.0682, + "num_tokens": 14480712.0, + "reward": -3.853729248046875, + "reward_std": 5.717039108276367, + "rewards/rm_reward_func/mean": -3.853729248046875, + "rewards/rm_reward_func/std": 10.07486629486084, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 253.83334350585938, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.7304, + "grad_norm": 2.0848214626312256, + "kl": 0.08612060546875, + "learning_rate": 1e-06, + "loss": 0.0427, + "num_tokens": 14495495.0, + "reward": 2.125, + "reward_std": 8.297126770019531, + "rewards/rm_reward_func/mean": 2.125, + "rewards/rm_reward_func/std": 16.524341583251953, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 286.5625, + "completions/mean_terminated_length": 263.2413635253906, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.7312, + "grad_norm": 2.2552411556243896, + "kl": 0.0556640625, + "learning_rate": 1e-06, + "loss": 0.2866, + "num_tokens": 14506737.0, + "reward": 2.1148910522460938, + "reward_std": 7.923299789428711, + "rewards/rm_reward_func/mean": 2.1148910522460938, + "rewards/rm_reward_func/std": 16.44039535522461, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 277.4375, + "completions/mean_terminated_length": 277.4375, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.732, + "grad_norm": 2.173488140106201, + "kl": 0.08428955078125, + "learning_rate": 1e-06, + "loss": -0.0251, + "num_tokens": 14521375.0, + "reward": -3.1552505493164062, + "reward_std": 7.125005722045898, + "rewards/rm_reward_func/mean": -3.1552505493164062, + "rewards/rm_reward_func/std": 9.921463012695312, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 328.1875, + "completions/mean_terminated_length": 301.9285888671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.7328, + "grad_norm": 1.6935398578643799, + "kl": 0.0576171875, + "learning_rate": 1e-06, + "loss": -0.0289, + "num_tokens": 14537333.0, + "reward": 4.154022216796875, + "reward_std": 4.920960426330566, + "rewards/rm_reward_func/mean": 4.154022216796875, + "rewards/rm_reward_func/std": 19.4608211517334, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 472.34375, + "completions/mean_terminated_length": 427.4000244140625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.7336, + "grad_norm": 1.470565676689148, + "kl": 0.032470703125, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 14558120.0, + "reward": 0.945556640625, + "reward_std": 8.655411720275879, + "rewards/rm_reward_func/mean": 0.945556640625, + "rewards/rm_reward_func/std": 10.275266647338867, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 362.8125, + "completions/mean_terminated_length": 352.86669921875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.7344, + "grad_norm": 1.9920480251312256, + "kl": 0.0557861328125, + "learning_rate": 1e-06, + "loss": -0.1067, + "num_tokens": 14574226.0, + "reward": -2.730194091796875, + "reward_std": 7.887948513031006, + "rewards/rm_reward_func/mean": -2.730194091796875, + "rewards/rm_reward_func/std": 10.170256614685059, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 309.8125, + "completions/mean_terminated_length": 230.69566345214844, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.7352, + "grad_norm": 2.4043126106262207, + "kl": 0.0816650390625, + "learning_rate": 1e-06, + "loss": -0.0448, + "num_tokens": 14588996.0, + "reward": -4.7542724609375, + "reward_std": 5.3291425704956055, + "rewards/rm_reward_func/mean": -4.7542724609375, + "rewards/rm_reward_func/std": 9.259735107421875, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 419.40625, + "completions/mean_terminated_length": 314.4666748046875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.736, + "grad_norm": 1.5467361211776733, + "kl": 0.05816650390625, + "learning_rate": 1e-06, + "loss": 0.0971, + "num_tokens": 14608329.0, + "reward": 0.23238372802734375, + "reward_std": 4.091479778289795, + "rewards/rm_reward_func/mean": 0.23238372802734375, + "rewards/rm_reward_func/std": 15.872623443603516, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 340.71875, + "completions/mean_terminated_length": 169.4375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.7368, + "grad_norm": 2.0991058349609375, + "kl": 0.04388427734375, + "learning_rate": 1e-06, + "loss": 0.0811, + "num_tokens": 14622648.0, + "reward": -13.4560546875, + "reward_std": 3.0429604053497314, + "rewards/rm_reward_func/mean": -13.4560546875, + "rewards/rm_reward_func/std": 5.3804545402526855, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 213.375, + "completions/mean_terminated_length": 213.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.7376, + "grad_norm": 2.0988550186157227, + "kl": 0.0810546875, + "learning_rate": 1e-06, + "loss": -0.1148, + "num_tokens": 14634548.0, + "reward": -5.3094482421875, + "reward_std": 7.107990264892578, + "rewards/rm_reward_func/mean": -5.3094482421875, + "rewards/rm_reward_func/std": 12.734643936157227, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 346.5625, + "completions/mean_terminated_length": 329.4482727050781, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.7384, + "grad_norm": 1.7044029235839844, + "kl": 0.08477783203125, + "learning_rate": 1e-06, + "loss": 0.0363, + "num_tokens": 14653734.0, + "reward": 5.738525390625, + "reward_std": 5.907501697540283, + "rewards/rm_reward_func/mean": 5.738525390625, + "rewards/rm_reward_func/std": 14.067469596862793, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 259.90625, + "completions/mean_terminated_length": 259.90625, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.7392, + "grad_norm": 2.068110942840576, + "kl": 0.07305908203125, + "learning_rate": 1e-06, + "loss": -0.0419, + "num_tokens": 14672003.0, + "reward": -1.1322021484375, + "reward_std": 7.386229515075684, + "rewards/rm_reward_func/mean": -1.1322021484375, + "rewards/rm_reward_func/std": 11.97767162322998, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 286.21875, + "completions/mean_terminated_length": 234.11538696289062, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.74, + "grad_norm": 1.9237210750579834, + "kl": 0.0753173828125, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 14688122.0, + "reward": 12.45751953125, + "reward_std": 8.28902816772461, + "rewards/rm_reward_func/mean": 12.45751953125, + "rewards/rm_reward_func/std": 16.745771408081055, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 308.1875, + "completions/mean_terminated_length": 301.6128845214844, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.7408, + "grad_norm": 1.9341157674789429, + "kl": 0.06817626953125, + "learning_rate": 1e-06, + "loss": 0.0475, + "num_tokens": 14703688.0, + "reward": -10.9024658203125, + "reward_std": 6.0593767166137695, + "rewards/rm_reward_func/mean": -10.9024658203125, + "rewards/rm_reward_func/std": 10.119786262512207, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 363.1875, + "completions/mean_terminated_length": 321.5199890136719, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.7416, + "grad_norm": 1.7111101150512695, + "kl": 0.06585693359375, + "learning_rate": 1e-06, + "loss": -0.0544, + "num_tokens": 14719670.0, + "reward": -8.478759765625, + "reward_std": 5.609869480133057, + "rewards/rm_reward_func/mean": -8.478759765625, + "rewards/rm_reward_func/std": 8.345298767089844, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 403.28125, + "completions/mean_terminated_length": 222.08334350585938, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.7424, + "grad_norm": 1.5380433797836304, + "kl": 0.04522705078125, + "learning_rate": 1e-06, + "loss": 0.0277, + "num_tokens": 14737975.0, + "reward": -9.293212890625, + "reward_std": 4.587721824645996, + "rewards/rm_reward_func/mean": -9.293212890625, + "rewards/rm_reward_func/std": 6.724361419677734, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 328.3125, + "completions/mean_terminated_length": 276.8800048828125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.7432, + "grad_norm": 1.9521268606185913, + "kl": 0.058837890625, + "learning_rate": 1e-06, + "loss": -0.035, + "num_tokens": 14751801.0, + "reward": -7.5172119140625, + "reward_std": 5.262956619262695, + "rewards/rm_reward_func/mean": -7.5172119140625, + "rewards/rm_reward_func/std": 9.408235549926758, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 367.125, + "completions/mean_terminated_length": 301.2727355957031, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.744, + "grad_norm": 1.876100778579712, + "kl": 0.05938720703125, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 14768957.0, + "reward": -0.18597412109375, + "reward_std": 5.688486099243164, + "rewards/rm_reward_func/mean": -0.18597412109375, + "rewards/rm_reward_func/std": 19.515121459960938, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 273.71875, + "completions/mean_terminated_length": 249.0689697265625, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.7448, + "grad_norm": 2.2734432220458984, + "kl": 0.0855712890625, + "learning_rate": 1e-06, + "loss": 0.0314, + "num_tokens": 14783532.0, + "reward": 2.1390380859375, + "reward_std": 6.1560564041137695, + "rewards/rm_reward_func/mean": 2.1390380859375, + "rewards/rm_reward_func/std": 19.176328659057617, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 373.75, + "completions/mean_terminated_length": 301.3333435058594, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.7456, + "grad_norm": 1.845765471458435, + "kl": 0.03680419921875, + "learning_rate": 1e-06, + "loss": 0.1431, + "num_tokens": 14798380.0, + "reward": -8.328048706054688, + "reward_std": 5.433547019958496, + "rewards/rm_reward_func/mean": -8.328048706054688, + "rewards/rm_reward_func/std": 8.384936332702637, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 372.5, + "completions/mean_terminated_length": 333.44000244140625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.7464, + "grad_norm": 1.686218023300171, + "kl": 0.08782958984375, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 14816212.0, + "reward": -0.4595947265625, + "reward_std": 6.150363922119141, + "rewards/rm_reward_func/mean": -0.4595947265625, + "rewards/rm_reward_func/std": 21.9122257232666, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 321.40625, + "completions/mean_terminated_length": 130.8125, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.7472, + "grad_norm": 2.5011839866638184, + "kl": 0.0479888916015625, + "learning_rate": 1e-06, + "loss": 0.0497, + "num_tokens": 14831449.0, + "reward": -7.47076416015625, + "reward_std": 5.498394012451172, + "rewards/rm_reward_func/mean": -7.47076416015625, + "rewards/rm_reward_func/std": 6.7130608558654785, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 327.34375, + "completions/mean_terminated_length": 300.96429443359375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.748, + "grad_norm": 2.319511651992798, + "kl": 0.059417724609375, + "learning_rate": 1e-06, + "loss": 0.1389, + "num_tokens": 14847212.0, + "reward": -10.627059936523438, + "reward_std": 3.3940374851226807, + "rewards/rm_reward_func/mean": -10.627059936523438, + "rewards/rm_reward_func/std": 7.200928688049316, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 365.46875, + "completions/mean_terminated_length": 344.5357360839844, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.7488, + "grad_norm": 1.903350591659546, + "kl": 0.0609130859375, + "learning_rate": 1e-06, + "loss": 0.0486, + "num_tokens": 14861075.0, + "reward": -5.1015625, + "reward_std": 6.82496976852417, + "rewards/rm_reward_func/mean": -5.1015625, + "rewards/rm_reward_func/std": 11.306344032287598, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 333.84375, + "completions/mean_terminated_length": 274.4583435058594, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.7496, + "grad_norm": 2.0492427349090576, + "kl": 0.07470703125, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 14876374.0, + "reward": -4.4541015625, + "reward_std": 5.210179805755615, + "rewards/rm_reward_func/mean": -4.4541015625, + "rewards/rm_reward_func/std": 6.510527610778809, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 434.53125, + "completions/mean_terminated_length": 399.3182067871094, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.7504, + "grad_norm": 1.7363113164901733, + "kl": 0.04400634765625, + "learning_rate": 1e-06, + "loss": -0.0354, + "num_tokens": 14893599.0, + "reward": -3.45953369140625, + "reward_std": 5.941793441772461, + "rewards/rm_reward_func/mean": -3.45953369140625, + "rewards/rm_reward_func/std": 9.667407989501953, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 369.34375, + "completions/mean_terminated_length": 304.5, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.7512, + "grad_norm": 1.6622511148452759, + "kl": 0.07159423828125, + "learning_rate": 1e-06, + "loss": 0.0422, + "num_tokens": 14910370.0, + "reward": -4.48114013671875, + "reward_std": 5.22508430480957, + "rewards/rm_reward_func/mean": -4.48114013671875, + "rewards/rm_reward_func/std": 12.400673866271973, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 384.625, + "completions/mean_terminated_length": 342.16668701171875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.752, + "grad_norm": 1.8333054780960083, + "kl": 0.06915283203125, + "learning_rate": 1e-06, + "loss": 0.0413, + "num_tokens": 14928758.0, + "reward": 6.473388671875, + "reward_std": 5.208273410797119, + "rewards/rm_reward_func/mean": 6.473388671875, + "rewards/rm_reward_func/std": 11.401970863342285, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 286.21875, + "completions/mean_terminated_length": 278.93548583984375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.7528, + "grad_norm": 1.830357313156128, + "kl": 0.09039306640625, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 14944805.0, + "reward": -0.680908203125, + "reward_std": 7.5175981521606445, + "rewards/rm_reward_func/mean": -0.680908203125, + "rewards/rm_reward_func/std": 17.9741268157959, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 367.15625, + "completions/mean_terminated_length": 318.875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.7536, + "grad_norm": 1.861135482788086, + "kl": 0.06884765625, + "learning_rate": 1e-06, + "loss": -0.0785, + "num_tokens": 14966506.0, + "reward": 0.560791015625, + "reward_std": 5.897975921630859, + "rewards/rm_reward_func/mean": 0.560791015625, + "rewards/rm_reward_func/std": 16.340225219726562, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 428.6875, + "completions/mean_terminated_length": 363.8888854980469, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.7544, + "grad_norm": 1.6303080320358276, + "kl": 0.036468505859375, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 14984992.0, + "reward": -3.52716064453125, + "reward_std": 4.550829887390137, + "rewards/rm_reward_func/mean": -3.52716064453125, + "rewards/rm_reward_func/std": 8.39388370513916, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 436.625, + "completions/mean_terminated_length": 326.4615478515625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.7552, + "grad_norm": 1.556160807609558, + "kl": 0.03253173828125, + "learning_rate": 1e-06, + "loss": 0.1423, + "num_tokens": 15004420.0, + "reward": -9.245254516601562, + "reward_std": 6.954720973968506, + "rewards/rm_reward_func/mean": -9.245254516601562, + "rewards/rm_reward_func/std": 9.071441650390625, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 327.0625, + "completions/mean_terminated_length": 254.69566345214844, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.756, + "grad_norm": 2.0845038890838623, + "kl": 0.062744140625, + "learning_rate": 1e-06, + "loss": 0.2008, + "num_tokens": 15019294.0, + "reward": -6.424774169921875, + "reward_std": 4.513679027557373, + "rewards/rm_reward_func/mean": -6.424774169921875, + "rewards/rm_reward_func/std": 9.611113548278809, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 388.875, + "completions/mean_terminated_length": 376.137939453125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.7568, + "grad_norm": 1.613211989402771, + "kl": 0.0430908203125, + "learning_rate": 1e-06, + "loss": -0.0425, + "num_tokens": 15034194.0, + "reward": -3.32708740234375, + "reward_std": 5.613581657409668, + "rewards/rm_reward_func/mean": -3.32708740234375, + "rewards/rm_reward_func/std": 11.077795028686523, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 388.0625, + "completions/mean_terminated_length": 359.4615478515625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.7576, + "grad_norm": 1.9565799236297607, + "kl": 0.0794677734375, + "learning_rate": 1e-06, + "loss": 0.0263, + "num_tokens": 15052428.0, + "reward": 6.959815979003906, + "reward_std": 5.441317558288574, + "rewards/rm_reward_func/mean": 6.959815979003906, + "rewards/rm_reward_func/std": 10.955689430236816, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 346.875, + "completions/mean_terminated_length": 308.7692565917969, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.7584, + "grad_norm": 1.8273959159851074, + "kl": 0.063629150390625, + "learning_rate": 1e-06, + "loss": 0.0262, + "num_tokens": 15067576.0, + "reward": -3.980010986328125, + "reward_std": 6.687274932861328, + "rewards/rm_reward_func/mean": -3.980010986328125, + "rewards/rm_reward_func/std": 10.250381469726562, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 292.875, + "completions/mean_terminated_length": 193.27273559570312, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7592, + "grad_norm": 2.502899408340454, + "kl": 0.05133056640625, + "learning_rate": 1e-06, + "loss": 0.2254, + "num_tokens": 15078764.0, + "reward": -5.8040618896484375, + "reward_std": 8.234657287597656, + "rewards/rm_reward_func/mean": -5.8040618896484375, + "rewards/rm_reward_func/std": 8.954713821411133, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 327.03125, + "completions/mean_terminated_length": 242.95455932617188, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.76, + "grad_norm": 2.0451250076293945, + "kl": 0.047119140625, + "learning_rate": 1e-06, + "loss": -0.122, + "num_tokens": 15091037.0, + "reward": -10.05517578125, + "reward_std": 5.5270891189575195, + "rewards/rm_reward_func/mean": -10.05517578125, + "rewards/rm_reward_func/std": 6.178440570831299, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 390.25, + "completions/mean_terminated_length": 326.4761962890625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.7608, + "grad_norm": 1.4872499704360962, + "kl": 0.06561279296875, + "learning_rate": 1e-06, + "loss": -0.0606, + "num_tokens": 15112069.0, + "reward": -1.043212890625, + "reward_std": 8.339581489562988, + "rewards/rm_reward_func/mean": -1.043212890625, + "rewards/rm_reward_func/std": 17.639554977416992, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 286.6875, + "completions/mean_terminated_length": 254.50001525878906, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.7616, + "grad_norm": 2.3534464836120605, + "kl": 0.053436279296875, + "learning_rate": 1e-06, + "loss": 0.1685, + "num_tokens": 15124667.0, + "reward": -6.18603515625, + "reward_std": 4.946113586425781, + "rewards/rm_reward_func/mean": -6.18603515625, + "rewards/rm_reward_func/std": 8.57293701171875, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 292.03125, + "completions/mean_terminated_length": 284.93548583984375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.7624, + "grad_norm": 1.930702805519104, + "kl": 0.0777587890625, + "learning_rate": 1e-06, + "loss": 0.1417, + "num_tokens": 15140812.0, + "reward": 5.955810546875, + "reward_std": 5.625755310058594, + "rewards/rm_reward_func/mean": 5.955810546875, + "rewards/rm_reward_func/std": 17.552413940429688, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 318.03125, + "completions/mean_terminated_length": 263.7200012207031, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.7632, + "grad_norm": 1.9286456108093262, + "kl": 0.067657470703125, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 15156669.0, + "reward": -1.68414306640625, + "reward_std": 7.524256229400635, + "rewards/rm_reward_func/mean": -1.68414306640625, + "rewards/rm_reward_func/std": 12.127203941345215, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 377.09375, + "completions/mean_terminated_length": 258.058837890625, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.764, + "grad_norm": 1.8441168069839478, + "kl": 0.04248046875, + "learning_rate": 1e-06, + "loss": -0.0405, + "num_tokens": 15171240.0, + "reward": -10.736328125, + "reward_std": 3.9458930492401123, + "rewards/rm_reward_func/mean": -10.736328125, + "rewards/rm_reward_func/std": 5.1531219482421875, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 336.625, + "completions/mean_terminated_length": 296.15386962890625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.7648, + "grad_norm": 1.5560182332992554, + "kl": 0.05157470703125, + "learning_rate": 1e-06, + "loss": -0.1291, + "num_tokens": 15188100.0, + "reward": -4.72515869140625, + "reward_std": 5.798953056335449, + "rewards/rm_reward_func/mean": -4.72515869140625, + "rewards/rm_reward_func/std": 7.556000709533691, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 287.65625, + "completions/mean_terminated_length": 246.11111450195312, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.7656, + "grad_norm": 1.9677491188049316, + "kl": 0.06890869140625, + "learning_rate": 1e-06, + "loss": 0.1539, + "num_tokens": 15206361.0, + "reward": 0.15216064453125, + "reward_std": 7.636119842529297, + "rewards/rm_reward_func/mean": 0.15216064453125, + "rewards/rm_reward_func/std": 19.130725860595703, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 372.5, + "completions/mean_terminated_length": 326.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.7664, + "grad_norm": 2.0687954425811768, + "kl": 0.092132568359375, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 15225705.0, + "reward": 4.88031005859375, + "reward_std": 4.407324314117432, + "rewards/rm_reward_func/mean": 4.88031005859375, + "rewards/rm_reward_func/std": 19.1469783782959, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 304.21875, + "completions/mean_terminated_length": 234.95834350585938, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.7672, + "grad_norm": 2.046639919281006, + "kl": 0.0902099609375, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 15243904.0, + "reward": 2.17626953125, + "reward_std": 5.633055686950684, + "rewards/rm_reward_func/mean": 2.17626953125, + "rewards/rm_reward_func/std": 18.605958938598633, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 349.34375, + "completions/mean_terminated_length": 332.5172424316406, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.768, + "grad_norm": 1.8774504661560059, + "kl": 0.07470703125, + "learning_rate": 1e-06, + "loss": 0.0616, + "num_tokens": 15262555.0, + "reward": 5.9736328125, + "reward_std": 4.318167686462402, + "rewards/rm_reward_func/mean": 5.9736328125, + "rewards/rm_reward_func/std": 15.18685531616211, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 333.125, + "completions/mean_terminated_length": 273.5, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7688, + "grad_norm": 1.8209190368652344, + "kl": 0.07891845703125, + "learning_rate": 1e-06, + "loss": 0.0345, + "num_tokens": 15277767.0, + "reward": 1.30303955078125, + "reward_std": 5.383155822753906, + "rewards/rm_reward_func/mean": 1.30303955078125, + "rewards/rm_reward_func/std": 12.852009773254395, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 332.5625, + "completions/mean_terminated_length": 326.774169921875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7696, + "grad_norm": 1.7912604808807373, + "kl": 0.07354736328125, + "learning_rate": 1e-06, + "loss": -0.0483, + "num_tokens": 15294225.0, + "reward": 5.5262451171875, + "reward_std": 7.560568332672119, + "rewards/rm_reward_func/mean": 5.5262451171875, + "rewards/rm_reward_func/std": 17.68309783935547, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 312.96875, + "completions/mean_terminated_length": 284.5357360839844, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.7704, + "grad_norm": 2.113269567489624, + "kl": 0.06231689453125, + "learning_rate": 1e-06, + "loss": -0.0643, + "num_tokens": 15306320.0, + "reward": -4.245681285858154, + "reward_std": 4.566248416900635, + "rewards/rm_reward_func/mean": -4.245681285858154, + "rewards/rm_reward_func/std": 9.306557655334473, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 338.5625, + "completions/mean_terminated_length": 320.6206970214844, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.7712, + "grad_norm": 1.846778392791748, + "kl": 0.0762939453125, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 15323730.0, + "reward": -1.96588134765625, + "reward_std": 3.975306749343872, + "rewards/rm_reward_func/mean": -1.96588134765625, + "rewards/rm_reward_func/std": 9.614950180053711, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 418.9375, + "completions/mean_terminated_length": 370.19049072265625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.772, + "grad_norm": 1.7270152568817139, + "kl": 0.04766845703125, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 15344344.0, + "reward": -1.3333740234375, + "reward_std": 5.022583961486816, + "rewards/rm_reward_func/mean": -1.3333740234375, + "rewards/rm_reward_func/std": 7.120126247406006, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 328.6875, + "completions/mean_terminated_length": 286.3846130371094, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7728, + "grad_norm": 1.9307111501693726, + "kl": 0.045867919921875, + "learning_rate": 1e-06, + "loss": -0.1604, + "num_tokens": 15357958.0, + "reward": 2.3218994140625, + "reward_std": 5.988040924072266, + "rewards/rm_reward_func/mean": 2.3218994140625, + "rewards/rm_reward_func/std": 8.682594299316406, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 277.875, + "completions/mean_terminated_length": 262.2666931152344, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.7736, + "grad_norm": 1.9342926740646362, + "kl": 0.0849609375, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 15374322.0, + "reward": 2.303466796875, + "reward_std": 7.30094051361084, + "rewards/rm_reward_func/mean": 2.303466796875, + "rewards/rm_reward_func/std": 18.978647232055664, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 387.28125, + "completions/mean_terminated_length": 301.9473571777344, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.7744, + "grad_norm": 1.743038535118103, + "kl": 0.04779052734375, + "learning_rate": 1e-06, + "loss": -0.0744, + "num_tokens": 15394443.0, + "reward": -7.593994140625, + "reward_std": 6.755252361297607, + "rewards/rm_reward_func/mean": -7.593994140625, + "rewards/rm_reward_func/std": 12.107401847839355, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 255.375, + "completions/mean_terminated_length": 228.8275909423828, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.7752, + "grad_norm": 1.9324322938919067, + "kl": 0.07879638671875, + "learning_rate": 1e-06, + "loss": 0.1045, + "num_tokens": 15409159.0, + "reward": 8.2818603515625, + "reward_std": 8.277243614196777, + "rewards/rm_reward_func/mean": 8.2818603515625, + "rewards/rm_reward_func/std": 17.525957107543945, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 382.15625, + "completions/mean_terminated_length": 293.3157958984375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.776, + "grad_norm": 1.8402212858200073, + "kl": 0.063995361328125, + "learning_rate": 1e-06, + "loss": 0.0374, + "num_tokens": 15426684.0, + "reward": -5.162109375, + "reward_std": 4.2492523193359375, + "rewards/rm_reward_func/mean": -5.162109375, + "rewards/rm_reward_func/std": 6.334498882293701, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 347.96875, + "completions/mean_terminated_length": 347.96875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.7768, + "grad_norm": 1.7348252534866333, + "kl": 0.060821533203125, + "learning_rate": 1e-06, + "loss": -0.038, + "num_tokens": 15445707.0, + "reward": 13.2669677734375, + "reward_std": 6.520524024963379, + "rewards/rm_reward_func/mean": 13.2669677734375, + "rewards/rm_reward_func/std": 13.039697647094727, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 245.5, + "completions/mean_terminated_length": 245.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.7776, + "grad_norm": 2.197350263595581, + "kl": 0.090576171875, + "learning_rate": 1e-06, + "loss": 0.1023, + "num_tokens": 15459075.0, + "reward": -0.01153564453125, + "reward_std": 6.615358829498291, + "rewards/rm_reward_func/mean": -0.01153564453125, + "rewards/rm_reward_func/std": 18.74847984313965, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 434.15625, + "completions/mean_terminated_length": 320.3846130371094, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.7784, + "grad_norm": 1.4784244298934937, + "kl": 0.049560546875, + "learning_rate": 1e-06, + "loss": 0.1962, + "num_tokens": 15476648.0, + "reward": -12.15582275390625, + "reward_std": 6.480105400085449, + "rewards/rm_reward_func/mean": -12.15582275390625, + "rewards/rm_reward_func/std": 10.369534492492676, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 365.0625, + "completions/mean_terminated_length": 349.862060546875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.7792, + "grad_norm": 1.7645145654678345, + "kl": 0.036346435546875, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 15490058.0, + "reward": -2.26495361328125, + "reward_std": 5.346769332885742, + "rewards/rm_reward_func/mean": -2.26495361328125, + "rewards/rm_reward_func/std": 5.916716575622559, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 385.53125, + "completions/mean_terminated_length": 367.46429443359375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.78, + "grad_norm": 1.7059917449951172, + "kl": 0.034698486328125, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 15506779.0, + "reward": -1.983245849609375, + "reward_std": 6.229931831359863, + "rewards/rm_reward_func/mean": -1.983245849609375, + "rewards/rm_reward_func/std": 6.868417263031006, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 290.6875, + "completions/mean_terminated_length": 275.933349609375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.7808, + "grad_norm": 1.776451587677002, + "kl": 0.0611572265625, + "learning_rate": 1e-06, + "loss": -0.0888, + "num_tokens": 15522009.0, + "reward": 0.98291015625, + "reward_std": 6.029773235321045, + "rewards/rm_reward_func/mean": 0.98291015625, + "rewards/rm_reward_func/std": 16.120994567871094, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 192.9375, + "completions/mean_terminated_length": 182.64515686035156, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.7816, + "grad_norm": 4.362729072570801, + "kl": 0.09637451171875, + "learning_rate": 1e-06, + "loss": -0.0184, + "num_tokens": 15530271.0, + "reward": -2.884033203125, + "reward_std": 3.912165880203247, + "rewards/rm_reward_func/mean": -2.884033203125, + "rewards/rm_reward_func/std": 5.748475074768066, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 175.75, + "completions/mean_terminated_length": 164.90321350097656, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.7824, + "grad_norm": 2.667686700820923, + "kl": 0.0711669921875, + "learning_rate": 1e-06, + "loss": 0.2975, + "num_tokens": 15538815.0, + "reward": -5.282958984375, + "reward_std": 7.594799041748047, + "rewards/rm_reward_func/mean": -5.282958984375, + "rewards/rm_reward_func/std": 12.636316299438477, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 312.09375, + "completions/mean_terminated_length": 256.1199951171875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.7832, + "grad_norm": 1.9955432415008545, + "kl": 0.0772705078125, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 15554114.0, + "reward": -2.55517578125, + "reward_std": 3.5416982173919678, + "rewards/rm_reward_func/mean": -2.55517578125, + "rewards/rm_reward_func/std": 13.374783515930176, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 326.6875, + "completions/mean_terminated_length": 300.21429443359375, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.784, + "grad_norm": 1.8399032354354858, + "kl": 0.05633544921875, + "learning_rate": 1e-06, + "loss": 0.0908, + "num_tokens": 15570920.0, + "reward": -2.0135498046875, + "reward_std": 3.859978437423706, + "rewards/rm_reward_func/mean": -2.0135498046875, + "rewards/rm_reward_func/std": 9.135330200195312, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 275.5625, + "completions/mean_terminated_length": 267.93548583984375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.7848, + "grad_norm": 1.8108325004577637, + "kl": 0.11358642578125, + "learning_rate": 1e-06, + "loss": 0.0645, + "num_tokens": 15590058.0, + "reward": 14.71484375, + "reward_std": 5.722306251525879, + "rewards/rm_reward_func/mean": 14.71484375, + "rewards/rm_reward_func/std": 18.05368995666504, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 353.34375, + "completions/mean_terminated_length": 323.96295166015625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.7856, + "grad_norm": 1.7612102031707764, + "kl": 0.0533447265625, + "learning_rate": 1e-06, + "loss": -0.1152, + "num_tokens": 15606029.0, + "reward": -3.18353271484375, + "reward_std": 4.7075090408325195, + "rewards/rm_reward_func/mean": -3.18353271484375, + "rewards/rm_reward_func/std": 9.6627836227417, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.7864, + "grad_norm": 2.0172884464263916, + "kl": 0.071502685546875, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 15620289.0, + "reward": -3.647735595703125, + "reward_std": 3.6949591636657715, + "rewards/rm_reward_func/mean": -3.647735595703125, + "rewards/rm_reward_func/std": 10.247601509094238, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 317.71875, + "completions/mean_terminated_length": 304.7666931152344, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 0.7872, + "grad_norm": 1.6725493669509888, + "kl": 0.07464599609375, + "learning_rate": 1e-06, + "loss": -0.0729, + "num_tokens": 15636040.0, + "reward": 3.89508056640625, + "reward_std": 6.154848098754883, + "rewards/rm_reward_func/mean": 3.89508056640625, + "rewards/rm_reward_func/std": 18.554534912109375, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 323.125, + "completions/mean_terminated_length": 270.239990234375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.788, + "grad_norm": 1.8639757633209229, + "kl": 0.082275390625, + "learning_rate": 1e-06, + "loss": -0.0238, + "num_tokens": 15651236.0, + "reward": 7.235595703125, + "reward_std": 6.050224781036377, + "rewards/rm_reward_func/mean": 7.235595703125, + "rewards/rm_reward_func/std": 20.596040725708008, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 368.9375, + "completions/mean_terminated_length": 364.32257080078125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.7888, + "grad_norm": 1.8886938095092773, + "kl": 0.044677734375, + "learning_rate": 1e-06, + "loss": -0.0594, + "num_tokens": 15666866.0, + "reward": 3.24713134765625, + "reward_std": 8.963842391967773, + "rewards/rm_reward_func/mean": 3.24713134765625, + "rewards/rm_reward_func/std": 12.393926620483398, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 223.53125, + "completions/mean_terminated_length": 193.6896514892578, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.7896, + "grad_norm": 2.689934492111206, + "kl": 0.0843505859375, + "learning_rate": 1e-06, + "loss": -0.1394, + "num_tokens": 15678075.0, + "reward": -1.504852294921875, + "reward_std": 5.043430328369141, + "rewards/rm_reward_func/mean": -1.504852294921875, + "rewards/rm_reward_func/std": 8.934005737304688, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 364.625, + "completions/mean_terminated_length": 323.3599853515625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.7904, + "grad_norm": 1.5724399089813232, + "kl": 0.049774169921875, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 15695079.0, + "reward": 8.339691162109375, + "reward_std": 5.753800392150879, + "rewards/rm_reward_func/mean": 8.339691162109375, + "rewards/rm_reward_func/std": 20.401081085205078, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 276.28125, + "completions/mean_terminated_length": 268.6773986816406, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.7912, + "grad_norm": 2.143218994140625, + "kl": 0.07366943359375, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 15706008.0, + "reward": -10.157135009765625, + "reward_std": 5.735292911529541, + "rewards/rm_reward_func/mean": -10.157135009765625, + "rewards/rm_reward_func/std": 6.09641170501709, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 293.9375, + "completions/mean_terminated_length": 286.9032287597656, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.792, + "grad_norm": 1.812641978263855, + "kl": 0.0853271484375, + "learning_rate": 1e-06, + "loss": -0.0217, + "num_tokens": 15720590.0, + "reward": 1.65966796875, + "reward_std": 4.406991481781006, + "rewards/rm_reward_func/mean": 1.65966796875, + "rewards/rm_reward_func/std": 14.930853843688965, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 320.21875, + "completions/mean_terminated_length": 314.0322570800781, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.7928, + "grad_norm": 1.9463584423065186, + "kl": 0.04864501953125, + "learning_rate": 1e-06, + "loss": 0.0238, + "num_tokens": 15734061.0, + "reward": -6.485595703125, + "reward_std": 4.2344279289245605, + "rewards/rm_reward_func/mean": -6.485595703125, + "rewards/rm_reward_func/std": 7.137295246124268, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 224.46875, + "completions/mean_terminated_length": 215.19354248046875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.7936, + "grad_norm": 2.2556302547454834, + "kl": 0.084716796875, + "learning_rate": 1e-06, + "loss": 0.0645, + "num_tokens": 15744668.0, + "reward": 0.55755615234375, + "reward_std": 4.947491645812988, + "rewards/rm_reward_func/mean": 0.55755615234375, + "rewards/rm_reward_func/std": 20.104541778564453, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 347.65625, + "completions/mean_terminated_length": 330.6551818847656, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.7944, + "grad_norm": 1.7728079557418823, + "kl": 0.04779052734375, + "learning_rate": 1e-06, + "loss": 0.1494, + "num_tokens": 15758017.0, + "reward": -13.9990234375, + "reward_std": 6.432539463043213, + "rewards/rm_reward_func/mean": -13.9990234375, + "rewards/rm_reward_func/std": 8.954249382019043, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 318.40625, + "completions/mean_terminated_length": 318.40625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.7952, + "grad_norm": 1.9117544889450073, + "kl": 0.065185546875, + "learning_rate": 1e-06, + "loss": -0.0529, + "num_tokens": 15774006.0, + "reward": -1.8724365234375, + "reward_std": 5.881443023681641, + "rewards/rm_reward_func/mean": -1.8724365234375, + "rewards/rm_reward_func/std": 7.912093639373779, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 296.875, + "completions/mean_terminated_length": 257.03704833984375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.796, + "grad_norm": 2.0297462940216064, + "kl": 0.050628662109375, + "learning_rate": 1e-06, + "loss": 0.3345, + "num_tokens": 15785930.0, + "reward": -6.709716796875, + "reward_std": 7.219078063964844, + "rewards/rm_reward_func/mean": -6.709716796875, + "rewards/rm_reward_func/std": 8.17263412475586, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 291.3125, + "completions/mean_terminated_length": 250.44444274902344, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.7968, + "grad_norm": 1.8663793802261353, + "kl": 0.0833740234375, + "learning_rate": 1e-06, + "loss": -0.0608, + "num_tokens": 15800220.0, + "reward": -9.150802612304688, + "reward_std": 5.248953819274902, + "rewards/rm_reward_func/mean": -9.150802612304688, + "rewards/rm_reward_func/std": 5.429791450500488, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 220.03125, + "completions/mean_terminated_length": 220.03125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.7976, + "grad_norm": 2.772073745727539, + "kl": 0.0828857421875, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 15810061.0, + "reward": -5.68426513671875, + "reward_std": 6.693396091461182, + "rewards/rm_reward_func/mean": -5.68426513671875, + "rewards/rm_reward_func/std": 13.478750228881836, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 381.4375, + "completions/mean_terminated_length": 190.61538696289062, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.7984, + "grad_norm": 1.5646052360534668, + "kl": 0.075347900390625, + "learning_rate": 1e-06, + "loss": 0.1149, + "num_tokens": 15829171.0, + "reward": -15.673095703125, + "reward_std": 7.047809600830078, + "rewards/rm_reward_func/mean": -15.673095703125, + "rewards/rm_reward_func/std": 7.571465492248535, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 256.5625, + "completions/mean_terminated_length": 239.53334045410156, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.7992, + "grad_norm": 2.333911180496216, + "kl": 0.097900390625, + "learning_rate": 1e-06, + "loss": 0.0421, + "num_tokens": 15841837.0, + "reward": -3.9521560668945312, + "reward_std": 7.589486122131348, + "rewards/rm_reward_func/mean": -3.9521560668945312, + "rewards/rm_reward_func/std": 10.762330055236816, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 363.40625, + "completions/mean_terminated_length": 261.7368469238281, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.8, + "grad_norm": 2.2394330501556396, + "kl": 0.067962646484375, + "learning_rate": 1e-06, + "loss": -0.0173, + "num_tokens": 15855778.0, + "reward": -9.9951171875, + "reward_std": 3.4860429763793945, + "rewards/rm_reward_func/mean": -9.9951171875, + "rewards/rm_reward_func/std": 10.343985557556152, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 254.3125, + "completions/mean_terminated_length": 168.4166717529297, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8008, + "grad_norm": 2.8163135051727295, + "kl": 0.0916748046875, + "learning_rate": 1e-06, + "loss": -0.0379, + "num_tokens": 15869124.0, + "reward": -4.91015625, + "reward_std": 3.4444026947021484, + "rewards/rm_reward_func/mean": -4.91015625, + "rewards/rm_reward_func/std": 17.854663848876953, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 382.96875, + "completions/mean_terminated_length": 359.0740661621094, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.8016, + "grad_norm": 1.9579390287399292, + "kl": 0.0465087890625, + "learning_rate": 1e-06, + "loss": -0.0854, + "num_tokens": 15886035.0, + "reward": -5.937408447265625, + "reward_std": 6.331451416015625, + "rewards/rm_reward_func/mean": -5.937408447265625, + "rewards/rm_reward_func/std": 11.247023582458496, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 327.4375, + "completions/mean_terminated_length": 321.4838562011719, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.8024, + "grad_norm": 1.864088535308838, + "kl": 0.0556640625, + "learning_rate": 1e-06, + "loss": 0.0407, + "num_tokens": 15901369.0, + "reward": 1.1678466796875, + "reward_std": 7.161996841430664, + "rewards/rm_reward_func/mean": 1.1678466796875, + "rewards/rm_reward_func/std": 14.592074394226074, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 415.875, + "completions/mean_terminated_length": 350.1052551269531, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.8032, + "grad_norm": 1.42827308177948, + "kl": 0.04962158203125, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 15918765.0, + "reward": -6.79290771484375, + "reward_std": 7.461469650268555, + "rewards/rm_reward_func/mean": -6.79290771484375, + "rewards/rm_reward_func/std": 11.295293807983398, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 331.71875, + "completions/mean_terminated_length": 281.239990234375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.804, + "grad_norm": 1.9580515623092651, + "kl": 0.060791015625, + "learning_rate": 1e-06, + "loss": 0.0661, + "num_tokens": 15932900.0, + "reward": -2.8197021484375, + "reward_std": 4.197875022888184, + "rewards/rm_reward_func/mean": -2.8197021484375, + "rewards/rm_reward_func/std": 9.510029792785645, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 388.25, + "completions/mean_terminated_length": 347.0, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8048, + "grad_norm": 1.6206207275390625, + "kl": 0.05731201171875, + "learning_rate": 1e-06, + "loss": -0.092, + "num_tokens": 15951612.0, + "reward": 1.013824462890625, + "reward_std": 6.577977180480957, + "rewards/rm_reward_func/mean": 1.013824462890625, + "rewards/rm_reward_func/std": 9.395310401916504, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 243.0, + "completions/mean_terminated_length": 243.0, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.8056, + "grad_norm": 2.5836453437805176, + "kl": 0.1219482421875, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 15966028.0, + "reward": 8.94403076171875, + "reward_std": 5.605957984924316, + "rewards/rm_reward_func/mean": 8.94403076171875, + "rewards/rm_reward_func/std": 14.561261177062988, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 182.5625, + "completions/mean_terminated_length": 182.5625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.8064, + "grad_norm": 2.645958423614502, + "kl": 0.124755859375, + "learning_rate": 1e-06, + "loss": -0.0399, + "num_tokens": 15980742.0, + "reward": 10.03466796875, + "reward_std": 7.054365158081055, + "rewards/rm_reward_func/mean": 10.03466796875, + "rewards/rm_reward_func/std": 18.324739456176758, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 334.34375, + "completions/mean_terminated_length": 308.96429443359375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.8072, + "grad_norm": 1.7227952480316162, + "kl": 0.05535888671875, + "learning_rate": 1e-06, + "loss": -0.0178, + "num_tokens": 15999465.0, + "reward": -5.70513916015625, + "reward_std": 5.174798965454102, + "rewards/rm_reward_func/mean": -5.70513916015625, + "rewards/rm_reward_func/std": 9.141780853271484, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 362.25, + "completions/mean_terminated_length": 340.8571472167969, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.808, + "grad_norm": 1.576114296913147, + "kl": 0.05389404296875, + "learning_rate": 1e-06, + "loss": -0.063, + "num_tokens": 16013689.0, + "reward": 6.36151123046875, + "reward_std": 8.44892692565918, + "rewards/rm_reward_func/mean": 6.36151123046875, + "rewards/rm_reward_func/std": 12.840198516845703, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 301.53125, + "completions/mean_terminated_length": 301.53125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.8088, + "grad_norm": 1.9873569011688232, + "kl": 0.092742919921875, + "learning_rate": 1e-06, + "loss": -0.042, + "num_tokens": 16027794.0, + "reward": -0.2614021301269531, + "reward_std": 5.382135391235352, + "rewards/rm_reward_func/mean": -0.2614021301269531, + "rewards/rm_reward_func/std": 10.239026069641113, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 386.15625, + "completions/mean_terminated_length": 350.91998291015625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.8096, + "grad_norm": 1.572719931602478, + "kl": 0.066650390625, + "learning_rate": 1e-06, + "loss": 0.0469, + "num_tokens": 16047743.0, + "reward": -4.66168212890625, + "reward_std": 5.536433219909668, + "rewards/rm_reward_func/mean": -4.66168212890625, + "rewards/rm_reward_func/std": 11.767607688903809, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 291.34375, + "completions/mean_terminated_length": 284.2257995605469, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.8104, + "grad_norm": 2.0520529747009277, + "kl": 0.077392578125, + "learning_rate": 1e-06, + "loss": 0.0789, + "num_tokens": 16062642.0, + "reward": 2.512969970703125, + "reward_std": 4.955704689025879, + "rewards/rm_reward_func/mean": 2.512969970703125, + "rewards/rm_reward_func/std": 12.086182594299316, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 362.03125, + "completions/mean_terminated_length": 340.6071472167969, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.8112, + "grad_norm": 1.7673920392990112, + "kl": 0.0745849609375, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 16079963.0, + "reward": 1.136505126953125, + "reward_std": 4.737231731414795, + "rewards/rm_reward_func/mean": 1.136505126953125, + "rewards/rm_reward_func/std": 16.013450622558594, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 373.84375, + "completions/mean_terminated_length": 364.63336181640625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.812, + "grad_norm": 2.0377087593078613, + "kl": 0.075439453125, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 16097030.0, + "reward": 0.761962890625, + "reward_std": 4.88601016998291, + "rewards/rm_reward_func/mean": 0.761962890625, + "rewards/rm_reward_func/std": 7.866146564483643, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 371.375, + "completions/mean_terminated_length": 316.34783935546875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.8128, + "grad_norm": 1.8479177951812744, + "kl": 0.07464599609375, + "learning_rate": 1e-06, + "loss": -0.0267, + "num_tokens": 16113818.0, + "reward": -2.511932373046875, + "reward_std": 4.925314426422119, + "rewards/rm_reward_func/mean": -2.511932373046875, + "rewards/rm_reward_func/std": 11.58869457244873, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 319.9375, + "completions/mean_terminated_length": 313.7419128417969, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.8136, + "grad_norm": 1.5920718908309937, + "kl": 0.07037353515625, + "learning_rate": 1e-06, + "loss": -0.0146, + "num_tokens": 16132208.0, + "reward": 2.3974609375, + "reward_std": 5.030467987060547, + "rewards/rm_reward_func/mean": 2.3974609375, + "rewards/rm_reward_func/std": 18.256624221801758, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 290.5625, + "completions/mean_terminated_length": 189.9091033935547, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.8144, + "grad_norm": 5.38646936416626, + "kl": 0.11956787109375, + "learning_rate": 1e-06, + "loss": 0.069, + "num_tokens": 16147298.0, + "reward": -7.26385498046875, + "reward_std": 6.358008861541748, + "rewards/rm_reward_func/mean": -7.26385498046875, + "rewards/rm_reward_func/std": 10.757355690002441, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 385.28125, + "completions/mean_terminated_length": 335.6956481933594, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.8152, + "grad_norm": 1.428399920463562, + "kl": 0.053253173828125, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 16164739.0, + "reward": 4.42010498046875, + "reward_std": 7.538242340087891, + "rewards/rm_reward_func/mean": 4.42010498046875, + "rewards/rm_reward_func/std": 12.856751441955566, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 299.15625, + "completions/mean_terminated_length": 284.9666748046875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.816, + "grad_norm": 1.8140485286712646, + "kl": 0.08819580078125, + "learning_rate": 1e-06, + "loss": -0.0289, + "num_tokens": 16183272.0, + "reward": 5.366790771484375, + "reward_std": 5.528800964355469, + "rewards/rm_reward_func/mean": 5.366790771484375, + "rewards/rm_reward_func/std": 16.75286102294922, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 343.96875, + "completions/mean_terminated_length": 332.7666931152344, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.8168, + "grad_norm": 1.739495038986206, + "kl": 0.06005859375, + "learning_rate": 1e-06, + "loss": -0.0809, + "num_tokens": 16196063.0, + "reward": 1.968994140625, + "reward_std": 6.434878349304199, + "rewards/rm_reward_func/mean": 1.968994140625, + "rewards/rm_reward_func/std": 10.942986488342285, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 388.96875, + "completions/mean_terminated_length": 376.2413635253906, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.8176, + "grad_norm": 1.7111109495162964, + "kl": 0.09381103515625, + "learning_rate": 1e-06, + "loss": -0.0147, + "num_tokens": 16215414.0, + "reward": 7.9273681640625, + "reward_std": 5.260622501373291, + "rewards/rm_reward_func/mean": 7.9273681640625, + "rewards/rm_reward_func/std": 11.676657676696777, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 229.9375, + "completions/mean_terminated_length": 229.9375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.8184, + "grad_norm": 2.6079165935516357, + "kl": 0.1212158203125, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 16230356.0, + "reward": 1.617431640625, + "reward_std": 5.185896396636963, + "rewards/rm_reward_func/mean": 1.617431640625, + "rewards/rm_reward_func/std": 16.07063102722168, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 328.125, + "completions/mean_terminated_length": 244.5454559326172, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.8192, + "grad_norm": 2.0914688110351562, + "kl": 0.0684814453125, + "learning_rate": 1e-06, + "loss": 0.0328, + "num_tokens": 16245552.0, + "reward": -10.475830078125, + "reward_std": 4.746757507324219, + "rewards/rm_reward_func/mean": -10.475830078125, + "rewards/rm_reward_func/std": 9.18952465057373, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 397.75, + "completions/mean_terminated_length": 365.7599792480469, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.82, + "grad_norm": 1.56807279586792, + "kl": 0.10015869140625, + "learning_rate": 1e-06, + "loss": 0.063, + "num_tokens": 16266560.0, + "reward": -5.3624267578125, + "reward_std": 7.8521599769592285, + "rewards/rm_reward_func/mean": -5.3624267578125, + "rewards/rm_reward_func/std": 13.009211540222168, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 419.90625, + "completions/mean_terminated_length": 364.6499938964844, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.8208, + "grad_norm": 1.668832540512085, + "kl": 0.0499267578125, + "learning_rate": 1e-06, + "loss": 0.1235, + "num_tokens": 16283301.0, + "reward": -5.5810546875, + "reward_std": 9.804344177246094, + "rewards/rm_reward_func/mean": -5.5810546875, + "rewards/rm_reward_func/std": 14.529510498046875, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 422.5625, + "completions/mean_terminated_length": 381.9090881347656, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.8216, + "grad_norm": 1.7207919359207153, + "kl": 0.05462646484375, + "learning_rate": 1e-06, + "loss": -0.0706, + "num_tokens": 16300631.0, + "reward": -3.9527587890625, + "reward_std": 3.497464418411255, + "rewards/rm_reward_func/mean": -3.9527587890625, + "rewards/rm_reward_func/std": 7.815584182739258, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 367.96875, + "completions/mean_terminated_length": 292.5238037109375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.8224, + "grad_norm": 1.6002906560897827, + "kl": 0.046630859375, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 16319934.0, + "reward": 2.4716796875, + "reward_std": 5.260158538818359, + "rewards/rm_reward_func/mean": 2.4716796875, + "rewards/rm_reward_func/std": 14.948128700256348, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 276.5625, + "completions/mean_terminated_length": 242.9285888671875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.8232, + "grad_norm": 1.9021689891815186, + "kl": 0.125244140625, + "learning_rate": 1e-06, + "loss": 0.0323, + "num_tokens": 16338488.0, + "reward": 13.14996337890625, + "reward_std": 6.562317848205566, + "rewards/rm_reward_func/mean": 13.14996337890625, + "rewards/rm_reward_func/std": 14.118902206420898, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 379.28125, + "completions/mean_terminated_length": 360.3214416503906, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.824, + "grad_norm": 1.625343918800354, + "kl": 0.05035400390625, + "learning_rate": 1e-06, + "loss": -0.0564, + "num_tokens": 16354793.0, + "reward": -0.2197723388671875, + "reward_std": 6.220179557800293, + "rewards/rm_reward_func/mean": -0.2197723388671875, + "rewards/rm_reward_func/std": 7.623610496520996, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 371.3125, + "completions/mean_terminated_length": 345.2592468261719, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8248, + "grad_norm": 1.686112880706787, + "kl": 0.06500244140625, + "learning_rate": 1e-06, + "loss": -0.0308, + "num_tokens": 16373419.0, + "reward": 2.93994140625, + "reward_std": 5.604059219360352, + "rewards/rm_reward_func/mean": 2.93994140625, + "rewards/rm_reward_func/std": 16.731985092163086, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 312.875, + "completions/mean_terminated_length": 257.1199951171875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.8256, + "grad_norm": 1.9143481254577637, + "kl": 0.0755615234375, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 16388783.0, + "reward": -12.905609130859375, + "reward_std": 4.989555835723877, + "rewards/rm_reward_func/mean": -12.905609130859375, + "rewards/rm_reward_func/std": 7.252536296844482, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 355.28125, + "completions/mean_terminated_length": 311.3999938964844, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.8264, + "grad_norm": 1.6884747743606567, + "kl": 0.0567626953125, + "learning_rate": 1e-06, + "loss": -0.031, + "num_tokens": 16405632.0, + "reward": -11.318359375, + "reward_std": 3.094526767730713, + "rewards/rm_reward_func/mean": -11.318359375, + "rewards/rm_reward_func/std": 4.449491500854492, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 343.28125, + "completions/mean_terminated_length": 337.8387145996094, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.8272, + "grad_norm": 1.8399924039840698, + "kl": 0.05926513671875, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 16420025.0, + "reward": -2.153076171875, + "reward_std": 4.214644432067871, + "rewards/rm_reward_func/mean": -2.153076171875, + "rewards/rm_reward_func/std": 11.250146865844727, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 456.46875, + "completions/mean_terminated_length": 437.9583435058594, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.828, + "grad_norm": 1.5094772577285767, + "kl": 0.04022216796875, + "learning_rate": 1e-06, + "loss": -0.027, + "num_tokens": 16436976.0, + "reward": -2.4154815673828125, + "reward_std": 5.3466877937316895, + "rewards/rm_reward_func/mean": -2.4154815673828125, + "rewards/rm_reward_func/std": 12.02271842956543, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 332.09375, + "completions/mean_terminated_length": 332.09375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.8288, + "grad_norm": 1.7834326028823853, + "kl": 0.08721923828125, + "learning_rate": 1e-06, + "loss": -0.0193, + "num_tokens": 16452899.0, + "reward": -1.25048828125, + "reward_std": 3.818423271179199, + "rewards/rm_reward_func/mean": -1.25048828125, + "rewards/rm_reward_func/std": 13.085210800170898, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 278.9375, + "completions/mean_terminated_length": 254.8275909423828, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.8296, + "grad_norm": 1.9472944736480713, + "kl": 0.0792236328125, + "learning_rate": 1e-06, + "loss": 0.0912, + "num_tokens": 16467857.0, + "reward": 0.56591796875, + "reward_std": 7.540620803833008, + "rewards/rm_reward_func/mean": 0.56591796875, + "rewards/rm_reward_func/std": 18.89093589782715, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 312.90625, + "completions/mean_terminated_length": 292.3103332519531, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.8304, + "grad_norm": 2.2728517055511475, + "kl": 0.06500244140625, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 16480270.0, + "reward": -6.685546875, + "reward_std": 3.789708137512207, + "rewards/rm_reward_func/mean": -6.685546875, + "rewards/rm_reward_func/std": 6.050817012786865, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 361.71875, + "completions/mean_terminated_length": 319.6399841308594, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.8312, + "grad_norm": 1.6631767749786377, + "kl": 0.060394287109375, + "learning_rate": 1e-06, + "loss": -0.0683, + "num_tokens": 16496261.0, + "reward": -8.27288818359375, + "reward_std": 5.8831305503845215, + "rewards/rm_reward_func/mean": -8.27288818359375, + "rewards/rm_reward_func/std": 9.305617332458496, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 234.46875, + "completions/mean_terminated_length": 234.46875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.832, + "grad_norm": 2.1253116130828857, + "kl": 0.07049560546875, + "learning_rate": 1e-06, + "loss": -0.0629, + "num_tokens": 16507732.0, + "reward": -5.7918701171875, + "reward_std": 3.993044376373291, + "rewards/rm_reward_func/mean": -5.7918701171875, + "rewards/rm_reward_func/std": 6.80061149597168, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 269.15625, + "completions/mean_terminated_length": 224.1851806640625, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.8328, + "grad_norm": 2.9166905879974365, + "kl": 0.10565185546875, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 16524393.0, + "reward": 2.97930908203125, + "reward_std": 9.098430633544922, + "rewards/rm_reward_func/mean": 2.97930908203125, + "rewards/rm_reward_func/std": 11.424431800842285, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 353.9375, + "completions/mean_terminated_length": 343.4000244140625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.8336, + "grad_norm": 2.1423757076263428, + "kl": 0.07586669921875, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 16538463.0, + "reward": -0.7562255859375, + "reward_std": 8.681238174438477, + "rewards/rm_reward_func/mean": -0.7562255859375, + "rewards/rm_reward_func/std": 10.141227722167969, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 411.875, + "completions/mean_terminated_length": 393.3333435058594, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.8344, + "grad_norm": 1.6426948308944702, + "kl": 0.04595947265625, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 16553731.0, + "reward": -6.404052734375, + "reward_std": 6.105518341064453, + "rewards/rm_reward_func/mean": -6.404052734375, + "rewards/rm_reward_func/std": 10.995789527893066, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 405.46875, + "completions/mean_terminated_length": 357.04547119140625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.8352, + "grad_norm": 1.7646334171295166, + "kl": 0.06634521484375, + "learning_rate": 1e-06, + "loss": 0.0354, + "num_tokens": 16574586.0, + "reward": 0.4004058837890625, + "reward_std": 6.039009094238281, + "rewards/rm_reward_func/mean": 0.4004058837890625, + "rewards/rm_reward_func/std": 7.969948768615723, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 266.5, + "completions/mean_terminated_length": 266.5, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.836, + "grad_norm": 2.234823703765869, + "kl": 0.07666015625, + "learning_rate": 1e-06, + "loss": 0.1187, + "num_tokens": 16585362.0, + "reward": -9.371368408203125, + "reward_std": 4.3622026443481445, + "rewards/rm_reward_func/mean": -9.371368408203125, + "rewards/rm_reward_func/std": 9.075967788696289, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 272.9375, + "completions/mean_terminated_length": 238.7857208251953, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8368, + "grad_norm": 2.415853977203369, + "kl": 0.06976318359375, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 16596504.0, + "reward": -8.74237060546875, + "reward_std": 5.031957626342773, + "rewards/rm_reward_func/mean": -8.74237060546875, + "rewards/rm_reward_func/std": 8.675934791564941, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 265.0625, + "completions/mean_terminated_length": 265.0625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.8376, + "grad_norm": 2.048006534576416, + "kl": 0.1090087890625, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 16612714.0, + "reward": 13.726806640625, + "reward_std": 4.847775459289551, + "rewards/rm_reward_func/mean": 13.726806640625, + "rewards/rm_reward_func/std": 15.610601425170898, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 307.875, + "completions/mean_terminated_length": 278.71429443359375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.8384, + "grad_norm": 1.6287974119186401, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": -0.0313, + "num_tokens": 16630566.0, + "reward": 8.326171875, + "reward_std": 4.775975227355957, + "rewards/rm_reward_func/mean": 8.326171875, + "rewards/rm_reward_func/std": 21.441246032714844, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 208.6875, + "completions/mean_terminated_length": 208.6875, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8392, + "grad_norm": 3.099369764328003, + "kl": 0.13623046875, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 16644236.0, + "reward": 4.762725830078125, + "reward_std": 5.342302322387695, + "rewards/rm_reward_func/mean": 4.762725830078125, + "rewards/rm_reward_func/std": 9.763506889343262, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 283.0, + "completions/mean_terminated_length": 275.6128845214844, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.84, + "grad_norm": 2.37614107131958, + "kl": 0.0762939453125, + "learning_rate": 1e-06, + "loss": -0.0163, + "num_tokens": 16655508.0, + "reward": 3.20361328125, + "reward_std": 6.2384467124938965, + "rewards/rm_reward_func/mean": 3.20361328125, + "rewards/rm_reward_func/std": 15.467334747314453, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 393.9375, + "completions/mean_terminated_length": 347.7391357421875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.8408, + "grad_norm": 1.5033197402954102, + "kl": 0.0576171875, + "learning_rate": 1e-06, + "loss": 0.0674, + "num_tokens": 16671114.0, + "reward": 6.122802734375, + "reward_std": 5.325164318084717, + "rewards/rm_reward_func/mean": 6.122802734375, + "rewards/rm_reward_func/std": 21.951223373413086, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 361.78125, + "completions/mean_terminated_length": 351.7666931152344, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.8416, + "grad_norm": 1.6870393753051758, + "kl": 0.042724609375, + "learning_rate": 1e-06, + "loss": -0.0825, + "num_tokens": 16684499.0, + "reward": -1.7891731262207031, + "reward_std": 6.369847774505615, + "rewards/rm_reward_func/mean": -1.7891731262207031, + "rewards/rm_reward_func/std": 10.838322639465332, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 357.25, + "completions/mean_terminated_length": 305.66668701171875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.8424, + "grad_norm": 1.8656604290008545, + "kl": 0.0390625, + "learning_rate": 1e-06, + "loss": -0.018, + "num_tokens": 16697843.0, + "reward": 0.326812744140625, + "reward_std": 4.996870517730713, + "rewards/rm_reward_func/mean": 0.326812744140625, + "rewards/rm_reward_func/std": 7.947678565979004, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 243.03125, + "completions/mean_terminated_length": 243.03125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 0.8432, + "grad_norm": 2.8653292655944824, + "kl": 0.09893798828125, + "learning_rate": 1e-06, + "loss": 0.0368, + "num_tokens": 16707812.0, + "reward": -3.23150634765625, + "reward_std": 5.999335289001465, + "rewards/rm_reward_func/mean": -3.23150634765625, + "rewards/rm_reward_func/std": 11.447941780090332, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 348.71875, + "completions/mean_terminated_length": 284.8260803222656, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.844, + "grad_norm": 1.5015619993209839, + "kl": 0.0716552734375, + "learning_rate": 1e-06, + "loss": 0.26, + "num_tokens": 16728075.0, + "reward": -6.52734375, + "reward_std": 11.31596565246582, + "rewards/rm_reward_func/mean": -6.52734375, + "rewards/rm_reward_func/std": 16.228729248046875, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 433.78125, + "completions/mean_terminated_length": 364.76470947265625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.8448, + "grad_norm": 1.466704249382019, + "kl": 0.040435791015625, + "learning_rate": 1e-06, + "loss": -0.0536, + "num_tokens": 16745684.0, + "reward": -2.230712890625, + "reward_std": 5.94638729095459, + "rewards/rm_reward_func/mean": -2.230712890625, + "rewards/rm_reward_func/std": 12.65593147277832, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 400.5625, + "completions/mean_terminated_length": 369.3599853515625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.8456, + "grad_norm": 1.772912621498108, + "kl": 0.043731689453125, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 16761094.0, + "reward": -0.579833984375, + "reward_std": 5.782545566558838, + "rewards/rm_reward_func/mean": -0.579833984375, + "rewards/rm_reward_func/std": 12.686838150024414, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 276.21875, + "completions/mean_terminated_length": 276.21875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.8464, + "grad_norm": 2.001225233078003, + "kl": 0.067626953125, + "learning_rate": 1e-06, + "loss": 0.0234, + "num_tokens": 16777013.0, + "reward": 5.501708984375, + "reward_std": 4.32759428024292, + "rewards/rm_reward_func/mean": 5.501708984375, + "rewards/rm_reward_func/std": 11.539275169372559, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 342.5, + "completions/mean_terminated_length": 276.1739196777344, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 0.8472, + "grad_norm": 1.8316313028335571, + "kl": 0.05230712890625, + "learning_rate": 1e-06, + "loss": 0.0939, + "num_tokens": 16793413.0, + "reward": -6.962158203125, + "reward_std": 4.534675121307373, + "rewards/rm_reward_func/mean": -6.962158203125, + "rewards/rm_reward_func/std": 9.093425750732422, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 314.8125, + "completions/mean_terminated_length": 301.66668701171875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.848, + "grad_norm": 2.536829948425293, + "kl": 0.07025146484375, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 16808175.0, + "reward": 1.30810546875, + "reward_std": 7.037678241729736, + "rewards/rm_reward_func/mean": 1.30810546875, + "rewards/rm_reward_func/std": 7.801201820373535, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 241.21875, + "completions/mean_terminated_length": 232.48387145996094, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.8488, + "grad_norm": 2.69404673576355, + "kl": 0.0960693359375, + "learning_rate": 1e-06, + "loss": 0.147, + "num_tokens": 16818974.0, + "reward": -7.393390655517578, + "reward_std": 4.317598819732666, + "rewards/rm_reward_func/mean": -7.393390655517578, + "rewards/rm_reward_func/std": 5.862435340881348, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 370.53125, + "completions/mean_terminated_length": 330.91998291015625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.8496, + "grad_norm": 1.8768541812896729, + "kl": 0.0709228515625, + "learning_rate": 1e-06, + "loss": 0.0444, + "num_tokens": 16838063.0, + "reward": 3.635009765625, + "reward_std": 5.506108283996582, + "rewards/rm_reward_func/mean": 3.635009765625, + "rewards/rm_reward_func/std": 17.62849998474121, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 256.8125, + "completions/mean_terminated_length": 230.41378784179688, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.8504, + "grad_norm": 3.075516939163208, + "kl": 0.11358642578125, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 16852361.0, + "reward": -2.4738903045654297, + "reward_std": 2.6218044757843018, + "rewards/rm_reward_func/mean": -2.4738903045654297, + "rewards/rm_reward_func/std": 14.790396690368652, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 344.28125, + "completions/mean_terminated_length": 320.3214416503906, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.8512, + "grad_norm": 1.9647315740585327, + "kl": 0.06268310546875, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 16866818.0, + "reward": -7.043121337890625, + "reward_std": 6.3150153160095215, + "rewards/rm_reward_func/mean": -7.043121337890625, + "rewards/rm_reward_func/std": 7.116485118865967, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 297.96875, + "completions/mean_terminated_length": 258.3333435058594, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.852, + "grad_norm": 2.158475399017334, + "kl": 0.056060791015625, + "learning_rate": 1e-06, + "loss": -0.0986, + "num_tokens": 16877633.0, + "reward": -15.3370361328125, + "reward_std": 4.794503211975098, + "rewards/rm_reward_func/mean": -15.3370361328125, + "rewards/rm_reward_func/std": 9.14231014251709, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 303.84375, + "completions/mean_terminated_length": 289.9666748046875, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.8528, + "grad_norm": 3.385695219039917, + "kl": 0.05029296875, + "learning_rate": 1e-06, + "loss": 0.2779, + "num_tokens": 16891636.0, + "reward": 2.5072174072265625, + "reward_std": 7.175235748291016, + "rewards/rm_reward_func/mean": 2.5072174072265625, + "rewards/rm_reward_func/std": 8.558778762817383, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 367.0625, + "completions/mean_terminated_length": 318.75, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.8536, + "grad_norm": 1.8394227027893066, + "kl": 0.06146240234375, + "learning_rate": 1e-06, + "loss": 0.134, + "num_tokens": 16907750.0, + "reward": 2.1540603637695312, + "reward_std": 5.875840187072754, + "rewards/rm_reward_func/mean": 2.1540603637695312, + "rewards/rm_reward_func/std": 7.639623641967773, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 404.375, + "completions/mean_terminated_length": 355.4545593261719, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.8544, + "grad_norm": 1.373740553855896, + "kl": 0.05029296875, + "learning_rate": 1e-06, + "loss": 0.0374, + "num_tokens": 16927858.0, + "reward": 4.25262451171875, + "reward_std": 9.591477394104004, + "rewards/rm_reward_func/mean": 4.25262451171875, + "rewards/rm_reward_func/std": 16.014190673828125, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 376.0, + "completions/mean_terminated_length": 322.7826232910156, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.8552, + "grad_norm": 1.3425090312957764, + "kl": 0.08404541015625, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 16950826.0, + "reward": 2.2708892822265625, + "reward_std": 6.649087905883789, + "rewards/rm_reward_func/mean": 2.2708892822265625, + "rewards/rm_reward_func/std": 11.48779010772705, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 343.53125, + "completions/mean_terminated_length": 332.3000183105469, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.856, + "grad_norm": 1.7826794385910034, + "kl": 0.0701904296875, + "learning_rate": 1e-06, + "loss": -0.0124, + "num_tokens": 16965739.0, + "reward": 6.72235107421875, + "reward_std": 6.049109935760498, + "rewards/rm_reward_func/mean": 6.72235107421875, + "rewards/rm_reward_func/std": 16.85179901123047, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 269.5, + "completions/mean_terminated_length": 261.6773986816406, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8568, + "grad_norm": 1.9555034637451172, + "kl": 0.08056640625, + "learning_rate": 1e-06, + "loss": 0.1479, + "num_tokens": 16981299.0, + "reward": 4.0966796875, + "reward_std": 7.525038242340088, + "rewards/rm_reward_func/mean": 4.0966796875, + "rewards/rm_reward_func/std": 11.042428016662598, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 322.09375, + "completions/mean_terminated_length": 258.79168701171875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.8576, + "grad_norm": 2.188685178756714, + "kl": 0.067138671875, + "learning_rate": 1e-06, + "loss": 0.0682, + "num_tokens": 16999382.0, + "reward": -9.259239196777344, + "reward_std": 7.81141471862793, + "rewards/rm_reward_func/mean": -9.259239196777344, + "rewards/rm_reward_func/std": 14.85389232635498, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 269.03125, + "completions/mean_terminated_length": 234.32144165039062, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.8584, + "grad_norm": 2.190272569656372, + "kl": 0.0572509765625, + "learning_rate": 1e-06, + "loss": -0.0217, + "num_tokens": 17013239.0, + "reward": -8.312744140625, + "reward_std": 5.52623987197876, + "rewards/rm_reward_func/mean": -8.312744140625, + "rewards/rm_reward_func/std": 5.812991619110107, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 376.09375, + "completions/mean_terminated_length": 338.03997802734375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.8592, + "grad_norm": 1.5942208766937256, + "kl": 0.0531005859375, + "learning_rate": 1e-06, + "loss": -0.0177, + "num_tokens": 17028898.0, + "reward": -3.8048095703125, + "reward_std": 5.167942523956299, + "rewards/rm_reward_func/mean": -3.8048095703125, + "rewards/rm_reward_func/std": 7.143012523651123, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 439.34375, + "completions/mean_terminated_length": 382.8333435058594, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.86, + "grad_norm": 1.7428988218307495, + "kl": 0.043701171875, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 17044717.0, + "reward": -5.0797119140625, + "reward_std": 9.10765266418457, + "rewards/rm_reward_func/mean": -5.0797119140625, + "rewards/rm_reward_func/std": 18.70339584350586, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 408.875, + "completions/mean_terminated_length": 368.5217590332031, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.8608, + "grad_norm": 1.7409586906433105, + "kl": 0.055419921875, + "learning_rate": 1e-06, + "loss": -0.1125, + "num_tokens": 17061345.0, + "reward": -2.56256103515625, + "reward_std": 5.845050811767578, + "rewards/rm_reward_func/mean": -2.56256103515625, + "rewards/rm_reward_func/std": 7.789404392242432, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 322.25, + "completions/mean_terminated_length": 295.14288330078125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.8616, + "grad_norm": 1.5859686136245728, + "kl": 0.075469970703125, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 17080601.0, + "reward": 6.177734375, + "reward_std": 7.394063949584961, + "rewards/rm_reward_func/mean": 6.177734375, + "rewards/rm_reward_func/std": 20.439453125, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 324.53125, + "completions/mean_terminated_length": 251.17391967773438, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.8624, + "grad_norm": 1.9152007102966309, + "kl": 0.08935546875, + "learning_rate": 1e-06, + "loss": 0.1256, + "num_tokens": 17099962.0, + "reward": -1.997802734375, + "reward_std": 6.696859359741211, + "rewards/rm_reward_func/mean": -1.997802734375, + "rewards/rm_reward_func/std": 8.248697280883789, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 351.53125, + "completions/mean_terminated_length": 351.53125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.8632, + "grad_norm": 1.8813602924346924, + "kl": 0.07415771484375, + "learning_rate": 1e-06, + "loss": -0.0646, + "num_tokens": 17116579.0, + "reward": 7.39404296875, + "reward_std": 6.781589508056641, + "rewards/rm_reward_func/mean": 7.39404296875, + "rewards/rm_reward_func/std": 11.871100425720215, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 321.1875, + "completions/mean_terminated_length": 293.9285888671875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.864, + "grad_norm": 1.54855477809906, + "kl": 0.0518798828125, + "learning_rate": 1e-06, + "loss": -0.1098, + "num_tokens": 17133881.0, + "reward": -4.786782264709473, + "reward_std": 5.430292129516602, + "rewards/rm_reward_func/mean": -4.786782264709473, + "rewards/rm_reward_func/std": 10.012991905212402, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 387.625, + "completions/mean_terminated_length": 346.16668701171875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.8648, + "grad_norm": 1.4348578453063965, + "kl": 0.075531005859375, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 17157269.0, + "reward": 6.763519287109375, + "reward_std": 5.296229839324951, + "rewards/rm_reward_func/mean": 6.763519287109375, + "rewards/rm_reward_func/std": 17.171924591064453, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 402.46875, + "completions/mean_terminated_length": 336.75, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.8656, + "grad_norm": 1.5684152841567993, + "kl": 0.05010986328125, + "learning_rate": 1e-06, + "loss": 0.1279, + "num_tokens": 17174452.0, + "reward": -11.177057266235352, + "reward_std": 4.580845832824707, + "rewards/rm_reward_func/mean": -11.177057266235352, + "rewards/rm_reward_func/std": 6.919641494750977, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 262.09375, + "completions/mean_terminated_length": 245.433349609375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8664, + "grad_norm": 2.075847864151001, + "kl": 0.09521484375, + "learning_rate": 1e-06, + "loss": -0.0218, + "num_tokens": 17188127.0, + "reward": -2.570526123046875, + "reward_std": 8.268798828125, + "rewards/rm_reward_func/mean": -2.570526123046875, + "rewards/rm_reward_func/std": 10.402847290039062, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 271.65625, + "completions/mean_terminated_length": 216.19232177734375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.8672, + "grad_norm": 2.8243250846862793, + "kl": 0.10247802734375, + "learning_rate": 1e-06, + "loss": -0.0281, + "num_tokens": 17202708.0, + "reward": 4.91973876953125, + "reward_std": 3.5849428176879883, + "rewards/rm_reward_func/mean": 4.91973876953125, + "rewards/rm_reward_func/std": 16.08811378479004, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 358.0625, + "completions/mean_terminated_length": 342.137939453125, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.868, + "grad_norm": 1.7685139179229736, + "kl": 0.04290771484375, + "learning_rate": 1e-06, + "loss": 0.0372, + "num_tokens": 17219278.0, + "reward": 2.25048828125, + "reward_std": 6.024899482727051, + "rewards/rm_reward_func/mean": 2.25048828125, + "rewards/rm_reward_func/std": 13.552252769470215, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 381.8125, + "completions/mean_terminated_length": 280.5555725097656, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.8688, + "grad_norm": 1.8544764518737793, + "kl": 0.09918212890625, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 17239344.0, + "reward": -4.3543701171875, + "reward_std": 6.867346286773682, + "rewards/rm_reward_func/mean": -4.3543701171875, + "rewards/rm_reward_func/std": 18.318862915039062, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 343.25, + "completions/mean_terminated_length": 296.0, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.8696, + "grad_norm": 1.958099365234375, + "kl": 0.07861328125, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 17258752.0, + "reward": 4.333587646484375, + "reward_std": 7.4507646560668945, + "rewards/rm_reward_func/mean": 4.333587646484375, + "rewards/rm_reward_func/std": 16.062524795532227, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 363.875, + "completions/mean_terminated_length": 314.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.8704, + "grad_norm": 2.0110268592834473, + "kl": 0.0440673828125, + "learning_rate": 1e-06, + "loss": 0.2111, + "num_tokens": 17275332.0, + "reward": -7.1995849609375, + "reward_std": 4.988054275512695, + "rewards/rm_reward_func/mean": -7.1995849609375, + "rewards/rm_reward_func/std": 8.222111701965332, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 304.6875, + "completions/mean_terminated_length": 275.0714416503906, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.8712, + "grad_norm": 2.5539073944091797, + "kl": 0.1864013671875, + "learning_rate": 1e-06, + "loss": 0.0254, + "num_tokens": 17296858.0, + "reward": 3.484619140625, + "reward_std": 6.507828712463379, + "rewards/rm_reward_func/mean": 3.484619140625, + "rewards/rm_reward_func/std": 13.895406723022461, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 322.90625, + "completions/mean_terminated_length": 279.26922607421875, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.872, + "grad_norm": 2.0791800022125244, + "kl": 0.08416748046875, + "learning_rate": 1e-06, + "loss": 0.1362, + "num_tokens": 17312399.0, + "reward": 0.58026123046875, + "reward_std": 4.362951278686523, + "rewards/rm_reward_func/mean": 0.58026123046875, + "rewards/rm_reward_func/std": 17.155956268310547, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 350.03125, + "completions/mean_terminated_length": 312.65386962890625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.8728, + "grad_norm": 1.846316933631897, + "kl": 0.069580078125, + "learning_rate": 1e-06, + "loss": -0.0667, + "num_tokens": 17328920.0, + "reward": 3.18212890625, + "reward_std": 5.347060203552246, + "rewards/rm_reward_func/mean": 3.18212890625, + "rewards/rm_reward_func/std": 15.348078727722168, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 253.3125, + "completions/mean_terminated_length": 236.06668090820312, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.8736, + "grad_norm": 2.0793819427490234, + "kl": 0.06634521484375, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 17339330.0, + "reward": -4.01171875, + "reward_std": 9.31304931640625, + "rewards/rm_reward_func/mean": -4.01171875, + "rewards/rm_reward_func/std": 14.479862213134766, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 328.03125, + "completions/mean_terminated_length": 328.03125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.8744, + "grad_norm": 1.6161627769470215, + "kl": 0.0771484375, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 17357203.0, + "reward": 9.56640625, + "reward_std": 3.058387041091919, + "rewards/rm_reward_func/mean": 9.56640625, + "rewards/rm_reward_func/std": 22.18265151977539, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 275.28125, + "completions/mean_terminated_length": 275.28125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.8752, + "grad_norm": 1.9762084484100342, + "kl": 0.0865478515625, + "learning_rate": 1e-06, + "loss": -0.0204, + "num_tokens": 17371220.0, + "reward": 9.5074462890625, + "reward_std": 4.592803001403809, + "rewards/rm_reward_func/mean": 9.5074462890625, + "rewards/rm_reward_func/std": 8.343945503234863, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 277.4375, + "completions/mean_terminated_length": 261.8000183105469, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.876, + "grad_norm": 2.13796329498291, + "kl": 0.07037353515625, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 17382114.0, + "reward": -1.20977783203125, + "reward_std": 6.088901519775391, + "rewards/rm_reward_func/mean": -1.20977783203125, + "rewards/rm_reward_func/std": 18.771263122558594, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 307.90625, + "completions/mean_terminated_length": 260.8077087402344, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8768, + "grad_norm": 4.1955718994140625, + "kl": 0.08642578125, + "learning_rate": 1e-06, + "loss": 0.3399, + "num_tokens": 17394583.0, + "reward": -6.243194580078125, + "reward_std": 4.6278977394104, + "rewards/rm_reward_func/mean": -6.243194580078125, + "rewards/rm_reward_func/std": 7.7522664070129395, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 465.1875, + "completions/mean_terminated_length": 396.7692565917969, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.8776, + "grad_norm": 1.6034258604049683, + "kl": 0.047027587890625, + "learning_rate": 1e-06, + "loss": -0.0292, + "num_tokens": 17412229.0, + "reward": -5.2532958984375, + "reward_std": 3.8931074142456055, + "rewards/rm_reward_func/mean": -5.2532958984375, + "rewards/rm_reward_func/std": 6.627257823944092, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 271.375, + "completions/mean_terminated_length": 226.8148193359375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.8784, + "grad_norm": 2.2611372470855713, + "kl": 0.0994873046875, + "learning_rate": 1e-06, + "loss": 0.1444, + "num_tokens": 17427001.0, + "reward": -4.86248779296875, + "reward_std": 4.061513900756836, + "rewards/rm_reward_func/mean": -4.86248779296875, + "rewards/rm_reward_func/std": 7.475869178771973, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 156.09375, + "completions/mean_terminated_length": 156.09375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8792, + "grad_norm": 4.113502502441406, + "kl": 0.1251220703125, + "learning_rate": 1e-06, + "loss": -0.0181, + "num_tokens": 17435348.0, + "reward": 5.791015625, + "reward_std": 4.625732421875, + "rewards/rm_reward_func/mean": 5.791015625, + "rewards/rm_reward_func/std": 14.378396987915039, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 370.53125, + "completions/mean_terminated_length": 306.227294921875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.88, + "grad_norm": 1.8165894746780396, + "kl": 0.0643310546875, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 17449517.0, + "reward": 6.808362007141113, + "reward_std": 4.4656853675842285, + "rewards/rm_reward_func/mean": 6.808362007141113, + "rewards/rm_reward_func/std": 19.509113311767578, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 263.6875, + "completions/mean_terminated_length": 206.38462829589844, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.8808, + "grad_norm": 2.0981500148773193, + "kl": 0.10498046875, + "learning_rate": 1e-06, + "loss": 0.1016, + "num_tokens": 17465555.0, + "reward": -2.56903076171875, + "reward_std": 8.292194366455078, + "rewards/rm_reward_func/mean": -2.56903076171875, + "rewards/rm_reward_func/std": 12.846428871154785, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 294.15625, + "completions/mean_terminated_length": 243.88462829589844, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.8816, + "grad_norm": 2.171806573867798, + "kl": 0.069580078125, + "learning_rate": 1e-06, + "loss": 0.3434, + "num_tokens": 17479896.0, + "reward": -12.5537109375, + "reward_std": 11.227779388427734, + "rewards/rm_reward_func/mean": -12.5537109375, + "rewards/rm_reward_func/std": 11.74368953704834, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 208.3125, + "completions/mean_terminated_length": 198.51612854003906, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.8824, + "grad_norm": 2.4335920810699463, + "kl": 0.1173095703125, + "learning_rate": 1e-06, + "loss": 0.113, + "num_tokens": 17490290.0, + "reward": 5.0563812255859375, + "reward_std": 6.061530113220215, + "rewards/rm_reward_func/mean": 5.0563812255859375, + "rewards/rm_reward_func/std": 19.05927085876465, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 256.65625, + "completions/mean_terminated_length": 248.41934204101562, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.8832, + "grad_norm": 2.5260488986968994, + "kl": 0.06396484375, + "learning_rate": 1e-06, + "loss": -0.0632, + "num_tokens": 17500639.0, + "reward": -10.673934936523438, + "reward_std": 5.530281066894531, + "rewards/rm_reward_func/mean": -10.673934936523438, + "rewards/rm_reward_func/std": 6.325264930725098, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 343.15625, + "completions/mean_terminated_length": 304.19232177734375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.884, + "grad_norm": 1.3845207691192627, + "kl": 0.06768798828125, + "learning_rate": 1e-06, + "loss": -0.0533, + "num_tokens": 17524388.0, + "reward": 4.716976165771484, + "reward_std": 7.782622337341309, + "rewards/rm_reward_func/mean": 4.716976165771484, + "rewards/rm_reward_func/std": 18.12108612060547, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 293.90625, + "completions/mean_terminated_length": 279.3666687011719, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.8848, + "grad_norm": 3.2154455184936523, + "kl": 0.09893798828125, + "learning_rate": 1e-06, + "loss": 0.2118, + "num_tokens": 17539825.0, + "reward": 4.46380615234375, + "reward_std": 7.844593524932861, + "rewards/rm_reward_func/mean": 4.46380615234375, + "rewards/rm_reward_func/std": 16.142616271972656, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 396.0625, + "completions/mean_terminated_length": 369.3077087402344, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.8856, + "grad_norm": 1.842594861984253, + "kl": 0.05828857421875, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 17556211.0, + "reward": -7.06494140625, + "reward_std": 5.258692741394043, + "rewards/rm_reward_func/mean": -7.06494140625, + "rewards/rm_reward_func/std": 12.971417427062988, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 310.0625, + "completions/mean_terminated_length": 272.6666564941406, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.8864, + "grad_norm": 2.240687847137451, + "kl": 0.08087158203125, + "learning_rate": 1e-06, + "loss": 0.2258, + "num_tokens": 17572173.0, + "reward": -3.089935302734375, + "reward_std": 5.76326847076416, + "rewards/rm_reward_func/mean": -3.089935302734375, + "rewards/rm_reward_func/std": 8.40343189239502, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 291.6875, + "completions/mean_terminated_length": 250.88888549804688, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.8872, + "grad_norm": 2.050447702407837, + "kl": 0.07476806640625, + "learning_rate": 1e-06, + "loss": 0.1391, + "num_tokens": 17587475.0, + "reward": -0.385009765625, + "reward_std": 4.082517623901367, + "rewards/rm_reward_func/mean": -0.385009765625, + "rewards/rm_reward_func/std": 15.49717903137207, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 339.25, + "completions/mean_terminated_length": 290.8800048828125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.888, + "grad_norm": 2.0064468383789062, + "kl": 0.0640869140625, + "learning_rate": 1e-06, + "loss": 0.0641, + "num_tokens": 17601683.0, + "reward": -1.9089641571044922, + "reward_std": 9.604089736938477, + "rewards/rm_reward_func/mean": -1.9089641571044922, + "rewards/rm_reward_func/std": 11.033252716064453, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 376.0625, + "completions/mean_terminated_length": 338.0, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.8888, + "grad_norm": 1.8694170713424683, + "kl": 0.0479736328125, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 17619109.0, + "reward": -4.768796920776367, + "reward_std": 5.166200637817383, + "rewards/rm_reward_func/mean": -4.768796920776367, + "rewards/rm_reward_func/std": 9.178638458251953, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 449.78125, + "completions/mean_terminated_length": 417.19049072265625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.8896, + "grad_norm": 1.5631797313690186, + "kl": 0.04412841796875, + "learning_rate": 1e-06, + "loss": -0.0791, + "num_tokens": 17640326.0, + "reward": -7.66705322265625, + "reward_std": 4.925333023071289, + "rewards/rm_reward_func/mean": -7.66705322265625, + "rewards/rm_reward_func/std": 10.646611213684082, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 241.125, + "completions/mean_terminated_length": 223.06668090820312, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.8904, + "grad_norm": 3.044036865234375, + "kl": 0.07659912109375, + "learning_rate": 1e-06, + "loss": 0.1482, + "num_tokens": 17649866.0, + "reward": -4.82183837890625, + "reward_std": 4.639820098876953, + "rewards/rm_reward_func/mean": -4.82183837890625, + "rewards/rm_reward_func/std": 7.2421464920043945, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 374.3125, + "completions/mean_terminated_length": 360.0689697265625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.8912, + "grad_norm": 1.898924469947815, + "kl": 0.059814453125, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 17667556.0, + "reward": -5.2734375, + "reward_std": 3.365342140197754, + "rewards/rm_reward_func/mean": -5.2734375, + "rewards/rm_reward_func/std": 4.226077556610107, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 295.65625, + "completions/mean_terminated_length": 273.2758483886719, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.892, + "grad_norm": 2.6310179233551025, + "kl": 0.08929443359375, + "learning_rate": 1e-06, + "loss": -0.0364, + "num_tokens": 17681449.0, + "reward": 4.1010284423828125, + "reward_std": 5.437978744506836, + "rewards/rm_reward_func/mean": 4.1010284423828125, + "rewards/rm_reward_func/std": 18.356325149536133, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 348.875, + "completions/mean_terminated_length": 318.6666564941406, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.8928, + "grad_norm": 1.5374693870544434, + "kl": 0.10076904296875, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 17704997.0, + "reward": 6.69610595703125, + "reward_std": 6.900143623352051, + "rewards/rm_reward_func/mean": 6.69610595703125, + "rewards/rm_reward_func/std": 19.937532424926758, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 340.53125, + "completions/mean_terminated_length": 316.0357360839844, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.8936, + "grad_norm": 1.6386735439300537, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": -0.0716, + "num_tokens": 17723486.0, + "reward": 6.125732421875, + "reward_std": 7.932873249053955, + "rewards/rm_reward_func/mean": 6.125732421875, + "rewards/rm_reward_func/std": 15.267772674560547, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 397.15625, + "completions/mean_terminated_length": 352.2174072265625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.8944, + "grad_norm": 1.7222517728805542, + "kl": 0.071044921875, + "learning_rate": 1e-06, + "loss": 0.048, + "num_tokens": 17740963.0, + "reward": 6.173095703125, + "reward_std": 7.970644950866699, + "rewards/rm_reward_func/mean": 6.173095703125, + "rewards/rm_reward_func/std": 13.987817764282227, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 318.0, + "completions/mean_terminated_length": 282.0740661621094, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.8952, + "grad_norm": 1.919953465461731, + "kl": 0.06915283203125, + "learning_rate": 1e-06, + "loss": -0.0355, + "num_tokens": 17756299.0, + "reward": -0.48681640625, + "reward_std": 5.74087381362915, + "rewards/rm_reward_func/mean": -0.48681640625, + "rewards/rm_reward_func/std": 14.732064247131348, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 222.78125, + "completions/mean_terminated_length": 222.78125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.896, + "grad_norm": 2.096815586090088, + "kl": 0.08251953125, + "learning_rate": 1e-06, + "loss": -0.0809, + "num_tokens": 17767804.0, + "reward": 9.5291748046875, + "reward_std": 4.593903541564941, + "rewards/rm_reward_func/mean": 9.5291748046875, + "rewards/rm_reward_func/std": 18.932607650756836, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 412.75, + "completions/mean_terminated_length": 373.9130554199219, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.8968, + "grad_norm": 1.6090526580810547, + "kl": 0.038787841796875, + "learning_rate": 1e-06, + "loss": -0.0929, + "num_tokens": 17785828.0, + "reward": -2.448760986328125, + "reward_std": 6.6045637130737305, + "rewards/rm_reward_func/mean": -2.448760986328125, + "rewards/rm_reward_func/std": 8.59542465209961, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 377.78125, + "completions/mean_terminated_length": 243.5625, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.8976, + "grad_norm": 2.167140483856201, + "kl": 0.04473876953125, + "learning_rate": 1e-06, + "loss": 0.2417, + "num_tokens": 17804173.0, + "reward": -9.55291748046875, + "reward_std": 5.804081916809082, + "rewards/rm_reward_func/mean": -9.55291748046875, + "rewards/rm_reward_func/std": 9.330775260925293, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 359.625, + "completions/mean_terminated_length": 308.8333435058594, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.8984, + "grad_norm": 1.7016032934188843, + "kl": 0.0684814453125, + "learning_rate": 1e-06, + "loss": -0.0813, + "num_tokens": 17820913.0, + "reward": 8.431106567382812, + "reward_std": 8.720551490783691, + "rewards/rm_reward_func/mean": 8.431106567382812, + "rewards/rm_reward_func/std": 19.23511505126953, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 265.84375, + "completions/mean_terminated_length": 240.37930297851562, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.8992, + "grad_norm": 2.0276615619659424, + "kl": 0.10687255859375, + "learning_rate": 1e-06, + "loss": -0.0958, + "num_tokens": 17838580.0, + "reward": -2.802215576171875, + "reward_std": 7.847016334533691, + "rewards/rm_reward_func/mean": -2.802215576171875, + "rewards/rm_reward_func/std": 11.67371654510498, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 266.875, + "completions/mean_terminated_length": 231.85714721679688, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 0.9, + "grad_norm": 2.3680171966552734, + "kl": 0.093017578125, + "learning_rate": 1e-06, + "loss": 0.1025, + "num_tokens": 17849416.0, + "reward": -10.891845703125, + "reward_std": 7.265751838684082, + "rewards/rm_reward_func/mean": -10.891845703125, + "rewards/rm_reward_func/std": 8.970508575439453, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 245.0, + "completions/mean_terminated_length": 245.0, + "completions/min_length": 30.0, + "completions/min_terminated_length": 30.0, + "epoch": 0.9008, + "grad_norm": 3.174072504043579, + "kl": 0.13861083984375, + "learning_rate": 1e-06, + "loss": -0.0141, + "num_tokens": 17863936.0, + "reward": 3.42138671875, + "reward_std": 3.8966920375823975, + "rewards/rm_reward_func/mean": 3.42138671875, + "rewards/rm_reward_func/std": 7.419936656951904, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 368.84375, + "completions/mean_terminated_length": 312.8260803222656, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.9016, + "grad_norm": 1.8501002788543701, + "kl": 0.04547119140625, + "learning_rate": 1e-06, + "loss": 0.097, + "num_tokens": 17878387.0, + "reward": 1.4113006591796875, + "reward_std": 7.580566883087158, + "rewards/rm_reward_func/mean": 1.4113006591796875, + "rewards/rm_reward_func/std": 11.237761497497559, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 340.90625, + "completions/mean_terminated_length": 251.2857208251953, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.9024, + "grad_norm": 2.0635948181152344, + "kl": 0.055084228515625, + "learning_rate": 1e-06, + "loss": 0.1201, + "num_tokens": 17891544.0, + "reward": -10.33984375, + "reward_std": 8.512529373168945, + "rewards/rm_reward_func/mean": -10.33984375, + "rewards/rm_reward_func/std": 12.727262496948242, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 245.4375, + "completions/mean_terminated_length": 245.4375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.9032, + "grad_norm": 3.380964994430542, + "kl": 0.11627197265625, + "learning_rate": 1e-06, + "loss": 0.0535, + "num_tokens": 17903126.0, + "reward": 3.5533447265625, + "reward_std": 3.8524515628814697, + "rewards/rm_reward_func/mean": 3.5533447265625, + "rewards/rm_reward_func/std": 12.030610084533691, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 323.78125, + "completions/mean_terminated_length": 304.3103332519531, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.904, + "grad_norm": 1.8443655967712402, + "kl": 0.07342529296875, + "learning_rate": 1e-06, + "loss": -0.0716, + "num_tokens": 17920311.0, + "reward": 5.91522216796875, + "reward_std": 5.141403675079346, + "rewards/rm_reward_func/mean": 5.91522216796875, + "rewards/rm_reward_func/std": 17.084081649780273, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 246.5, + "completions/mean_terminated_length": 237.9354705810547, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.9048, + "grad_norm": 2.729099750518799, + "kl": 0.0924072265625, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 17933087.0, + "reward": -5.018798828125, + "reward_std": 4.45077657699585, + "rewards/rm_reward_func/mean": -5.018798828125, + "rewards/rm_reward_func/std": 7.457932472229004, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 313.6875, + "completions/mean_terminated_length": 285.3571472167969, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "epoch": 0.9056, + "grad_norm": 1.6784160137176514, + "kl": 0.0491943359375, + "learning_rate": 1e-06, + "loss": 0.0874, + "num_tokens": 17944861.0, + "reward": -3.11517333984375, + "reward_std": 8.746601104736328, + "rewards/rm_reward_func/mean": -3.11517333984375, + "rewards/rm_reward_func/std": 14.569607734680176, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 314.75, + "completions/mean_terminated_length": 286.5714416503906, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.9064, + "grad_norm": 1.9443410634994507, + "kl": 0.0731201171875, + "learning_rate": 1e-06, + "loss": -0.0975, + "num_tokens": 17957541.0, + "reward": 4.2359619140625, + "reward_std": 5.690091133117676, + "rewards/rm_reward_func/mean": 4.2359619140625, + "rewards/rm_reward_func/std": 19.615888595581055, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 258.21875, + "completions/mean_terminated_length": 258.21875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.9072, + "grad_norm": 2.292943239212036, + "kl": 0.05712890625, + "learning_rate": 1e-06, + "loss": 0.0613, + "num_tokens": 17970228.0, + "reward": -0.2470703125, + "reward_std": 3.4264345169067383, + "rewards/rm_reward_func/mean": -0.2470703125, + "rewards/rm_reward_func/std": 14.349126815795898, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 335.625, + "completions/mean_terminated_length": 294.923095703125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.908, + "grad_norm": 1.8512630462646484, + "kl": 0.05194091796875, + "learning_rate": 1e-06, + "loss": -0.0652, + "num_tokens": 17984792.0, + "reward": 4.363029479980469, + "reward_std": 10.35328483581543, + "rewards/rm_reward_func/mean": 4.363029479980469, + "rewards/rm_reward_func/std": 10.560738563537598, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 342.3125, + "completions/mean_terminated_length": 331.0000305175781, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.9088, + "grad_norm": 1.826686143875122, + "kl": 0.05206298828125, + "learning_rate": 1e-06, + "loss": -0.083, + "num_tokens": 17998082.0, + "reward": -0.5994186401367188, + "reward_std": 5.863535404205322, + "rewards/rm_reward_func/mean": -0.5994186401367188, + "rewards/rm_reward_func/std": 8.127443313598633, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 402.375, + "completions/mean_terminated_length": 327.3684387207031, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.9096, + "grad_norm": 2.1141932010650635, + "kl": 0.0447998046875, + "learning_rate": 1e-06, + "loss": 0.2183, + "num_tokens": 18016974.0, + "reward": 0.2315673828125, + "reward_std": 7.783949851989746, + "rewards/rm_reward_func/mean": 0.2315673828125, + "rewards/rm_reward_func/std": 17.63456153869629, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 307.1875, + "completions/mean_terminated_length": 126.47058868408203, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.9104, + "grad_norm": 2.9265241622924805, + "kl": 0.107330322265625, + "learning_rate": 1e-06, + "loss": -0.0658, + "num_tokens": 18032940.0, + "reward": -4.1884765625, + "reward_std": 4.123441219329834, + "rewards/rm_reward_func/mean": -4.1884765625, + "rewards/rm_reward_func/std": 13.327445983886719, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 335.9375, + "completions/mean_terminated_length": 303.3333435058594, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.9112, + "grad_norm": 2.735564947128296, + "kl": 0.06890869140625, + "learning_rate": 1e-06, + "loss": 0.189, + "num_tokens": 18046058.0, + "reward": -1.1719970703125, + "reward_std": 5.4717512130737305, + "rewards/rm_reward_func/mean": -1.1719970703125, + "rewards/rm_reward_func/std": 7.992931842803955, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 358.0625, + "completions/mean_terminated_length": 306.75, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.912, + "grad_norm": 1.7399682998657227, + "kl": 0.064453125, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 18063004.0, + "reward": 3.93359375, + "reward_std": 4.157954216003418, + "rewards/rm_reward_func/mean": 3.93359375, + "rewards/rm_reward_func/std": 20.801462173461914, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 371.8125, + "completions/mean_terminated_length": 316.9565124511719, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.9128, + "grad_norm": 1.757383108139038, + "kl": 0.06927490234375, + "learning_rate": 1e-06, + "loss": 0.0321, + "num_tokens": 18083646.0, + "reward": 0.1849365234375, + "reward_std": 4.613014221191406, + "rewards/rm_reward_func/mean": 0.1849365234375, + "rewards/rm_reward_func/std": 21.08102035522461, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 269.96875, + "completions/mean_terminated_length": 202.1999969482422, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.9136, + "grad_norm": 2.4788358211517334, + "kl": 0.0633544921875, + "learning_rate": 1e-06, + "loss": 0.3112, + "num_tokens": 18098213.0, + "reward": -4.2085723876953125, + "reward_std": 6.648910999298096, + "rewards/rm_reward_func/mean": -4.2085723876953125, + "rewards/rm_reward_func/std": 6.716778755187988, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 356.40625, + "completions/mean_terminated_length": 320.5, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.9144, + "grad_norm": 1.7559200525283813, + "kl": 0.0590972900390625, + "learning_rate": 1e-06, + "loss": -0.037, + "num_tokens": 18116490.0, + "reward": -1.7566566467285156, + "reward_std": 6.10938835144043, + "rewards/rm_reward_func/mean": -1.7566566467285156, + "rewards/rm_reward_func/std": 14.643930435180664, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 409.21875, + "completions/mean_terminated_length": 380.44000244140625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.9152, + "grad_norm": 1.6537233591079712, + "kl": 0.044769287109375, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 18131561.0, + "reward": -2.463897705078125, + "reward_std": 5.601576805114746, + "rewards/rm_reward_func/mean": -2.463897705078125, + "rewards/rm_reward_func/std": 11.792763710021973, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 274.59375, + "completions/mean_terminated_length": 266.93548583984375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.916, + "grad_norm": 2.151003360748291, + "kl": 0.07232666015625, + "learning_rate": 1e-06, + "loss": 0.1026, + "num_tokens": 18142596.0, + "reward": -9.701416015625, + "reward_std": 5.322725296020508, + "rewards/rm_reward_func/mean": -9.701416015625, + "rewards/rm_reward_func/std": 7.718242168426514, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 201.8125, + "completions/mean_terminated_length": 191.8064422607422, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.9168, + "grad_norm": 2.635965347290039, + "kl": 0.08038330078125, + "learning_rate": 1e-06, + "loss": 0.0618, + "num_tokens": 18151854.0, + "reward": -13.19586181640625, + "reward_std": 6.336431980133057, + "rewards/rm_reward_func/mean": -13.19586181640625, + "rewards/rm_reward_func/std": 10.333242416381836, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 346.875, + "completions/mean_terminated_length": 291.8333435058594, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.9176, + "grad_norm": 1.5437933206558228, + "kl": 0.070709228515625, + "learning_rate": 1e-06, + "loss": -0.0772, + "num_tokens": 18171306.0, + "reward": -2.70196533203125, + "reward_std": 5.082521915435791, + "rewards/rm_reward_func/mean": -2.70196533203125, + "rewards/rm_reward_func/std": 17.651018142700195, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 292.625, + "completions/mean_terminated_length": 269.9310302734375, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.9184, + "grad_norm": 2.198352813720703, + "kl": 0.07086181640625, + "learning_rate": 1e-06, + "loss": 0.1321, + "num_tokens": 18183982.0, + "reward": -6.55157470703125, + "reward_std": 5.601506233215332, + "rewards/rm_reward_func/mean": -6.55157470703125, + "rewards/rm_reward_func/std": 9.368389129638672, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 352.75, + "completions/mean_terminated_length": 330.0, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.9192, + "grad_norm": 2.109632730484009, + "kl": 0.0538330078125, + "learning_rate": 1e-06, + "loss": -0.0339, + "num_tokens": 18197318.0, + "reward": -5.082672119140625, + "reward_std": 4.930788040161133, + "rewards/rm_reward_func/mean": -5.082672119140625, + "rewards/rm_reward_func/std": 7.403159141540527, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 303.0, + "completions/mean_terminated_length": 296.258056640625, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.92, + "grad_norm": 2.0294196605682373, + "kl": 0.058349609375, + "learning_rate": 1e-06, + "loss": -0.1144, + "num_tokens": 18212542.0, + "reward": 0.09304046630859375, + "reward_std": 4.330693244934082, + "rewards/rm_reward_func/mean": 0.09304046630859375, + "rewards/rm_reward_func/std": 5.565187454223633, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 347.34375, + "completions/mean_terminated_length": 261.0952453613281, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.9208, + "grad_norm": 1.8967647552490234, + "kl": 0.050628662109375, + "learning_rate": 1e-06, + "loss": 0.1564, + "num_tokens": 18227633.0, + "reward": 5.91015625, + "reward_std": 4.137157440185547, + "rewards/rm_reward_func/mean": 5.91015625, + "rewards/rm_reward_func/std": 23.394020080566406, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 420.53125, + "completions/mean_terminated_length": 378.9545593261719, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.9216, + "grad_norm": 1.577277660369873, + "kl": 0.05584716796875, + "learning_rate": 1e-06, + "loss": -0.0157, + "num_tokens": 18244930.0, + "reward": 4.427337646484375, + "reward_std": 6.276885509490967, + "rewards/rm_reward_func/mean": 4.427337646484375, + "rewards/rm_reward_func/std": 18.504451751708984, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 306.90625, + "completions/mean_terminated_length": 277.6071472167969, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.9224, + "grad_norm": 1.6562687158584595, + "kl": 0.0576171875, + "learning_rate": 1e-06, + "loss": -0.1202, + "num_tokens": 18257991.0, + "reward": -7.249755859375, + "reward_std": 5.383556842803955, + "rewards/rm_reward_func/mean": -7.249755859375, + "rewards/rm_reward_func/std": 11.949959754943848, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 374.6875, + "completions/mean_terminated_length": 343.0, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.9232, + "grad_norm": 1.6180505752563477, + "kl": 0.0694580078125, + "learning_rate": 1e-06, + "loss": -0.0384, + "num_tokens": 18275725.0, + "reward": 12.142578125, + "reward_std": 5.616630554199219, + "rewards/rm_reward_func/mean": 12.142578125, + "rewards/rm_reward_func/std": 13.852538108825684, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 390.90625, + "completions/mean_terminated_length": 343.5217590332031, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.924, + "grad_norm": 1.6419225931167603, + "kl": 0.0716552734375, + "learning_rate": 1e-06, + "loss": -0.0551, + "num_tokens": 18296826.0, + "reward": 6.17230224609375, + "reward_std": 5.930800437927246, + "rewards/rm_reward_func/mean": 6.17230224609375, + "rewards/rm_reward_func/std": 15.92397403717041, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 276.53125, + "completions/mean_terminated_length": 268.93548583984375, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.9248, + "grad_norm": 2.10224986076355, + "kl": 0.05126953125, + "learning_rate": 1e-06, + "loss": -0.0948, + "num_tokens": 18307803.0, + "reward": -2.283203125, + "reward_std": 4.421841621398926, + "rewards/rm_reward_func/mean": -2.283203125, + "rewards/rm_reward_func/std": 10.629165649414062, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 329.625, + "completions/mean_terminated_length": 287.5384826660156, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.9256, + "grad_norm": 1.9656977653503418, + "kl": 0.0986328125, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 18324479.0, + "reward": 0.03173828125, + "reward_std": 8.026904106140137, + "rewards/rm_reward_func/mean": 0.03173828125, + "rewards/rm_reward_func/std": 18.84831428527832, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 290.5, + "completions/mean_terminated_length": 258.8571472167969, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.9264, + "grad_norm": 2.420762062072754, + "kl": 0.06121826171875, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 18338671.0, + "reward": -6.857666015625, + "reward_std": 6.175392150878906, + "rewards/rm_reward_func/mean": -6.857666015625, + "rewards/rm_reward_func/std": 9.297062873840332, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 350.03125, + "completions/mean_terminated_length": 339.23333740234375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 0.9272, + "grad_norm": 1.8742328882217407, + "kl": 0.07061767578125, + "learning_rate": 1e-06, + "loss": -0.0898, + "num_tokens": 18356832.0, + "reward": -8.0841064453125, + "reward_std": 3.9184622764587402, + "rewards/rm_reward_func/mean": -8.0841064453125, + "rewards/rm_reward_func/std": 8.11465835571289, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 246.125, + "completions/mean_terminated_length": 228.40000915527344, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.928, + "grad_norm": 4.225946426391602, + "kl": 0.1259765625, + "learning_rate": 1e-06, + "loss": 0.0967, + "num_tokens": 18367692.0, + "reward": 4.774200439453125, + "reward_std": 6.818203449249268, + "rewards/rm_reward_func/mean": 4.774200439453125, + "rewards/rm_reward_func/std": 9.039929389953613, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 366.1875, + "completions/mean_terminated_length": 237.5294189453125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.9288, + "grad_norm": 2.6777892112731934, + "kl": 0.06103515625, + "learning_rate": 1e-06, + "loss": 0.1202, + "num_tokens": 18386482.0, + "reward": -4.39886474609375, + "reward_std": 8.557821273803711, + "rewards/rm_reward_func/mean": -4.39886474609375, + "rewards/rm_reward_func/std": 9.900589942932129, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 358.625, + "completions/mean_terminated_length": 336.71429443359375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.9296, + "grad_norm": 1.5284615755081177, + "kl": 0.07965087890625, + "learning_rate": 1e-06, + "loss": -0.0863, + "num_tokens": 18404910.0, + "reward": 7.204736709594727, + "reward_std": 5.036811828613281, + "rewards/rm_reward_func/mean": 7.204736709594727, + "rewards/rm_reward_func/std": 17.289981842041016, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 378.5, + "completions/mean_terminated_length": 326.2608642578125, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.9304, + "grad_norm": 1.7886688709259033, + "kl": 0.05419921875, + "learning_rate": 1e-06, + "loss": -0.1201, + "num_tokens": 18422086.0, + "reward": -2.7242431640625, + "reward_std": 7.236316680908203, + "rewards/rm_reward_func/mean": -2.7242431640625, + "rewards/rm_reward_func/std": 9.736739158630371, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 354.0, + "completions/mean_terminated_length": 348.9032287597656, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.9312, + "grad_norm": 1.6585993766784668, + "kl": 0.051513671875, + "learning_rate": 1e-06, + "loss": -0.112, + "num_tokens": 18436078.0, + "reward": -3.51953125, + "reward_std": 7.458183765411377, + "rewards/rm_reward_func/mean": -3.51953125, + "rewards/rm_reward_func/std": 11.516336441040039, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 303.59375, + "completions/mean_terminated_length": 296.8709716796875, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.932, + "grad_norm": 2.0608420372009277, + "kl": 0.0582275390625, + "learning_rate": 1e-06, + "loss": -0.3036, + "num_tokens": 18447177.0, + "reward": 0.93359375, + "reward_std": 9.4815034866333, + "rewards/rm_reward_func/mean": 0.93359375, + "rewards/rm_reward_func/std": 15.404294967651367, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 311.625, + "completions/mean_terminated_length": 298.2666931152344, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.9328, + "grad_norm": 1.7879366874694824, + "kl": 0.08062744140625, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 18462421.0, + "reward": 9.181396484375, + "reward_std": 7.995716571807861, + "rewards/rm_reward_func/mean": 9.181396484375, + "rewards/rm_reward_func/std": 15.19915771484375, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 430.3125, + "completions/mean_terminated_length": 403.0833435058594, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.9336, + "grad_norm": 1.740823745727539, + "kl": 0.050048828125, + "learning_rate": 1e-06, + "loss": 0.0453, + "num_tokens": 18478383.0, + "reward": 4.3897705078125, + "reward_std": 7.601470947265625, + "rewards/rm_reward_func/mean": 4.3897705078125, + "rewards/rm_reward_func/std": 19.65622901916504, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 309.5, + "completions/mean_terminated_length": 262.76922607421875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.9344, + "grad_norm": 1.9558498859405518, + "kl": 0.09619140625, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 18496031.0, + "reward": 2.38873291015625, + "reward_std": 5.518542289733887, + "rewards/rm_reward_func/mean": 2.38873291015625, + "rewards/rm_reward_func/std": 10.202322959899902, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 302.46875, + "completions/mean_terminated_length": 288.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.9352, + "grad_norm": 2.0900814533233643, + "kl": 0.05816650390625, + "learning_rate": 1e-06, + "loss": -0.1068, + "num_tokens": 18511174.0, + "reward": 0.43853759765625, + "reward_std": 5.7756123542785645, + "rewards/rm_reward_func/mean": 0.43853759765625, + "rewards/rm_reward_func/std": 8.118162155151367, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 314.03125, + "completions/mean_terminated_length": 178.57894897460938, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.936, + "grad_norm": 2.420034408569336, + "kl": 0.067169189453125, + "learning_rate": 1e-06, + "loss": 0.3415, + "num_tokens": 18526775.0, + "reward": -7.227855682373047, + "reward_std": 8.721613883972168, + "rewards/rm_reward_func/mean": -7.227855682373047, + "rewards/rm_reward_func/std": 10.889971733093262, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 323.96875, + "completions/mean_terminated_length": 271.32000732421875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.9368, + "grad_norm": 1.5937886238098145, + "kl": 0.0791015625, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 18546782.0, + "reward": -1.1865234375, + "reward_std": 6.299413681030273, + "rewards/rm_reward_func/mean": -1.1865234375, + "rewards/rm_reward_func/std": 19.411155700683594, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 309.15625, + "completions/mean_terminated_length": 262.3461608886719, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.9376, + "grad_norm": 2.2506115436553955, + "kl": 0.058349609375, + "learning_rate": 1e-06, + "loss": 0.2414, + "num_tokens": 18559315.0, + "reward": -7.3369140625, + "reward_std": 7.841141223907471, + "rewards/rm_reward_func/mean": -7.3369140625, + "rewards/rm_reward_func/std": 10.676852226257324, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 310.5, + "completions/mean_terminated_length": 304.0, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.9384, + "grad_norm": 1.8848408460617065, + "kl": 0.09490966796875, + "learning_rate": 1e-06, + "loss": 0.0207, + "num_tokens": 18577851.0, + "reward": 4.5037841796875, + "reward_std": 5.092231750488281, + "rewards/rm_reward_func/mean": 4.5037841796875, + "rewards/rm_reward_func/std": 8.580667495727539, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 333.46875, + "completions/mean_terminated_length": 315.0, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 0.9392, + "grad_norm": 2.0534439086914062, + "kl": 0.06707763671875, + "learning_rate": 1e-06, + "loss": -0.0326, + "num_tokens": 18591610.0, + "reward": -7.8873291015625, + "reward_std": 7.426888942718506, + "rewards/rm_reward_func/mean": -7.8873291015625, + "rewards/rm_reward_func/std": 8.953723907470703, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 349.03125, + "completions/mean_terminated_length": 318.85186767578125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.94, + "grad_norm": 1.6998295783996582, + "kl": 0.09869384765625, + "learning_rate": 1e-06, + "loss": -0.0332, + "num_tokens": 18609395.0, + "reward": 1.0355224609375, + "reward_std": 6.597189903259277, + "rewards/rm_reward_func/mean": 1.0355224609375, + "rewards/rm_reward_func/std": 12.671693801879883, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 353.03125, + "completions/mean_terminated_length": 353.03125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.9408, + "grad_norm": 1.7443124055862427, + "kl": 0.07403564453125, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 18627348.0, + "reward": 10.0181884765625, + "reward_std": 6.816492080688477, + "rewards/rm_reward_func/mean": 10.0181884765625, + "rewards/rm_reward_func/std": 16.376365661621094, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 405.0, + "completions/mean_terminated_length": 380.3077087402344, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.9416, + "grad_norm": 1.5109375715255737, + "kl": 0.042236328125, + "learning_rate": 1e-06, + "loss": -0.0507, + "num_tokens": 18644372.0, + "reward": 1.02813720703125, + "reward_std": 7.758831024169922, + "rewards/rm_reward_func/mean": 1.02813720703125, + "rewards/rm_reward_func/std": 9.708630561828613, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 269.875, + "completions/mean_terminated_length": 225.0370330810547, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.9424, + "grad_norm": 1.8930772542953491, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": 0.0266, + "num_tokens": 18660832.0, + "reward": 7.6513671875, + "reward_std": 4.5707316398620605, + "rewards/rm_reward_func/mean": 7.6513671875, + "rewards/rm_reward_func/std": 14.335498809814453, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 303.3125, + "completions/mean_terminated_length": 281.7241516113281, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 0.9432, + "grad_norm": 2.3529882431030273, + "kl": 0.10150146484375, + "learning_rate": 1e-06, + "loss": -0.0214, + "num_tokens": 18676162.0, + "reward": 16.01739501953125, + "reward_std": 7.0298309326171875, + "rewards/rm_reward_func/mean": 16.01739501953125, + "rewards/rm_reward_func/std": 20.992267608642578, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 348.1875, + "completions/mean_terminated_length": 310.3846130371094, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.944, + "grad_norm": 1.760179042816162, + "kl": 0.077178955078125, + "learning_rate": 1e-06, + "loss": -0.0465, + "num_tokens": 18695952.0, + "reward": 6.663818359375, + "reward_std": 4.0742902755737305, + "rewards/rm_reward_func/mean": 6.663818359375, + "rewards/rm_reward_func/std": 16.944875717163086, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 349.25, + "completions/mean_terminated_length": 311.69232177734375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.9448, + "grad_norm": 1.5521093606948853, + "kl": 0.08465576171875, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 18718808.0, + "reward": 16.41357421875, + "reward_std": 3.149042844772339, + "rewards/rm_reward_func/mean": 16.41357421875, + "rewards/rm_reward_func/std": 20.468868255615234, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 406.875, + "completions/mean_terminated_length": 334.9473571777344, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.9456, + "grad_norm": 1.6353975534439087, + "kl": 0.05712890625, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 18740644.0, + "reward": -1.34014892578125, + "reward_std": 5.3794169425964355, + "rewards/rm_reward_func/mean": -1.34014892578125, + "rewards/rm_reward_func/std": 15.041204452514648, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 305.15625, + "completions/mean_terminated_length": 298.4838562011719, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.9464, + "grad_norm": 1.911405086517334, + "kl": 0.09625244140625, + "learning_rate": 1e-06, + "loss": 0.1793, + "num_tokens": 18759769.0, + "reward": 21.146484375, + "reward_std": 7.484389305114746, + "rewards/rm_reward_func/mean": 21.146484375, + "rewards/rm_reward_func/std": 20.252504348754883, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 385.25, + "completions/mean_terminated_length": 349.7599792480469, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.9472, + "grad_norm": 1.7041083574295044, + "kl": 0.05462646484375, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 18775873.0, + "reward": -2.3277587890625, + "reward_std": 3.502652645111084, + "rewards/rm_reward_func/mean": -2.3277587890625, + "rewards/rm_reward_func/std": 10.969642639160156, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 321.875, + "completions/mean_terminated_length": 302.2069091796875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.948, + "grad_norm": 2.095851182937622, + "kl": 0.0941162109375, + "learning_rate": 1e-06, + "loss": 0.0375, + "num_tokens": 18792733.0, + "reward": 7.5231170654296875, + "reward_std": 5.675870418548584, + "rewards/rm_reward_func/mean": 7.5231170654296875, + "rewards/rm_reward_func/std": 16.059511184692383, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 338.3125, + "completions/mean_terminated_length": 234.10000610351562, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9488, + "grad_norm": 2.7033138275146484, + "kl": 0.14019775390625, + "learning_rate": 1e-06, + "loss": -0.0395, + "num_tokens": 18805519.0, + "reward": -4.1978759765625, + "reward_std": 5.5976457595825195, + "rewards/rm_reward_func/mean": -4.1978759765625, + "rewards/rm_reward_func/std": 7.334824562072754, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 442.0, + "completions/mean_terminated_length": 380.23529052734375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.9496, + "grad_norm": 1.607633352279663, + "kl": 0.067138671875, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 18825623.0, + "reward": 8.71148681640625, + "reward_std": 4.78580904006958, + "rewards/rm_reward_func/mean": 8.71148681640625, + "rewards/rm_reward_func/std": 18.962846755981445, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 334.46875, + "completions/mean_terminated_length": 322.63336181640625, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.9504, + "grad_norm": 1.7158557176589966, + "kl": 0.06060791015625, + "learning_rate": 1e-06, + "loss": -0.0095, + "num_tokens": 18839262.0, + "reward": 1.75518798828125, + "reward_std": 8.543966293334961, + "rewards/rm_reward_func/mean": 1.75518798828125, + "rewards/rm_reward_func/std": 12.190470695495605, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 195.84375, + "completions/mean_terminated_length": 174.7666778564453, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.9512, + "grad_norm": 3.507983446121216, + "kl": 0.0986328125, + "learning_rate": 1e-06, + "loss": 0.3423, + "num_tokens": 18848121.0, + "reward": -1.8478012084960938, + "reward_std": 5.146864891052246, + "rewards/rm_reward_func/mean": -1.8478012084960938, + "rewards/rm_reward_func/std": 7.283282279968262, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 363.78125, + "completions/mean_terminated_length": 322.2799987792969, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.952, + "grad_norm": 1.8857688903808594, + "kl": 0.1007080078125, + "learning_rate": 1e-06, + "loss": -0.0322, + "num_tokens": 18866762.0, + "reward": 8.295989990234375, + "reward_std": 6.348849773406982, + "rewards/rm_reward_func/mean": 8.295989990234375, + "rewards/rm_reward_func/std": 8.897079467773438, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 292.875, + "completions/mean_terminated_length": 252.29629516601562, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.9528, + "grad_norm": 2.0393242835998535, + "kl": 0.07666015625, + "learning_rate": 1e-06, + "loss": 0.0878, + "num_tokens": 18881118.0, + "reward": -3.680419921875, + "reward_std": 4.832398414611816, + "rewards/rm_reward_func/mean": -3.680419921875, + "rewards/rm_reward_func/std": 11.717550277709961, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 312.3125, + "completions/mean_terminated_length": 305.8709716796875, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.9536, + "grad_norm": 2.8157835006713867, + "kl": 0.13165283203125, + "learning_rate": 1e-06, + "loss": 0.1924, + "num_tokens": 18896512.0, + "reward": 6.648193359375, + "reward_std": 4.956387042999268, + "rewards/rm_reward_func/mean": 6.648193359375, + "rewards/rm_reward_func/std": 16.177244186401367, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 305.0625, + "completions/mean_terminated_length": 305.0625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.9544, + "grad_norm": 2.303070306777954, + "kl": 0.18023681640625, + "learning_rate": 1e-06, + "loss": 0.0332, + "num_tokens": 18909770.0, + "reward": 11.090087890625, + "reward_std": 7.672306060791016, + "rewards/rm_reward_func/mean": 11.090087890625, + "rewards/rm_reward_func/std": 21.919132232666016, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 353.28125, + "completions/mean_terminated_length": 342.70001220703125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.9552, + "grad_norm": 1.8168104887008667, + "kl": 0.04620361328125, + "learning_rate": 1e-06, + "loss": -0.0074, + "num_tokens": 18923547.0, + "reward": 1.03057861328125, + "reward_std": 4.729050636291504, + "rewards/rm_reward_func/mean": 1.03057861328125, + "rewards/rm_reward_func/std": 9.131479263305664, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 300.84375, + "completions/mean_terminated_length": 252.11538696289062, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.956, + "grad_norm": 1.891855239868164, + "kl": 0.07427978515625, + "learning_rate": 1e-06, + "loss": 0.0315, + "num_tokens": 18942894.0, + "reward": -7.471923828125, + "reward_std": 4.9498291015625, + "rewards/rm_reward_func/mean": -7.471923828125, + "rewards/rm_reward_func/std": 7.764822006225586, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 356.375, + "completions/mean_terminated_length": 346.0000305175781, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.9568, + "grad_norm": 1.879837989807129, + "kl": 0.06280517578125, + "learning_rate": 1e-06, + "loss": -0.115, + "num_tokens": 18956690.0, + "reward": -6.384971618652344, + "reward_std": 4.789652347564697, + "rewards/rm_reward_func/mean": -6.384971618652344, + "rewards/rm_reward_func/std": 8.219325065612793, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 278.9375, + "completions/mean_terminated_length": 213.67999267578125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.9576, + "grad_norm": 2.1505980491638184, + "kl": 0.09893798828125, + "learning_rate": 1e-06, + "loss": -0.1447, + "num_tokens": 18968920.0, + "reward": -2.244720458984375, + "reward_std": 6.6176557540893555, + "rewards/rm_reward_func/mean": -2.244720458984375, + "rewards/rm_reward_func/std": 15.78062915802002, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 406.3125, + "completions/mean_terminated_length": 371.0833435058594, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.9584, + "grad_norm": 1.579254150390625, + "kl": 0.0592041015625, + "learning_rate": 1e-06, + "loss": -0.1184, + "num_tokens": 18984634.0, + "reward": -5.4693603515625, + "reward_std": 5.85736083984375, + "rewards/rm_reward_func/mean": -5.4693603515625, + "rewards/rm_reward_func/std": 8.396554946899414, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 301.25, + "completions/mean_terminated_length": 301.25, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.9592, + "grad_norm": 1.7899622917175293, + "kl": 0.088623046875, + "learning_rate": 1e-06, + "loss": -0.0921, + "num_tokens": 19001634.0, + "reward": 9.3291015625, + "reward_std": 4.022342681884766, + "rewards/rm_reward_func/mean": 9.3291015625, + "rewards/rm_reward_func/std": 16.73755645751953, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 256.5, + "completions/mean_terminated_length": 220.00001525878906, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 0.96, + "grad_norm": 3.106379270553589, + "kl": 0.09051513671875, + "learning_rate": 1e-06, + "loss": -0.0431, + "num_tokens": 19013514.0, + "reward": -11.283447265625, + "reward_std": 6.853769302368164, + "rewards/rm_reward_func/mean": -11.283447265625, + "rewards/rm_reward_func/std": 7.490703582763672, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 390.46875, + "completions/mean_terminated_length": 349.9583435058594, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.9608, + "grad_norm": 2.380739450454712, + "kl": 0.07672119140625, + "learning_rate": 1e-06, + "loss": -0.0477, + "num_tokens": 19032505.0, + "reward": 1.5191650390625, + "reward_std": 6.789242744445801, + "rewards/rm_reward_func/mean": 1.5191650390625, + "rewards/rm_reward_func/std": 11.326786041259766, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 411.15625, + "completions/mean_terminated_length": 377.54168701171875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.9616, + "grad_norm": 1.7256031036376953, + "kl": 0.0650634765625, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 19050670.0, + "reward": -1.34991455078125, + "reward_std": 5.369917869567871, + "rewards/rm_reward_func/mean": -1.34991455078125, + "rewards/rm_reward_func/std": 13.881171226501465, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 310.34375, + "completions/mean_terminated_length": 303.8387145996094, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.9624, + "grad_norm": 1.9273755550384521, + "kl": 0.09442138671875, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 19067977.0, + "reward": 11.826690673828125, + "reward_std": 4.967939376831055, + "rewards/rm_reward_func/mean": 11.826690673828125, + "rewards/rm_reward_func/std": 18.93402099609375, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 391.125, + "completions/mean_terminated_length": 378.6206970214844, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.9632, + "grad_norm": 1.5855298042297363, + "kl": 0.060546875, + "learning_rate": 1e-06, + "loss": 0.0713, + "num_tokens": 19084405.0, + "reward": 16.45343017578125, + "reward_std": 8.746343612670898, + "rewards/rm_reward_func/mean": 16.45343017578125, + "rewards/rm_reward_func/std": 13.88955020904541, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 278.9375, + "completions/mean_terminated_length": 271.4193420410156, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.964, + "grad_norm": 2.1639761924743652, + "kl": 0.07666015625, + "learning_rate": 1e-06, + "loss": -0.0561, + "num_tokens": 19098019.0, + "reward": 0.934478759765625, + "reward_std": 4.84557580947876, + "rewards/rm_reward_func/mean": 0.934478759765625, + "rewards/rm_reward_func/std": 20.446882247924805, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 309.9375, + "completions/mean_terminated_length": 281.0714416503906, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.9648, + "grad_norm": 2.0690553188323975, + "kl": 0.06085205078125, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 19109673.0, + "reward": -6.0211181640625, + "reward_std": 5.820977210998535, + "rewards/rm_reward_func/mean": -6.0211181640625, + "rewards/rm_reward_func/std": 8.592059135437012, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 358.40625, + "completions/mean_terminated_length": 329.96295166015625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.9656, + "grad_norm": 1.8341344594955444, + "kl": 0.08172607421875, + "learning_rate": 1e-06, + "loss": 0.063, + "num_tokens": 19129750.0, + "reward": 5.96826171875, + "reward_std": 7.739785194396973, + "rewards/rm_reward_func/mean": 5.96826171875, + "rewards/rm_reward_func/std": 16.700008392333984, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 373.3125, + "completions/mean_terminated_length": 353.5000305175781, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.9664, + "grad_norm": 1.935084342956543, + "kl": 0.075927734375, + "learning_rate": 1e-06, + "loss": 0.1739, + "num_tokens": 19147856.0, + "reward": 3.4145050048828125, + "reward_std": 8.682884216308594, + "rewards/rm_reward_func/mean": 3.4145050048828125, + "rewards/rm_reward_func/std": 20.180097579956055, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 331.21875, + "completions/mean_terminated_length": 260.478271484375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.9672, + "grad_norm": 3.0217480659484863, + "kl": 0.15484619140625, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 19164711.0, + "reward": 5.384857177734375, + "reward_std": 5.285286903381348, + "rewards/rm_reward_func/mean": 5.384857177734375, + "rewards/rm_reward_func/std": 22.06003189086914, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 297.25, + "completions/mean_terminated_length": 257.4814758300781, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.968, + "grad_norm": 2.2715647220611572, + "kl": 0.0556640625, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 19178287.0, + "reward": -4.639434814453125, + "reward_std": 3.8123955726623535, + "rewards/rm_reward_func/mean": -4.639434814453125, + "rewards/rm_reward_func/std": 9.903362274169922, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 276.15625, + "completions/mean_terminated_length": 260.433349609375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9688, + "grad_norm": 2.704739809036255, + "kl": 0.08502197265625, + "learning_rate": 1e-06, + "loss": -0.0316, + "num_tokens": 19191332.0, + "reward": 3.612548828125, + "reward_std": 6.149729251861572, + "rewards/rm_reward_func/mean": 3.612548828125, + "rewards/rm_reward_func/std": 7.4725422859191895, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 340.5625, + "completions/mean_terminated_length": 322.82757568359375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.9696, + "grad_norm": 1.970215916633606, + "kl": 0.05609130859375, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 19207382.0, + "reward": 3.12396240234375, + "reward_std": 4.889390468597412, + "rewards/rm_reward_func/mean": 3.12396240234375, + "rewards/rm_reward_func/std": 18.048656463623047, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 312.40625, + "completions/mean_terminated_length": 299.1000061035156, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.9704, + "grad_norm": 2.0979464054107666, + "kl": 0.0750732421875, + "learning_rate": 1e-06, + "loss": -0.0383, + "num_tokens": 19222275.0, + "reward": 0.49658203125, + "reward_std": 5.79196310043335, + "rewards/rm_reward_func/mean": 0.49658203125, + "rewards/rm_reward_func/std": 17.66546630859375, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 359.71875, + "completions/mean_terminated_length": 337.96429443359375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.9712, + "grad_norm": 1.8306912183761597, + "kl": 0.0751953125, + "learning_rate": 1e-06, + "loss": -0.0426, + "num_tokens": 19237218.0, + "reward": 2.3309097290039062, + "reward_std": 8.304255485534668, + "rewards/rm_reward_func/mean": 2.3309097290039062, + "rewards/rm_reward_func/std": 10.246231079101562, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 333.21875, + "completions/mean_terminated_length": 314.7241516113281, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.972, + "grad_norm": 1.9634864330291748, + "kl": 0.0802001953125, + "learning_rate": 1e-06, + "loss": 0.0942, + "num_tokens": 19255129.0, + "reward": 3.79296875, + "reward_std": 6.4938249588012695, + "rewards/rm_reward_func/mean": 3.79296875, + "rewards/rm_reward_func/std": 19.279415130615234, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 397.78125, + "completions/mean_terminated_length": 359.7083435058594, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.9728, + "grad_norm": 1.7288548946380615, + "kl": 0.05010986328125, + "learning_rate": 1e-06, + "loss": -0.0326, + "num_tokens": 19270162.0, + "reward": -1.320068359375, + "reward_std": 5.934271812438965, + "rewards/rm_reward_func/mean": -1.320068359375, + "rewards/rm_reward_func/std": 14.231712341308594, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 323.5625, + "completions/mean_terminated_length": 304.0689697265625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.9736, + "grad_norm": 2.204549789428711, + "kl": 0.120849609375, + "learning_rate": 1e-06, + "loss": 0.031, + "num_tokens": 19288908.0, + "reward": 3.6815185546875, + "reward_std": 4.899533748626709, + "rewards/rm_reward_func/mean": 3.6815185546875, + "rewards/rm_reward_func/std": 19.92808723449707, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 356.6875, + "completions/mean_terminated_length": 295.9130554199219, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.9744, + "grad_norm": 1.9091168642044067, + "kl": 0.06219482421875, + "learning_rate": 1e-06, + "loss": -0.1469, + "num_tokens": 19302410.0, + "reward": -8.40838623046875, + "reward_std": 5.639212608337402, + "rewards/rm_reward_func/mean": -8.40838623046875, + "rewards/rm_reward_func/std": 7.34835147857666, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 242.125, + "completions/mean_terminated_length": 152.1666717529297, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.9752, + "grad_norm": 3.1749019622802734, + "kl": 0.094970703125, + "learning_rate": 1e-06, + "loss": 0.0563, + "num_tokens": 19312750.0, + "reward": -5.06494140625, + "reward_std": 7.494110107421875, + "rewards/rm_reward_func/mean": -5.06494140625, + "rewards/rm_reward_func/std": 10.416797637939453, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 339.0, + "completions/mean_terminated_length": 306.96295166015625, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.976, + "grad_norm": 1.9346483945846558, + "kl": 0.06170654296875, + "learning_rate": 1e-06, + "loss": 0.1632, + "num_tokens": 19328014.0, + "reward": -8.430877685546875, + "reward_std": 4.441213607788086, + "rewards/rm_reward_func/mean": -8.430877685546875, + "rewards/rm_reward_func/std": 12.21574878692627, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 328.75, + "completions/mean_terminated_length": 294.8148193359375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.9768, + "grad_norm": 1.9413658380508423, + "kl": 0.05517578125, + "learning_rate": 1e-06, + "loss": -0.1096, + "num_tokens": 19341102.0, + "reward": -4.01416015625, + "reward_std": 4.763402938842773, + "rewards/rm_reward_func/mean": -4.01416015625, + "rewards/rm_reward_func/std": 6.332244396209717, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 342.46875, + "completions/mean_terminated_length": 324.9310302734375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.9776, + "grad_norm": 1.7298831939697266, + "kl": 0.08660888671875, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 19357717.0, + "reward": 2.748779296875, + "reward_std": 5.7391815185546875, + "rewards/rm_reward_func/mean": 2.748779296875, + "rewards/rm_reward_func/std": 9.909750938415527, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 364.6875, + "completions/mean_terminated_length": 337.40740966796875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.9784, + "grad_norm": 1.821076512336731, + "kl": 0.0728759765625, + "learning_rate": 1e-06, + "loss": 0.0258, + "num_tokens": 19374923.0, + "reward": 3.959228515625, + "reward_std": 5.252490520477295, + "rewards/rm_reward_func/mean": 3.959228515625, + "rewards/rm_reward_func/std": 11.81523323059082, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 356.125, + "completions/mean_terminated_length": 340.0, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.9792, + "grad_norm": 2.123723268508911, + "kl": 0.06549072265625, + "learning_rate": 1e-06, + "loss": -0.0229, + "num_tokens": 19388607.0, + "reward": -3.793304443359375, + "reward_std": 4.087723731994629, + "rewards/rm_reward_func/mean": -3.793304443359375, + "rewards/rm_reward_func/std": 5.921860218048096, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 319.84375, + "completions/mean_terminated_length": 313.6451416015625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.98, + "grad_norm": 1.8228784799575806, + "kl": 0.09368896484375, + "learning_rate": 1e-06, + "loss": 0.0605, + "num_tokens": 19403930.0, + "reward": 1.459716796875, + "reward_std": 7.972273826599121, + "rewards/rm_reward_func/mean": 1.459716796875, + "rewards/rm_reward_func/std": 11.324004173278809, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 154.09375, + "completions/mean_terminated_length": 154.09375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.9808, + "grad_norm": 2.8573548793792725, + "kl": 0.1258544921875, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 19413717.0, + "reward": 0.7480850219726562, + "reward_std": 6.262543201446533, + "rewards/rm_reward_func/mean": 0.7480850219726562, + "rewards/rm_reward_func/std": 11.525054931640625, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 290.8125, + "completions/mean_terminated_length": 204.26087951660156, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.9816, + "grad_norm": 2.101404905319214, + "kl": 0.0794677734375, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 19430103.0, + "reward": 5.057209014892578, + "reward_std": 4.712871551513672, + "rewards/rm_reward_func/mean": 5.057209014892578, + "rewards/rm_reward_func/std": 15.899744033813477, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 335.90625, + "completions/mean_terminated_length": 230.25, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.9824, + "grad_norm": 1.9529588222503662, + "kl": 0.0904541015625, + "learning_rate": 1e-06, + "loss": 0.3705, + "num_tokens": 19445900.0, + "reward": -7.445159912109375, + "reward_std": 9.688376426696777, + "rewards/rm_reward_func/mean": -7.445159912109375, + "rewards/rm_reward_func/std": 13.721606254577637, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 306.28125, + "completions/mean_terminated_length": 276.89288330078125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.9832, + "grad_norm": 2.182965040206909, + "kl": 0.1063232421875, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 19460477.0, + "reward": 6.24365234375, + "reward_std": 6.503387928009033, + "rewards/rm_reward_func/mean": 6.24365234375, + "rewards/rm_reward_func/std": 12.083962440490723, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 346.0625, + "completions/mean_terminated_length": 246.5, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.984, + "grad_norm": 1.965198278427124, + "kl": 0.0882568359375, + "learning_rate": 1e-06, + "loss": 0.1093, + "num_tokens": 19480815.0, + "reward": -2.0164384841918945, + "reward_std": 4.365109443664551, + "rewards/rm_reward_func/mean": -2.0164384841918945, + "rewards/rm_reward_func/std": 14.292363166809082, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 243.15625, + "completions/mean_terminated_length": 204.75001525878906, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.9848, + "grad_norm": 2.141848087310791, + "kl": 0.156494140625, + "learning_rate": 1e-06, + "loss": 0.0588, + "num_tokens": 19496604.0, + "reward": 11.575439453125, + "reward_std": 5.662555694580078, + "rewards/rm_reward_func/mean": 11.575439453125, + "rewards/rm_reward_func/std": 19.600753784179688, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 379.53125, + "completions/mean_terminated_length": 360.6071472167969, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.9856, + "grad_norm": 1.8068856000900269, + "kl": 0.0782470703125, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 19512309.0, + "reward": 5.921875, + "reward_std": 6.898159503936768, + "rewards/rm_reward_func/mean": 5.921875, + "rewards/rm_reward_func/std": 11.215498924255371, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 255.375, + "completions/mean_terminated_length": 238.2666778564453, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.9864, + "grad_norm": 2.4451401233673096, + "kl": 0.06591796875, + "learning_rate": 1e-06, + "loss": 0.0356, + "num_tokens": 19524545.0, + "reward": 0.807464599609375, + "reward_std": 5.453124046325684, + "rewards/rm_reward_func/mean": 0.807464599609375, + "rewards/rm_reward_func/std": 11.848858833312988, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 343.1875, + "completions/mean_terminated_length": 286.91668701171875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.9872, + "grad_norm": 2.2289021015167236, + "kl": 0.1170654296875, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 19541583.0, + "reward": 7.099365234375, + "reward_std": 5.563742637634277, + "rewards/rm_reward_func/mean": 7.099365234375, + "rewards/rm_reward_func/std": 17.532470703125, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 351.875, + "completions/mean_terminated_length": 279.0909118652344, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.988, + "grad_norm": 1.769615650177002, + "kl": 0.06658935546875, + "learning_rate": 1e-06, + "loss": 0.0612, + "num_tokens": 19555419.0, + "reward": -10.983932495117188, + "reward_std": 4.360165596008301, + "rewards/rm_reward_func/mean": -10.983932495117188, + "rewards/rm_reward_func/std": 7.810812473297119, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 419.9375, + "completions/mean_terminated_length": 356.9473571777344, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.9888, + "grad_norm": 1.5924738645553589, + "kl": 0.08319091796875, + "learning_rate": 1e-06, + "loss": 0.0729, + "num_tokens": 19579433.0, + "reward": 5.734619140625, + "reward_std": 6.61159610748291, + "rewards/rm_reward_func/mean": 5.734619140625, + "rewards/rm_reward_func/std": 18.786161422729492, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 355.59375, + "completions/mean_terminated_length": 319.5, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.9896, + "grad_norm": 1.730557918548584, + "kl": 0.078125, + "learning_rate": 1e-06, + "loss": -0.0333, + "num_tokens": 19597860.0, + "reward": -1.3018798828125, + "reward_std": 5.418453693389893, + "rewards/rm_reward_func/mean": -1.3018798828125, + "rewards/rm_reward_func/std": 15.416099548339844, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 340.28125, + "completions/mean_terminated_length": 292.1999816894531, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.9904, + "grad_norm": 1.7849236726760864, + "kl": 0.08306884765625, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 19614853.0, + "reward": 11.4884033203125, + "reward_std": 4.998743057250977, + "rewards/rm_reward_func/mean": 11.4884033203125, + "rewards/rm_reward_func/std": 20.11761474609375, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 209.8125, + "completions/mean_terminated_length": 209.8125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.9912, + "grad_norm": 3.2263269424438477, + "kl": 0.0992431640625, + "learning_rate": 1e-06, + "loss": 0.0639, + "num_tokens": 19626927.0, + "reward": -4.279754638671875, + "reward_std": 4.137574195861816, + "rewards/rm_reward_func/mean": -4.279754638671875, + "rewards/rm_reward_func/std": 7.89611291885376, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 308.25, + "completions/mean_terminated_length": 228.52174377441406, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.992, + "grad_norm": 2.3739120960235596, + "kl": 0.078765869140625, + "learning_rate": 1e-06, + "loss": 0.2238, + "num_tokens": 19643207.0, + "reward": 4.8181915283203125, + "reward_std": 7.092428207397461, + "rewards/rm_reward_func/mean": 4.8181915283203125, + "rewards/rm_reward_func/std": 10.239140510559082, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 253.53125, + "completions/mean_terminated_length": 245.19354248046875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.9928, + "grad_norm": 2.0511248111724854, + "kl": 0.091796875, + "learning_rate": 1e-06, + "loss": 0.0428, + "num_tokens": 19658872.0, + "reward": -6.848480224609375, + "reward_std": 6.572463035583496, + "rewards/rm_reward_func/mean": -6.848480224609375, + "rewards/rm_reward_func/std": 10.909504890441895, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 285.625, + "completions/mean_terminated_length": 243.70370483398438, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.9936, + "grad_norm": 2.3514668941497803, + "kl": 0.0936279296875, + "learning_rate": 1e-06, + "loss": 0.2276, + "num_tokens": 19675060.0, + "reward": 1.14483642578125, + "reward_std": 8.80129623413086, + "rewards/rm_reward_func/mean": 1.14483642578125, + "rewards/rm_reward_func/std": 16.491636276245117, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 310.25, + "completions/mean_terminated_length": 289.3793029785156, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.9944, + "grad_norm": 2.116722822189331, + "kl": 0.06549072265625, + "learning_rate": 1e-06, + "loss": -0.0507, + "num_tokens": 19689252.0, + "reward": -8.44384765625, + "reward_std": 4.841726303100586, + "rewards/rm_reward_func/mean": -8.44384765625, + "rewards/rm_reward_func/std": 9.359322547912598, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 293.5, + "completions/mean_terminated_length": 208.0, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.9952, + "grad_norm": 3.5723612308502197, + "kl": 0.0733642578125, + "learning_rate": 1e-06, + "loss": 0.1694, + "num_tokens": 19702340.0, + "reward": -6.228546142578125, + "reward_std": 6.26521635055542, + "rewards/rm_reward_func/mean": -6.228546142578125, + "rewards/rm_reward_func/std": 6.528346538543701, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 304.5, + "completions/mean_terminated_length": 274.8571472167969, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 0.996, + "grad_norm": 2.168534278869629, + "kl": 0.06280517578125, + "learning_rate": 1e-06, + "loss": 0.0342, + "num_tokens": 19714060.0, + "reward": -4.342529296875, + "reward_std": 5.371504783630371, + "rewards/rm_reward_func/mean": -4.342529296875, + "rewards/rm_reward_func/std": 8.78249454498291, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 512.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 371.46875, + "completions/mean_terminated_length": 324.625, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 0.9968, + "grad_norm": 2.114946126937866, + "kl": 0.06060791015625, + "learning_rate": 1e-06, + "loss": -0.122, + "num_tokens": 19731051.0, + "reward": -3.2021484375, + "reward_std": 4.737578392028809, + "rewards/rm_reward_func/mean": -3.2021484375, + "rewards/rm_reward_func/std": 8.52869987487793, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 298.15625, + "completions/mean_terminated_length": 283.9000244140625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9976, + "grad_norm": 1.8747590780258179, + "kl": 0.0693359375, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 19745792.0, + "reward": -3.0748291015625, + "reward_std": 8.589252471923828, + "rewards/rm_reward_func/mean": -3.0748291015625, + "rewards/rm_reward_func/std": 11.960886001586914, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 226.25, + "completions/mean_terminated_length": 217.03225708007812, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.9984, + "grad_norm": 3.755967617034912, + "kl": 0.080322265625, + "learning_rate": 1e-06, + "loss": 0.2093, + "num_tokens": 19758512.0, + "reward": 7.357421875, + "reward_std": 6.109155654907227, + "rewards/rm_reward_func/mean": 7.357421875, + "rewards/rm_reward_func/std": 21.06812858581543, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 344.625, + "completions/mean_terminated_length": 313.629638671875, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 0.9992, + "grad_norm": 2.5709409713745117, + "kl": 0.07611083984375, + "learning_rate": 1e-06, + "loss": 0.1939, + "num_tokens": 19776212.0, + "reward": 3.890625, + "reward_std": 6.159649848937988, + "rewards/rm_reward_func/mean": 3.890625, + "rewards/rm_reward_func/std": 19.292577743530273, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 265.0, + "completions/mean_terminated_length": 265.0, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 1.0, + "grad_norm": 2.417809247970581, + "kl": 0.139892578125, + "learning_rate": 1e-06, + "loss": -0.1476, + "num_tokens": 19793485.0, + "reward": -9.4732666015625, + "reward_std": 5.3670735359191895, + "rewards/rm_reward_func/mean": -9.4732666015625, + "rewards/rm_reward_func/std": 6.196745872497559, + "step": 1250 + } + ], + "logging_steps": 1, + "max_steps": 1250, + "num_input_tokens_seen": 19793485, + "num_train_epochs": 1, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}