{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00031221510371785745, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.623632907867432, "epoch": 3.1221510371785745e-06, "frac_reward_zero_std": 0.0, "grad_norm": 1.2739975452423096, "kl": 0.0001386080402880907, "learning_rate": 1e-05, "loss": 0.0, "num_tokens": 18464.0, "reward": 6.609435081481934, "reward_std": 0.4221917688846588, "rewards/CLIPDistanceReward/mean": 1.2607897520065308, "rewards/CLIPDistanceReward/std": 0.15182529017329216, "rewards/HPSV21DistanceReward/mean": 0.8540859222412109, "rewards/HPSV21DistanceReward/std": 0.056746602058410645, "rewards/MANIQAReward/mean": 0.7976526021957397, "rewards/MANIQAReward/std": 0.3044530153274536, "rewards/QwenFakeDiscrimReward/mean": 0.21875, "rewards/QwenFakeDiscrimReward/std": 0.08400268852710724, "rewards/QwenLabelReward/mean": 1.11328125, "rewards/QwenLabelReward/std": 0.06629125773906708, "rewards/QwenWeirdDiscrimReward/mean": 0.25, "rewards/QwenWeirdDiscrimReward/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.730321884155273, "epoch": 6.244302074357149e-06, "frac_reward_zero_std": 0.0, "grad_norm": 1.2292801141738892, "kl": 0.0003049901861231774, "learning_rate": 9.999998438924483e-06, "loss": 0.0, "num_tokens": 36928.0, "reward": 6.131712913513184, "reward_std": 0.452839732170105, "rewards/CLIPDistanceReward/mean": 1.0307670831680298, "rewards/CLIPDistanceReward/std": 0.1981462985277176, "rewards/HPSV21DistanceReward/mean": 0.8319225311279297, "rewards/HPSV21DistanceReward/std": 0.07379035651683807, "rewards/MANIQAReward/mean": 0.889146089553833, "rewards/MANIQAReward/std": 0.12645867466926575, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.103124976158142, "rewards/QwenLabelReward/std": 0.10734140872955322, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.528157711029053, "epoch": 9.366453111535724e-06, "frac_reward_zero_std": 0.0, "grad_norm": 1.257138729095459, "kl": 0.00040717620868235826, "learning_rate": 9.999996877848963e-06, "loss": 0.0, "num_tokens": 55392.0, "reward": 6.619058132171631, "reward_std": 0.5448145866394043, "rewards/CLIPDistanceReward/mean": 1.1895149946212769, "rewards/CLIPDistanceReward/std": 0.16646936908364296, "rewards/HPSV21DistanceReward/mean": 0.8643283843994141, "rewards/HPSV21DistanceReward/std": 0.06711792945861816, "rewards/MANIQAReward/mean": 0.9121524691581726, "rewards/MANIQAReward/std": 0.23076210916042328, "rewards/QwenFakeDiscrimReward/mean": 0.234375, "rewards/QwenFakeDiscrimReward/std": 0.06148367002606392, "rewards/QwenLabelReward/mean": 1.122656226158142, "rewards/QwenLabelReward/std": 0.07250122725963593, "rewards/QwenWeirdDiscrimReward/mean": 0.2421875, "rewards/QwenWeirdDiscrimReward/std": 0.04419417306780815, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.05067777633667, "epoch": 1.2488604148714298e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.334944486618042, "kl": 0.0006911944365128875, "learning_rate": 9.999995316773445e-06, "loss": 0.0001, "num_tokens": 73856.0, "reward": 5.796908378601074, "reward_std": 0.39618438482284546, "rewards/CLIPDistanceReward/mean": 0.9435072541236877, "rewards/CLIPDistanceReward/std": 0.11599139124155045, "rewards/HPSV21DistanceReward/mean": 0.8088340759277344, "rewards/HPSV21DistanceReward/std": 0.05785701796412468, "rewards/MANIQAReward/mean": 0.7984758615493774, "rewards/MANIQAReward/std": 0.2420261949300766, "rewards/QwenFakeDiscrimReward/mean": 0.140625, "rewards/QwenFakeDiscrimReward/std": 0.12600402534008026, "rewards/QwenLabelReward/mean": 1.134374976158142, "rewards/QwenLabelReward/std": 0.025200821459293365, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.253571510314941, "epoch": 1.5610755185892873e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.28339421749115, "kl": 0.000788698613177985, "learning_rate": 9.999993755697927e-06, "loss": 0.0001, "num_tokens": 92320.0, "reward": 6.281489372253418, "reward_std": 0.6554425954818726, "rewards/CLIPDistanceReward/mean": 1.1557438373565674, "rewards/CLIPDistanceReward/std": 0.18544841185212135, "rewards/HPSV21DistanceReward/mean": 0.81292724609375, "rewards/HPSV21DistanceReward/std": 0.12179260328412056, "rewards/MANIQAReward/mean": 0.846490740776062, "rewards/MANIQAReward/std": 0.27076393365859985, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.122656226158142, "rewards/QwenLabelReward/std": 0.07250122725963593, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.551281452178955, "epoch": 1.8732906223071447e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.1884926557540894, "kl": 0.0009239333448931575, "learning_rate": 9.999992194622407e-06, "loss": 0.0001, "num_tokens": 110784.0, "reward": 6.167599678039551, "reward_std": 0.5350075364112854, "rewards/CLIPDistanceReward/mean": 1.0848320722579956, "rewards/CLIPDistanceReward/std": 0.17391569539904594, "rewards/HPSV21DistanceReward/mean": 0.8280715942382812, "rewards/HPSV21DistanceReward/std": 0.11653662100434303, "rewards/MANIQAReward/mean": 0.791792631149292, "rewards/MANIQAReward/std": 0.22277672588825226, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1437499523162842, "rewards/QwenLabelReward/std": 0.032995618879795074, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.8872480392456055, "epoch": 2.1855057260250022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2976913452148438, "kl": 0.001203209743835032, "learning_rate": 9.999990633546889e-06, "loss": 0.0001, "num_tokens": 129248.0, "reward": 6.023123741149902, "reward_std": 0.47152942419052124, "rewards/CLIPDistanceReward/mean": 1.0311306715011597, "rewards/CLIPDistanceReward/std": 0.18736129999160767, "rewards/HPSV21DistanceReward/mean": 0.8109321594238281, "rewards/HPSV21DistanceReward/std": 0.06704938411712646, "rewards/MANIQAReward/mean": 0.8616540431976318, "rewards/MANIQAReward/std": 0.22205907106399536, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.0945311784744263, "rewards/QwenLabelReward/std": 0.13132917881011963, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.8296003341674805, "epoch": 2.4977208297428596e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2519214153289795, "kl": 0.001564494799822569, "learning_rate": 9.99998907247137e-06, "loss": 0.0002, "num_tokens": 147712.0, "reward": 6.610672473907471, "reward_std": 0.4562682807445526, "rewards/CLIPDistanceReward/mean": 1.2311782240867615, "rewards/CLIPDistanceReward/std": 0.16479255631566048, "rewards/HPSV21DistanceReward/mean": 0.8795032501220703, "rewards/HPSV21DistanceReward/std": 0.10189330950379372, "rewards/MANIQAReward/mean": 0.8588408827781677, "rewards/MANIQAReward/std": 0.223855122923851, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.1320312023162842, "rewards/QwenLabelReward/std": 0.07705109566450119, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.978052616119385, "epoch": 2.809935933460717e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2254565954208374, "kl": 0.001480231760069728, "learning_rate": 9.999987511395852e-06, "loss": 0.0001, "num_tokens": 166176.0, "reward": 6.16562032699585, "reward_std": 0.40449047088623047, "rewards/CLIPDistanceReward/mean": 1.1519981622695923, "rewards/CLIPDistanceReward/std": 0.19231434166431427, "rewards/HPSV21DistanceReward/mean": 0.7836761474609375, "rewards/HPSV21DistanceReward/std": 0.058440033346414566, "rewards/MANIQAReward/mean": 0.7489591240882874, "rewards/MANIQAReward/std": 0.15209169685840607, "rewards/QwenFakeDiscrimReward/mean": 0.21875, "rewards/QwenFakeDiscrimReward/std": 0.08400268852710724, "rewards/QwenLabelReward/mean": 1.107812523841858, "rewards/QwenLabelReward/std": 0.1098821833729744, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.917590618133545, "epoch": 3.1221510371785745e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2080357074737549, "kl": 0.0019695935770869255, "learning_rate": 9.999985950320333e-06, "loss": 0.0002, "num_tokens": 184640.0, "reward": 6.550270080566406, "reward_std": 0.3679863512516022, "rewards/CLIPDistanceReward/mean": 1.173481822013855, "rewards/CLIPDistanceReward/std": 0.17859025672078133, "rewards/HPSV21DistanceReward/mean": 0.84783935546875, "rewards/HPSV21DistanceReward/std": 0.058296844363212585, "rewards/MANIQAReward/mean": 0.9193464517593384, "rewards/MANIQAReward/std": 0.2125077098608017, "rewards/QwenFakeDiscrimReward/mean": 0.234375, "rewards/QwenFakeDiscrimReward/std": 0.06148367002606392, "rewards/QwenLabelReward/mean": 1.127343773841858, "rewards/QwenLabelReward/std": 0.013258261606097221, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.084595680236816, "epoch": 3.434366140896432e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.3724128007888794, "kl": 0.002312946366146207, "learning_rate": 9.999984389244815e-06, "loss": 0.0002, "num_tokens": 203104.0, "reward": 6.125527858734131, "reward_std": 0.5582720041275024, "rewards/CLIPDistanceReward/mean": 1.1064560413360596, "rewards/CLIPDistanceReward/std": 0.198241725564003, "rewards/HPSV21DistanceReward/mean": 0.7864799499511719, "rewards/HPSV21DistanceReward/std": 0.08960738778114319, "rewards/MANIQAReward/mean": 0.871687114238739, "rewards/MANIQAReward/std": 0.2775617837905884, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.0851562023162842, "rewards/QwenLabelReward/std": 0.12729495763778687, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.3304443359375, "epoch": 3.7465812446142895e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.3275723457336426, "kl": 0.0022135809995234013, "learning_rate": 9.999982828169296e-06, "loss": 0.0002, "num_tokens": 221568.0, "reward": 6.677940368652344, "reward_std": 0.3971633315086365, "rewards/CLIPDistanceReward/mean": 1.198138952255249, "rewards/CLIPDistanceReward/std": 0.17020755633711815, "rewards/HPSV21DistanceReward/mean": 0.8744964599609375, "rewards/HPSV21DistanceReward/std": 0.10942155495285988, "rewards/MANIQAReward/mean": 0.9647008180618286, "rewards/MANIQAReward/std": 0.2401675134897232, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.1304688453674316, "rewards/QwenLabelReward/std": 0.08513139933347702, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.26898193359375, "epoch": 4.058796348332147e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2554527521133423, "kl": 0.0023454572074115276, "learning_rate": 9.999981267093778e-06, "loss": 0.0002, "num_tokens": 240032.0, "reward": 6.430377006530762, "reward_std": 0.5081740617752075, "rewards/CLIPDistanceReward/mean": 1.176119089126587, "rewards/CLIPDistanceReward/std": 0.17678173631429672, "rewards/HPSV21DistanceReward/mean": 0.8330764770507812, "rewards/HPSV21DistanceReward/std": 0.06638708710670471, "rewards/MANIQAReward/mean": 0.8658918142318726, "rewards/MANIQAReward/std": 0.2314479947090149, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.1320312023162842, "rewards/QwenLabelReward/std": 0.02221085876226425, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.671435832977295, "epoch": 4.3710114520500044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.275119423866272, "kl": 0.003373210784047842, "learning_rate": 9.999979706018258e-06, "loss": 0.0003, "num_tokens": 258496.0, "reward": 6.654054641723633, "reward_std": 0.38790571689605713, "rewards/CLIPDistanceReward/mean": 1.176817536354065, "rewards/CLIPDistanceReward/std": 0.18450217321515083, "rewards/HPSV21DistanceReward/mean": 0.8623161315917969, "rewards/HPSV21DistanceReward/std": 0.04225846007466316, "rewards/MANIQAReward/mean": 0.9632872343063354, "rewards/MANIQAReward/std": 0.2009655386209488, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.1437500715255737, "rewards/QwenLabelReward/std": 0.08082239329814911, "rewards/QwenWeirdDiscrimReward/mean": 0.2421875, "rewards/QwenWeirdDiscrimReward/std": 0.04419417306780815, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.287827491760254, "epoch": 4.683226555767862e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2638883590698242, "kl": 0.002877640537917614, "learning_rate": 9.99997814494274e-06, "loss": 0.0003, "num_tokens": 276960.0, "reward": 6.649702548980713, "reward_std": 0.44256481528282166, "rewards/CLIPDistanceReward/mean": 1.2167330980300903, "rewards/CLIPDistanceReward/std": 0.15435198694467545, "rewards/HPSV21DistanceReward/mean": 0.8567485809326172, "rewards/HPSV21DistanceReward/std": 0.13607237115502357, "rewards/MANIQAReward/mean": 0.9152392745018005, "rewards/MANIQAReward/std": 0.2961329519748688, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1578125953674316, "rewards/QwenLabelReward/std": 0.03780123218894005, "rewards/QwenWeirdDiscrimReward/mean": 0.2421875, "rewards/QwenWeirdDiscrimReward/std": 0.04419417306780815, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.031698703765869, "epoch": 4.995441659485719e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2580305337905884, "kl": 0.005749293137341738, "learning_rate": 9.999976583867222e-06, "loss": 0.0006, "num_tokens": 295424.0, "reward": 6.24772310256958, "reward_std": 0.42724496126174927, "rewards/CLIPDistanceReward/mean": 1.1427398324012756, "rewards/CLIPDistanceReward/std": 0.20864415913820267, "rewards/HPSV21DistanceReward/mean": 0.8126010894775391, "rewards/HPSV21DistanceReward/std": 0.07252603769302368, "rewards/MANIQAReward/mean": 0.8050100803375244, "rewards/MANIQAReward/std": 0.29134368896484375, "rewards/QwenFakeDiscrimReward/mean": 0.171875, "rewards/QwenFakeDiscrimReward/std": 0.11773227155208588, "rewards/QwenLabelReward/mean": 1.1414062976837158, "rewards/QwenLabelReward/std": 0.031501028686761856, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.156553268432617, "epoch": 5.307656763203577e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2772891521453857, "kl": 0.003552363719791174, "learning_rate": 9.999975022791704e-06, "loss": 0.0004, "num_tokens": 313888.0, "reward": 6.44113826751709, "reward_std": 0.41334664821624756, "rewards/CLIPDistanceReward/mean": 1.1253780722618103, "rewards/CLIPDistanceReward/std": 0.21243099868297577, "rewards/HPSV21DistanceReward/mean": 0.8584766387939453, "rewards/HPSV21DistanceReward/std": 0.1353207342326641, "rewards/MANIQAReward/mean": 0.8742103576660156, "rewards/MANIQAReward/std": 0.2524873614311218, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.130468726158142, "rewards/QwenLabelReward/std": 0.08513139933347702, "rewards/QwenWeirdDiscrimReward/mean": 0.2421875, "rewards/QwenWeirdDiscrimReward/std": 0.04419417306780815, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.499603271484375, "epoch": 5.619871866921434e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2289921045303345, "kl": 0.002652363618835807, "learning_rate": 9.999973461716184e-06, "loss": 0.0003, "num_tokens": 332352.0, "reward": 6.489038467407227, "reward_std": 0.46353602409362793, "rewards/CLIPDistanceReward/mean": 1.223012924194336, "rewards/CLIPDistanceReward/std": 0.20438914746046066, "rewards/HPSV21DistanceReward/mean": 0.8093585968017578, "rewards/HPSV21DistanceReward/std": 0.08119793981313705, "rewards/MANIQAReward/mean": 0.9110140800476074, "rewards/MANIQAReward/std": 0.14940990507602692, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.0601563453674316, "rewards/QwenLabelReward/std": 0.17710252106189728, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.222662925720215, "epoch": 5.9320869706392916e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.3303914070129395, "kl": 0.003658069996163249, "learning_rate": 9.999971900640666e-06, "loss": 0.0004, "num_tokens": 350816.0, "reward": 6.452338218688965, "reward_std": 0.5061109662055969, "rewards/CLIPDistanceReward/mean": 1.1446911692619324, "rewards/CLIPDistanceReward/std": 0.22170008718967438, "rewards/HPSV21DistanceReward/mean": 0.8552494049072266, "rewards/HPSV21DistanceReward/std": 0.0648173913359642, "rewards/MANIQAReward/mean": 0.9446442127227783, "rewards/MANIQAReward/std": 0.28712502121925354, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.0859375, "rewards/QwenLabelReward/std": 0.13633118569850922, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.361435890197754, "epoch": 6.244302074357149e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2669092416763306, "kl": 0.0034602172672748566, "learning_rate": 9.999970339565146e-06, "loss": 0.0003, "num_tokens": 369280.0, "reward": 6.44541072845459, "reward_std": 0.4544782042503357, "rewards/CLIPDistanceReward/mean": 1.1906455755233765, "rewards/CLIPDistanceReward/std": 0.17723742872476578, "rewards/HPSV21DistanceReward/mean": 0.8204116821289062, "rewards/HPSV21DistanceReward/std": 0.06402766704559326, "rewards/MANIQAReward/mean": 0.9107962846755981, "rewards/MANIQAReward/std": 0.21920821070671082, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.0984375476837158, "rewards/QwenLabelReward/std": 0.11551733314990997, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.265169143676758, "epoch": 6.556517178075007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2718417644500732, "kl": 0.003340024035423994, "learning_rate": 9.99996877848963e-06, "loss": 0.0003, "num_tokens": 387744.0, "reward": 6.8722968101501465, "reward_std": 0.36055171489715576, "rewards/CLIPDistanceReward/mean": 1.3202731609344482, "rewards/CLIPDistanceReward/std": 0.11399306729435921, "rewards/HPSV21DistanceReward/mean": 0.8651580810546875, "rewards/HPSV21DistanceReward/std": 0.0535629540681839, "rewards/MANIQAReward/mean": 0.904559314250946, "rewards/MANIQAReward/std": 0.230123370885849, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.1437499523162842, "rewards/QwenLabelReward/std": 0.032995618879795074, "rewards/QwenWeirdDiscrimReward/mean": 0.25, "rewards/QwenWeirdDiscrimReward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.374960899353027, "epoch": 6.868732281792864e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.3140023946762085, "kl": 0.004379130434244871, "learning_rate": 9.99996721741411e-06, "loss": 0.0004, "num_tokens": 406208.0, "reward": 6.434380531311035, "reward_std": 0.4323171377182007, "rewards/CLIPDistanceReward/mean": 1.1590567827224731, "rewards/CLIPDistanceReward/std": 0.2023616060614586, "rewards/HPSV21DistanceReward/mean": 0.7939376831054688, "rewards/HPSV21DistanceReward/std": 0.05536552518606186, "rewards/MANIQAReward/mean": 0.9299541115760803, "rewards/MANIQAReward/std": 0.20483846962451935, "rewards/QwenFakeDiscrimReward/mean": 0.234375, "rewards/QwenFakeDiscrimReward/std": 0.06148367002606392, "rewards/QwenLabelReward/mean": 1.1296875476837158, "rewards/QwenLabelReward/std": 0.07605110853910446, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.199890613555908, "epoch": 7.180947385510721e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.371036410331726, "kl": 0.004055140074342489, "learning_rate": 9.999965656338592e-06, "loss": 0.0004, "num_tokens": 424672.0, "reward": 6.093087196350098, "reward_std": 0.49832916259765625, "rewards/CLIPDistanceReward/mean": 1.0871888399124146, "rewards/CLIPDistanceReward/std": 0.20538772642612457, "rewards/HPSV21DistanceReward/mean": 0.8001174926757812, "rewards/HPSV21DistanceReward/std": 0.06698217242956161, "rewards/MANIQAReward/mean": 0.8239438533782959, "rewards/MANIQAReward/std": 0.23175010085105896, "rewards/QwenFakeDiscrimReward/mean": 0.1640625, "rewards/QwenFakeDiscrimReward/std": 0.12063967436552048, "rewards/QwenLabelReward/mean": 1.111718773841858, "rewards/QwenLabelReward/std": 0.07513009756803513, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.6632513999938965, "epoch": 7.493162489228579e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2595688104629517, "kl": 0.0038368860259652138, "learning_rate": 9.999964095263072e-06, "loss": 0.0004, "num_tokens": 443136.0, "reward": 5.975345611572266, "reward_std": 0.5153025984764099, "rewards/CLIPDistanceReward/mean": 1.0566595196723938, "rewards/CLIPDistanceReward/std": 0.2073858380317688, "rewards/HPSV21DistanceReward/mean": 0.7921257019042969, "rewards/HPSV21DistanceReward/std": 0.06261793524026871, "rewards/MANIQAReward/mean": 0.838712751865387, "rewards/MANIQAReward/std": 0.20742380619049072, "rewards/QwenFakeDiscrimReward/mean": 0.1640625, "rewards/QwenFakeDiscrimReward/std": 0.12063967436552048, "rewards/QwenLabelReward/mean": 1.110937476158142, "rewards/QwenLabelReward/std": 0.09795025736093521, "rewards/QwenWeirdDiscrimReward/mean": 0.1640625, "rewards/QwenWeirdDiscrimReward/std": 0.12063967436552048, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.280344486236572, "epoch": 7.805377592946436e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2874106168746948, "kl": 0.0040969643741846085, "learning_rate": 9.999962534187556e-06, "loss": 0.0004, "num_tokens": 461600.0, "reward": 6.090956687927246, "reward_std": 0.3492622375488281, "rewards/CLIPDistanceReward/mean": 1.0820870399475098, "rewards/CLIPDistanceReward/std": 0.18884264677762985, "rewards/HPSV21DistanceReward/mean": 0.7807197570800781, "rewards/HPSV21DistanceReward/std": 0.0538395419716835, "rewards/MANIQAReward/mean": 0.8989369869232178, "rewards/MANIQAReward/std": 0.17104113101959229, "rewards/QwenFakeDiscrimReward/mean": 0.1484375, "rewards/QwenFakeDiscrimReward/std": 0.12474772334098816, "rewards/QwenLabelReward/mean": 1.1460938453674316, "rewards/QwenLabelReward/std": 0.08134688436985016, "rewards/QwenWeirdDiscrimReward/mean": 0.171875, "rewards/QwenWeirdDiscrimReward/std": 0.11773227155208588, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.07451057434082, "epoch": 8.117592696664294e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2815903425216675, "kl": 0.004331190604716539, "learning_rate": 9.999960973112036e-06, "loss": 0.0004, "num_tokens": 480064.0, "reward": 5.910678863525391, "reward_std": 0.3325164020061493, "rewards/CLIPDistanceReward/mean": 1.06599360704422, "rewards/CLIPDistanceReward/std": 0.18919724225997925, "rewards/HPSV21DistanceReward/mean": 0.7868137359619141, "rewards/HPSV21DistanceReward/std": 0.04832137003540993, "rewards/MANIQAReward/mean": 0.69178307056427, "rewards/MANIQAReward/std": 0.2281167209148407, "rewards/QwenFakeDiscrimReward/mean": 0.15625, "rewards/QwenFakeDiscrimReward/std": 0.12296734005212784, "rewards/QwenLabelReward/mean": 1.146093726158142, "rewards/QwenLabelReward/std": 0.03426027670502663, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.292863845825195, "epoch": 8.429807800382151e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.2945539951324463, "kl": 0.005133111961185932, "learning_rate": 9.999959412036518e-06, "loss": 0.0005, "num_tokens": 498528.0, "reward": 5.929190635681152, "reward_std": 0.3859512209892273, "rewards/CLIPDistanceReward/mean": 1.0497870445251465, "rewards/CLIPDistanceReward/std": 0.20292063802480698, "rewards/HPSV21DistanceReward/mean": 0.7739105224609375, "rewards/HPSV21DistanceReward/std": 0.06499895453453064, "rewards/MANIQAReward/mean": 0.814608097076416, "rewards/MANIQAReward/std": 0.2812035381793976, "rewards/QwenFakeDiscrimReward/mean": 0.171875, "rewards/QwenFakeDiscrimReward/std": 0.11773227155208588, "rewards/QwenLabelReward/mean": 1.1078124046325684, "rewards/QwenLabelReward/std": 0.1098821833729744, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.734858989715576, "epoch": 8.742022904100009e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.272476315498352, "kl": 0.0058884648606181145, "learning_rate": 9.999957850960998e-06, "loss": 0.0006, "num_tokens": 516992.0, "reward": 6.270168304443359, "reward_std": 0.39280158281326294, "rewards/CLIPDistanceReward/mean": 1.095599114894867, "rewards/CLIPDistanceReward/std": 0.2135273814201355, "rewards/HPSV21DistanceReward/mean": 0.8424606323242188, "rewards/HPSV21DistanceReward/std": 0.055253688246011734, "rewards/MANIQAReward/mean": 0.8729552030563354, "rewards/MANIQAReward/std": 0.269828200340271, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.1460938453674316, "rewards/QwenLabelReward/std": 0.03426027670502663, "rewards/QwenWeirdDiscrimReward/mean": 0.1953125, "rewards/QwenWeirdDiscrimReward/std": 0.10500335693359375, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.224096298217773, "epoch": 9.054238007817866e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.333516001701355, "kl": 0.00541363749653101, "learning_rate": 9.99995628988548e-06, "loss": 0.0005, "num_tokens": 535456.0, "reward": 6.388664245605469, "reward_std": 0.4473634958267212, "rewards/CLIPDistanceReward/mean": 1.1900262832641602, "rewards/CLIPDistanceReward/std": 0.19321010261774063, "rewards/HPSV21DistanceReward/mean": 0.8024158477783203, "rewards/HPSV21DistanceReward/std": 0.061401356011629105, "rewards/MANIQAReward/mean": 0.900654673576355, "rewards/MANIQAReward/std": 0.2593793272972107, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.1124999523162842, "rewards/QwenLabelReward/std": 0.08082239329814911, "rewards/QwenWeirdDiscrimReward/mean": 0.1953125, "rewards/QwenWeirdDiscrimReward/std": 0.10500335693359375, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.436909198760986, "epoch": 9.366453111535724e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.32342529296875, "kl": 0.0052945357747375965, "learning_rate": 9.999954728809962e-06, "loss": 0.0005, "num_tokens": 553920.0, "reward": 6.060810089111328, "reward_std": 0.5033186674118042, "rewards/CLIPDistanceReward/mean": 1.046980857849121, "rewards/CLIPDistanceReward/std": 0.18705914169549942, "rewards/HPSV21DistanceReward/mean": 0.8070125579833984, "rewards/HPSV21DistanceReward/std": 0.06009075045585632, "rewards/MANIQAReward/mean": 0.8176668882369995, "rewards/MANIQAReward/std": 0.20907077193260193, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.12109375, "rewards/QwenLabelReward/std": 0.08084966987371445, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.125551223754883, "epoch": 9.678668215253581e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.3219630718231201, "kl": 0.005405973643064499, "learning_rate": 9.999953167734444e-06, "loss": 0.0005, "num_tokens": 572384.0, "reward": 6.284246921539307, "reward_std": 0.442164808511734, "rewards/CLIPDistanceReward/mean": 1.1579678654670715, "rewards/CLIPDistanceReward/std": 0.19579196721315384, "rewards/HPSV21DistanceReward/mean": 0.8039512634277344, "rewards/HPSV21DistanceReward/std": 0.056665197014808655, "rewards/MANIQAReward/mean": 0.8385337591171265, "rewards/MANIQAReward/std": 0.2044580578804016, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.115625023841858, "rewards/QwenLabelReward/std": 0.08631105720996857, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.476280212402344, "epoch": 9.990883318971439e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.296881079673767, "kl": 0.005742260254919529, "learning_rate": 9.999951606658924e-06, "loss": 0.0006, "num_tokens": 590848.0, "reward": 6.385183334350586, "reward_std": 0.47731465101242065, "rewards/CLIPDistanceReward/mean": 1.1471171379089355, "rewards/CLIPDistanceReward/std": 0.20982357114553452, "rewards/HPSV21DistanceReward/mean": 0.8438053131103516, "rewards/HPSV21DistanceReward/std": 0.06823650747537613, "rewards/MANIQAReward/mean": 0.9033385515213013, "rewards/MANIQAReward/std": 0.2538517415523529, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.09375, "rewards/QwenLabelReward/std": 0.125402569770813, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.819608688354492, "epoch": 0.00010303098422689296, "frac_reward_zero_std": 0.0, "grad_norm": 1.2446321249008179, "kl": 0.005064681172370911, "learning_rate": 9.999950045583406e-06, "loss": 0.0005, "num_tokens": 609312.0, "reward": 6.259083271026611, "reward_std": 0.489948034286499, "rewards/CLIPDistanceReward/mean": 1.1182302236557007, "rewards/CLIPDistanceReward/std": 0.18100308999419212, "rewards/HPSV21DistanceReward/mean": 0.8076610565185547, "rewards/HPSV21DistanceReward/std": 0.05156708508729935, "rewards/MANIQAReward/mean": 0.8744879961013794, "rewards/MANIQAReward/std": 0.23010288178920746, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.1343750953674316, "rewards/QwenLabelReward/std": 0.10734141618013382, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.902883529663086, "epoch": 0.00010615313526407153, "frac_reward_zero_std": 0.0, "grad_norm": 1.2884489297866821, "kl": 0.006726870313286781, "learning_rate": 9.999948484507888e-06, "loss": 0.0007, "num_tokens": 627776.0, "reward": 6.717419624328613, "reward_std": 0.43727123737335205, "rewards/CLIPDistanceReward/mean": 1.234553337097168, "rewards/CLIPDistanceReward/std": 0.15614907443523407, "rewards/HPSV21DistanceReward/mean": 0.8613338470458984, "rewards/HPSV21DistanceReward/std": 0.07176142185926437, "rewards/MANIQAReward/mean": 0.951426088809967, "rewards/MANIQAReward/std": 0.2554662227630615, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.13671875, "rewards/QwenLabelReward/std": 0.02766766957938671, "rewards/QwenWeirdDiscrimReward/mean": 0.2421875, "rewards/QwenWeirdDiscrimReward/std": 0.04419417306780815, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.184929847717285, "epoch": 0.00010927528630125011, "frac_reward_zero_std": 0.0, "grad_norm": 1.3049592971801758, "kl": 0.005772479344159365, "learning_rate": 9.99994692343237e-06, "loss": 0.0006, "num_tokens": 646240.0, "reward": 6.12513542175293, "reward_std": 0.6124197840690613, "rewards/CLIPDistanceReward/mean": 1.022597074508667, "rewards/CLIPDistanceReward/std": 0.17780658602714539, "rewards/HPSV21DistanceReward/mean": 0.8306064605712891, "rewards/HPSV21DistanceReward/std": 0.08065979927778244, "rewards/MANIQAReward/mean": 0.9046660661697388, "rewards/MANIQAReward/std": 0.3007054924964905, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.123437523841858, "rewards/QwenLabelReward/std": 0.09565932303667068, "rewards/QwenWeirdDiscrimReward/mean": 0.1796875, "rewards/QwenWeirdDiscrimReward/std": 0.11420085281133652, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.53181791305542, "epoch": 0.00011239743733842868, "frac_reward_zero_std": 0.0, "grad_norm": 1.2878429889678955, "kl": 0.006266451440751553, "learning_rate": 9.99994536235685e-06, "loss": 0.0006, "num_tokens": 664704.0, "reward": 6.719705104827881, "reward_std": 0.6210151314735413, "rewards/CLIPDistanceReward/mean": 1.2626533508300781, "rewards/CLIPDistanceReward/std": 0.18076249212026596, "rewards/HPSV21DistanceReward/mean": 0.8399429321289062, "rewards/HPSV21DistanceReward/std": 0.0697900652885437, "rewards/MANIQAReward/mean": 0.9879498481750488, "rewards/MANIQAReward/std": 0.2860192358493805, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.1124999523162842, "rewards/QwenLabelReward/std": 0.1121634915471077, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.821704387664795, "epoch": 0.00011551958837560726, "frac_reward_zero_std": 0.0, "grad_norm": 1.3397001028060913, "kl": 0.006520923227071762, "learning_rate": 9.999943801281332e-06, "loss": 0.0007, "num_tokens": 683168.0, "reward": 6.541408538818359, "reward_std": 0.44073063135147095, "rewards/CLIPDistanceReward/mean": 1.1602474451065063, "rewards/CLIPDistanceReward/std": 0.19990672171115875, "rewards/HPSV21DistanceReward/mean": 0.8196296691894531, "rewards/HPSV21DistanceReward/std": 0.08148961514234543, "rewards/MANIQAReward/mean": 0.9918105602264404, "rewards/MANIQAReward/std": 0.31259843707084656, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.13671875, "rewards/QwenLabelReward/std": 0.02766766957938671, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.488513469696045, "epoch": 0.00011864173941278583, "frac_reward_zero_std": 0.0, "grad_norm": 1.268316626548767, "kl": 0.006187104620039463, "learning_rate": 9.999942240205813e-06, "loss": 0.0006, "num_tokens": 701632.0, "reward": 6.33780574798584, "reward_std": 0.48742905259132385, "rewards/CLIPDistanceReward/mean": 1.1050123572349548, "rewards/CLIPDistanceReward/std": 0.2011624351143837, "rewards/HPSV21DistanceReward/mean": 0.8315505981445312, "rewards/HPSV21DistanceReward/std": 0.07715779542922974, "rewards/MANIQAReward/mean": 0.9404610395431519, "rewards/MANIQAReward/std": 0.2275473028421402, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.1257811784744263, "rewards/QwenLabelReward/std": 0.08315464109182358, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.923390865325928, "epoch": 0.00012176389044996441, "frac_reward_zero_std": 0.0, "grad_norm": 1.3168087005615234, "kl": 0.005816879216581583, "learning_rate": 9.999940679130294e-06, "loss": 0.0006, "num_tokens": 720096.0, "reward": 6.029330253601074, "reward_std": 0.5353747606277466, "rewards/CLIPDistanceReward/mean": 1.0316098928451538, "rewards/CLIPDistanceReward/std": 0.20081055164337158, "rewards/HPSV21DistanceReward/mean": 0.8357276916503906, "rewards/HPSV21DistanceReward/std": 0.056367889046669006, "rewards/MANIQAReward/mean": 0.8790298104286194, "rewards/MANIQAReward/std": 0.24055209755897522, "rewards/QwenFakeDiscrimReward/mean": 0.1484375, "rewards/QwenFakeDiscrimReward/std": 0.12474772334098816, "rewards/QwenLabelReward/mean": 1.0953125953674316, "rewards/QwenLabelReward/std": 0.12055609375238419, "rewards/QwenWeirdDiscrimReward/mean": 0.171875, "rewards/QwenWeirdDiscrimReward/std": 0.11773227155208588, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.03944206237793, "epoch": 0.00012488604148714298, "frac_reward_zero_std": 0.0, "grad_norm": 1.2778514623641968, "kl": 0.006581760477274656, "learning_rate": 9.999939118054775e-06, "loss": 0.0007, "num_tokens": 738560.0, "reward": 6.181083679199219, "reward_std": 0.43840038776397705, "rewards/CLIPDistanceReward/mean": 1.0732033252716064, "rewards/CLIPDistanceReward/std": 0.20688392221927643, "rewards/HPSV21DistanceReward/mean": 0.8260383605957031, "rewards/HPSV21DistanceReward/std": 0.048155613243579865, "rewards/MANIQAReward/mean": 0.872443675994873, "rewards/MANIQAReward/std": 0.24510648846626282, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1351561546325684, "rewards/QwenLabelReward/std": 0.03855212405323982, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.2535552978515625, "epoch": 0.00012800819252432156, "frac_reward_zero_std": 0.0, "grad_norm": 1.2671496868133545, "kl": 0.006979086436331272, "learning_rate": 9.999937556979257e-06, "loss": 0.0007, "num_tokens": 757024.0, "reward": 6.169990539550781, "reward_std": 0.3875640630722046, "rewards/CLIPDistanceReward/mean": 1.0197408199310303, "rewards/CLIPDistanceReward/std": 0.2183612361550331, "rewards/HPSV21DistanceReward/mean": 0.8206520080566406, "rewards/HPSV21DistanceReward/std": 0.1473320610821247, "rewards/MANIQAReward/mean": 0.937641978263855, "rewards/MANIQAReward/std": 0.23602475225925446, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1296875476837158, "rewards/QwenLabelReward/std": 0.01844511367380619, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.47361421585083, "epoch": 0.00013113034356150013, "frac_reward_zero_std": 0.0, "grad_norm": 1.382256269454956, "kl": 0.007010441739112139, "learning_rate": 9.99993599590374e-06, "loss": 0.0007, "num_tokens": 775488.0, "reward": 6.300050735473633, "reward_std": 0.5908696055412292, "rewards/CLIPDistanceReward/mean": 1.134007751941681, "rewards/CLIPDistanceReward/std": 0.22769278287887573, "rewards/HPSV21DistanceReward/mean": 0.8180370330810547, "rewards/HPSV21DistanceReward/std": 0.09549170732498169, "rewards/MANIQAReward/mean": 0.9225234389305115, "rewards/MANIQAReward/std": 0.27633288502693176, "rewards/QwenFakeDiscrimReward/mean": 0.1484375, "rewards/QwenFakeDiscrimReward/std": 0.12474772334098816, "rewards/QwenLabelReward/mean": 1.1218749284744263, "rewards/QwenLabelReward/std": 0.11600717157125473, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.654510498046875, "epoch": 0.0001342524945986787, "frac_reward_zero_std": 0.0, "grad_norm": 1.3311586380004883, "kl": 0.006488606333732605, "learning_rate": 9.99993443482822e-06, "loss": 0.0006, "num_tokens": 793952.0, "reward": 6.430598258972168, "reward_std": 0.4188922047615051, "rewards/CLIPDistanceReward/mean": 1.0939875841140747, "rewards/CLIPDistanceReward/std": 0.17691722512245178, "rewards/HPSV21DistanceReward/mean": 0.8600559234619141, "rewards/HPSV21DistanceReward/std": 0.049359384924173355, "rewards/MANIQAReward/mean": 0.9279803037643433, "rewards/MANIQAReward/std": 0.19311390817165375, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.1648437976837158, "rewards/QwenLabelReward/std": 0.03802557289600372, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.630250930786133, "epoch": 0.00013737464563585728, "frac_reward_zero_std": 0.0, "grad_norm": 1.3824607133865356, "kl": 0.007012604735791683, "learning_rate": 9.999932873752701e-06, "loss": 0.0007, "num_tokens": 812416.0, "reward": 6.438296318054199, "reward_std": 0.41041895747184753, "rewards/CLIPDistanceReward/mean": 1.1141502261161804, "rewards/CLIPDistanceReward/std": 0.19920891523361206, "rewards/HPSV21DistanceReward/mean": 0.8597984313964844, "rewards/HPSV21DistanceReward/std": 0.07660837471485138, "rewards/MANIQAReward/mean": 0.8989928960800171, "rewards/MANIQAReward/std": 0.23932239413261414, "rewards/QwenFakeDiscrimReward/mean": 0.21875, "rewards/QwenFakeDiscrimReward/std": 0.08400268852710724, "rewards/QwenLabelReward/mean": 1.146093726158142, "rewards/QwenLabelReward/std": 0.03426027670502663, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.449301719665527, "epoch": 0.00014049679667303585, "frac_reward_zero_std": 0.0, "grad_norm": 1.343591570854187, "kl": 0.008751961402595043, "learning_rate": 9.999931312677183e-06, "loss": 0.0009, "num_tokens": 830880.0, "reward": 6.531269550323486, "reward_std": 0.5313133001327515, "rewards/CLIPDistanceReward/mean": 1.2232906222343445, "rewards/CLIPDistanceReward/std": 0.19396261125802994, "rewards/HPSV21DistanceReward/mean": 0.8219089508056641, "rewards/HPSV21DistanceReward/std": 0.07268453389406204, "rewards/MANIQAReward/mean": 0.9268078804016113, "rewards/MANIQAReward/std": 0.17758311331272125, "rewards/QwenFakeDiscrimReward/mean": 0.15625, "rewards/QwenFakeDiscrimReward/std": 0.12296734005212784, "rewards/QwenLabelReward/mean": 1.139062523841858, "rewards/QwenLabelReward/std": 0.02974185161292553, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.066934585571289, "epoch": 0.00014361894771021443, "frac_reward_zero_std": 0.0, "grad_norm": 1.2583849430084229, "kl": 0.006894892081618309, "learning_rate": 9.999929751601665e-06, "loss": 0.0007, "num_tokens": 849344.0, "reward": 6.360971450805664, "reward_std": 0.4299190938472748, "rewards/CLIPDistanceReward/mean": 1.131060779094696, "rewards/CLIPDistanceReward/std": 0.2191702499985695, "rewards/HPSV21DistanceReward/mean": 0.8336868286132812, "rewards/HPSV21DistanceReward/std": 0.08889924734830856, "rewards/MANIQAReward/mean": 0.9033513069152832, "rewards/MANIQAReward/std": 0.22541655600070953, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1296875476837158, "rewards/QwenLabelReward/std": 0.07605110853910446, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.661937236785889, "epoch": 0.000146741098747393, "frac_reward_zero_std": 0.0, "grad_norm": 1.3645052909851074, "kl": 0.007640751078724861, "learning_rate": 9.999928190526145e-06, "loss": 0.0008, "num_tokens": 867808.0, "reward": 6.432875156402588, "reward_std": 0.4851674437522888, "rewards/CLIPDistanceReward/mean": 1.1447111368179321, "rewards/CLIPDistanceReward/std": 0.19094277173280716, "rewards/HPSV21DistanceReward/mean": 0.8620872497558594, "rewards/HPSV21DistanceReward/std": 0.05914563685655594, "rewards/MANIQAReward/mean": 0.8505282402038574, "rewards/MANIQAReward/std": 0.22087351977825165, "rewards/QwenFakeDiscrimReward/mean": 0.2421875, "rewards/QwenFakeDiscrimReward/std": 0.04419417306780815, "rewards/QwenLabelReward/mean": 1.0999999046325684, "rewards/QwenLabelReward/std": 0.12313120067119598, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.049304485321045, "epoch": 0.00014986324978457158, "frac_reward_zero_std": 0.0, "grad_norm": 1.269399642944336, "kl": 0.006897418759763241, "learning_rate": 9.999926629450627e-06, "loss": 0.0007, "num_tokens": 886272.0, "reward": 6.110084533691406, "reward_std": 0.4549964368343353, "rewards/CLIPDistanceReward/mean": 1.1138911247253418, "rewards/CLIPDistanceReward/std": 0.20834001898765564, "rewards/HPSV21DistanceReward/mean": 0.785736083984375, "rewards/HPSV21DistanceReward/std": 0.06646380573511124, "rewards/MANIQAReward/mean": 0.8584867715835571, "rewards/MANIQAReward/std": 0.20678770542144775, "rewards/QwenFakeDiscrimReward/mean": 0.15625, "rewards/QwenFakeDiscrimReward/std": 0.12296734005212784, "rewards/QwenLabelReward/mean": 1.1164062023162842, "rewards/QwenLabelReward/std": 0.09237227588891983, "rewards/QwenWeirdDiscrimReward/mean": 0.1796875, "rewards/QwenWeirdDiscrimReward/std": 0.11420085281133652, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.024564743041992, "epoch": 0.00015298540082175015, "frac_reward_zero_std": 0.0, "grad_norm": 1.3421450853347778, "kl": 0.007771933451294899, "learning_rate": 9.999925068375107e-06, "loss": 0.0008, "num_tokens": 904736.0, "reward": 6.071702003479004, "reward_std": 0.6235730648040771, "rewards/CLIPDistanceReward/mean": 1.0847819447517395, "rewards/CLIPDistanceReward/std": 0.18296416476368904, "rewards/HPSV21DistanceReward/mean": 0.8124008178710938, "rewards/HPSV21DistanceReward/std": 0.07124163210391998, "rewards/MANIQAReward/mean": 0.8101495504379272, "rewards/MANIQAReward/std": 0.22735163569450378, "rewards/QwenFakeDiscrimReward/mean": 0.1484375, "rewards/QwenFakeDiscrimReward/std": 0.12474772334098816, "rewards/QwenLabelReward/mean": 1.131250023841858, "rewards/QwenLabelReward/std": 0.04534813016653061, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.453320026397705, "epoch": 0.00015610755185892873, "frac_reward_zero_std": 0.0, "grad_norm": 1.2978057861328125, "kl": 0.007353670429438353, "learning_rate": 9.999923507299591e-06, "loss": 0.0007, "num_tokens": 923200.0, "reward": 6.144754409790039, "reward_std": 0.5444225072860718, "rewards/CLIPDistanceReward/mean": 1.080443024635315, "rewards/CLIPDistanceReward/std": 0.22036810219287872, "rewards/HPSV21DistanceReward/mean": 0.8323993682861328, "rewards/HPSV21DistanceReward/std": 0.06223133206367493, "rewards/MANIQAReward/mean": 0.8675070405006409, "rewards/MANIQAReward/std": 0.23265883326530457, "rewards/QwenFakeDiscrimReward/mean": 0.1640625, "rewards/QwenFakeDiscrimReward/std": 0.12063967436552048, "rewards/QwenLabelReward/mean": 1.1156249046325684, "rewards/QwenLabelReward/std": 0.1003522127866745, "rewards/QwenWeirdDiscrimReward/mean": 0.171875, "rewards/QwenWeirdDiscrimReward/std": 0.11773227155208588, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.570916175842285, "epoch": 0.0001592297028961073, "frac_reward_zero_std": 0.0, "grad_norm": 1.2729332447052002, "kl": 0.007927855476737022, "learning_rate": 9.999921946224071e-06, "loss": 0.0008, "num_tokens": 941664.0, "reward": 6.134909629821777, "reward_std": 0.4191247522830963, "rewards/CLIPDistanceReward/mean": 1.0831640362739563, "rewards/CLIPDistanceReward/std": 0.24547777324914932, "rewards/HPSV21DistanceReward/mean": 0.8462371826171875, "rewards/HPSV21DistanceReward/std": 0.05734020099043846, "rewards/MANIQAReward/mean": 0.79095059633255, "rewards/MANIQAReward/std": 0.21968814730644226, "rewards/QwenFakeDiscrimReward/mean": 0.171875, "rewards/QwenFakeDiscrimReward/std": 0.11773227155208588, "rewards/QwenLabelReward/mean": 1.1257812976837158, "rewards/QwenLabelReward/std": 0.09661287069320679, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.128353595733643, "epoch": 0.00016235185393328588, "frac_reward_zero_std": 0.0, "grad_norm": 1.2531239986419678, "kl": 0.008027514442801476, "learning_rate": 9.999920385148553e-06, "loss": 0.0008, "num_tokens": 960128.0, "reward": 6.782259941101074, "reward_std": 0.4210473299026489, "rewards/CLIPDistanceReward/mean": 1.2353112697601318, "rewards/CLIPDistanceReward/std": 0.18280790001153946, "rewards/HPSV21DistanceReward/mean": 0.8704986572265625, "rewards/HPSV21DistanceReward/std": 0.06445670127868652, "rewards/MANIQAReward/mean": 1.0300147533416748, "rewards/MANIQAReward/std": 0.22412016987800598, "rewards/QwenFakeDiscrimReward/mean": 0.21875, "rewards/QwenFakeDiscrimReward/std": 0.08400268852710724, "rewards/QwenLabelReward/mean": 1.0953125953674316, "rewards/QwenLabelReward/std": 0.12055609375238419, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.131199359893799, "epoch": 0.00016547400497046445, "frac_reward_zero_std": 0.0, "grad_norm": 1.3369675874710083, "kl": 0.008288213983178139, "learning_rate": 9.999918824073033e-06, "loss": 0.0008, "num_tokens": 978592.0, "reward": 6.71404504776001, "reward_std": 0.5276718139648438, "rewards/CLIPDistanceReward/mean": 1.2221407890319824, "rewards/CLIPDistanceReward/std": 0.1706755869090557, "rewards/HPSV21DistanceReward/mean": 0.9309158325195312, "rewards/HPSV21DistanceReward/std": 0.20235875993967056, "rewards/MANIQAReward/mean": 0.9587129354476929, "rewards/MANIQAReward/std": 0.2810458242893219, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.08984375, "rewards/QwenLabelReward/std": 0.1263410896062851, "rewards/QwenWeirdDiscrimReward/mean": 0.1640625, "rewards/QwenWeirdDiscrimReward/std": 0.12063967436552048, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.113888263702393, "epoch": 0.00016859615600764303, "frac_reward_zero_std": 0.0, "grad_norm": 1.3338310718536377, "kl": 0.009376339614391327, "learning_rate": 9.999917262997517e-06, "loss": 0.0009, "num_tokens": 997056.0, "reward": 6.735107421875, "reward_std": 0.30960577726364136, "rewards/CLIPDistanceReward/mean": 1.2029386162757874, "rewards/CLIPDistanceReward/std": 0.1617688462138176, "rewards/HPSV21DistanceReward/mean": 0.83746337890625, "rewards/HPSV21DistanceReward/std": 0.06649389863014221, "rewards/MANIQAReward/mean": 1.035553216934204, "rewards/MANIQAReward/std": 0.19934388995170593, "rewards/QwenFakeDiscrimReward/mean": 0.2421875, "rewards/QwenFakeDiscrimReward/std": 0.04419417306780815, "rewards/QwenLabelReward/mean": 1.1343750953674316, "rewards/QwenLabelReward/std": 0.025200821459293365, "rewards/QwenWeirdDiscrimReward/mean": 0.2421875, "rewards/QwenWeirdDiscrimReward/std": 0.04419417306780815, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.485555648803711, "epoch": 0.0001717183070448216, "frac_reward_zero_std": 0.0, "grad_norm": 1.2641077041625977, "kl": 0.0062630027532577515, "learning_rate": 9.999915701921997e-06, "loss": 0.0006, "num_tokens": 1015520.0, "reward": 6.312243461608887, "reward_std": 0.3064427971839905, "rewards/CLIPDistanceReward/mean": 1.0948007106781006, "rewards/CLIPDistanceReward/std": 0.21074523776769638, "rewards/HPSV21DistanceReward/mean": 0.82403564453125, "rewards/HPSV21DistanceReward/std": 0.05114356800913811, "rewards/MANIQAReward/mean": 0.8519141674041748, "rewards/MANIQAReward/std": 0.21662679314613342, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.1460938453674316, "rewards/QwenLabelReward/std": 0.03426027670502663, "rewards/QwenWeirdDiscrimReward/mean": 0.25, "rewards/QwenWeirdDiscrimReward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.135220050811768, "epoch": 0.00017484045808200017, "frac_reward_zero_std": 0.0, "grad_norm": 1.2229082584381104, "kl": 0.007777086459100246, "learning_rate": 9.999914140846479e-06, "loss": 0.0008, "num_tokens": 1033984.0, "reward": 6.5819172859191895, "reward_std": 0.46489545702934265, "rewards/CLIPDistanceReward/mean": 1.2867448329925537, "rewards/CLIPDistanceReward/std": 0.1442060023546219, "rewards/HPSV21DistanceReward/mean": 0.8334388732910156, "rewards/HPSV21DistanceReward/std": 0.06802699714899063, "rewards/MANIQAReward/mean": 0.860299825668335, "rewards/MANIQAReward/std": 0.21941989660263062, "rewards/QwenFakeDiscrimReward/mean": 0.15625, "rewards/QwenFakeDiscrimReward/std": 0.12296734005212784, "rewards/QwenLabelReward/mean": 1.1140625476837158, "rewards/QwenLabelReward/std": 0.10643366724252701, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.967752933502197, "epoch": 0.00017796260911917875, "frac_reward_zero_std": 0.0, "grad_norm": 1.340774416923523, "kl": 0.009379569441080093, "learning_rate": 9.999912579770959e-06, "loss": 0.0009, "num_tokens": 1052448.0, "reward": 6.288841247558594, "reward_std": 0.5845574736595154, "rewards/CLIPDistanceReward/mean": 1.035302460193634, "rewards/CLIPDistanceReward/std": 0.2021016925573349, "rewards/HPSV21DistanceReward/mean": 0.8155097961425781, "rewards/HPSV21DistanceReward/std": 0.08353127539157867, "rewards/MANIQAReward/mean": 1.052060604095459, "rewards/MANIQAReward/std": 0.24042990803718567, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.12109375, "rewards/QwenLabelReward/std": 0.08084966987371445, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.549872398376465, "epoch": 0.00018108476015635732, "frac_reward_zero_std": 0.0, "grad_norm": 1.239559292793274, "kl": 0.007882797159254551, "learning_rate": 9.999911018695441e-06, "loss": 0.0008, "num_tokens": 1070912.0, "reward": 6.43791389465332, "reward_std": 0.40904492139816284, "rewards/CLIPDistanceReward/mean": 1.160652995109558, "rewards/CLIPDistanceReward/std": 0.20131604373455048, "rewards/HPSV21DistanceReward/mean": 0.8377208709716797, "rewards/HPSV21DistanceReward/std": 0.06891020387411118, "rewards/MANIQAReward/mean": 0.9396036267280579, "rewards/MANIQAReward/std": 0.16728326678276062, "rewards/QwenFakeDiscrimReward/mean": 0.171875, "rewards/QwenFakeDiscrimReward/std": 0.11773227155208588, "rewards/QwenLabelReward/mean": 1.1187500953674316, "rewards/QwenLabelReward/std": 0.10850895941257477, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.5321784019470215, "epoch": 0.0001842069111935359, "frac_reward_zero_std": 0.0, "grad_norm": 1.3053251504898071, "kl": 0.008387461304664612, "learning_rate": 9.999909457619923e-06, "loss": 0.0008, "num_tokens": 1089376.0, "reward": 6.372633934020996, "reward_std": 0.5711358785629272, "rewards/CLIPDistanceReward/mean": 1.1317217350006104, "rewards/CLIPDistanceReward/std": 0.22566315531730652, "rewards/HPSV21DistanceReward/mean": 0.8434238433837891, "rewards/HPSV21DistanceReward/std": 0.06000735983252525, "rewards/MANIQAReward/mean": 0.9059365391731262, "rewards/MANIQAReward/std": 0.151746928691864, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1257812976837158, "rewards/QwenLabelReward/std": 0.08315464109182358, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.227677345275879, "epoch": 0.00018732906223071447, "frac_reward_zero_std": 0.0, "grad_norm": 1.3655048608779907, "kl": 0.009916513226926327, "learning_rate": 9.999907896544405e-06, "loss": 0.001, "num_tokens": 1107840.0, "reward": 5.903385639190674, "reward_std": 0.4724699556827545, "rewards/CLIPDistanceReward/mean": 1.0491719841957092, "rewards/CLIPDistanceReward/std": 0.22907741367816925, "rewards/HPSV21DistanceReward/mean": 0.7722034454345703, "rewards/HPSV21DistanceReward/std": 0.08675484359264374, "rewards/MANIQAReward/mean": 0.8348537683486938, "rewards/MANIQAReward/std": 0.19158567488193512, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.01953125, "rewards/QwenLabelReward/std": 0.18035967648029327, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.129537582397461, "epoch": 0.00019045121326789305, "frac_reward_zero_std": 0.0, "grad_norm": 1.3097251653671265, "kl": 0.008311115205287933, "learning_rate": 9.999906335468885e-06, "loss": 0.0008, "num_tokens": 1126304.0, "reward": 6.495568752288818, "reward_std": 0.5924535989761353, "rewards/CLIPDistanceReward/mean": 1.2184027433395386, "rewards/CLIPDistanceReward/std": 0.18084348738193512, "rewards/HPSV21DistanceReward/mean": 0.8467731475830078, "rewards/HPSV21DistanceReward/std": 0.12655069306492805, "rewards/MANIQAReward/mean": 0.8472481966018677, "rewards/MANIQAReward/std": 0.24999307096004486, "rewards/QwenFakeDiscrimReward/mean": 0.171875, "rewards/QwenFakeDiscrimReward/std": 0.11773227155208588, "rewards/QwenLabelReward/mean": 1.1273436546325684, "rewards/QwenLabelReward/std": 0.07496219873428345, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.071813106536865, "epoch": 0.00019357336430507162, "frac_reward_zero_std": 0.0, "grad_norm": 1.2414672374725342, "kl": 0.008822638541460037, "learning_rate": 9.999904774393367e-06, "loss": 0.0009, "num_tokens": 1144768.0, "reward": 6.851861000061035, "reward_std": 0.510129451751709, "rewards/CLIPDistanceReward/mean": 1.25276780128479, "rewards/CLIPDistanceReward/std": 0.1776387244462967, "rewards/HPSV21DistanceReward/mean": 0.9159793853759766, "rewards/HPSV21DistanceReward/std": 0.20137912780046463, "rewards/MANIQAReward/mean": 0.9878044128417969, "rewards/MANIQAReward/std": 0.2545080780982971, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.1281249523162842, "rewards/QwenLabelReward/std": 0.0883883610367775, "rewards/QwenWeirdDiscrimReward/mean": 0.171875, "rewards/QwenWeirdDiscrimReward/std": 0.11773227155208588, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.517354488372803, "epoch": 0.0001966955153422502, "frac_reward_zero_std": 0.0, "grad_norm": 1.194737434387207, "kl": 0.00856002140790224, "learning_rate": 9.999903213317849e-06, "loss": 0.0009, "num_tokens": 1163232.0, "reward": 6.517750263214111, "reward_std": 0.4532257616519928, "rewards/CLIPDistanceReward/mean": 1.158919095993042, "rewards/CLIPDistanceReward/std": 0.20201244205236435, "rewards/HPSV21DistanceReward/mean": 0.8294239044189453, "rewards/HPSV21DistanceReward/std": 0.05985083431005478, "rewards/MANIQAReward/mean": 0.963720440864563, "rewards/MANIQAReward/std": 0.18319322168827057, "rewards/QwenFakeDiscrimReward/mean": 0.234375, "rewards/QwenFakeDiscrimReward/std": 0.06148367002606392, "rewards/QwenLabelReward/mean": 1.1320312023162842, "rewards/QwenLabelReward/std": 0.07705109566450119, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.92746639251709, "epoch": 0.00019981766637942877, "frac_reward_zero_std": 0.0, "grad_norm": 1.3169008493423462, "kl": 0.01007457822561264, "learning_rate": 9.99990165224233e-06, "loss": 0.001, "num_tokens": 1181696.0, "reward": 6.447408676147461, "reward_std": 0.5936325192451477, "rewards/CLIPDistanceReward/mean": 1.1829187870025635, "rewards/CLIPDistanceReward/std": 0.18921823799610138, "rewards/HPSV21DistanceReward/mean": 0.8354701995849609, "rewards/HPSV21DistanceReward/std": 0.06127145513892174, "rewards/MANIQAReward/mean": 0.8832865953445435, "rewards/MANIQAReward/std": 0.27868810296058655, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.11328125, "rewards/QwenLabelReward/std": 0.09918709844350815, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.154086589813232, "epoch": 0.00020293981741660735, "frac_reward_zero_std": 0.0, "grad_norm": 1.2773691415786743, "kl": 0.010575435124337673, "learning_rate": 9.99990009116681e-06, "loss": 0.0011, "num_tokens": 1200160.0, "reward": 6.635031223297119, "reward_std": 0.4442386031150818, "rewards/CLIPDistanceReward/mean": 1.1448700428009033, "rewards/CLIPDistanceReward/std": 0.20687588304281235, "rewards/HPSV21DistanceReward/mean": 0.8703575134277344, "rewards/HPSV21DistanceReward/std": 0.10141471400856972, "rewards/MANIQAReward/mean": 1.060044527053833, "rewards/MANIQAReward/std": 0.16661526262760162, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.114843726158142, "rewards/QwenLabelReward/std": 0.11321180313825607, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.11346960067749, "epoch": 0.00020606196845378592, "frac_reward_zero_std": 0.0, "grad_norm": 1.364324688911438, "kl": 0.00960706640034914, "learning_rate": 9.999898530091293e-06, "loss": 0.001, "num_tokens": 1218624.0, "reward": 5.873997211456299, "reward_std": 0.5387024879455566, "rewards/CLIPDistanceReward/mean": 1.014047086238861, "rewards/CLIPDistanceReward/std": 0.1868160292506218, "rewards/HPSV21DistanceReward/mean": 0.7860507965087891, "rewards/HPSV21DistanceReward/std": 0.07148782163858414, "rewards/MANIQAReward/mean": 0.8370827436447144, "rewards/MANIQAReward/std": 0.28617867827415466, "rewards/QwenFakeDiscrimReward/mean": 0.15625, "rewards/QwenFakeDiscrimReward/std": 0.12296734005212784, "rewards/QwenLabelReward/mean": 1.1007813215255737, "rewards/QwenLabelReward/std": 0.10766667127609253, "rewards/QwenWeirdDiscrimReward/mean": 0.1796875, "rewards/QwenWeirdDiscrimReward/std": 0.11420085281133652, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.486327171325684, "epoch": 0.0002091841194909645, "frac_reward_zero_std": 0.0, "grad_norm": 1.2683284282684326, "kl": 0.009103009477257729, "learning_rate": 9.999896969015774e-06, "loss": 0.0009, "num_tokens": 1237088.0, "reward": 6.342294216156006, "reward_std": 0.48605507612228394, "rewards/CLIPDistanceReward/mean": 1.1768858432769775, "rewards/CLIPDistanceReward/std": 0.18414612114429474, "rewards/HPSV21DistanceReward/mean": 0.7981529235839844, "rewards/HPSV21DistanceReward/std": 0.06564762443304062, "rewards/MANIQAReward/mean": 0.8906541466712952, "rewards/MANIQAReward/std": 0.1791858673095703, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.134374976158142, "rewards/QwenLabelReward/std": 0.025200821459293365, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.3935394287109375, "epoch": 0.00021230627052814307, "frac_reward_zero_std": 0.0, "grad_norm": 1.2829253673553467, "kl": 0.008876740001142025, "learning_rate": 9.999895407940256e-06, "loss": 0.0009, "num_tokens": 1255552.0, "reward": 6.153975009918213, "reward_std": 0.41977977752685547, "rewards/CLIPDistanceReward/mean": 1.0520233511924744, "rewards/CLIPDistanceReward/std": 0.20747043192386627, "rewards/HPSV21DistanceReward/mean": 0.8071556091308594, "rewards/HPSV21DistanceReward/std": 0.07044436037540436, "rewards/MANIQAReward/mean": 0.8293673396110535, "rewards/MANIQAReward/std": 0.27647969126701355, "rewards/QwenFakeDiscrimReward/mean": 0.234375, "rewards/QwenFakeDiscrimReward/std": 0.06148367002606392, "rewards/QwenLabelReward/mean": 1.1375000476837158, "rewards/QwenLabelReward/std": 0.10040242969989777, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.836034297943115, "epoch": 0.00021542842156532164, "frac_reward_zero_std": 0.0, "grad_norm": 1.2717801332473755, "kl": 0.008838916197419167, "learning_rate": 9.999893846864736e-06, "loss": 0.0009, "num_tokens": 1274016.0, "reward": 6.727662086486816, "reward_std": 0.4599715769290924, "rewards/CLIPDistanceReward/mean": 1.2527709603309631, "rewards/CLIPDistanceReward/std": 0.14290814474225044, "rewards/HPSV21DistanceReward/mean": 0.8549156188964844, "rewards/HPSV21DistanceReward/std": 0.06471451371908188, "rewards/MANIQAReward/mean": 0.9646328091621399, "rewards/MANIQAReward/std": 0.24125872552394867, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.1414061784744263, "rewards/QwenLabelReward/std": 0.036813292652368546, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.968777179718018, "epoch": 0.00021855057260250022, "frac_reward_zero_std": 0.0, "grad_norm": 1.3309261798858643, "kl": 0.0098097063601017, "learning_rate": 9.999892285789218e-06, "loss": 0.001, "num_tokens": 1292480.0, "reward": 6.591516494750977, "reward_std": 0.38068944215774536, "rewards/CLIPDistanceReward/mean": 1.2276822328567505, "rewards/CLIPDistanceReward/std": 0.19322476536035538, "rewards/HPSV21DistanceReward/mean": 0.8124675750732422, "rewards/HPSV21DistanceReward/std": 0.057890597730875015, "rewards/MANIQAReward/mean": 1.031529426574707, "rewards/MANIQAReward/std": 0.18140874803066254, "rewards/QwenFakeDiscrimReward/mean": 0.171875, "rewards/QwenFakeDiscrimReward/std": 0.11773227155208588, "rewards/QwenLabelReward/mean": 1.1281250715255737, "rewards/QwenLabelReward/std": 0.09749897569417953, "rewards/QwenWeirdDiscrimReward/mean": 0.1796875, "rewards/QwenWeirdDiscrimReward/std": 0.11420085281133652, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.201743125915527, "epoch": 0.0002216727236396788, "frac_reward_zero_std": 0.0, "grad_norm": 1.3278558254241943, "kl": 0.010665543377399445, "learning_rate": 9.9998907247137e-06, "loss": 0.0011, "num_tokens": 1310944.0, "reward": 6.5537004470825195, "reward_std": 0.48342764377593994, "rewards/CLIPDistanceReward/mean": 1.221526026725769, "rewards/CLIPDistanceReward/std": 0.16903544962406158, "rewards/HPSV21DistanceReward/mean": 0.840362548828125, "rewards/HPSV21DistanceReward/std": 0.05994560196995735, "rewards/MANIQAReward/mean": 0.9916420578956604, "rewards/MANIQAReward/std": 0.20467722415924072, "rewards/QwenFakeDiscrimReward/mean": 0.1328125, "rewards/QwenFakeDiscrimReward/std": 0.1267518401145935, "rewards/QwenLabelReward/mean": 1.1023437976837158, "rewards/QwenLabelReward/std": 0.10007242858409882, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.9509782791137695, "epoch": 0.00022479487467685737, "frac_reward_zero_std": 0.0, "grad_norm": 1.2989915609359741, "kl": 0.009828956797719002, "learning_rate": 9.99988916363818e-06, "loss": 0.001, "num_tokens": 1329408.0, "reward": 6.3959856033325195, "reward_std": 0.4432118535041809, "rewards/CLIPDistanceReward/mean": 1.1576658487319946, "rewards/CLIPDistanceReward/std": 0.21701092272996902, "rewards/HPSV21DistanceReward/mean": 0.8202304840087891, "rewards/HPSV21DistanceReward/std": 0.09092430770397186, "rewards/MANIQAReward/mean": 0.8472239971160889, "rewards/MANIQAReward/std": 0.23385627567768097, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.1320312023162842, "rewards/QwenLabelReward/std": 0.07705109566450119, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.402599811553955, "epoch": 0.00022791702571403594, "frac_reward_zero_std": 0.0, "grad_norm": 1.3026690483093262, "kl": 0.010762540623545647, "learning_rate": 9.999887602562662e-06, "loss": 0.0011, "num_tokens": 1347872.0, "reward": 6.628430366516113, "reward_std": 0.4794447422027588, "rewards/CLIPDistanceReward/mean": 1.2512588500976562, "rewards/CLIPDistanceReward/std": 0.13688861206173897, "rewards/HPSV21DistanceReward/mean": 0.8415737152099609, "rewards/HPSV21DistanceReward/std": 0.049749236553907394, "rewards/MANIQAReward/mean": 0.9146403074264526, "rewards/MANIQAReward/std": 0.21187174320220947, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.0984375476837158, "rewards/QwenLabelReward/std": 0.14562678337097168, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.224852561950684, "epoch": 0.00023103917675121452, "frac_reward_zero_std": 0.0, "grad_norm": 1.2114664316177368, "kl": 0.009706275537610054, "learning_rate": 9.999886041487144e-06, "loss": 0.001, "num_tokens": 1366336.0, "reward": 6.176843643188477, "reward_std": 0.6170514225959778, "rewards/CLIPDistanceReward/mean": 1.1193881034851074, "rewards/CLIPDistanceReward/std": 0.21057775616645813, "rewards/HPSV21DistanceReward/mean": 0.8008518218994141, "rewards/HPSV21DistanceReward/std": 0.081735759973526, "rewards/MANIQAReward/mean": 0.878551185131073, "rewards/MANIQAReward/std": 0.23770035803318024, "rewards/QwenFakeDiscrimReward/mean": 0.1640625, "rewards/QwenFakeDiscrimReward/std": 0.12063967436552048, "rewards/QwenLabelReward/mean": 1.0906250476837158, "rewards/QwenLabelReward/std": 0.11773227155208588, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.927337646484375, "epoch": 0.0002341613277883931, "frac_reward_zero_std": 0.0, "grad_norm": 1.2508243322372437, "kl": 0.010052671656012535, "learning_rate": 9.999884480411626e-06, "loss": 0.001, "num_tokens": 1384800.0, "reward": 6.518453598022461, "reward_std": 0.5139465928077698, "rewards/CLIPDistanceReward/mean": 1.1168642044067383, "rewards/CLIPDistanceReward/std": 0.2267470583319664, "rewards/HPSV21DistanceReward/mean": 0.8774166107177734, "rewards/HPSV21DistanceReward/std": 0.13572324439883232, "rewards/MANIQAReward/mean": 0.9884858131408691, "rewards/MANIQAReward/std": 0.23626494407653809, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.111718773841858, "rewards/QwenLabelReward/std": 0.07513009756803513, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.469064712524414, "epoch": 0.00023728347882557167, "frac_reward_zero_std": 0.0, "grad_norm": 1.3478745222091675, "kl": 0.01073535531759262, "learning_rate": 9.999882919336106e-06, "loss": 0.0011, "num_tokens": 1403264.0, "reward": 6.656659126281738, "reward_std": 0.4163103699684143, "rewards/CLIPDistanceReward/mean": 1.1729547381401062, "rewards/CLIPDistanceReward/std": 0.18532928824424744, "rewards/HPSV21DistanceReward/mean": 0.8903770446777344, "rewards/HPSV21DistanceReward/std": 0.1494666524231434, "rewards/MANIQAReward/mean": 0.9987451434135437, "rewards/MANIQAReward/std": 0.22216308116912842, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.1484375, "rewards/QwenLabelReward/std": 0.03531970456242561, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.01206636428833, "epoch": 0.00024040562986275024, "frac_reward_zero_std": 0.0, "grad_norm": 1.321268081665039, "kl": 0.014898188412189484, "learning_rate": 9.999881358260588e-06, "loss": 0.0015, "num_tokens": 1421728.0, "reward": 6.650838851928711, "reward_std": 0.31938880681991577, "rewards/CLIPDistanceReward/mean": 1.1991780400276184, "rewards/CLIPDistanceReward/std": 0.19914858043193817, "rewards/HPSV21DistanceReward/mean": 0.8484115600585938, "rewards/HPSV21DistanceReward/std": 0.08030588179826736, "rewards/MANIQAReward/mean": 1.01581609249115, "rewards/MANIQAReward/std": 0.27025946974754333, "rewards/QwenFakeDiscrimReward/mean": 0.234375, "rewards/QwenFakeDiscrimReward/std": 0.06148367002606392, "rewards/QwenLabelReward/mean": 1.0789062976837158, "rewards/QwenLabelReward/std": 0.13199910521507263, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.428371906280518, "epoch": 0.00024352778089992881, "frac_reward_zero_std": 0.0, "grad_norm": 1.3018513917922974, "kl": 0.010107096284627914, "learning_rate": 9.99987979718507e-06, "loss": 0.001, "num_tokens": 1440192.0, "reward": 6.405355930328369, "reward_std": 0.3369462192058563, "rewards/CLIPDistanceReward/mean": 1.128469467163086, "rewards/CLIPDistanceReward/std": 0.2270970121026039, "rewards/HPSV21DistanceReward/mean": 0.8276214599609375, "rewards/HPSV21DistanceReward/std": 0.0577189102768898, "rewards/MANIQAReward/mean": 0.9634867310523987, "rewards/MANIQAReward/std": 0.1852370649576187, "rewards/QwenFakeDiscrimReward/mean": 0.1953125, "rewards/QwenFakeDiscrimReward/std": 0.10500335693359375, "rewards/QwenLabelReward/mean": 1.139062523841858, "rewards/QwenLabelReward/std": 0.02974185161292553, "rewards/QwenWeirdDiscrimReward/mean": 0.1953125, "rewards/QwenWeirdDiscrimReward/std": 0.10500335693359375, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.856346607208252, "epoch": 0.0002466499319371074, "frac_reward_zero_std": 0.0, "grad_norm": 1.2462036609649658, "kl": 0.011311967857182026, "learning_rate": 9.99987823610955e-06, "loss": 0.0011, "num_tokens": 1458656.0, "reward": 6.27631950378418, "reward_std": 0.5904483199119568, "rewards/CLIPDistanceReward/mean": 1.1376744508743286, "rewards/CLIPDistanceReward/std": 0.19612333923578262, "rewards/HPSV21DistanceReward/mean": 0.7914104461669922, "rewards/HPSV21DistanceReward/std": 0.06848488003015518, "rewards/MANIQAReward/mean": 0.8767432570457458, "rewards/MANIQAReward/std": 0.20081265270709991, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1507811546325684, "rewards/QwenLabelReward/std": 0.036191925406455994, "rewards/QwenWeirdDiscrimReward/mean": 0.203125, "rewards/QwenWeirdDiscrimReward/std": 0.09913944453001022, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.71397590637207, "epoch": 0.00024977208297428596, "frac_reward_zero_std": 0.0, "grad_norm": 1.2694483995437622, "kl": 0.010853873565793037, "learning_rate": 9.999876675034032e-06, "loss": 0.0011, "num_tokens": 1477120.0, "reward": 6.61531925201416, "reward_std": 0.5339745283126831, "rewards/CLIPDistanceReward/mean": 1.1640762686729431, "rewards/CLIPDistanceReward/std": 0.1985262930393219, "rewards/HPSV21DistanceReward/mean": 0.8789138793945312, "rewards/HPSV21DistanceReward/std": 0.15052929148077965, "rewards/MANIQAReward/mean": 0.9816823601722717, "rewards/MANIQAReward/std": 0.20988689363002777, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1414062976837158, "rewards/QwenLabelReward/std": 0.031501028686761856, "rewards/QwenWeirdDiscrimReward/mean": 0.21875, "rewards/QwenWeirdDiscrimReward/std": 0.08400268852710724, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.880930423736572, "epoch": 0.00025289423401146454, "frac_reward_zero_std": 0.0, "grad_norm": 1.2771624326705933, "kl": 0.011193495243787766, "learning_rate": 9.999875113958514e-06, "loss": 0.0011, "num_tokens": 1495584.0, "reward": 6.787773132324219, "reward_std": 0.46319013833999634, "rewards/CLIPDistanceReward/mean": 1.2512636184692383, "rewards/CLIPDistanceReward/std": 0.15889307484030724, "rewards/HPSV21DistanceReward/mean": 0.8873405456542969, "rewards/HPSV21DistanceReward/std": 0.05663186311721802, "rewards/MANIQAReward/mean": 0.966814398765564, "rewards/MANIQAReward/std": 0.16403299570083618, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.1531250476837158, "rewards/QwenLabelReward/std": 0.03689022734761238, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.359314918518066, "epoch": 0.0002560163850486431, "frac_reward_zero_std": 0.0, "grad_norm": 1.3135985136032104, "kl": 0.011431424878537655, "learning_rate": 9.999873552882994e-06, "loss": 0.0011, "num_tokens": 1514048.0, "reward": 6.644465923309326, "reward_std": 0.5115830302238464, "rewards/CLIPDistanceReward/mean": 1.1546519994735718, "rewards/CLIPDistanceReward/std": 0.21079139411449432, "rewards/HPSV21DistanceReward/mean": 0.8863191604614258, "rewards/HPSV21DistanceReward/std": 0.1566813476383686, "rewards/MANIQAReward/mean": 1.0093984603881836, "rewards/MANIQAReward/std": 0.17993612587451935, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.1390624046325684, "rewards/QwenLabelReward/std": 0.02974185161292553, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.115431785583496, "epoch": 0.0002591385360858217, "frac_reward_zero_std": 0.0, "grad_norm": 1.2314904928207397, "kl": 0.009851144626736641, "learning_rate": 9.999871991807476e-06, "loss": 0.001, "num_tokens": 1532512.0, "reward": 6.480103492736816, "reward_std": 0.38582515716552734, "rewards/CLIPDistanceReward/mean": 1.1731469631195068, "rewards/CLIPDistanceReward/std": 0.1877496838569641, "rewards/HPSV21DistanceReward/mean": 0.7845344543457031, "rewards/HPSV21DistanceReward/std": 0.06728916615247726, "rewards/MANIQAReward/mean": 0.9788031578063965, "rewards/MANIQAReward/std": 0.2358698695898056, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.1484375, "rewards/QwenLabelReward/std": 0.03531970456242561, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.649871826171875, "epoch": 0.00026226068712300026, "frac_reward_zero_std": 0.0, "grad_norm": 1.2837311029434204, "kl": 0.011531388387084007, "learning_rate": 9.999870430731958e-06, "loss": 0.0012, "num_tokens": 1550976.0, "reward": 6.467432022094727, "reward_std": 0.5702420473098755, "rewards/CLIPDistanceReward/mean": 1.2111746072769165, "rewards/CLIPDistanceReward/std": 0.2069578543305397, "rewards/HPSV21DistanceReward/mean": 0.8359756469726562, "rewards/HPSV21DistanceReward/std": 0.06299323588609695, "rewards/MANIQAReward/mean": 0.8598500490188599, "rewards/MANIQAReward/std": 0.23286640644073486, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.130468726158142, "rewards/QwenLabelReward/std": 0.09831944108009338, "rewards/QwenWeirdDiscrimReward/mean": 0.1796875, "rewards/QwenWeirdDiscrimReward/std": 0.11420085281133652, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.369100570678711, "epoch": 0.00026538283816017884, "frac_reward_zero_std": 0.0, "grad_norm": 1.2944066524505615, "kl": 0.011802257038652897, "learning_rate": 9.99986886965644e-06, "loss": 0.0012, "num_tokens": 1569440.0, "reward": 6.551990509033203, "reward_std": 0.48613283038139343, "rewards/CLIPDistanceReward/mean": 1.2148078083992004, "rewards/CLIPDistanceReward/std": 0.17303722351789474, "rewards/HPSV21DistanceReward/mean": 0.8557090759277344, "rewards/HPSV21DistanceReward/std": 0.12299598380923271, "rewards/MANIQAReward/mean": 0.9398633241653442, "rewards/MANIQAReward/std": 0.2125772088766098, "rewards/QwenFakeDiscrimReward/mean": 0.1640625, "rewards/QwenFakeDiscrimReward/std": 0.12063967436552048, "rewards/QwenLabelReward/mean": 1.150781273841858, "rewards/QwenLabelReward/std": 0.036191921681165695, "rewards/QwenWeirdDiscrimReward/mean": 0.15625, "rewards/QwenWeirdDiscrimReward/std": 0.12296734005212784, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.018946170806885, "epoch": 0.0002685049891973574, "frac_reward_zero_std": 0.0, "grad_norm": 1.4271643161773682, "kl": 0.014272022992372513, "learning_rate": 9.99986730858092e-06, "loss": 0.0014, "num_tokens": 1587904.0, "reward": 6.618888854980469, "reward_std": 0.4122573435306549, "rewards/CLIPDistanceReward/mean": 1.2389565706253052, "rewards/CLIPDistanceReward/std": 0.1402580738067627, "rewards/HPSV21DistanceReward/mean": 0.8069934844970703, "rewards/HPSV21DistanceReward/std": 0.05623254179954529, "rewards/MANIQAReward/mean": 0.9519885182380676, "rewards/MANIQAReward/std": 0.1682148426771164, "rewards/QwenFakeDiscrimReward/mean": 0.21875, "rewards/QwenFakeDiscrimReward/std": 0.08400268852710724, "rewards/QwenLabelReward/mean": 1.1296875476837158, "rewards/QwenLabelReward/std": 0.01844511367380619, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.20851993560791, "epoch": 0.000271627140234536, "frac_reward_zero_std": 0.0, "grad_norm": 1.3302770853042603, "kl": 0.013172083534300327, "learning_rate": 9.999865747505402e-06, "loss": 0.0013, "num_tokens": 1606368.0, "reward": 6.524214744567871, "reward_std": 0.48863306641578674, "rewards/CLIPDistanceReward/mean": 1.1486289501190186, "rewards/CLIPDistanceReward/std": 0.2026970535516739, "rewards/HPSV21DistanceReward/mean": 0.8196773529052734, "rewards/HPSV21DistanceReward/std": 0.073027603328228, "rewards/MANIQAReward/mean": 1.0368207693099976, "rewards/MANIQAReward/std": 0.1703212708234787, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.13671875, "rewards/QwenLabelReward/std": 0.02766766957938671, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.015572547912598, "epoch": 0.00027474929127171456, "frac_reward_zero_std": 0.0, "grad_norm": 1.374786138534546, "kl": 0.013838522136211395, "learning_rate": 9.999864186429884e-06, "loss": 0.0014, "num_tokens": 1624832.0, "reward": 5.835655212402344, "reward_std": 0.5297427773475647, "rewards/CLIPDistanceReward/mean": 1.0395894050598145, "rewards/CLIPDistanceReward/std": 0.19665346294641495, "rewards/HPSV21DistanceReward/mean": 0.8077373504638672, "rewards/HPSV21DistanceReward/std": 0.06120998039841652, "rewards/MANIQAReward/mean": 0.7285017371177673, "rewards/MANIQAReward/std": 0.28362324833869934, "rewards/QwenFakeDiscrimReward/mean": 0.15625, "rewards/QwenFakeDiscrimReward/std": 0.12296734005212784, "rewards/QwenLabelReward/mean": 1.092187523841858, "rewards/QwenLabelReward/std": 0.11260075867176056, "rewards/QwenWeirdDiscrimReward/mean": 0.1640625, "rewards/QwenWeirdDiscrimReward/std": 0.12063967436552048, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.082034111022949, "epoch": 0.00027787144230889313, "frac_reward_zero_std": 0.0, "grad_norm": 1.2705234289169312, "kl": 0.011447586119174957, "learning_rate": 9.999862625354366e-06, "loss": 0.0011, "num_tokens": 1643296.0, "reward": 6.532041549682617, "reward_std": 0.5247108936309814, "rewards/CLIPDistanceReward/mean": 1.223293423652649, "rewards/CLIPDistanceReward/std": 0.16816182434558868, "rewards/HPSV21DistanceReward/mean": 0.8615341186523438, "rewards/HPSV21DistanceReward/std": 0.06826657056808472, "rewards/MANIQAReward/mean": 0.8858242034912109, "rewards/MANIQAReward/std": 0.2799542546272278, "rewards/QwenFakeDiscrimReward/mean": 0.2109375, "rewards/QwenFakeDiscrimReward/std": 0.09222550690174103, "rewards/QwenLabelReward/mean": 1.0546875, "rewards/QwenLabelReward/std": 0.1623215526342392, "rewards/QwenWeirdDiscrimReward/mean": 0.2109375, "rewards/QwenWeirdDiscrimReward/std": 0.09222550690174103, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.037008285522461, "epoch": 0.0002809935933460717, "frac_reward_zero_std": 0.0, "grad_norm": 1.194696068763733, "kl": 0.009676177985966206, "learning_rate": 9.999861064278846e-06, "loss": 0.001, "num_tokens": 1661760.0, "reward": 6.645256996154785, "reward_std": 0.43395093083381653, "rewards/CLIPDistanceReward/mean": 1.157776951789856, "rewards/CLIPDistanceReward/std": 0.17336687073111534, "rewards/HPSV21DistanceReward/mean": 0.8540287017822266, "rewards/HPSV21DistanceReward/std": 0.05959376320242882, "rewards/MANIQAReward/mean": 1.0700827836990356, "rewards/MANIQAReward/std": 0.19643321633338928, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.1296875476837158, "rewards/QwenLabelReward/std": 0.01844511367380619, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.698102951049805, "epoch": 0.0002841157443832503, "frac_reward_zero_std": 0.0, "grad_norm": 1.3488980531692505, "kl": 0.014921500347554684, "learning_rate": 9.999859503203328e-06, "loss": 0.0015, "num_tokens": 1680224.0, "reward": 6.316977500915527, "reward_std": 0.4695993661880493, "rewards/CLIPDistanceReward/mean": 1.0925068855285645, "rewards/CLIPDistanceReward/std": 0.1930340826511383, "rewards/HPSV21DistanceReward/mean": 0.8638286590576172, "rewards/HPSV21DistanceReward/std": 0.16718481481075287, "rewards/MANIQAReward/mean": 0.9574311971664429, "rewards/MANIQAReward/std": 0.2232540100812912, "rewards/QwenFakeDiscrimReward/mean": 0.1875, "rewards/QwenFakeDiscrimReward/std": 0.1099853366613388, "rewards/QwenLabelReward/mean": 1.095312476158142, "rewards/QwenLabelReward/std": 0.12055609375238419, "rewards/QwenWeirdDiscrimReward/mean": 0.1640625, "rewards/QwenWeirdDiscrimReward/std": 0.12063967436552048, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.058731555938721, "epoch": 0.00028723789542042886, "frac_reward_zero_std": 0.0, "grad_norm": 1.322605013847351, "kl": 0.013861821964383125, "learning_rate": 9.99985794212781e-06, "loss": 0.0014, "num_tokens": 1698688.0, "reward": 6.300305366516113, "reward_std": 0.4486587941646576, "rewards/CLIPDistanceReward/mean": 1.047316551208496, "rewards/CLIPDistanceReward/std": 0.17072461172938347, "rewards/HPSV21DistanceReward/mean": 0.8187065124511719, "rewards/HPSV21DistanceReward/std": 0.11990371719002724, "rewards/MANIQAReward/mean": 0.9877907037734985, "rewards/MANIQAReward/std": 0.23089075088500977, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.111718773841858, "rewards/QwenLabelReward/std": 0.07513009756803513, "rewards/QwenWeirdDiscrimReward/mean": 0.2421875, "rewards/QwenWeirdDiscrimReward/std": 0.04419417306780815, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.909784317016602, "epoch": 0.00029036004645760743, "frac_reward_zero_std": 0.0, "grad_norm": 1.3663002252578735, "kl": 0.014044124633073807, "learning_rate": 9.999856381052291e-06, "loss": 0.0014, "num_tokens": 1717152.0, "reward": 6.286442279815674, "reward_std": 0.45500510931015015, "rewards/CLIPDistanceReward/mean": 1.1485949158668518, "rewards/CLIPDistanceReward/std": 0.1992233842611313, "rewards/HPSV21DistanceReward/mean": 0.8088817596435547, "rewards/HPSV21DistanceReward/std": 0.05629899352788925, "rewards/MANIQAReward/mean": 0.9410200119018555, "rewards/MANIQAReward/std": 0.18042035400867462, "rewards/QwenFakeDiscrimReward/mean": 0.1328125, "rewards/QwenFakeDiscrimReward/std": 0.1267518401145935, "rewards/QwenLabelReward/mean": 1.1023437976837158, "rewards/QwenLabelReward/std": 0.10007242858409882, "rewards/QwenWeirdDiscrimReward/mean": 0.1953125, "rewards/QwenWeirdDiscrimReward/std": 0.10500335693359375, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.382993698120117, "epoch": 0.000293482197494786, "frac_reward_zero_std": 0.0, "grad_norm": 1.34286367893219, "kl": 0.014433640986680984, "learning_rate": 9.999854819976772e-06, "loss": 0.0014, "num_tokens": 1735616.0, "reward": 6.061652183532715, "reward_std": 0.5163753628730774, "rewards/CLIPDistanceReward/mean": 1.0559407472610474, "rewards/CLIPDistanceReward/std": 0.21535518020391464, "rewards/HPSV21DistanceReward/mean": 0.8206977844238281, "rewards/HPSV21DistanceReward/std": 0.05320136621594429, "rewards/MANIQAReward/mean": 0.8513439893722534, "rewards/MANIQAReward/std": 0.19229555130004883, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.10546875, "rewards/QwenLabelReward/std": 0.10864532738924026, "rewards/QwenWeirdDiscrimReward/mean": 0.171875, "rewards/QwenWeirdDiscrimReward/std": 0.11773227155208588, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.465245723724365, "epoch": 0.0002966043485319646, "frac_reward_zero_std": 0.0, "grad_norm": 1.3539674282073975, "kl": 0.011551715433597565, "learning_rate": 9.999853258901253e-06, "loss": 0.0012, "num_tokens": 1754080.0, "reward": 6.37191104888916, "reward_std": 0.3475463092327118, "rewards/CLIPDistanceReward/mean": 1.0429537892341614, "rewards/CLIPDistanceReward/std": 0.1726062037050724, "rewards/HPSV21DistanceReward/mean": 0.8243122100830078, "rewards/HPSV21DistanceReward/std": 0.08137667179107666, "rewards/MANIQAReward/mean": 1.0084729194641113, "rewards/MANIQAReward/std": 0.1495041847229004, "rewards/QwenFakeDiscrimReward/mean": 0.2421875, "rewards/QwenFakeDiscrimReward/std": 0.04419417306780815, "rewards/QwenLabelReward/mean": 1.13671875, "rewards/QwenLabelReward/std": 0.02766766957938671, "rewards/QwenWeirdDiscrimReward/mean": 0.25, "rewards/QwenWeirdDiscrimReward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.38861083984375, "epoch": 0.00029972649956914316, "frac_reward_zero_std": 0.0, "grad_norm": 1.321020483970642, "kl": 0.014348190277814865, "learning_rate": 9.999851697825734e-06, "loss": 0.0014, "num_tokens": 1772544.0, "reward": 6.457367420196533, "reward_std": 0.5884271264076233, "rewards/CLIPDistanceReward/mean": 1.1617244482040405, "rewards/CLIPDistanceReward/std": 0.2425595223903656, "rewards/HPSV21DistanceReward/mean": 0.8005180358886719, "rewards/HPSV21DistanceReward/std": 0.07174292206764221, "rewards/MANIQAReward/mean": 0.9836637377738953, "rewards/MANIQAReward/std": 0.20696383714675903, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.111718773841858, "rewards/QwenLabelReward/std": 0.08979914337396622, "rewards/QwenWeirdDiscrimReward/mean": 0.234375, "rewards/QwenWeirdDiscrimReward/std": 0.06148367002606392, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.632518768310547, "epoch": 0.00030284865060632173, "frac_reward_zero_std": 0.0, "grad_norm": 1.3199386596679688, "kl": 0.013946463353931904, "learning_rate": 9.999850136750217e-06, "loss": 0.0014, "num_tokens": 1791008.0, "reward": 6.359943389892578, "reward_std": 0.46949610114097595, "rewards/CLIPDistanceReward/mean": 1.0889617204666138, "rewards/CLIPDistanceReward/std": 0.21345198154449463, "rewards/HPSV21DistanceReward/mean": 0.8147754669189453, "rewards/HPSV21DistanceReward/std": 0.06555086374282837, "rewards/MANIQAReward/mean": 0.9821569919586182, "rewards/MANIQAReward/std": 0.2799859642982483, "rewards/QwenFakeDiscrimReward/mean": 0.2265625, "rewards/QwenFakeDiscrimReward/std": 0.07403614372015, "rewards/QwenLabelReward/mean": 1.1171875, "rewards/QwenLabelReward/std": 0.11420086026191711, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 5.37549352645874, "epoch": 0.0003059708016435003, "frac_reward_zero_std": 0.0, "grad_norm": 1.3158499002456665, "kl": 0.012216787785291672, "learning_rate": 9.999848575674697e-06, "loss": 0.0012, "num_tokens": 1809472.0, "reward": 6.323463439941406, "reward_std": 0.37022027373313904, "rewards/CLIPDistanceReward/mean": 1.1429919004440308, "rewards/CLIPDistanceReward/std": 0.20780770480632782, "rewards/HPSV21DistanceReward/mean": 0.7836952209472656, "rewards/HPSV21DistanceReward/std": 0.05120699480175972, "rewards/MANIQAReward/mean": 0.9224329590797424, "rewards/MANIQAReward/std": 0.2144441455602646, "rewards/QwenFakeDiscrimReward/mean": 0.203125, "rewards/QwenFakeDiscrimReward/std": 0.09913944453001022, "rewards/QwenLabelReward/mean": 1.1179687976837158, "rewards/QwenLabelReward/std": 0.06962873786687851, "rewards/QwenWeirdDiscrimReward/mean": 0.2265625, "rewards/QwenWeirdDiscrimReward/std": 0.07403614372015, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 7.528017520904541, "epoch": 0.0003090929526806789, "frac_reward_zero_std": 0.0, "grad_norm": 1.342447280883789, "kl": 0.013031678274273872, "learning_rate": 9.99984701459918e-06, "loss": 0.0013, "num_tokens": 1827936.0, "reward": 6.676502704620361, "reward_std": 0.515540599822998, "rewards/CLIPDistanceReward/mean": 1.18186616897583, "rewards/CLIPDistanceReward/std": 0.1764284484088421, "rewards/HPSV21DistanceReward/mean": 0.8738784790039062, "rewards/HPSV21DistanceReward/std": 0.14323310926556587, "rewards/MANIQAReward/mean": 1.0509512424468994, "rewards/MANIQAReward/std": 0.21744301915168762, "rewards/QwenFakeDiscrimReward/mean": 0.1796875, "rewards/QwenFakeDiscrimReward/std": 0.11420085281133652, "rewards/QwenLabelReward/mean": 1.139062523841858, "rewards/QwenLabelReward/std": 0.02974185161292553, "rewards/QwenWeirdDiscrimReward/mean": 0.1953125, "rewards/QwenWeirdDiscrimReward/std": 0.10500335693359375, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "coef_1": 1.0, "completions/clipped_ratio": 1.0, "completions/max_length": 576.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 576.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 576.0, "completions/min_terminated_length": 0.0, "entropy": 6.722692012786865, "epoch": 0.00031221510371785745, "frac_reward_zero_std": 0.0, "grad_norm": 1.2652713060379028, "kl": 0.013153746724128723, "learning_rate": 9.99984545352366e-06, "loss": 0.0013, "num_tokens": 1846400.0, "reward": 6.429366111755371, "reward_std": 0.7387886643409729, "rewards/CLIPDistanceReward/mean": 1.2333825826644897, "rewards/CLIPDistanceReward/std": 0.19391363114118576, "rewards/HPSV21DistanceReward/mean": 0.8224048614501953, "rewards/HPSV21DistanceReward/std": 0.06454271078109741, "rewards/MANIQAReward/mean": 0.8670098781585693, "rewards/MANIQAReward/std": 0.2712596654891968, "rewards/QwenFakeDiscrimReward/mean": 0.1484375, "rewards/QwenFakeDiscrimReward/std": 0.12474772334098816, "rewards/QwenLabelReward/mean": 1.114843726158142, "rewards/QwenLabelReward/std": 0.11321180313825607, "rewards/QwenWeirdDiscrimReward/mean": 0.1875, "rewards/QwenWeirdDiscrimReward/std": 0.1099853366613388, "step": 100 } ], "logging_steps": 1.0, "max_steps": 6405840, "num_input_tokens_seen": 1846400, "num_train_epochs": 20, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }