diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,59433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.14839797639123, + "eval_steps": 100, + "global_step": 3300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 262.6458435058594, + "epoch": 0.003372681281618887, + "grad_norm": 18.2549130889672, + "kl": 0.0, + "learning_rate": 9.997184684684683e-07, + "loss": 0.0, + "reward": 2.236002564430237, + "reward_std": 0.798353910446167, + "rewards/final_reward": 0.1961517347854389, + "rewards/mask_iou_reward": 0.09807586739271945, + "rewards/sam_format_reward": 0.78125, + "rewards/sam_reward_func_ultra": 0.4755857586860657, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1, + "think_completion_length": 227.66666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.46876525878906, + "epoch": 0.006745362563237774, + "grad_norm": 3.0259863371376707, + "kl": 0.000598907470703125, + "learning_rate": 9.994369369369369e-07, + "loss": 0.0, + "reward": 2.0085843801498413, + "reward_std": 0.7900800108909607, + "rewards/final_reward": 0.17079922505335737, + "rewards/mask_iou_reward": 0.08539961252667869, + "rewards/sam_format_reward": 0.7604166865348816, + "rewards/sam_reward_func_ultra": 0.2898343503475189, + "rewards/thk_ans_format_reward": 0.9583333730697632, + "step": 2, + "think_completion_length": 164.70833333333331 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.7916717529297, + "epoch": 0.01011804384485666, + "grad_norm": 3.5258361720821183, + "kl": 0.0006771087646484375, + "learning_rate": 9.991554054054052e-07, + "loss": 0.0, + "reward": 2.462438941001892, + "reward_std": 0.8611267507076263, + "rewards/final_reward": 0.5922547578171589, + "rewards/mask_iou_reward": 0.29612737890857943, + "rewards/sam_format_reward": 0.8750000298023224, + "rewards/sam_reward_func_ultra": 0.6395223438739777, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 3, + "think_completion_length": 175.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.59375, + "epoch": 0.013490725126475547, + "grad_norm": 4.726047943419514, + "kl": 0.0013427734375, + "learning_rate": 9.988738738738738e-07, + "loss": 0.0, + "reward": 2.2907389402389526, + "reward_std": 0.7321855425834656, + "rewards/final_reward": 0.4825787532407586, + "rewards/mask_iou_reward": 0.2412893766203793, + "rewards/sam_format_reward": 0.8854166865348816, + "rewards/sam_reward_func_ultra": 0.4574054926633835, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 4, + "think_completion_length": 136.33333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.28125762939453, + "epoch": 0.016863406408094434, + "grad_norm": 5.4744398823185385, + "kl": 0.001377105712890625, + "learning_rate": 9.985923423423422e-07, + "loss": 0.0, + "reward": 2.405760407447815, + "reward_std": 0.7585574686527252, + "rewards/final_reward": 0.7464167821487235, + "rewards/mask_iou_reward": 0.37320839107436177, + "rewards/sam_format_reward": 0.8750000298023224, + "rewards/sam_reward_func_ultra": 0.5724269300699234, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 5, + "think_completion_length": 153.33333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.53125762939453, + "epoch": 0.02023608768971332, + "grad_norm": 2.6261435014718977, + "kl": 0.00251007080078125, + "learning_rate": 9.983108108108107e-07, + "loss": 0.0, + "reward": 2.4854605197906494, + "reward_std": 0.6170332133769989, + "rewards/final_reward": 0.5536762328101035, + "rewards/mask_iou_reward": 0.27683811640505174, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 0.5479605048894882, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 6, + "think_completion_length": 144.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.90625, + "epoch": 0.023608768971332208, + "grad_norm": 3.411913189332803, + "kl": 0.00347900390625, + "learning_rate": 9.980292792792793e-07, + "loss": 0.0, + "reward": 2.411745548248291, + "reward_std": 0.578468382358551, + "rewards/final_reward": 0.24950741038787477, + "rewards/mask_iou_reward": 0.12475370519393739, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 0.46382857859134674, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 7, + "think_completion_length": 147.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.30209350585938, + "epoch": 0.026981450252951095, + "grad_norm": 4.706402296307697, + "kl": 0.0042724609375, + "learning_rate": 9.977477477477476e-07, + "loss": 0.0, + "reward": 2.6951953172683716, + "reward_std": 0.6168608367443085, + "rewards/final_reward": 0.7593355902470126, + "rewards/mask_iou_reward": 0.3796677951235063, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 0.7264453172683716, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 8, + "think_completion_length": 108.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.61458587646484, + "epoch": 0.03035413153456998, + "grad_norm": 5.850108394416396, + "kl": 0.00506591796875, + "learning_rate": 9.974662162162162e-07, + "loss": 0.0, + "reward": 2.4541696310043335, + "reward_std": 0.5115222632884979, + "rewards/final_reward": 0.6870269603439862, + "rewards/mask_iou_reward": 0.3435134801719931, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.48541969060897827, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 9, + "think_completion_length": 179.58333333333331 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.65625762939453, + "epoch": 0.03372681281618887, + "grad_norm": 3.8688201576142967, + "kl": 0.0067596435546875, + "learning_rate": 9.971846846846846e-07, + "loss": 0.0, + "reward": 2.526506543159485, + "reward_std": 0.5167315006256104, + "rewards/final_reward": 1.0214127943524371, + "rewards/mask_iou_reward": 0.5107063971762186, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.6619232296943665, + "rewards/thk_ans_format_reward": 0.8750000298023224, + "step": 10, + "think_completion_length": 109.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.0, + "epoch": 0.03709949409780776, + "grad_norm": 4.7134408075823435, + "kl": 0.010284423828125, + "learning_rate": 9.969031531531531e-07, + "loss": 0.0, + "reward": 2.434975743293762, + "reward_std": 0.40706782042980194, + "rewards/final_reward": 0.9454941172749969, + "rewards/mask_iou_reward": 0.47274705863749844, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4558090567588806, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 11, + "think_completion_length": 133.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.86458587646484, + "epoch": 0.04047217537942664, + "grad_norm": 9.134381347922247, + "kl": 0.010589599609375, + "learning_rate": 9.966216216216215e-07, + "loss": 0.0, + "reward": 2.6082929372787476, + "reward_std": 0.4820929616689682, + "rewards/final_reward": 0.5997067438178366, + "rewards/mask_iou_reward": 0.2998533719089183, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6187096536159515, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 12, + "think_completion_length": 107.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.5729217529297, + "epoch": 0.04384485666104553, + "grad_norm": 5.958574182814823, + "kl": 0.014892578125, + "learning_rate": 9.9634009009009e-07, + "loss": 0.0, + "reward": 2.874386191368103, + "reward_std": 0.6012694835662842, + "rewards/final_reward": 1.1462164005316828, + "rewards/mask_iou_reward": 0.5731082002658414, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.8952195346355438, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 13, + "think_completion_length": 95.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.67709350585938, + "epoch": 0.047217537942664416, + "grad_norm": 9.395049743033171, + "kl": 0.015472412109375, + "learning_rate": 9.960585585585584e-07, + "loss": 0.0, + "reward": 2.7581610679626465, + "reward_std": 0.39922401309013367, + "rewards/final_reward": 0.322904850008464, + "rewards/mask_iou_reward": 0.161452425004232, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7685778141021729, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 14, + "think_completion_length": 91.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.375, + "epoch": 0.050590219224283306, + "grad_norm": 11.83054025776063, + "kl": 0.0218505859375, + "learning_rate": 9.95777027027027e-07, + "loss": 0.0, + "reward": 2.6608554124832153, + "reward_std": 0.4703047573566437, + "rewards/final_reward": 0.664067629310831, + "rewards/mask_iou_reward": 0.3320338146554155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6712720990180969, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 15, + "think_completion_length": 95.33333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.77084350585938, + "epoch": 0.05396290050590219, + "grad_norm": 4.207564129436361, + "kl": 0.023193359375, + "learning_rate": 9.954954954954955e-07, + "loss": 0.0, + "reward": 2.6589574813842773, + "reward_std": 0.2804350033402443, + "rewards/final_reward": 0.302006787477356, + "rewards/mask_iou_reward": 0.151003393738678, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6589572131633759, + "rewards/thk_ans_format_reward": 1.0, + "step": 16, + "think_completion_length": 122.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.70834350585938, + "epoch": 0.05733558178752108, + "grad_norm": 10.925973964489978, + "kl": 0.02789306640625, + "learning_rate": 9.952139639639639e-07, + "loss": 0.0, + "reward": 2.7561737298965454, + "reward_std": 0.5501474440097809, + "rewards/final_reward": 0.37828136243168875, + "rewards/mask_iou_reward": 0.18914068121584438, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7561735510826111, + "rewards/thk_ans_format_reward": 1.0, + "step": 17, + "think_completion_length": 101.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.92708587646484, + "epoch": 0.06070826306913996, + "grad_norm": 4.10772567894951, + "kl": 0.02850341796875, + "learning_rate": 9.949324324324325e-07, + "loss": 0.0, + "reward": 2.593306541442871, + "reward_std": 0.36468201875686646, + "rewards/final_reward": 0.4049101159612564, + "rewards/mask_iou_reward": 0.25851768383432, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5933065414428711, + "rewards/thk_ans_format_reward": 1.0, + "step": 18, + "think_completion_length": 110.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.48958587646484, + "epoch": 0.06408094435075885, + "grad_norm": 5.711915284565923, + "kl": 0.03118896484375, + "learning_rate": 9.946509009009008e-07, + "loss": 0.0, + "reward": 2.6937522888183594, + "reward_std": 0.5062113702297211, + "rewards/final_reward": 1.0718381430291133, + "rewards/mask_iou_reward": 0.5359190715145566, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.7145856469869614, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 19, + "think_completion_length": 129.66666666666669 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.34375762939453, + "epoch": 0.06745362563237774, + "grad_norm": 5.726509164029333, + "kl": 0.039794921875, + "learning_rate": 9.943693693693694e-07, + "loss": 0.0, + "reward": 2.4703195095062256, + "reward_std": 0.3861998915672302, + "rewards/final_reward": 0.34144271480993443, + "rewards/mask_iou_reward": 0.17072135740496722, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.4807361662387848, + "rewards/thk_ans_format_reward": 1.0, + "step": 20, + "think_completion_length": 109.95833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.11459350585938, + "epoch": 0.07082630691399663, + "grad_norm": 10.317711135994427, + "kl": 0.036376953125, + "learning_rate": 9.940878378378377e-07, + "loss": 0.0, + "reward": 2.5708796977996826, + "reward_std": 0.36896252632141113, + "rewards/final_reward": 0.44824195500453407, + "rewards/mask_iou_reward": 0.22412097750226703, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5708796679973602, + "rewards/thk_ans_format_reward": 1.0, + "step": 21, + "think_completion_length": 123.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.96875762939453, + "epoch": 0.07419898819561552, + "grad_norm": 3.1813431001378536, + "kl": 0.0360107421875, + "learning_rate": 9.938063063063063e-07, + "loss": 0.0001, + "reward": 2.4606316089630127, + "reward_std": 0.35946163535118103, + "rewards/final_reward": 0.37041026102779656, + "rewards/mask_iou_reward": 0.18520513051389828, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.46063150465488434, + "rewards/thk_ans_format_reward": 1.0, + "step": 22, + "think_completion_length": 100.45833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.6979217529297, + "epoch": 0.0775716694772344, + "grad_norm": 4.09602863721668, + "kl": 0.047607421875, + "learning_rate": 9.935247747747747e-07, + "loss": 0.0, + "reward": 2.44514799118042, + "reward_std": 0.40501701831817627, + "rewards/final_reward": 0.08334334422518366, + "rewards/mask_iou_reward": 0.04167167211259183, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.4555644392967224, + "rewards/thk_ans_format_reward": 1.0, + "step": 23, + "think_completion_length": 99.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.5104217529297, + "epoch": 0.08094435075885328, + "grad_norm": 6.604801123537757, + "kl": 0.0496826171875, + "learning_rate": 9.932432432432432e-07, + "loss": 0.0, + "reward": 2.8794195652008057, + "reward_std": 0.4920074939727783, + "rewards/final_reward": 0.1977291756476331, + "rewards/mask_iou_reward": 0.09886458782381655, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.9106696248054504, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 24, + "think_completion_length": 97.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.6041717529297, + "epoch": 0.08431703204047218, + "grad_norm": 6.320587869504459, + "kl": 0.0426025390625, + "learning_rate": 9.929617117117116e-07, + "loss": 0.0, + "reward": 2.7572332620620728, + "reward_std": 0.41600461304187775, + "rewards/final_reward": 0.6702880951115837, + "rewards/mask_iou_reward": 0.33514404755579186, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.7676499336957932, + "rewards/thk_ans_format_reward": 1.0, + "step": 25, + "think_completion_length": 101.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.30209350585938, + "epoch": 0.08768971332209106, + "grad_norm": 7.749212860781184, + "kl": 0.046630859375, + "learning_rate": 9.926801801801801e-07, + "loss": 0.0, + "reward": 2.748945713043213, + "reward_std": 0.4598637521266937, + "rewards/final_reward": 0.6600739575919204, + "rewards/mask_iou_reward": 0.3300369787959602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7489454746246338, + "rewards/thk_ans_format_reward": 1.0, + "step": 26, + "think_completion_length": 107.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.75000762939453, + "epoch": 0.09106239460370995, + "grad_norm": 4.032996224301655, + "kl": 0.052001953125, + "learning_rate": 9.923986486486487e-07, + "loss": 0.0001, + "reward": 2.9561160802841187, + "reward_std": 0.4863891154527664, + "rewards/final_reward": 0.6586801252841948, + "rewards/mask_iou_reward": 0.3293400626420974, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9561160504817963, + "rewards/thk_ans_format_reward": 1.0, + "step": 27, + "think_completion_length": 84.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.68750762939453, + "epoch": 0.09443507588532883, + "grad_norm": 22.503580944554333, + "kl": 0.05712890625, + "learning_rate": 9.92117117117117e-07, + "loss": 0.0001, + "reward": 2.79542875289917, + "reward_std": 0.3854813724756241, + "rewards/final_reward": 0.9766034139788797, + "rewards/mask_iou_reward": 0.48830170698943987, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7954287976026535, + "rewards/thk_ans_format_reward": 1.0, + "step": 28, + "think_completion_length": 107.29166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.90625, + "epoch": 0.09780775716694773, + "grad_norm": 5.444098681896409, + "kl": 0.053955078125, + "learning_rate": 9.918355855855856e-07, + "loss": 0.0001, + "reward": 3.074863076210022, + "reward_std": 0.3014441579580307, + "rewards/final_reward": 1.681790123556664, + "rewards/mask_iou_reward": 0.840895061778332, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0748630166053772, + "rewards/thk_ans_format_reward": 1.0, + "step": 29, + "think_completion_length": 108.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.90625762939453, + "epoch": 0.10118043844856661, + "grad_norm": 3.941049607151489, + "kl": 0.0626220703125, + "learning_rate": 9.91554054054054e-07, + "loss": 0.0001, + "reward": 2.320728898048401, + "reward_std": 0.24627278745174408, + "rewards/final_reward": 0.30145627819432563, + "rewards/mask_iou_reward": 0.15072813909716282, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.3207288384437561, + "rewards/thk_ans_format_reward": 1.0, + "step": 30, + "think_completion_length": 75.54166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.37500762939453, + "epoch": 0.1045531197301855, + "grad_norm": 5.256367822273493, + "kl": 0.072021484375, + "learning_rate": 9.912725225225226e-07, + "loss": 0.0001, + "reward": 2.81955087184906, + "reward_std": 0.4395739734172821, + "rewards/final_reward": 0.6934297826888204, + "rewards/mask_iou_reward": 0.3467148913444102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8195509016513824, + "rewards/thk_ans_format_reward": 1.0, + "step": 31, + "think_completion_length": 75.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8541717529297, + "epoch": 0.10792580101180438, + "grad_norm": 4.941175549973696, + "kl": 0.0615234375, + "learning_rate": 9.90990990990991e-07, + "loss": 0.0001, + "reward": 3.0443195104599, + "reward_std": 0.36992163956165314, + "rewards/final_reward": 1.0586814928047903, + "rewards/mask_iou_reward": 0.5293407464023951, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0443194210529327, + "rewards/thk_ans_format_reward": 1.0, + "step": 32, + "think_completion_length": 106.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.14584350585938, + "epoch": 0.11129848229342328, + "grad_norm": 6.373319719553773, + "kl": 0.06689453125, + "learning_rate": 9.907094594594595e-07, + "loss": 0.0001, + "reward": 2.7814066410064697, + "reward_std": 0.34776973724365234, + "rewards/final_reward": 0.39388861840120193, + "rewards/mask_iou_reward": 0.19694430920060096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7814066708087921, + "rewards/thk_ans_format_reward": 1.0, + "step": 33, + "think_completion_length": 93.83333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.0, + "epoch": 0.11467116357504216, + "grad_norm": 6.760195356920477, + "kl": 0.063232421875, + "learning_rate": 9.904279279279278e-07, + "loss": 0.0001, + "reward": 3.0221351385116577, + "reward_std": 0.3316876143217087, + "rewards/final_reward": 1.0392349291181158, + "rewards/mask_iou_reward": 0.5196174645590579, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0221351981163025, + "rewards/thk_ans_format_reward": 1.0, + "step": 34, + "think_completion_length": 72.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.12500762939453, + "epoch": 0.11804384485666104, + "grad_norm": 11.70900945875506, + "kl": 0.071044921875, + "learning_rate": 9.901463963963964e-07, + "loss": 0.0001, + "reward": 2.652697205543518, + "reward_std": 0.44748905301094055, + "rewards/final_reward": 1.167913835000586, + "rewards/mask_iou_reward": 0.583956917500293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6526971757411957, + "rewards/thk_ans_format_reward": 1.0, + "step": 35, + "think_completion_length": 93.29166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.48959350585938, + "epoch": 0.12141652613827993, + "grad_norm": 10.978024050704882, + "kl": 0.0614013671875, + "learning_rate": 9.89864864864865e-07, + "loss": 0.0001, + "reward": 3.1266239881515503, + "reward_std": 0.3739871680736542, + "rewards/final_reward": 1.6664728440852943, + "rewards/mask_iou_reward": 0.8332364220426471, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1370405852794647, + "rewards/thk_ans_format_reward": 1.0, + "step": 36, + "think_completion_length": 75.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1666717529297, + "epoch": 0.12478920741989882, + "grad_norm": 4.687339166938046, + "kl": 0.07763671875, + "learning_rate": 9.895833333333333e-07, + "loss": 0.0001, + "reward": 2.7773534059524536, + "reward_std": 0.4481920897960663, + "rewards/final_reward": 0.7356097783982714, + "rewards/mask_iou_reward": 0.3678048891991357, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.7981867790222168, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 37, + "think_completion_length": 78.08333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.65625, + "epoch": 0.1281618887015177, + "grad_norm": 16.500247450151882, + "kl": 0.07470703125, + "learning_rate": 9.893018018018019e-07, + "loss": 0.0001, + "reward": 2.6673413515090942, + "reward_std": 0.41427473723888397, + "rewards/final_reward": 0.9702149166908527, + "rewards/mask_iou_reward": 0.48510745834542635, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.6777579486370087, + "rewards/thk_ans_format_reward": 1.0, + "step": 38, + "think_completion_length": 86.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.98959350585938, + "epoch": 0.1315345699831366, + "grad_norm": 58.074807859002405, + "kl": 0.08544921875, + "learning_rate": 9.890202702702702e-07, + "loss": 0.0001, + "reward": 2.5168732404708862, + "reward_std": 0.6337110698223114, + "rewards/final_reward": 0.47920808376200985, + "rewards/mask_iou_reward": 0.23960404188100493, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 0.5585398375988007, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 39, + "think_completion_length": 95.45833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.45833587646484, + "epoch": 0.13490725126475547, + "grad_norm": 4.72506162337957, + "kl": 0.073486328125, + "learning_rate": 9.887387387387386e-07, + "loss": 0.0001, + "reward": 3.031570553779602, + "reward_std": 0.46547742187976837, + "rewards/final_reward": 0.9411258904506269, + "rewards/mask_iou_reward": 0.47056294522531344, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.031570553779602, + "rewards/thk_ans_format_reward": 1.0, + "step": 40, + "think_completion_length": 87.66666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1354217529297, + "epoch": 0.13827993254637436, + "grad_norm": 6.6206234978194125, + "kl": 0.078369140625, + "learning_rate": 9.884572072072072e-07, + "loss": 0.0001, + "reward": 3.18060564994812, + "reward_std": 0.49932096898555756, + "rewards/final_reward": 1.1321352132892444, + "rewards/mask_iou_reward": 0.5660676066446222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1806055903434753, + "rewards/thk_ans_format_reward": 1.0, + "step": 41, + "think_completion_length": 82.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.0416717529297, + "epoch": 0.14165261382799327, + "grad_norm": 3.8491770091768016, + "kl": 0.08154296875, + "learning_rate": 9.881756756756755e-07, + "loss": 0.0001, + "reward": 2.55053448677063, + "reward_std": 0.31911052763462067, + "rewards/final_reward": 0.5311662418642663, + "rewards/mask_iou_reward": 0.26558312093213315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.5505344271659851, + "rewards/thk_ans_format_reward": 1.0, + "step": 42, + "think_completion_length": 90.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0416717529297, + "epoch": 0.14502529510961215, + "grad_norm": 6.789756683137194, + "kl": 0.078369140625, + "learning_rate": 9.87894144144144e-07, + "loss": 0.0001, + "reward": 2.9492753744125366, + "reward_std": 0.2880386933684349, + "rewards/final_reward": 0.5763597238360774, + "rewards/mask_iou_reward": 0.2881798619180387, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9492754340171814, + "rewards/thk_ans_format_reward": 1.0, + "step": 43, + "think_completion_length": 70.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.71875762939453, + "epoch": 0.14839797639123103, + "grad_norm": 13.041295860696609, + "kl": 0.078369140625, + "learning_rate": 9.876126126126124e-07, + "loss": 0.0001, + "reward": 3.086544990539551, + "reward_std": 0.3482399433851242, + "rewards/final_reward": 1.6059155017247742, + "rewards/mask_iou_reward": 0.8029577508623871, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0865449905395508, + "rewards/thk_ans_format_reward": 1.0, + "step": 44, + "think_completion_length": 86.08333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.78125762939453, + "epoch": 0.15177065767284992, + "grad_norm": 7.34001355188649, + "kl": 0.07568359375, + "learning_rate": 9.87331081081081e-07, + "loss": 0.0001, + "reward": 2.7933939695358276, + "reward_std": 0.455650195479393, + "rewards/final_reward": 1.0957763493410244, + "rewards/mask_iou_reward": 0.5478881746705122, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7933937758207321, + "rewards/thk_ans_format_reward": 1.0, + "step": 45, + "think_completion_length": 72.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1354217529297, + "epoch": 0.1551433389544688, + "grad_norm": 6.723874556684061, + "kl": 0.08349609375, + "learning_rate": 9.870495495495496e-07, + "loss": 0.0001, + "reward": 3.034854769706726, + "reward_std": 0.3657161295413971, + "rewards/final_reward": 1.2670155002357293, + "rewards/mask_iou_reward": 0.6335077501178646, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0348548293113708, + "rewards/thk_ans_format_reward": 1.0, + "step": 46, + "think_completion_length": 85.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.5729217529297, + "epoch": 0.15851602023608768, + "grad_norm": 5.1064273040513815, + "kl": 0.094970703125, + "learning_rate": 9.86768018018018e-07, + "loss": 0.0001, + "reward": 2.641897439956665, + "reward_std": 0.4104642868041992, + "rewards/final_reward": 0.6724832396800458, + "rewards/mask_iou_reward": 0.3362416198400229, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.6523140668869019, + "rewards/thk_ans_format_reward": 1.0, + "step": 47, + "think_completion_length": 83.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.14584350585938, + "epoch": 0.16188870151770657, + "grad_norm": 15.24708912565845, + "kl": 0.08447265625, + "learning_rate": 9.864864864864865e-07, + "loss": 0.0001, + "reward": 2.9382331371307373, + "reward_std": 0.3587312549352646, + "rewards/final_reward": 1.0089048256920612, + "rewards/mask_iou_reward": 0.5044524128460306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9382330775260925, + "rewards/thk_ans_format_reward": 1.0, + "step": 48, + "think_completion_length": 83.08333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.59375, + "epoch": 0.16526138279932545, + "grad_norm": 7.158849673206477, + "kl": 0.09033203125, + "learning_rate": 9.862049549549548e-07, + "loss": 0.0001, + "reward": 2.7654584646224976, + "reward_std": 0.3078659772872925, + "rewards/final_reward": 0.931761202507062, + "rewards/mask_iou_reward": 0.465880601253531, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7654582858085632, + "rewards/thk_ans_format_reward": 1.0, + "step": 49, + "think_completion_length": 77.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.30208587646484, + "epoch": 0.16863406408094436, + "grad_norm": 8.151717013930563, + "kl": 0.090087890625, + "learning_rate": 9.859234234234234e-07, + "loss": 0.0001, + "reward": 2.9706579446792603, + "reward_std": 0.39569392800331116, + "rewards/final_reward": 0.44502848416067103, + "rewards/mask_iou_reward": 0.22251424208033552, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9706579446792603, + "rewards/thk_ans_format_reward": 1.0, + "step": 50, + "think_completion_length": 102.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.7916717529297, + "epoch": 0.17200674536256325, + "grad_norm": 6.831911618016697, + "kl": 0.085205078125, + "learning_rate": 9.856418918918918e-07, + "loss": 0.0001, + "reward": 3.266450881958008, + "reward_std": 0.2550960034132004, + "rewards/final_reward": 1.463418817207728, + "rewards/mask_iou_reward": 0.731709408603864, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2664507329463959, + "rewards/thk_ans_format_reward": 1.0, + "step": 51, + "think_completion_length": 74.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.15625, + "epoch": 0.17537942664418213, + "grad_norm": 84.87485355159552, + "kl": 0.09228515625, + "learning_rate": 9.853603603603603e-07, + "loss": 0.0001, + "reward": 2.6442378759384155, + "reward_std": 0.28126492351293564, + "rewards/final_reward": 1.2378643356828511, + "rewards/mask_iou_reward": 0.6189321678414256, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6442377269268036, + "rewards/thk_ans_format_reward": 1.0, + "step": 52, + "think_completion_length": 65.54166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.27083587646484, + "epoch": 0.178752107925801, + "grad_norm": 11.894833801964715, + "kl": 0.089111328125, + "learning_rate": 9.850788288288287e-07, + "loss": 0.0001, + "reward": 2.7687995433807373, + "reward_std": 0.26568184792995453, + "rewards/final_reward": 0.879780526858291, + "rewards/mask_iou_reward": 0.4398902634291455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7687995284795761, + "rewards/thk_ans_format_reward": 1.0, + "step": 53, + "think_completion_length": 72.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.0416717529297, + "epoch": 0.1821247892074199, + "grad_norm": 5.807968972350355, + "kl": 0.089111328125, + "learning_rate": 9.847972972972973e-07, + "loss": 0.0001, + "reward": 2.8916733264923096, + "reward_std": 0.20665724575519562, + "rewards/final_reward": 0.7938957174186951, + "rewards/mask_iou_reward": 0.39694785870934757, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8916733860969543, + "rewards/thk_ans_format_reward": 1.0, + "step": 54, + "think_completion_length": 82.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.87500762939453, + "epoch": 0.18549747048903878, + "grad_norm": 7.6025544739888975, + "kl": 0.100341796875, + "learning_rate": 9.845157657657656e-07, + "loss": 0.0001, + "reward": 2.9938149452209473, + "reward_std": 0.2959776520729065, + "rewards/final_reward": 0.8594974404944786, + "rewards/mask_iou_reward": 0.4297487202472393, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.0146484375, + "rewards/thk_ans_format_reward": 1.0, + "step": 55, + "think_completion_length": 76.20833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.6666717529297, + "epoch": 0.18887015177065766, + "grad_norm": 4.240339295381267, + "kl": 0.09375, + "learning_rate": 9.842342342342342e-07, + "loss": 0.0001, + "reward": 3.2571107149124146, + "reward_std": 0.3112206757068634, + "rewards/final_reward": 1.0376420191306543, + "rewards/mask_iou_reward": 0.5188210095653272, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2571107149124146, + "rewards/thk_ans_format_reward": 1.0, + "step": 56, + "think_completion_length": 69.45833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9375, + "epoch": 0.19224283305227655, + "grad_norm": 9.498023007923031, + "kl": 0.0947265625, + "learning_rate": 9.839527027027027e-07, + "loss": 0.0001, + "reward": 2.7155935764312744, + "reward_std": 0.33761440217494965, + "rewards/final_reward": 1.145701922917429, + "rewards/mask_iou_reward": 0.5728509614587145, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.7260101139545441, + "rewards/thk_ans_format_reward": 1.0, + "step": 57, + "think_completion_length": 79.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.95833587646484, + "epoch": 0.19561551433389546, + "grad_norm": 12.122477919232969, + "kl": 0.1005859375, + "learning_rate": 9.83671171171171e-07, + "loss": 0.0001, + "reward": 2.712533116340637, + "reward_std": 0.20152553170919418, + "rewards/final_reward": 1.0981164497885971, + "rewards/mask_iou_reward": 0.5490582248942986, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7125331163406372, + "rewards/thk_ans_format_reward": 1.0, + "step": 58, + "think_completion_length": 64.45833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.39583587646484, + "epoch": 0.19898819561551434, + "grad_norm": 4.960251178888369, + "kl": 0.09619140625, + "learning_rate": 9.833896396396397e-07, + "loss": 0.0001, + "reward": 2.893130302429199, + "reward_std": 0.26960865780711174, + "rewards/final_reward": 0.44181123180177767, + "rewards/mask_iou_reward": 0.22090561590088884, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8931301832199097, + "rewards/thk_ans_format_reward": 1.0, + "step": 59, + "think_completion_length": 83.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.34375762939453, + "epoch": 0.20236087689713322, + "grad_norm": 14.477576256950904, + "kl": 0.112060546875, + "learning_rate": 9.83108108108108e-07, + "loss": 0.0001, + "reward": 3.004229187965393, + "reward_std": 0.26009829342365265, + "rewards/final_reward": 1.154425956510769, + "rewards/mask_iou_reward": 0.5772129782553845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0042291581630707, + "rewards/thk_ans_format_reward": 1.0, + "step": 60, + "think_completion_length": 65.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.48958587646484, + "epoch": 0.2057335581787521, + "grad_norm": 5.4023722595664365, + "kl": 0.10302734375, + "learning_rate": 9.828265765765766e-07, + "loss": 0.0001, + "reward": 2.922378182411194, + "reward_std": 0.2460155412554741, + "rewards/final_reward": 1.1422041590321816, + "rewards/mask_iou_reward": 0.5711020795160908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9223781824111938, + "rewards/thk_ans_format_reward": 1.0, + "step": 61, + "think_completion_length": 74.54166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.84375762939453, + "epoch": 0.209106239460371, + "grad_norm": 17.679265660046486, + "kl": 0.159912109375, + "learning_rate": 9.82545045045045e-07, + "loss": 0.0002, + "reward": 2.8391278982162476, + "reward_std": 0.38828714191913605, + "rewards/final_reward": 1.057218125291731, + "rewards/mask_iou_reward": 0.5286090626458655, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8391278684139252, + "rewards/thk_ans_format_reward": 1.0, + "step": 62, + "think_completion_length": 71.04166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.30208587646484, + "epoch": 0.21247892074198987, + "grad_norm": 13.719301439552309, + "kl": 0.11181640625, + "learning_rate": 9.822635135135135e-07, + "loss": 0.0001, + "reward": 3.010664939880371, + "reward_std": 0.240879625082016, + "rewards/final_reward": 1.6053594054522557, + "rewards/mask_iou_reward": 0.8026797027261279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0106649100780487, + "rewards/thk_ans_format_reward": 1.0, + "step": 63, + "think_completion_length": 69.83333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8854217529297, + "epoch": 0.21585160202360876, + "grad_norm": 40.58424129421211, + "kl": 0.12646484375, + "learning_rate": 9.819819819819819e-07, + "loss": 0.0001, + "reward": 2.95255708694458, + "reward_std": 0.3045773357152939, + "rewards/final_reward": 0.5511350359294102, + "rewards/mask_iou_reward": 0.2755675179647051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9525572657585144, + "rewards/thk_ans_format_reward": 1.0, + "step": 64, + "think_completion_length": 76.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.6666717529297, + "epoch": 0.21922428330522767, + "grad_norm": 6.376365790032157, + "kl": 0.111328125, + "learning_rate": 9.817004504504504e-07, + "loss": 0.0001, + "reward": 3.245211124420166, + "reward_std": 0.3611130267381668, + "rewards/final_reward": 1.1441606900744024, + "rewards/mask_iou_reward": 0.5720803450372012, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.255627989768982, + "rewards/thk_ans_format_reward": 1.0, + "step": 65, + "think_completion_length": 73.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.8854217529297, + "epoch": 0.22259696458684655, + "grad_norm": 4.339143871006515, + "kl": 0.11181640625, + "learning_rate": 9.81418918918919e-07, + "loss": 0.0001, + "reward": 3.0247561931610107, + "reward_std": 0.34554168581962585, + "rewards/final_reward": 1.0067040342282128, + "rewards/mask_iou_reward": 0.5033520171141064, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.024756371974945, + "rewards/thk_ans_format_reward": 1.0, + "step": 66, + "think_completion_length": 76.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.21875762939453, + "epoch": 0.22596964586846544, + "grad_norm": 5.8507038942252905, + "kl": 0.112548828125, + "learning_rate": 9.811373873873873e-07, + "loss": 0.0001, + "reward": 2.7070631980895996, + "reward_std": 0.27164027094841003, + "rewards/final_reward": 0.6720597954271437, + "rewards/mask_iou_reward": 0.33602989771357183, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7070631682872772, + "rewards/thk_ans_format_reward": 1.0, + "step": 67, + "think_completion_length": 60.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.55209350585938, + "epoch": 0.22934232715008432, + "grad_norm": 10.822994146609568, + "kl": 0.12646484375, + "learning_rate": 9.80855855855856e-07, + "loss": 0.0001, + "reward": 2.8537681102752686, + "reward_std": 0.34967314451932907, + "rewards/final_reward": 0.8741368306469235, + "rewards/mask_iou_reward": 0.43706841532346175, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.874601423740387, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 68, + "think_completion_length": 66.41666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.86459350585938, + "epoch": 0.2327150084317032, + "grad_norm": 5.3148935449566235, + "kl": 0.142578125, + "learning_rate": 9.805743243243243e-07, + "loss": 0.0001, + "reward": 2.923313856124878, + "reward_std": 0.38698340952396393, + "rewards/final_reward": 0.8756449063816937, + "rewards/mask_iou_reward": 0.43782245319084684, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9233138561248779, + "rewards/thk_ans_format_reward": 1.0, + "step": 69, + "think_completion_length": 79.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.6979217529297, + "epoch": 0.23608768971332209, + "grad_norm": 5.415557085648394, + "kl": 0.107177734375, + "learning_rate": 9.802927927927928e-07, + "loss": 0.0001, + "reward": 2.9035059213638306, + "reward_std": 0.19867272675037384, + "rewards/final_reward": 1.3126234085256436, + "rewards/mask_iou_reward": 0.6563117042628218, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9035059213638306, + "rewards/thk_ans_format_reward": 1.0, + "step": 70, + "think_completion_length": 77.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.64584350585938, + "epoch": 0.23946037099494097, + "grad_norm": 10.86839202590463, + "kl": 0.114990234375, + "learning_rate": 9.800112612612612e-07, + "loss": 0.0001, + "reward": 2.8913815021514893, + "reward_std": 0.5183416604995728, + "rewards/final_reward": 0.48871107211835074, + "rewards/mask_iou_reward": 0.24435553605917537, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.9017982482910156, + "rewards/thk_ans_format_reward": 1.0, + "step": 71, + "think_completion_length": 84.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.1041717529297, + "epoch": 0.24283305227655985, + "grad_norm": 4.026949460423362, + "kl": 0.116943359375, + "learning_rate": 9.797297297297298e-07, + "loss": 0.0001, + "reward": 3.240618944168091, + "reward_std": 0.22773104906082153, + "rewards/final_reward": 0.7806309576412722, + "rewards/mask_iou_reward": 0.3903154788206361, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2406185865402222, + "rewards/thk_ans_format_reward": 1.0, + "step": 72, + "think_completion_length": 79.41666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.45833587646484, + "epoch": 0.24620573355817876, + "grad_norm": 4.335677102355079, + "kl": 0.107177734375, + "learning_rate": 9.794481981981981e-07, + "loss": 0.0001, + "reward": 2.805757761001587, + "reward_std": 0.3943777531385422, + "rewards/final_reward": 0.8992796951354338, + "rewards/mask_iou_reward": 0.4496398475677169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8057577311992645, + "rewards/thk_ans_format_reward": 1.0, + "step": 73, + "think_completion_length": 70.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.36458587646484, + "epoch": 0.24957841483979765, + "grad_norm": 11.769442693892403, + "kl": 0.12890625, + "learning_rate": 9.791666666666667e-07, + "loss": 0.0001, + "reward": 3.5046173334121704, + "reward_std": 0.3032621145248413, + "rewards/final_reward": 1.4271000649967103, + "rewards/mask_iou_reward": 0.7135500324983551, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.504617154598236, + "rewards/thk_ans_format_reward": 1.0, + "step": 74, + "think_completion_length": 71.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.17708587646484, + "epoch": 0.25295109612141653, + "grad_norm": 8.11512275742484, + "kl": 0.1064453125, + "learning_rate": 9.78885135135135e-07, + "loss": 0.0001, + "reward": 2.857938289642334, + "reward_std": 0.2891754060983658, + "rewards/final_reward": 1.2226563332272598, + "rewards/mask_iou_reward": 0.6113281666136299, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.857938289642334, + "rewards/thk_ans_format_reward": 1.0, + "step": 75, + "think_completion_length": 80.33333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.23958587646484, + "epoch": 0.2563237774030354, + "grad_norm": 7.686578576868314, + "kl": 0.127685546875, + "learning_rate": 9.786036036036036e-07, + "loss": 0.0001, + "reward": 3.027070999145508, + "reward_std": 0.3380560874938965, + "rewards/final_reward": 0.1450894432775949, + "rewards/mask_iou_reward": 0.07254472163879745, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0270708799362183, + "rewards/thk_ans_format_reward": 1.0, + "step": 76, + "think_completion_length": 72.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.7291717529297, + "epoch": 0.2596964586846543, + "grad_norm": 4.448677151834081, + "kl": 0.12939453125, + "learning_rate": 9.783220720720722e-07, + "loss": 0.0001, + "reward": 2.7701700925827026, + "reward_std": 0.20241041854023933, + "rewards/final_reward": 0.9208412423412211, + "rewards/mask_iou_reward": 0.46042062117061056, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7701701521873474, + "rewards/thk_ans_format_reward": 1.0, + "step": 77, + "think_completion_length": 83.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.1979217529297, + "epoch": 0.2630691399662732, + "grad_norm": 8.075882706845949, + "kl": 0.123291015625, + "learning_rate": 9.780405405405405e-07, + "loss": 0.0001, + "reward": 3.1208958625793457, + "reward_std": 0.27007415145635605, + "rewards/final_reward": 1.3364821226854584, + "rewards/mask_iou_reward": 0.6682410613427292, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1208957433700562, + "rewards/thk_ans_format_reward": 1.0, + "step": 78, + "think_completion_length": 69.41666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9791717529297, + "epoch": 0.26644182124789206, + "grad_norm": 11.33968365370947, + "kl": 0.131103515625, + "learning_rate": 9.77759009009009e-07, + "loss": 0.0001, + "reward": 3.168465733528137, + "reward_std": 0.20537365972995758, + "rewards/final_reward": 0.6006425484332252, + "rewards/mask_iou_reward": 0.3003212742166126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1684656739234924, + "rewards/thk_ans_format_reward": 1.0, + "step": 79, + "think_completion_length": 74.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.125, + "epoch": 0.26981450252951095, + "grad_norm": 29.683199255535733, + "kl": 0.13330078125, + "learning_rate": 9.774774774774774e-07, + "loss": 0.0001, + "reward": 2.8754775524139404, + "reward_std": 0.3308670222759247, + "rewards/final_reward": 1.0366864076232571, + "rewards/mask_iou_reward": 0.5183432038116286, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8754774928092957, + "rewards/thk_ans_format_reward": 1.0, + "step": 80, + "think_completion_length": 66.70833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.21875762939453, + "epoch": 0.27318718381112983, + "grad_norm": 12.216127743389162, + "kl": 0.1181640625, + "learning_rate": 9.771959459459458e-07, + "loss": 0.0001, + "reward": 2.8978902101516724, + "reward_std": 0.40983031690120697, + "rewards/final_reward": 0.950881268056939, + "rewards/mask_iou_reward": 0.4754406340284695, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.9083065688610077, + "rewards/thk_ans_format_reward": 1.0, + "step": 81, + "think_completion_length": 71.58333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.36458587646484, + "epoch": 0.2765598650927487, + "grad_norm": 7.045076203455561, + "kl": 0.3701171875, + "learning_rate": 9.769144144144144e-07, + "loss": 0.0004, + "reward": 2.778691291809082, + "reward_std": 0.3490441143512726, + "rewards/final_reward": 0.33543247619297384, + "rewards/mask_iou_reward": 0.16771623809648692, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7786912024021149, + "rewards/thk_ans_format_reward": 1.0, + "step": 82, + "think_completion_length": 76.91666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.46875762939453, + "epoch": 0.2799325463743676, + "grad_norm": 5.069750542905911, + "kl": 0.12451171875, + "learning_rate": 9.766328828828827e-07, + "loss": 0.0001, + "reward": 2.6801772117614746, + "reward_std": 0.4206371158361435, + "rewards/final_reward": 1.0430866882172039, + "rewards/mask_iou_reward": 0.5215433441086019, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6801770925521851, + "rewards/thk_ans_format_reward": 1.0, + "step": 83, + "think_completion_length": 79.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.77084350585938, + "epoch": 0.28330522765598654, + "grad_norm": 12.00513443485755, + "kl": 0.12255859375, + "learning_rate": 9.763513513513513e-07, + "loss": 0.0001, + "reward": 2.851745367050171, + "reward_std": 0.4787246733903885, + "rewards/final_reward": 1.1709810151121771, + "rewards/mask_iou_reward": 0.5854905075560886, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.8621618747711182, + "rewards/thk_ans_format_reward": 1.0, + "step": 84, + "think_completion_length": 74.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.40625762939453, + "epoch": 0.2866779089376054, + "grad_norm": 22.291846229168698, + "kl": 0.133544921875, + "learning_rate": 9.760698198198196e-07, + "loss": 0.0001, + "reward": 2.7811670303344727, + "reward_std": 0.2809675335884094, + "rewards/final_reward": 0.6003249698041416, + "rewards/mask_iou_reward": 0.3001624849020708, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7811669409275055, + "rewards/thk_ans_format_reward": 1.0, + "step": 85, + "think_completion_length": 73.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.46875762939453, + "epoch": 0.2900505902192243, + "grad_norm": 9.086774756116599, + "kl": 0.1328125, + "learning_rate": 9.757882882882882e-07, + "loss": 0.0001, + "reward": 3.1159489154815674, + "reward_std": 0.1485934928059578, + "rewards/final_reward": 1.1099682739039065, + "rewards/mask_iou_reward": 0.5549841369519533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.115948736667633, + "rewards/thk_ans_format_reward": 1.0, + "step": 86, + "think_completion_length": 83.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.4791717529297, + "epoch": 0.2934232715008432, + "grad_norm": 10.646510048049379, + "kl": 0.112060546875, + "learning_rate": 9.755067567567568e-07, + "loss": 0.0001, + "reward": 3.175132393836975, + "reward_std": 0.4541157931089401, + "rewards/final_reward": 0.9455331042822714, + "rewards/mask_iou_reward": 0.4727665521411357, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1855489611625671, + "rewards/thk_ans_format_reward": 1.0, + "step": 87, + "think_completion_length": 77.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.1979217529297, + "epoch": 0.29679595278246207, + "grad_norm": 19.66868318193964, + "kl": 0.103271484375, + "learning_rate": 9.752252252252251e-07, + "loss": 0.0001, + "reward": 3.08430278301239, + "reward_std": 0.24113387614488602, + "rewards/final_reward": 1.204691090668408, + "rewards/mask_iou_reward": 0.602345545334204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.084302693605423, + "rewards/thk_ans_format_reward": 1.0, + "step": 88, + "think_completion_length": 87.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.14584350585938, + "epoch": 0.30016863406408095, + "grad_norm": 4.329310940421637, + "kl": 0.13623046875, + "learning_rate": 9.749436936936937e-07, + "loss": 0.0001, + "reward": 3.087108612060547, + "reward_std": 0.3445526212453842, + "rewards/final_reward": 1.7332479771038667, + "rewards/mask_iou_reward": 0.8666239885519333, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0871086716651917, + "rewards/thk_ans_format_reward": 1.0, + "step": 89, + "think_completion_length": 70.54166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.58333587646484, + "epoch": 0.30354131534569984, + "grad_norm": 5.52593254215471, + "kl": 0.15185546875, + "learning_rate": 9.74662162162162e-07, + "loss": 0.0002, + "reward": 3.0328943729400635, + "reward_std": 0.3087661564350128, + "rewards/final_reward": 1.1219688993011085, + "rewards/mask_iou_reward": 0.5609844496505543, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0328941941261292, + "rewards/thk_ans_format_reward": 1.0, + "step": 90, + "think_completion_length": 76.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.09375762939453, + "epoch": 0.3069139966273187, + "grad_norm": 4.83035426553915, + "kl": 0.10595703125, + "learning_rate": 9.743806306306306e-07, + "loss": 0.0001, + "reward": 3.2756351232528687, + "reward_std": 0.21860820055007935, + "rewards/final_reward": 1.5841647447377873, + "rewards/mask_iou_reward": 0.7920823723688937, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2756351232528687, + "rewards/thk_ans_format_reward": 1.0, + "step": 91, + "think_completion_length": 73.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.4166717529297, + "epoch": 0.3102866779089376, + "grad_norm": 5.485844590793826, + "kl": 0.130126953125, + "learning_rate": 9.74099099099099e-07, + "loss": 0.0001, + "reward": 3.332701802253723, + "reward_std": 0.25605323910713196, + "rewards/final_reward": 1.7413345387431938, + "rewards/mask_iou_reward": 0.8706672693715969, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3327018022537231, + "rewards/thk_ans_format_reward": 1.0, + "step": 92, + "think_completion_length": 82.70833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6979217529297, + "epoch": 0.3136593591905565, + "grad_norm": 12.53059709455058, + "kl": 0.108642578125, + "learning_rate": 9.738175675675675e-07, + "loss": 0.0001, + "reward": 2.661715030670166, + "reward_std": 0.3630661815404892, + "rewards/final_reward": 0.7241040143891004, + "rewards/mask_iou_reward": 0.3620520071945502, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6617150008678436, + "rewards/thk_ans_format_reward": 1.0, + "step": 93, + "think_completion_length": 84.70833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.59375762939453, + "epoch": 0.31703204047217537, + "grad_norm": 9.425838303959774, + "kl": 0.13330078125, + "learning_rate": 9.735360360360359e-07, + "loss": 0.0001, + "reward": 3.524588942527771, + "reward_std": 0.25210119783878326, + "rewards/final_reward": 1.5078914610928074, + "rewards/mask_iou_reward": 0.7539457305464037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.524588942527771, + "rewards/thk_ans_format_reward": 1.0, + "step": 94, + "think_completion_length": 86.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.78125762939453, + "epoch": 0.32040472175379425, + "grad_norm": 6.176918426451065, + "kl": 0.1142578125, + "learning_rate": 9.732545045045045e-07, + "loss": 0.0001, + "reward": 2.603161573410034, + "reward_std": 0.23625994473695755, + "rewards/final_reward": 0.7563615294481078, + "rewards/mask_iou_reward": 0.3781807647240539, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6031614542007446, + "rewards/thk_ans_format_reward": 1.0, + "step": 95, + "think_completion_length": 89.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.1666717529297, + "epoch": 0.32377740303541314, + "grad_norm": 4.239631710734004, + "kl": 0.14306640625, + "learning_rate": 9.72972972972973e-07, + "loss": 0.0001, + "reward": 3.153728723526001, + "reward_std": 0.2302696853876114, + "rewards/final_reward": 1.4158820962324687, + "rewards/mask_iou_reward": 0.7079410481162344, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1537286639213562, + "rewards/thk_ans_format_reward": 1.0, + "step": 96, + "think_completion_length": 83.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.77083587646484, + "epoch": 0.327150084317032, + "grad_norm": 4.11745240192496, + "kl": 0.140625, + "learning_rate": 9.726914414414414e-07, + "loss": 0.0001, + "reward": 2.7436962127685547, + "reward_std": 0.2693813741207123, + "rewards/final_reward": 0.41176920242236215, + "rewards/mask_iou_reward": 0.20588460121118107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7436961233615875, + "rewards/thk_ans_format_reward": 1.0, + "step": 97, + "think_completion_length": 81.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.75, + "epoch": 0.3305227655986509, + "grad_norm": 4.489432400487865, + "kl": 0.11962890625, + "learning_rate": 9.7240990990991e-07, + "loss": 0.0001, + "reward": 2.873469591140747, + "reward_std": 0.35211898386478424, + "rewards/final_reward": 0.2694470590593998, + "rewards/mask_iou_reward": 0.1347235295296999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8734694123268127, + "rewards/thk_ans_format_reward": 1.0, + "step": 98, + "think_completion_length": 96.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.8854217529297, + "epoch": 0.33389544688026984, + "grad_norm": 5.779299275244791, + "kl": 0.12890625, + "learning_rate": 9.721283783783783e-07, + "loss": 0.0001, + "reward": 2.620466947555542, + "reward_std": 0.3322184383869171, + "rewards/final_reward": 0.32427589794335787, + "rewards/mask_iou_reward": 0.16213794897167894, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 0.6413003206253052, + "rewards/thk_ans_format_reward": 1.0, + "step": 99, + "think_completion_length": 94.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.0625, + "epoch": 0.3372681281618887, + "grad_norm": 7.847700810555537, + "kl": 0.12109375, + "learning_rate": 9.718468468468469e-07, + "loss": 0.0001, + "reward": 2.8851125240325928, + "reward_std": 0.20413611084222794, + "rewards/final_reward": 0.9221734908674944, + "rewards/mask_iou_reward": 0.4610867454337472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8851124942302704, + "rewards/thk_ans_format_reward": 1.0, + "step": 100, + "think_completion_length": 92.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.61459350585938, + "epoch": 0.3406408094435076, + "grad_norm": 6.340272960727325, + "kl": 0.127197265625, + "learning_rate": 9.715653153153152e-07, + "loss": 0.0001, + "reward": 3.1949912309646606, + "reward_std": 0.3106095865368843, + "rewards/final_reward": 0.9199014567652839, + "rewards/mask_iou_reward": 0.45995072838264195, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1949909627437592, + "rewards/thk_ans_format_reward": 1.0, + "step": 101, + "think_completion_length": 76.20833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8854217529297, + "epoch": 0.3440134907251265, + "grad_norm": 10.886937887424214, + "kl": 0.12109375, + "learning_rate": 9.712837837837838e-07, + "loss": 0.0001, + "reward": 2.9201362133026123, + "reward_std": 0.2227378636598587, + "rewards/final_reward": 0.9971067769618158, + "rewards/mask_iou_reward": 0.4985533884809079, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9201361835002899, + "rewards/thk_ans_format_reward": 1.0, + "step": 102, + "think_completion_length": 89.08333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.0729217529297, + "epoch": 0.3473861720067454, + "grad_norm": 10.041510866770627, + "kl": 0.125, + "learning_rate": 9.710022522522521e-07, + "loss": 0.0001, + "reward": 2.7521921396255493, + "reward_std": 0.3331163227558136, + "rewards/final_reward": 0.9283792153480164, + "rewards/mask_iou_reward": 0.4641896076740082, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7521921098232269, + "rewards/thk_ans_format_reward": 1.0, + "step": 103, + "think_completion_length": 101.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.39584350585938, + "epoch": 0.35075885328836426, + "grad_norm": 10.712565225905651, + "kl": 0.109619140625, + "learning_rate": 9.707207207207207e-07, + "loss": 0.0001, + "reward": 3.1159496307373047, + "reward_std": 0.14314200729131699, + "rewards/final_reward": 1.6823086959495313, + "rewards/mask_iou_reward": 0.8411543479747656, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1159498393535614, + "rewards/thk_ans_format_reward": 1.0, + "step": 104, + "think_completion_length": 85.95833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.1041717529297, + "epoch": 0.35413153456998314, + "grad_norm": 9.741009916482888, + "kl": 0.1142578125, + "learning_rate": 9.70439189189189e-07, + "loss": 0.0001, + "reward": 2.7842196226119995, + "reward_std": 0.3237437531352043, + "rewards/final_reward": 0.5121928576839352, + "rewards/mask_iou_reward": 0.2560964288419676, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.78421950340271, + "rewards/thk_ans_format_reward": 1.0, + "step": 105, + "think_completion_length": 87.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.4479217529297, + "epoch": 0.357504215851602, + "grad_norm": 50.68857598303174, + "kl": 0.123779296875, + "learning_rate": 9.701576576576576e-07, + "loss": 0.0001, + "reward": 3.562849760055542, + "reward_std": 0.17162877321243286, + "rewards/final_reward": 1.7938839579394017, + "rewards/mask_iou_reward": 0.8969419789697008, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5628495812416077, + "rewards/thk_ans_format_reward": 1.0, + "step": 106, + "think_completion_length": 92.95833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.87500762939453, + "epoch": 0.3608768971332209, + "grad_norm": 7.883374108444154, + "kl": 0.12109375, + "learning_rate": 9.698761261261262e-07, + "loss": 0.0001, + "reward": 3.2476431131362915, + "reward_std": 0.341851145029068, + "rewards/final_reward": 1.393488279415816, + "rewards/mask_iou_reward": 0.696744139707908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2476428151130676, + "rewards/thk_ans_format_reward": 1.0, + "step": 107, + "think_completion_length": 106.41666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.45834350585938, + "epoch": 0.3642495784148398, + "grad_norm": 7.594040012287189, + "kl": 0.1796875, + "learning_rate": 9.695945945945946e-07, + "loss": 0.0002, + "reward": 3.0297967195510864, + "reward_std": 0.20907431468367577, + "rewards/final_reward": 0.9516663204931226, + "rewards/mask_iou_reward": 0.4758331602465613, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0297967791557312, + "rewards/thk_ans_format_reward": 1.0, + "step": 108, + "think_completion_length": 100.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.98958587646484, + "epoch": 0.3676222596964587, + "grad_norm": 8.488356286505727, + "kl": 0.12451171875, + "learning_rate": 9.693130630630631e-07, + "loss": 0.0001, + "reward": 3.048767328262329, + "reward_std": 0.383039191365242, + "rewards/final_reward": 0.9600915533933114, + "rewards/mask_iou_reward": 0.4800457766966557, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0487673580646515, + "rewards/thk_ans_format_reward": 1.0, + "step": 109, + "think_completion_length": 103.08333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.02084350585938, + "epoch": 0.37099494097807756, + "grad_norm": 6.210723775719561, + "kl": 0.125732421875, + "learning_rate": 9.690315315315315e-07, + "loss": 0.0001, + "reward": 2.9819425344467163, + "reward_std": 0.27216267585754395, + "rewards/final_reward": 0.46485509202170155, + "rewards/mask_iou_reward": 0.23242754601085078, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9819426238536835, + "rewards/thk_ans_format_reward": 1.0, + "step": 110, + "think_completion_length": 93.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.15625, + "epoch": 0.37436762225969644, + "grad_norm": 8.678738528400313, + "kl": 0.15576171875, + "learning_rate": 9.6875e-07, + "loss": 0.0002, + "reward": 2.936001181602478, + "reward_std": 0.22103118896484375, + "rewards/final_reward": 0.6134164982851417, + "rewards/mask_iou_reward": 0.30670824914257083, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9360010623931885, + "rewards/thk_ans_format_reward": 1.0, + "step": 111, + "think_completion_length": 92.33333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.8229217529297, + "epoch": 0.3777403035413153, + "grad_norm": 3.946076786450147, + "kl": 0.15966796875, + "learning_rate": 9.684684684684684e-07, + "loss": 0.0002, + "reward": 2.8658047914505005, + "reward_std": 0.267331525683403, + "rewards/final_reward": 0.8884399087779431, + "rewards/mask_iou_reward": 0.44421995438897155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8658046722412109, + "rewards/thk_ans_format_reward": 1.0, + "step": 112, + "think_completion_length": 121.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.9479217529297, + "epoch": 0.3811129848229342, + "grad_norm": 13.643204279995123, + "kl": 0.1142578125, + "learning_rate": 9.68186936936937e-07, + "loss": 0.0001, + "reward": 3.2716516256332397, + "reward_std": 0.26144395768642426, + "rewards/final_reward": 0.5963319934097903, + "rewards/mask_iou_reward": 0.29816599670489513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2716516256332397, + "rewards/thk_ans_format_reward": 1.0, + "step": 113, + "think_completion_length": 90.54166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.89584350585938, + "epoch": 0.3844856661045531, + "grad_norm": 4.560320424584854, + "kl": 0.121826171875, + "learning_rate": 9.679054054054053e-07, + "loss": 0.0001, + "reward": 2.9422881603240967, + "reward_std": 0.30686257779598236, + "rewards/final_reward": 1.3590917123836523, + "rewards/mask_iou_reward": 0.6795458561918262, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9422881901264191, + "rewards/thk_ans_format_reward": 1.0, + "step": 114, + "think_completion_length": 101.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0416717529297, + "epoch": 0.38785834738617203, + "grad_norm": 9.483350438450554, + "kl": 0.130615234375, + "learning_rate": 9.676238738738739e-07, + "loss": 0.0001, + "reward": 3.0643558502197266, + "reward_std": 0.35449835658073425, + "rewards/final_reward": 0.6536492522459832, + "rewards/mask_iou_reward": 0.3268246261229916, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0747724771499634, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 115, + "think_completion_length": 96.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.27084350585938, + "epoch": 0.3912310286677909, + "grad_norm": 8.231822162450193, + "kl": 0.12744140625, + "learning_rate": 9.673423423423422e-07, + "loss": 0.0001, + "reward": 3.1779122352600098, + "reward_std": 0.24591050297021866, + "rewards/final_reward": 1.5311987731543408, + "rewards/mask_iou_reward": 0.7655993865771704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1779123544692993, + "rewards/thk_ans_format_reward": 1.0, + "step": 116, + "think_completion_length": 94.83333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.83333587646484, + "epoch": 0.3946037099494098, + "grad_norm": 4.853211323414288, + "kl": 0.133544921875, + "learning_rate": 9.670608108108108e-07, + "loss": 0.0001, + "reward": 2.904745578765869, + "reward_std": 0.34407839179039, + "rewards/final_reward": 1.1153784326788996, + "rewards/mask_iou_reward": 0.5576892163394498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9047453701496124, + "rewards/thk_ans_format_reward": 1.0, + "step": 117, + "think_completion_length": 95.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.64583587646484, + "epoch": 0.3979763912310287, + "grad_norm": 5.020686586708645, + "kl": 0.121337890625, + "learning_rate": 9.667792792792794e-07, + "loss": 0.0001, + "reward": 3.063020706176758, + "reward_std": 0.2569497376680374, + "rewards/final_reward": 0.6591898883028006, + "rewards/mask_iou_reward": 0.3295949441514003, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0630205869674683, + "rewards/thk_ans_format_reward": 1.0, + "step": 118, + "think_completion_length": 95.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.4791717529297, + "epoch": 0.40134907251264756, + "grad_norm": 12.412696310883033, + "kl": 0.1435546875, + "learning_rate": 9.664977477477477e-07, + "loss": 0.0001, + "reward": 2.9063340425491333, + "reward_std": 0.26251453161239624, + "rewards/final_reward": 0.7353001495272187, + "rewards/mask_iou_reward": 0.36765007476360934, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9063339531421661, + "rewards/thk_ans_format_reward": 1.0, + "step": 119, + "think_completion_length": 102.66666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.1979217529297, + "epoch": 0.40472175379426645, + "grad_norm": 3.992774052478872, + "kl": 0.145751953125, + "learning_rate": 9.66216216216216e-07, + "loss": 0.0003, + "reward": 2.862497925758362, + "reward_std": 0.31197597831487656, + "rewards/final_reward": 1.068799297125957, + "rewards/mask_iou_reward": 0.5343996485629785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8624976277351379, + "rewards/thk_ans_format_reward": 1.0, + "step": 120, + "think_completion_length": 119.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.14584350585938, + "epoch": 0.40809443507588533, + "grad_norm": 10.473006443962179, + "kl": 0.15771484375, + "learning_rate": 9.659346846846846e-07, + "loss": 0.0002, + "reward": 2.971013307571411, + "reward_std": 0.33689363300800323, + "rewards/final_reward": 1.1416579108015639, + "rewards/mask_iou_reward": 0.5708289554007819, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9710133671760559, + "rewards/thk_ans_format_reward": 1.0, + "step": 121, + "think_completion_length": 97.91666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.5416717529297, + "epoch": 0.4114671163575042, + "grad_norm": 9.534381874086584, + "kl": 0.1396484375, + "learning_rate": 9.65653153153153e-07, + "loss": 0.0002, + "reward": 2.8356984853744507, + "reward_std": 0.15941885858774185, + "rewards/final_reward": 0.4459266845397626, + "rewards/mask_iou_reward": 0.2229633422698813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8356985449790955, + "rewards/thk_ans_format_reward": 1.0, + "step": 122, + "think_completion_length": 118.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.70833587646484, + "epoch": 0.4148397976391231, + "grad_norm": 6.86959731192038, + "kl": 0.136474609375, + "learning_rate": 9.653716216216216e-07, + "loss": 0.0001, + "reward": 3.089380621910095, + "reward_std": 0.19493500515818596, + "rewards/final_reward": 0.9564584633661732, + "rewards/mask_iou_reward": 0.4782292316830866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0893806219100952, + "rewards/thk_ans_format_reward": 1.0, + "step": 123, + "think_completion_length": 104.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.21875762939453, + "epoch": 0.418212478920742, + "grad_norm": 6.799688461335475, + "kl": 0.1376953125, + "learning_rate": 9.6509009009009e-07, + "loss": 0.0001, + "reward": 3.0790122747421265, + "reward_std": 0.2588284760713577, + "rewards/final_reward": 0.9644275758627705, + "rewards/mask_iou_reward": 0.48221378793138525, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0790123343467712, + "rewards/thk_ans_format_reward": 1.0, + "step": 124, + "think_completion_length": 124.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.4166717529297, + "epoch": 0.42158516020236086, + "grad_norm": 13.750045710115277, + "kl": 0.16064453125, + "learning_rate": 9.648085585585585e-07, + "loss": 0.0002, + "reward": 3.409953236579895, + "reward_std": 0.3135468512773514, + "rewards/final_reward": 1.509239971271122, + "rewards/mask_iou_reward": 0.754619985635561, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4099529385566711, + "rewards/thk_ans_format_reward": 1.0, + "step": 125, + "think_completion_length": 92.54166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.81250762939453, + "epoch": 0.42495784148397975, + "grad_norm": 12.535850216866876, + "kl": 0.1357421875, + "learning_rate": 9.645270270270268e-07, + "loss": 0.0001, + "reward": 3.326646089553833, + "reward_std": 0.23538677394390106, + "rewards/final_reward": 0.8067363062978283, + "rewards/mask_iou_reward": 0.40336815314891417, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3266459703445435, + "rewards/thk_ans_format_reward": 1.0, + "step": 126, + "think_completion_length": 110.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.4479217529297, + "epoch": 0.42833052276559863, + "grad_norm": 4.964117400514983, + "kl": 0.12890625, + "learning_rate": 9.642454954954954e-07, + "loss": 0.0001, + "reward": 3.025187849998474, + "reward_std": 0.3882623016834259, + "rewards/final_reward": 1.5354492991876239, + "rewards/mask_iou_reward": 0.7677246495938119, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0251877903938293, + "rewards/thk_ans_format_reward": 1.0, + "step": 127, + "think_completion_length": 97.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.1979217529297, + "epoch": 0.4317032040472175, + "grad_norm": 8.253511110956323, + "kl": 0.13916015625, + "learning_rate": 9.63963963963964e-07, + "loss": 0.0001, + "reward": 3.17389976978302, + "reward_std": 0.34934788942337036, + "rewards/final_reward": 0.6571360829891589, + "rewards/mask_iou_reward": 0.32856804149457947, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1738998293876648, + "rewards/thk_ans_format_reward": 1.0, + "step": 128, + "think_completion_length": 114.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.6979217529297, + "epoch": 0.4350758853288364, + "grad_norm": 9.764055720521588, + "kl": 0.1328125, + "learning_rate": 9.636824324324323e-07, + "loss": 0.0001, + "reward": 2.9551135301589966, + "reward_std": 0.23873476684093475, + "rewards/final_reward": 0.29115001782333383, + "rewards/mask_iou_reward": 0.14557500891166691, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.9655300378799438, + "rewards/thk_ans_format_reward": 1.0, + "step": 129, + "think_completion_length": 126.45833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0104217529297, + "epoch": 0.43844856661045534, + "grad_norm": 5.43923330871092, + "kl": 0.1748046875, + "learning_rate": 9.63400900900901e-07, + "loss": 0.0002, + "reward": 3.2071645259857178, + "reward_std": 0.24996963143348694, + "rewards/final_reward": 1.1977146186007315, + "rewards/mask_iou_reward": 0.5988573093003657, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2071644067764282, + "rewards/thk_ans_format_reward": 1.0, + "step": 130, + "think_completion_length": 102.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.625, + "epoch": 0.4418212478920742, + "grad_norm": 6.752256975163003, + "kl": 0.171875, + "learning_rate": 9.631193693693693e-07, + "loss": 0.0002, + "reward": 3.077457904815674, + "reward_std": 0.16670826077461243, + "rewards/final_reward": 1.0109958851085978, + "rewards/mask_iou_reward": 0.5054979425542989, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0774577260017395, + "rewards/thk_ans_format_reward": 1.0, + "step": 131, + "think_completion_length": 109.41666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.15625, + "epoch": 0.4451939291736931, + "grad_norm": 7.106410397724427, + "kl": 0.1474609375, + "learning_rate": 9.628378378378378e-07, + "loss": 0.0001, + "reward": 2.8066269159317017, + "reward_std": 0.38548873364925385, + "rewards/final_reward": 0.707636198443585, + "rewards/mask_iou_reward": 0.3538180992217925, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8066268563270569, + "rewards/thk_ans_format_reward": 1.0, + "step": 132, + "think_completion_length": 93.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.17709350585938, + "epoch": 0.448566610455312, + "grad_norm": 7.057980509975816, + "kl": 0.1796875, + "learning_rate": 9.625563063063062e-07, + "loss": 0.0002, + "reward": 3.3535468578338623, + "reward_std": 0.21944965422153473, + "rewards/final_reward": 1.243831253053847, + "rewards/mask_iou_reward": 0.6219156265269234, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.353546917438507, + "rewards/thk_ans_format_reward": 1.0, + "step": 133, + "think_completion_length": 103.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.2291717529297, + "epoch": 0.45193929173693087, + "grad_norm": 4.563927394653558, + "kl": 0.15625, + "learning_rate": 9.622747747747747e-07, + "loss": 0.0002, + "reward": 2.870627284049988, + "reward_std": 0.2948570251464844, + "rewards/final_reward": 0.6376485645134273, + "rewards/mask_iou_reward": 0.31882428225671366, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.870627224445343, + "rewards/thk_ans_format_reward": 1.0, + "step": 134, + "think_completion_length": 99.95833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.9479217529297, + "epoch": 0.45531197301854975, + "grad_norm": 7.050674237211012, + "kl": 0.15869140625, + "learning_rate": 9.61993243243243e-07, + "loss": 0.0002, + "reward": 2.6339285373687744, + "reward_std": 0.42745040357112885, + "rewards/final_reward": 0.12814824118852572, + "rewards/mask_iou_reward": 0.06407412059426286, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.6547618210315704, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 135, + "think_completion_length": 105.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.2916717529297, + "epoch": 0.45868465430016864, + "grad_norm": 11.431729430530904, + "kl": 0.16259765625, + "learning_rate": 9.617117117117117e-07, + "loss": 0.0002, + "reward": 2.931082606315613, + "reward_std": 0.40491442382335663, + "rewards/final_reward": 1.4102242110294274, + "rewards/mask_iou_reward": 0.7051121055147137, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9310824871063232, + "rewards/thk_ans_format_reward": 1.0, + "step": 136, + "think_completion_length": 110.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.73958587646484, + "epoch": 0.4620573355817875, + "grad_norm": 21.583035164198247, + "kl": 0.17724609375, + "learning_rate": 9.614301801801802e-07, + "loss": 0.0002, + "reward": 2.7264972925186157, + "reward_std": 0.36133062839508057, + "rewards/final_reward": 0.7153198900624561, + "rewards/mask_iou_reward": 0.35765994503122805, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7264970242977142, + "rewards/thk_ans_format_reward": 1.0, + "step": 137, + "think_completion_length": 112.29166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.28125762939453, + "epoch": 0.4654300168634064, + "grad_norm": 9.40238456718145, + "kl": 0.17822265625, + "learning_rate": 9.611486486486486e-07, + "loss": 0.0002, + "reward": 3.1096519231796265, + "reward_std": 0.30247916281223297, + "rewards/final_reward": 1.4404627860085197, + "rewards/mask_iou_reward": 0.7202313930042599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1096520125865936, + "rewards/thk_ans_format_reward": 1.0, + "step": 138, + "think_completion_length": 106.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.52084350585938, + "epoch": 0.4688026981450253, + "grad_norm": 4.9043637265708515, + "kl": 0.16943359375, + "learning_rate": 9.608671171171172e-07, + "loss": 0.0002, + "reward": 2.866019368171692, + "reward_std": 0.21420340985059738, + "rewards/final_reward": 0.525169727990159, + "rewards/mask_iou_reward": 0.2625848639950795, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8660192787647247, + "rewards/thk_ans_format_reward": 1.0, + "step": 139, + "think_completion_length": 94.70833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.96875762939453, + "epoch": 0.47217537942664417, + "grad_norm": 5.837762710429281, + "kl": 0.1826171875, + "learning_rate": 9.605855855855855e-07, + "loss": 0.0002, + "reward": 3.2641146183013916, + "reward_std": 0.2204541265964508, + "rewards/final_reward": 1.4925167842831235, + "rewards/mask_iou_reward": 0.7462583921415618, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2641146779060364, + "rewards/thk_ans_format_reward": 1.0, + "step": 140, + "think_completion_length": 82.58333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8125, + "epoch": 0.47554806070826305, + "grad_norm": 9.65070929894506, + "kl": 0.20068359375, + "learning_rate": 9.60304054054054e-07, + "loss": 0.0002, + "reward": 2.757500410079956, + "reward_std": 0.26694121956825256, + "rewards/final_reward": 0.7410724561390301, + "rewards/mask_iou_reward": 0.37053622806951503, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7575002908706665, + "rewards/thk_ans_format_reward": 1.0, + "step": 141, + "think_completion_length": 84.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.90625762939453, + "epoch": 0.47892074198988194, + "grad_norm": 7.747616062513729, + "kl": 0.1953125, + "learning_rate": 9.600225225225224e-07, + "loss": 0.0002, + "reward": 3.1408188343048096, + "reward_std": 0.25297851860523224, + "rewards/final_reward": 0.9605678349099196, + "rewards/mask_iou_reward": 0.4802839174549598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1408189535140991, + "rewards/thk_ans_format_reward": 1.0, + "step": 142, + "think_completion_length": 92.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.4166717529297, + "epoch": 0.4822934232715008, + "grad_norm": 6.639684074671962, + "kl": 0.18212890625, + "learning_rate": 9.59740990990991e-07, + "loss": 0.0002, + "reward": 3.2298460006713867, + "reward_std": 0.24782373011112213, + "rewards/final_reward": 1.4688922774294029, + "rewards/mask_iou_reward": 0.7344461387147014, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.229845941066742, + "rewards/thk_ans_format_reward": 1.0, + "step": 143, + "think_completion_length": 82.08333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.52083587646484, + "epoch": 0.4856661045531197, + "grad_norm": 11.690583175650898, + "kl": 0.19287109375, + "learning_rate": 9.594594594594594e-07, + "loss": 0.0002, + "reward": 2.953932523727417, + "reward_std": 0.24815939366817474, + "rewards/final_reward": 0.25470155704168357, + "rewards/mask_iou_reward": 0.12735077852084178, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9539322257041931, + "rewards/thk_ans_format_reward": 1.0, + "step": 144, + "think_completion_length": 88.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.0625, + "epoch": 0.48903878583473864, + "grad_norm": 8.171245738483481, + "kl": 0.20947265625, + "learning_rate": 9.59177927927928e-07, + "loss": 0.0002, + "reward": 3.105030059814453, + "reward_std": 0.41488519310951233, + "rewards/final_reward": 1.0450923837882327, + "rewards/mask_iou_reward": 0.5225461918941163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1050302386283875, + "rewards/thk_ans_format_reward": 1.0, + "step": 145, + "think_completion_length": 84.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.9791717529297, + "epoch": 0.4924114671163575, + "grad_norm": 6.8042002771504695, + "kl": 0.1826171875, + "learning_rate": 9.588963963963963e-07, + "loss": 0.0002, + "reward": 2.978797674179077, + "reward_std": 0.23305433988571167, + "rewards/final_reward": 0.5362301160155533, + "rewards/mask_iou_reward": 0.26811505800777663, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.97879758477211, + "rewards/thk_ans_format_reward": 1.0, + "step": 146, + "think_completion_length": 79.33333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.7604217529297, + "epoch": 0.4957841483979764, + "grad_norm": 5.8864092171349585, + "kl": 0.21484375, + "learning_rate": 9.586148648648648e-07, + "loss": 0.0002, + "reward": 3.4847919940948486, + "reward_std": 0.13796599209308624, + "rewards/final_reward": 1.3652275476646998, + "rewards/mask_iou_reward": 0.6826137738323499, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4847919344902039, + "rewards/thk_ans_format_reward": 1.0, + "step": 147, + "think_completion_length": 82.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.30209350585938, + "epoch": 0.4991568296795953, + "grad_norm": 5.0106064638814205, + "kl": 0.2138671875, + "learning_rate": 9.583333333333334e-07, + "loss": 0.0002, + "reward": 3.076038360595703, + "reward_std": 0.27134670317173004, + "rewards/final_reward": 0.9536914128816507, + "rewards/mask_iou_reward": 0.47684570644082536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0760382413864136, + "rewards/thk_ans_format_reward": 1.0, + "step": 148, + "think_completion_length": 83.33333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.6666717529297, + "epoch": 0.5025295109612141, + "grad_norm": 8.274985300107584, + "kl": 0.2060546875, + "learning_rate": 9.580518018018018e-07, + "loss": 0.0002, + "reward": 3.1267894506454468, + "reward_std": 0.18382571265101433, + "rewards/final_reward": 0.8946684420728204, + "rewards/mask_iou_reward": 0.4473342210364102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1267894208431244, + "rewards/thk_ans_format_reward": 1.0, + "step": 149, + "think_completion_length": 84.58333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.6979217529297, + "epoch": 0.5059021922428331, + "grad_norm": 16.142757363004044, + "kl": 0.23291015625, + "learning_rate": 9.577702702702703e-07, + "loss": 0.0002, + "reward": 3.0016664266586304, + "reward_std": 0.2265520542860031, + "rewards/final_reward": 1.3027442461369658, + "rewards/mask_iou_reward": 0.6513721230684829, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0016663670539856, + "rewards/thk_ans_format_reward": 1.0, + "step": 150, + "think_completion_length": 81.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.0104217529297, + "epoch": 0.5092748735244519, + "grad_norm": 20.331315869387037, + "kl": 0.2412109375, + "learning_rate": 9.574887387387387e-07, + "loss": 0.0002, + "reward": 3.1735047101974487, + "reward_std": 0.22183486074209213, + "rewards/final_reward": 1.0375799778137802, + "rewards/mask_iou_reward": 0.5187899889068901, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.173504650592804, + "rewards/thk_ans_format_reward": 1.0, + "step": 151, + "think_completion_length": 84.83333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.8541717529297, + "epoch": 0.5126475548060708, + "grad_norm": 7.285780487915525, + "kl": 0.19775390625, + "learning_rate": 9.572072072072072e-07, + "loss": 0.0002, + "reward": 2.487415313720703, + "reward_std": 0.2901010140776634, + "rewards/final_reward": 0.6410345660592923, + "rewards/mask_iou_reward": 0.32051728302964616, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.4874153882265091, + "rewards/thk_ans_format_reward": 1.0, + "step": 152, + "think_completion_length": 70.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.58333587646484, + "epoch": 0.5160202360876898, + "grad_norm": 5.037142388718095, + "kl": 0.37255859375, + "learning_rate": 9.569256756756756e-07, + "loss": 0.0004, + "reward": 3.0975812673568726, + "reward_std": 0.41272978484630585, + "rewards/final_reward": 1.2630302577736632, + "rewards/mask_iou_reward": 0.6315151288868316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0975810885429382, + "rewards/thk_ans_format_reward": 1.0, + "step": 153, + "think_completion_length": 87.66666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.92709350585938, + "epoch": 0.5193929173693086, + "grad_norm": 9.476578185984213, + "kl": 0.27099609375, + "learning_rate": 9.566441441441442e-07, + "loss": 0.0003, + "reward": 2.7476073503494263, + "reward_std": 0.12777689844369888, + "rewards/final_reward": 1.0467066867601786, + "rewards/mask_iou_reward": 0.5233533433800893, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7476073503494263, + "rewards/thk_ans_format_reward": 1.0, + "step": 154, + "think_completion_length": 81.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.37500762939453, + "epoch": 0.5227655986509275, + "grad_norm": 10.872523571455144, + "kl": 0.263671875, + "learning_rate": 9.563626126126125e-07, + "loss": 0.0003, + "reward": 3.087652087211609, + "reward_std": 0.2608217652887106, + "rewards/final_reward": 1.1027023211599696, + "rewards/mask_iou_reward": 0.5513511605799848, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0876522362232208, + "rewards/thk_ans_format_reward": 1.0, + "step": 155, + "think_completion_length": 84.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.80208587646484, + "epoch": 0.5261382799325464, + "grad_norm": 20.114727083957355, + "kl": 0.23681640625, + "learning_rate": 9.56081081081081e-07, + "loss": 0.0002, + "reward": 3.0146535634994507, + "reward_std": 0.25460537523031235, + "rewards/final_reward": 0.5053419912394292, + "rewards/mask_iou_reward": 0.2526709956197146, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0146536529064178, + "rewards/thk_ans_format_reward": 1.0, + "step": 156, + "think_completion_length": 92.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.48958587646484, + "epoch": 0.5295109612141653, + "grad_norm": 7.9617120534329535, + "kl": 0.228515625, + "learning_rate": 9.557995495495497e-07, + "loss": 0.0002, + "reward": 3.141195058822632, + "reward_std": 0.10193538293242455, + "rewards/final_reward": 0.5441117242993203, + "rewards/mask_iou_reward": 0.27205586214966015, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.14119490981102, + "rewards/thk_ans_format_reward": 1.0, + "step": 157, + "think_completion_length": 89.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.61459350585938, + "epoch": 0.5328836424957841, + "grad_norm": 7.479607652753245, + "kl": 0.2685546875, + "learning_rate": 9.55518018018018e-07, + "loss": 0.0003, + "reward": 3.1644891500473022, + "reward_std": 0.2984514832496643, + "rewards/final_reward": 0.957407182539989, + "rewards/mask_iou_reward": 0.4787035912699945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.164489209651947, + "rewards/thk_ans_format_reward": 1.0, + "step": 158, + "think_completion_length": 81.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.23958587646484, + "epoch": 0.5362563237774031, + "grad_norm": 5.402652813833984, + "kl": 0.2373046875, + "learning_rate": 9.552364864864864e-07, + "loss": 0.0002, + "reward": 2.986154317855835, + "reward_std": 0.3532260060310364, + "rewards/final_reward": 1.7312428830340285, + "rewards/mask_iou_reward": 0.8656214415170143, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9861544370651245, + "rewards/thk_ans_format_reward": 1.0, + "step": 159, + "think_completion_length": 77.04166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.56250762939453, + "epoch": 0.5396290050590219, + "grad_norm": 31.67704383926532, + "kl": 0.35546875, + "learning_rate": 9.54954954954955e-07, + "loss": 0.0004, + "reward": 3.3903119564056396, + "reward_std": 0.2488839253783226, + "rewards/final_reward": 1.530599074157094, + "rewards/mask_iou_reward": 0.765299537078547, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3903120160102844, + "rewards/thk_ans_format_reward": 1.0, + "step": 160, + "think_completion_length": 74.54166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.86458587646484, + "epoch": 0.5430016863406408, + "grad_norm": 10.93334970931945, + "kl": 0.2666015625, + "learning_rate": 9.546734234234233e-07, + "loss": 0.0003, + "reward": 3.126182436943054, + "reward_std": 0.23756013810634613, + "rewards/final_reward": 1.3464974720541376, + "rewards/mask_iou_reward": 0.6732487360270688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.126182347536087, + "rewards/thk_ans_format_reward": 1.0, + "step": 161, + "think_completion_length": 68.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.9166717529297, + "epoch": 0.5463743676222597, + "grad_norm": 7.656379946063235, + "kl": 0.24755859375, + "learning_rate": 9.543918918918919e-07, + "loss": 0.0003, + "reward": 2.90953528881073, + "reward_std": 0.115766741335392, + "rewards/final_reward": 0.8745932935241574, + "rewards/mask_iou_reward": 0.4372966467620787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9095353484153748, + "rewards/thk_ans_format_reward": 1.0, + "step": 162, + "think_completion_length": 74.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.73958587646484, + "epoch": 0.5497470489038786, + "grad_norm": 6.866761916729452, + "kl": 0.2314453125, + "learning_rate": 9.541103603603602e-07, + "loss": 0.0002, + "reward": 3.1094547510147095, + "reward_std": 0.22638342529535294, + "rewards/final_reward": 1.349065662874475, + "rewards/mask_iou_reward": 0.6745328314372375, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.119871363043785, + "rewards/thk_ans_format_reward": 1.0, + "step": 163, + "think_completion_length": 65.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.83333587646484, + "epoch": 0.5531197301854974, + "grad_norm": 5.657018351301425, + "kl": 0.24658203125, + "learning_rate": 9.538288288288288e-07, + "loss": 0.0002, + "reward": 2.8507970571517944, + "reward_std": 0.1843552067875862, + "rewards/final_reward": 0.3972928312518996, + "rewards/mask_iou_reward": 0.1986464156259498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8507969975471497, + "rewards/thk_ans_format_reward": 1.0, + "step": 164, + "think_completion_length": 86.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.78125762939453, + "epoch": 0.5564924114671164, + "grad_norm": 13.056201326210605, + "kl": 0.28515625, + "learning_rate": 9.535472972972972e-07, + "loss": 0.0003, + "reward": 3.584625482559204, + "reward_std": 0.14753572642803192, + "rewards/final_reward": 1.270440407309784, + "rewards/mask_iou_reward": 0.635220203654892, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5846253633499146, + "rewards/thk_ans_format_reward": 1.0, + "step": 165, + "think_completion_length": 72.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.9166717529297, + "epoch": 0.5598650927487352, + "grad_norm": 5.643407452949351, + "kl": 0.26171875, + "learning_rate": 9.532657657657657e-07, + "loss": 0.0003, + "reward": 3.2158550024032593, + "reward_std": 0.11347110942006111, + "rewards/final_reward": 1.7543514291511069, + "rewards/mask_iou_reward": 0.8771757145755534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2158551216125488, + "rewards/thk_ans_format_reward": 1.0, + "step": 166, + "think_completion_length": 75.66666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.7916717529297, + "epoch": 0.5632377740303541, + "grad_norm": 12.62639672873956, + "kl": 0.26220703125, + "learning_rate": 9.529842342342343e-07, + "loss": 0.0003, + "reward": 2.8758022785186768, + "reward_std": 0.2610451355576515, + "rewards/final_reward": 0.9628604077281282, + "rewards/mask_iou_reward": 0.4814302038640641, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8758021593093872, + "rewards/thk_ans_format_reward": 1.0, + "step": 167, + "think_completion_length": 84.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1041717529297, + "epoch": 0.5666104553119731, + "grad_norm": 15.581925152352548, + "kl": 0.248046875, + "learning_rate": 9.527027027027027e-07, + "loss": 0.0003, + "reward": 3.1900848150253296, + "reward_std": 0.30189305543899536, + "rewards/final_reward": 1.1237223638565845, + "rewards/mask_iou_reward": 0.5618611819282923, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1900847554206848, + "rewards/thk_ans_format_reward": 1.0, + "step": 168, + "think_completion_length": 78.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.6041717529297, + "epoch": 0.5699831365935919, + "grad_norm": 9.718093636345376, + "kl": 0.2734375, + "learning_rate": 9.524211711711712e-07, + "loss": 0.0003, + "reward": 3.1855918169021606, + "reward_std": 0.26739974319934845, + "rewards/final_reward": 1.2005245534438336, + "rewards/mask_iou_reward": 0.6002622767219168, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.196008563041687, + "rewards/thk_ans_format_reward": 1.0, + "step": 169, + "think_completion_length": 73.45833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.93750762939453, + "epoch": 0.5733558178752108, + "grad_norm": 9.760113218323678, + "kl": 0.27294921875, + "learning_rate": 9.521396396396396e-07, + "loss": 0.0003, + "reward": 3.3592395782470703, + "reward_std": 0.20917140692472458, + "rewards/final_reward": 1.4165027563247574, + "rewards/mask_iou_reward": 0.7082513781623787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3592395782470703, + "rewards/thk_ans_format_reward": 1.0, + "step": 170, + "think_completion_length": 84.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.90625762939453, + "epoch": 0.5767284991568297, + "grad_norm": 31.25506389342562, + "kl": 0.29296875, + "learning_rate": 9.518581081081081e-07, + "loss": 0.0003, + "reward": 3.5906260013580322, + "reward_std": 0.2482130452990532, + "rewards/final_reward": 1.7329653551792745, + "rewards/mask_iou_reward": 0.8664826775896373, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6010427474975586, + "rewards/thk_ans_format_reward": 1.0, + "step": 171, + "think_completion_length": 83.66666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.8229217529297, + "epoch": 0.5801011804384486, + "grad_norm": 5.221187581915739, + "kl": 0.3369140625, + "learning_rate": 9.515765765765766e-07, + "loss": 0.0003, + "reward": 3.1545242071151733, + "reward_std": 0.23806846141815186, + "rewards/final_reward": 1.0177511067470535, + "rewards/mask_iou_reward": 0.5088755533735267, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1545242071151733, + "rewards/thk_ans_format_reward": 1.0, + "step": 172, + "think_completion_length": 77.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.21875, + "epoch": 0.5834738617200674, + "grad_norm": 6.89011581902954, + "kl": 0.259765625, + "learning_rate": 9.51295045045045e-07, + "loss": 0.0003, + "reward": 3.1506524085998535, + "reward_std": 0.20986726135015488, + "rewards/final_reward": 0.9594745083224383, + "rewards/mask_iou_reward": 0.47973725416121915, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1506522297859192, + "rewards/thk_ans_format_reward": 1.0, + "step": 173, + "think_completion_length": 137.54166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.08333587646484, + "epoch": 0.5868465430016864, + "grad_norm": 13.689620989891141, + "kl": 0.25439453125, + "learning_rate": 9.510135135135135e-07, + "loss": 0.0001, + "reward": 2.8189436197280884, + "reward_std": 0.12411446496844292, + "rewards/final_reward": 0.6297055339218588, + "rewards/mask_iou_reward": 0.3148527669609294, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.818943589925766, + "rewards/thk_ans_format_reward": 1.0, + "step": 174, + "think_completion_length": 102.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.42709350585938, + "epoch": 0.5902192242833052, + "grad_norm": 10.636713075668137, + "kl": 0.240234375, + "learning_rate": 9.50731981981982e-07, + "loss": 0.0002, + "reward": 3.1594501733779907, + "reward_std": 0.33075501024723053, + "rewards/final_reward": 0.22629558092618446, + "rewards/mask_iou_reward": 0.11314779046309223, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1594501733779907, + "rewards/thk_ans_format_reward": 1.0, + "step": 175, + "think_completion_length": 108.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.1666717529297, + "epoch": 0.5935919055649241, + "grad_norm": 4.761664368364506, + "kl": 0.2587890625, + "learning_rate": 9.504504504504504e-07, + "loss": 0.0003, + "reward": 3.0785621404647827, + "reward_std": 0.24554403126239777, + "rewards/final_reward": 1.2377652068387972, + "rewards/mask_iou_reward": 0.6188826034193986, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0785619616508484, + "rewards/thk_ans_format_reward": 1.0, + "step": 176, + "think_completion_length": 105.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.11459350585938, + "epoch": 0.596964586846543, + "grad_norm": 30.958999247456223, + "kl": 0.2939453125, + "learning_rate": 9.50168918918919e-07, + "loss": 0.0003, + "reward": 3.3248791694641113, + "reward_std": 0.32074373215436935, + "rewards/final_reward": 1.5646520924025296, + "rewards/mask_iou_reward": 0.7823260462012648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3248790502548218, + "rewards/thk_ans_format_reward": 1.0, + "step": 177, + "think_completion_length": 86.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.42709350585938, + "epoch": 0.6003372681281619, + "grad_norm": 8.363721329646177, + "kl": 0.26708984375, + "learning_rate": 9.498873873873874e-07, + "loss": 0.0003, + "reward": 2.8474905490875244, + "reward_std": 0.17037975788116455, + "rewards/final_reward": 1.4689836335171615, + "rewards/mask_iou_reward": 0.7344918167585808, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.847490519285202, + "rewards/thk_ans_format_reward": 1.0, + "step": 178, + "think_completion_length": 95.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.23959350585938, + "epoch": 0.6037099494097807, + "grad_norm": 16.177949288946124, + "kl": 0.25244140625, + "learning_rate": 9.496058558558558e-07, + "loss": 0.0003, + "reward": 2.9460397958755493, + "reward_std": 0.40348224341869354, + "rewards/final_reward": 0.7546152974114886, + "rewards/mask_iou_reward": 0.3773076487057443, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9460396766662598, + "rewards/thk_ans_format_reward": 1.0, + "step": 179, + "think_completion_length": 127.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.65625762939453, + "epoch": 0.6070826306913997, + "grad_norm": 5.815776866446163, + "kl": 0.2744140625, + "learning_rate": 9.493243243243243e-07, + "loss": 0.0003, + "reward": 2.947666049003601, + "reward_std": 0.2804013565182686, + "rewards/final_reward": 1.3667480820389621, + "rewards/mask_iou_reward": 0.6833740410194811, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9476659297943115, + "rewards/thk_ans_format_reward": 1.0, + "step": 180, + "think_completion_length": 91.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0104217529297, + "epoch": 0.6104553119730185, + "grad_norm": 6.897634968859645, + "kl": 0.2666015625, + "learning_rate": 9.490427927927927e-07, + "loss": 0.0003, + "reward": 2.8933818340301514, + "reward_std": 0.3454447239637375, + "rewards/final_reward": 1.2089149397368106, + "rewards/mask_iou_reward": 0.6044574698684053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8933817148208618, + "rewards/thk_ans_format_reward": 1.0, + "step": 181, + "think_completion_length": 101.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.9479217529297, + "epoch": 0.6138279932546374, + "grad_norm": 23.56354755740188, + "kl": 0.2900390625, + "learning_rate": 9.487612612612612e-07, + "loss": 0.0003, + "reward": 3.2707679271698, + "reward_std": 0.3094882294535637, + "rewards/final_reward": 1.6510349024018907, + "rewards/mask_iou_reward": 0.8255174512009453, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.270767867565155, + "rewards/thk_ans_format_reward": 1.0, + "step": 182, + "think_completion_length": 91.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.5416717529297, + "epoch": 0.6172006745362564, + "grad_norm": 7.566129474584754, + "kl": 0.23828125, + "learning_rate": 9.484797297297296e-07, + "loss": 0.0003, + "reward": 3.528642773628235, + "reward_std": 0.19238104671239853, + "rewards/final_reward": 1.3287466025722514, + "rewards/mask_iou_reward": 0.6643733012861257, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5286428332328796, + "rewards/thk_ans_format_reward": 1.0, + "step": 183, + "think_completion_length": 104.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.14583587646484, + "epoch": 0.6205733558178752, + "grad_norm": 19.462449457855936, + "kl": 0.314453125, + "learning_rate": 9.481981981981981e-07, + "loss": 0.0003, + "reward": 3.2879135608673096, + "reward_std": 0.1729014366865158, + "rewards/final_reward": 0.6132975123786321, + "rewards/mask_iou_reward": 0.30664875618931603, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2879136204719543, + "rewards/thk_ans_format_reward": 1.0, + "step": 184, + "think_completion_length": 105.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.71875762939453, + "epoch": 0.6239460370994941, + "grad_norm": 10.238505822769678, + "kl": 0.291015625, + "learning_rate": 9.479166666666666e-07, + "loss": 0.0003, + "reward": 3.4363847970962524, + "reward_std": 0.08068331144750118, + "rewards/final_reward": 1.9090332802960994, + "rewards/mask_iou_reward": 0.9545166401480497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.436384677886963, + "rewards/thk_ans_format_reward": 1.0, + "step": 185, + "think_completion_length": 95.08333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.87500762939453, + "epoch": 0.627318718381113, + "grad_norm": 55.15622668515774, + "kl": 0.25390625, + "learning_rate": 9.47635135135135e-07, + "loss": 0.0003, + "reward": 3.5349490642547607, + "reward_std": 0.11874636262655258, + "rewards/final_reward": 1.7553482866542853, + "rewards/mask_iou_reward": 0.8776741433271427, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.534949004650116, + "rewards/thk_ans_format_reward": 1.0, + "step": 186, + "think_completion_length": 94.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.3541717529297, + "epoch": 0.6306913996627319, + "grad_norm": 18.497533459465107, + "kl": 0.275390625, + "learning_rate": 9.473536036036036e-07, + "loss": 0.0003, + "reward": 3.135318160057068, + "reward_std": 0.2178964763879776, + "rewards/final_reward": 1.3029347302478484, + "rewards/mask_iou_reward": 0.6514673651239242, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.135318249464035, + "rewards/thk_ans_format_reward": 1.0, + "step": 187, + "think_completion_length": 97.08333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.17708587646484, + "epoch": 0.6340640809443507, + "grad_norm": 9.223637235595785, + "kl": 0.2685546875, + "learning_rate": 9.47072072072072e-07, + "loss": 0.0003, + "reward": 2.919049620628357, + "reward_std": 0.11340761929750443, + "rewards/final_reward": 0.1910890640154114, + "rewards/mask_iou_reward": 0.0955445320077057, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9190497100353241, + "rewards/thk_ans_format_reward": 1.0, + "step": 188, + "think_completion_length": 92.20833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.80208587646484, + "epoch": 0.6374367622259697, + "grad_norm": 7.568967069603324, + "kl": 0.2451171875, + "learning_rate": 9.467905405405405e-07, + "loss": 0.0002, + "reward": 3.226958751678467, + "reward_std": 0.22894418239593506, + "rewards/final_reward": 1.2491899400329127, + "rewards/mask_iou_reward": 0.6245949700164564, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2373753786087036, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 189, + "think_completion_length": 104.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.73958587646484, + "epoch": 0.6408094435075885, + "grad_norm": 5.306307755700064, + "kl": 0.2431640625, + "learning_rate": 9.46509009009009e-07, + "loss": 0.0002, + "reward": 3.098444104194641, + "reward_std": 0.3319072127342224, + "rewards/final_reward": 1.2748850668597314, + "rewards/mask_iou_reward": 0.6374425334298657, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.108860731124878, + "rewards/thk_ans_format_reward": 1.0, + "step": 190, + "think_completion_length": 75.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.34375, + "epoch": 0.6441821247892074, + "grad_norm": 6.471694725642505, + "kl": 0.25390625, + "learning_rate": 9.462274774774774e-07, + "loss": 0.0003, + "reward": 2.9371249675750732, + "reward_std": 0.2752944231033325, + "rewards/final_reward": 1.2037132542187892, + "rewards/mask_iou_reward": 0.6018566271093946, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9371249675750732, + "rewards/thk_ans_format_reward": 1.0, + "step": 191, + "think_completion_length": 101.66666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.39583587646484, + "epoch": 0.6475548060708263, + "grad_norm": 4.684049245891642, + "kl": 0.2587890625, + "learning_rate": 9.459459459459459e-07, + "loss": 0.0003, + "reward": 3.32126247882843, + "reward_std": 0.2728252410888672, + "rewards/final_reward": 1.6915106845598409, + "rewards/mask_iou_reward": 0.8457553422799204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3212623596191406, + "rewards/thk_ans_format_reward": 1.0, + "step": 192, + "think_completion_length": 94.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.84375762939453, + "epoch": 0.6509274873524452, + "grad_norm": 5.739390664097678, + "kl": 0.24267578125, + "learning_rate": 9.456644144144143e-07, + "loss": 0.0002, + "reward": 3.100097417831421, + "reward_std": 0.3362935483455658, + "rewards/final_reward": 1.1146668513107205, + "rewards/mask_iou_reward": 0.5573334256553603, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1000972986221313, + "rewards/thk_ans_format_reward": 1.0, + "step": 193, + "think_completion_length": 114.41666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.58333587646484, + "epoch": 0.654300168634064, + "grad_norm": 8.735426480074405, + "kl": 0.26953125, + "learning_rate": 9.453828828828828e-07, + "loss": 0.0003, + "reward": 3.3539435863494873, + "reward_std": 0.3737208843231201, + "rewards/final_reward": 1.2457365957684952, + "rewards/mask_iou_reward": 0.6228682978842476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3539432883262634, + "rewards/thk_ans_format_reward": 1.0, + "step": 194, + "think_completion_length": 96.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.65625762939453, + "epoch": 0.657672849915683, + "grad_norm": 5.834979241570896, + "kl": 0.2578125, + "learning_rate": 9.451013513513513e-07, + "loss": 0.0003, + "reward": 3.2236061096191406, + "reward_std": 0.2855689972639084, + "rewards/final_reward": 1.3954733399497419, + "rewards/mask_iou_reward": 0.6977366699748709, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2236062288284302, + "rewards/thk_ans_format_reward": 1.0, + "step": 195, + "think_completion_length": 114.41666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.12500762939453, + "epoch": 0.6610455311973018, + "grad_norm": 62.393868086469084, + "kl": 0.26171875, + "learning_rate": 9.448198198198197e-07, + "loss": 0.0003, + "reward": 3.0305802822113037, + "reward_std": 0.2396130934357643, + "rewards/final_reward": 0.3726910393779626, + "rewards/mask_iou_reward": 0.1863455196889813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0305803418159485, + "rewards/thk_ans_format_reward": 1.0, + "step": 196, + "think_completion_length": 104.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.65625762939453, + "epoch": 0.6644182124789207, + "grad_norm": 9.09552951283109, + "kl": 0.2548828125, + "learning_rate": 9.445382882882883e-07, + "loss": 0.0003, + "reward": 3.033574938774109, + "reward_std": 0.30480627715587616, + "rewards/final_reward": 0.6127038885463165, + "rewards/mask_iou_reward": 0.30635194427315826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0335749089717865, + "rewards/thk_ans_format_reward": 1.0, + "step": 197, + "think_completion_length": 109.29166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.6041717529297, + "epoch": 0.6677908937605397, + "grad_norm": 4.541735314198181, + "kl": 0.2734375, + "learning_rate": 9.442567567567568e-07, + "loss": 0.0003, + "reward": 3.2191314697265625, + "reward_std": 0.12774834409356117, + "rewards/final_reward": 1.363589056546357, + "rewards/mask_iou_reward": 0.6817945282731785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2191317081451416, + "rewards/thk_ans_format_reward": 1.0, + "step": 198, + "think_completion_length": 109.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.11459350585938, + "epoch": 0.6711635750421585, + "grad_norm": 7.252136816346154, + "kl": 0.23779296875, + "learning_rate": 9.439752252252252e-07, + "loss": 0.0002, + "reward": 3.0531833171844482, + "reward_std": 0.40384694933891296, + "rewards/final_reward": 1.0973872267338165, + "rewards/mask_iou_reward": 0.5486936133669083, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0531832575798035, + "rewards/thk_ans_format_reward": 1.0, + "step": 199, + "think_completion_length": 125.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.6041717529297, + "epoch": 0.6745362563237775, + "grad_norm": 11.134304628990678, + "kl": 0.2470703125, + "learning_rate": 9.436936936936937e-07, + "loss": 0.0002, + "reward": 3.1323522329330444, + "reward_std": 0.34756386280059814, + "rewards/final_reward": 0.4455057568614778, + "rewards/mask_iou_reward": 0.2227528784307389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1323521137237549, + "rewards/thk_ans_format_reward": 1.0, + "step": 200, + "think_completion_length": 121.54166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.18750762939453, + "epoch": 0.6779089376053963, + "grad_norm": 12.334649936602695, + "kl": 0.26123046875, + "learning_rate": 9.434121621621621e-07, + "loss": 0.0003, + "reward": 3.055158853530884, + "reward_std": 0.1682959347963333, + "rewards/final_reward": 0.5981111851549227, + "rewards/mask_iou_reward": 0.29905559257746134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0551589131355286, + "rewards/thk_ans_format_reward": 1.0, + "step": 201, + "think_completion_length": 151.45833333333331 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.67709350585938, + "epoch": 0.6812816188870152, + "grad_norm": 7.556066147867235, + "kl": 0.24072265625, + "learning_rate": 9.431306306306306e-07, + "loss": 0.0002, + "reward": 3.263322353363037, + "reward_std": 0.2816054970026016, + "rewards/final_reward": 1.0462942667515105, + "rewards/mask_iou_reward": 0.5231471333757552, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2633222341537476, + "rewards/thk_ans_format_reward": 1.0, + "step": 202, + "think_completion_length": 115.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.12500762939453, + "epoch": 0.684654300168634, + "grad_norm": 6.2700840389340575, + "kl": 0.2587890625, + "learning_rate": 9.428490990990991e-07, + "loss": 0.0003, + "reward": 2.998106360435486, + "reward_std": 0.26637783646583557, + "rewards/final_reward": 1.4625346666906975, + "rewards/mask_iou_reward": 0.7312673333453488, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.998106449842453, + "rewards/thk_ans_format_reward": 1.0, + "step": 203, + "think_completion_length": 117.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.23958587646484, + "epoch": 0.688026981450253, + "grad_norm": 3.79495885528206, + "kl": 0.240234375, + "learning_rate": 9.425675675675675e-07, + "loss": 0.0002, + "reward": 3.110295057296753, + "reward_std": 0.24425452947616577, + "rewards/final_reward": 1.3623789795938142, + "rewards/mask_iou_reward": 0.6811894897969071, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.110295057296753, + "rewards/thk_ans_format_reward": 1.0, + "step": 204, + "think_completion_length": 127.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.9791717529297, + "epoch": 0.6913996627318718, + "grad_norm": 37.346696638085206, + "kl": 0.2841796875, + "learning_rate": 9.42286036036036e-07, + "loss": 0.0003, + "reward": 3.355635404586792, + "reward_std": 0.22298195213079453, + "rewards/final_reward": 1.2873584034100065, + "rewards/mask_iou_reward": 0.6436792017050033, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3556353449821472, + "rewards/thk_ans_format_reward": 1.0, + "step": 205, + "think_completion_length": 143.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.96875762939453, + "epoch": 0.6947723440134908, + "grad_norm": 5.433853004920652, + "kl": 0.2333984375, + "learning_rate": 9.420045045045044e-07, + "loss": 0.0002, + "reward": 3.380358934402466, + "reward_std": 0.16319020092487335, + "rewards/final_reward": 1.0986353687626877, + "rewards/mask_iou_reward": 0.5493176843813439, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3803590536117554, + "rewards/thk_ans_format_reward": 1.0, + "step": 206, + "think_completion_length": 143.83333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.90625, + "epoch": 0.6981450252951096, + "grad_norm": 4.334262383011323, + "kl": 0.228515625, + "learning_rate": 9.41722972972973e-07, + "loss": 0.0002, + "reward": 3.2010854482650757, + "reward_std": 0.19329330325126648, + "rewards/final_reward": 1.3971072227643941, + "rewards/mask_iou_reward": 0.6985536113821971, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2010854482650757, + "rewards/thk_ans_format_reward": 1.0, + "step": 207, + "think_completion_length": 159.54166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.6666717529297, + "epoch": 0.7015177065767285, + "grad_norm": 4.896160260038562, + "kl": 0.251953125, + "learning_rate": 9.414414414414415e-07, + "loss": 0.0003, + "reward": 3.0796992778778076, + "reward_std": 0.13358672708272934, + "rewards/final_reward": 1.442706489408488, + "rewards/mask_iou_reward": 0.721353244704244, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.079699158668518, + "rewards/thk_ans_format_reward": 1.0, + "step": 208, + "think_completion_length": 143.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.14583587646484, + "epoch": 0.7048903878583473, + "grad_norm": 19.68180647849107, + "kl": 0.25, + "learning_rate": 9.411599099099099e-07, + "loss": 0.0003, + "reward": 2.9135031700134277, + "reward_std": 0.18659613281488419, + "rewards/final_reward": 0.8169442864924998, + "rewards/mask_iou_reward": 0.4084721432462499, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.913503110408783, + "rewards/thk_ans_format_reward": 1.0, + "step": 209, + "think_completion_length": 152.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.89583587646484, + "epoch": 0.7082630691399663, + "grad_norm": 15.962050613392199, + "kl": 0.2509765625, + "learning_rate": 9.408783783783784e-07, + "loss": 0.0002, + "reward": 2.6730724573135376, + "reward_std": 0.1774405688047409, + "rewards/final_reward": 0.3143013680390508, + "rewards/mask_iou_reward": 0.1571506840195254, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6730725169181824, + "rewards/thk_ans_format_reward": 1.0, + "step": 210, + "think_completion_length": 146.91666666666669 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.1666717529297, + "epoch": 0.7116357504215851, + "grad_norm": 10.032991417039138, + "kl": 0.2451171875, + "learning_rate": 9.405968468468469e-07, + "loss": 0.0002, + "reward": 3.3573029041290283, + "reward_std": 0.12738436460494995, + "rewards/final_reward": 1.4354043883805865, + "rewards/mask_iou_reward": 0.7177021941902932, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.357302963733673, + "rewards/thk_ans_format_reward": 1.0, + "step": 211, + "think_completion_length": 139.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.1875, + "epoch": 0.715008431703204, + "grad_norm": 8.828001892207169, + "kl": 0.24609375, + "learning_rate": 9.403153153153153e-07, + "loss": 0.0002, + "reward": 3.031591057777405, + "reward_std": 0.1295524761080742, + "rewards/final_reward": 0.9175850267396465, + "rewards/mask_iou_reward": 0.45879251336982324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0315908789634705, + "rewards/thk_ans_format_reward": 1.0, + "step": 212, + "think_completion_length": 135.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.9791717529297, + "epoch": 0.718381112984823, + "grad_norm": 3.6035040176651636, + "kl": 0.2802734375, + "learning_rate": 9.400337837837838e-07, + "loss": 0.0003, + "reward": 2.9412542581558228, + "reward_std": 0.20104002207517624, + "rewards/final_reward": 0.9532803738516593, + "rewards/mask_iou_reward": 0.47664018692582966, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9412541389465332, + "rewards/thk_ans_format_reward": 1.0, + "step": 213, + "think_completion_length": 136.58333333333331 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.6979217529297, + "epoch": 0.7217537942664418, + "grad_norm": 6.910035871057598, + "kl": 0.22509765625, + "learning_rate": 9.397522522522522e-07, + "loss": 0.0002, + "reward": 2.9119738340377808, + "reward_std": 0.2971492111682892, + "rewards/final_reward": 1.0333231695646627, + "rewards/mask_iou_reward": 0.5166615847823314, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9119738936424255, + "rewards/thk_ans_format_reward": 1.0, + "step": 214, + "think_completion_length": 158.16666666666669 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.80208587646484, + "epoch": 0.7251264755480608, + "grad_norm": 6.761279164799118, + "kl": 0.2802734375, + "learning_rate": 9.394707207207207e-07, + "loss": 0.0003, + "reward": 3.0778356790542603, + "reward_std": 0.18553235195577145, + "rewards/final_reward": 1.3225514499288327, + "rewards/mask_iou_reward": 0.6612757249644163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0778355598449707, + "rewards/thk_ans_format_reward": 1.0, + "step": 215, + "think_completion_length": 124.91666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.5416717529297, + "epoch": 0.7284991568296796, + "grad_norm": 7.233850506151749, + "kl": 0.3037109375, + "learning_rate": 9.391891891891892e-07, + "loss": 0.0003, + "reward": 2.8451234102249146, + "reward_std": 0.23334325850009918, + "rewards/final_reward": 1.244009831333866, + "rewards/mask_iou_reward": 0.622004915666933, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8451233506202698, + "rewards/thk_ans_format_reward": 1.0, + "step": 216, + "think_completion_length": 111.45833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.27083587646484, + "epoch": 0.7318718381112985, + "grad_norm": 5.531946000322627, + "kl": 0.259765625, + "learning_rate": 9.389076576576577e-07, + "loss": 0.0003, + "reward": 3.2371309995651245, + "reward_std": 0.22913970798254013, + "rewards/final_reward": 1.3814128170381639, + "rewards/mask_iou_reward": 0.6907064085190819, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2371309995651245, + "rewards/thk_ans_format_reward": 1.0, + "step": 217, + "think_completion_length": 144.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.27083587646484, + "epoch": 0.7352445193929174, + "grad_norm": 4.716600374535074, + "kl": 0.275390625, + "learning_rate": 9.386261261261261e-07, + "loss": 0.0003, + "reward": 3.3373496532440186, + "reward_std": 0.10465261340141296, + "rewards/final_reward": 1.1443409000330345, + "rewards/mask_iou_reward": 0.5721704500165172, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.337349534034729, + "rewards/thk_ans_format_reward": 1.0, + "step": 218, + "think_completion_length": 151.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.6354217529297, + "epoch": 0.7386172006745363, + "grad_norm": 4.826058419182305, + "kl": 0.271484375, + "learning_rate": 9.383445945945945e-07, + "loss": 0.0003, + "reward": 3.228532910346985, + "reward_std": 0.2345435619354248, + "rewards/final_reward": 0.9689652674189777, + "rewards/mask_iou_reward": 0.48448263370948885, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2285327315330505, + "rewards/thk_ans_format_reward": 1.0, + "step": 219, + "think_completion_length": 120.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.18750762939453, + "epoch": 0.7419898819561551, + "grad_norm": 13.942520207451084, + "kl": 0.294921875, + "learning_rate": 9.38063063063063e-07, + "loss": 0.0003, + "reward": 3.0537610054016113, + "reward_std": 0.22725334763526917, + "rewards/final_reward": 1.3937016158440043, + "rewards/mask_iou_reward": 0.6968508079220022, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.053760975599289, + "rewards/thk_ans_format_reward": 1.0, + "step": 220, + "think_completion_length": 135.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.3229217529297, + "epoch": 0.7453625632377741, + "grad_norm": 12.578546976305708, + "kl": 0.265625, + "learning_rate": 9.377815315315315e-07, + "loss": 0.0003, + "reward": 3.0679415464401245, + "reward_std": 0.24034039676189423, + "rewards/final_reward": 1.0855294095190113, + "rewards/mask_iou_reward": 0.5427647047595057, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.067941665649414, + "rewards/thk_ans_format_reward": 1.0, + "step": 221, + "think_completion_length": 120.08333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.00000762939453, + "epoch": 0.7487352445193929, + "grad_norm": 11.785419292047843, + "kl": 0.310546875, + "learning_rate": 9.374999999999999e-07, + "loss": 0.0003, + "reward": 3.1781201362609863, + "reward_std": 0.19415687024593353, + "rewards/final_reward": 1.2690602405632174, + "rewards/mask_iou_reward": 0.6345301202816087, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1781199276447296, + "rewards/thk_ans_format_reward": 1.0, + "step": 222, + "think_completion_length": 114.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.45834350585938, + "epoch": 0.7521079258010118, + "grad_norm": 10.962821825044882, + "kl": 0.3408203125, + "learning_rate": 9.372184684684684e-07, + "loss": 0.0003, + "reward": 3.1292738914489746, + "reward_std": 0.28961120545864105, + "rewards/final_reward": 1.1409641431470974, + "rewards/mask_iou_reward": 0.5704820715735487, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1292736232280731, + "rewards/thk_ans_format_reward": 1.0, + "step": 223, + "think_completion_length": 115.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.14584350585938, + "epoch": 0.7554806070826307, + "grad_norm": 8.060905494849647, + "kl": 0.2578125, + "learning_rate": 9.369369369369368e-07, + "loss": 0.0003, + "reward": 3.0584421157836914, + "reward_std": 0.278117410838604, + "rewards/final_reward": 1.3437103210085293, + "rewards/mask_iou_reward": 0.6718551605042646, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0584419965744019, + "rewards/thk_ans_format_reward": 1.0, + "step": 224, + "think_completion_length": 138.20833333333331 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.6354217529297, + "epoch": 0.7588532883642496, + "grad_norm": 5.240615156237188, + "kl": 0.29296875, + "learning_rate": 9.366554054054053e-07, + "loss": 0.0003, + "reward": 2.867990255355835, + "reward_std": 0.21263901889324188, + "rewards/final_reward": 0.36395343568342153, + "rewards/mask_iou_reward": 0.18197671784171077, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8679901361465454, + "rewards/thk_ans_format_reward": 1.0, + "step": 225, + "think_completion_length": 108.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.86458587646484, + "epoch": 0.7622259696458684, + "grad_norm": 6.065632045897228, + "kl": 0.32421875, + "learning_rate": 9.363738738738738e-07, + "loss": 0.0003, + "reward": 3.0205975770950317, + "reward_std": 0.10638023167848587, + "rewards/final_reward": 1.1502363860044684, + "rewards/mask_iou_reward": 0.5751181930022342, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0205976217985153, + "rewards/thk_ans_format_reward": 1.0, + "step": 226, + "think_completion_length": 92.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.625, + "epoch": 0.7655986509274874, + "grad_norm": 4.96500687828192, + "kl": 0.3212890625, + "learning_rate": 9.360923423423422e-07, + "loss": 0.0003, + "reward": 3.1703583002090454, + "reward_std": 0.14759309589862823, + "rewards/final_reward": 1.3432951532984052, + "rewards/mask_iou_reward": 0.6716475766492026, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1703582406044006, + "rewards/thk_ans_format_reward": 1.0, + "step": 227, + "think_completion_length": 104.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.7291717529297, + "epoch": 0.7689713322091062, + "grad_norm": 7.745198108166682, + "kl": 0.462890625, + "learning_rate": 9.358108108108108e-07, + "loss": 0.0006, + "reward": 2.9740800857543945, + "reward_std": 0.3129550665616989, + "rewards/final_reward": 1.496134752534723, + "rewards/mask_iou_reward": 0.7480673762673615, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9740800261497498, + "rewards/thk_ans_format_reward": 1.0, + "step": 228, + "think_completion_length": 113.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.34375762939453, + "epoch": 0.7723440134907251, + "grad_norm": 12.566104219984988, + "kl": 0.3935546875, + "learning_rate": 9.355292792792792e-07, + "loss": 0.0004, + "reward": 3.2419862747192383, + "reward_std": 0.22945839166641235, + "rewards/final_reward": 1.9098584814414274, + "rewards/mask_iou_reward": 0.9549292407207137, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2419861555099487, + "rewards/thk_ans_format_reward": 1.0, + "step": 229, + "think_completion_length": 92.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.95833587646484, + "epoch": 0.7757166947723441, + "grad_norm": 6.96585549840121, + "kl": 0.318359375, + "learning_rate": 9.352477477477477e-07, + "loss": 0.0003, + "reward": 3.1130073070526123, + "reward_std": 0.3432839810848236, + "rewards/final_reward": 1.2066239717794454, + "rewards/mask_iou_reward": 0.6033119858897227, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1130073070526123, + "rewards/thk_ans_format_reward": 1.0, + "step": 230, + "think_completion_length": 103.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.53125762939453, + "epoch": 0.7790893760539629, + "grad_norm": 13.129698712685878, + "kl": 0.283203125, + "learning_rate": 9.349662162162162e-07, + "loss": 0.0003, + "reward": 3.0300129652023315, + "reward_std": 0.17744334042072296, + "rewards/final_reward": 1.4037192994587762, + "rewards/mask_iou_reward": 0.7018596497293881, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0300128757953644, + "rewards/thk_ans_format_reward": 1.0, + "step": 231, + "think_completion_length": 89.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.55208587646484, + "epoch": 0.7824620573355818, + "grad_norm": 4.300902307031056, + "kl": 0.30859375, + "learning_rate": 9.346846846846846e-07, + "loss": 0.0003, + "reward": 3.1586432456970215, + "reward_std": 0.2268233448266983, + "rewards/final_reward": 0.7366462498745183, + "rewards/mask_iou_reward": 0.36832312493725916, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1586434841156006, + "rewards/thk_ans_format_reward": 1.0, + "step": 232, + "think_completion_length": 81.29166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.64583587646484, + "epoch": 0.7858347386172007, + "grad_norm": 6.856175117227461, + "kl": 0.33203125, + "learning_rate": 9.344031531531531e-07, + "loss": 0.0003, + "reward": 3.409724473953247, + "reward_std": 0.14944615215063095, + "rewards/final_reward": 0.99799662277511, + "rewards/mask_iou_reward": 0.498998311387555, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4097245931625366, + "rewards/thk_ans_format_reward": 1.0, + "step": 233, + "think_completion_length": 86.08333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.55208587646484, + "epoch": 0.7892074198988196, + "grad_norm": 7.912545060528145, + "kl": 0.3115234375, + "learning_rate": 9.341216216216216e-07, + "loss": 0.0003, + "reward": 2.913196563720703, + "reward_std": 0.23317305743694305, + "rewards/final_reward": 0.8338462476799143, + "rewards/mask_iou_reward": 0.41692312383995717, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9131967127323151, + "rewards/thk_ans_format_reward": 1.0, + "step": 234, + "think_completion_length": 77.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.7604217529297, + "epoch": 0.7925801011804384, + "grad_norm": 8.777457829113022, + "kl": 0.326171875, + "learning_rate": 9.3384009009009e-07, + "loss": 0.0003, + "reward": 3.0413867235183716, + "reward_std": 0.2278767228126526, + "rewards/final_reward": 0.5746736596316071, + "rewards/mask_iou_reward": 0.28733682981580355, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0413867831230164, + "rewards/thk_ans_format_reward": 1.0, + "step": 235, + "think_completion_length": 81.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.61459350585938, + "epoch": 0.7959527824620574, + "grad_norm": 7.805444170540823, + "kl": 0.4345703125, + "learning_rate": 9.335585585585585e-07, + "loss": 0.0004, + "reward": 3.0897037982940674, + "reward_std": 0.3523574024438858, + "rewards/final_reward": 1.2939491615198757, + "rewards/mask_iou_reward": 0.6469745807599379, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0897037386894226, + "rewards/thk_ans_format_reward": 1.0, + "step": 236, + "think_completion_length": 67.70833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.67709350585938, + "epoch": 0.7993254637436762, + "grad_norm": 13.994501698735474, + "kl": 0.509765625, + "learning_rate": 9.332770270270269e-07, + "loss": 0.0005, + "reward": 2.8868261575698853, + "reward_std": 0.31315620988607407, + "rewards/final_reward": 1.5869208062798532, + "rewards/mask_iou_reward": 0.7934604031399266, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8868262767791748, + "rewards/thk_ans_format_reward": 1.0, + "step": 237, + "think_completion_length": 71.45833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.30209350585938, + "epoch": 0.8026981450252951, + "grad_norm": 16.495913905202936, + "kl": 0.341796875, + "learning_rate": 9.329954954954955e-07, + "loss": 0.0003, + "reward": 3.322413921356201, + "reward_std": 0.13931374996900558, + "rewards/final_reward": 0.9929226750951938, + "rewards/mask_iou_reward": 0.4964613375475969, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3224138617515564, + "rewards/thk_ans_format_reward": 1.0, + "step": 238, + "think_completion_length": 69.41666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.52083587646484, + "epoch": 0.806070826306914, + "grad_norm": 5.448551014834402, + "kl": 0.3017578125, + "learning_rate": 9.32713963963964e-07, + "loss": 0.0003, + "reward": 3.1825672388076782, + "reward_std": 0.15929418802261353, + "rewards/final_reward": 0.7330971489037537, + "rewards/mask_iou_reward": 0.36654857445187683, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1825674176216125, + "rewards/thk_ans_format_reward": 1.0, + "step": 239, + "think_completion_length": 60.541666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.48959350585938, + "epoch": 0.8094435075885329, + "grad_norm": 5.379072408289089, + "kl": 0.330078125, + "learning_rate": 9.324324324324324e-07, + "loss": 0.0003, + "reward": 2.834401249885559, + "reward_std": 0.28128696233034134, + "rewards/final_reward": 0.7617745861450789, + "rewards/mask_iou_reward": 0.38088729307253943, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 0.8552343845367432, + "rewards/thk_ans_format_reward": 1.0, + "step": 240, + "think_completion_length": 64.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.45833587646484, + "epoch": 0.8128161888701517, + "grad_norm": 20.354490739109075, + "kl": 0.328125, + "learning_rate": 9.321509009009009e-07, + "loss": 0.0003, + "reward": 3.0151419639587402, + "reward_std": 0.15956728160381317, + "rewards/final_reward": 1.046194412546264, + "rewards/mask_iou_reward": 0.523097206273132, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0151418447494507, + "rewards/thk_ans_format_reward": 1.0, + "step": 241, + "think_completion_length": 68.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.67708587646484, + "epoch": 0.8161888701517707, + "grad_norm": 9.0604303740086, + "kl": 0.3173828125, + "learning_rate": 9.318693693693693e-07, + "loss": 0.0003, + "reward": 3.092646598815918, + "reward_std": 0.23580051958560944, + "rewards/final_reward": 0.3709977762409087, + "rewards/mask_iou_reward": 0.18549888812045434, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0926466882228851, + "rewards/thk_ans_format_reward": 1.0, + "step": 242, + "think_completion_length": 67.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.7604217529297, + "epoch": 0.8195615514333895, + "grad_norm": 10.869798525247067, + "kl": 0.3125, + "learning_rate": 9.315878378378378e-07, + "loss": 0.0003, + "reward": 3.2662233114242554, + "reward_std": 0.13431217521429062, + "rewards/final_reward": 0.47974971969132774, + "rewards/mask_iou_reward": 0.23987485984566387, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2662232518196106, + "rewards/thk_ans_format_reward": 1.0, + "step": 243, + "think_completion_length": 59.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.43750762939453, + "epoch": 0.8229342327150084, + "grad_norm": 15.448332221184192, + "kl": 0.2939453125, + "learning_rate": 9.313063063063063e-07, + "loss": 0.0003, + "reward": 2.899739623069763, + "reward_std": 0.21755699813365936, + "rewards/final_reward": 0.7527796999562915, + "rewards/mask_iou_reward": 0.37638984997814573, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8997395038604736, + "rewards/thk_ans_format_reward": 1.0, + "step": 244, + "think_completion_length": 72.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.42708587646484, + "epoch": 0.8263069139966274, + "grad_norm": 17.271046761465456, + "kl": 0.2978515625, + "learning_rate": 9.310247747747747e-07, + "loss": 0.0003, + "reward": 3.221222758293152, + "reward_std": 0.19722003489732742, + "rewards/final_reward": 1.0775364410245993, + "rewards/mask_iou_reward": 0.5387682205122997, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2212228178977966, + "rewards/thk_ans_format_reward": 1.0, + "step": 245, + "think_completion_length": 87.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.61459350585938, + "epoch": 0.8296795952782462, + "grad_norm": 6.529827798108178, + "kl": 0.32421875, + "learning_rate": 9.307432432432432e-07, + "loss": 0.0003, + "reward": 3.007061004638672, + "reward_std": 0.23605723679065704, + "rewards/final_reward": 0.7326444464817671, + "rewards/mask_iou_reward": 0.36632222324088354, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0070610642433167, + "rewards/thk_ans_format_reward": 1.0, + "step": 246, + "think_completion_length": 60.083333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.89583587646484, + "epoch": 0.8330522765598651, + "grad_norm": 6.977264576795783, + "kl": 0.3427734375, + "learning_rate": 9.304617117117116e-07, + "loss": 0.0003, + "reward": 3.198093056678772, + "reward_std": 0.09082278236746788, + "rewards/final_reward": 0.6644612765647351, + "rewards/mask_iou_reward": 0.33223063828236754, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1980929970741272, + "rewards/thk_ans_format_reward": 1.0, + "step": 247, + "think_completion_length": 63.08333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.08333587646484, + "epoch": 0.836424957841484, + "grad_norm": 6.7442749121409555, + "kl": 0.271484375, + "learning_rate": 9.301801801801802e-07, + "loss": 0.0003, + "reward": 3.0286999940872192, + "reward_std": 0.20378149673342705, + "rewards/final_reward": 0.7986789049758385, + "rewards/mask_iou_reward": 0.39933945248791924, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0391165912151337, + "rewards/thk_ans_format_reward": 1.0, + "step": 248, + "think_completion_length": 66.70833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.59375762939453, + "epoch": 0.8397976391231029, + "grad_norm": 15.270005105074052, + "kl": 0.29296875, + "learning_rate": 9.298986486486487e-07, + "loss": 0.0003, + "reward": 2.760540723800659, + "reward_std": 0.2210642360150814, + "rewards/final_reward": 0.802359601161798, + "rewards/mask_iou_reward": 0.401179800580899, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7605406939983368, + "rewards/thk_ans_format_reward": 1.0, + "step": 249, + "think_completion_length": 61.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.87500762939453, + "epoch": 0.8431703204047217, + "grad_norm": 9.56026883907223, + "kl": 0.2939453125, + "learning_rate": 9.296171171171171e-07, + "loss": 0.0003, + "reward": 3.385342001914978, + "reward_std": 0.20231656730175018, + "rewards/final_reward": 1.6732546895302796, + "rewards/mask_iou_reward": 0.8366273447651398, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.385342001914978, + "rewards/thk_ans_format_reward": 1.0, + "step": 250, + "think_completion_length": 60.166666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.67708587646484, + "epoch": 0.8465430016863407, + "grad_norm": 14.351451585261236, + "kl": 0.326171875, + "learning_rate": 9.293355855855856e-07, + "loss": 0.0003, + "reward": 3.166195034980774, + "reward_std": 0.22971728444099426, + "rewards/final_reward": 0.8075349486655949, + "rewards/mask_iou_reward": 0.40376747433279747, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1661950051784515, + "rewards/thk_ans_format_reward": 1.0, + "step": 251, + "think_completion_length": 64.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.34375, + "epoch": 0.8499156829679595, + "grad_norm": 8.474053501046951, + "kl": 0.28515625, + "learning_rate": 9.290540540540541e-07, + "loss": 0.0003, + "reward": 2.7512192726135254, + "reward_std": 0.24805811047554016, + "rewards/final_reward": 1.4858556421360731, + "rewards/mask_iou_reward": 0.7429278210680366, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7512190937995911, + "rewards/thk_ans_format_reward": 1.0, + "step": 252, + "think_completion_length": 66.83333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.75000762939453, + "epoch": 0.8532883642495784, + "grad_norm": 9.573703576617035, + "kl": 0.294921875, + "learning_rate": 9.287725225225225e-07, + "loss": 0.0003, + "reward": 3.110999345779419, + "reward_std": 0.18930789083242416, + "rewards/final_reward": 1.3280421091057069, + "rewards/mask_iou_reward": 0.6640210545528534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1109992265701294, + "rewards/thk_ans_format_reward": 1.0, + "step": 253, + "think_completion_length": 63.333333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.1666717529297, + "epoch": 0.8566610455311973, + "grad_norm": 7.772974324691213, + "kl": 0.2900390625, + "learning_rate": 9.28490990990991e-07, + "loss": 0.0003, + "reward": 2.875392436981201, + "reward_std": 0.18357276916503906, + "rewards/final_reward": 0.6232644154840334, + "rewards/mask_iou_reward": 0.3116322077420167, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8753922432661057, + "rewards/thk_ans_format_reward": 1.0, + "step": 254, + "think_completion_length": 65.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.43750381469727, + "epoch": 0.8600337268128162, + "grad_norm": 10.93387391032777, + "kl": 0.662109375, + "learning_rate": 9.282094594594594e-07, + "loss": 0.0007, + "reward": 3.0346208810806274, + "reward_std": 0.2844673991203308, + "rewards/final_reward": 1.1061904723534532, + "rewards/mask_iou_reward": 0.5530952361767266, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.034620761871338, + "rewards/thk_ans_format_reward": 1.0, + "step": 255, + "think_completion_length": 64.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.00000762939453, + "epoch": 0.863406408094435, + "grad_norm": 8.986343999032677, + "kl": 0.2626953125, + "learning_rate": 9.279279279279278e-07, + "loss": 0.0003, + "reward": 2.7675180435180664, + "reward_std": 0.1956443041563034, + "rewards/final_reward": 0.3999406699768425, + "rewards/mask_iou_reward": 0.19997033498842126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.767518013715744, + "rewards/thk_ans_format_reward": 1.0, + "step": 256, + "think_completion_length": 63.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5104217529297, + "epoch": 0.866779089376054, + "grad_norm": 9.091060447695627, + "kl": 0.2919921875, + "learning_rate": 9.276463963963963e-07, + "loss": 0.0003, + "reward": 3.3000314235687256, + "reward_std": 0.1437247097492218, + "rewards/final_reward": 1.140711099988953, + "rewards/mask_iou_reward": 0.5703555499944765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3000314831733704, + "rewards/thk_ans_format_reward": 1.0, + "step": 257, + "think_completion_length": 63.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.3854217529297, + "epoch": 0.8701517706576728, + "grad_norm": 7.808627484090949, + "kl": 0.2890625, + "learning_rate": 9.273648648648648e-07, + "loss": 0.0003, + "reward": 2.876472234725952, + "reward_std": 0.16375703364610672, + "rewards/final_reward": 0.9649949267346407, + "rewards/mask_iou_reward": 0.48249746336732036, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.8868888914585114, + "rewards/thk_ans_format_reward": 1.0, + "step": 258, + "think_completion_length": 57.458333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.6041717529297, + "epoch": 0.8735244519392917, + "grad_norm": 9.507535751938144, + "kl": 0.2841796875, + "learning_rate": 9.270833333333333e-07, + "loss": 0.0003, + "reward": 3.252575159072876, + "reward_std": 0.17459773272275925, + "rewards/final_reward": 0.8978655204807608, + "rewards/mask_iou_reward": 0.4489327602403804, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2525752782821655, + "rewards/thk_ans_format_reward": 1.0, + "step": 259, + "think_completion_length": 55.54166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.67708587646484, + "epoch": 0.8768971332209107, + "grad_norm": 8.94111935304906, + "kl": 0.28515625, + "learning_rate": 9.268018018018017e-07, + "loss": 0.0003, + "reward": 3.020329713821411, + "reward_std": 0.21678221970796585, + "rewards/final_reward": 0.695925233246127, + "rewards/mask_iou_reward": 0.3479626166230635, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0203297436237335, + "rewards/thk_ans_format_reward": 1.0, + "step": 260, + "think_completion_length": 62.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.86458587646484, + "epoch": 0.8802698145025295, + "grad_norm": 17.658472675005275, + "kl": 0.2666015625, + "learning_rate": 9.265202702702702e-07, + "loss": 0.0003, + "reward": 3.2855056524276733, + "reward_std": 0.1681097000837326, + "rewards/final_reward": 0.8268895833072832, + "rewards/mask_iou_reward": 0.4134447916536416, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2855055332183838, + "rewards/thk_ans_format_reward": 1.0, + "step": 261, + "think_completion_length": 63.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.65625762939453, + "epoch": 0.8836424957841484, + "grad_norm": 7.071090498535165, + "kl": 0.2822265625, + "learning_rate": 9.262387387387387e-07, + "loss": 0.0003, + "reward": 3.1476712226867676, + "reward_std": 0.22411763668060303, + "rewards/final_reward": 1.200344153075685, + "rewards/mask_iou_reward": 0.6001720765378425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1476709842681885, + "rewards/thk_ans_format_reward": 1.0, + "step": 262, + "think_completion_length": 60.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.92708587646484, + "epoch": 0.8870151770657673, + "grad_norm": 6.309393231244978, + "kl": 0.296875, + "learning_rate": 9.259572072072071e-07, + "loss": 0.0003, + "reward": 3.180266857147217, + "reward_std": 0.19312848150730133, + "rewards/final_reward": 1.2631500104789617, + "rewards/mask_iou_reward": 0.6315750052394808, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.180266797542572, + "rewards/thk_ans_format_reward": 1.0, + "step": 263, + "think_completion_length": 64.54166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.58333587646484, + "epoch": 0.8903878583473862, + "grad_norm": 10.50935003017543, + "kl": 0.3125, + "learning_rate": 9.256756756756756e-07, + "loss": 0.0003, + "reward": 2.9178611040115356, + "reward_std": 0.18160251900553703, + "rewards/final_reward": 0.6870835665857659, + "rewards/mask_iou_reward": 0.34354178329288293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9178611040115356, + "rewards/thk_ans_format_reward": 1.0, + "step": 264, + "think_completion_length": 61.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.7916717529297, + "epoch": 0.893760539629005, + "grad_norm": 7.908751555564282, + "kl": 0.3466796875, + "learning_rate": 9.25394144144144e-07, + "loss": 0.0003, + "reward": 3.27189302444458, + "reward_std": 0.19905216246843338, + "rewards/final_reward": 0.7301093085456902, + "rewards/mask_iou_reward": 0.3650546542728451, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2718929648399353, + "rewards/thk_ans_format_reward": 1.0, + "step": 265, + "think_completion_length": 62.08333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.33334350585938, + "epoch": 0.897133220910624, + "grad_norm": 6.078543222665936, + "kl": 0.2958984375, + "learning_rate": 9.251126126126125e-07, + "loss": 0.0003, + "reward": 2.89335036277771, + "reward_std": 0.23937778174877167, + "rewards/final_reward": 0.9255969796433386, + "rewards/mask_iou_reward": 0.4627984898216693, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8933502733707428, + "rewards/thk_ans_format_reward": 1.0, + "step": 266, + "think_completion_length": 69.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.86458587646484, + "epoch": 0.9005059021922428, + "grad_norm": 6.210516801841927, + "kl": 0.271484375, + "learning_rate": 9.24831081081081e-07, + "loss": 0.0004, + "reward": 2.7252787351608276, + "reward_std": 0.3134681284427643, + "rewards/final_reward": 0.2785383929054998, + "rewards/mask_iou_reward": 0.1392691964527499, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 0.7773620188236237, + "rewards/thk_ans_format_reward": 1.0, + "step": 267, + "think_completion_length": 72.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.27083587646484, + "epoch": 0.9038785834738617, + "grad_norm": 4.866428396155994, + "kl": 0.2734375, + "learning_rate": 9.245495495495495e-07, + "loss": 0.0003, + "reward": 2.961507558822632, + "reward_std": 0.13086314499378204, + "rewards/final_reward": 0.4467381767630276, + "rewards/mask_iou_reward": 0.2233690883815138, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9615074098110199, + "rewards/thk_ans_format_reward": 1.0, + "step": 268, + "think_completion_length": 64.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.1666717529297, + "epoch": 0.9072512647554806, + "grad_norm": 9.751330518139088, + "kl": 0.31640625, + "learning_rate": 9.24268018018018e-07, + "loss": 0.0003, + "reward": 3.138562798500061, + "reward_std": 0.18835950642824173, + "rewards/final_reward": 0.7717814922707004, + "rewards/mask_iou_reward": 0.3858907461353502, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1385626792907715, + "rewards/thk_ans_format_reward": 1.0, + "step": 269, + "think_completion_length": 70.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.03125762939453, + "epoch": 0.9106239460370995, + "grad_norm": 5.261070572921644, + "kl": 0.306640625, + "learning_rate": 9.239864864864865e-07, + "loss": 0.0003, + "reward": 3.116981267929077, + "reward_std": 0.24101653695106506, + "rewards/final_reward": 1.3195163044452227, + "rewards/mask_iou_reward": 0.6597581522226114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1169812679290771, + "rewards/thk_ans_format_reward": 1.0, + "step": 270, + "think_completion_length": 73.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.6875, + "epoch": 0.9139966273187183, + "grad_norm": 6.026911173579967, + "kl": 0.2666015625, + "learning_rate": 9.237049549549549e-07, + "loss": 0.0003, + "reward": 3.1197715997695923, + "reward_std": 0.3672148436307907, + "rewards/final_reward": 0.4015020312015995, + "rewards/mask_iou_reward": 0.20075101560079975, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1197713613510132, + "rewards/thk_ans_format_reward": 1.0, + "step": 271, + "think_completion_length": 73.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.28125762939453, + "epoch": 0.9173693086003373, + "grad_norm": 12.035554252990222, + "kl": 0.271484375, + "learning_rate": 9.234234234234234e-07, + "loss": 0.0003, + "reward": 3.3170535564422607, + "reward_std": 0.0954308807849884, + "rewards/final_reward": 1.2198915213394868, + "rewards/mask_iou_reward": 0.6099457606697434, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3170537948608398, + "rewards/thk_ans_format_reward": 1.0, + "step": 272, + "think_completion_length": 72.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.71875, + "epoch": 0.9207419898819561, + "grad_norm": 9.822815415789027, + "kl": 0.2841796875, + "learning_rate": 9.231418918918918e-07, + "loss": 0.0003, + "reward": 3.4125086069107056, + "reward_std": 0.1690869778394699, + "rewards/final_reward": 1.3490748535842736, + "rewards/mask_iou_reward": 0.6745374267921368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4125087261199951, + "rewards/thk_ans_format_reward": 1.0, + "step": 273, + "think_completion_length": 65.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.3541717529297, + "epoch": 0.924114671163575, + "grad_norm": 4.455084478392973, + "kl": 0.302734375, + "learning_rate": 9.228603603603603e-07, + "loss": 0.0003, + "reward": 3.191675901412964, + "reward_std": 0.28003841638565063, + "rewards/final_reward": 1.7114721652174203, + "rewards/mask_iou_reward": 0.8557360826087101, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1916756629943848, + "rewards/thk_ans_format_reward": 1.0, + "step": 274, + "think_completion_length": 71.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.14583587646484, + "epoch": 0.927487352445194, + "grad_norm": 5.316324049123673, + "kl": 0.27734375, + "learning_rate": 9.225788288288288e-07, + "loss": 0.0003, + "reward": 2.9825011491775513, + "reward_std": 0.04889613017439842, + "rewards/final_reward": 0.6555297220953474, + "rewards/mask_iou_reward": 0.3277648610476737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9825010895729065, + "rewards/thk_ans_format_reward": 1.0, + "step": 275, + "think_completion_length": 69.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.8854217529297, + "epoch": 0.9308600337268128, + "grad_norm": 7.561019231281056, + "kl": 0.2998046875, + "learning_rate": 9.222972972972972e-07, + "loss": 0.0003, + "reward": 3.01677143573761, + "reward_std": 0.34936605393886566, + "rewards/final_reward": 0.778316704859161, + "rewards/mask_iou_reward": 0.3891583524295805, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0271879434585571, + "rewards/thk_ans_format_reward": 1.0, + "step": 276, + "think_completion_length": 69.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6666717529297, + "epoch": 0.9342327150084317, + "grad_norm": 11.899753302570709, + "kl": 0.365234375, + "learning_rate": 9.220157657657657e-07, + "loss": 0.0004, + "reward": 3.115198850631714, + "reward_std": 0.35679496824741364, + "rewards/final_reward": 1.2854900381729721, + "rewards/mask_iou_reward": 0.6427450190864861, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.1568655371665955, + "rewards/thk_ans_format_reward": 1.0, + "step": 277, + "think_completion_length": 65.41666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.33333587646484, + "epoch": 0.9376053962900506, + "grad_norm": 6.487732011910408, + "kl": 0.30859375, + "learning_rate": 9.217342342342342e-07, + "loss": 0.0003, + "reward": 3.1566383838653564, + "reward_std": 0.31383951008319855, + "rewards/final_reward": 0.7289020591503977, + "rewards/mask_iou_reward": 0.36445102957519887, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1670548915863037, + "rewards/thk_ans_format_reward": 1.0, + "step": 278, + "think_completion_length": 67.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 0.9409780775716695, + "grad_norm": 14.827734251735015, + "kl": 0.3056640625, + "learning_rate": 9.214527027027027e-07, + "loss": 0.0003, + "reward": 3.365423798561096, + "reward_std": 0.22016916424036026, + "rewards/final_reward": 1.4150596939165623, + "rewards/mask_iou_reward": 0.7075298469582811, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.365423560142517, + "rewards/thk_ans_format_reward": 1.0, + "step": 279, + "think_completion_length": 63.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.77083587646484, + "epoch": 0.9443507588532883, + "grad_norm": 7.785269993330108, + "kl": 0.2861328125, + "learning_rate": 9.211711711711712e-07, + "loss": 0.0003, + "reward": 2.8190064430236816, + "reward_std": 0.20893994718790054, + "rewards/final_reward": 0.2796995562385235, + "rewards/mask_iou_reward": 0.13984977811926175, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8190064132213593, + "rewards/thk_ans_format_reward": 1.0, + "step": 280, + "think_completion_length": 63.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.22916793823242, + "epoch": 0.9477234401349073, + "grad_norm": 16.756402992934778, + "kl": 0.30078125, + "learning_rate": 9.208896396396396e-07, + "loss": 0.0003, + "reward": 3.037890315055847, + "reward_std": 0.23893652856349945, + "rewards/final_reward": 0.7540887574307235, + "rewards/mask_iou_reward": 0.3770443787153617, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0378903150558472, + "rewards/thk_ans_format_reward": 1.0, + "step": 281, + "think_completion_length": 61.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.97916793823242, + "epoch": 0.9510961214165261, + "grad_norm": 5.730385007491753, + "kl": 0.328125, + "learning_rate": 9.206081081081081e-07, + "loss": 0.0003, + "reward": 2.7796366214752197, + "reward_std": 0.22210410237312317, + "rewards/final_reward": 0.9956611302221129, + "rewards/mask_iou_reward": 0.49783056511105644, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7796366810798645, + "rewards/thk_ans_format_reward": 1.0, + "step": 282, + "think_completion_length": 60.708333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.36458587646484, + "epoch": 0.954468802698145, + "grad_norm": 10.183292819390926, + "kl": 0.310546875, + "learning_rate": 9.203265765765765e-07, + "loss": 0.0003, + "reward": 3.1968239545822144, + "reward_std": 0.17233379930257797, + "rewards/final_reward": 1.6030817393916479, + "rewards/mask_iou_reward": 0.8015408696958239, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1968241333961487, + "rewards/thk_ans_format_reward": 1.0, + "step": 283, + "think_completion_length": 70.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0104217529297, + "epoch": 0.9578414839797639, + "grad_norm": 5.6452981089641865, + "kl": 0.337890625, + "learning_rate": 9.20045045045045e-07, + "loss": 0.0003, + "reward": 3.0770174264907837, + "reward_std": 0.40977267920970917, + "rewards/final_reward": 0.8966653625690325, + "rewards/mask_iou_reward": 0.44833268128451625, + "rewards/sam_format_reward": 0.9166666865348816, + "rewards/sam_reward_func_ultra": 1.160350501537323, + "rewards/thk_ans_format_reward": 1.0, + "step": 284, + "think_completion_length": 63.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.31250762939453, + "epoch": 0.9612141652613828, + "grad_norm": 26.57340182822755, + "kl": 0.3095703125, + "learning_rate": 9.197635135135135e-07, + "loss": 0.0003, + "reward": 3.146125912666321, + "reward_std": 0.36684100329875946, + "rewards/final_reward": 1.7262357467138223, + "rewards/mask_iou_reward": 0.8631178733569111, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1461257934570312, + "rewards/thk_ans_format_reward": 1.0, + "step": 285, + "think_completion_length": 54.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.64583587646484, + "epoch": 0.9645868465430016, + "grad_norm": 4.967807923742779, + "kl": 0.3525390625, + "learning_rate": 9.194819819819819e-07, + "loss": 0.0004, + "reward": 2.8754937648773193, + "reward_std": 0.11622428148984909, + "rewards/final_reward": 0.728122555218105, + "rewards/mask_iou_reward": 0.3640612776090525, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8754937052726746, + "rewards/thk_ans_format_reward": 1.0, + "step": 286, + "think_completion_length": 59.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.9791717529297, + "epoch": 0.9679595278246206, + "grad_norm": 7.684197556109967, + "kl": 0.3359375, + "learning_rate": 9.192004504504504e-07, + "loss": 0.0003, + "reward": 3.0598126649856567, + "reward_std": 0.2900129407644272, + "rewards/final_reward": 1.679721792899501, + "rewards/mask_iou_reward": 0.8398608964497505, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0598126649856567, + "rewards/thk_ans_format_reward": 1.0, + "step": 287, + "think_completion_length": 56.708333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.2395896911621, + "epoch": 0.9713322091062394, + "grad_norm": 5.760732467267119, + "kl": 0.365234375, + "learning_rate": 9.18918918918919e-07, + "loss": 0.0004, + "reward": 2.9172106981277466, + "reward_std": 0.15981094166636467, + "rewards/final_reward": 0.9524224872353481, + "rewards/mask_iou_reward": 0.47621124361767403, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9172105193138123, + "rewards/thk_ans_format_reward": 1.0, + "step": 288, + "think_completion_length": 56.04166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.6354217529297, + "epoch": 0.9747048903878583, + "grad_norm": 5.7714829299216355, + "kl": 0.3427734375, + "learning_rate": 9.186373873873874e-07, + "loss": 0.0003, + "reward": 3.3801859617233276, + "reward_std": 0.2743668332695961, + "rewards/final_reward": 1.7714707382638077, + "rewards/mask_iou_reward": 0.8857353691319039, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3801860809326172, + "rewards/thk_ans_format_reward": 1.0, + "step": 289, + "think_completion_length": 57.583333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.75000381469727, + "epoch": 0.9780775716694773, + "grad_norm": 16.609336956336424, + "kl": 0.3720703125, + "learning_rate": 9.183558558558559e-07, + "loss": 0.0005, + "reward": 3.2253894805908203, + "reward_std": 0.17594532668590546, + "rewards/final_reward": 1.675658520954385, + "rewards/mask_iou_reward": 0.8378292604771925, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2358060479164124, + "rewards/thk_ans_format_reward": 1.0, + "step": 290, + "think_completion_length": 55.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.18750762939453, + "epoch": 0.9814502529510961, + "grad_norm": 11.02554804355287, + "kl": 0.353515625, + "learning_rate": 9.180743243243243e-07, + "loss": 0.0004, + "reward": 3.0442588329315186, + "reward_std": 0.31053662300109863, + "rewards/final_reward": 1.8277712111056867, + "rewards/mask_iou_reward": 0.9138856055528434, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0442588925361633, + "rewards/thk_ans_format_reward": 1.0, + "step": 291, + "think_completion_length": 53.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.67708587646484, + "epoch": 0.984822934232715, + "grad_norm": 6.016837428298982, + "kl": 0.3681640625, + "learning_rate": 9.177927927927928e-07, + "loss": 0.0004, + "reward": 3.389703154563904, + "reward_std": 0.17725949734449387, + "rewards/final_reward": 1.3419651151754086, + "rewards/mask_iou_reward": 0.6709825575877043, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3897031545639038, + "rewards/thk_ans_format_reward": 1.0, + "step": 292, + "think_completion_length": 55.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.375, + "epoch": 0.9881956155143339, + "grad_norm": 10.216163938938413, + "kl": 0.3359375, + "learning_rate": 9.175112612612613e-07, + "loss": 0.0003, + "reward": 3.1626222133636475, + "reward_std": 0.20485194586217403, + "rewards/final_reward": 1.1873850468419163, + "rewards/mask_iou_reward": 0.5936925234209581, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1626221537590027, + "rewards/thk_ans_format_reward": 1.0, + "step": 293, + "think_completion_length": 53.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.7604217529297, + "epoch": 0.9915682967959528, + "grad_norm": 8.66201289787897, + "kl": 0.373046875, + "learning_rate": 9.172297297297297e-07, + "loss": 0.0004, + "reward": 3.003246307373047, + "reward_std": 0.18188440799713135, + "rewards/final_reward": 1.3201920244706054, + "rewards/mask_iou_reward": 0.6600960122353027, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0032461285591125, + "rewards/thk_ans_format_reward": 1.0, + "step": 294, + "think_completion_length": 57.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.31250762939453, + "epoch": 0.9949409780775716, + "grad_norm": 10.932256085842326, + "kl": 0.3720703125, + "learning_rate": 9.169481981981982e-07, + "loss": 0.0004, + "reward": 2.940837025642395, + "reward_std": 0.1912689208984375, + "rewards/final_reward": 0.8208294820244021, + "rewards/mask_iou_reward": 0.41041474101220105, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9408369064331055, + "rewards/thk_ans_format_reward": 1.0, + "step": 295, + "think_completion_length": 54.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.92105102539062, + "epoch": 0.9983136593591906, + "grad_norm": 12.666395801396902, + "kl": 0.4267578125, + "learning_rate": 9.166666666666665e-07, + "loss": 0.0004, + "reward": 2.8901513814926147, + "reward_std": 0.2604110687971115, + "rewards/final_reward": 0.7894457895435861, + "rewards/mask_iou_reward": 0.39472289477179306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8901514261960983, + "rewards/thk_ans_format_reward": 1.0, + "step": 296, + "think_completion_length": 49.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.47916793823242, + "epoch": 1.0033726812816188, + "grad_norm": 5.005331899064764, + "kl": 0.4736328125, + "learning_rate": 9.16385135135135e-07, + "loss": 0.0005, + "reward": 3.0064163208007812, + "reward_std": 0.13095365837216377, + "rewards/final_reward": 1.0119502184697924, + "rewards/mask_iou_reward": 0.5059751092348962, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0064163208007812, + "rewards/thk_ans_format_reward": 1.0, + "step": 297, + "think_completion_length": 52.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.40625381469727, + "epoch": 1.0067453625632379, + "grad_norm": 13.649116642243577, + "kl": 0.421875, + "learning_rate": 9.161036036036036e-07, + "loss": 0.0004, + "reward": 3.4440163373947144, + "reward_std": 0.20361152291297913, + "rewards/final_reward": 1.5423490000752769, + "rewards/mask_iou_reward": 0.7711745000376384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4440162181854248, + "rewards/thk_ans_format_reward": 1.0, + "step": 298, + "think_completion_length": 53.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.96875381469727, + "epoch": 1.0101180438448567, + "grad_norm": 5.44147932218635, + "kl": 0.458984375, + "learning_rate": 9.15822072072072e-07, + "loss": 0.0005, + "reward": 3.1435067653656006, + "reward_std": 0.33636271953582764, + "rewards/final_reward": 1.2787666554971673, + "rewards/mask_iou_reward": 0.6393833277485836, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1435065269470215, + "rewards/thk_ans_format_reward": 1.0, + "step": 299, + "think_completion_length": 50.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.52083587646484, + "epoch": 1.0134907251264755, + "grad_norm": 13.105966422900543, + "kl": 0.42578125, + "learning_rate": 9.155405405405405e-07, + "loss": 0.0004, + "reward": 2.835780143737793, + "reward_std": 0.22343186289072037, + "rewards/final_reward": 0.05838687385318925, + "rewards/mask_iou_reward": 0.029193436926594624, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8357802033424377, + "rewards/thk_ans_format_reward": 1.0, + "step": 300, + "think_completion_length": 53.583333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.22917175292969, + "epoch": 1.0168634064080944, + "grad_norm": 16.097568168056785, + "kl": 0.3955078125, + "learning_rate": 9.152590090090089e-07, + "loss": 0.0004, + "reward": 3.235043168067932, + "reward_std": 0.3790423274040222, + "rewards/final_reward": 1.5514956112814668, + "rewards/mask_iou_reward": 0.7757478056407334, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.2975430488586426, + "rewards/thk_ans_format_reward": 1.0, + "step": 301, + "think_completion_length": 54.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.55208587646484, + "epoch": 1.0202360876897134, + "grad_norm": 5.126815049659408, + "kl": 0.3359375, + "learning_rate": 9.149774774774774e-07, + "loss": 0.0003, + "reward": 3.221487045288086, + "reward_std": 0.2783351540565491, + "rewards/final_reward": 0.9219856744032554, + "rewards/mask_iou_reward": 0.4609928372016277, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2214871644973755, + "rewards/thk_ans_format_reward": 1.0, + "step": 302, + "think_completion_length": 54.041666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.125, + "epoch": 1.0236087689713322, + "grad_norm": 7.340733693513745, + "kl": 0.54296875, + "learning_rate": 9.146959459459459e-07, + "loss": 0.0005, + "reward": 3.38637638092041, + "reward_std": 0.14389759302139282, + "rewards/final_reward": 1.8092322656026716, + "rewards/mask_iou_reward": 0.9046161328013358, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3863762021064758, + "rewards/thk_ans_format_reward": 1.0, + "step": 303, + "think_completion_length": 51.166666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.40625, + "epoch": 1.026981450252951, + "grad_norm": 5.265802698091234, + "kl": 0.3916015625, + "learning_rate": 9.144144144144143e-07, + "loss": 0.0004, + "reward": 3.4280911684036255, + "reward_std": 0.12274213880300522, + "rewards/final_reward": 1.5388529343411281, + "rewards/mask_iou_reward": 0.7694264671705641, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4280911087989807, + "rewards/thk_ans_format_reward": 1.0, + "step": 304, + "think_completion_length": 59.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.13542175292969, + "epoch": 1.03035413153457, + "grad_norm": 7.1611615022036, + "kl": 0.373046875, + "learning_rate": 9.141328828828828e-07, + "loss": 0.0004, + "reward": 3.184275269508362, + "reward_std": 0.14417023956775665, + "rewards/final_reward": 0.579938548943454, + "rewards/mask_iou_reward": 0.289969274471727, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1842751204967499, + "rewards/thk_ans_format_reward": 1.0, + "step": 305, + "think_completion_length": 57.791666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.37500381469727, + "epoch": 1.033726812816189, + "grad_norm": 13.402703536491135, + "kl": 0.3583984375, + "learning_rate": 9.138513513513512e-07, + "loss": 0.0004, + "reward": 3.2078309059143066, + "reward_std": 0.23842425644397736, + "rewards/final_reward": 0.8247731713945541, + "rewards/mask_iou_reward": 0.41238658569727704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.207830786705017, + "rewards/thk_ans_format_reward": 1.0, + "step": 306, + "think_completion_length": 52.208333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.34375, + "epoch": 1.0370994940978078, + "grad_norm": 5.58524354551215, + "kl": 0.3662109375, + "learning_rate": 9.135698198198197e-07, + "loss": 0.0004, + "reward": 3.319042444229126, + "reward_std": 0.10922817140817642, + "rewards/final_reward": 1.763124109472381, + "rewards/mask_iou_reward": 0.8815620547361905, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3190423250198364, + "rewards/thk_ans_format_reward": 1.0, + "step": 307, + "think_completion_length": 56.54166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.59375762939453, + "epoch": 1.0404721753794266, + "grad_norm": 5.602803151228455, + "kl": 0.388671875, + "learning_rate": 9.132882882882883e-07, + "loss": 0.0004, + "reward": 3.567653179168701, + "reward_std": 0.15950121730566025, + "rewards/final_reward": 1.6246588918868299, + "rewards/mask_iou_reward": 0.8123294459434149, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5676530003547668, + "rewards/thk_ans_format_reward": 1.0, + "step": 308, + "think_completion_length": 58.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.93750381469727, + "epoch": 1.0438448566610454, + "grad_norm": 6.580711507853475, + "kl": 0.4091796875, + "learning_rate": 9.130067567567567e-07, + "loss": 0.0004, + "reward": 3.3005740642547607, + "reward_std": 0.16971008479595184, + "rewards/final_reward": 0.9619296225234968, + "rewards/mask_iou_reward": 0.4809648112617484, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3005741834640503, + "rewards/thk_ans_format_reward": 1.0, + "step": 309, + "think_completion_length": 57.41666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.27083587646484, + "epoch": 1.0472175379426645, + "grad_norm": 4.5198260011088855, + "kl": 0.4189453125, + "learning_rate": 9.127252252252252e-07, + "loss": 0.0004, + "reward": 3.3053261041641235, + "reward_std": 0.18478236347436905, + "rewards/final_reward": 0.9405771749478797, + "rewards/mask_iou_reward": 0.47028858747393987, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3053261637687683, + "rewards/thk_ans_format_reward": 1.0, + "step": 310, + "think_completion_length": 61.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.90625381469727, + "epoch": 1.0505902192242833, + "grad_norm": 5.3429957757313975, + "kl": 0.3955078125, + "learning_rate": 9.124436936936937e-07, + "loss": 0.0004, + "reward": 3.3730320930480957, + "reward_std": 0.122093815356493, + "rewards/final_reward": 1.3094642304518052, + "rewards/mask_iou_reward": 0.6547321152259026, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3730321526527405, + "rewards/thk_ans_format_reward": 1.0, + "step": 311, + "think_completion_length": 61.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.14583587646484, + "epoch": 1.0539629005059021, + "grad_norm": 39.54666840814476, + "kl": 0.640625, + "learning_rate": 9.121621621621621e-07, + "loss": 0.0006, + "reward": 3.1350291967391968, + "reward_std": 0.17641381546854973, + "rewards/final_reward": 1.493971782697706, + "rewards/mask_iou_reward": 0.746985891348853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1350291967391968, + "rewards/thk_ans_format_reward": 1.0, + "step": 312, + "think_completion_length": 60.333333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.47916793823242, + "epoch": 1.0573355817875212, + "grad_norm": 15.95302602085042, + "kl": 0.4345703125, + "learning_rate": 9.118806306306306e-07, + "loss": 0.0004, + "reward": 3.29574453830719, + "reward_std": 0.12975647673010826, + "rewards/final_reward": 1.198893491412423, + "rewards/mask_iou_reward": 0.5994467457062115, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.29574453830719, + "rewards/thk_ans_format_reward": 1.0, + "step": 313, + "think_completion_length": 61.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.64583587646484, + "epoch": 1.06070826306914, + "grad_norm": 14.401676576205926, + "kl": 0.3408203125, + "learning_rate": 9.11599099099099e-07, + "loss": 0.0003, + "reward": 3.13248074054718, + "reward_std": 0.31112509220838547, + "rewards/final_reward": 1.2162999474308984, + "rewards/mask_iou_reward": 0.6081499737154492, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1324808597564697, + "rewards/thk_ans_format_reward": 1.0, + "step": 314, + "think_completion_length": 58.54166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.8541717529297, + "epoch": 1.0640809443507588, + "grad_norm": 8.28584318628263, + "kl": 0.39453125, + "learning_rate": 9.113175675675675e-07, + "loss": 0.0004, + "reward": 3.271073341369629, + "reward_std": 0.2790771424770355, + "rewards/final_reward": 1.3517874102249416, + "rewards/mask_iou_reward": 0.6758937051124708, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.2919068932533264, + "rewards/thk_ans_format_reward": 1.0, + "step": 315, + "think_completion_length": 57.416666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.33333587646484, + "epoch": 1.0674536256323777, + "grad_norm": 7.111793447392556, + "kl": 0.91796875, + "learning_rate": 9.11036036036036e-07, + "loss": 0.0009, + "reward": 3.2280776500701904, + "reward_std": 0.2805949002504349, + "rewards/final_reward": 0.7988367844819377, + "rewards/mask_iou_reward": 0.39941839224096887, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2280775904655457, + "rewards/thk_ans_format_reward": 1.0, + "step": 316, + "think_completion_length": 64.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.0729217529297, + "epoch": 1.0708263069139967, + "grad_norm": 9.066928980109676, + "kl": 0.3876953125, + "learning_rate": 9.107545045045044e-07, + "loss": 0.0004, + "reward": 3.4015519618988037, + "reward_std": 0.14515436440706253, + "rewards/final_reward": 1.753466714461804, + "rewards/mask_iou_reward": 0.876733357230902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4015517830848694, + "rewards/thk_ans_format_reward": 1.0, + "step": 317, + "think_completion_length": 57.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.31250381469727, + "epoch": 1.0741989881956155, + "grad_norm": 5.169293870970053, + "kl": 0.373046875, + "learning_rate": 9.10472972972973e-07, + "loss": 0.0004, + "reward": 3.331157088279724, + "reward_std": 0.24197855591773987, + "rewards/final_reward": 1.239624879007727, + "rewards/mask_iou_reward": 0.6198124395038636, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.3519902229309082, + "rewards/thk_ans_format_reward": 1.0, + "step": 318, + "think_completion_length": 64.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.90625, + "epoch": 1.0775716694772344, + "grad_norm": 6.663895649543028, + "kl": 0.40234375, + "learning_rate": 9.101914414414415e-07, + "loss": 0.0004, + "reward": 3.3463969230651855, + "reward_std": 0.27501678466796875, + "rewards/final_reward": 0.9133032025802322, + "rewards/mask_iou_reward": 0.4566516012901161, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.346396803855896, + "rewards/thk_ans_format_reward": 1.0, + "step": 319, + "think_completion_length": 56.291666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.25, + "epoch": 1.0809443507588532, + "grad_norm": 4.811356387200662, + "kl": 0.3759765625, + "learning_rate": 9.099099099099099e-07, + "loss": 0.0004, + "reward": 3.1629140377044678, + "reward_std": 0.16561146825551987, + "rewards/final_reward": 1.28658606261834, + "rewards/mask_iou_reward": 0.64329303130917, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1629141569137573, + "rewards/thk_ans_format_reward": 1.0, + "step": 320, + "think_completion_length": 56.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.68750762939453, + "epoch": 1.0843170320404723, + "grad_norm": 5.455993712741003, + "kl": 0.4814453125, + "learning_rate": 9.096283783783784e-07, + "loss": 0.0005, + "reward": 3.163417100906372, + "reward_std": 0.08710319921374321, + "rewards/final_reward": 0.9828214154254767, + "rewards/mask_iou_reward": 0.49141070771273837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1634172201156616, + "rewards/thk_ans_format_reward": 1.0, + "step": 321, + "think_completion_length": 64.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.03125762939453, + "epoch": 1.087689713322091, + "grad_norm": 20.96952304765022, + "kl": 0.5859375, + "learning_rate": 9.093468468468468e-07, + "loss": 0.0006, + "reward": 3.3637170791625977, + "reward_std": 0.17667383700609207, + "rewards/final_reward": 1.1931116823244388, + "rewards/mask_iou_reward": 0.5965558411622194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3637170791625977, + "rewards/thk_ans_format_reward": 1.0, + "step": 322, + "think_completion_length": 58.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.91667175292969, + "epoch": 1.09106239460371, + "grad_norm": 27.194575381051997, + "kl": 0.4462890625, + "learning_rate": 9.090653153153153e-07, + "loss": 0.0004, + "reward": 3.3401143550872803, + "reward_std": 0.09220879897475243, + "rewards/final_reward": 0.9237625026772636, + "rewards/mask_iou_reward": 0.4618812513386318, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3401142954826355, + "rewards/thk_ans_format_reward": 1.0, + "step": 323, + "think_completion_length": 61.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.6354217529297, + "epoch": 1.0944350758853287, + "grad_norm": 7.9243269526549325, + "kl": 0.4013671875, + "learning_rate": 9.087837837837838e-07, + "loss": 0.0004, + "reward": 3.462162137031555, + "reward_std": 0.17394614964723587, + "rewards/final_reward": 0.9065336502215842, + "rewards/mask_iou_reward": 0.4532668251107921, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.462161898612976, + "rewards/thk_ans_format_reward": 1.0, + "step": 324, + "think_completion_length": 59.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.55208587646484, + "epoch": 1.0978077571669478, + "grad_norm": 7.6876068680615495, + "kl": 0.396484375, + "learning_rate": 9.085022522522522e-07, + "loss": 0.0004, + "reward": 3.240488648414612, + "reward_std": 0.24380206316709518, + "rewards/final_reward": 0.9493554668714257, + "rewards/mask_iou_reward": 0.47467773343571285, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2404886484146118, + "rewards/thk_ans_format_reward": 1.0, + "step": 325, + "think_completion_length": 61.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.93750381469727, + "epoch": 1.1011804384485666, + "grad_norm": 17.497626082493245, + "kl": 0.40625, + "learning_rate": 9.082207207207207e-07, + "loss": 0.0004, + "reward": 2.984956741333008, + "reward_std": 0.16040324792265892, + "rewards/final_reward": 1.161768861024481, + "rewards/mask_iou_reward": 0.5808844305122405, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.9953732788562775, + "rewards/thk_ans_format_reward": 1.0, + "step": 326, + "think_completion_length": 60.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.0625, + "epoch": 1.1045531197301854, + "grad_norm": 8.722872084277531, + "kl": 0.4140625, + "learning_rate": 9.079391891891891e-07, + "loss": 0.0004, + "reward": 3.3815075159072876, + "reward_std": 0.2122548222541809, + "rewards/final_reward": 1.1613737494992136, + "rewards/mask_iou_reward": 0.5806868747496068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3815075755119324, + "rewards/thk_ans_format_reward": 1.0, + "step": 327, + "think_completion_length": 54.08333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.93750762939453, + "epoch": 1.1079258010118043, + "grad_norm": 6.552934117656638, + "kl": 0.3876953125, + "learning_rate": 9.076576576576577e-07, + "loss": 0.0004, + "reward": 3.2308026552200317, + "reward_std": 0.32459303736686707, + "rewards/final_reward": 0.9769021255318333, + "rewards/mask_iou_reward": 0.48845106276591665, + "rewards/sam_format_reward": 0.9270833432674408, + "rewards/sam_reward_func_ultra": 1.3037192821502686, + "rewards/thk_ans_format_reward": 1.0, + "step": 328, + "think_completion_length": 58.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.34375, + "epoch": 1.1112984822934233, + "grad_norm": 5.333888749074564, + "kl": 0.40625, + "learning_rate": 9.073761261261262e-07, + "loss": 0.0004, + "reward": 3.342952847480774, + "reward_std": 0.16162853688001633, + "rewards/final_reward": 1.0840490033547516, + "rewards/mask_iou_reward": 0.5420245016773758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3429526686668396, + "rewards/thk_ans_format_reward": 1.0, + "step": 329, + "think_completion_length": 55.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.8333396911621, + "epoch": 1.1146711635750421, + "grad_norm": 24.794201686691693, + "kl": 0.435546875, + "learning_rate": 9.070945945945946e-07, + "loss": 0.0005, + "reward": 3.0924460887908936, + "reward_std": 0.13998809456825256, + "rewards/final_reward": 0.839294410155661, + "rewards/mask_iou_reward": 0.4196472050778305, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1028627753257751, + "rewards/thk_ans_format_reward": 1.0, + "step": 330, + "think_completion_length": 64.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.41666793823242, + "epoch": 1.118043844856661, + "grad_norm": 14.98694410165655, + "kl": 0.3916015625, + "learning_rate": 9.068130630630631e-07, + "loss": 0.0004, + "reward": 3.1274584531784058, + "reward_std": 0.2977043092250824, + "rewards/final_reward": 0.9250505167662035, + "rewards/mask_iou_reward": 0.46252525838310177, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1378751993179321, + "rewards/thk_ans_format_reward": 1.0, + "step": 331, + "think_completion_length": 60.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.37500381469727, + "epoch": 1.12141652613828, + "grad_norm": 5.783084805803711, + "kl": 0.44921875, + "learning_rate": 9.065315315315315e-07, + "loss": 0.0004, + "reward": 2.9259408712387085, + "reward_std": 0.2695099413394928, + "rewards/final_reward": 0.5560096592874704, + "rewards/mask_iou_reward": 0.2780048296437352, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9259407222270966, + "rewards/thk_ans_format_reward": 1.0, + "step": 332, + "think_completion_length": 62.45833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8541717529297, + "epoch": 1.1247892074198989, + "grad_norm": 9.244155069198841, + "kl": 0.470703125, + "learning_rate": 9.0625e-07, + "loss": 0.0005, + "reward": 3.3988051414489746, + "reward_std": 0.14826885610818863, + "rewards/final_reward": 1.260855574702926, + "rewards/mask_iou_reward": 0.630427787351463, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3988049626350403, + "rewards/thk_ans_format_reward": 1.0, + "step": 333, + "think_completion_length": 61.45833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.73958587646484, + "epoch": 1.1281618887015177, + "grad_norm": 19.766878070326953, + "kl": 0.412109375, + "learning_rate": 9.059684684684685e-07, + "loss": 0.0004, + "reward": 3.0867655277252197, + "reward_std": 0.2322430983185768, + "rewards/final_reward": 0.8911001395752802, + "rewards/mask_iou_reward": 0.4455500697876401, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0867655277252197, + "rewards/thk_ans_format_reward": 1.0, + "step": 334, + "think_completion_length": 64.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.42708969116211, + "epoch": 1.1315345699831365, + "grad_norm": 5.298022005482819, + "kl": 0.4306640625, + "learning_rate": 9.056869369369369e-07, + "loss": 0.0004, + "reward": 3.566340446472168, + "reward_std": 0.08108654618263245, + "rewards/final_reward": 1.876243345798129, + "rewards/mask_iou_reward": 0.9381216728990645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5663405060768127, + "rewards/thk_ans_format_reward": 1.0, + "step": 335, + "think_completion_length": 61.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.68750381469727, + "epoch": 1.1349072512647556, + "grad_norm": 12.434004407283455, + "kl": 0.4384765625, + "learning_rate": 9.054054054054053e-07, + "loss": 0.0005, + "reward": 3.3265706300735474, + "reward_std": 0.23486195504665375, + "rewards/final_reward": 1.474851406697328, + "rewards/mask_iou_reward": 0.737425703348664, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3265705704689026, + "rewards/thk_ans_format_reward": 1.0, + "step": 336, + "think_completion_length": 61.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0729217529297, + "epoch": 1.1382799325463744, + "grad_norm": 7.178608067119752, + "kl": 0.533203125, + "learning_rate": 9.051238738738737e-07, + "loss": 0.0005, + "reward": 3.492894411087036, + "reward_std": 0.2055719941854477, + "rewards/final_reward": 0.990075326096736, + "rewards/mask_iou_reward": 0.495037663048368, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5033112168312073, + "rewards/thk_ans_format_reward": 1.0, + "step": 337, + "think_completion_length": 54.041666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.22917175292969, + "epoch": 1.1416526138279932, + "grad_norm": 5.280799948687187, + "kl": 0.48046875, + "learning_rate": 9.048423423423422e-07, + "loss": 0.0005, + "reward": 3.5345152616500854, + "reward_std": 0.10706453770399094, + "rewards/final_reward": 1.7453732959394208, + "rewards/mask_iou_reward": 0.8726866479697104, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5449321866035461, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 338, + "think_completion_length": 62.083333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.30208587646484, + "epoch": 1.1450252951096123, + "grad_norm": 7.207464617440506, + "kl": 0.3759765625, + "learning_rate": 9.045608108108108e-07, + "loss": 0.0004, + "reward": 2.865417242050171, + "reward_std": 0.37941035628318787, + "rewards/final_reward": 0.8771180013313473, + "rewards/mask_iou_reward": 0.43855900066567366, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8654172122478485, + "rewards/thk_ans_format_reward": 1.0, + "step": 339, + "think_completion_length": 71.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.92708587646484, + "epoch": 1.148397976391231, + "grad_norm": 11.641408008070787, + "kl": 0.4130859375, + "learning_rate": 9.042792792792792e-07, + "loss": 0.0004, + "reward": 3.2123430967330933, + "reward_std": 0.2588522955775261, + "rewards/final_reward": 1.4110095741889364, + "rewards/mask_iou_reward": 0.7055047870944682, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.21234330534935, + "rewards/thk_ans_format_reward": 1.0, + "step": 340, + "think_completion_length": 65.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.13541793823242, + "epoch": 1.15177065767285, + "grad_norm": 6.67371298694648, + "kl": 0.4033203125, + "learning_rate": 9.039977477477477e-07, + "loss": 0.0004, + "reward": 3.1938616037368774, + "reward_std": 0.16771592944860458, + "rewards/final_reward": 1.5646572712220155, + "rewards/mask_iou_reward": 0.7823286356110077, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1938616037368774, + "rewards/thk_ans_format_reward": 1.0, + "step": 341, + "think_completion_length": 62.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.16666793823242, + "epoch": 1.1551433389544687, + "grad_norm": 5.263702820720401, + "kl": 0.5166015625, + "learning_rate": 9.037162162162162e-07, + "loss": 0.0005, + "reward": 3.3237626552581787, + "reward_std": 0.17711199820041656, + "rewards/final_reward": 1.1942757440934884, + "rewards/mask_iou_reward": 0.5971378720467442, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.323762595653534, + "rewards/thk_ans_format_reward": 1.0, + "step": 342, + "think_completion_length": 58.45833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.80208587646484, + "epoch": 1.1585160202360876, + "grad_norm": 6.791804755889098, + "kl": 0.4521484375, + "learning_rate": 9.034346846846846e-07, + "loss": 0.0005, + "reward": 3.1926233768463135, + "reward_std": 0.1412236988544464, + "rewards/final_reward": 1.196448614697763, + "rewards/mask_iou_reward": 0.5982243073488815, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1926233768463135, + "rewards/thk_ans_format_reward": 1.0, + "step": 343, + "think_completion_length": 61.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.14583587646484, + "epoch": 1.1618887015177066, + "grad_norm": 6.534097465373398, + "kl": 0.4228515625, + "learning_rate": 9.031531531531531e-07, + "loss": 0.0004, + "reward": 2.971271514892578, + "reward_std": 0.3797031044960022, + "rewards/final_reward": 1.2611627610058478, + "rewards/mask_iou_reward": 0.6305813805029239, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.9816881716251373, + "rewards/thk_ans_format_reward": 1.0, + "step": 344, + "think_completion_length": 65.29166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.53125381469727, + "epoch": 1.1652613827993255, + "grad_norm": 7.74065202517212, + "kl": 0.46875, + "learning_rate": 9.028716216216215e-07, + "loss": 0.0005, + "reward": 3.186206817626953, + "reward_std": 0.11258535459637642, + "rewards/final_reward": 1.2044975491309748, + "rewards/mask_iou_reward": 0.6022487745654874, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1862066984176636, + "rewards/thk_ans_format_reward": 1.0, + "step": 345, + "think_completion_length": 64.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3229217529297, + "epoch": 1.1686340640809443, + "grad_norm": 6.48305370335897, + "kl": 0.4521484375, + "learning_rate": 9.0259009009009e-07, + "loss": 0.0005, + "reward": 3.4441089630126953, + "reward_std": 0.23721785843372345, + "rewards/final_reward": 1.1137867720324555, + "rewards/mask_iou_reward": 0.5568933860162277, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4441088438034058, + "rewards/thk_ans_format_reward": 1.0, + "step": 346, + "think_completion_length": 66.20833333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.98958587646484, + "epoch": 1.1720067453625633, + "grad_norm": 11.63205530379195, + "kl": 0.4541015625, + "learning_rate": 9.023085585585585e-07, + "loss": 0.0005, + "reward": 3.355055093765259, + "reward_std": 0.1520690880715847, + "rewards/final_reward": 0.31849268904859007, + "rewards/mask_iou_reward": 0.15924634452429504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3550548553466797, + "rewards/thk_ans_format_reward": 1.0, + "step": 347, + "think_completion_length": 67.04166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.09375381469727, + "epoch": 1.1753794266441822, + "grad_norm": 6.91360662733961, + "kl": 0.4638671875, + "learning_rate": 9.020270270270269e-07, + "loss": 0.0005, + "reward": 3.23860239982605, + "reward_std": 0.09308822453022003, + "rewards/final_reward": 0.7723889578444937, + "rewards/mask_iou_reward": 0.38619447892224684, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.238602489233017, + "rewards/thk_ans_format_reward": 1.0, + "step": 348, + "think_completion_length": 58.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.98958587646484, + "epoch": 1.178752107925801, + "grad_norm": 6.787137639312774, + "kl": 0.400390625, + "learning_rate": 9.017454954954955e-07, + "loss": 0.0005, + "reward": 3.483386754989624, + "reward_std": 0.15565379709005356, + "rewards/final_reward": 1.2750090210076546, + "rewards/mask_iou_reward": 0.6375045105038273, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4833868145942688, + "rewards/thk_ans_format_reward": 1.0, + "step": 349, + "think_completion_length": 60.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.92708587646484, + "epoch": 1.1821247892074198, + "grad_norm": 6.595761566501617, + "kl": 0.4248046875, + "learning_rate": 9.014639639639639e-07, + "loss": 0.0004, + "reward": 3.601720929145813, + "reward_std": 0.10276154428720474, + "rewards/final_reward": 1.8197904926380537, + "rewards/mask_iou_reward": 0.9098952463190269, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6017211079597473, + "rewards/thk_ans_format_reward": 1.0, + "step": 350, + "think_completion_length": 52.708333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.72916793823242, + "epoch": 1.1854974704890389, + "grad_norm": 19.507311166473414, + "kl": 0.416015625, + "learning_rate": 9.011824324324324e-07, + "loss": 0.0004, + "reward": 3.3562408685684204, + "reward_std": 0.13687966763973236, + "rewards/final_reward": 1.2024907054821465, + "rewards/mask_iou_reward": 0.6012453527410733, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3562407493591309, + "rewards/thk_ans_format_reward": 1.0, + "step": 351, + "think_completion_length": 60.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.875, + "epoch": 1.1888701517706577, + "grad_norm": 19.857489845731262, + "kl": 0.47265625, + "learning_rate": 9.009009009009009e-07, + "loss": 0.0005, + "reward": 3.7000582218170166, + "reward_std": 0.07116364687681198, + "rewards/final_reward": 1.3070988339251544, + "rewards/mask_iou_reward": 0.6535494169625772, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.700058102607727, + "rewards/thk_ans_format_reward": 1.0, + "step": 352, + "think_completion_length": 64.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.48958587646484, + "epoch": 1.1922428330522765, + "grad_norm": 8.064821636428672, + "kl": 0.45703125, + "learning_rate": 9.006193693693693e-07, + "loss": 0.0004, + "reward": 3.244594097137451, + "reward_std": 0.16334939748048782, + "rewards/final_reward": 1.0210304897324536, + "rewards/mask_iou_reward": 0.5105152448662268, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2445939779281616, + "rewards/thk_ans_format_reward": 1.0, + "step": 353, + "think_completion_length": 60.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.05208969116211, + "epoch": 1.1956155143338956, + "grad_norm": 37.25785884236522, + "kl": 0.455078125, + "learning_rate": 9.003378378378378e-07, + "loss": 0.0005, + "reward": 3.482452630996704, + "reward_std": 0.17858774214982986, + "rewards/final_reward": 0.9796232169947886, + "rewards/mask_iou_reward": 0.4898116084973943, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4824528098106384, + "rewards/thk_ans_format_reward": 1.0, + "step": 354, + "think_completion_length": 67.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.69792175292969, + "epoch": 1.1989881956155144, + "grad_norm": 6.518708947918132, + "kl": 0.4267578125, + "learning_rate": 9.000563063063062e-07, + "loss": 0.0004, + "reward": 3.297169804573059, + "reward_std": 0.27533242851495743, + "rewards/final_reward": 1.3079855408483727, + "rewards/mask_iou_reward": 0.6539927704241864, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.297169804573059, + "rewards/thk_ans_format_reward": 1.0, + "step": 355, + "think_completion_length": 55.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.62500381469727, + "epoch": 1.2023608768971332, + "grad_norm": 6.487089048133617, + "kl": 0.4150390625, + "learning_rate": 8.997747747747747e-07, + "loss": 0.0004, + "reward": 3.531155824661255, + "reward_std": 0.08609841391444206, + "rewards/final_reward": 1.8262309782537098, + "rewards/mask_iou_reward": 0.9131154891268549, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5311556458473206, + "rewards/thk_ans_format_reward": 1.0, + "step": 356, + "think_completion_length": 57.666666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5208396911621, + "epoch": 1.205733558178752, + "grad_norm": 12.991734029654161, + "kl": 0.388671875, + "learning_rate": 8.994932432432432e-07, + "loss": 0.0004, + "reward": 3.5364201068878174, + "reward_std": 0.10052265971899033, + "rewards/final_reward": 0.9239991930084288, + "rewards/mask_iou_reward": 0.4619995965042144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5364200472831726, + "rewards/thk_ans_format_reward": 1.0, + "step": 357, + "think_completion_length": 59.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.96875381469727, + "epoch": 1.2091062394603709, + "grad_norm": 13.287657453364371, + "kl": 0.3974609375, + "learning_rate": 8.992117117117116e-07, + "loss": 0.0004, + "reward": 3.169158458709717, + "reward_std": 0.18534202873706818, + "rewards/final_reward": 0.9565178424519037, + "rewards/mask_iou_reward": 0.47825892122595187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.16915825009346, + "rewards/thk_ans_format_reward": 1.0, + "step": 358, + "think_completion_length": 59.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.81250762939453, + "epoch": 1.21247892074199, + "grad_norm": 7.958957052398867, + "kl": 0.4306640625, + "learning_rate": 8.989301801801802e-07, + "loss": 0.0004, + "reward": 2.9273035526275635, + "reward_std": 0.22559326887130737, + "rewards/final_reward": 1.1544018769118864, + "rewards/mask_iou_reward": 0.5772009384559432, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9273033142089844, + "rewards/thk_ans_format_reward": 1.0, + "step": 359, + "think_completion_length": 60.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.06250381469727, + "epoch": 1.2158516020236088, + "grad_norm": 26.49649303527819, + "kl": 0.431640625, + "learning_rate": 8.986486486486487e-07, + "loss": 0.0004, + "reward": 3.3369808197021484, + "reward_std": 0.21792490035295486, + "rewards/final_reward": 0.8552900644315113, + "rewards/mask_iou_reward": 0.42764503221575567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3369808793067932, + "rewards/thk_ans_format_reward": 1.0, + "step": 360, + "think_completion_length": 64.29166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 1.2192242833052276, + "grad_norm": 8.376718706661617, + "kl": 0.40625, + "learning_rate": 8.983671171171171e-07, + "loss": 0.0004, + "reward": 3.4866225719451904, + "reward_std": 0.11170128360390663, + "rewards/final_reward": 1.5778857509655904, + "rewards/mask_iou_reward": 0.7889428754827952, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4866225719451904, + "rewards/thk_ans_format_reward": 1.0, + "step": 361, + "think_completion_length": 61.833333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.11458587646484, + "epoch": 1.2225969645868466, + "grad_norm": 9.68698005550589, + "kl": 0.375, + "learning_rate": 8.980855855855856e-07, + "loss": 0.0004, + "reward": 3.416308283805847, + "reward_std": 0.19030694663524628, + "rewards/final_reward": 0.9690831250824701, + "rewards/mask_iou_reward": 0.48454156254123504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4163082838058472, + "rewards/thk_ans_format_reward": 1.0, + "step": 362, + "think_completion_length": 60.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.96875381469727, + "epoch": 1.2259696458684655, + "grad_norm": 5.466876208748925, + "kl": 0.3701171875, + "learning_rate": 8.97804054054054e-07, + "loss": 0.0004, + "reward": 3.083742141723633, + "reward_std": 0.2588346600532532, + "rewards/final_reward": 1.0219780085731855, + "rewards/mask_iou_reward": 0.5109890042865928, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0837420225143433, + "rewards/thk_ans_format_reward": 1.0, + "step": 363, + "think_completion_length": 57.833333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.04167175292969, + "epoch": 1.2293423271500843, + "grad_norm": 6.13962734112624, + "kl": 0.4306640625, + "learning_rate": 8.975225225225225e-07, + "loss": 0.0004, + "reward": 3.355513334274292, + "reward_std": 0.11050765588879585, + "rewards/final_reward": 1.684858807403022, + "rewards/mask_iou_reward": 0.842429403701511, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.355513334274292, + "rewards/thk_ans_format_reward": 1.0, + "step": 364, + "think_completion_length": 62.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.09375, + "epoch": 1.2327150084317031, + "grad_norm": 7.982760324456483, + "kl": 0.42578125, + "learning_rate": 8.97240990990991e-07, + "loss": 0.0004, + "reward": 3.561036229133606, + "reward_std": 0.1595241203904152, + "rewards/final_reward": 1.6920156046347539, + "rewards/mask_iou_reward": 0.8460078023173769, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5610363483428955, + "rewards/thk_ans_format_reward": 1.0, + "step": 365, + "think_completion_length": 58.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.54166793823242, + "epoch": 1.2360876897133222, + "grad_norm": 6.512397630777975, + "kl": 0.4765625, + "learning_rate": 8.969594594594594e-07, + "loss": 0.0005, + "reward": 3.0696985721588135, + "reward_std": 0.1708909571170807, + "rewards/final_reward": 1.20217084539104, + "rewards/mask_iou_reward": 0.60108542269552, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0696982145309448, + "rewards/thk_ans_format_reward": 1.0, + "step": 366, + "think_completion_length": 60.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.29166793823242, + "epoch": 1.239460370994941, + "grad_norm": 6.0760802820191415, + "kl": 0.419921875, + "learning_rate": 8.966779279279279e-07, + "loss": 0.0004, + "reward": 3.317044496536255, + "reward_std": 0.18018481880426407, + "rewards/final_reward": 0.9774291667228298, + "rewards/mask_iou_reward": 0.4887145833614149, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3170446157455444, + "rewards/thk_ans_format_reward": 1.0, + "step": 367, + "think_completion_length": 73.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.18750762939453, + "epoch": 1.2428330522765598, + "grad_norm": 19.438760787888953, + "kl": 0.4072265625, + "learning_rate": 8.963963963963963e-07, + "loss": 0.0004, + "reward": 2.8064730167388916, + "reward_std": 0.15254508703947067, + "rewards/final_reward": 1.2937061810125203, + "rewards/mask_iou_reward": 0.6468530905062602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8064730167388916, + "rewards/thk_ans_format_reward": 1.0, + "step": 368, + "think_completion_length": 58.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.65625762939453, + "epoch": 1.2462057335581789, + "grad_norm": 5.73307642782275, + "kl": 0.4267578125, + "learning_rate": 8.961148648648649e-07, + "loss": 0.0004, + "reward": 3.4007943868637085, + "reward_std": 0.13911010324954987, + "rewards/final_reward": 1.4890716395653536, + "rewards/mask_iou_reward": 0.7445358197826768, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4007944464683533, + "rewards/thk_ans_format_reward": 1.0, + "step": 369, + "think_completion_length": 58.41666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6770896911621, + "epoch": 1.2495784148397977, + "grad_norm": 9.536909751789882, + "kl": 0.408203125, + "learning_rate": 8.958333333333334e-07, + "loss": 0.0004, + "reward": 2.7433794736862183, + "reward_std": 0.3473154753446579, + "rewards/final_reward": 0.42417780778787956, + "rewards/mask_iou_reward": 0.21208890389393978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7433794736862183, + "rewards/thk_ans_format_reward": 1.0, + "step": 370, + "think_completion_length": 70.79166666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.17708587646484, + "epoch": 1.2529510961214165, + "grad_norm": 7.0441708924637565, + "kl": 0.380859375, + "learning_rate": 8.955518018018018e-07, + "loss": 0.0004, + "reward": 3.249468207359314, + "reward_std": 0.19624735042452812, + "rewards/final_reward": 0.7434260286837571, + "rewards/mask_iou_reward": 0.37171301434187853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2494682669639587, + "rewards/thk_ans_format_reward": 1.0, + "step": 371, + "think_completion_length": 62.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.3645896911621, + "epoch": 1.2563237774030354, + "grad_norm": 9.238593494156854, + "kl": 0.38671875, + "learning_rate": 8.952702702702703e-07, + "loss": 0.0003, + "reward": 3.3437212705612183, + "reward_std": 0.3516358807682991, + "rewards/final_reward": 1.4175673642321096, + "rewards/mask_iou_reward": 0.7087836821160548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3437212705612183, + "rewards/thk_ans_format_reward": 1.0, + "step": 372, + "think_completion_length": 62.291666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.90625, + "epoch": 1.2596964586846542, + "grad_norm": 11.358320745996085, + "kl": 0.48828125, + "learning_rate": 8.949887387387387e-07, + "loss": 0.0005, + "reward": 3.284479856491089, + "reward_std": 0.18969615548849106, + "rewards/final_reward": 1.4953732471384509, + "rewards/mask_iou_reward": 0.7476866235692254, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2844797372817993, + "rewards/thk_ans_format_reward": 1.0, + "step": 373, + "think_completion_length": 59.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.84375, + "epoch": 1.2630691399662732, + "grad_norm": 7.151983483918055, + "kl": 0.3818359375, + "learning_rate": 8.947072072072072e-07, + "loss": 0.0004, + "reward": 3.1946144104003906, + "reward_std": 0.11458705738186836, + "rewards/final_reward": 1.103767047410554, + "rewards/mask_iou_reward": 0.551883523705277, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1946144104003906, + "rewards/thk_ans_format_reward": 1.0, + "step": 374, + "think_completion_length": 56.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.03125, + "epoch": 1.266441821247892, + "grad_norm": 9.071646300744112, + "kl": 0.4267578125, + "learning_rate": 8.944256756756756e-07, + "loss": 0.0004, + "reward": 2.8307807445526123, + "reward_std": 0.15931977331638336, + "rewards/final_reward": 0.8759926112551624, + "rewards/mask_iou_reward": 0.4379963056275812, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8307807892560959, + "rewards/thk_ans_format_reward": 1.0, + "step": 375, + "think_completion_length": 55.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.02083587646484, + "epoch": 1.269814502529511, + "grad_norm": 16.160925277891703, + "kl": 0.4169921875, + "learning_rate": 8.94144144144144e-07, + "loss": 0.0005, + "reward": 3.142805814743042, + "reward_std": 0.11035696789622307, + "rewards/final_reward": 1.6712795075685372, + "rewards/mask_iou_reward": 0.8356397537842686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1428057253360748, + "rewards/thk_ans_format_reward": 1.0, + "step": 376, + "think_completion_length": 53.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.34375381469727, + "epoch": 1.27318718381113, + "grad_norm": 5.971996512155509, + "kl": 0.427734375, + "learning_rate": 8.938626126126125e-07, + "loss": 0.0004, + "reward": 2.9981162548065186, + "reward_std": 0.093520887196064, + "rewards/final_reward": 0.7963105389822663, + "rewards/mask_iou_reward": 0.39815526949113317, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.998116135597229, + "rewards/thk_ans_format_reward": 1.0, + "step": 377, + "think_completion_length": 55.54166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.36458587646484, + "epoch": 1.2765598650927488, + "grad_norm": 10.43167191450742, + "kl": 0.3994140625, + "learning_rate": 8.93581081081081e-07, + "loss": 0.0004, + "reward": 3.275425672531128, + "reward_std": 0.11471785977482796, + "rewards/final_reward": 1.7041599415896092, + "rewards/mask_iou_reward": 0.8520799707948046, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2754257917404175, + "rewards/thk_ans_format_reward": 1.0, + "step": 378, + "think_completion_length": 58.208333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.95833587646484, + "epoch": 1.2799325463743676, + "grad_norm": 11.154206440284236, + "kl": 0.369140625, + "learning_rate": 8.932995495495495e-07, + "loss": 0.0004, + "reward": 3.4361212253570557, + "reward_std": 0.11556711047887802, + "rewards/final_reward": 1.3823572230237666, + "rewards/mask_iou_reward": 0.6911786115118833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4361212849617004, + "rewards/thk_ans_format_reward": 1.0, + "step": 379, + "think_completion_length": 65.16666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.03125381469727, + "epoch": 1.2833052276559864, + "grad_norm": 5.38958933410073, + "kl": 0.3984375, + "learning_rate": 8.93018018018018e-07, + "loss": 0.0004, + "reward": 3.418978691101074, + "reward_std": 0.22982808575034142, + "rewards/final_reward": 1.6648917460641777, + "rewards/mask_iou_reward": 0.8324458730320888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.41897851228714, + "rewards/thk_ans_format_reward": 1.0, + "step": 380, + "think_completion_length": 55.333333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0625, + "epoch": 1.2866779089376055, + "grad_norm": 14.204627852713159, + "kl": 0.466796875, + "learning_rate": 8.927364864864864e-07, + "loss": 0.0005, + "reward": 3.205819845199585, + "reward_std": 0.17886455357074738, + "rewards/final_reward": 1.0723750964434002, + "rewards/mask_iou_reward": 0.5361875482217001, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2058197855949402, + "rewards/thk_ans_format_reward": 1.0, + "step": 381, + "think_completion_length": 56.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.11458969116211, + "epoch": 1.2900505902192243, + "grad_norm": 7.3675402122446085, + "kl": 0.37890625, + "learning_rate": 8.924549549549549e-07, + "loss": 0.0004, + "reward": 3.1716625690460205, + "reward_std": 0.17073575779795647, + "rewards/final_reward": 1.919256260742495, + "rewards/mask_iou_reward": 0.9596281303712475, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1716625690460205, + "rewards/thk_ans_format_reward": 1.0, + "step": 382, + "think_completion_length": 62.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.0208396911621, + "epoch": 1.2934232715008431, + "grad_norm": 15.221231003902952, + "kl": 0.341796875, + "learning_rate": 8.921734234234234e-07, + "loss": 0.0003, + "reward": 3.087328314781189, + "reward_std": 0.1399368941783905, + "rewards/final_reward": 0.9321140483293636, + "rewards/mask_iou_reward": 0.4660570241646818, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0873282551765442, + "rewards/thk_ans_format_reward": 1.0, + "step": 383, + "think_completion_length": 59.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.37500381469727, + "epoch": 1.2967959527824622, + "grad_norm": 5.2845428955702785, + "kl": 0.3779296875, + "learning_rate": 8.918918918918918e-07, + "loss": 0.0004, + "reward": 2.6019227504730225, + "reward_std": 0.1827988475561142, + "rewards/final_reward": 0.28128574034548853, + "rewards/mask_iou_reward": 0.14064287017274427, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.6019226759672165, + "rewards/thk_ans_format_reward": 1.0, + "step": 384, + "think_completion_length": 59.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.87500381469727, + "epoch": 1.300168634064081, + "grad_norm": 8.713017994196841, + "kl": 0.6875, + "learning_rate": 8.916103603603603e-07, + "loss": 0.0007, + "reward": 2.8522579669952393, + "reward_std": 0.2745845168828964, + "rewards/final_reward": 0.7205070515589428, + "rewards/mask_iou_reward": 0.3602535257794714, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8522579073905945, + "rewards/thk_ans_format_reward": 1.0, + "step": 385, + "think_completion_length": 58.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.27083587646484, + "epoch": 1.3035413153456998, + "grad_norm": 4.3399686330782785, + "kl": 0.416015625, + "learning_rate": 8.913288288288287e-07, + "loss": 0.0004, + "reward": 2.8038978576660156, + "reward_std": 0.20723021775484085, + "rewards/final_reward": 1.0406138061752164, + "rewards/mask_iou_reward": 0.5203069030876082, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8038977980613708, + "rewards/thk_ans_format_reward": 1.0, + "step": 386, + "think_completion_length": 56.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.17708587646484, + "epoch": 1.3069139966273187, + "grad_norm": 15.658706712906872, + "kl": 0.3671875, + "learning_rate": 8.910472972972972e-07, + "loss": 0.0004, + "reward": 3.316901206970215, + "reward_std": 0.1013756264001131, + "rewards/final_reward": 1.9059538432217367, + "rewards/mask_iou_reward": 0.9529769216108683, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3169008493423462, + "rewards/thk_ans_format_reward": 1.0, + "step": 387, + "think_completion_length": 52.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.92708587646484, + "epoch": 1.3102866779089375, + "grad_norm": 9.430723989126525, + "kl": 0.4130859375, + "learning_rate": 8.907657657657657e-07, + "loss": 0.0004, + "reward": 3.4749109745025635, + "reward_std": 0.08457119390368462, + "rewards/final_reward": 1.693266615218402, + "rewards/mask_iou_reward": 0.846633307609201, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4749109148979187, + "rewards/thk_ans_format_reward": 1.0, + "step": 388, + "think_completion_length": 52.916666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.30208587646484, + "epoch": 1.3136593591905565, + "grad_norm": 39.06175249555972, + "kl": 0.349609375, + "learning_rate": 8.904842342342342e-07, + "loss": 0.0004, + "reward": 3.163516402244568, + "reward_std": 0.09743357449769974, + "rewards/final_reward": 1.123364980192178, + "rewards/mask_iou_reward": 0.561682490096089, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1635163724422455, + "rewards/thk_ans_format_reward": 1.0, + "step": 389, + "think_completion_length": 56.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.1354217529297, + "epoch": 1.3170320404721754, + "grad_norm": 6.782655594229362, + "kl": 0.400390625, + "learning_rate": 8.902027027027027e-07, + "loss": 0.0004, + "reward": 3.012680411338806, + "reward_std": 0.22783783078193665, + "rewards/final_reward": 0.8515693866925474, + "rewards/mask_iou_reward": 0.4257846933462737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0126804113388062, + "rewards/thk_ans_format_reward": 1.0, + "step": 390, + "think_completion_length": 56.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.89583587646484, + "epoch": 1.3204047217537942, + "grad_norm": 14.255778869795074, + "kl": 0.470703125, + "learning_rate": 8.899211711711711e-07, + "loss": 0.0005, + "reward": 3.1088274717330933, + "reward_std": 0.25957299768924713, + "rewards/final_reward": 1.1813193456230515, + "rewards/mask_iou_reward": 0.5906596728115258, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1088275015354156, + "rewards/thk_ans_format_reward": 1.0, + "step": 391, + "think_completion_length": 50.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.12500381469727, + "epoch": 1.3237774030354132, + "grad_norm": 21.896006193903432, + "kl": 0.39453125, + "learning_rate": 8.896396396396396e-07, + "loss": 0.0004, + "reward": 2.811727285385132, + "reward_std": 0.24812977015972137, + "rewards/final_reward": 0.6650224283784837, + "rewards/mask_iou_reward": 0.33251121418924184, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8117272257804871, + "rewards/thk_ans_format_reward": 1.0, + "step": 392, + "think_completion_length": 45.833333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.71875381469727, + "epoch": 1.327150084317032, + "grad_norm": 12.972249422550195, + "kl": 0.4287109375, + "learning_rate": 8.893581081081081e-07, + "loss": 0.0004, + "reward": 3.5639950037002563, + "reward_std": 0.1949900984764099, + "rewards/final_reward": 1.6489261938244277, + "rewards/mask_iou_reward": 0.8244630969122139, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5639949440956116, + "rewards/thk_ans_format_reward": 1.0, + "step": 393, + "think_completion_length": 54.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.04166793823242, + "epoch": 1.330522765598651, + "grad_norm": 5.648364619261588, + "kl": 0.390625, + "learning_rate": 8.890765765765765e-07, + "loss": 0.0004, + "reward": 3.349947929382324, + "reward_std": 0.17918875813484192, + "rewards/final_reward": 0.6127089105917438, + "rewards/mask_iou_reward": 0.3063544552958719, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.34994775056839, + "rewards/thk_ans_format_reward": 1.0, + "step": 394, + "think_completion_length": 59.04166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.28125, + "epoch": 1.3338954468802697, + "grad_norm": 6.458264633868568, + "kl": 0.4365234375, + "learning_rate": 8.88795045045045e-07, + "loss": 0.0004, + "reward": 3.2618488073349, + "reward_std": 0.09879514575004578, + "rewards/final_reward": 1.8915855231564893, + "rewards/mask_iou_reward": 0.9457927615782447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2618490755558014, + "rewards/thk_ans_format_reward": 1.0, + "step": 395, + "think_completion_length": 51.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.5625, + "epoch": 1.3372681281618888, + "grad_norm": 21.847456737133793, + "kl": 0.388671875, + "learning_rate": 8.885135135135135e-07, + "loss": 0.0004, + "reward": 3.172235608100891, + "reward_std": 0.17507488653063774, + "rewards/final_reward": 1.7546129488764217, + "rewards/mask_iou_reward": 0.8773064744382109, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1722354888916016, + "rewards/thk_ans_format_reward": 1.0, + "step": 396, + "think_completion_length": 61.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.83333587646484, + "epoch": 1.3406408094435076, + "grad_norm": 10.361193095562697, + "kl": 0.3955078125, + "learning_rate": 8.882319819819819e-07, + "loss": 0.0004, + "reward": 3.3426592350006104, + "reward_std": 0.05589485540986061, + "rewards/final_reward": 1.4442277286890854, + "rewards/mask_iou_reward": 0.7221138643445427, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3426590859889984, + "rewards/thk_ans_format_reward": 1.0, + "step": 397, + "think_completion_length": 53.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.94791793823242, + "epoch": 1.3440134907251264, + "grad_norm": 6.465458092344196, + "kl": 0.435546875, + "learning_rate": 8.879504504504504e-07, + "loss": 0.0004, + "reward": 3.2610703706741333, + "reward_std": 0.18213983997702599, + "rewards/final_reward": 1.2578534767851406, + "rewards/mask_iou_reward": 0.6289267383925703, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2610703706741333, + "rewards/thk_ans_format_reward": 1.0, + "step": 398, + "think_completion_length": 61.041666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.26042175292969, + "epoch": 1.3473861720067455, + "grad_norm": 4.881478330864418, + "kl": 0.365234375, + "learning_rate": 8.876689189189189e-07, + "loss": 0.0004, + "reward": 2.9544575214385986, + "reward_std": 0.2750149741768837, + "rewards/final_reward": 1.321201826779002, + "rewards/mask_iou_reward": 0.660600913389501, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9544573724269867, + "rewards/thk_ans_format_reward": 1.0, + "step": 399, + "think_completion_length": 49.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.36458587646484, + "epoch": 1.3507588532883643, + "grad_norm": 9.08727978460178, + "kl": 0.3720703125, + "learning_rate": 8.873873873873874e-07, + "loss": 0.0004, + "reward": 3.395462989807129, + "reward_std": 0.1573108658194542, + "rewards/final_reward": 1.591655385445944, + "rewards/mask_iou_reward": 0.795827692722972, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3954631090164185, + "rewards/thk_ans_format_reward": 1.0, + "step": 400, + "think_completion_length": 56.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.89583587646484, + "epoch": 1.3541315345699831, + "grad_norm": 10.98203028725922, + "kl": 0.490234375, + "learning_rate": 8.871058558558559e-07, + "loss": 0.0005, + "reward": 3.2958803176879883, + "reward_std": 0.29315295070409775, + "rewards/final_reward": 1.8774677527947858, + "rewards/mask_iou_reward": 0.9387338763973929, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.295880138874054, + "rewards/thk_ans_format_reward": 1.0, + "step": 401, + "think_completion_length": 53.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.36458587646484, + "epoch": 1.357504215851602, + "grad_norm": 10.763475285844352, + "kl": 0.46875, + "learning_rate": 8.868243243243243e-07, + "loss": 0.0005, + "reward": 3.1025447845458984, + "reward_std": 0.1851225420832634, + "rewards/final_reward": 1.1530047529999958, + "rewards/mask_iou_reward": 0.5765023764999979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1025445461273193, + "rewards/thk_ans_format_reward": 1.0, + "step": 402, + "think_completion_length": 59.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.30208587646484, + "epoch": 1.3608768971332208, + "grad_norm": 13.295790055078449, + "kl": 0.4345703125, + "learning_rate": 8.865427927927928e-07, + "loss": 0.0004, + "reward": 3.2150758504867554, + "reward_std": 0.2421677317470312, + "rewards/final_reward": 0.4944866514919177, + "rewards/mask_iou_reward": 0.24724332574595884, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2150757312774658, + "rewards/thk_ans_format_reward": 1.0, + "step": 403, + "think_completion_length": 54.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.03125381469727, + "epoch": 1.3642495784148398, + "grad_norm": 15.86535809741585, + "kl": 0.4970703125, + "learning_rate": 8.862612612612612e-07, + "loss": 0.0005, + "reward": 2.974599838256836, + "reward_std": 0.1448173001408577, + "rewards/final_reward": 0.9453635419360841, + "rewards/mask_iou_reward": 0.47268177096804204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9745997190475464, + "rewards/thk_ans_format_reward": 1.0, + "step": 404, + "think_completion_length": 55.33333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.80208587646484, + "epoch": 1.3676222596964587, + "grad_norm": 23.773005695655176, + "kl": 0.4541015625, + "learning_rate": 8.859797297297297e-07, + "loss": 0.0005, + "reward": 3.1880345344543457, + "reward_std": 0.18816696107387543, + "rewards/final_reward": 1.3801016130752197, + "rewards/mask_iou_reward": 0.6900508065376099, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.18803471326828, + "rewards/thk_ans_format_reward": 1.0, + "step": 405, + "think_completion_length": 50.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.12500381469727, + "epoch": 1.3709949409780775, + "grad_norm": 5.653160672220901, + "kl": 0.4296875, + "learning_rate": 8.856981981981982e-07, + "loss": 0.0005, + "reward": 3.32892107963562, + "reward_std": 0.15366527438163757, + "rewards/final_reward": 1.3260781944652618, + "rewards/mask_iou_reward": 0.6630390972326309, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.328921139240265, + "rewards/thk_ans_format_reward": 1.0, + "step": 406, + "think_completion_length": 51.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.35416793823242, + "epoch": 1.3743676222596966, + "grad_norm": 29.17112442936424, + "kl": 0.4013671875, + "learning_rate": 8.854166666666666e-07, + "loss": 0.0004, + "reward": 3.4100879430770874, + "reward_std": 0.23499078676104546, + "rewards/final_reward": 0.8420019533822547, + "rewards/mask_iou_reward": 0.42100097669112735, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4100881218910217, + "rewards/thk_ans_format_reward": 1.0, + "step": 407, + "think_completion_length": 53.958333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.83333969116211, + "epoch": 1.3777403035413154, + "grad_norm": 27.863976318871792, + "kl": 0.427734375, + "learning_rate": 8.851351351351351e-07, + "loss": 0.0005, + "reward": 3.4160239696502686, + "reward_std": 0.13645297288894653, + "rewards/final_reward": 0.8744746874887974, + "rewards/mask_iou_reward": 0.4372373437443987, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.416023850440979, + "rewards/thk_ans_format_reward": 1.0, + "step": 408, + "think_completion_length": 60.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.44791793823242, + "epoch": 1.3811129848229342, + "grad_norm": 9.56224631051352, + "kl": 0.400390625, + "learning_rate": 8.848536036036037e-07, + "loss": 0.0004, + "reward": 2.9102158546447754, + "reward_std": 0.17862200736999512, + "rewards/final_reward": 0.8199508471048658, + "rewards/mask_iou_reward": 0.4099754235524329, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9102159142494202, + "rewards/thk_ans_format_reward": 1.0, + "step": 409, + "think_completion_length": 45.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.06250381469727, + "epoch": 1.384485666104553, + "grad_norm": 6.5133688824628555, + "kl": 0.580078125, + "learning_rate": 8.845720720720721e-07, + "loss": 0.0006, + "reward": 3.3277347087860107, + "reward_std": 0.1092943362891674, + "rewards/final_reward": 1.4457946634446563, + "rewards/mask_iou_reward": 0.7228973317223282, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.327734887599945, + "rewards/thk_ans_format_reward": 1.0, + "step": 410, + "think_completion_length": 51.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.61458587646484, + "epoch": 1.387858347386172, + "grad_norm": 8.819736571295985, + "kl": 0.4853515625, + "learning_rate": 8.842905405405406e-07, + "loss": 0.0005, + "reward": 3.330057144165039, + "reward_std": 0.1943942978978157, + "rewards/final_reward": 1.2174243849477313, + "rewards/mask_iou_reward": 0.6087121924738657, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3300570249557495, + "rewards/thk_ans_format_reward": 1.0, + "step": 411, + "think_completion_length": 49.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.32291793823242, + "epoch": 1.391231028667791, + "grad_norm": 5.187668148870701, + "kl": 0.466796875, + "learning_rate": 8.84009009009009e-07, + "loss": 0.0005, + "reward": 3.494489073753357, + "reward_std": 0.057365935295820236, + "rewards/final_reward": 1.885988405475033, + "rewards/mask_iou_reward": 0.9429942027375166, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.494489073753357, + "rewards/thk_ans_format_reward": 1.0, + "step": 412, + "think_completion_length": 47.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.41667175292969, + "epoch": 1.3946037099494097, + "grad_norm": 7.520342801689036, + "kl": 0.5849609375, + "learning_rate": 8.837274774774775e-07, + "loss": 0.0006, + "reward": 3.332287549972534, + "reward_std": 0.2603805884718895, + "rewards/final_reward": 1.2709260084128842, + "rewards/mask_iou_reward": 0.6354630042064421, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3322875499725342, + "rewards/thk_ans_format_reward": 1.0, + "step": 413, + "think_completion_length": 50.95833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.57292175292969, + "epoch": 1.3979763912310288, + "grad_norm": 33.403308964345825, + "kl": 0.4951171875, + "learning_rate": 8.83445945945946e-07, + "loss": 0.0005, + "reward": 3.364820122718811, + "reward_std": 0.1603723168373108, + "rewards/final_reward": 1.4100766007692036, + "rewards/mask_iou_reward": 0.7050383003846018, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.364820122718811, + "rewards/thk_ans_format_reward": 1.0, + "step": 414, + "think_completion_length": 38.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.10416793823242, + "epoch": 1.4013490725126476, + "grad_norm": 5.862994402404749, + "kl": 0.47265625, + "learning_rate": 8.831644144144143e-07, + "loss": 0.0005, + "reward": 3.317633271217346, + "reward_std": 0.08006502967327833, + "rewards/final_reward": 1.4176781291376375, + "rewards/mask_iou_reward": 0.7088390645688187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.317633032798767, + "rewards/thk_ans_format_reward": 1.0, + "step": 415, + "think_completion_length": 47.08333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.125, + "epoch": 1.4047217537942664, + "grad_norm": 18.78986909256566, + "kl": 0.494140625, + "learning_rate": 8.828828828828828e-07, + "loss": 0.0005, + "reward": 3.111713409423828, + "reward_std": 0.16862037405371666, + "rewards/final_reward": 1.5124446319151947, + "rewards/mask_iou_reward": 0.7562223159575974, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1117135286331177, + "rewards/thk_ans_format_reward": 1.0, + "step": 416, + "think_completion_length": 45.45833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.50000381469727, + "epoch": 1.4080944350758853, + "grad_norm": 6.739482714526197, + "kl": 0.482421875, + "learning_rate": 8.826013513513512e-07, + "loss": 0.0006, + "reward": 3.124404549598694, + "reward_std": 0.19819872826337814, + "rewards/final_reward": 0.4775836903808357, + "rewards/mask_iou_reward": 0.23879184519041785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1244046092033386, + "rewards/thk_ans_format_reward": 1.0, + "step": 417, + "think_completion_length": 42.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.61458587646484, + "epoch": 1.411467116357504, + "grad_norm": 5.99580145734765, + "kl": 0.4814453125, + "learning_rate": 8.823198198198197e-07, + "loss": 0.0005, + "reward": 3.605440378189087, + "reward_std": 0.1609882414340973, + "rewards/final_reward": 1.6322748536347056, + "rewards/mask_iou_reward": 0.8161374268173528, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.605440378189087, + "rewards/thk_ans_format_reward": 1.0, + "step": 418, + "think_completion_length": 46.583333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.47917175292969, + "epoch": 1.4148397976391232, + "grad_norm": 11.118318311069785, + "kl": 0.498046875, + "learning_rate": 8.820382882882883e-07, + "loss": 0.0005, + "reward": 3.18659508228302, + "reward_std": 0.32843393087387085, + "rewards/final_reward": 0.8392202733213036, + "rewards/mask_iou_reward": 0.4196101366606518, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2074283957481384, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 419, + "think_completion_length": 52.291666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.58333587646484, + "epoch": 1.418212478920742, + "grad_norm": 77.20231858601272, + "kl": 0.42578125, + "learning_rate": 8.817567567567567e-07, + "loss": 0.0005, + "reward": 3.1859129667282104, + "reward_std": 0.13804687187075615, + "rewards/final_reward": 1.168682475575744, + "rewards/mask_iou_reward": 0.584341237787872, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1859130263328552, + "rewards/thk_ans_format_reward": 1.0, + "step": 420, + "think_completion_length": 44.79166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3125, + "epoch": 1.4215851602023608, + "grad_norm": 16.91364728527806, + "kl": 0.4970703125, + "learning_rate": 8.814752252252252e-07, + "loss": 0.0005, + "reward": 3.5506646633148193, + "reward_std": 0.0705304704606533, + "rewards/final_reward": 1.1024359001502577, + "rewards/mask_iou_reward": 0.5512179500751289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5506644248962402, + "rewards/thk_ans_format_reward": 1.0, + "step": 421, + "think_completion_length": 40.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.02083587646484, + "epoch": 1.4249578414839799, + "grad_norm": 9.552478339006777, + "kl": 0.498046875, + "learning_rate": 8.811936936936936e-07, + "loss": 0.0005, + "reward": 3.524839758872986, + "reward_std": 0.16047295182943344, + "rewards/final_reward": 1.4494069846057993, + "rewards/mask_iou_reward": 0.7247034923028997, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5248395204544067, + "rewards/thk_ans_format_reward": 1.0, + "step": 422, + "think_completion_length": 41.20833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.58333587646484, + "epoch": 1.4283305227655987, + "grad_norm": 6.705347286067029, + "kl": 0.474609375, + "learning_rate": 8.809121621621621e-07, + "loss": 0.0005, + "reward": 3.1085526943206787, + "reward_std": 0.13762886077165604, + "rewards/final_reward": 1.4445938675272614, + "rewards/mask_iou_reward": 0.7222969337636307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1085528135299683, + "rewards/thk_ans_format_reward": 1.0, + "step": 423, + "think_completion_length": 34.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.75000381469727, + "epoch": 1.4317032040472175, + "grad_norm": 5.960587019979815, + "kl": 0.44921875, + "learning_rate": 8.806306306306306e-07, + "loss": 0.0004, + "reward": 2.8898168802261353, + "reward_std": 0.2549128457903862, + "rewards/final_reward": 0.6743037408393452, + "rewards/mask_iou_reward": 0.3371518704196726, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8898166865110397, + "rewards/thk_ans_format_reward": 1.0, + "step": 424, + "think_completion_length": 39.04166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.00000381469727, + "epoch": 1.4350758853288363, + "grad_norm": 4.748435112911275, + "kl": 0.54296875, + "learning_rate": 8.80349099099099e-07, + "loss": 0.0005, + "reward": 3.3094550371170044, + "reward_std": 0.11676504462957382, + "rewards/final_reward": 1.1488917938058036, + "rewards/mask_iou_reward": 0.5744458969029018, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3094549775123596, + "rewards/thk_ans_format_reward": 1.0, + "step": 425, + "think_completion_length": 37.166666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.91666793823242, + "epoch": 1.4384485666104554, + "grad_norm": 8.627079159689744, + "kl": 0.4765625, + "learning_rate": 8.800675675675675e-07, + "loss": 0.0005, + "reward": 2.928268551826477, + "reward_std": 0.10477589443325996, + "rewards/final_reward": 1.1542270032630197, + "rewards/mask_iou_reward": 0.5771135016315099, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9282682836055756, + "rewards/thk_ans_format_reward": 1.0, + "step": 426, + "think_completion_length": 37.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.70833587646484, + "epoch": 1.4418212478920742, + "grad_norm": 10.280001178760335, + "kl": 0.458984375, + "learning_rate": 8.797860360360359e-07, + "loss": 0.0005, + "reward": 3.1266590356826782, + "reward_std": 0.28933235257864, + "rewards/final_reward": 0.5333257628063448, + "rewards/mask_iou_reward": 0.2666628814031724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1370754837989807, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 427, + "think_completion_length": 41.58333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.42708587646484, + "epoch": 1.445193929173693, + "grad_norm": 9.663883538288415, + "kl": 0.4208984375, + "learning_rate": 8.795045045045044e-07, + "loss": 0.0004, + "reward": 3.270835518836975, + "reward_std": 0.3859306201338768, + "rewards/final_reward": 1.4194974541074559, + "rewards/mask_iou_reward": 0.7097487270537279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2708353996276855, + "rewards/thk_ans_format_reward": 1.0, + "step": 428, + "think_completion_length": 42.41666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.62500381469727, + "epoch": 1.448566610455312, + "grad_norm": 19.624851392813568, + "kl": 0.416015625, + "learning_rate": 8.79222972972973e-07, + "loss": 0.0004, + "reward": 3.212664246559143, + "reward_std": 0.1854577735066414, + "rewards/final_reward": 0.692815486793328, + "rewards/mask_iou_reward": 0.346407743396664, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2126640677452087, + "rewards/thk_ans_format_reward": 1.0, + "step": 429, + "think_completion_length": 45.91666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.18750762939453, + "epoch": 1.451939291736931, + "grad_norm": 8.15185088347606, + "kl": 0.404296875, + "learning_rate": 8.789414414414414e-07, + "loss": 0.0004, + "reward": 3.1903375387191772, + "reward_std": 0.18493592739105225, + "rewards/final_reward": 1.2455512865472378, + "rewards/mask_iou_reward": 0.6227756432736189, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1903374195098877, + "rewards/thk_ans_format_reward": 1.0, + "step": 430, + "think_completion_length": 49.66666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.93750381469727, + "epoch": 1.4553119730185498, + "grad_norm": 9.638413199088216, + "kl": 0.572265625, + "learning_rate": 8.786599099099099e-07, + "loss": 0.0006, + "reward": 3.6001389026641846, + "reward_std": 0.06927749514579773, + "rewards/final_reward": 1.8230214998088752, + "rewards/mask_iou_reward": 0.9115107499044376, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6001389026641846, + "rewards/thk_ans_format_reward": 1.0, + "step": 431, + "think_completion_length": 44.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.9375, + "epoch": 1.4586846543001686, + "grad_norm": 8.658861372580853, + "kl": 0.4814453125, + "learning_rate": 8.783783783783784e-07, + "loss": 0.0005, + "reward": 3.0466322898864746, + "reward_std": 0.21577009186148643, + "rewards/final_reward": 0.2688910628278502, + "rewards/mask_iou_reward": 0.1344455314139251, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0466321110725403, + "rewards/thk_ans_format_reward": 1.0, + "step": 432, + "think_completion_length": 44.291666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.21875381469727, + "epoch": 1.4620573355817874, + "grad_norm": 8.80212834256664, + "kl": 0.4404296875, + "learning_rate": 8.780968468468468e-07, + "loss": 0.0004, + "reward": 3.499486207962036, + "reward_std": 0.18082892894744873, + "rewards/final_reward": 1.3780213683306257, + "rewards/mask_iou_reward": 0.6890106841653129, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4994860887527466, + "rewards/thk_ans_format_reward": 1.0, + "step": 433, + "think_completion_length": 36.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.58333587646484, + "epoch": 1.4654300168634065, + "grad_norm": 22.96490471862446, + "kl": 0.4736328125, + "learning_rate": 8.778153153153153e-07, + "loss": 0.0005, + "reward": 3.292214035987854, + "reward_std": 0.24014803767204285, + "rewards/final_reward": 1.4014288031328084, + "rewards/mask_iou_reward": 0.7007144015664042, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2922139763832092, + "rewards/thk_ans_format_reward": 1.0, + "step": 434, + "think_completion_length": 38.29166666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.82291793823242, + "epoch": 1.4688026981450253, + "grad_norm": 14.29935664393217, + "kl": 0.4375, + "learning_rate": 8.775337837837837e-07, + "loss": 0.0004, + "reward": 2.967806816101074, + "reward_std": 0.28818748891353607, + "rewards/final_reward": 0.2677004428336839, + "rewards/mask_iou_reward": 0.13385022141684194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9678069055080414, + "rewards/thk_ans_format_reward": 1.0, + "step": 435, + "think_completion_length": 41.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.98958587646484, + "epoch": 1.4721753794266441, + "grad_norm": 6.3730077599899015, + "kl": 0.390625, + "learning_rate": 8.772522522522522e-07, + "loss": 0.0004, + "reward": 3.3646087646484375, + "reward_std": 0.08628809824585915, + "rewards/final_reward": 0.9938647796127891, + "rewards/mask_iou_reward": 0.49693238980639454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.364608883857727, + "rewards/thk_ans_format_reward": 1.0, + "step": 436, + "think_completion_length": 38.041666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.6354217529297, + "epoch": 1.4755480607082632, + "grad_norm": 10.925111726883417, + "kl": 0.4765625, + "learning_rate": 8.769707207207207e-07, + "loss": 0.0005, + "reward": 2.952384114265442, + "reward_std": 0.16299670934677124, + "rewards/final_reward": 1.2880874610009931, + "rewards/mask_iou_reward": 0.6440437305004966, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9523837566375732, + "rewards/thk_ans_format_reward": 1.0, + "step": 437, + "think_completion_length": 35.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.42708587646484, + "epoch": 1.478920741989882, + "grad_norm": 10.015341910465036, + "kl": 0.3837890625, + "learning_rate": 8.766891891891891e-07, + "loss": 0.0004, + "reward": 3.039707899093628, + "reward_std": 0.13153230771422386, + "rewards/final_reward": 0.9393191034972563, + "rewards/mask_iou_reward": 0.46965955174862817, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0397077798843384, + "rewards/thk_ans_format_reward": 1.0, + "step": 438, + "think_completion_length": 39.16666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.88541793823242, + "epoch": 1.4822934232715008, + "grad_norm": 39.92832341017036, + "kl": 0.4462890625, + "learning_rate": 8.764076576576577e-07, + "loss": 0.0004, + "reward": 3.5502430200576782, + "reward_std": 0.1625949591398239, + "rewards/final_reward": 1.6293686105111496, + "rewards/mask_iou_reward": 0.8146843052555748, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.550242841243744, + "rewards/thk_ans_format_reward": 1.0, + "step": 439, + "think_completion_length": 39.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.8125, + "epoch": 1.4856661045531196, + "grad_norm": 11.879109435228093, + "kl": 0.3935546875, + "learning_rate": 8.761261261261261e-07, + "loss": 0.0004, + "reward": 3.2247962951660156, + "reward_std": 0.27370116859674454, + "rewards/final_reward": 0.9659916672752272, + "rewards/mask_iou_reward": 0.4829958336376136, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2352128624916077, + "rewards/thk_ans_format_reward": 1.0, + "step": 440, + "think_completion_length": 39.333333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.41667175292969, + "epoch": 1.4890387858347387, + "grad_norm": 6.712408642446364, + "kl": 0.3935546875, + "learning_rate": 8.758445945945946e-07, + "loss": 0.0004, + "reward": 3.5379750728607178, + "reward_std": 0.17404749989509583, + "rewards/final_reward": 1.6421923444437927, + "rewards/mask_iou_reward": 0.8210961722218963, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5379751920700073, + "rewards/thk_ans_format_reward": 1.0, + "step": 441, + "think_completion_length": 35.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.19791793823242, + "epoch": 1.4924114671163575, + "grad_norm": 6.709798659834236, + "kl": 0.525390625, + "learning_rate": 8.755630630630631e-07, + "loss": 0.0005, + "reward": 3.350723624229431, + "reward_std": 0.17634809762239456, + "rewards/final_reward": 1.4496213303634948, + "rewards/mask_iou_reward": 0.7248106651817474, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3507237434387207, + "rewards/thk_ans_format_reward": 1.0, + "step": 442, + "think_completion_length": 33.583333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.33333587646484, + "epoch": 1.4957841483979764, + "grad_norm": 7.958027162502342, + "kl": 0.4453125, + "learning_rate": 8.752815315315315e-07, + "loss": 0.0004, + "reward": 2.863914966583252, + "reward_std": 0.15693428367376328, + "rewards/final_reward": 0.44607979220935845, + "rewards/mask_iou_reward": 0.22303989610467922, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8639149367809296, + "rewards/thk_ans_format_reward": 1.0, + "step": 443, + "think_completion_length": 44.291666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.19791793823242, + "epoch": 1.4991568296795954, + "grad_norm": 9.352421356313323, + "kl": 0.4365234375, + "learning_rate": 8.75e-07, + "loss": 0.0004, + "reward": 3.37521755695343, + "reward_std": 0.2912629693746567, + "rewards/final_reward": 0.9321297674460074, + "rewards/mask_iou_reward": 0.4660648837230037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3752171993255615, + "rewards/thk_ans_format_reward": 1.0, + "step": 444, + "think_completion_length": 32.70833333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.16666793823242, + "epoch": 1.5025295109612142, + "grad_norm": 13.719292122120075, + "kl": 0.5673828125, + "learning_rate": 8.747184684684684e-07, + "loss": 0.0006, + "reward": 3.136060118675232, + "reward_std": 0.17749232798814774, + "rewards/final_reward": 0.6155818352553188, + "rewards/mask_iou_reward": 0.3077909176276594, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1360602378845215, + "rewards/thk_ans_format_reward": 1.0, + "step": 445, + "think_completion_length": 23.708333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.65625381469727, + "epoch": 1.505902192242833, + "grad_norm": 11.080309636704607, + "kl": 0.4482421875, + "learning_rate": 8.744369369369369e-07, + "loss": 0.0005, + "reward": 3.1183054447174072, + "reward_std": 0.10751515999436378, + "rewards/final_reward": 1.6852675556133896, + "rewards/mask_iou_reward": 0.8426337778066948, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1183055639266968, + "rewards/thk_ans_format_reward": 1.0, + "step": 446, + "think_completion_length": 31.041666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.02083587646484, + "epoch": 1.5092748735244519, + "grad_norm": 13.161494532551057, + "kl": 0.4482421875, + "learning_rate": 8.741554054054054e-07, + "loss": 0.0005, + "reward": 3.2273337841033936, + "reward_std": 0.1627396196126938, + "rewards/final_reward": 0.9859541692869749, + "rewards/mask_iou_reward": 0.49297708464348744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2273337244987488, + "rewards/thk_ans_format_reward": 1.0, + "step": 447, + "think_completion_length": 36.83333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.33333587646484, + "epoch": 1.5126475548060707, + "grad_norm": 10.348315811850904, + "kl": 0.50390625, + "learning_rate": 8.738738738738738e-07, + "loss": 0.0005, + "reward": 2.807453989982605, + "reward_std": 0.19256117939949036, + "rewards/final_reward": 0.813686752756777, + "rewards/mask_iou_reward": 0.4068433763783885, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8074538707733154, + "rewards/thk_ans_format_reward": 1.0, + "step": 448, + "think_completion_length": 23.791666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.36458969116211, + "epoch": 1.5160202360876898, + "grad_norm": 21.72668434175921, + "kl": 0.412109375, + "learning_rate": 8.735923423423423e-07, + "loss": 0.0004, + "reward": 2.9704527854919434, + "reward_std": 0.25510428100824356, + "rewards/final_reward": 0.9751997237566301, + "rewards/mask_iou_reward": 0.48759986187831506, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9704526960849762, + "rewards/thk_ans_format_reward": 1.0, + "step": 449, + "think_completion_length": 19.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.71875381469727, + "epoch": 1.5193929173693086, + "grad_norm": 7.1547058197514115, + "kl": 0.4228515625, + "learning_rate": 8.733108108108109e-07, + "loss": 0.0004, + "reward": 2.8380229473114014, + "reward_std": 0.3202047646045685, + "rewards/final_reward": 1.2212116006312888, + "rewards/mask_iou_reward": 0.6106058003156444, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8380228579044342, + "rewards/thk_ans_format_reward": 1.0, + "step": 450, + "think_completion_length": 24.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.96875381469727, + "epoch": 1.5227655986509276, + "grad_norm": 13.72594240653149, + "kl": 0.4697265625, + "learning_rate": 8.730292792792793e-07, + "loss": 0.0005, + "reward": 3.395395040512085, + "reward_std": 0.1617676541209221, + "rewards/final_reward": 1.5347950284425567, + "rewards/mask_iou_reward": 0.7673975142212783, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3953949809074402, + "rewards/thk_ans_format_reward": 1.0, + "step": 451, + "think_completion_length": 22.791666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.9375, + "epoch": 1.5261382799325465, + "grad_norm": 132.82594405838844, + "kl": 0.4873046875, + "learning_rate": 8.727477477477478e-07, + "loss": 0.0005, + "reward": 3.166351079940796, + "reward_std": 0.22434765845537186, + "rewards/final_reward": 1.1407594605091715, + "rewards/mask_iou_reward": 0.5703797302545858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1663509607315063, + "rewards/thk_ans_format_reward": 1.0, + "step": 452, + "think_completion_length": 23.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.25000762939453, + "epoch": 1.5295109612141653, + "grad_norm": 8.261307910998266, + "kl": 0.435546875, + "learning_rate": 8.724662162162162e-07, + "loss": 0.0004, + "reward": 3.166746139526367, + "reward_std": 0.1179632619023323, + "rewards/final_reward": 1.3772007662946268, + "rewards/mask_iou_reward": 0.6886003831473134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1667458713054657, + "rewards/thk_ans_format_reward": 1.0, + "step": 453, + "think_completion_length": 18.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.47916793823242, + "epoch": 1.5328836424957841, + "grad_norm": 18.2005658169965, + "kl": 0.490234375, + "learning_rate": 8.721846846846846e-07, + "loss": 0.0005, + "reward": 3.2263890504837036, + "reward_std": 0.1771574541926384, + "rewards/final_reward": 1.3200745693435916, + "rewards/mask_iou_reward": 0.6600372846717958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2263891696929932, + "rewards/thk_ans_format_reward": 1.0, + "step": 454, + "think_completion_length": 16.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.53125381469727, + "epoch": 1.536256323777403, + "grad_norm": 6.950321217424066, + "kl": 0.4453125, + "learning_rate": 8.719031531531531e-07, + "loss": 0.0005, + "reward": 3.212833523750305, + "reward_std": 0.16523578390479088, + "rewards/final_reward": 1.696258540360979, + "rewards/mask_iou_reward": 0.8481292701804894, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2232502102851868, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 455, + "think_completion_length": 18.708333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.37500381469727, + "epoch": 1.5396290050590218, + "grad_norm": 17.482705271182557, + "kl": 0.587890625, + "learning_rate": 8.716216216216215e-07, + "loss": 0.0006, + "reward": 3.290819525718689, + "reward_std": 0.2551625818014145, + "rewards/final_reward": 0.7625233320673338, + "rewards/mask_iou_reward": 0.3812616660336669, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2908196151256561, + "rewards/thk_ans_format_reward": 1.0, + "step": 456, + "think_completion_length": 23.458333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.08333587646484, + "epoch": 1.5430016863406408, + "grad_norm": 9.824438040670186, + "kl": 0.4658203125, + "learning_rate": 8.7134009009009e-07, + "loss": 0.0005, + "reward": 2.9174301624298096, + "reward_std": 0.298883818089962, + "rewards/final_reward": 0.5540578812699941, + "rewards/mask_iou_reward": 0.27702894063499706, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9174301624298096, + "rewards/thk_ans_format_reward": 1.0, + "step": 457, + "think_completion_length": 21.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.35416793823242, + "epoch": 1.5463743676222597, + "grad_norm": 9.440898123085097, + "kl": 0.47265625, + "learning_rate": 8.710585585585584e-07, + "loss": 0.0005, + "reward": 3.1261075735092163, + "reward_std": 0.16530471108853817, + "rewards/final_reward": 1.1013812481237961, + "rewards/mask_iou_reward": 0.5506906240618981, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1261076629161835, + "rewards/thk_ans_format_reward": 1.0, + "step": 458, + "think_completion_length": 18.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.77083587646484, + "epoch": 1.5497470489038787, + "grad_norm": 16.567436679373156, + "kl": 0.474609375, + "learning_rate": 8.707770270270269e-07, + "loss": 0.0005, + "reward": 2.9942362308502197, + "reward_std": 0.27375921979546547, + "rewards/final_reward": 1.4684187417871153, + "rewards/mask_iou_reward": 0.7342093708935576, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.994236171245575, + "rewards/thk_ans_format_reward": 1.0, + "step": 459, + "think_completion_length": 23.041666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.05208587646484, + "epoch": 1.5531197301854975, + "grad_norm": 54.120110367425866, + "kl": 0.95703125, + "learning_rate": 8.704954954954955e-07, + "loss": 0.001, + "reward": 2.818720817565918, + "reward_std": 0.18573438376188278, + "rewards/final_reward": 0.8216863238924244, + "rewards/mask_iou_reward": 0.4108431619462122, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8187207877635956, + "rewards/thk_ans_format_reward": 1.0, + "step": 460, + "think_completion_length": 20.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.76041793823242, + "epoch": 1.5564924114671164, + "grad_norm": 11.965967732569721, + "kl": 0.44921875, + "learning_rate": 8.702139639639639e-07, + "loss": 0.0005, + "reward": 3.033520817756653, + "reward_std": 0.08643431216478348, + "rewards/final_reward": 1.4058163678866773, + "rewards/mask_iou_reward": 0.7029081839433386, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0335208177566528, + "rewards/thk_ans_format_reward": 1.0, + "step": 461, + "think_completion_length": 17.458333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 83.97916793823242, + "epoch": 1.5598650927487352, + "grad_norm": 15.679449940056134, + "kl": 0.4609375, + "learning_rate": 8.699324324324324e-07, + "loss": 0.0005, + "reward": 3.1351873874664307, + "reward_std": 0.0932794027030468, + "rewards/final_reward": 1.1743316658542557, + "rewards/mask_iou_reward": 0.5871658329271279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1351872682571411, + "rewards/thk_ans_format_reward": 1.0, + "step": 462, + "think_completion_length": 17.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.63541793823242, + "epoch": 1.563237774030354, + "grad_norm": 8.531893149998371, + "kl": 0.46875, + "learning_rate": 8.696509009009008e-07, + "loss": 0.0005, + "reward": 3.204105496406555, + "reward_std": 0.22264982759952545, + "rewards/final_reward": 1.4184024437330647, + "rewards/mask_iou_reward": 0.7092012218665323, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2041053175926208, + "rewards/thk_ans_format_reward": 1.0, + "step": 463, + "think_completion_length": 20.791666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.35417175292969, + "epoch": 1.566610455311973, + "grad_norm": 35.768289242163206, + "kl": 0.5595703125, + "learning_rate": 8.693693693693693e-07, + "loss": 0.0006, + "reward": 3.28743577003479, + "reward_std": 0.14121374301612377, + "rewards/final_reward": 1.7685978748483455, + "rewards/mask_iou_reward": 0.8842989374241728, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2874356508255005, + "rewards/thk_ans_format_reward": 1.0, + "step": 464, + "think_completion_length": 22.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.71875381469727, + "epoch": 1.569983136593592, + "grad_norm": 19.549502851993616, + "kl": 0.4150390625, + "learning_rate": 8.690878378378378e-07, + "loss": 0.0004, + "reward": 3.0029417276382446, + "reward_std": 0.027136605232954025, + "rewards/final_reward": 1.772966376810864, + "rewards/mask_iou_reward": 0.886483188405432, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0029414594173431, + "rewards/thk_ans_format_reward": 1.0, + "step": 465, + "think_completion_length": 17.458333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.94791793823242, + "epoch": 1.573355817875211, + "grad_norm": 7.625249061403965, + "kl": 0.44921875, + "learning_rate": 8.688063063063062e-07, + "loss": 0.0005, + "reward": 3.1399009227752686, + "reward_std": 0.11434066295623779, + "rewards/final_reward": 1.3885686335373428, + "rewards/mask_iou_reward": 0.6942843167686714, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.139900803565979, + "rewards/thk_ans_format_reward": 1.0, + "step": 466, + "think_completion_length": 22.458333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.31250381469727, + "epoch": 1.5767284991568298, + "grad_norm": 46.22982661858084, + "kl": 0.5126953125, + "learning_rate": 8.685247747747747e-07, + "loss": 0.0005, + "reward": 3.077829360961914, + "reward_std": 0.15692508220672607, + "rewards/final_reward": 1.2113539620508234, + "rewards/mask_iou_reward": 0.6056769810254117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.077829360961914, + "rewards/thk_ans_format_reward": 1.0, + "step": 467, + "think_completion_length": 16.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.375, + "epoch": 1.5801011804384486, + "grad_norm": 17.505457784534546, + "kl": 0.595703125, + "learning_rate": 8.682432432432431e-07, + "loss": 0.0006, + "reward": 2.748835325241089, + "reward_std": 0.12427278235554695, + "rewards/final_reward": 0.780510380319, + "rewards/mask_iou_reward": 0.3902551901595, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7488352358341217, + "rewards/thk_ans_format_reward": 1.0, + "step": 468, + "think_completion_length": 18.416666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.96875381469727, + "epoch": 1.5834738617200674, + "grad_norm": 8.498249651892177, + "kl": 0.4677734375, + "learning_rate": 8.679617117117116e-07, + "loss": 0.0005, + "reward": 3.204338550567627, + "reward_std": 0.19518911838531494, + "rewards/final_reward": 1.7290932128407333, + "rewards/mask_iou_reward": 0.8645466064203666, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2043386101722717, + "rewards/thk_ans_format_reward": 1.0, + "step": 469, + "think_completion_length": 18.666666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.96875381469727, + "epoch": 1.5868465430016863, + "grad_norm": 15.412252606508801, + "kl": 0.568359375, + "learning_rate": 8.676801801801802e-07, + "loss": 0.0006, + "reward": 3.1517279148101807, + "reward_std": 0.21602170914411545, + "rewards/final_reward": 1.6399599658435098, + "rewards/mask_iou_reward": 0.8199799829217549, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1517279148101807, + "rewards/thk_ans_format_reward": 1.0, + "step": 470, + "think_completion_length": 24.541666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.05208587646484, + "epoch": 1.590219224283305, + "grad_norm": 28.822529403476647, + "kl": 0.515625, + "learning_rate": 8.673986486486486e-07, + "loss": 0.0005, + "reward": 2.7272047996520996, + "reward_std": 0.15829136967658997, + "rewards/final_reward": 0.8382349980237175, + "rewards/mask_iou_reward": 0.41911749901185874, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7272045910358429, + "rewards/thk_ans_format_reward": 1.0, + "step": 471, + "think_completion_length": 18.458333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.35417175292969, + "epoch": 1.5935919055649241, + "grad_norm": 102.25877988832697, + "kl": 0.3818359375, + "learning_rate": 8.671171171171171e-07, + "loss": 0.0004, + "reward": 3.501787781715393, + "reward_std": 0.09546659886837006, + "rewards/final_reward": 1.8930582525314399, + "rewards/mask_iou_reward": 0.9465291262657199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5017877221107483, + "rewards/thk_ans_format_reward": 1.0, + "step": 472, + "think_completion_length": 22.166666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.50000381469727, + "epoch": 1.596964586846543, + "grad_norm": 4.43375616737534, + "kl": 0.6943359375, + "learning_rate": 8.668355855855856e-07, + "loss": 0.0007, + "reward": 3.063448905944824, + "reward_std": 0.18014680407941341, + "rewards/final_reward": 0.22403323992912672, + "rewards/mask_iou_reward": 0.11201661996456336, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0634491443634033, + "rewards/thk_ans_format_reward": 1.0, + "step": 473, + "think_completion_length": 21.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.02083587646484, + "epoch": 1.600337268128162, + "grad_norm": 14.473369009625422, + "kl": 0.494140625, + "learning_rate": 8.66554054054054e-07, + "loss": 0.0005, + "reward": 2.9824938774108887, + "reward_std": 0.16130833327770233, + "rewards/final_reward": 1.0900408410882503, + "rewards/mask_iou_reward": 0.5450204205441251, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9824941158294678, + "rewards/thk_ans_format_reward": 1.0, + "step": 474, + "think_completion_length": 20.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.46875381469727, + "epoch": 1.6037099494097808, + "grad_norm": 12.531912108914801, + "kl": 0.4267578125, + "learning_rate": 8.662725225225225e-07, + "loss": 0.0004, + "reward": 2.71895968914032, + "reward_std": 0.16828986257314682, + "rewards/final_reward": 0.9872382404177518, + "rewards/mask_iou_reward": 0.4936191202088759, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7189596146345139, + "rewards/thk_ans_format_reward": 1.0, + "step": 475, + "think_completion_length": 14.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.45833587646484, + "epoch": 1.6070826306913997, + "grad_norm": 23.104837063754868, + "kl": 0.478515625, + "learning_rate": 8.659909909909909e-07, + "loss": 0.0005, + "reward": 3.0753190517425537, + "reward_std": 0.057626042515039444, + "rewards/final_reward": 0.6522182180405172, + "rewards/mask_iou_reward": 0.3261091090202586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0753188133239746, + "rewards/thk_ans_format_reward": 1.0, + "step": 476, + "think_completion_length": 14.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.70833587646484, + "epoch": 1.6104553119730185, + "grad_norm": 51.14924206166379, + "kl": 0.47265625, + "learning_rate": 8.657094594594594e-07, + "loss": 0.0005, + "reward": 3.417826771736145, + "reward_std": 0.2440406084060669, + "rewards/final_reward": 1.1676209565768971, + "rewards/mask_iou_reward": 0.5838104782884486, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4178267121315002, + "rewards/thk_ans_format_reward": 1.0, + "step": 477, + "think_completion_length": 15.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.11458587646484, + "epoch": 1.6138279932546373, + "grad_norm": 16.65912906390247, + "kl": 0.541015625, + "learning_rate": 8.654279279279279e-07, + "loss": 0.0006, + "reward": 3.257392644882202, + "reward_std": 0.11912001296877861, + "rewards/final_reward": 0.640605444527259, + "rewards/mask_iou_reward": 0.3203027222636295, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2573924660682678, + "rewards/thk_ans_format_reward": 1.0, + "step": 478, + "think_completion_length": 14.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.68750381469727, + "epoch": 1.6172006745362564, + "grad_norm": 14.555733217592502, + "kl": 0.4775390625, + "learning_rate": 8.651463963963963e-07, + "loss": 0.0005, + "reward": 2.956945300102234, + "reward_std": 0.2652505896985531, + "rewards/final_reward": 1.0335901193700312, + "rewards/mask_iou_reward": 0.5167950596850156, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9673618376255035, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 479, + "think_completion_length": 14.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.15625, + "epoch": 1.6205733558178752, + "grad_norm": 45.33214603312758, + "kl": 0.4638671875, + "learning_rate": 8.648648648648649e-07, + "loss": 0.0005, + "reward": 3.351254463195801, + "reward_std": 0.1269562803208828, + "rewards/final_reward": 1.082978360079899, + "rewards/mask_iou_reward": 0.5414891800399495, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3512542843818665, + "rewards/thk_ans_format_reward": 1.0, + "step": 480, + "think_completion_length": 13.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.38541793823242, + "epoch": 1.6239460370994943, + "grad_norm": 12.613422364468201, + "kl": 0.4736328125, + "learning_rate": 8.645833333333333e-07, + "loss": 0.0005, + "reward": 3.2807374000549316, + "reward_std": 0.06218157522380352, + "rewards/final_reward": 1.29486728669709, + "rewards/mask_iou_reward": 0.647433643348545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2807374000549316, + "rewards/thk_ans_format_reward": 1.0, + "step": 481, + "think_completion_length": 13.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 86.15625, + "epoch": 1.627318718381113, + "grad_norm": 29.66491767918893, + "kl": 0.54296875, + "learning_rate": 8.643018018018018e-07, + "loss": 0.0008, + "reward": 3.1102946996688843, + "reward_std": 0.16617947816848755, + "rewards/final_reward": 0.9862282502708326, + "rewards/mask_iou_reward": 0.4931141251354163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1207115054130554, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 482, + "think_completion_length": 12.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.30208587646484, + "epoch": 1.630691399662732, + "grad_norm": 55.41155718440913, + "kl": 0.4482421875, + "learning_rate": 8.640202702702703e-07, + "loss": 0.0005, + "reward": 3.4419562816619873, + "reward_std": 0.07434825040400028, + "rewards/final_reward": 1.716160589569279, + "rewards/mask_iou_reward": 0.8580802947846395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4523729085922241, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 483, + "think_completion_length": 13.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.58333587646484, + "epoch": 1.6340640809443507, + "grad_norm": 16.071683379814385, + "kl": 0.45703125, + "learning_rate": 8.637387387387387e-07, + "loss": 0.0005, + "reward": 3.5900611877441406, + "reward_std": 0.13412418961524963, + "rewards/final_reward": 1.8966849195779, + "rewards/mask_iou_reward": 0.94834245978895, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6108945608139038, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 484, + "think_completion_length": 12.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.98958587646484, + "epoch": 1.6374367622259696, + "grad_norm": 9.311258158438548, + "kl": 0.490234375, + "learning_rate": 8.634572072072072e-07, + "loss": 0.0005, + "reward": 3.518470883369446, + "reward_std": 0.19531650096178055, + "rewards/final_reward": 1.5953611940366779, + "rewards/mask_iou_reward": 0.7976805970183389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.518470823764801, + "rewards/thk_ans_format_reward": 1.0, + "step": 485, + "think_completion_length": 12.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.07291793823242, + "epoch": 1.6408094435075884, + "grad_norm": 12.15817766930668, + "kl": 0.4482421875, + "learning_rate": 8.631756756756757e-07, + "loss": 0.0004, + "reward": 2.868876814842224, + "reward_std": 0.09707498550415039, + "rewards/final_reward": 1.3961014519571442, + "rewards/mask_iou_reward": 0.6980507259785721, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8688768744468689, + "rewards/thk_ans_format_reward": 1.0, + "step": 486, + "think_completion_length": 11.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.60416793823242, + "epoch": 1.6441821247892074, + "grad_norm": 14.482530307624337, + "kl": 0.515625, + "learning_rate": 8.628941441441441e-07, + "loss": 0.0005, + "reward": 3.417795419692993, + "reward_std": 0.04655772354453802, + "rewards/final_reward": 1.5810112497156465, + "rewards/mask_iou_reward": 0.7905056248578233, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.417795479297638, + "rewards/thk_ans_format_reward": 1.0, + "step": 487, + "think_completion_length": 11.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.21875381469727, + "epoch": 1.6475548060708263, + "grad_norm": 141.43443814301298, + "kl": 0.4111328125, + "learning_rate": 8.626126126126126e-07, + "loss": 0.0004, + "reward": 3.459301471710205, + "reward_std": 0.07235825061798096, + "rewards/final_reward": 0.9971469538340672, + "rewards/mask_iou_reward": 0.4985734769170336, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4593015313148499, + "rewards/thk_ans_format_reward": 1.0, + "step": 488, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 70.87500381469727, + "epoch": 1.6509274873524453, + "grad_norm": 15.072714849056247, + "kl": 0.53515625, + "learning_rate": 8.62331081081081e-07, + "loss": 0.0005, + "reward": 3.366679072380066, + "reward_std": 0.21495439112186432, + "rewards/final_reward": 1.6226693889678039, + "rewards/mask_iou_reward": 0.8113346944839019, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.366679072380066, + "rewards/thk_ans_format_reward": 1.0, + "step": 489, + "think_completion_length": 12.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 74.31250381469727, + "epoch": 1.6543001686340641, + "grad_norm": 12.923645356988114, + "kl": 0.46484375, + "learning_rate": 8.620495495495496e-07, + "loss": 0.0005, + "reward": 3.3765722513198853, + "reward_std": 0.09477205201983452, + "rewards/final_reward": 0.87301188904249, + "rewards/mask_iou_reward": 0.436505944521245, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3765720129013062, + "rewards/thk_ans_format_reward": 1.0, + "step": 490, + "think_completion_length": 10.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.06250381469727, + "epoch": 1.657672849915683, + "grad_norm": 14.60867178239802, + "kl": 0.51171875, + "learning_rate": 8.617680180180181e-07, + "loss": 0.0005, + "reward": 3.226666212081909, + "reward_std": 0.2977278307080269, + "rewards/final_reward": 1.088146077694477, + "rewards/mask_iou_reward": 0.5440730388472385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.226666271686554, + "rewards/thk_ans_format_reward": 1.0, + "step": 491, + "think_completion_length": 11.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.10416793823242, + "epoch": 1.6610455311973018, + "grad_norm": 31.411912777573136, + "kl": 0.5390625, + "learning_rate": 8.614864864864865e-07, + "loss": 0.0005, + "reward": 3.4004725217819214, + "reward_std": 0.12245327979326248, + "rewards/final_reward": 1.3255855717465488, + "rewards/mask_iou_reward": 0.6627927858732744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4004724025726318, + "rewards/thk_ans_format_reward": 1.0, + "step": 492, + "think_completion_length": 9.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.72916793823242, + "epoch": 1.6644182124789206, + "grad_norm": 16.023223475825134, + "kl": 0.458984375, + "learning_rate": 8.61204954954955e-07, + "loss": 0.0005, + "reward": 3.2062329053878784, + "reward_std": 0.18962369859218597, + "rewards/final_reward": 1.2517562804400746, + "rewards/mask_iou_reward": 0.6258781402200373, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.206232726573944, + "rewards/thk_ans_format_reward": 1.0, + "step": 493, + "think_completion_length": 13.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.58333587646484, + "epoch": 1.6677908937605397, + "grad_norm": 11.265672770232708, + "kl": 0.552734375, + "learning_rate": 8.609234234234233e-07, + "loss": 0.0006, + "reward": 2.867155909538269, + "reward_std": 0.0892084464430809, + "rewards/final_reward": 0.8286811223868835, + "rewards/mask_iou_reward": 0.41434056119344176, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8671558797359467, + "rewards/thk_ans_format_reward": 1.0, + "step": 494, + "think_completion_length": 9.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 80.00000381469727, + "epoch": 1.6711635750421585, + "grad_norm": 13.26919597028308, + "kl": 0.46875, + "learning_rate": 8.606418918918918e-07, + "loss": 0.0005, + "reward": 3.3766839504241943, + "reward_std": 0.15332239121198654, + "rewards/final_reward": 1.256387966408285, + "rewards/mask_iou_reward": 0.6281939832041425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3766838312149048, + "rewards/thk_ans_format_reward": 1.0, + "step": 495, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 72.58333587646484, + "epoch": 1.6745362563237776, + "grad_norm": 13.018456947061745, + "kl": 0.4384765625, + "learning_rate": 8.603603603603603e-07, + "loss": 0.0005, + "reward": 3.3175920248031616, + "reward_std": 0.11998457461595535, + "rewards/final_reward": 1.6143369815601565, + "rewards/mask_iou_reward": 0.8071684907800782, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3175921440124512, + "rewards/thk_ans_format_reward": 1.0, + "step": 496, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.94791793823242, + "epoch": 1.6779089376053964, + "grad_norm": 11.159094503630527, + "kl": 0.50390625, + "learning_rate": 8.600788288288287e-07, + "loss": 0.0005, + "reward": 3.057952880859375, + "reward_std": 0.11427609622478485, + "rewards/final_reward": 1.5136194076524858, + "rewards/mask_iou_reward": 0.7568097038262429, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0579529404640198, + "rewards/thk_ans_format_reward": 1.0, + "step": 497, + "think_completion_length": 10.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.65625381469727, + "epoch": 1.6812816188870152, + "grad_norm": 16.197329729682938, + "kl": 0.6064453125, + "learning_rate": 8.597972972972972e-07, + "loss": 0.0006, + "reward": 3.7405370473861694, + "reward_std": 0.13544230163097382, + "rewards/final_reward": 1.7687259535170339, + "rewards/mask_iou_reward": 0.8843629767585169, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.7509537935256958, + "rewards/thk_ans_format_reward": 1.0, + "step": 498, + "think_completion_length": 11.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.45833587646484, + "epoch": 1.684654300168634, + "grad_norm": 13.052639073244066, + "kl": 0.4970703125, + "learning_rate": 8.595157657657656e-07, + "loss": 0.0005, + "reward": 2.982196807861328, + "reward_std": 0.21756897866725922, + "rewards/final_reward": 0.9414634619208263, + "rewards/mask_iou_reward": 0.47073173096041315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9821969568729401, + "rewards/thk_ans_format_reward": 1.0, + "step": 499, + "think_completion_length": 13.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.01041793823242, + "epoch": 1.6880269814502529, + "grad_norm": 8.795085757071028, + "kl": 0.5146484375, + "learning_rate": 8.592342342342342e-07, + "loss": 0.0005, + "reward": 3.319603443145752, + "reward_std": 0.1779472529888153, + "rewards/final_reward": 0.9019757379410115, + "rewards/mask_iou_reward": 0.45098786897050575, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3196034133434296, + "rewards/thk_ans_format_reward": 1.0, + "step": 500, + "think_completion_length": 11.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.90625381469727, + "epoch": 1.6913996627318717, + "grad_norm": 7.774994787051015, + "kl": 0.4931640625, + "learning_rate": 8.589527027027027e-07, + "loss": 0.0005, + "reward": 3.361993193626404, + "reward_std": 0.15704002976417542, + "rewards/final_reward": 1.2416237096540461, + "rewards/mask_iou_reward": 0.6208118548270231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.361993432044983, + "rewards/thk_ans_format_reward": 1.0, + "step": 501, + "think_completion_length": 11.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 73.57292175292969, + "epoch": 1.6947723440134908, + "grad_norm": 15.137187828443501, + "kl": 0.529296875, + "learning_rate": 8.586711711711711e-07, + "loss": 0.0005, + "reward": 3.0099265575408936, + "reward_std": 0.1874416284263134, + "rewards/final_reward": 0.9739286970285753, + "rewards/mask_iou_reward": 0.48696434851428766, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0203429758548737, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 502, + "think_completion_length": 10.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.6875, + "epoch": 1.6981450252951096, + "grad_norm": 36.580436506414216, + "kl": 0.5, + "learning_rate": 8.583896396396396e-07, + "loss": 0.0005, + "reward": 3.3445026874542236, + "reward_std": 0.3046005591750145, + "rewards/final_reward": 1.582493975030316, + "rewards/mask_iou_reward": 0.791246987515158, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3445026874542236, + "rewards/thk_ans_format_reward": 1.0, + "step": 503, + "think_completion_length": 11.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.33333969116211, + "epoch": 1.7015177065767286, + "grad_norm": 28.402567095919988, + "kl": 0.4189453125, + "learning_rate": 8.58108108108108e-07, + "loss": 0.0004, + "reward": 3.3351190090179443, + "reward_std": 0.3104002997279167, + "rewards/final_reward": 1.1093516487241417, + "rewards/mask_iou_reward": 0.5546758243620709, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3351190090179443, + "rewards/thk_ans_format_reward": 1.0, + "step": 504, + "think_completion_length": 10.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.37500381469727, + "epoch": 1.7048903878583475, + "grad_norm": 15.640567490493542, + "kl": 0.623046875, + "learning_rate": 8.578265765765765e-07, + "loss": 0.0006, + "reward": 3.162856936454773, + "reward_std": 0.10240738838911057, + "rewards/final_reward": 0.9918880465103225, + "rewards/mask_iou_reward": 0.49594402325516124, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1628568768501282, + "rewards/thk_ans_format_reward": 1.0, + "step": 505, + "think_completion_length": 12.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.22916793823242, + "epoch": 1.7082630691399663, + "grad_norm": 16.62535695298484, + "kl": 0.5078125, + "learning_rate": 8.57545045045045e-07, + "loss": 0.0006, + "reward": 3.3045690059661865, + "reward_std": 0.14377547800540924, + "rewards/final_reward": 1.5430981242085318, + "rewards/mask_iou_reward": 0.7715490621042659, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3045691847801208, + "rewards/thk_ans_format_reward": 1.0, + "step": 506, + "think_completion_length": 11.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.67708587646484, + "epoch": 1.7116357504215851, + "grad_norm": 11.151972314274246, + "kl": 0.50390625, + "learning_rate": 8.572635135135134e-07, + "loss": 0.0006, + "reward": 3.242234230041504, + "reward_std": 0.08867185190320015, + "rewards/final_reward": 0.7304459733074367, + "rewards/mask_iou_reward": 0.36522298665371833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2422342896461487, + "rewards/thk_ans_format_reward": 1.0, + "step": 507, + "think_completion_length": 12.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.82292175292969, + "epoch": 1.715008431703204, + "grad_norm": 9.160278725973138, + "kl": 0.4521484375, + "learning_rate": 8.569819819819819e-07, + "loss": 0.0005, + "reward": 3.0099756717681885, + "reward_std": 0.3636237531900406, + "rewards/final_reward": 0.9719676653414557, + "rewards/mask_iou_reward": 0.48598383267072787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.009975790977478, + "rewards/thk_ans_format_reward": 1.0, + "step": 508, + "think_completion_length": 11.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.50000381469727, + "epoch": 1.718381112984823, + "grad_norm": 25.299347390182906, + "kl": 0.5615234375, + "learning_rate": 8.567004504504504e-07, + "loss": 0.0006, + "reward": 2.975674629211426, + "reward_std": 0.2414003312587738, + "rewards/final_reward": 1.415178078522424, + "rewards/mask_iou_reward": 0.707589039261212, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9756745994091034, + "rewards/thk_ans_format_reward": 1.0, + "step": 509, + "think_completion_length": 12.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.50000381469727, + "epoch": 1.7217537942664418, + "grad_norm": 5.745786155962972, + "kl": 0.525390625, + "learning_rate": 8.564189189189189e-07, + "loss": 0.0007, + "reward": 2.874884009361267, + "reward_std": 0.1341996043920517, + "rewards/final_reward": 0.1135072243268755, + "rewards/mask_iou_reward": 0.05675361216343775, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.874883770942688, + "rewards/thk_ans_format_reward": 1.0, + "step": 510, + "think_completion_length": 14.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.76041793823242, + "epoch": 1.7251264755480609, + "grad_norm": 12.503675028002357, + "kl": 0.4873046875, + "learning_rate": 8.561373873873874e-07, + "loss": 0.0005, + "reward": 3.1886398792266846, + "reward_std": 0.08919402211904526, + "rewards/final_reward": 0.5874913855773956, + "rewards/mask_iou_reward": 0.2937456927886978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1886397004127502, + "rewards/thk_ans_format_reward": 1.0, + "step": 511, + "think_completion_length": 15.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.57291793823242, + "epoch": 1.7284991568296797, + "grad_norm": 19.913816628162035, + "kl": 0.57421875, + "learning_rate": 8.558558558558558e-07, + "loss": 0.0006, + "reward": 3.1143531799316406, + "reward_std": 0.1809094250202179, + "rewards/final_reward": 0.949235596383203, + "rewards/mask_iou_reward": 0.4746177981916015, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.114353060722351, + "rewards/thk_ans_format_reward": 1.0, + "step": 512, + "think_completion_length": 12.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.93750381469727, + "epoch": 1.7318718381112985, + "grad_norm": 65.4953382208261, + "kl": 0.5458984375, + "learning_rate": 8.555743243243243e-07, + "loss": 0.0005, + "reward": 3.310346841812134, + "reward_std": 0.18927180767059326, + "rewards/final_reward": 1.249014511299547, + "rewards/mask_iou_reward": 0.6245072556497735, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3103469014167786, + "rewards/thk_ans_format_reward": 1.0, + "step": 513, + "think_completion_length": 11.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.62500381469727, + "epoch": 1.7352445193929174, + "grad_norm": 13.336240430300492, + "kl": 0.5078125, + "learning_rate": 8.552927927927928e-07, + "loss": 0.0005, + "reward": 3.2578518390655518, + "reward_std": 0.17345689982175827, + "rewards/final_reward": 0.9461252377763101, + "rewards/mask_iou_reward": 0.47306261888815504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2578518390655518, + "rewards/thk_ans_format_reward": 1.0, + "step": 514, + "think_completion_length": 13.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.25000762939453, + "epoch": 1.7386172006745362, + "grad_norm": 16.689742144094275, + "kl": 0.4970703125, + "learning_rate": 8.550112612612612e-07, + "loss": 0.0005, + "reward": 3.4923455715179443, + "reward_std": 0.17182448878884315, + "rewards/final_reward": 1.7788363585498463, + "rewards/mask_iou_reward": 0.8894181792749232, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5131787657737732, + "rewards/thk_ans_format_reward": 1.0, + "step": 515, + "think_completion_length": 13.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.95833587646484, + "epoch": 1.741989881956155, + "grad_norm": 22.269439493190728, + "kl": 0.61328125, + "learning_rate": 8.547297297297297e-07, + "loss": 0.0006, + "reward": 3.4098668098449707, + "reward_std": 0.1380665097385645, + "rewards/final_reward": 1.274625326712929, + "rewards/mask_iou_reward": 0.6373126633564645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.409866750240326, + "rewards/thk_ans_format_reward": 1.0, + "step": 516, + "think_completion_length": 12.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.95833587646484, + "epoch": 1.745362563237774, + "grad_norm": 13.715544708065021, + "kl": 0.447265625, + "learning_rate": 8.544481981981981e-07, + "loss": 0.0004, + "reward": 2.9615875482559204, + "reward_std": 0.36108721792697906, + "rewards/final_reward": 0.36051196483533215, + "rewards/mask_iou_reward": 0.18025598241766608, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 0.9824208915233612, + "rewards/thk_ans_format_reward": 1.0, + "step": 517, + "think_completion_length": 15.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.00000381469727, + "epoch": 1.7487352445193929, + "grad_norm": 66.02704543638346, + "kl": 0.48828125, + "learning_rate": 8.541666666666666e-07, + "loss": 0.0005, + "reward": 3.1223738193511963, + "reward_std": 0.13363390415906906, + "rewards/final_reward": 0.5471890291502142, + "rewards/mask_iou_reward": 0.2735945145751071, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1223737597465515, + "rewards/thk_ans_format_reward": 1.0, + "step": 518, + "think_completion_length": 13.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.18750381469727, + "epoch": 1.752107925801012, + "grad_norm": 7.8067666006986, + "kl": 0.5322265625, + "learning_rate": 8.538851351351351e-07, + "loss": 0.0005, + "reward": 3.3601648807525635, + "reward_std": 0.15161875635385513, + "rewards/final_reward": 1.4802281160901876, + "rewards/mask_iou_reward": 0.7401140580450938, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3705815076828003, + "rewards/thk_ans_format_reward": 1.0, + "step": 519, + "think_completion_length": 13.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.61458587646484, + "epoch": 1.7554806070826308, + "grad_norm": 16.629566761999957, + "kl": 0.759765625, + "learning_rate": 8.536036036036036e-07, + "loss": 0.0008, + "reward": 2.8733246326446533, + "reward_std": 0.21523992344737053, + "rewards/final_reward": 1.0543335481722007, + "rewards/mask_iou_reward": 0.5271667740861004, + "rewards/sam_format_reward": 0.90625, + "rewards/sam_reward_func_ultra": 0.9670746624469757, + "rewards/thk_ans_format_reward": 1.0, + "step": 520, + "think_completion_length": 14.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 76.09375381469727, + "epoch": 1.7588532883642496, + "grad_norm": 14.082221414377713, + "kl": 0.537109375, + "learning_rate": 8.533220720720721e-07, + "loss": 0.0005, + "reward": 3.441567301750183, + "reward_std": 0.20213724300265312, + "rewards/final_reward": 1.5816256478233968, + "rewards/mask_iou_reward": 0.7908128239116984, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.462400734424591, + "rewards/thk_ans_format_reward": 1.0, + "step": 521, + "think_completion_length": 13.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.57291793823242, + "epoch": 1.7622259696458684, + "grad_norm": 8.582734926536023, + "kl": 0.544921875, + "learning_rate": 8.530405405405406e-07, + "loss": 0.0005, + "reward": 3.404173731803894, + "reward_std": 0.3278542831540108, + "rewards/final_reward": 1.3103814615068121, + "rewards/mask_iou_reward": 0.6551907307534061, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.414590299129486, + "rewards/thk_ans_format_reward": 1.0, + "step": 522, + "think_completion_length": 19.166666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.21875381469727, + "epoch": 1.7655986509274872, + "grad_norm": 6.750860216925896, + "kl": 0.546875, + "learning_rate": 8.52759009009009e-07, + "loss": 0.0006, + "reward": 3.1857587099075317, + "reward_std": 0.2380918264389038, + "rewards/final_reward": 0.5405636463610947, + "rewards/mask_iou_reward": 0.27028182318054733, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.2065917253494263, + "rewards/thk_ans_format_reward": 1.0, + "step": 523, + "think_completion_length": 15.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.8125, + "epoch": 1.768971332209106, + "grad_norm": 8.026721438225874, + "kl": 0.5068359375, + "learning_rate": 8.524774774774775e-07, + "loss": 0.0005, + "reward": 3.1504483222961426, + "reward_std": 0.17338748276233673, + "rewards/final_reward": 1.2888089465669585, + "rewards/mask_iou_reward": 0.6444044732834793, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.160864770412445, + "rewards/thk_ans_format_reward": 1.0, + "step": 524, + "think_completion_length": 15.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.48958587646484, + "epoch": 1.7723440134907251, + "grad_norm": 13.596240992474886, + "kl": 0.4853515625, + "learning_rate": 8.521959459459459e-07, + "loss": 0.0005, + "reward": 3.5323015451431274, + "reward_std": 0.13418062031269073, + "rewards/final_reward": 1.7019234677183674, + "rewards/mask_iou_reward": 0.8509617338591837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5323014855384827, + "rewards/thk_ans_format_reward": 1.0, + "step": 525, + "think_completion_length": 15.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.375, + "epoch": 1.7757166947723442, + "grad_norm": 11.637227110484277, + "kl": 0.4384765625, + "learning_rate": 8.519144144144144e-07, + "loss": 0.0004, + "reward": 3.1013338565826416, + "reward_std": 0.18526217341423035, + "rewards/final_reward": 1.5633311583878755, + "rewards/mask_iou_reward": 0.7816655791939378, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1221671551465988, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 526, + "think_completion_length": 15.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.42708587646484, + "epoch": 1.779089376053963, + "grad_norm": 18.429457228413558, + "kl": 0.4365234375, + "learning_rate": 8.516328828828829e-07, + "loss": 0.0004, + "reward": 3.297514796257019, + "reward_std": 0.15136384963989258, + "rewards/final_reward": 1.268170770882234, + "rewards/mask_iou_reward": 0.634085385441117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2975147366523743, + "rewards/thk_ans_format_reward": 1.0, + "step": 527, + "think_completion_length": 18.916666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.92708587646484, + "epoch": 1.7824620573355818, + "grad_norm": 5.886426929840268, + "kl": 0.521484375, + "learning_rate": 8.513513513513513e-07, + "loss": 0.0005, + "reward": 3.117105484008789, + "reward_std": 0.31912093609571457, + "rewards/final_reward": 1.445791756163469, + "rewards/mask_iou_reward": 0.7228958780817345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1171055436134338, + "rewards/thk_ans_format_reward": 1.0, + "step": 528, + "think_completion_length": 20.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.76041793823242, + "epoch": 1.7858347386172007, + "grad_norm": 99.15974140991527, + "kl": 0.484375, + "learning_rate": 8.510698198198198e-07, + "loss": 0.0005, + "reward": 3.0164090394973755, + "reward_std": 0.2540616989135742, + "rewards/final_reward": 0.9807436103667273, + "rewards/mask_iou_reward": 0.49037180518336365, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.016409009695053, + "rewards/thk_ans_format_reward": 1.0, + "step": 529, + "think_completion_length": 14.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.61458587646484, + "epoch": 1.7892074198988195, + "grad_norm": 12.846402119311383, + "kl": 0.49609375, + "learning_rate": 8.507882882882883e-07, + "loss": 0.0005, + "reward": 3.278444290161133, + "reward_std": 0.19332800060510635, + "rewards/final_reward": 1.4669976720066265, + "rewards/mask_iou_reward": 0.7334988360033132, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2784444093704224, + "rewards/thk_ans_format_reward": 1.0, + "step": 530, + "think_completion_length": 18.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.89583587646484, + "epoch": 1.7925801011804383, + "grad_norm": 50.20147304166135, + "kl": 0.748046875, + "learning_rate": 8.505067567567568e-07, + "loss": 0.0008, + "reward": 3.725192904472351, + "reward_std": 0.09113395772874355, + "rewards/final_reward": 1.8154955316875, + "rewards/mask_iou_reward": 0.90774776584375, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.725192904472351, + "rewards/thk_ans_format_reward": 1.0, + "step": 531, + "think_completion_length": 21.916666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.03125381469727, + "epoch": 1.7959527824620574, + "grad_norm": 10.161187398861516, + "kl": 0.4833984375, + "learning_rate": 8.502252252252253e-07, + "loss": 0.0005, + "reward": 3.1416884660720825, + "reward_std": 0.08379896730184555, + "rewards/final_reward": 0.6741074384144312, + "rewards/mask_iou_reward": 0.3370537192072156, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1416882276535034, + "rewards/thk_ans_format_reward": 1.0, + "step": 532, + "think_completion_length": 18.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.19791793823242, + "epoch": 1.7993254637436762, + "grad_norm": 11.261171569594273, + "kl": 0.4853515625, + "learning_rate": 8.499436936936937e-07, + "loss": 0.0005, + "reward": 2.8111319541931152, + "reward_std": 0.2083485722541809, + "rewards/final_reward": 1.0420207367476402, + "rewards/mask_iou_reward": 0.5210103683738201, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.8319653868675232, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 533, + "think_completion_length": 22.416666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.73958587646484, + "epoch": 1.8026981450252952, + "grad_norm": 6.990923161253162, + "kl": 0.5390625, + "learning_rate": 8.496621621621621e-07, + "loss": 0.0006, + "reward": 3.4891117811203003, + "reward_std": 0.1246962659060955, + "rewards/final_reward": 1.4770193974135732, + "rewards/mask_iou_reward": 0.7385096987067866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4891117215156555, + "rewards/thk_ans_format_reward": 1.0, + "step": 534, + "think_completion_length": 21.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.36458587646484, + "epoch": 1.806070826306914, + "grad_norm": 6.968802100285715, + "kl": 0.5068359375, + "learning_rate": 8.493806306306305e-07, + "loss": 0.0005, + "reward": 2.9380651712417603, + "reward_std": 0.1562102735042572, + "rewards/final_reward": 1.3316888883683826, + "rewards/mask_iou_reward": 0.6658444441841913, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9380651414394379, + "rewards/thk_ans_format_reward": 1.0, + "step": 535, + "think_completion_length": 19.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 79.26041793823242, + "epoch": 1.809443507588533, + "grad_norm": 12.590632039643781, + "kl": 0.4951171875, + "learning_rate": 8.49099099099099e-07, + "loss": 0.0005, + "reward": 3.259642004966736, + "reward_std": 0.1716170459985733, + "rewards/final_reward": 0.9560325567170211, + "rewards/mask_iou_reward": 0.47801627835851057, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2596420645713806, + "rewards/thk_ans_format_reward": 1.0, + "step": 536, + "think_completion_length": 18.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.65625, + "epoch": 1.8128161888701517, + "grad_norm": 10.64822504901881, + "kl": 0.546875, + "learning_rate": 8.488175675675675e-07, + "loss": 0.0005, + "reward": 3.2001864910125732, + "reward_std": 0.17615430057048798, + "rewards/final_reward": 1.3802497216416596, + "rewards/mask_iou_reward": 0.6901248608208298, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.2210198640823364, + "rewards/thk_ans_format_reward": 1.0, + "step": 537, + "think_completion_length": 19.291666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.92708587646484, + "epoch": 1.8161888701517706, + "grad_norm": 8.766553578424087, + "kl": 0.4453125, + "learning_rate": 8.485360360360359e-07, + "loss": 0.0004, + "reward": 3.3197951316833496, + "reward_std": 0.2897758111357689, + "rewards/final_reward": 1.2185051844394084, + "rewards/mask_iou_reward": 0.6092525922197042, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3197951316833496, + "rewards/thk_ans_format_reward": 1.0, + "step": 538, + "think_completion_length": 19.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.87500381469727, + "epoch": 1.8195615514333894, + "grad_norm": 8.711449233550052, + "kl": 0.4384765625, + "learning_rate": 8.482545045045044e-07, + "loss": 0.0004, + "reward": 3.3355772495269775, + "reward_std": 0.14928391575813293, + "rewards/final_reward": 1.659850050810137, + "rewards/mask_iou_reward": 0.8299250254050685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.335577130317688, + "rewards/thk_ans_format_reward": 1.0, + "step": 539, + "think_completion_length": 21.541666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.375, + "epoch": 1.8229342327150084, + "grad_norm": 8.527152093675014, + "kl": 0.5087890625, + "learning_rate": 8.47972972972973e-07, + "loss": 0.0005, + "reward": 3.0749590396881104, + "reward_std": 0.2024538516998291, + "rewards/final_reward": 1.164322524556697, + "rewards/mask_iou_reward": 0.5821612622783485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0749590396881104, + "rewards/thk_ans_format_reward": 1.0, + "step": 540, + "think_completion_length": 20.583333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 84.73958587646484, + "epoch": 1.8263069139966275, + "grad_norm": 17.30409683070416, + "kl": 0.513671875, + "learning_rate": 8.476914414414414e-07, + "loss": 0.0005, + "reward": 3.2158886194229126, + "reward_std": 0.15056533366441727, + "rewards/final_reward": 1.4837446194475552, + "rewards/mask_iou_reward": 0.7418723097237776, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2158884406089783, + "rewards/thk_ans_format_reward": 1.0, + "step": 541, + "think_completion_length": 19.958333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 85.14583587646484, + "epoch": 1.8296795952782463, + "grad_norm": 19.940723616089134, + "kl": 0.4716796875, + "learning_rate": 8.474099099099099e-07, + "loss": 0.0005, + "reward": 3.301823854446411, + "reward_std": 0.15247973054647446, + "rewards/final_reward": 1.0294040972133638, + "rewards/mask_iou_reward": 0.5147020486066819, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.301823914051056, + "rewards/thk_ans_format_reward": 1.0, + "step": 542, + "think_completion_length": 23.291666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.6875, + "epoch": 1.8330522765598651, + "grad_norm": 7.025931078759855, + "kl": 0.5224609375, + "learning_rate": 8.471283783783783e-07, + "loss": 0.0005, + "reward": 3.1780710220336914, + "reward_std": 0.0799750704318285, + "rewards/final_reward": 0.8655492573961091, + "rewards/mask_iou_reward": 0.43277462869805455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1780711114406586, + "rewards/thk_ans_format_reward": 1.0, + "step": 543, + "think_completion_length": 24.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.25000381469727, + "epoch": 1.836424957841484, + "grad_norm": 8.697002105342502, + "kl": 0.4951171875, + "learning_rate": 8.468468468468468e-07, + "loss": 0.0005, + "reward": 3.0120667219161987, + "reward_std": 0.18368937820196152, + "rewards/final_reward": 0.4529770983270571, + "rewards/mask_iou_reward": 0.22648854916352856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.012066513299942, + "rewards/thk_ans_format_reward": 1.0, + "step": 544, + "think_completion_length": 24.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.69792175292969, + "epoch": 1.8397976391231028, + "grad_norm": 16.1047685273957, + "kl": 0.53125, + "learning_rate": 8.465653153153153e-07, + "loss": 0.0005, + "reward": 3.142224669456482, + "reward_std": 0.1355418637394905, + "rewards/final_reward": 1.393036506519874, + "rewards/mask_iou_reward": 0.696518253259937, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1422248184680939, + "rewards/thk_ans_format_reward": 1.0, + "step": 545, + "think_completion_length": 21.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.76042175292969, + "epoch": 1.8431703204047216, + "grad_norm": 14.620952623904842, + "kl": 0.5224609375, + "learning_rate": 8.462837837837837e-07, + "loss": 0.0005, + "reward": 3.158683657646179, + "reward_std": 0.30953148007392883, + "rewards/final_reward": 0.8971900857425309, + "rewards/mask_iou_reward": 0.44859504287126545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1586836576461792, + "rewards/thk_ans_format_reward": 1.0, + "step": 546, + "think_completion_length": 18.666666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 87.78125, + "epoch": 1.8465430016863407, + "grad_norm": 10.621494015912932, + "kl": 0.529296875, + "learning_rate": 8.460022522522522e-07, + "loss": 0.0005, + "reward": 2.7198134660720825, + "reward_std": 0.30272846668958664, + "rewards/final_reward": 1.0138567852360123, + "rewards/mask_iou_reward": 0.5069283926180062, + "rewards/sam_format_reward": 0.90625, + "rewards/sam_reward_func_ultra": 0.8135634064674377, + "rewards/thk_ans_format_reward": 1.0, + "step": 547, + "think_completion_length": 21.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.03125381469727, + "epoch": 1.8499156829679595, + "grad_norm": 8.631107282509562, + "kl": 0.541015625, + "learning_rate": 8.457207207207206e-07, + "loss": 0.0006, + "reward": 3.328859329223633, + "reward_std": 0.07117979228496552, + "rewards/final_reward": 1.3748636443975273, + "rewards/mask_iou_reward": 0.6874318221987636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3288592994213104, + "rewards/thk_ans_format_reward": 1.0, + "step": 548, + "think_completion_length": 24.166666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.85417175292969, + "epoch": 1.8532883642495785, + "grad_norm": 29.54763154400814, + "kl": 0.455078125, + "learning_rate": 8.454391891891891e-07, + "loss": 0.0005, + "reward": 3.215111494064331, + "reward_std": 0.1470106765627861, + "rewards/final_reward": 1.3981518771005164, + "rewards/mask_iou_reward": 0.6990759385502582, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2255281805992126, + "rewards/thk_ans_format_reward": 1.0, + "step": 549, + "think_completion_length": 22.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.35416793823242, + "epoch": 1.8566610455311974, + "grad_norm": 15.323512729927169, + "kl": 0.474609375, + "learning_rate": 8.451576576576577e-07, + "loss": 0.0005, + "reward": 3.0582566261291504, + "reward_std": 0.16113968193531036, + "rewards/final_reward": 0.47448022909594245, + "rewards/mask_iou_reward": 0.23724011454797123, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0582563877105713, + "rewards/thk_ans_format_reward": 1.0, + "step": 550, + "think_completion_length": 22.708333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.36458969116211, + "epoch": 1.8600337268128162, + "grad_norm": 6.6363475425215706, + "kl": 0.46484375, + "learning_rate": 8.448761261261261e-07, + "loss": 0.0005, + "reward": 3.2980759143829346, + "reward_std": 0.12696680054068565, + "rewards/final_reward": 1.476385199323587, + "rewards/mask_iou_reward": 0.7381925996617935, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2980758547782898, + "rewards/thk_ans_format_reward": 1.0, + "step": 551, + "think_completion_length": 19.916666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.5, + "epoch": 1.863406408094435, + "grad_norm": 10.793882018166471, + "kl": 0.501953125, + "learning_rate": 8.445945945945946e-07, + "loss": 0.0005, + "reward": 3.0894296169281006, + "reward_std": 0.14166902750730515, + "rewards/final_reward": 0.6742415648528425, + "rewards/mask_iou_reward": 0.33712078242642124, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0894296169281006, + "rewards/thk_ans_format_reward": 1.0, + "step": 552, + "think_completion_length": 22.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.26041793823242, + "epoch": 1.8667790893760539, + "grad_norm": 8.964690189516379, + "kl": 0.4306640625, + "learning_rate": 8.44313063063063e-07, + "loss": 0.0004, + "reward": 3.16372811794281, + "reward_std": 0.3229072540998459, + "rewards/final_reward": 1.920765938205148, + "rewards/mask_iou_reward": 0.960382969102574, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1637282073497772, + "rewards/thk_ans_format_reward": 1.0, + "step": 553, + "think_completion_length": 19.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.62500381469727, + "epoch": 1.8701517706576727, + "grad_norm": 10.639129220312041, + "kl": 0.45703125, + "learning_rate": 8.440315315315315e-07, + "loss": 0.0005, + "reward": 3.0981560945510864, + "reward_std": 0.10398751497268677, + "rewards/final_reward": 1.3976912835931667, + "rewards/mask_iou_reward": 0.6988456417965834, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.098155915737152, + "rewards/thk_ans_format_reward": 1.0, + "step": 554, + "think_completion_length": 18.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.68750381469727, + "epoch": 1.8735244519392917, + "grad_norm": 11.375196091996045, + "kl": 0.41015625, + "learning_rate": 8.4375e-07, + "loss": 0.0004, + "reward": 3.0671032667160034, + "reward_std": 0.08987793326377869, + "rewards/final_reward": 0.9952254414524675, + "rewards/mask_iou_reward": 0.49761272072623375, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0671032667160034, + "rewards/thk_ans_format_reward": 1.0, + "step": 555, + "think_completion_length": 20.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.7291717529297, + "epoch": 1.8768971332209108, + "grad_norm": 7.375784396473635, + "kl": 0.4189453125, + "learning_rate": 8.434684684684684e-07, + "loss": 0.0004, + "reward": 3.3185118436813354, + "reward_std": 0.1634932905435562, + "rewards/final_reward": 0.7958873993629867, + "rewards/mask_iou_reward": 0.39794369968149335, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.318511962890625, + "rewards/thk_ans_format_reward": 1.0, + "step": 556, + "think_completion_length": 25.458333333333336 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.2916717529297, + "epoch": 1.8802698145025296, + "grad_norm": 13.869981408595814, + "kl": 0.390625, + "learning_rate": 8.431869369369369e-07, + "loss": 0.0004, + "reward": 3.16398823261261, + "reward_std": 0.21535015106201172, + "rewards/final_reward": 0.7045846438755945, + "rewards/mask_iou_reward": 0.35229232193779725, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1639882326126099, + "rewards/thk_ans_format_reward": 1.0, + "step": 557, + "think_completion_length": 25.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.97916793823242, + "epoch": 1.8836424957841484, + "grad_norm": 16.455273669557467, + "kl": 0.5283203125, + "learning_rate": 8.429054054054054e-07, + "loss": 0.0005, + "reward": 3.16058886051178, + "reward_std": 0.3432590663433075, + "rewards/final_reward": 1.1915005029036203, + "rewards/mask_iou_reward": 0.5957502514518102, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1710055470466614, + "rewards/thk_ans_format_reward": 1.0, + "step": 558, + "think_completion_length": 17.666666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5729217529297, + "epoch": 1.8870151770657673, + "grad_norm": 36.217949351883604, + "kl": 0.3828125, + "learning_rate": 8.426238738738738e-07, + "loss": 0.0004, + "reward": 3.172240138053894, + "reward_std": 0.2821786254644394, + "rewards/final_reward": 1.4714988759551624, + "rewards/mask_iou_reward": 0.7357494379775812, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1826567649841309, + "rewards/thk_ans_format_reward": 1.0, + "step": 559, + "think_completion_length": 22.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.45833587646484, + "epoch": 1.890387858347386, + "grad_norm": 8.45836576322927, + "kl": 0.384765625, + "learning_rate": 8.423423423423423e-07, + "loss": 0.0004, + "reward": 3.2458680868148804, + "reward_std": 0.10261016711592674, + "rewards/final_reward": 1.7873590346464214, + "rewards/mask_iou_reward": 0.8936795173232107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2458679676055908, + "rewards/thk_ans_format_reward": 1.0, + "step": 560, + "think_completion_length": 19.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6041717529297, + "epoch": 1.893760539629005, + "grad_norm": 9.87017521422623, + "kl": 0.3740234375, + "learning_rate": 8.420608108108108e-07, + "loss": 0.0004, + "reward": 3.1932257413864136, + "reward_std": 0.2400343120098114, + "rewards/final_reward": 1.5858177346376892, + "rewards/mask_iou_reward": 0.7929088673188446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1932256817817688, + "rewards/thk_ans_format_reward": 1.0, + "step": 561, + "think_completion_length": 18.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.7916717529297, + "epoch": 1.897133220910624, + "grad_norm": 8.911821171091752, + "kl": 0.41015625, + "learning_rate": 8.417792792792793e-07, + "loss": 0.0004, + "reward": 3.3439966440200806, + "reward_std": 0.11329784244298935, + "rewards/final_reward": 1.786571429457327, + "rewards/mask_iou_reward": 0.8932857147286635, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3439966440200806, + "rewards/thk_ans_format_reward": 1.0, + "step": 562, + "think_completion_length": 16.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.13541793823242, + "epoch": 1.9005059021922428, + "grad_norm": 22.033788659128657, + "kl": 0.5859375, + "learning_rate": 8.414977477477478e-07, + "loss": 0.0006, + "reward": 3.2525041103363037, + "reward_std": 0.1615203619003296, + "rewards/final_reward": 1.3467275082537487, + "rewards/mask_iou_reward": 0.6733637541268743, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.3254209160804749, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 563, + "think_completion_length": 19.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.3958396911621, + "epoch": 1.9038785834738619, + "grad_norm": 5.99702068619869, + "kl": 0.4228515625, + "learning_rate": 8.412162162162162e-07, + "loss": 0.0004, + "reward": 2.9900684356689453, + "reward_std": 0.2933308109641075, + "rewards/final_reward": 1.1604803190239532, + "rewards/mask_iou_reward": 0.5802401595119766, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.0109014511108398, + "rewards/thk_ans_format_reward": 1.0, + "step": 564, + "think_completion_length": 17.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5104217529297, + "epoch": 1.9072512647554807, + "grad_norm": 36.300540211403195, + "kl": 0.439453125, + "learning_rate": 8.409346846846847e-07, + "loss": 0.0004, + "reward": 3.008383631706238, + "reward_std": 0.3261030241847038, + "rewards/final_reward": 1.6861075354920192, + "rewards/mask_iou_reward": 0.8430537677460096, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.0604670345783234, + "rewards/thk_ans_format_reward": 1.0, + "step": 565, + "think_completion_length": 19.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.00000381469727, + "epoch": 1.9106239460370995, + "grad_norm": 8.11030374288559, + "kl": 0.4375, + "learning_rate": 8.406531531531531e-07, + "loss": 0.0004, + "reward": 3.056068181991577, + "reward_std": 0.3603939563035965, + "rewards/final_reward": 1.7098159849058376, + "rewards/mask_iou_reward": 0.8549079924529188, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.0873180031776428, + "rewards/thk_ans_format_reward": 1.0, + "step": 566, + "think_completion_length": 15.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.27083587646484, + "epoch": 1.9139966273187183, + "grad_norm": 11.099614270228152, + "kl": 0.380859375, + "learning_rate": 8.403716216216216e-07, + "loss": 0.0004, + "reward": 3.1362944841384888, + "reward_std": 0.16306444257497787, + "rewards/final_reward": 1.3717985459900603, + "rewards/mask_iou_reward": 0.6858992729950302, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.146710753440857, + "rewards/thk_ans_format_reward": 1.0, + "step": 567, + "think_completion_length": 19.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.4375, + "epoch": 1.9173693086003372, + "grad_norm": 7.039001675126875, + "kl": 0.396484375, + "learning_rate": 8.400900900900901e-07, + "loss": 0.0004, + "reward": 3.3202935457229614, + "reward_std": 0.16550321877002716, + "rewards/final_reward": 1.3515027226349199, + "rewards/mask_iou_reward": 0.6757513613174599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3202936053276062, + "rewards/thk_ans_format_reward": 1.0, + "step": 568, + "think_completion_length": 20.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.95833587646484, + "epoch": 1.920741989881956, + "grad_norm": 17.834134911337642, + "kl": 0.4169921875, + "learning_rate": 8.398085585585585e-07, + "loss": 0.0004, + "reward": 3.235255718231201, + "reward_std": 0.15589579567313194, + "rewards/final_reward": 1.129114600213489, + "rewards/mask_iou_reward": 0.5645573001067445, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2352555394172668, + "rewards/thk_ans_format_reward": 1.0, + "step": 569, + "think_completion_length": 16.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.46875762939453, + "epoch": 1.924114671163575, + "grad_norm": 12.902306834957399, + "kl": 0.3740234375, + "learning_rate": 8.39527027027027e-07, + "loss": 0.0004, + "reward": 3.1055132150650024, + "reward_std": 0.3345019519329071, + "rewards/final_reward": 0.8178962587305949, + "rewards/mask_iou_reward": 0.40894812936529745, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.115929663181305, + "rewards/thk_ans_format_reward": 1.0, + "step": 570, + "think_completion_length": 16.791666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.9479217529297, + "epoch": 1.927487352445194, + "grad_norm": 5.639968467120446, + "kl": 0.408203125, + "learning_rate": 8.392454954954956e-07, + "loss": 0.0004, + "reward": 3.051029324531555, + "reward_std": 0.21297653764486313, + "rewards/final_reward": 1.22482687311132, + "rewards/mask_iou_reward": 0.61241343655566, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.082279086112976, + "rewards/thk_ans_format_reward": 1.0, + "step": 571, + "think_completion_length": 19.666666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.46875381469727, + "epoch": 1.930860033726813, + "grad_norm": 13.607824062123045, + "kl": 0.4013671875, + "learning_rate": 8.38963963963964e-07, + "loss": 0.0004, + "reward": 3.496493697166443, + "reward_std": 0.30988840758800507, + "rewards/final_reward": 1.2803013559711691, + "rewards/mask_iou_reward": 0.6401506779855846, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5173268914222717, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 572, + "think_completion_length": 19.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.55208587646484, + "epoch": 1.9342327150084317, + "grad_norm": 8.18383905891403, + "kl": 0.431640625, + "learning_rate": 8.386824324324324e-07, + "loss": 0.0005, + "reward": 3.22155499458313, + "reward_std": 0.1360001638531685, + "rewards/final_reward": 1.4888304391906995, + "rewards/mask_iou_reward": 0.7444152195953497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2215549945831299, + "rewards/thk_ans_format_reward": 1.0, + "step": 573, + "think_completion_length": 15.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.17708587646484, + "epoch": 1.9376053962900506, + "grad_norm": 13.28590429258825, + "kl": 0.4375, + "learning_rate": 8.384009009009008e-07, + "loss": 0.0004, + "reward": 3.196623682975769, + "reward_std": 0.27096298336982727, + "rewards/final_reward": 0.6739398318220715, + "rewards/mask_iou_reward": 0.33696991591103576, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.2487069964408875, + "rewards/thk_ans_format_reward": 1.0, + "step": 574, + "think_completion_length": 14.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.22917175292969, + "epoch": 1.9409780775716694, + "grad_norm": 17.936727518963696, + "kl": 0.4462890625, + "learning_rate": 8.381193693693693e-07, + "loss": 0.0004, + "reward": 3.063702344894409, + "reward_std": 0.3580833673477173, + "rewards/final_reward": 1.254804599144204, + "rewards/mask_iou_reward": 0.627402299572102, + "rewards/sam_format_reward": 0.90625, + "rewards/sam_reward_func_ultra": 1.1574523150920868, + "rewards/thk_ans_format_reward": 1.0, + "step": 575, + "think_completion_length": 17.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.125, + "epoch": 1.9443507588532882, + "grad_norm": 9.001644590818499, + "kl": 0.369140625, + "learning_rate": 8.378378378378377e-07, + "loss": 0.0004, + "reward": 3.3422189950942993, + "reward_std": 0.1806359961628914, + "rewards/final_reward": 1.3741094642864033, + "rewards/mask_iou_reward": 0.6870547321432017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3422189950942993, + "rewards/thk_ans_format_reward": 1.0, + "step": 576, + "think_completion_length": 16.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.11458587646484, + "epoch": 1.9477234401349073, + "grad_norm": 6.693206561854366, + "kl": 0.3984375, + "learning_rate": 8.375563063063062e-07, + "loss": 0.0004, + "reward": 3.150846838951111, + "reward_std": 0.23167786747217178, + "rewards/final_reward": 1.1163596449199726, + "rewards/mask_iou_reward": 0.5581798224599863, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.182096779346466, + "rewards/thk_ans_format_reward": 1.0, + "step": 577, + "think_completion_length": 16.041666666666664 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.27083587646484, + "epoch": 1.951096121416526, + "grad_norm": 8.591423372545021, + "kl": 0.4287109375, + "learning_rate": 8.372747747747747e-07, + "loss": 0.0004, + "reward": 2.970146656036377, + "reward_std": 0.18085888028144836, + "rewards/final_reward": 0.30750705059321537, + "rewards/mask_iou_reward": 0.15375352529660768, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.970146656036377, + "rewards/thk_ans_format_reward": 1.0, + "step": 578, + "think_completion_length": 14.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.98958587646484, + "epoch": 1.9544688026981452, + "grad_norm": 68.40018713629142, + "kl": 0.408203125, + "learning_rate": 8.369932432432431e-07, + "loss": 0.0004, + "reward": 3.5376436710357666, + "reward_std": 0.07944156974554062, + "rewards/final_reward": 1.7256402832301423, + "rewards/mask_iou_reward": 0.8628201416150711, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5376437306404114, + "rewards/thk_ans_format_reward": 1.0, + "step": 579, + "think_completion_length": 19.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.56250381469727, + "epoch": 1.957841483979764, + "grad_norm": 14.388456789886524, + "kl": 0.43359375, + "learning_rate": 8.367117117117116e-07, + "loss": 0.0004, + "reward": 3.4017832279205322, + "reward_std": 0.14799121022224426, + "rewards/final_reward": 0.722340396741691, + "rewards/mask_iou_reward": 0.3611701983708455, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.401783049106598, + "rewards/thk_ans_format_reward": 1.0, + "step": 580, + "think_completion_length": 14.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.58333587646484, + "epoch": 1.9612141652613828, + "grad_norm": 24.806741406182244, + "kl": 0.4228515625, + "learning_rate": 8.364301801801802e-07, + "loss": 0.0004, + "reward": 3.232166051864624, + "reward_std": 0.05064544826745987, + "rewards/final_reward": 0.9439742251988625, + "rewards/mask_iou_reward": 0.47198711259943127, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2321660816669464, + "rewards/thk_ans_format_reward": 1.0, + "step": 581, + "think_completion_length": 15.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.06250381469727, + "epoch": 1.9645868465430016, + "grad_norm": 4.889973371546044, + "kl": 0.6435546875, + "learning_rate": 8.361486486486486e-07, + "loss": 0.0006, + "reward": 3.579144597053528, + "reward_std": 0.09146898984909058, + "rewards/final_reward": 1.687550136151648, + "rewards/mask_iou_reward": 0.843775068075824, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.579144537448883, + "rewards/thk_ans_format_reward": 1.0, + "step": 582, + "think_completion_length": 16.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.56250381469727, + "epoch": 1.9679595278246205, + "grad_norm": 5.160628487146692, + "kl": 0.4306640625, + "learning_rate": 8.358671171171171e-07, + "loss": 0.0004, + "reward": 3.189295530319214, + "reward_std": 0.21881115436553955, + "rewards/final_reward": 1.0328786095998268, + "rewards/mask_iou_reward": 0.5164393047999134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1892955303192139, + "rewards/thk_ans_format_reward": 1.0, + "step": 583, + "think_completion_length": 14.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.47917175292969, + "epoch": 1.9713322091062393, + "grad_norm": 11.902639843711885, + "kl": 0.384765625, + "learning_rate": 8.355855855855855e-07, + "loss": 0.0004, + "reward": 2.815028190612793, + "reward_std": 0.3476174771785736, + "rewards/final_reward": 1.0217865616110187, + "rewards/mask_iou_reward": 0.5108932808055093, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.8254446983337402, + "rewards/thk_ans_format_reward": 1.0, + "step": 584, + "think_completion_length": 13.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.875, + "epoch": 1.9747048903878583, + "grad_norm": 41.89633117528405, + "kl": 0.4580078125, + "learning_rate": 8.35304054054054e-07, + "loss": 0.0005, + "reward": 3.0252166986465454, + "reward_std": 0.12799861282110214, + "rewards/final_reward": 0.8591126518569622, + "rewards/mask_iou_reward": 0.4295563259284811, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0252164900302887, + "rewards/thk_ans_format_reward": 1.0, + "step": 585, + "think_completion_length": 14.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.14583587646484, + "epoch": 1.9780775716694774, + "grad_norm": 8.279356260712635, + "kl": 0.4189453125, + "learning_rate": 8.350225225225225e-07, + "loss": 0.0004, + "reward": 3.1389033794403076, + "reward_std": 0.17402781546115875, + "rewards/final_reward": 0.7781741211554355, + "rewards/mask_iou_reward": 0.38908706057771775, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1389034986495972, + "rewards/thk_ans_format_reward": 1.0, + "step": 586, + "think_completion_length": 14.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.3229217529297, + "epoch": 1.9814502529510962, + "grad_norm": 7.239676615842828, + "kl": 0.421875, + "learning_rate": 8.347409909909909e-07, + "loss": 0.0004, + "reward": 3.356380581855774, + "reward_std": 0.1445971019566059, + "rewards/final_reward": 1.6324843258097554, + "rewards/mask_iou_reward": 0.8162421629048777, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.356380581855774, + "rewards/thk_ans_format_reward": 1.0, + "step": 587, + "think_completion_length": 13.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.72916793823242, + "epoch": 1.984822934232715, + "grad_norm": 6.624462864165187, + "kl": 0.626953125, + "learning_rate": 8.344594594594594e-07, + "loss": 0.0006, + "reward": 3.1781363487243652, + "reward_std": 0.18783046305179596, + "rewards/final_reward": 1.0586905247523806, + "rewards/mask_iou_reward": 0.5293452623761903, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1781366467475891, + "rewards/thk_ans_format_reward": 1.0, + "step": 588, + "think_completion_length": 14.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.95833587646484, + "epoch": 1.9881956155143339, + "grad_norm": 6.657899573691945, + "kl": 0.66796875, + "learning_rate": 8.341779279279278e-07, + "loss": 0.0007, + "reward": 3.361886739730835, + "reward_std": 0.10654079541563988, + "rewards/final_reward": 1.6515764617191033, + "rewards/mask_iou_reward": 0.8257882308595517, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.361886739730835, + "rewards/thk_ans_format_reward": 1.0, + "step": 589, + "think_completion_length": 16.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.50000381469727, + "epoch": 1.9915682967959527, + "grad_norm": 7.385702122693169, + "kl": 0.4765625, + "learning_rate": 8.338963963963963e-07, + "loss": 0.0005, + "reward": 3.151884913444519, + "reward_std": 0.2869477644562721, + "rewards/final_reward": 1.5238424558101884, + "rewards/mask_iou_reward": 0.7619212279050942, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1518847942352295, + "rewards/thk_ans_format_reward": 1.0, + "step": 590, + "think_completion_length": 16.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.81250381469727, + "epoch": 1.9949409780775715, + "grad_norm": 14.355643385177684, + "kl": 0.443359375, + "learning_rate": 8.336148648648649e-07, + "loss": 0.0004, + "reward": 3.2977166175842285, + "reward_std": 0.2152240127325058, + "rewards/final_reward": 0.6568418408176202, + "rewards/mask_iou_reward": 0.3284209204088101, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2977163195610046, + "rewards/thk_ans_format_reward": 1.0, + "step": 591, + "think_completion_length": 16.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.15789413452148, + "epoch": 1.9983136593591906, + "grad_norm": 10.702373666078783, + "kl": 0.390625, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0004, + "reward": 3.004240393638611, + "reward_std": 0.21375977247953415, + "rewards/final_reward": 0.6922338710796716, + "rewards/mask_iou_reward": 0.3461169355398358, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0042401552200317, + "rewards/thk_ans_format_reward": 1.0, + "step": 592, + "think_completion_length": 13.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.15625, + "epoch": 2.003372681281619, + "grad_norm": 7.316613913101059, + "kl": 0.494140625, + "learning_rate": 8.330518018018018e-07, + "loss": 0.0005, + "reward": 3.213071584701538, + "reward_std": 0.21354631334543228, + "rewards/final_reward": 1.085932180658682, + "rewards/mask_iou_reward": 0.542966090329341, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.223488211631775, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 593, + "think_completion_length": 10.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.83333587646484, + "epoch": 2.0067453625632377, + "grad_norm": 17.99254075390951, + "kl": 0.4375, + "learning_rate": 8.327702702702703e-07, + "loss": 0.0004, + "reward": 3.254488706588745, + "reward_std": 0.16734839975833893, + "rewards/final_reward": 0.6797857897801465, + "rewards/mask_iou_reward": 0.33989289489007324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2544885873794556, + "rewards/thk_ans_format_reward": 1.0, + "step": 594, + "think_completion_length": 12.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.27083587646484, + "epoch": 2.0101180438448565, + "grad_norm": 6.0033516691706, + "kl": 0.4580078125, + "learning_rate": 8.324887387387387e-07, + "loss": 0.0005, + "reward": 3.2571709156036377, + "reward_std": 0.15888730436563492, + "rewards/final_reward": 1.4449123377187612, + "rewards/mask_iou_reward": 0.7224561688593806, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2571707367897034, + "rewards/thk_ans_format_reward": 1.0, + "step": 595, + "think_completion_length": 11.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.65625381469727, + "epoch": 2.0134907251264758, + "grad_norm": 11.736645609380771, + "kl": 0.427734375, + "learning_rate": 8.322072072072072e-07, + "loss": 0.0004, + "reward": 3.4008623361587524, + "reward_std": 0.19675985723733902, + "rewards/final_reward": 1.419864947487815, + "rewards/mask_iou_reward": 0.7099324737439076, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4008623361587524, + "rewards/thk_ans_format_reward": 1.0, + "step": 596, + "think_completion_length": 10.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.26041793823242, + "epoch": 2.0168634064080946, + "grad_norm": 7.651004215995592, + "kl": 0.45703125, + "learning_rate": 8.319256756756756e-07, + "loss": 0.0005, + "reward": 3.1595300436019897, + "reward_std": 0.16874928027391434, + "rewards/final_reward": 1.1975441458176574, + "rewards/mask_iou_reward": 0.5987720729088287, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1595301032066345, + "rewards/thk_ans_format_reward": 1.0, + "step": 597, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.68750381469727, + "epoch": 2.0202360876897134, + "grad_norm": 7.692105896132088, + "kl": 0.38671875, + "learning_rate": 8.316441441441441e-07, + "loss": 0.0004, + "reward": 3.4031869173049927, + "reward_std": 0.1947040967643261, + "rewards/final_reward": 1.5671920196850815, + "rewards/mask_iou_reward": 0.7835960098425407, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4031866192817688, + "rewards/thk_ans_format_reward": 1.0, + "step": 598, + "think_completion_length": 11.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.23958587646484, + "epoch": 2.0236087689713322, + "grad_norm": 5.199739352470058, + "kl": 0.6396484375, + "learning_rate": 8.313626126126126e-07, + "loss": 0.0006, + "reward": 3.4093559980392456, + "reward_std": 0.10994856804609299, + "rewards/final_reward": 0.9188244675614923, + "rewards/mask_iou_reward": 0.45941223378074614, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4093559384346008, + "rewards/thk_ans_format_reward": 1.0, + "step": 599, + "think_completion_length": 10.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.23958587646484, + "epoch": 2.026981450252951, + "grad_norm": 8.866087972734299, + "kl": 0.4931640625, + "learning_rate": 8.31081081081081e-07, + "loss": 0.0005, + "reward": 2.955493450164795, + "reward_std": 0.1533719301223755, + "rewards/final_reward": 1.0310845691694952, + "rewards/mask_iou_reward": 0.5155422845847476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.955493301153183, + "rewards/thk_ans_format_reward": 1.0, + "step": 600, + "think_completion_length": 11.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.33333587646484, + "epoch": 2.03035413153457, + "grad_norm": 10.027714875616592, + "kl": 0.484375, + "learning_rate": 8.307995495495496e-07, + "loss": 0.0005, + "reward": 3.1500658988952637, + "reward_std": 0.17688149958848953, + "rewards/final_reward": 1.3310248751677642, + "rewards/mask_iou_reward": 0.6655124375838821, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1500657796859741, + "rewards/thk_ans_format_reward": 1.0, + "step": 601, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.875, + "epoch": 2.0337268128161887, + "grad_norm": 15.868187188602219, + "kl": 0.4130859375, + "learning_rate": 8.30518018018018e-07, + "loss": 0.0004, + "reward": 3.543818950653076, + "reward_std": 0.22326519712805748, + "rewards/final_reward": 1.3907805272302842, + "rewards/mask_iou_reward": 0.6953902636151421, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5438188910484314, + "rewards/thk_ans_format_reward": 1.0, + "step": 602, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.53125, + "epoch": 2.0370994940978076, + "grad_norm": 22.880355331905218, + "kl": 0.4365234375, + "learning_rate": 8.302364864864865e-07, + "loss": 0.0004, + "reward": 2.813183307647705, + "reward_std": 0.23120269179344177, + "rewards/final_reward": 1.3932862144098808, + "rewards/mask_iou_reward": 0.6966431072049404, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.82359978556633, + "rewards/thk_ans_format_reward": 1.0, + "step": 603, + "think_completion_length": 11.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.14583587646484, + "epoch": 2.040472175379427, + "grad_norm": 52.891657910077754, + "kl": 0.439453125, + "learning_rate": 8.29954954954955e-07, + "loss": 0.0004, + "reward": 3.7011115550994873, + "reward_std": 0.1626211702823639, + "rewards/final_reward": 1.7655026499713529, + "rewards/mask_iou_reward": 0.8827513249856764, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.701111614704132, + "rewards/thk_ans_format_reward": 1.0, + "step": 604, + "think_completion_length": 10.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.50000381469727, + "epoch": 2.0438448566610457, + "grad_norm": 5.637244830932424, + "kl": 0.44921875, + "learning_rate": 8.296734234234234e-07, + "loss": 0.0005, + "reward": 3.6091989278793335, + "reward_std": 0.11021934449672699, + "rewards/final_reward": 1.5831235540128628, + "rewards/mask_iou_reward": 0.7915617770064314, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6091991662979126, + "rewards/thk_ans_format_reward": 1.0, + "step": 605, + "think_completion_length": 11.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.36458587646484, + "epoch": 2.0472175379426645, + "grad_norm": 27.7979225374541, + "kl": 0.4326171875, + "learning_rate": 8.293918918918919e-07, + "loss": 0.0004, + "reward": 3.490352988243103, + "reward_std": 0.23794641345739365, + "rewards/final_reward": 1.4083256625127278, + "rewards/mask_iou_reward": 0.7041628312563639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4903529286384583, + "rewards/thk_ans_format_reward": 1.0, + "step": 606, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.2604217529297, + "epoch": 2.0505902192242833, + "grad_norm": 10.320866046158933, + "kl": 0.4775390625, + "learning_rate": 8.291103603603603e-07, + "loss": 0.0005, + "reward": 3.5833810567855835, + "reward_std": 0.2443862035870552, + "rewards/final_reward": 1.7815358176192633, + "rewards/mask_iou_reward": 0.8907679088096316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5833812355995178, + "rewards/thk_ans_format_reward": 1.0, + "step": 607, + "think_completion_length": 12.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.50000381469727, + "epoch": 2.053962900505902, + "grad_norm": 21.575973813768552, + "kl": 0.51171875, + "learning_rate": 8.288288288288288e-07, + "loss": 0.0005, + "reward": 3.36086106300354, + "reward_std": 0.09921130910515785, + "rewards/final_reward": 1.5238539110371945, + "rewards/mask_iou_reward": 0.7619269555185972, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3608611226081848, + "rewards/thk_ans_format_reward": 1.0, + "step": 608, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.36458587646484, + "epoch": 2.057335581787521, + "grad_norm": 7.857035715646524, + "kl": 0.4697265625, + "learning_rate": 8.285472972972973e-07, + "loss": 0.0005, + "reward": 3.3863420486450195, + "reward_std": 0.13179394975304604, + "rewards/final_reward": 1.3365266390593695, + "rewards/mask_iou_reward": 0.6682633195296848, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3967588543891907, + "rewards/thk_ans_format_reward": 1.0, + "step": 609, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.20833587646484, + "epoch": 2.06070826306914, + "grad_norm": 9.309948150573302, + "kl": 0.412109375, + "learning_rate": 8.282657657657657e-07, + "loss": 0.0004, + "reward": 2.7876734733581543, + "reward_std": 0.17468656226992607, + "rewards/final_reward": 1.1084424634770147, + "rewards/mask_iou_reward": 0.5542212317385073, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7876733839511871, + "rewards/thk_ans_format_reward": 1.0, + "step": 610, + "think_completion_length": 12.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.92708587646484, + "epoch": 2.064080944350759, + "grad_norm": 8.297823282134887, + "kl": 0.5166015625, + "learning_rate": 8.279842342342343e-07, + "loss": 0.0005, + "reward": 3.274766206741333, + "reward_std": 0.2503800541162491, + "rewards/final_reward": 1.1995371075953356, + "rewards/mask_iou_reward": 0.5997685537976678, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.295599639415741, + "rewards/thk_ans_format_reward": 1.0, + "step": 611, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.46875381469727, + "epoch": 2.067453625632378, + "grad_norm": 5.647774807748787, + "kl": 0.455078125, + "learning_rate": 8.277027027027028e-07, + "loss": 0.0005, + "reward": 3.3382933139801025, + "reward_std": 0.06181446276605129, + "rewards/final_reward": 0.7236724084096133, + "rewards/mask_iou_reward": 0.36183620420480667, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.338293194770813, + "rewards/thk_ans_format_reward": 1.0, + "step": 612, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.80208587646484, + "epoch": 2.0708263069139967, + "grad_norm": 14.09217085577382, + "kl": 0.46484375, + "learning_rate": 8.274211711711711e-07, + "loss": 0.0005, + "reward": 3.541401505470276, + "reward_std": 0.1674039326608181, + "rewards/final_reward": 1.3222702404190154, + "rewards/mask_iou_reward": 0.6611351202095077, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5414013266563416, + "rewards/thk_ans_format_reward": 1.0, + "step": 613, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.2083396911621, + "epoch": 2.0741989881956155, + "grad_norm": 7.905145281311906, + "kl": 0.4130859375, + "learning_rate": 8.271396396396396e-07, + "loss": 0.0004, + "reward": 3.4339908361434937, + "reward_std": 0.08459173701703548, + "rewards/final_reward": 1.4237471525821739, + "rewards/mask_iou_reward": 0.7118735762910869, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.433990716934204, + "rewards/thk_ans_format_reward": 1.0, + "step": 614, + "think_completion_length": 10.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.38541793823242, + "epoch": 2.0775716694772344, + "grad_norm": 11.317119418959988, + "kl": 0.5244140625, + "learning_rate": 8.26858108108108e-07, + "loss": 0.0005, + "reward": 2.9590145349502563, + "reward_std": 0.08396272733807564, + "rewards/final_reward": 1.5951641649651038, + "rewards/mask_iou_reward": 0.7975820824825519, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9590144753456116, + "rewards/thk_ans_format_reward": 1.0, + "step": 615, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.65625381469727, + "epoch": 2.080944350758853, + "grad_norm": 10.528441939049495, + "kl": 0.4189453125, + "learning_rate": 8.265765765765765e-07, + "loss": 0.0004, + "reward": 3.174362897872925, + "reward_std": 0.20493387430906296, + "rewards/final_reward": 1.32150395635473, + "rewards/mask_iou_reward": 0.660751978177365, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1743630170822144, + "rewards/thk_ans_format_reward": 1.0, + "step": 616, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.5625, + "epoch": 2.084317032040472, + "grad_norm": 11.632868904279976, + "kl": 0.548828125, + "learning_rate": 8.26295045045045e-07, + "loss": 0.0005, + "reward": 3.5149197578430176, + "reward_std": 0.16870852559804916, + "rewards/final_reward": 1.1256652894876513, + "rewards/mask_iou_reward": 0.5628326447438257, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5149198174476624, + "rewards/thk_ans_format_reward": 1.0, + "step": 617, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.04166793823242, + "epoch": 2.087689713322091, + "grad_norm": 10.672156817999449, + "kl": 0.4765625, + "learning_rate": 8.260135135135134e-07, + "loss": 0.0005, + "reward": 3.462290406227112, + "reward_std": 0.1524660885334015, + "rewards/final_reward": 1.6213807924322559, + "rewards/mask_iou_reward": 0.8106903962161279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4622901678085327, + "rewards/thk_ans_format_reward": 1.0, + "step": 618, + "think_completion_length": 10.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.40625381469727, + "epoch": 2.09106239460371, + "grad_norm": 9.171979331095862, + "kl": 0.490234375, + "learning_rate": 8.257319819819819e-07, + "loss": 0.0005, + "reward": 3.575721502304077, + "reward_std": 0.22333138436079025, + "rewards/final_reward": 1.589309503896279, + "rewards/mask_iou_reward": 0.7946547519481395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5757215023040771, + "rewards/thk_ans_format_reward": 1.0, + "step": 619, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.43750381469727, + "epoch": 2.094435075885329, + "grad_norm": 4.728161898826185, + "kl": 0.44921875, + "learning_rate": 8.254504504504503e-07, + "loss": 0.0005, + "reward": 3.340423822402954, + "reward_std": 0.12725035846233368, + "rewards/final_reward": 0.8856937987091074, + "rewards/mask_iou_reward": 0.4428468993545537, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.340423822402954, + "rewards/thk_ans_format_reward": 1.0, + "step": 620, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.38541793823242, + "epoch": 2.097807757166948, + "grad_norm": 31.809054413592087, + "kl": 0.4521484375, + "learning_rate": 8.251689189189189e-07, + "loss": 0.0005, + "reward": 3.261618733406067, + "reward_std": 0.12912706285715103, + "rewards/final_reward": 1.4930522317336727, + "rewards/mask_iou_reward": 0.7465261158668364, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2616186141967773, + "rewards/thk_ans_format_reward": 1.0, + "step": 621, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.32291793823242, + "epoch": 2.1011804384485666, + "grad_norm": 6.562821215019139, + "kl": 0.5, + "learning_rate": 8.248873873873874e-07, + "loss": 0.0005, + "reward": 3.1994651556015015, + "reward_std": 0.17931604385375977, + "rewards/final_reward": 0.9012642826978075, + "rewards/mask_iou_reward": 0.4506321413489037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1994652152061462, + "rewards/thk_ans_format_reward": 1.0, + "step": 622, + "think_completion_length": 10.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.11458969116211, + "epoch": 2.1045531197301854, + "grad_norm": 12.171223316148184, + "kl": 0.53125, + "learning_rate": 8.246058558558558e-07, + "loss": 0.0005, + "reward": 3.2846641540527344, + "reward_std": 0.14336452260613441, + "rewards/final_reward": 1.3247871014828667, + "rewards/mask_iou_reward": 0.6623935507414334, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2846640944480896, + "rewards/thk_ans_format_reward": 1.0, + "step": 623, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75000381469727, + "epoch": 2.1079258010118043, + "grad_norm": 16.407159118171702, + "kl": 0.578125, + "learning_rate": 8.243243243243243e-07, + "loss": 0.0006, + "reward": 3.310416102409363, + "reward_std": 0.2252977043390274, + "rewards/final_reward": 1.1116370296886442, + "rewards/mask_iou_reward": 0.5558185148443221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3104161024093628, + "rewards/thk_ans_format_reward": 1.0, + "step": 624, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8333396911621, + "epoch": 2.111298482293423, + "grad_norm": 50.87161067879284, + "kl": 0.40625, + "learning_rate": 8.240427927927927e-07, + "loss": 0.0004, + "reward": 3.2098931074142456, + "reward_std": 0.14654186181724072, + "rewards/final_reward": 0.2948156663781496, + "rewards/mask_iou_reward": 0.1474078331890748, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2098931670188904, + "rewards/thk_ans_format_reward": 1.0, + "step": 625, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.34375381469727, + "epoch": 2.1146711635750424, + "grad_norm": 72.77830962492465, + "kl": 0.505859375, + "learning_rate": 8.237612612612612e-07, + "loss": 0.0005, + "reward": 3.3907355070114136, + "reward_std": 0.2131858579814434, + "rewards/final_reward": 1.1081243696967564, + "rewards/mask_iou_reward": 0.5540621848483782, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3907356262207031, + "rewards/thk_ans_format_reward": 1.0, + "step": 626, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.83333587646484, + "epoch": 2.118043844856661, + "grad_norm": 8.563624464427336, + "kl": 0.4130859375, + "learning_rate": 8.234797297297297e-07, + "loss": 0.0005, + "reward": 3.1916167736053467, + "reward_std": 0.0674378052353859, + "rewards/final_reward": 0.8132947407315055, + "rewards/mask_iou_reward": 0.40664737036575277, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.191616952419281, + "rewards/thk_ans_format_reward": 1.0, + "step": 627, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.28125, + "epoch": 2.12141652613828, + "grad_norm": 18.233602877049936, + "kl": 0.5576171875, + "learning_rate": 8.231981981981981e-07, + "loss": 0.0007, + "reward": 3.689383387565613, + "reward_std": 0.058890651911497116, + "rewards/final_reward": 1.573615220073414, + "rewards/mask_iou_reward": 0.786807610036707, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6893833875656128, + "rewards/thk_ans_format_reward": 1.0, + "step": 628, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.55208587646484, + "epoch": 2.124789207419899, + "grad_norm": 26.123304191153395, + "kl": 0.4013671875, + "learning_rate": 8.229166666666666e-07, + "loss": 0.0004, + "reward": 3.4777636528015137, + "reward_std": 0.07786043360829353, + "rewards/final_reward": 1.213893867253093, + "rewards/mask_iou_reward": 0.6069469336265465, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4777635335922241, + "rewards/thk_ans_format_reward": 1.0, + "step": 629, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.04166793823242, + "epoch": 2.1281618887015177, + "grad_norm": 6.574647453197092, + "kl": 0.439453125, + "learning_rate": 8.22635135135135e-07, + "loss": 0.0004, + "reward": 3.1995784044265747, + "reward_std": 0.11072956770658493, + "rewards/final_reward": 1.289389255471192, + "rewards/mask_iou_reward": 0.644694627735596, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1995784044265747, + "rewards/thk_ans_format_reward": 1.0, + "step": 630, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.27083587646484, + "epoch": 2.1315345699831365, + "grad_norm": 8.857344628430472, + "kl": 0.5341796875, + "learning_rate": 8.223536036036036e-07, + "loss": 0.0005, + "reward": 3.105801820755005, + "reward_std": 0.08226869069039822, + "rewards/final_reward": 1.1846171339453866, + "rewards/mask_iou_reward": 0.5923085669726933, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1058015823364258, + "rewards/thk_ans_format_reward": 1.0, + "step": 631, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.80208587646484, + "epoch": 2.1349072512647553, + "grad_norm": 6.608877064688749, + "kl": 0.4501953125, + "learning_rate": 8.220720720720721e-07, + "loss": 0.0005, + "reward": 3.033010244369507, + "reward_std": 0.23232241719961166, + "rewards/final_reward": 0.7721974526983842, + "rewards/mask_iou_reward": 0.3860987263491921, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0330101251602173, + "rewards/thk_ans_format_reward": 1.0, + "step": 632, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.82292175292969, + "epoch": 2.138279932546374, + "grad_norm": 26.31636866542848, + "kl": 0.48828125, + "learning_rate": 8.217905405405405e-07, + "loss": 0.0005, + "reward": 3.6307315826416016, + "reward_std": 0.11909966915845871, + "rewards/final_reward": 1.1624019924863673, + "rewards/mask_iou_reward": 0.5812009962431837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6307316422462463, + "rewards/thk_ans_format_reward": 1.0, + "step": 633, + "think_completion_length": 10.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.64583587646484, + "epoch": 2.1416526138279934, + "grad_norm": 6.46074500536475, + "kl": 0.49609375, + "learning_rate": 8.21509009009009e-07, + "loss": 0.0005, + "reward": 3.474371314048767, + "reward_std": 0.0844818763434887, + "rewards/final_reward": 0.9150024138423042, + "rewards/mask_iou_reward": 0.4575012069211521, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4743713736534119, + "rewards/thk_ans_format_reward": 1.0, + "step": 634, + "think_completion_length": 10.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.84375381469727, + "epoch": 2.1450252951096123, + "grad_norm": 9.554990225390688, + "kl": 0.4521484375, + "learning_rate": 8.212274774774775e-07, + "loss": 0.0005, + "reward": 3.6118842363357544, + "reward_std": 0.07224077731370926, + "rewards/final_reward": 1.7452657955108715, + "rewards/mask_iou_reward": 0.8726328977554357, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6118841767311096, + "rewards/thk_ans_format_reward": 1.0, + "step": 635, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.91667175292969, + "epoch": 2.148397976391231, + "grad_norm": 57.03052652676022, + "kl": 0.435546875, + "learning_rate": 8.209459459459459e-07, + "loss": 0.0004, + "reward": 3.4605711698532104, + "reward_std": 0.14525556564331055, + "rewards/final_reward": 1.7284143941174483, + "rewards/mask_iou_reward": 0.8642071970587242, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4605711102485657, + "rewards/thk_ans_format_reward": 1.0, + "step": 636, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.6875, + "epoch": 2.15177065767285, + "grad_norm": 11.390704504352188, + "kl": 0.4794921875, + "learning_rate": 8.206644144144144e-07, + "loss": 0.0005, + "reward": 3.2578898668289185, + "reward_std": 0.07665400765836239, + "rewards/final_reward": 1.8478282830408852, + "rewards/mask_iou_reward": 0.9239141415204426, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2578898072242737, + "rewards/thk_ans_format_reward": 1.0, + "step": 637, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.13542175292969, + "epoch": 2.1551433389544687, + "grad_norm": 10.79482036058893, + "kl": 0.4150390625, + "learning_rate": 8.203828828828828e-07, + "loss": 0.0004, + "reward": 3.455119490623474, + "reward_std": 0.12551749870181084, + "rewards/final_reward": 1.6525657013001847, + "rewards/mask_iou_reward": 0.8262828506500923, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4551194906234741, + "rewards/thk_ans_format_reward": 1.0, + "step": 638, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.01041793823242, + "epoch": 2.1585160202360876, + "grad_norm": 12.000131006840634, + "kl": 0.5625, + "learning_rate": 8.201013513513513e-07, + "loss": 0.0006, + "reward": 3.327172040939331, + "reward_std": 0.1687596347182989, + "rewards/final_reward": 1.7966222098178166, + "rewards/mask_iou_reward": 0.8983111049089083, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3271721005439758, + "rewards/thk_ans_format_reward": 1.0, + "step": 639, + "think_completion_length": 10.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.59375381469727, + "epoch": 2.1618887015177064, + "grad_norm": 11.094244556452955, + "kl": 0.46875, + "learning_rate": 8.198198198198198e-07, + "loss": 0.0005, + "reward": 3.408894658088684, + "reward_std": 0.08874082565307617, + "rewards/final_reward": 1.6323091881331337, + "rewards/mask_iou_reward": 0.8161545940665669, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4088948369026184, + "rewards/thk_ans_format_reward": 1.0, + "step": 640, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.38541793823242, + "epoch": 2.1652613827993257, + "grad_norm": 8.392822333817206, + "kl": 0.4453125, + "learning_rate": 8.195382882882883e-07, + "loss": 0.0004, + "reward": 3.328891634941101, + "reward_std": 0.14553168416023254, + "rewards/final_reward": 1.6706313970049362, + "rewards/mask_iou_reward": 0.8353156985024681, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.328891634941101, + "rewards/thk_ans_format_reward": 1.0, + "step": 641, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.125, + "epoch": 2.1686340640809445, + "grad_norm": 8.583111088104932, + "kl": 0.43359375, + "learning_rate": 8.192567567567568e-07, + "loss": 0.0004, + "reward": 3.2445223331451416, + "reward_std": 0.10854201018810272, + "rewards/final_reward": 1.436967285600808, + "rewards/mask_iou_reward": 0.718483642800404, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2445223331451416, + "rewards/thk_ans_format_reward": 1.0, + "step": 642, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.78125381469727, + "epoch": 2.1720067453625633, + "grad_norm": 10.30401824580383, + "kl": 0.4208984375, + "learning_rate": 8.189752252252252e-07, + "loss": 0.0004, + "reward": 3.26160991191864, + "reward_std": 0.22055093199014664, + "rewards/final_reward": 0.8553469935214908, + "rewards/mask_iou_reward": 0.4276734967607454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2616099119186401, + "rewards/thk_ans_format_reward": 1.0, + "step": 643, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.36458587646484, + "epoch": 2.175379426644182, + "grad_norm": 295.59265233358633, + "kl": 0.6044921875, + "learning_rate": 8.186936936936937e-07, + "loss": 0.0006, + "reward": 3.2367950677871704, + "reward_std": 0.13342823646962643, + "rewards/final_reward": 1.6191268212909824, + "rewards/mask_iou_reward": 0.8095634106454912, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2367949485778809, + "rewards/thk_ans_format_reward": 1.0, + "step": 644, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.52083969116211, + "epoch": 2.178752107925801, + "grad_norm": 7.906168172774929, + "kl": 0.4853515625, + "learning_rate": 8.184121621621622e-07, + "loss": 0.0005, + "reward": 3.4528859853744507, + "reward_std": 0.19831737503409386, + "rewards/final_reward": 0.8998141944664336, + "rewards/mask_iou_reward": 0.4499070972332168, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4528858661651611, + "rewards/thk_ans_format_reward": 1.0, + "step": 645, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.00000762939453, + "epoch": 2.18212478920742, + "grad_norm": 7.826136559641617, + "kl": 0.46875, + "learning_rate": 8.181306306306306e-07, + "loss": 0.0005, + "reward": 3.1739590167999268, + "reward_std": 0.18386272341012955, + "rewards/final_reward": 1.6549560272173047, + "rewards/mask_iou_reward": 0.8274780136086524, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1739588975906372, + "rewards/thk_ans_format_reward": 1.0, + "step": 646, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.92708587646484, + "epoch": 2.1854974704890386, + "grad_norm": 16.502530928602464, + "kl": 0.4208984375, + "learning_rate": 8.178490990990991e-07, + "loss": 0.0004, + "reward": 3.230895757675171, + "reward_std": 0.20900658890604973, + "rewards/final_reward": 0.8469308443974172, + "rewards/mask_iou_reward": 0.4234654221987086, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2308956980705261, + "rewards/thk_ans_format_reward": 1.0, + "step": 647, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.30208587646484, + "epoch": 2.1888701517706575, + "grad_norm": 19.93251577727481, + "kl": 0.73046875, + "learning_rate": 8.175675675675676e-07, + "loss": 0.0007, + "reward": 3.41804575920105, + "reward_std": 0.21081428974866867, + "rewards/final_reward": 1.8266925880479017, + "rewards/mask_iou_reward": 0.9133462940239508, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.418045699596405, + "rewards/thk_ans_format_reward": 1.0, + "step": 648, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.80208587646484, + "epoch": 2.1922428330522767, + "grad_norm": 44.50576070941139, + "kl": 0.421875, + "learning_rate": 8.17286036036036e-07, + "loss": 0.0005, + "reward": 3.07036817073822, + "reward_std": 0.10139280930161476, + "rewards/final_reward": 0.8796635025441313, + "rewards/mask_iou_reward": 0.43983175127206564, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.070368230342865, + "rewards/thk_ans_format_reward": 1.0, + "step": 649, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.70833587646484, + "epoch": 2.1956155143338956, + "grad_norm": 6.518147187815124, + "kl": 0.470703125, + "learning_rate": 8.170045045045045e-07, + "loss": 0.0005, + "reward": 3.2474400997161865, + "reward_std": 0.13931379839777946, + "rewards/final_reward": 1.875586365490201, + "rewards/mask_iou_reward": 0.9377931827451005, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.247439980506897, + "rewards/thk_ans_format_reward": 1.0, + "step": 650, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.06250381469727, + "epoch": 2.1989881956155144, + "grad_norm": 13.896457174710385, + "kl": 0.521484375, + "learning_rate": 8.16722972972973e-07, + "loss": 0.0006, + "reward": 3.6261767148971558, + "reward_std": 0.1183246560394764, + "rewards/final_reward": 1.7615443987622346, + "rewards/mask_iou_reward": 0.8807721993811173, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6261768341064453, + "rewards/thk_ans_format_reward": 1.0, + "step": 651, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.78125381469727, + "epoch": 2.2023608768971332, + "grad_norm": 7.521553036506293, + "kl": 0.509765625, + "learning_rate": 8.164414414414414e-07, + "loss": 0.0005, + "reward": 3.126633405685425, + "reward_std": 0.1798637956380844, + "rewards/final_reward": 1.4663160227039, + "rewards/mask_iou_reward": 0.73315801135195, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1266332566738129, + "rewards/thk_ans_format_reward": 1.0, + "step": 652, + "think_completion_length": 10.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.69791793823242, + "epoch": 2.205733558178752, + "grad_norm": 9.400018448545131, + "kl": 0.482421875, + "learning_rate": 8.161599099099099e-07, + "loss": 0.0005, + "reward": 3.5450875759124756, + "reward_std": 0.09950285963714123, + "rewards/final_reward": 1.4591154590757514, + "rewards/mask_iou_reward": 0.7295577295378757, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.545087456703186, + "rewards/thk_ans_format_reward": 1.0, + "step": 653, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.13541793823242, + "epoch": 2.209106239460371, + "grad_norm": 16.586317511471435, + "kl": 0.443359375, + "learning_rate": 8.158783783783783e-07, + "loss": 0.0004, + "reward": 3.3824548721313477, + "reward_std": 0.11548849008977413, + "rewards/final_reward": 0.9872206684867463, + "rewards/mask_iou_reward": 0.49361033424337314, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.382454752922058, + "rewards/thk_ans_format_reward": 1.0, + "step": 654, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.43750381469727, + "epoch": 2.2124789207419897, + "grad_norm": 15.711839076861132, + "kl": 1.1123046875, + "learning_rate": 8.155968468468468e-07, + "loss": 0.0011, + "reward": 3.5824573040008545, + "reward_std": 0.1437189131975174, + "rewards/final_reward": 1.5714705673789258, + "rewards/mask_iou_reward": 0.7857352836894629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5824573040008545, + "rewards/thk_ans_format_reward": 1.0, + "step": 655, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.60417175292969, + "epoch": 2.2158516020236085, + "grad_norm": 7.4396341779376405, + "kl": 0.583984375, + "learning_rate": 8.153153153153152e-07, + "loss": 0.0006, + "reward": 3.394721031188965, + "reward_std": 0.18189126066863537, + "rewards/final_reward": 1.6694186379997311, + "rewards/mask_iou_reward": 0.8347093189998656, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3947210907936096, + "rewards/thk_ans_format_reward": 1.0, + "step": 656, + "think_completion_length": 11.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.35417175292969, + "epoch": 2.219224283305228, + "grad_norm": 14.309370554809359, + "kl": 0.4638671875, + "learning_rate": 8.150337837837837e-07, + "loss": 0.0005, + "reward": 3.1339633464813232, + "reward_std": 0.17729290574789047, + "rewards/final_reward": 1.0959308709827407, + "rewards/mask_iou_reward": 0.5479654354913703, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1339632868766785, + "rewards/thk_ans_format_reward": 1.0, + "step": 657, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.1770896911621, + "epoch": 2.2225969645868466, + "grad_norm": 4.732564548831271, + "kl": 0.4404296875, + "learning_rate": 8.147522522522522e-07, + "loss": 0.0004, + "reward": 3.3023016452789307, + "reward_std": 0.12397704645991325, + "rewards/final_reward": 1.0975228223193203, + "rewards/mask_iou_reward": 0.5487614111596602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.302301824092865, + "rewards/thk_ans_format_reward": 1.0, + "step": 658, + "think_completion_length": 9.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.00000381469727, + "epoch": 2.2259696458684655, + "grad_norm": 12.362785143592603, + "kl": 0.447265625, + "learning_rate": 8.144707207207206e-07, + "loss": 0.0004, + "reward": 3.2936304807662964, + "reward_std": 0.12760978937149048, + "rewards/final_reward": 0.6488444817317736, + "rewards/mask_iou_reward": 0.3244222408658868, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2936302423477173, + "rewards/thk_ans_format_reward": 1.0, + "step": 659, + "think_completion_length": 11.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.45833587646484, + "epoch": 2.2293423271500843, + "grad_norm": 8.083525219026132, + "kl": 0.4404296875, + "learning_rate": 8.141891891891891e-07, + "loss": 0.0005, + "reward": 3.3997479677200317, + "reward_std": 0.2194822132587433, + "rewards/final_reward": 1.5548540385668563, + "rewards/mask_iou_reward": 0.7774270192834282, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3997478485107422, + "rewards/thk_ans_format_reward": 1.0, + "step": 660, + "think_completion_length": 10.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.12500381469727, + "epoch": 2.232715008431703, + "grad_norm": 7.8635506669881465, + "kl": 0.4794921875, + "learning_rate": 8.139076576576576e-07, + "loss": 0.0005, + "reward": 3.634366989135742, + "reward_std": 0.11458337679505348, + "rewards/final_reward": 1.6881464055982023, + "rewards/mask_iou_reward": 0.8440732027991011, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6343669295310974, + "rewards/thk_ans_format_reward": 1.0, + "step": 661, + "think_completion_length": 11.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.00000381469727, + "epoch": 2.236087689713322, + "grad_norm": 9.681507857307889, + "kl": 0.412109375, + "learning_rate": 8.136261261261261e-07, + "loss": 0.0004, + "reward": 3.29227614402771, + "reward_std": 0.09453720226883888, + "rewards/final_reward": 1.3683644280381078, + "rewards/mask_iou_reward": 0.6841822140190539, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.29227614402771, + "rewards/thk_ans_format_reward": 1.0, + "step": 662, + "think_completion_length": 9.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.93750381469727, + "epoch": 2.2394603709949408, + "grad_norm": 7.5674477129734745, + "kl": 0.392578125, + "learning_rate": 8.133445945945946e-07, + "loss": 0.0004, + "reward": 3.743380904197693, + "reward_std": 0.015893162926658988, + "rewards/final_reward": 1.5229256954080168, + "rewards/mask_iou_reward": 0.7614628477040084, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7433809041976929, + "rewards/thk_ans_format_reward": 1.0, + "step": 663, + "think_completion_length": 10.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.73958587646484, + "epoch": 2.24283305227656, + "grad_norm": 17.68664735167123, + "kl": 0.4287109375, + "learning_rate": 8.13063063063063e-07, + "loss": 0.0004, + "reward": 3.303257942199707, + "reward_std": 0.18324602022767067, + "rewards/final_reward": 1.7402201218143234, + "rewards/mask_iou_reward": 0.8701100609071617, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3032580018043518, + "rewards/thk_ans_format_reward": 1.0, + "step": 664, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.42708587646484, + "epoch": 2.246205733558179, + "grad_norm": 6.916780449362966, + "kl": 0.42578125, + "learning_rate": 8.127815315315315e-07, + "loss": 0.0004, + "reward": 3.1752452850341797, + "reward_std": 0.09016630239784718, + "rewards/final_reward": 0.7180743257293005, + "rewards/mask_iou_reward": 0.35903716286465026, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.175245225429535, + "rewards/thk_ans_format_reward": 1.0, + "step": 665, + "think_completion_length": 10.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.44792175292969, + "epoch": 2.2495784148397977, + "grad_norm": 16.657736889436094, + "kl": 0.44921875, + "learning_rate": 8.125e-07, + "loss": 0.0005, + "reward": 3.4963905811309814, + "reward_std": 0.1357239931821823, + "rewards/final_reward": 1.911610818305828, + "rewards/mask_iou_reward": 0.955805409152914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4963907599449158, + "rewards/thk_ans_format_reward": 1.0, + "step": 666, + "think_completion_length": 13.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.72917175292969, + "epoch": 2.2529510961214165, + "grad_norm": 15.76213501616429, + "kl": 0.3994140625, + "learning_rate": 8.122184684684684e-07, + "loss": 0.0004, + "reward": 3.174826979637146, + "reward_std": 0.10770581662654877, + "rewards/final_reward": 1.3201200373647521, + "rewards/mask_iou_reward": 0.6600600186823761, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1748269498348236, + "rewards/thk_ans_format_reward": 1.0, + "step": 667, + "think_completion_length": 11.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.6875, + "epoch": 2.2563237774030354, + "grad_norm": 13.227401276492902, + "kl": 0.421875, + "learning_rate": 8.119369369369369e-07, + "loss": 0.0004, + "reward": 3.2839311361312866, + "reward_std": 0.08108958974480629, + "rewards/final_reward": 1.5689729594727198, + "rewards/mask_iou_reward": 0.7844864797363599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2839311361312866, + "rewards/thk_ans_format_reward": 1.0, + "step": 668, + "think_completion_length": 10.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.15625, + "epoch": 2.259696458684654, + "grad_norm": 10.326067198476693, + "kl": 0.490234375, + "learning_rate": 8.116554054054053e-07, + "loss": 0.0005, + "reward": 3.206782102584839, + "reward_std": 0.1343858316540718, + "rewards/final_reward": 1.233378810524416, + "rewards/mask_iou_reward": 0.616689405262208, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2067819237709045, + "rewards/thk_ans_format_reward": 1.0, + "step": 669, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.67708587646484, + "epoch": 2.263069139966273, + "grad_norm": 14.823830916830676, + "kl": 0.45703125, + "learning_rate": 8.113738738738738e-07, + "loss": 0.0005, + "reward": 3.258223295211792, + "reward_std": 0.10940613597631454, + "rewards/final_reward": 1.0222389748364655, + "rewards/mask_iou_reward": 0.5111194874182328, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.258223056793213, + "rewards/thk_ans_format_reward": 1.0, + "step": 670, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.87500381469727, + "epoch": 2.2664418212478923, + "grad_norm": 14.677750769066721, + "kl": 0.388671875, + "learning_rate": 8.110923423423423e-07, + "loss": 0.0004, + "reward": 3.5307780504226685, + "reward_std": 0.11446313932538033, + "rewards/final_reward": 1.7383022024633599, + "rewards/mask_iou_reward": 0.8691511012316799, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5307780504226685, + "rewards/thk_ans_format_reward": 1.0, + "step": 671, + "think_completion_length": 10.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.89583969116211, + "epoch": 2.269814502529511, + "grad_norm": 10.298976393267948, + "kl": 0.4404296875, + "learning_rate": 8.108108108108108e-07, + "loss": 0.0004, + "reward": 3.189149498939514, + "reward_std": 0.12401114031672478, + "rewards/final_reward": 1.5704407655379913, + "rewards/mask_iou_reward": 0.7852203827689956, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1891493201255798, + "rewards/thk_ans_format_reward": 1.0, + "step": 672, + "think_completion_length": 10.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.30208587646484, + "epoch": 2.27318718381113, + "grad_norm": 11.436957821981954, + "kl": 0.4765625, + "learning_rate": 8.105292792792793e-07, + "loss": 0.0005, + "reward": 3.208579659461975, + "reward_std": 0.11934243142604828, + "rewards/final_reward": 1.6359468909472215, + "rewards/mask_iou_reward": 0.8179734454736107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2085795402526855, + "rewards/thk_ans_format_reward": 1.0, + "step": 673, + "think_completion_length": 10.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.53125, + "epoch": 2.2765598650927488, + "grad_norm": 12.15331139839109, + "kl": 0.5390625, + "learning_rate": 8.102477477477477e-07, + "loss": 0.0005, + "reward": 3.4956218004226685, + "reward_std": 0.12992879003286362, + "rewards/final_reward": 1.7897713668932158, + "rewards/mask_iou_reward": 0.8948856834466079, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4956215023994446, + "rewards/thk_ans_format_reward": 1.0, + "step": 674, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.45833587646484, + "epoch": 2.2799325463743676, + "grad_norm": 9.99176569694546, + "kl": 0.412109375, + "learning_rate": 8.099662162162162e-07, + "loss": 0.0004, + "reward": 3.268976926803589, + "reward_std": 0.25114165246486664, + "rewards/final_reward": 1.6725001930072043, + "rewards/mask_iou_reward": 0.8362500965036022, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2689767479896545, + "rewards/thk_ans_format_reward": 1.0, + "step": 675, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.01042175292969, + "epoch": 2.2833052276559864, + "grad_norm": 10.650917935298724, + "kl": 0.419921875, + "learning_rate": 8.096846846846847e-07, + "loss": 0.0004, + "reward": 3.4945836067199707, + "reward_std": 0.11074310541152954, + "rewards/final_reward": 1.5783195531313818, + "rewards/mask_iou_reward": 0.7891597765656909, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4945836067199707, + "rewards/thk_ans_format_reward": 1.0, + "step": 676, + "think_completion_length": 10.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.09375, + "epoch": 2.2866779089376053, + "grad_norm": 8.477878692957459, + "kl": 0.494140625, + "learning_rate": 8.094031531531531e-07, + "loss": 0.0005, + "reward": 3.306473970413208, + "reward_std": 0.20133724063634872, + "rewards/final_reward": 1.4288747373343276, + "rewards/mask_iou_reward": 0.7144373686671638, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3064739108085632, + "rewards/thk_ans_format_reward": 1.0, + "step": 677, + "think_completion_length": 11.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.67708587646484, + "epoch": 2.2900505902192245, + "grad_norm": 7.188209827485836, + "kl": 0.36328125, + "learning_rate": 8.091216216216216e-07, + "loss": 0.0004, + "reward": 3.555638909339905, + "reward_std": 0.12217172980308533, + "rewards/final_reward": 1.5055865595552458, + "rewards/mask_iou_reward": 0.7527932797776229, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5556389093399048, + "rewards/thk_ans_format_reward": 1.0, + "step": 678, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.82291793823242, + "epoch": 2.2934232715008434, + "grad_norm": 9.750461972160247, + "kl": 0.4072265625, + "learning_rate": 8.0884009009009e-07, + "loss": 0.0004, + "reward": 3.2604702711105347, + "reward_std": 0.09934688359498978, + "rewards/final_reward": 0.9451985545989782, + "rewards/mask_iou_reward": 0.4725992772994891, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2604702711105347, + "rewards/thk_ans_format_reward": 1.0, + "step": 679, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.27083587646484, + "epoch": 2.296795952782462, + "grad_norm": 15.365217581053004, + "kl": 0.59375, + "learning_rate": 8.085585585585585e-07, + "loss": 0.0006, + "reward": 3.0533676147460938, + "reward_std": 0.16271667182445526, + "rewards/final_reward": 0.4927129483488749, + "rewards/mask_iou_reward": 0.24635647417443746, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0533676147460938, + "rewards/thk_ans_format_reward": 1.0, + "step": 680, + "think_completion_length": 10.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.30208587646484, + "epoch": 2.300168634064081, + "grad_norm": 9.802254184837405, + "kl": 0.447265625, + "learning_rate": 8.08277027027027e-07, + "loss": 0.0004, + "reward": 3.028236746788025, + "reward_std": 0.21147775277495384, + "rewards/final_reward": 0.3851396040127628, + "rewards/mask_iou_reward": 0.1925698020063814, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0282368659973145, + "rewards/thk_ans_format_reward": 1.0, + "step": 681, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.79167175292969, + "epoch": 2.3035413153457, + "grad_norm": 14.761547115745145, + "kl": 0.53125, + "learning_rate": 8.079954954954955e-07, + "loss": 0.0005, + "reward": 3.521707773208618, + "reward_std": 0.12188014201819897, + "rewards/final_reward": 1.7237706323114415, + "rewards/mask_iou_reward": 0.8618853161557207, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5217076539993286, + "rewards/thk_ans_format_reward": 1.0, + "step": 682, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.53125, + "epoch": 2.3069139966273187, + "grad_norm": 9.288430390244054, + "kl": 0.3671875, + "learning_rate": 8.07713963963964e-07, + "loss": 0.0004, + "reward": 3.4879062175750732, + "reward_std": 0.22596611082553864, + "rewards/final_reward": 1.8825232373395848, + "rewards/mask_iou_reward": 0.9412616186697924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.487906038761139, + "rewards/thk_ans_format_reward": 1.0, + "step": 683, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.36458587646484, + "epoch": 2.3102866779089375, + "grad_norm": 14.599410626383364, + "kl": 0.4267578125, + "learning_rate": 8.074324324324325e-07, + "loss": 0.0004, + "reward": 3.4649771451950073, + "reward_std": 0.11124672368168831, + "rewards/final_reward": 1.5080582017135327, + "rewards/mask_iou_reward": 0.7540291008567663, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4649773240089417, + "rewards/thk_ans_format_reward": 1.0, + "step": 684, + "think_completion_length": 10.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.20833587646484, + "epoch": 2.3136593591905563, + "grad_norm": 10.06748944006634, + "kl": 0.4892578125, + "learning_rate": 8.071509009009009e-07, + "loss": 0.0005, + "reward": 3.408280611038208, + "reward_std": 0.09913060441613197, + "rewards/final_reward": 1.7207976126492222, + "rewards/mask_iou_reward": 0.8603988063246111, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4082804918289185, + "rewards/thk_ans_format_reward": 1.0, + "step": 685, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.02083969116211, + "epoch": 2.317032040472175, + "grad_norm": 7.514829742049848, + "kl": 0.4521484375, + "learning_rate": 8.068693693693694e-07, + "loss": 0.0005, + "reward": 3.2785524129867554, + "reward_std": 0.02402611169964075, + "rewards/final_reward": 1.5484094219072886, + "rewards/mask_iou_reward": 0.7742047109536443, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.278552532196045, + "rewards/thk_ans_format_reward": 1.0, + "step": 686, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.56250762939453, + "epoch": 2.3204047217537944, + "grad_norm": 9.801115745438581, + "kl": 0.484375, + "learning_rate": 8.065878378378378e-07, + "loss": 0.0005, + "reward": 2.937888503074646, + "reward_std": 0.13681496307253838, + "rewards/final_reward": 0.39095593049748123, + "rewards/mask_iou_reward": 0.19547796524874061, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9378882646560669, + "rewards/thk_ans_format_reward": 1.0, + "step": 687, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.85416793823242, + "epoch": 2.3237774030354132, + "grad_norm": 8.040804487614768, + "kl": 0.451171875, + "learning_rate": 8.063063063063063e-07, + "loss": 0.0005, + "reward": 3.6306684017181396, + "reward_std": 0.046969225630164146, + "rewards/final_reward": 1.8572513075997634, + "rewards/mask_iou_reward": 0.9286256537998817, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.63066828250885, + "rewards/thk_ans_format_reward": 1.0, + "step": 688, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.14583587646484, + "epoch": 2.327150084317032, + "grad_norm": 8.439382052771062, + "kl": 0.4775390625, + "learning_rate": 8.060247747747748e-07, + "loss": 0.0005, + "reward": 3.4388445615768433, + "reward_std": 0.14623238891363144, + "rewards/final_reward": 1.2177924990928135, + "rewards/mask_iou_reward": 0.6088962495464068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4388444423675537, + "rewards/thk_ans_format_reward": 1.0, + "step": 689, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.82291793823242, + "epoch": 2.330522765598651, + "grad_norm": 16.475672726688114, + "kl": 0.4375, + "learning_rate": 8.057432432432431e-07, + "loss": 0.0004, + "reward": 3.5080931186676025, + "reward_std": 0.2046094499528408, + "rewards/final_reward": 1.4977205328748284, + "rewards/mask_iou_reward": 0.7488602664374142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.508092999458313, + "rewards/thk_ans_format_reward": 1.0, + "step": 690, + "think_completion_length": 10.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.69791793823242, + "epoch": 2.3338954468802697, + "grad_norm": 16.864680723786535, + "kl": 0.421875, + "learning_rate": 8.054617117117116e-07, + "loss": 0.0004, + "reward": 2.887763261795044, + "reward_std": 0.0428765881806612, + "rewards/final_reward": 1.6773612262571533, + "rewards/mask_iou_reward": 0.8386806131285767, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.887763112783432, + "rewards/thk_ans_format_reward": 1.0, + "step": 691, + "think_completion_length": 10.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.21875381469727, + "epoch": 2.3372681281618886, + "grad_norm": 14.022754013063981, + "kl": 0.400390625, + "learning_rate": 8.051801801801801e-07, + "loss": 0.0004, + "reward": 3.0394562482833862, + "reward_std": 0.18824466317892075, + "rewards/final_reward": 0.954554072359945, + "rewards/mask_iou_reward": 0.4772770361799725, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0602895319461823, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 692, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.67708587646484, + "epoch": 2.3406408094435074, + "grad_norm": 12.700617794911235, + "kl": 0.443359375, + "learning_rate": 8.048986486486486e-07, + "loss": 0.0005, + "reward": 3.3846123218536377, + "reward_std": 0.13955982774496078, + "rewards/final_reward": 0.8707478787601919, + "rewards/mask_iou_reward": 0.43537393938009594, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3846123218536377, + "rewards/thk_ans_format_reward": 1.0, + "step": 693, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.25000381469727, + "epoch": 2.3440134907251267, + "grad_norm": 5.292351081418642, + "kl": 0.455078125, + "learning_rate": 8.046171171171171e-07, + "loss": 0.0005, + "reward": 3.3495113849639893, + "reward_std": 0.06185881420969963, + "rewards/final_reward": 1.8658587353803533, + "rewards/mask_iou_reward": 0.9329293676901766, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3495115637779236, + "rewards/thk_ans_format_reward": 1.0, + "step": 694, + "think_completion_length": 10.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.38541793823242, + "epoch": 2.3473861720067455, + "grad_norm": 11.049291780315604, + "kl": 0.451171875, + "learning_rate": 8.043355855855855e-07, + "loss": 0.0005, + "reward": 3.3538196086883545, + "reward_std": 0.05195300653576851, + "rewards/final_reward": 0.056906077261131706, + "rewards/mask_iou_reward": 0.028453038630565853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.353819489479065, + "rewards/thk_ans_format_reward": 1.0, + "step": 695, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.95833587646484, + "epoch": 2.3507588532883643, + "grad_norm": 19.11427795079353, + "kl": 0.58203125, + "learning_rate": 8.04054054054054e-07, + "loss": 0.0006, + "reward": 3.5086394548416138, + "reward_std": 0.14855768531560898, + "rewards/final_reward": 1.686626725092181, + "rewards/mask_iou_reward": 0.8433133625460905, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5086395740509033, + "rewards/thk_ans_format_reward": 1.0, + "step": 696, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.2708396911621, + "epoch": 2.354131534569983, + "grad_norm": 9.67194746839267, + "kl": 0.4052734375, + "learning_rate": 8.037725225225224e-07, + "loss": 0.0004, + "reward": 3.3273189067840576, + "reward_std": 0.19882089644670486, + "rewards/final_reward": 1.3801675103822553, + "rewards/mask_iou_reward": 0.6900837551911276, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.327318787574768, + "rewards/thk_ans_format_reward": 1.0, + "step": 697, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.67708587646484, + "epoch": 2.357504215851602, + "grad_norm": 40.37500069302396, + "kl": 0.654296875, + "learning_rate": 8.034909909909909e-07, + "loss": 0.0007, + "reward": 3.2038848400115967, + "reward_std": 0.2914121150970459, + "rewards/final_reward": 0.5983370712448542, + "rewards/mask_iou_reward": 0.2991685356224271, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2038847208023071, + "rewards/thk_ans_format_reward": 1.0, + "step": 698, + "think_completion_length": 11.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.35416793823242, + "epoch": 2.360876897133221, + "grad_norm": 10.49432652015857, + "kl": 0.404296875, + "learning_rate": 8.032094594594594e-07, + "loss": 0.0004, + "reward": 3.1447372436523438, + "reward_std": 0.17952913511544466, + "rewards/final_reward": 1.5661687614291515, + "rewards/mask_iou_reward": 0.7830843807145758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1447371244430542, + "rewards/thk_ans_format_reward": 1.0, + "step": 699, + "think_completion_length": 11.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.88541793823242, + "epoch": 2.3642495784148396, + "grad_norm": 19.711801674853763, + "kl": 0.53515625, + "learning_rate": 8.029279279279278e-07, + "loss": 0.0005, + "reward": 3.3763688802719116, + "reward_std": 0.14744899049401283, + "rewards/final_reward": 1.0176800515031297, + "rewards/mask_iou_reward": 0.5088400257515648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3763686418533325, + "rewards/thk_ans_format_reward": 1.0, + "step": 700, + "think_completion_length": 10.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.79166793823242, + "epoch": 2.367622259696459, + "grad_norm": 21.065652131398185, + "kl": 0.4423828125, + "learning_rate": 8.026463963963963e-07, + "loss": 0.0005, + "reward": 3.5854740142822266, + "reward_std": 0.08657106757164001, + "rewards/final_reward": 1.1906331353971875, + "rewards/mask_iou_reward": 0.5953165676985938, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5854740738868713, + "rewards/thk_ans_format_reward": 1.0, + "step": 701, + "think_completion_length": 10.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.20833587646484, + "epoch": 2.3709949409780777, + "grad_norm": 7.539771960538076, + "kl": 0.390625, + "learning_rate": 8.023648648648649e-07, + "loss": 0.0004, + "reward": 3.166221857070923, + "reward_std": 0.06890098564326763, + "rewards/final_reward": 1.1171324478432192, + "rewards/mask_iou_reward": 0.5585662239216096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.166221797466278, + "rewards/thk_ans_format_reward": 1.0, + "step": 702, + "think_completion_length": 11.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.10416793823242, + "epoch": 2.3743676222596966, + "grad_norm": 25.268475645824264, + "kl": 0.484375, + "learning_rate": 8.020833333333333e-07, + "loss": 0.0005, + "reward": 3.6058748960494995, + "reward_std": 0.1462496928870678, + "rewards/final_reward": 1.7142995604894324, + "rewards/mask_iou_reward": 0.8571497802447162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.60587477684021, + "rewards/thk_ans_format_reward": 1.0, + "step": 703, + "think_completion_length": 10.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.07291793823242, + "epoch": 2.3777403035413154, + "grad_norm": 5.86936429351156, + "kl": 0.513671875, + "learning_rate": 8.018018018018018e-07, + "loss": 0.0005, + "reward": 3.587558388710022, + "reward_std": 0.10432857647538185, + "rewards/final_reward": 1.5256586245058101, + "rewards/mask_iou_reward": 0.7628293122529051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5875583291053772, + "rewards/thk_ans_format_reward": 1.0, + "step": 704, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.98958587646484, + "epoch": 2.381112984822934, + "grad_norm": 24.572719789475233, + "kl": 0.458984375, + "learning_rate": 8.015202702702702e-07, + "loss": 0.0005, + "reward": 3.235170602798462, + "reward_std": 0.1947356564924121, + "rewards/final_reward": 1.6970421912813713, + "rewards/mask_iou_reward": 0.8485210956406857, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2351704239845276, + "rewards/thk_ans_format_reward": 1.0, + "step": 705, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.29166793823242, + "epoch": 2.384485666104553, + "grad_norm": 12.228544427874436, + "kl": 0.390625, + "learning_rate": 8.012387387387387e-07, + "loss": 0.0004, + "reward": 3.09304141998291, + "reward_std": 0.1303301863372326, + "rewards/final_reward": 1.5203176608072408, + "rewards/mask_iou_reward": 0.7601588304036204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0930412411689758, + "rewards/thk_ans_format_reward": 1.0, + "step": 706, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.30208587646484, + "epoch": 2.387858347386172, + "grad_norm": 11.83368014597079, + "kl": 4.857421875, + "learning_rate": 8.009572072072072e-07, + "loss": 0.0048, + "reward": 3.2624597549438477, + "reward_std": 0.18296461552381516, + "rewards/final_reward": 1.3711881609397212, + "rewards/mask_iou_reward": 0.6855940804698606, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.262459933757782, + "rewards/thk_ans_format_reward": 1.0, + "step": 707, + "think_completion_length": 10.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.33333587646484, + "epoch": 2.391231028667791, + "grad_norm": 16.164645790308068, + "kl": 0.4619140625, + "learning_rate": 8.006756756756756e-07, + "loss": 0.0005, + "reward": 2.9492790699005127, + "reward_std": 0.065645731985569, + "rewards/final_reward": 0.12719705036946563, + "rewards/mask_iou_reward": 0.06359852518473282, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9492788910865784, + "rewards/thk_ans_format_reward": 1.0, + "step": 708, + "think_completion_length": 11.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.19792175292969, + "epoch": 2.39460370994941, + "grad_norm": 11.315576698691505, + "kl": 0.486328125, + "learning_rate": 8.003941441441441e-07, + "loss": 0.0005, + "reward": 3.2986977100372314, + "reward_std": 0.16338078677654266, + "rewards/final_reward": 1.7742719041990807, + "rewards/mask_iou_reward": 0.8871359520995403, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.298697590827942, + "rewards/thk_ans_format_reward": 1.0, + "step": 709, + "think_completion_length": 10.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.64583587646484, + "epoch": 2.397976391231029, + "grad_norm": 10.16630512602375, + "kl": 0.4345703125, + "learning_rate": 8.001126126126125e-07, + "loss": 0.0004, + "reward": 3.5177905559539795, + "reward_std": 0.13988900184631348, + "rewards/final_reward": 1.7767121947188076, + "rewards/mask_iou_reward": 0.8883560973594038, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5177903771400452, + "rewards/thk_ans_format_reward": 1.0, + "step": 710, + "think_completion_length": 10.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.73958587646484, + "epoch": 2.4013490725126476, + "grad_norm": 11.27077040193017, + "kl": 0.4462890625, + "learning_rate": 7.99831081081081e-07, + "loss": 0.0004, + "reward": 3.3645020723342896, + "reward_std": 0.13164759427309036, + "rewards/final_reward": 0.8320936686355329, + "rewards/mask_iou_reward": 0.41604683431776646, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3645018935203552, + "rewards/thk_ans_format_reward": 1.0, + "step": 711, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.75000381469727, + "epoch": 2.4047217537942664, + "grad_norm": 10.775746892595787, + "kl": 0.66015625, + "learning_rate": 7.995495495495496e-07, + "loss": 0.0007, + "reward": 3.5911271572113037, + "reward_std": 0.06256835721433163, + "rewards/final_reward": 1.9670477449263895, + "rewards/mask_iou_reward": 0.9835238724631947, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.591127097606659, + "rewards/thk_ans_format_reward": 1.0, + "step": 712, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.04166793823242, + "epoch": 2.4080944350758853, + "grad_norm": 17.154979763981668, + "kl": 0.48828125, + "learning_rate": 7.99268018018018e-07, + "loss": 0.0005, + "reward": 3.3226637840270996, + "reward_std": 0.06822742521762848, + "rewards/final_reward": 1.8144922612172647, + "rewards/mask_iou_reward": 0.9072461306086324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3226639032363892, + "rewards/thk_ans_format_reward": 1.0, + "step": 713, + "think_completion_length": 10.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.36458587646484, + "epoch": 2.411467116357504, + "grad_norm": 7.41758232371269, + "kl": 0.5302734375, + "learning_rate": 7.989864864864865e-07, + "loss": 0.0005, + "reward": 3.438087582588196, + "reward_std": 0.1307989489287138, + "rewards/final_reward": 1.4276302815895208, + "rewards/mask_iou_reward": 0.7138151407947604, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4380874633789062, + "rewards/thk_ans_format_reward": 1.0, + "step": 714, + "think_completion_length": 11.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.89583969116211, + "epoch": 2.414839797639123, + "grad_norm": 8.601012209208584, + "kl": 0.54296875, + "learning_rate": 7.987049549549549e-07, + "loss": 0.0005, + "reward": 3.1718395948410034, + "reward_std": 0.15443510934710503, + "rewards/final_reward": 1.1746513664066902, + "rewards/mask_iou_reward": 0.5873256832033451, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1718396544456482, + "rewards/thk_ans_format_reward": 1.0, + "step": 715, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.21875, + "epoch": 2.4182124789207418, + "grad_norm": 13.661422976929645, + "kl": 0.6015625, + "learning_rate": 7.984234234234234e-07, + "loss": 0.0006, + "reward": 3.3755754232406616, + "reward_std": 0.17705781757831573, + "rewards/final_reward": 1.4551628170109665, + "rewards/mask_iou_reward": 0.7275814085054833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3755754232406616, + "rewards/thk_ans_format_reward": 1.0, + "step": 716, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.30208587646484, + "epoch": 2.421585160202361, + "grad_norm": 15.528903391546768, + "kl": 0.53125, + "learning_rate": 7.981418918918919e-07, + "loss": 0.0005, + "reward": 3.4267293214797974, + "reward_std": 0.21596352756023407, + "rewards/final_reward": 1.5162321323850754, + "rewards/mask_iou_reward": 0.7581160661925377, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4267293214797974, + "rewards/thk_ans_format_reward": 1.0, + "step": 717, + "think_completion_length": 11.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.13541793823242, + "epoch": 2.42495784148398, + "grad_norm": 7.660762203198963, + "kl": 0.763671875, + "learning_rate": 7.978603603603603e-07, + "loss": 0.0008, + "reward": 3.088021755218506, + "reward_std": 0.17404986545443535, + "rewards/final_reward": 1.4431847347902549, + "rewards/mask_iou_reward": 0.7215923673951274, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.088021606206894, + "rewards/thk_ans_format_reward": 1.0, + "step": 718, + "think_completion_length": 10.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.41666793823242, + "epoch": 2.4283305227655987, + "grad_norm": 10.275559775968965, + "kl": 0.771484375, + "learning_rate": 7.975788288288288e-07, + "loss": 0.0008, + "reward": 3.5344756841659546, + "reward_std": 0.09361070767045021, + "rewards/final_reward": 1.5879363495312444, + "rewards/mask_iou_reward": 0.7939681747656222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5344756841659546, + "rewards/thk_ans_format_reward": 1.0, + "step": 719, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.1875, + "epoch": 2.4317032040472175, + "grad_norm": 6.11686753079081, + "kl": 0.6015625, + "learning_rate": 7.972972972972972e-07, + "loss": 0.0006, + "reward": 3.258207678794861, + "reward_std": 0.21193058043718338, + "rewards/final_reward": 0.5687722459062485, + "rewards/mask_iou_reward": 0.28438612295312427, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2582077383995056, + "rewards/thk_ans_format_reward": 1.0, + "step": 720, + "think_completion_length": 11.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.82291793823242, + "epoch": 2.4350758853288363, + "grad_norm": 188.70353819108283, + "kl": 0.548828125, + "learning_rate": 7.970157657657657e-07, + "loss": 0.0005, + "reward": 3.328008770942688, + "reward_std": 0.07661649584770203, + "rewards/final_reward": 1.3145267406893124, + "rewards/mask_iou_reward": 0.6572633703446562, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3280084133148193, + "rewards/thk_ans_format_reward": 1.0, + "step": 721, + "think_completion_length": 9.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.79167175292969, + "epoch": 2.438448566610455, + "grad_norm": 8.175555799208823, + "kl": 0.595703125, + "learning_rate": 7.967342342342343e-07, + "loss": 0.0006, + "reward": 3.286064028739929, + "reward_std": 0.1305839866399765, + "rewards/final_reward": 1.9222867314872671, + "rewards/mask_iou_reward": 0.9611433657436336, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2860642075538635, + "rewards/thk_ans_format_reward": 1.0, + "step": 722, + "think_completion_length": 10.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.91667175292969, + "epoch": 2.441821247892074, + "grad_norm": 19.677120564321587, + "kl": 0.62109375, + "learning_rate": 7.964527027027027e-07, + "loss": 0.0006, + "reward": 3.4861291646957397, + "reward_std": 0.111208725720644, + "rewards/final_reward": 1.6771212868223513, + "rewards/mask_iou_reward": 0.8385606434111756, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4861292839050293, + "rewards/thk_ans_format_reward": 1.0, + "step": 723, + "think_completion_length": 11.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.41667175292969, + "epoch": 2.4451939291736933, + "grad_norm": 12.386210317467004, + "kl": 0.548828125, + "learning_rate": 7.961711711711712e-07, + "loss": 0.0005, + "reward": 3.435595750808716, + "reward_std": 0.21505354344844818, + "rewards/final_reward": 1.6424393573599578, + "rewards/mask_iou_reward": 0.8212196786799789, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4460124373435974, + "rewards/thk_ans_format_reward": 1.0, + "step": 724, + "think_completion_length": 11.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.35416793823242, + "epoch": 2.448566610455312, + "grad_norm": 90.88539800576194, + "kl": 0.56640625, + "learning_rate": 7.958896396396397e-07, + "loss": 0.0006, + "reward": 3.139678478240967, + "reward_std": 0.1556791141629219, + "rewards/final_reward": 1.0097397701785693, + "rewards/mask_iou_reward": 0.5048698850892847, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.13967826962471, + "rewards/thk_ans_format_reward": 1.0, + "step": 725, + "think_completion_length": 11.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.94791793823242, + "epoch": 2.451939291736931, + "grad_norm": 11.034827414989431, + "kl": 0.580078125, + "learning_rate": 7.956081081081081e-07, + "loss": 0.0006, + "reward": 3.3471599817276, + "reward_std": 0.052762774750590324, + "rewards/final_reward": 1.4570581685778468, + "rewards/mask_iou_reward": 0.7285290842889234, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3471597135066986, + "rewards/thk_ans_format_reward": 1.0, + "step": 726, + "think_completion_length": 11.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.71875381469727, + "epoch": 2.4553119730185498, + "grad_norm": 10.107819781591768, + "kl": 0.611328125, + "learning_rate": 7.953265765765766e-07, + "loss": 0.0006, + "reward": 3.216899871826172, + "reward_std": 0.11818000301718712, + "rewards/final_reward": 1.7161205932460326, + "rewards/mask_iou_reward": 0.8580602966230163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2168999910354614, + "rewards/thk_ans_format_reward": 1.0, + "step": 727, + "think_completion_length": 10.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.04166793823242, + "epoch": 2.4586846543001686, + "grad_norm": 6.674039882817239, + "kl": 0.560546875, + "learning_rate": 7.95045045045045e-07, + "loss": 0.0006, + "reward": 3.42681086063385, + "reward_std": 0.13024066016077995, + "rewards/final_reward": 1.6990264231312215, + "rewards/mask_iou_reward": 0.8495132115656108, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4268107414245605, + "rewards/thk_ans_format_reward": 1.0, + "step": 728, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.71875381469727, + "epoch": 2.4620573355817874, + "grad_norm": 13.049637366531218, + "kl": 0.65234375, + "learning_rate": 7.947635135135135e-07, + "loss": 0.0006, + "reward": 3.1891380548477173, + "reward_std": 0.19797684997320175, + "rewards/final_reward": 1.1756615795267868, + "rewards/mask_iou_reward": 0.5878307897633934, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.189138114452362, + "rewards/thk_ans_format_reward": 1.0, + "step": 729, + "think_completion_length": 11.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.70833587646484, + "epoch": 2.4654300168634062, + "grad_norm": 4.379687752783074, + "kl": 0.5703125, + "learning_rate": 7.944819819819819e-07, + "loss": 0.0006, + "reward": 3.259893774986267, + "reward_std": 0.07151791453361511, + "rewards/final_reward": 1.5924158283390697, + "rewards/mask_iou_reward": 0.7962079141695348, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.259893774986267, + "rewards/thk_ans_format_reward": 1.0, + "step": 730, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.13541793823242, + "epoch": 2.4688026981450255, + "grad_norm": 9.507324536247651, + "kl": 0.5546875, + "learning_rate": 7.942004504504503e-07, + "loss": 0.0006, + "reward": 3.215202569961548, + "reward_std": 0.0699087530374527, + "rewards/final_reward": 0.9720705627363007, + "rewards/mask_iou_reward": 0.48603528136815033, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2152024507522583, + "rewards/thk_ans_format_reward": 1.0, + "step": 731, + "think_completion_length": 9.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.61458587646484, + "epoch": 2.4721753794266443, + "grad_norm": 12.873305700274182, + "kl": 0.7734375, + "learning_rate": 7.939189189189189e-07, + "loss": 0.0008, + "reward": 3.025223970413208, + "reward_std": 0.12610819563269615, + "rewards/final_reward": 1.7189425763675459, + "rewards/mask_iou_reward": 0.8594712881837729, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0252237915992737, + "rewards/thk_ans_format_reward": 1.0, + "step": 732, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.91666793823242, + "epoch": 2.475548060708263, + "grad_norm": 9.722875671257425, + "kl": 0.615234375, + "learning_rate": 7.936373873873873e-07, + "loss": 0.0006, + "reward": 3.4183343648910522, + "reward_std": 0.08810793235898018, + "rewards/final_reward": 1.855176038165883, + "rewards/mask_iou_reward": 0.9275880190829415, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.418334186077118, + "rewards/thk_ans_format_reward": 1.0, + "step": 733, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.67708587646484, + "epoch": 2.478920741989882, + "grad_norm": 15.856710122466179, + "kl": 0.587890625, + "learning_rate": 7.933558558558558e-07, + "loss": 0.0006, + "reward": 3.4329049587249756, + "reward_std": 0.07124324329197407, + "rewards/final_reward": 1.4161840189363588, + "rewards/mask_iou_reward": 0.7080920094681794, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4329049587249756, + "rewards/thk_ans_format_reward": 1.0, + "step": 734, + "think_completion_length": 9.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.44791793823242, + "epoch": 2.482293423271501, + "grad_norm": 152.2683999431526, + "kl": 0.56640625, + "learning_rate": 7.930743243243243e-07, + "loss": 0.0006, + "reward": 3.3764851093292236, + "reward_std": 0.15127253159880638, + "rewards/final_reward": 1.7125658778008792, + "rewards/mask_iou_reward": 0.8562829389004396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3764851689338684, + "rewards/thk_ans_format_reward": 1.0, + "step": 735, + "think_completion_length": 9.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.04166793823242, + "epoch": 2.4856661045531196, + "grad_norm": 8.316898560977593, + "kl": 0.572265625, + "learning_rate": 7.927927927927927e-07, + "loss": 0.0006, + "reward": 3.1848560571670532, + "reward_std": 0.11521704494953156, + "rewards/final_reward": 0.7065922922768331, + "rewards/mask_iou_reward": 0.35329614613841653, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1848559379577637, + "rewards/thk_ans_format_reward": 1.0, + "step": 736, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.37500381469727, + "epoch": 2.4890387858347385, + "grad_norm": 11.203963199161379, + "kl": 0.595703125, + "learning_rate": 7.925112612612612e-07, + "loss": 0.0006, + "reward": 3.3153754472732544, + "reward_std": 0.23398957401514053, + "rewards/final_reward": 1.038106492915174, + "rewards/mask_iou_reward": 0.519053246457587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3153753578662872, + "rewards/thk_ans_format_reward": 1.0, + "step": 737, + "think_completion_length": 11.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.13542175292969, + "epoch": 2.4924114671163577, + "grad_norm": 7.715947393781205, + "kl": 0.6328125, + "learning_rate": 7.922297297297296e-07, + "loss": 0.0006, + "reward": 3.4682679176330566, + "reward_std": 0.08761341124773026, + "rewards/final_reward": 1.088569312463899, + "rewards/mask_iou_reward": 0.5442846562319495, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4682677388191223, + "rewards/thk_ans_format_reward": 1.0, + "step": 738, + "think_completion_length": 10.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.80208587646484, + "epoch": 2.4957841483979766, + "grad_norm": 41.751188322079955, + "kl": 0.55078125, + "learning_rate": 7.919481981981981e-07, + "loss": 0.0006, + "reward": 3.5137277841567993, + "reward_std": 0.13675726018846035, + "rewards/final_reward": 0.9585005643467535, + "rewards/mask_iou_reward": 0.47925028217337673, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5137277245521545, + "rewards/thk_ans_format_reward": 1.0, + "step": 739, + "think_completion_length": 12.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.72917175292969, + "epoch": 2.4991568296795954, + "grad_norm": 8.421219003802829, + "kl": 0.6328125, + "learning_rate": 7.916666666666666e-07, + "loss": 0.0006, + "reward": 3.611706256866455, + "reward_std": 0.21243294700980186, + "rewards/final_reward": 1.7471342800039473, + "rewards/mask_iou_reward": 0.8735671400019737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6117061376571655, + "rewards/thk_ans_format_reward": 1.0, + "step": 740, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.42708587646484, + "epoch": 2.5025295109612142, + "grad_norm": 9.829424324427816, + "kl": 0.568359375, + "learning_rate": 7.91385135135135e-07, + "loss": 0.0006, + "reward": 3.395363688468933, + "reward_std": 0.12305304408073425, + "rewards/final_reward": 1.5213473515200766, + "rewards/mask_iou_reward": 0.7606736757600383, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.395363688468933, + "rewards/thk_ans_format_reward": 1.0, + "step": 741, + "think_completion_length": 10.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.51041793823242, + "epoch": 2.505902192242833, + "grad_norm": 11.854834571460097, + "kl": 0.625, + "learning_rate": 7.911036036036036e-07, + "loss": 0.0006, + "reward": 3.4499409198760986, + "reward_std": 0.10465443879365921, + "rewards/final_reward": 1.543411262235664, + "rewards/mask_iou_reward": 0.771705631117832, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.449940800666809, + "rewards/thk_ans_format_reward": 1.0, + "step": 742, + "think_completion_length": 10.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.41666793823242, + "epoch": 2.509274873524452, + "grad_norm": 11.431790307732706, + "kl": 0.6015625, + "learning_rate": 7.908220720720721e-07, + "loss": 0.0006, + "reward": 3.214622139930725, + "reward_std": 0.03172614425420761, + "rewards/final_reward": 1.272425770492403, + "rewards/mask_iou_reward": 0.6362128852462015, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2146221995353699, + "rewards/thk_ans_format_reward": 1.0, + "step": 743, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.41667175292969, + "epoch": 2.5126475548060707, + "grad_norm": 8.830193308051955, + "kl": 0.572265625, + "learning_rate": 7.905405405405405e-07, + "loss": 0.0006, + "reward": 3.1759774684906006, + "reward_std": 0.10025330260396004, + "rewards/final_reward": 1.260989037491286, + "rewards/mask_iou_reward": 0.630494518745643, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1759773790836334, + "rewards/thk_ans_format_reward": 1.0, + "step": 744, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.50000762939453, + "epoch": 2.51602023608769, + "grad_norm": 20.759261570638913, + "kl": 0.80078125, + "learning_rate": 7.90259009009009e-07, + "loss": 0.0008, + "reward": 3.3195990324020386, + "reward_std": 0.09332029893994331, + "rewards/final_reward": 0.9221775935859856, + "rewards/mask_iou_reward": 0.4610887967929928, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.319598913192749, + "rewards/thk_ans_format_reward": 1.0, + "step": 745, + "think_completion_length": 10.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.58333587646484, + "epoch": 2.5193929173693084, + "grad_norm": 6.896446483981856, + "kl": 0.5546875, + "learning_rate": 7.899774774774774e-07, + "loss": 0.0006, + "reward": 3.1739712953567505, + "reward_std": 0.16202839836478233, + "rewards/final_reward": 1.7786173434773, + "rewards/mask_iou_reward": 0.88930867173865, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.173971176147461, + "rewards/thk_ans_format_reward": 1.0, + "step": 746, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.38541793823242, + "epoch": 2.5227655986509276, + "grad_norm": 11.71991482226643, + "kl": 0.625, + "learning_rate": 7.896959459459459e-07, + "loss": 0.0006, + "reward": 3.44577157497406, + "reward_std": 0.09941475465893745, + "rewards/final_reward": 1.323128358011256, + "rewards/mask_iou_reward": 0.661564179005628, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4457715153694153, + "rewards/thk_ans_format_reward": 1.0, + "step": 747, + "think_completion_length": 10.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.65625381469727, + "epoch": 2.5261382799325465, + "grad_norm": 7.493391681330979, + "kl": 0.5625, + "learning_rate": 7.894144144144144e-07, + "loss": 0.0006, + "reward": 3.0702245235443115, + "reward_std": 0.15229638293385506, + "rewards/final_reward": 1.1008842091447275, + "rewards/mask_iou_reward": 0.5504421045723638, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0702243149280548, + "rewards/thk_ans_format_reward": 1.0, + "step": 748, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.35417175292969, + "epoch": 2.5295109612141653, + "grad_norm": 14.720404781621857, + "kl": 0.87109375, + "learning_rate": 7.891328828828828e-07, + "loss": 0.0009, + "reward": 3.369284749031067, + "reward_std": 0.09888229332864285, + "rewards/final_reward": 1.722342529541773, + "rewards/mask_iou_reward": 0.8611712647708865, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3692848086357117, + "rewards/thk_ans_format_reward": 1.0, + "step": 749, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.29166793823242, + "epoch": 2.532883642495784, + "grad_norm": 33.226300521794954, + "kl": 0.64453125, + "learning_rate": 7.888513513513513e-07, + "loss": 0.0006, + "reward": 3.3416668176651, + "reward_std": 0.1597162000834942, + "rewards/final_reward": 1.3613542161465868, + "rewards/mask_iou_reward": 0.6806771080732934, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3416666984558105, + "rewards/thk_ans_format_reward": 1.0, + "step": 750, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.77083587646484, + "epoch": 2.536256323777403, + "grad_norm": 17.600051361273188, + "kl": 0.591796875, + "learning_rate": 7.885698198198197e-07, + "loss": 0.0006, + "reward": 3.325364351272583, + "reward_std": 0.21401405334472656, + "rewards/final_reward": 1.6552300188658267, + "rewards/mask_iou_reward": 0.8276150094329133, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3253644108772278, + "rewards/thk_ans_format_reward": 1.0, + "step": 751, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.31250762939453, + "epoch": 2.539629005059022, + "grad_norm": 22.345390640061805, + "kl": 0.775390625, + "learning_rate": 7.882882882882883e-07, + "loss": 0.0008, + "reward": 3.559385657310486, + "reward_std": 0.09891241788864136, + "rewards/final_reward": 1.1205373816626092, + "rewards/mask_iou_reward": 0.5602686908313046, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5593854784965515, + "rewards/thk_ans_format_reward": 1.0, + "step": 752, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.83333587646484, + "epoch": 2.5430016863406406, + "grad_norm": 42.517389071612314, + "kl": 0.560546875, + "learning_rate": 7.880067567567568e-07, + "loss": 0.0006, + "reward": 3.2056604623794556, + "reward_std": 0.1193031445145607, + "rewards/final_reward": 1.1134455035306088, + "rewards/mask_iou_reward": 0.5567227517653044, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2056604027748108, + "rewards/thk_ans_format_reward": 1.0, + "step": 753, + "think_completion_length": 6.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.86458587646484, + "epoch": 2.54637436762226, + "grad_norm": 31.312787511542698, + "kl": 0.583984375, + "learning_rate": 7.877252252252252e-07, + "loss": 0.0006, + "reward": 3.3875555992126465, + "reward_std": 0.14285332709550858, + "rewards/final_reward": 1.5229593984733443, + "rewards/mask_iou_reward": 0.7614796992366721, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3875554203987122, + "rewards/thk_ans_format_reward": 1.0, + "step": 754, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.88542175292969, + "epoch": 2.5497470489038787, + "grad_norm": 8.990249785443053, + "kl": 0.58984375, + "learning_rate": 7.874436936936937e-07, + "loss": 0.0006, + "reward": 2.9223185777664185, + "reward_std": 0.08924070559442043, + "rewards/final_reward": 1.0728038663698005, + "rewards/mask_iou_reward": 0.5364019331849003, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9223185181617737, + "rewards/thk_ans_format_reward": 1.0, + "step": 755, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.76042175292969, + "epoch": 2.5531197301854975, + "grad_norm": 13.843095526147629, + "kl": 0.83203125, + "learning_rate": 7.871621621621622e-07, + "loss": 0.0008, + "reward": 3.4950000047683716, + "reward_std": 0.05128934606909752, + "rewards/final_reward": 1.7058042723780633, + "rewards/mask_iou_reward": 0.8529021361890317, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4949997067451477, + "rewards/thk_ans_format_reward": 1.0, + "step": 756, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.50000381469727, + "epoch": 2.5564924114671164, + "grad_norm": 31.95743816328789, + "kl": 0.5703125, + "learning_rate": 7.868806306306306e-07, + "loss": 0.0006, + "reward": 3.0966413021087646, + "reward_std": 0.09938472509384155, + "rewards/final_reward": 1.5479266235541327, + "rewards/mask_iou_reward": 0.7739633117770663, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0966413915157318, + "rewards/thk_ans_format_reward": 1.0, + "step": 757, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.64583587646484, + "epoch": 2.559865092748735, + "grad_norm": 12.172451648435866, + "kl": 0.591796875, + "learning_rate": 7.865990990990991e-07, + "loss": 0.0006, + "reward": 3.5292413234710693, + "reward_std": 0.08699771389365196, + "rewards/final_reward": 0.8571612771535517, + "rewards/mask_iou_reward": 0.42858063857677586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5292413234710693, + "rewards/thk_ans_format_reward": 1.0, + "step": 758, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.45833587646484, + "epoch": 2.563237774030354, + "grad_norm": 11.122775520400872, + "kl": 0.62109375, + "learning_rate": 7.863175675675675e-07, + "loss": 0.0006, + "reward": 3.1527295112609863, + "reward_std": 0.2126385048031807, + "rewards/final_reward": 0.9309354482470767, + "rewards/mask_iou_reward": 0.4654677241235384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1527293026447296, + "rewards/thk_ans_format_reward": 1.0, + "step": 759, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.54166793823242, + "epoch": 2.566610455311973, + "grad_norm": 10.546932510994008, + "kl": 1.5859375, + "learning_rate": 7.86036036036036e-07, + "loss": 0.0016, + "reward": 3.086781620979309, + "reward_std": 0.07337499689310789, + "rewards/final_reward": 0.7351426490944075, + "rewards/mask_iou_reward": 0.36757132454720376, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.08678138256073, + "rewards/thk_ans_format_reward": 1.0, + "step": 760, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.02083587646484, + "epoch": 2.569983136593592, + "grad_norm": 13.133133046507506, + "kl": 0.62109375, + "learning_rate": 7.857545045045045e-07, + "loss": 0.0006, + "reward": 3.4137195348739624, + "reward_std": 0.044736314564943314, + "rewards/final_reward": 1.2257461358321629, + "rewards/mask_iou_reward": 0.6128730679160814, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4137197136878967, + "rewards/thk_ans_format_reward": 1.0, + "step": 761, + "think_completion_length": 10.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.65625381469727, + "epoch": 2.573355817875211, + "grad_norm": 32.595514732619456, + "kl": 0.615234375, + "learning_rate": 7.85472972972973e-07, + "loss": 0.0006, + "reward": 3.445251703262329, + "reward_std": 0.20248185843229294, + "rewards/final_reward": 1.233189164325462, + "rewards/mask_iou_reward": 0.616594582162731, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4452515840530396, + "rewards/thk_ans_format_reward": 1.0, + "step": 762, + "think_completion_length": 10.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.32292175292969, + "epoch": 2.5767284991568298, + "grad_norm": 77.43355853852889, + "kl": 0.541015625, + "learning_rate": 7.851914414414415e-07, + "loss": 0.0005, + "reward": 3.3963409662246704, + "reward_std": 0.1772306263446808, + "rewards/final_reward": 1.3941096598606393, + "rewards/mask_iou_reward": 0.6970548299303196, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.396340787410736, + "rewards/thk_ans_format_reward": 1.0, + "step": 763, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.41667175292969, + "epoch": 2.5801011804384486, + "grad_norm": 8.543003932244329, + "kl": 0.611328125, + "learning_rate": 7.849099099099099e-07, + "loss": 0.0006, + "reward": 3.338282823562622, + "reward_std": 0.1316997967660427, + "rewards/final_reward": 1.7267376913174535, + "rewards/mask_iou_reward": 0.8633688456587267, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3382827639579773, + "rewards/thk_ans_format_reward": 1.0, + "step": 764, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.79166793823242, + "epoch": 2.5834738617200674, + "grad_norm": 42.44242367659991, + "kl": 0.626953125, + "learning_rate": 7.846283783783784e-07, + "loss": 0.0007, + "reward": 3.2067281007766724, + "reward_std": 0.10132832825183868, + "rewards/final_reward": 0.6540462389631674, + "rewards/mask_iou_reward": 0.3270231194815837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2067280411720276, + "rewards/thk_ans_format_reward": 1.0, + "step": 765, + "think_completion_length": 10.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.52083587646484, + "epoch": 2.5868465430016863, + "grad_norm": 32.92501596552101, + "kl": 0.572265625, + "learning_rate": 7.843468468468469e-07, + "loss": 0.0006, + "reward": 3.105804204940796, + "reward_std": 0.20033784210681915, + "rewards/final_reward": 1.447985529265118, + "rewards/mask_iou_reward": 0.723992764632559, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1162208020687103, + "rewards/thk_ans_format_reward": 1.0, + "step": 766, + "think_completion_length": 10.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.35417175292969, + "epoch": 2.590219224283305, + "grad_norm": 5.836494401716033, + "kl": 0.646484375, + "learning_rate": 7.840653153153153e-07, + "loss": 0.0007, + "reward": 3.3003053665161133, + "reward_std": 0.06181888282299042, + "rewards/final_reward": 1.5327181532154235, + "rewards/mask_iou_reward": 0.7663590766077117, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3003052473068237, + "rewards/thk_ans_format_reward": 1.0, + "step": 767, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.82292175292969, + "epoch": 2.5935919055649244, + "grad_norm": 31.644724773768857, + "kl": 0.599609375, + "learning_rate": 7.837837837837838e-07, + "loss": 0.0006, + "reward": 3.611915349960327, + "reward_std": 0.133062107488513, + "rewards/final_reward": 1.6232534975612043, + "rewards/mask_iou_reward": 0.8116267487806021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6119154691696167, + "rewards/thk_ans_format_reward": 1.0, + "step": 768, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.23958587646484, + "epoch": 2.5969645868465427, + "grad_norm": 6.714877945185845, + "kl": 0.595703125, + "learning_rate": 7.835022522522522e-07, + "loss": 0.0006, + "reward": 3.281790614128113, + "reward_std": 0.27748487889766693, + "rewards/final_reward": 1.1574744637593963, + "rewards/mask_iou_reward": 0.5787372318796982, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2817904949188232, + "rewards/thk_ans_format_reward": 1.0, + "step": 769, + "think_completion_length": 11.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.28125381469727, + "epoch": 2.600337268128162, + "grad_norm": 16.498780792457595, + "kl": 0.677734375, + "learning_rate": 7.832207207207206e-07, + "loss": 0.0007, + "reward": 3.0205026865005493, + "reward_std": 0.08207221701741219, + "rewards/final_reward": 1.0467192538355548, + "rewards/mask_iou_reward": 0.5233596269177774, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0205026268959045, + "rewards/thk_ans_format_reward": 1.0, + "step": 770, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.52083587646484, + "epoch": 2.603709949409781, + "grad_norm": 11.788568156409315, + "kl": 0.57421875, + "learning_rate": 7.829391891891891e-07, + "loss": 0.0006, + "reward": 3.317116856575012, + "reward_std": 0.11956719309091568, + "rewards/final_reward": 1.6758310418194653, + "rewards/mask_iou_reward": 0.8379155209097326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3171168565750122, + "rewards/thk_ans_format_reward": 1.0, + "step": 771, + "think_completion_length": 11.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.4375, + "epoch": 2.6070826306913997, + "grad_norm": 33.04893941697202, + "kl": 0.5859375, + "learning_rate": 7.826576576576576e-07, + "loss": 0.0006, + "reward": 3.2055855989456177, + "reward_std": 0.08506038412451744, + "rewards/final_reward": 1.090031976130355, + "rewards/mask_iou_reward": 0.5450159880651775, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2055857181549072, + "rewards/thk_ans_format_reward": 1.0, + "step": 772, + "think_completion_length": 9.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.94791793823242, + "epoch": 2.6104553119730185, + "grad_norm": 16.00591255201323, + "kl": 0.62109375, + "learning_rate": 7.823761261261261e-07, + "loss": 0.0006, + "reward": 3.259799003601074, + "reward_std": 0.06505817919969559, + "rewards/final_reward": 1.2474749216725154, + "rewards/mask_iou_reward": 0.6237374608362577, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2597991228103638, + "rewards/thk_ans_format_reward": 1.0, + "step": 773, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.30208587646484, + "epoch": 2.6138279932546373, + "grad_norm": 16.467078793275938, + "kl": 0.509765625, + "learning_rate": 7.820945945945945e-07, + "loss": 0.0005, + "reward": 3.4382166862487793, + "reward_std": 0.09156141243875027, + "rewards/final_reward": 1.5640969101735402, + "rewards/mask_iou_reward": 0.7820484550867701, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4382166266441345, + "rewards/thk_ans_format_reward": 1.0, + "step": 774, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.69791793823242, + "epoch": 2.6172006745362566, + "grad_norm": 8.68110014124045, + "kl": 0.560546875, + "learning_rate": 7.81813063063063e-07, + "loss": 0.0005, + "reward": 3.143561840057373, + "reward_std": 0.05993725173175335, + "rewards/final_reward": 1.4912784029807846, + "rewards/mask_iou_reward": 0.7456392014903923, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1435619592666626, + "rewards/thk_ans_format_reward": 1.0, + "step": 775, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.79166793823242, + "epoch": 2.620573355817875, + "grad_norm": 8.67400352232433, + "kl": 0.60546875, + "learning_rate": 7.815315315315315e-07, + "loss": 0.0006, + "reward": 3.6337943077087402, + "reward_std": 0.2572034075856209, + "rewards/final_reward": 1.5875537717192088, + "rewards/mask_iou_reward": 0.7937768858596044, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6337940096855164, + "rewards/thk_ans_format_reward": 1.0, + "step": 776, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.41667175292969, + "epoch": 2.6239460370994943, + "grad_norm": 8.556882945235555, + "kl": 0.71875, + "learning_rate": 7.812499999999999e-07, + "loss": 0.0007, + "reward": 3.184899926185608, + "reward_std": 0.141241867095232, + "rewards/final_reward": 1.1142707135886631, + "rewards/mask_iou_reward": 0.5571353567943316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1848998963832855, + "rewards/thk_ans_format_reward": 1.0, + "step": 777, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.36458587646484, + "epoch": 2.627318718381113, + "grad_norm": 26.69004358153012, + "kl": 0.5859375, + "learning_rate": 7.809684684684684e-07, + "loss": 0.0006, + "reward": 3.3422510623931885, + "reward_std": 0.10112036764621735, + "rewards/final_reward": 1.6472102036234255, + "rewards/mask_iou_reward": 0.8236051018117128, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3422508835792542, + "rewards/thk_ans_format_reward": 1.0, + "step": 778, + "think_completion_length": 10.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.23958587646484, + "epoch": 2.630691399662732, + "grad_norm": 12.340711701333001, + "kl": 0.607421875, + "learning_rate": 7.806869369369369e-07, + "loss": 0.0006, + "reward": 3.3806689977645874, + "reward_std": 0.3221001923084259, + "rewards/final_reward": 1.4816500684072826, + "rewards/mask_iou_reward": 0.7408250342036413, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3806690573692322, + "rewards/thk_ans_format_reward": 1.0, + "step": 779, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.14583587646484, + "epoch": 2.6340640809443507, + "grad_norm": 28.7933229375175, + "kl": 0.599609375, + "learning_rate": 7.804054054054053e-07, + "loss": 0.0006, + "reward": 3.145534038543701, + "reward_std": 0.1955154836177826, + "rewards/final_reward": 1.109550532358149, + "rewards/mask_iou_reward": 0.5547752661790745, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1455340385437012, + "rewards/thk_ans_format_reward": 1.0, + "step": 780, + "think_completion_length": 10.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.23958587646484, + "epoch": 2.6374367622259696, + "grad_norm": 13.664880476457705, + "kl": 0.638671875, + "learning_rate": 7.801238738738738e-07, + "loss": 0.0007, + "reward": 3.010381579399109, + "reward_std": 0.16878048330545425, + "rewards/final_reward": 1.2768509072340888, + "rewards/mask_iou_reward": 0.6384254536170444, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0207980871200562, + "rewards/thk_ans_format_reward": 1.0, + "step": 781, + "think_completion_length": 11.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.81250381469727, + "epoch": 2.6408094435075884, + "grad_norm": 6.9108413654848455, + "kl": 0.755859375, + "learning_rate": 7.798423423423422e-07, + "loss": 0.0008, + "reward": 3.547639846801758, + "reward_std": 0.07762636616826057, + "rewards/final_reward": 1.7800088484193495, + "rewards/mask_iou_reward": 0.8900044242096747, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5476398468017578, + "rewards/thk_ans_format_reward": 1.0, + "step": 782, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.85416793823242, + "epoch": 2.6441821247892072, + "grad_norm": 9.5857677536209, + "kl": 0.640625, + "learning_rate": 7.795608108108108e-07, + "loss": 0.0006, + "reward": 3.4680780172348022, + "reward_std": 0.11806654557585716, + "rewards/final_reward": 1.7424451239109253, + "rewards/mask_iou_reward": 0.8712225619554627, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4680778980255127, + "rewards/thk_ans_format_reward": 1.0, + "step": 783, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.3125, + "epoch": 2.6475548060708265, + "grad_norm": 57.780748368086115, + "kl": 0.5546875, + "learning_rate": 7.792792792792793e-07, + "loss": 0.0007, + "reward": 2.8306479454040527, + "reward_std": 0.1650489717721939, + "rewards/final_reward": 0.5575330646046901, + "rewards/mask_iou_reward": 0.27876653230234505, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8306480944156647, + "rewards/thk_ans_format_reward": 1.0, + "step": 784, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.29167175292969, + "epoch": 2.6509274873524453, + "grad_norm": 22.501702902369207, + "kl": 0.6015625, + "learning_rate": 7.789977477477477e-07, + "loss": 0.0008, + "reward": 3.1414239406585693, + "reward_std": 0.08215552754700184, + "rewards/final_reward": 0.9676854785923981, + "rewards/mask_iou_reward": 0.48384273929619903, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1414238214492798, + "rewards/thk_ans_format_reward": 1.0, + "step": 785, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.88541793823242, + "epoch": 2.654300168634064, + "grad_norm": 21.46706283599788, + "kl": 0.5703125, + "learning_rate": 7.787162162162162e-07, + "loss": 0.0006, + "reward": 3.4730279445648193, + "reward_std": 0.08347597345709801, + "rewards/final_reward": 1.4931069761398357, + "rewards/mask_iou_reward": 0.7465534880699178, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4730280041694641, + "rewards/thk_ans_format_reward": 1.0, + "step": 786, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.5, + "epoch": 2.657672849915683, + "grad_norm": 9.05205086856852, + "kl": 0.595703125, + "learning_rate": 7.784346846846846e-07, + "loss": 0.0006, + "reward": 3.118174910545349, + "reward_std": 0.0817178450524807, + "rewards/final_reward": 1.3790720365145952, + "rewards/mask_iou_reward": 0.6895360182572976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1181747615337372, + "rewards/thk_ans_format_reward": 1.0, + "step": 787, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.87500381469727, + "epoch": 2.661045531197302, + "grad_norm": 21.72331596169387, + "kl": 0.787109375, + "learning_rate": 7.781531531531531e-07, + "loss": 0.0008, + "reward": 3.3093440532684326, + "reward_std": 0.10094352997839451, + "rewards/final_reward": 1.704846715631396, + "rewards/mask_iou_reward": 0.852423357815698, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3093441128730774, + "rewards/thk_ans_format_reward": 1.0, + "step": 788, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.23958587646484, + "epoch": 2.6644182124789206, + "grad_norm": 10.87397622578721, + "kl": 0.7890625, + "learning_rate": 7.778716216216216e-07, + "loss": 0.0008, + "reward": 3.4442501068115234, + "reward_std": 0.07595014199614525, + "rewards/final_reward": 1.6731010669686723, + "rewards/mask_iou_reward": 0.8365505334843362, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4442499279975891, + "rewards/thk_ans_format_reward": 1.0, + "step": 789, + "think_completion_length": 10.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.16666793823242, + "epoch": 2.6677908937605395, + "grad_norm": 34.326508886063536, + "kl": 0.55078125, + "learning_rate": 7.7759009009009e-07, + "loss": 0.0006, + "reward": 3.0237648487091064, + "reward_std": 0.14004899561405182, + "rewards/final_reward": 0.8710051356850619, + "rewards/mask_iou_reward": 0.43550256784253094, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0237650275230408, + "rewards/thk_ans_format_reward": 1.0, + "step": 790, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.41667175292969, + "epoch": 2.6711635750421587, + "grad_norm": 10.07884750719032, + "kl": 0.541015625, + "learning_rate": 7.773085585585585e-07, + "loss": 0.0005, + "reward": 3.0886744260787964, + "reward_std": 0.14586707949638367, + "rewards/final_reward": 0.8601907362518856, + "rewards/mask_iou_reward": 0.4300953681259428, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.088674247264862, + "rewards/thk_ans_format_reward": 1.0, + "step": 791, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.63542175292969, + "epoch": 2.6745362563237776, + "grad_norm": 41.51262508899244, + "kl": 0.599609375, + "learning_rate": 7.77027027027027e-07, + "loss": 0.0006, + "reward": 3.345741033554077, + "reward_std": 0.09703287482261658, + "rewards/final_reward": 1.1597580532708955, + "rewards/mask_iou_reward": 0.5798790266354478, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3457409143447876, + "rewards/thk_ans_format_reward": 1.0, + "step": 792, + "think_completion_length": 11.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.09375, + "epoch": 2.6779089376053964, + "grad_norm": 7.769022810438781, + "kl": 0.654296875, + "learning_rate": 7.767454954954955e-07, + "loss": 0.0007, + "reward": 3.578448176383972, + "reward_std": 0.07281693629920483, + "rewards/final_reward": 1.2307804240001008, + "rewards/mask_iou_reward": 0.6153902120000504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5784481167793274, + "rewards/thk_ans_format_reward": 1.0, + "step": 793, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.22916793823242, + "epoch": 2.681281618887015, + "grad_norm": 15.460994244582118, + "kl": 0.578125, + "learning_rate": 7.76463963963964e-07, + "loss": 0.0006, + "reward": 3.132830500602722, + "reward_std": 0.18471045047044754, + "rewards/final_reward": 1.4256940575311807, + "rewards/mask_iou_reward": 0.7128470287655904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1328304409980774, + "rewards/thk_ans_format_reward": 1.0, + "step": 794, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.05208587646484, + "epoch": 2.684654300168634, + "grad_norm": 10.330542806577336, + "kl": 0.58203125, + "learning_rate": 7.761824324324324e-07, + "loss": 0.0006, + "reward": 2.9948168992996216, + "reward_std": 0.11794888228178024, + "rewards/final_reward": 0.36272560535996373, + "rewards/mask_iou_reward": 0.18136280267998187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9948166310787201, + "rewards/thk_ans_format_reward": 1.0, + "step": 795, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.75000381469727, + "epoch": 2.688026981450253, + "grad_norm": 8.850614484587357, + "kl": 0.578125, + "learning_rate": 7.759009009009009e-07, + "loss": 0.0006, + "reward": 3.410898804664612, + "reward_std": 0.031522348057478666, + "rewards/final_reward": 1.9422227818541504, + "rewards/mask_iou_reward": 0.9711113909270752, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4108988046646118, + "rewards/thk_ans_format_reward": 1.0, + "step": 796, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.875, + "epoch": 2.6913996627318717, + "grad_norm": 40.152092093398544, + "kl": 0.6640625, + "learning_rate": 7.756193693693694e-07, + "loss": 0.0007, + "reward": 3.6436657905578613, + "reward_std": 0.10985162109136581, + "rewards/final_reward": 1.8214199167549583, + "rewards/mask_iou_reward": 0.9107099583774791, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6436654925346375, + "rewards/thk_ans_format_reward": 1.0, + "step": 797, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.71875762939453, + "epoch": 2.694772344013491, + "grad_norm": 10.087858747837071, + "kl": 0.537109375, + "learning_rate": 7.753378378378378e-07, + "loss": 0.0006, + "reward": 3.4809383153915405, + "reward_std": 0.08559620007872581, + "rewards/final_reward": 0.6748064299090106, + "rewards/mask_iou_reward": 0.3374032149545053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.48093843460083, + "rewards/thk_ans_format_reward": 1.0, + "step": 798, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.15625381469727, + "epoch": 2.6981450252951094, + "grad_norm": 18.52801654982961, + "kl": 0.548828125, + "learning_rate": 7.750563063063063e-07, + "loss": 0.0006, + "reward": 3.5452533960342407, + "reward_std": 0.1478636972606182, + "rewards/final_reward": 1.5319860973186294, + "rewards/mask_iou_reward": 0.7659930486593147, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.545253336429596, + "rewards/thk_ans_format_reward": 1.0, + "step": 799, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.13542175292969, + "epoch": 2.7015177065767286, + "grad_norm": 10.657004525322808, + "kl": 0.58203125, + "learning_rate": 7.747747747747747e-07, + "loss": 0.0006, + "reward": 3.6775232553482056, + "reward_std": 0.09374432638287544, + "rewards/final_reward": 1.8238359214270736, + "rewards/mask_iou_reward": 0.9119179607135368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.677523136138916, + "rewards/thk_ans_format_reward": 1.0, + "step": 800, + "think_completion_length": 10.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.4166717529297, + "epoch": 2.7048903878583475, + "grad_norm": 11.342698990143404, + "kl": 0.552734375, + "learning_rate": 7.744932432432432e-07, + "loss": 0.0006, + "reward": 2.8509023189544678, + "reward_std": 0.1925317421555519, + "rewards/final_reward": 1.3280640382980335, + "rewards/mask_iou_reward": 0.6640320191490168, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.850902259349823, + "rewards/thk_ans_format_reward": 1.0, + "step": 801, + "think_completion_length": 9.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.60417175292969, + "epoch": 2.7082630691399663, + "grad_norm": 9.082344751225259, + "kl": 0.69140625, + "learning_rate": 7.742117117117117e-07, + "loss": 0.0007, + "reward": 3.264283299446106, + "reward_std": 0.0901465336792171, + "rewards/final_reward": 1.0489787386958307, + "rewards/mask_iou_reward": 0.5244893693479153, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2642829716205597, + "rewards/thk_ans_format_reward": 1.0, + "step": 802, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.3229217529297, + "epoch": 2.711635750421585, + "grad_norm": 20.722170032828355, + "kl": 0.486328125, + "learning_rate": 7.739301801801802e-07, + "loss": 0.0005, + "reward": 3.3872246742248535, + "reward_std": 0.10432956367731094, + "rewards/final_reward": 1.1002763490882639, + "rewards/mask_iou_reward": 0.5501381745441319, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3872247338294983, + "rewards/thk_ans_format_reward": 1.0, + "step": 803, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.33333587646484, + "epoch": 2.715008431703204, + "grad_norm": 7.361199516507246, + "kl": 0.58203125, + "learning_rate": 7.736486486486487e-07, + "loss": 0.0006, + "reward": 3.1800429821014404, + "reward_std": 0.07173176482319832, + "rewards/final_reward": 1.3100901139880954, + "rewards/mask_iou_reward": 0.6550450569940477, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1800428628921509, + "rewards/thk_ans_format_reward": 1.0, + "step": 804, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.66666793823242, + "epoch": 2.718381112984823, + "grad_norm": 9.78121969722757, + "kl": 0.578125, + "learning_rate": 7.733671171171171e-07, + "loss": 0.0006, + "reward": 3.287275791168213, + "reward_std": 0.06260454282164574, + "rewards/final_reward": 1.7459222437466897, + "rewards/mask_iou_reward": 0.8729611218733448, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.287275791168213, + "rewards/thk_ans_format_reward": 1.0, + "step": 805, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.63541793823242, + "epoch": 2.7217537942664416, + "grad_norm": 10.8247555149658, + "kl": 0.658203125, + "learning_rate": 7.730855855855856e-07, + "loss": 0.0007, + "reward": 3.0468918085098267, + "reward_std": 0.18924224376678467, + "rewards/final_reward": 1.2246028478679378, + "rewards/mask_iou_reward": 0.6123014239339689, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.046891689300537, + "rewards/thk_ans_format_reward": 1.0, + "step": 806, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.56250381469727, + "epoch": 2.725126475548061, + "grad_norm": 17.086063759084684, + "kl": 1.6005859375, + "learning_rate": 7.728040540540541e-07, + "loss": 0.0016, + "reward": 3.525339722633362, + "reward_std": 0.03663340024650097, + "rewards/final_reward": 1.7980175535130674, + "rewards/mask_iou_reward": 0.8990087767565337, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.525339961051941, + "rewards/thk_ans_format_reward": 1.0, + "step": 807, + "think_completion_length": 10.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.34375, + "epoch": 2.7284991568296797, + "grad_norm": 7.833542813069643, + "kl": 0.62890625, + "learning_rate": 7.725225225225225e-07, + "loss": 0.0006, + "reward": 3.1154762506484985, + "reward_std": 0.04759081266820431, + "rewards/final_reward": 0.8310892527079872, + "rewards/mask_iou_reward": 0.4155446263539936, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.115476131439209, + "rewards/thk_ans_format_reward": 1.0, + "step": 808, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.48958969116211, + "epoch": 2.7318718381112985, + "grad_norm": 9.54137374323096, + "kl": 0.544921875, + "learning_rate": 7.722409909909909e-07, + "loss": 0.0006, + "reward": 3.054728627204895, + "reward_std": 0.10850285552442074, + "rewards/final_reward": 1.290442879401124, + "rewards/mask_iou_reward": 0.645221439700562, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0547286868095398, + "rewards/thk_ans_format_reward": 1.0, + "step": 809, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.55208587646484, + "epoch": 2.7352445193929174, + "grad_norm": 11.466989004969372, + "kl": 0.564453125, + "learning_rate": 7.719594594594593e-07, + "loss": 0.0006, + "reward": 3.473750948905945, + "reward_std": 0.1244993582367897, + "rewards/final_reward": 0.9779945483246706, + "rewards/mask_iou_reward": 0.4889972741623353, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4737508893013, + "rewards/thk_ans_format_reward": 1.0, + "step": 810, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.77083587646484, + "epoch": 2.738617200674536, + "grad_norm": 6.899361597270846, + "kl": 0.58203125, + "learning_rate": 7.716779279279278e-07, + "loss": 0.0006, + "reward": 3.7204372882843018, + "reward_std": 0.06413896754384041, + "rewards/final_reward": 1.8257265424655205, + "rewards/mask_iou_reward": 0.9128632712327602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7204372882843018, + "rewards/thk_ans_format_reward": 1.0, + "step": 811, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.70833587646484, + "epoch": 2.741989881956155, + "grad_norm": 26.308406144808927, + "kl": 0.6171875, + "learning_rate": 7.713963963963963e-07, + "loss": 0.0006, + "reward": 3.508059859275818, + "reward_std": 0.12611358240246773, + "rewards/final_reward": 1.4690036592278546, + "rewards/mask_iou_reward": 0.7345018296139273, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.508059799671173, + "rewards/thk_ans_format_reward": 1.0, + "step": 812, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.87500381469727, + "epoch": 2.745362563237774, + "grad_norm": 12.883486258773985, + "kl": 0.634765625, + "learning_rate": 7.711148648648648e-07, + "loss": 0.0006, + "reward": 3.470430374145508, + "reward_std": 0.1318624820560217, + "rewards/final_reward": 1.7007833258640779, + "rewards/mask_iou_reward": 0.8503916629320389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4704301953315735, + "rewards/thk_ans_format_reward": 1.0, + "step": 813, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.57292175292969, + "epoch": 2.748735244519393, + "grad_norm": 33.01004357932649, + "kl": 0.607421875, + "learning_rate": 7.708333333333333e-07, + "loss": 0.0006, + "reward": 3.14322030544281, + "reward_std": 0.16609105467796326, + "rewards/final_reward": 1.6312836242368816, + "rewards/mask_iou_reward": 0.8156418121184408, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.14322030544281, + "rewards/thk_ans_format_reward": 1.0, + "step": 814, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.04167175292969, + "epoch": 2.752107925801012, + "grad_norm": 11.569718364209079, + "kl": 0.54296875, + "learning_rate": 7.705518018018018e-07, + "loss": 0.0006, + "reward": 3.368572950363159, + "reward_std": 0.09339471906423569, + "rewards/final_reward": 1.6949728648788742, + "rewards/mask_iou_reward": 0.8474864324394371, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3685729503631592, + "rewards/thk_ans_format_reward": 1.0, + "step": 815, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.89583587646484, + "epoch": 2.7554806070826308, + "grad_norm": 10.47686924222413, + "kl": 0.609375, + "learning_rate": 7.702702702702702e-07, + "loss": 0.0006, + "reward": 3.353748917579651, + "reward_std": 0.06471065618097782, + "rewards/final_reward": 0.9686143763127055, + "rewards/mask_iou_reward": 0.48430718815635276, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3537487983703613, + "rewards/thk_ans_format_reward": 1.0, + "step": 816, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.36458587646484, + "epoch": 2.7588532883642496, + "grad_norm": 14.196036160401253, + "kl": 0.56640625, + "learning_rate": 7.699887387387387e-07, + "loss": 0.0006, + "reward": 3.155430555343628, + "reward_std": 0.17521775886416435, + "rewards/final_reward": 1.5953339400653568, + "rewards/mask_iou_reward": 0.7976669700326784, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1554304361343384, + "rewards/thk_ans_format_reward": 1.0, + "step": 817, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.17708587646484, + "epoch": 2.7622259696458684, + "grad_norm": 8.868325037864638, + "kl": 0.630859375, + "learning_rate": 7.697072072072071e-07, + "loss": 0.0006, + "reward": 3.2271395921707153, + "reward_std": 0.1283954232931137, + "rewards/final_reward": 1.859811171144015, + "rewards/mask_iou_reward": 0.9299055855720075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2271394431591034, + "rewards/thk_ans_format_reward": 1.0, + "step": 818, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.84375381469727, + "epoch": 2.7655986509274872, + "grad_norm": 10.61611861966689, + "kl": 0.6015625, + "learning_rate": 7.694256756756756e-07, + "loss": 0.0006, + "reward": 3.309920072555542, + "reward_std": 0.09556181728839874, + "rewards/final_reward": 1.5073089264477035, + "rewards/mask_iou_reward": 0.7536544632238518, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.309919834136963, + "rewards/thk_ans_format_reward": 1.0, + "step": 819, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.68750381469727, + "epoch": 2.768971332209106, + "grad_norm": 23.21356291600513, + "kl": 0.58984375, + "learning_rate": 7.691441441441441e-07, + "loss": 0.0006, + "reward": 3.2571535110473633, + "reward_std": 0.09767593070864677, + "rewards/final_reward": 1.4686262349494574, + "rewards/mask_iou_reward": 0.7343131174747287, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2571535110473633, + "rewards/thk_ans_format_reward": 1.0, + "step": 820, + "think_completion_length": 7.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.82291793823242, + "epoch": 2.7723440134907253, + "grad_norm": 18.56850560046818, + "kl": 0.611328125, + "learning_rate": 7.688626126126125e-07, + "loss": 0.0006, + "reward": 3.1783725023269653, + "reward_std": 0.12528940849006176, + "rewards/final_reward": 0.9751798189706983, + "rewards/mask_iou_reward": 0.48758990948534914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1783722639083862, + "rewards/thk_ans_format_reward": 1.0, + "step": 821, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.22916793823242, + "epoch": 2.775716694772344, + "grad_norm": 11.552415569645232, + "kl": 0.578125, + "learning_rate": 7.68581081081081e-07, + "loss": 0.0006, + "reward": 3.4517630338668823, + "reward_std": 0.09905361756682396, + "rewards/final_reward": 1.0850289802981778, + "rewards/mask_iou_reward": 0.5425144901490889, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4517630338668823, + "rewards/thk_ans_format_reward": 1.0, + "step": 822, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.69791793823242, + "epoch": 2.779089376053963, + "grad_norm": 14.11041251237749, + "kl": 0.67578125, + "learning_rate": 7.682995495495495e-07, + "loss": 0.0007, + "reward": 3.6109098196029663, + "reward_std": 0.15111944265663624, + "rewards/final_reward": 1.4433314059687583, + "rewards/mask_iou_reward": 0.7216657029843792, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.610909879207611, + "rewards/thk_ans_format_reward": 1.0, + "step": 823, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.13541793823242, + "epoch": 2.782462057335582, + "grad_norm": 19.407652730449964, + "kl": 0.59375, + "learning_rate": 7.68018018018018e-07, + "loss": 0.0006, + "reward": 3.5962259769439697, + "reward_std": 0.09219707362353802, + "rewards/final_reward": 1.0636648632991816, + "rewards/mask_iou_reward": 0.5318324316495908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5962256789207458, + "rewards/thk_ans_format_reward": 1.0, + "step": 824, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.45833587646484, + "epoch": 2.7858347386172007, + "grad_norm": 48.76370678893969, + "kl": 0.61328125, + "learning_rate": 7.677364864864865e-07, + "loss": 0.0006, + "reward": 3.2339768409729004, + "reward_std": 0.021290178410708904, + "rewards/final_reward": 0.9685718969102269, + "rewards/mask_iou_reward": 0.48428594845511347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2339767217636108, + "rewards/thk_ans_format_reward": 1.0, + "step": 825, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.19791793823242, + "epoch": 2.7892074198988195, + "grad_norm": 23.00837890523965, + "kl": 0.615234375, + "learning_rate": 7.674549549549549e-07, + "loss": 0.0006, + "reward": 3.3137834072113037, + "reward_std": 0.12772230803966522, + "rewards/final_reward": 0.8328103990514315, + "rewards/mask_iou_reward": 0.41640519952571575, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3137832880020142, + "rewards/thk_ans_format_reward": 1.0, + "step": 826, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.57291793823242, + "epoch": 2.7925801011804383, + "grad_norm": 10.93210808105748, + "kl": 0.599609375, + "learning_rate": 7.671734234234234e-07, + "loss": 0.0006, + "reward": 3.1316012144088745, + "reward_std": 0.13967030495405197, + "rewards/final_reward": 0.7290396522833856, + "rewards/mask_iou_reward": 0.3645198261416928, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1316012740135193, + "rewards/thk_ans_format_reward": 1.0, + "step": 827, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.54166793823242, + "epoch": 2.7959527824620576, + "grad_norm": 7.509396937254856, + "kl": 0.576171875, + "learning_rate": 7.668918918918918e-07, + "loss": 0.0006, + "reward": 3.354486584663391, + "reward_std": 0.09209609404206276, + "rewards/final_reward": 0.9352793145560208, + "rewards/mask_iou_reward": 0.4676396572780104, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3544864654541016, + "rewards/thk_ans_format_reward": 1.0, + "step": 828, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.16666793823242, + "epoch": 2.799325463743676, + "grad_norm": 19.71623898667479, + "kl": 0.59765625, + "learning_rate": 7.666103603603603e-07, + "loss": 0.0006, + "reward": 3.440225601196289, + "reward_std": 0.15994113497436047, + "rewards/final_reward": 1.4639581496698817, + "rewards/mask_iou_reward": 0.7319790748349408, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.440225601196289, + "rewards/thk_ans_format_reward": 1.0, + "step": 829, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.73958587646484, + "epoch": 2.8026981450252952, + "grad_norm": 7.641932508738693, + "kl": 0.974609375, + "learning_rate": 7.663288288288288e-07, + "loss": 0.001, + "reward": 3.4703879356384277, + "reward_std": 0.06228171847760677, + "rewards/final_reward": 0.9386934150923482, + "rewards/mask_iou_reward": 0.4693467075461741, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4703876972198486, + "rewards/thk_ans_format_reward": 1.0, + "step": 830, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.61458587646484, + "epoch": 2.806070826306914, + "grad_norm": 8.790813148317039, + "kl": 0.5625, + "learning_rate": 7.660472972972972e-07, + "loss": 0.0006, + "reward": 3.3965048789978027, + "reward_std": 0.12809063494205475, + "rewards/final_reward": 1.6194442213451081, + "rewards/mask_iou_reward": 0.8097221106725541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3965047597885132, + "rewards/thk_ans_format_reward": 1.0, + "step": 831, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.71875381469727, + "epoch": 2.809443507588533, + "grad_norm": 11.087230061558003, + "kl": 0.6015625, + "learning_rate": 7.657657657657657e-07, + "loss": 0.0006, + "reward": 2.8211324214935303, + "reward_std": 0.06655344553291798, + "rewards/final_reward": 0.9339966788083726, + "rewards/mask_iou_reward": 0.4669983394041863, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.821132242679596, + "rewards/thk_ans_format_reward": 1.0, + "step": 832, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.75000381469727, + "epoch": 2.8128161888701517, + "grad_norm": 11.599365716013358, + "kl": 0.6171875, + "learning_rate": 7.654842342342343e-07, + "loss": 0.0006, + "reward": 3.1649749279022217, + "reward_std": 0.1306835636496544, + "rewards/final_reward": 1.1254830358712231, + "rewards/mask_iou_reward": 0.5627415179356116, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1649749279022217, + "rewards/thk_ans_format_reward": 1.0, + "step": 833, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.16666793823242, + "epoch": 2.8161888701517706, + "grad_norm": 34.09302576043981, + "kl": 0.658203125, + "learning_rate": 7.652027027027027e-07, + "loss": 0.0007, + "reward": 3.0825788974761963, + "reward_std": 0.19801976531744003, + "rewards/final_reward": 1.8678686666522455, + "rewards/mask_iou_reward": 0.9339343333261227, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0825787782669067, + "rewards/thk_ans_format_reward": 1.0, + "step": 834, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.60416793823242, + "epoch": 2.8195615514333894, + "grad_norm": 6.514989494359038, + "kl": 0.638671875, + "learning_rate": 7.649211711711712e-07, + "loss": 0.0006, + "reward": 3.1945159435272217, + "reward_std": 0.21621747314929962, + "rewards/final_reward": 1.2842385429337646, + "rewards/mask_iou_reward": 0.6421192714668823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1945159435272217, + "rewards/thk_ans_format_reward": 1.0, + "step": 835, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.25, + "epoch": 2.822934232715008, + "grad_norm": 12.012866329193594, + "kl": 0.71875, + "learning_rate": 7.646396396396396e-07, + "loss": 0.0007, + "reward": 3.374995231628418, + "reward_std": 0.10654059797525406, + "rewards/final_reward": 0.9926595462633434, + "rewards/mask_iou_reward": 0.4963297731316717, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3749953508377075, + "rewards/thk_ans_format_reward": 1.0, + "step": 836, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.05208587646484, + "epoch": 2.8263069139966275, + "grad_norm": 12.41159393978972, + "kl": 0.533203125, + "learning_rate": 7.643581081081081e-07, + "loss": 0.0005, + "reward": 3.1620161533355713, + "reward_std": 0.11073334142565727, + "rewards/final_reward": 1.1209113472967716, + "rewards/mask_iou_reward": 0.5604556736483858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1620161533355713, + "rewards/thk_ans_format_reward": 1.0, + "step": 837, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.93750762939453, + "epoch": 2.8296795952782463, + "grad_norm": 31.69858166724224, + "kl": 0.55859375, + "learning_rate": 7.640765765765766e-07, + "loss": 0.0006, + "reward": 3.7063937187194824, + "reward_std": 0.14060832560062408, + "rewards/final_reward": 1.7010022958199755, + "rewards/mask_iou_reward": 0.8505011479099878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7063936591148376, + "rewards/thk_ans_format_reward": 1.0, + "step": 838, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8958396911621, + "epoch": 2.833052276559865, + "grad_norm": 16.940299380888987, + "kl": 0.580078125, + "learning_rate": 7.63795045045045e-07, + "loss": 0.0006, + "reward": 3.263391137123108, + "reward_std": 0.27852288633584976, + "rewards/final_reward": 1.2191613669587613, + "rewards/mask_iou_reward": 0.6095806834793807, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2738077640533447, + "rewards/thk_ans_format_reward": 1.0, + "step": 839, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.56250381469727, + "epoch": 2.836424957841484, + "grad_norm": 9.3217946169954, + "kl": 0.5703125, + "learning_rate": 7.635135135135135e-07, + "loss": 0.0006, + "reward": 3.2963112592697144, + "reward_std": 0.2867426946759224, + "rewards/final_reward": 1.3462905702697516, + "rewards/mask_iou_reward": 0.6731452851348758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.29631108045578, + "rewards/thk_ans_format_reward": 1.0, + "step": 840, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.23958587646484, + "epoch": 2.839797639123103, + "grad_norm": 6.720125561600698, + "kl": 0.64453125, + "learning_rate": 7.632319819819819e-07, + "loss": 0.0006, + "reward": 3.3049758672714233, + "reward_std": 0.1726619452238083, + "rewards/final_reward": 1.7811929357135363, + "rewards/mask_iou_reward": 0.8905964678567682, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3049756586551666, + "rewards/thk_ans_format_reward": 1.0, + "step": 841, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.30208587646484, + "epoch": 2.8431703204047216, + "grad_norm": 12.783520229439375, + "kl": 0.58984375, + "learning_rate": 7.629504504504504e-07, + "loss": 0.0006, + "reward": 3.605306386947632, + "reward_std": 0.09047617763280869, + "rewards/final_reward": 1.752034756424516, + "rewards/mask_iou_reward": 0.876017378212258, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6053063869476318, + "rewards/thk_ans_format_reward": 1.0, + "step": 842, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.61458587646484, + "epoch": 2.8465430016863404, + "grad_norm": 13.67923872207617, + "kl": 0.666015625, + "learning_rate": 7.62668918918919e-07, + "loss": 0.0007, + "reward": 3.123603582382202, + "reward_std": 0.21856746077537537, + "rewards/final_reward": 0.8532247935583459, + "rewards/mask_iou_reward": 0.42661239677917295, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1236035227775574, + "rewards/thk_ans_format_reward": 1.0, + "step": 843, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.44792175292969, + "epoch": 2.8499156829679597, + "grad_norm": 12.297845593512243, + "kl": 0.59765625, + "learning_rate": 7.623873873873874e-07, + "loss": 0.0006, + "reward": 3.408300280570984, + "reward_std": 0.11788310110569, + "rewards/final_reward": 1.5378804551548173, + "rewards/mask_iou_reward": 0.7689402275774087, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4083001613616943, + "rewards/thk_ans_format_reward": 1.0, + "step": 844, + "think_completion_length": 5.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.13542175292969, + "epoch": 2.8532883642495785, + "grad_norm": 7.898260141271431, + "kl": 0.650390625, + "learning_rate": 7.621058558558559e-07, + "loss": 0.0007, + "reward": 3.4168694019317627, + "reward_std": 0.0871502235531807, + "rewards/final_reward": 1.674952679337213, + "rewards/mask_iou_reward": 0.8374763396686065, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.416869342327118, + "rewards/thk_ans_format_reward": 1.0, + "step": 845, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.26041793823242, + "epoch": 2.8566610455311974, + "grad_norm": 18.807496744343155, + "kl": 0.634765625, + "learning_rate": 7.618243243243244e-07, + "loss": 0.0007, + "reward": 3.249427914619446, + "reward_std": 0.11862549185752869, + "rewards/final_reward": 0.9638047950481252, + "rewards/mask_iou_reward": 0.4819023975240626, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2494277954101562, + "rewards/thk_ans_format_reward": 1.0, + "step": 846, + "think_completion_length": 6.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.76041793823242, + "epoch": 2.860033726812816, + "grad_norm": 13.577778955368622, + "kl": 0.609375, + "learning_rate": 7.615427927927928e-07, + "loss": 0.0006, + "reward": 3.3712258338928223, + "reward_std": 0.08537932112812996, + "rewards/final_reward": 1.7251943154869958, + "rewards/mask_iou_reward": 0.8625971577434979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3712258338928223, + "rewards/thk_ans_format_reward": 1.0, + "step": 847, + "think_completion_length": 6.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.09375, + "epoch": 2.863406408094435, + "grad_norm": 15.904610578246603, + "kl": 0.595703125, + "learning_rate": 7.612612612612613e-07, + "loss": 0.0006, + "reward": 3.2905749082565308, + "reward_std": 0.13749309442937374, + "rewards/final_reward": 1.7126632806370585, + "rewards/mask_iou_reward": 0.8563316403185293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2905749678611755, + "rewards/thk_ans_format_reward": 1.0, + "step": 848, + "think_completion_length": 5.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.93750381469727, + "epoch": 2.866779089376054, + "grad_norm": 44.65435750438078, + "kl": 0.658203125, + "learning_rate": 7.609797297297296e-07, + "loss": 0.0007, + "reward": 3.3973218202590942, + "reward_std": 0.16537553817033768, + "rewards/final_reward": 1.4147304409035975, + "rewards/mask_iou_reward": 0.7073652204517987, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.4494050741195679, + "rewards/thk_ans_format_reward": 1.0, + "step": 849, + "think_completion_length": 5.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.84375, + "epoch": 2.8701517706576727, + "grad_norm": 114.68917591790206, + "kl": 0.619140625, + "learning_rate": 7.606981981981981e-07, + "loss": 0.0006, + "reward": 3.299852728843689, + "reward_std": 0.10934684053063393, + "rewards/final_reward": 0.6728932611020485, + "rewards/mask_iou_reward": 0.33644663055102425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.299852728843689, + "rewards/thk_ans_format_reward": 1.0, + "step": 850, + "think_completion_length": 7.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.3854217529297, + "epoch": 2.873524451939292, + "grad_norm": 14.365891116065718, + "kl": 0.5625, + "learning_rate": 7.604166666666666e-07, + "loss": 0.0006, + "reward": 3.340060830116272, + "reward_std": 0.19183791242539883, + "rewards/final_reward": 0.27853339711707975, + "rewards/mask_iou_reward": 0.13926669855853988, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.36089426279068, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 851, + "think_completion_length": 5.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.73958587646484, + "epoch": 2.876897133220911, + "grad_norm": 84.95926295981053, + "kl": 0.75, + "learning_rate": 7.60135135135135e-07, + "loss": 0.0007, + "reward": 3.185040235519409, + "reward_std": 0.18357310444116592, + "rewards/final_reward": 0.8179576324073183, + "rewards/mask_iou_reward": 0.40897881620365917, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.18503999710083, + "rewards/thk_ans_format_reward": 1.0, + "step": 852, + "think_completion_length": 7.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.94792175292969, + "epoch": 2.8802698145025296, + "grad_norm": 14.3857169488127, + "kl": 0.607421875, + "learning_rate": 7.598536036036036e-07, + "loss": 0.0006, + "reward": 3.0738601684570312, + "reward_std": 0.1482534147799015, + "rewards/final_reward": 1.5809381835741352, + "rewards/mask_iou_reward": 0.7904690917870676, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0738602876663208, + "rewards/thk_ans_format_reward": 1.0, + "step": 853, + "think_completion_length": 5.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.34375, + "epoch": 2.8836424957841484, + "grad_norm": 15.143727058052493, + "kl": 0.572265625, + "learning_rate": 7.59572072072072e-07, + "loss": 0.0006, + "reward": 3.4103574752807617, + "reward_std": 0.18912208080291748, + "rewards/final_reward": 1.4438347686562816, + "rewards/mask_iou_reward": 0.7219173843281408, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4311907291412354, + "rewards/thk_ans_format_reward": 1.0, + "step": 854, + "think_completion_length": 5.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.71875381469727, + "epoch": 2.8870151770657673, + "grad_norm": 9.044759438493474, + "kl": 0.64453125, + "learning_rate": 7.592905405405405e-07, + "loss": 0.0006, + "reward": 3.445785164833069, + "reward_std": 0.11425403878092766, + "rewards/final_reward": 1.7823204157680697, + "rewards/mask_iou_reward": 0.8911602078840348, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4457851648330688, + "rewards/thk_ans_format_reward": 1.0, + "step": 855, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.75, + "epoch": 2.890387858347386, + "grad_norm": 6.976236686403784, + "kl": 0.576171875, + "learning_rate": 7.59009009009009e-07, + "loss": 0.0006, + "reward": 3.5354377031326294, + "reward_std": 0.0927988737821579, + "rewards/final_reward": 1.2084556142828462, + "rewards/mask_iou_reward": 0.6042278071414231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5354375839233398, + "rewards/thk_ans_format_reward": 1.0, + "step": 856, + "think_completion_length": 6.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.40625762939453, + "epoch": 2.893760539629005, + "grad_norm": 13.588204117509628, + "kl": 0.564453125, + "learning_rate": 7.587274774774774e-07, + "loss": 0.0006, + "reward": 3.5360511541366577, + "reward_std": 0.13357886672019958, + "rewards/final_reward": 1.4003553047999098, + "rewards/mask_iou_reward": 0.7001776523999549, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.536051332950592, + "rewards/thk_ans_format_reward": 1.0, + "step": 857, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.46875, + "epoch": 2.897133220910624, + "grad_norm": 7.591543471473635, + "kl": 0.544921875, + "learning_rate": 7.584459459459459e-07, + "loss": 0.0005, + "reward": 3.4365715980529785, + "reward_std": 0.02841498889029026, + "rewards/final_reward": 0.8789773549323615, + "rewards/mask_iou_reward": 0.43948867746618075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.436571717262268, + "rewards/thk_ans_format_reward": 1.0, + "step": 858, + "think_completion_length": 5.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.77083587646484, + "epoch": 2.9005059021922426, + "grad_norm": 69.82960550237168, + "kl": 0.638671875, + "learning_rate": 7.581644144144143e-07, + "loss": 0.0006, + "reward": 3.4785648584365845, + "reward_std": 0.03144015744328499, + "rewards/final_reward": 1.0896791036053899, + "rewards/mask_iou_reward": 0.5448395518026949, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4785647988319397, + "rewards/thk_ans_format_reward": 1.0, + "step": 859, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.38541793823242, + "epoch": 2.903878583473862, + "grad_norm": 41.63556782679187, + "kl": 0.546875, + "learning_rate": 7.578828828828828e-07, + "loss": 0.0005, + "reward": 3.352591037750244, + "reward_std": 0.19262491166591644, + "rewards/final_reward": 1.6292080190650648, + "rewards/mask_iou_reward": 0.8146040095325324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3525908589363098, + "rewards/thk_ans_format_reward": 1.0, + "step": 860, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.44792175292969, + "epoch": 2.9072512647554807, + "grad_norm": 17.09162298189723, + "kl": 0.58203125, + "learning_rate": 7.576013513513513e-07, + "loss": 0.0006, + "reward": 3.504484534263611, + "reward_std": 0.04877541400492191, + "rewards/final_reward": 1.386411616863553, + "rewards/mask_iou_reward": 0.6932058084317765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.50448477268219, + "rewards/thk_ans_format_reward": 1.0, + "step": 861, + "think_completion_length": 5.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.34375381469727, + "epoch": 2.9106239460370995, + "grad_norm": 17.334156061583297, + "kl": 0.732421875, + "learning_rate": 7.573198198198197e-07, + "loss": 0.0007, + "reward": 3.111849308013916, + "reward_std": 0.15942617878317833, + "rewards/final_reward": 0.8906268643760444, + "rewards/mask_iou_reward": 0.4453134321880222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1222659349441528, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 862, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.60416793823242, + "epoch": 2.9139966273187183, + "grad_norm": 9.828375359131886, + "kl": 0.611328125, + "learning_rate": 7.570382882882883e-07, + "loss": 0.0006, + "reward": 3.6048978567123413, + "reward_std": 0.08647706173360348, + "rewards/final_reward": 1.4054294071757991, + "rewards/mask_iou_reward": 0.7027147035878996, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6048980355262756, + "rewards/thk_ans_format_reward": 1.0, + "step": 863, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.61458587646484, + "epoch": 2.917369308600337, + "grad_norm": 14.152492330600408, + "kl": 0.8125, + "learning_rate": 7.567567567567568e-07, + "loss": 0.0008, + "reward": 3.552889108657837, + "reward_std": 0.06724633555859327, + "rewards/final_reward": 1.411230216936757, + "rewards/mask_iou_reward": 0.7056151084683785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5528889298439026, + "rewards/thk_ans_format_reward": 1.0, + "step": 864, + "think_completion_length": 7.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.81250381469727, + "epoch": 2.920741989881956, + "grad_norm": 12.085736518884486, + "kl": 0.576171875, + "learning_rate": 7.564752252252252e-07, + "loss": 0.0006, + "reward": 3.794761896133423, + "reward_std": 0.0494751688092947, + "rewards/final_reward": 1.8719720376871902, + "rewards/mask_iou_reward": 0.9359860188435951, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.794761836528778, + "rewards/thk_ans_format_reward": 1.0, + "step": 865, + "think_completion_length": 5.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.80208587646484, + "epoch": 2.924114671163575, + "grad_norm": 10.394094315559089, + "kl": 0.591796875, + "learning_rate": 7.561936936936937e-07, + "loss": 0.0006, + "reward": 3.1822298765182495, + "reward_std": 0.09092389792203903, + "rewards/final_reward": 0.8456114424522805, + "rewards/mask_iou_reward": 0.42280572122614024, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1822299361228943, + "rewards/thk_ans_format_reward": 1.0, + "step": 866, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.58333587646484, + "epoch": 2.927487352445194, + "grad_norm": 10.304628611502135, + "kl": 0.728515625, + "learning_rate": 7.559121621621621e-07, + "loss": 0.0007, + "reward": 3.419139266014099, + "reward_std": 0.14908288419246674, + "rewards/final_reward": 0.6543206464945732, + "rewards/mask_iou_reward": 0.3271603232472866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4191393852233887, + "rewards/thk_ans_format_reward": 1.0, + "step": 867, + "think_completion_length": 5.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.71875762939453, + "epoch": 2.930860033726813, + "grad_norm": 13.912370115835516, + "kl": 0.58203125, + "learning_rate": 7.556306306306306e-07, + "loss": 0.0006, + "reward": 3.4562323093414307, + "reward_std": 0.11177996918559074, + "rewards/final_reward": 1.6936401492810278, + "rewards/mask_iou_reward": 0.8468200746405139, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4562321901321411, + "rewards/thk_ans_format_reward": 1.0, + "step": 868, + "think_completion_length": 5.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.64583969116211, + "epoch": 2.9342327150084317, + "grad_norm": 5.671121715929217, + "kl": 0.58203125, + "learning_rate": 7.553490990990991e-07, + "loss": 0.0006, + "reward": 3.084256410598755, + "reward_std": 0.11680587381124496, + "rewards/final_reward": 1.2962688239052136, + "rewards/mask_iou_reward": 0.6481344119526068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0842564702033997, + "rewards/thk_ans_format_reward": 1.0, + "step": 869, + "think_completion_length": 5.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.05208587646484, + "epoch": 2.9376053962900506, + "grad_norm": 13.12728442117305, + "kl": 0.60546875, + "learning_rate": 7.550675675675675e-07, + "loss": 0.0006, + "reward": 3.352104663848877, + "reward_std": 0.062287621200084686, + "rewards/final_reward": 1.205827396942925, + "rewards/mask_iou_reward": 0.6029136984714625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.352104663848877, + "rewards/thk_ans_format_reward": 1.0, + "step": 870, + "think_completion_length": 5.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.44791793823242, + "epoch": 2.9409780775716694, + "grad_norm": 22.34161037724869, + "kl": 0.591796875, + "learning_rate": 7.54786036036036e-07, + "loss": 0.0006, + "reward": 3.1144481897354126, + "reward_std": 0.09380730241537094, + "rewards/final_reward": 0.7915381221938045, + "rewards/mask_iou_reward": 0.39576906109690224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1144481599330902, + "rewards/thk_ans_format_reward": 1.0, + "step": 871, + "think_completion_length": 5.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.97916793823242, + "epoch": 2.9443507588532882, + "grad_norm": 13.595642606692284, + "kl": 0.607421875, + "learning_rate": 7.545045045045044e-07, + "loss": 0.0006, + "reward": 3.6105018854141235, + "reward_std": 0.15074967592954636, + "rewards/final_reward": 1.491989261637721, + "rewards/mask_iou_reward": 0.7459946308188605, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6105021238327026, + "rewards/thk_ans_format_reward": 1.0, + "step": 872, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.01041793823242, + "epoch": 2.947723440134907, + "grad_norm": 15.298145363684752, + "kl": 0.62109375, + "learning_rate": 7.54222972972973e-07, + "loss": 0.0006, + "reward": 3.4203200340270996, + "reward_std": 0.11649902537465096, + "rewards/final_reward": 1.252674476480225, + "rewards/mask_iou_reward": 0.6263372382401124, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.42031991481781, + "rewards/thk_ans_format_reward": 1.0, + "step": 873, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.15625762939453, + "epoch": 2.9510961214165263, + "grad_norm": 6.164036490191176, + "kl": 0.76171875, + "learning_rate": 7.539414414414415e-07, + "loss": 0.0008, + "reward": 3.377174139022827, + "reward_std": 0.25197170674800873, + "rewards/final_reward": 0.8577654543055984, + "rewards/mask_iou_reward": 0.4288827271527992, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.387590765953064, + "rewards/thk_ans_format_reward": 1.0, + "step": 874, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.56250762939453, + "epoch": 2.954468802698145, + "grad_norm": 7.4405944010792116, + "kl": 0.56640625, + "learning_rate": 7.536599099099099e-07, + "loss": 0.0006, + "reward": 3.17443311214447, + "reward_std": 0.17868845723569393, + "rewards/final_reward": 0.5799214539574395, + "rewards/mask_iou_reward": 0.2899607269787198, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1744331121444702, + "rewards/thk_ans_format_reward": 1.0, + "step": 875, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.69792175292969, + "epoch": 2.957841483979764, + "grad_norm": 18.856636432233937, + "kl": 0.55859375, + "learning_rate": 7.533783783783784e-07, + "loss": 0.0006, + "reward": 3.4503493309020996, + "reward_std": 0.14452160894870758, + "rewards/final_reward": 1.6115709043227955, + "rewards/mask_iou_reward": 0.8057854521613977, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4503492712974548, + "rewards/thk_ans_format_reward": 1.0, + "step": 876, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.64583587646484, + "epoch": 2.961214165261383, + "grad_norm": 6.072809246494209, + "kl": 0.5546875, + "learning_rate": 7.530968468468468e-07, + "loss": 0.0006, + "reward": 3.4942249059677124, + "reward_std": 0.3860067129135132, + "rewards/final_reward": 1.5278986439451647, + "rewards/mask_iou_reward": 0.7639493219725824, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.5254749059677124, + "rewards/thk_ans_format_reward": 1.0, + "step": 877, + "think_completion_length": 5.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.39583587646484, + "epoch": 2.9645868465430016, + "grad_norm": 33.280175809232574, + "kl": 0.541015625, + "learning_rate": 7.528153153153153e-07, + "loss": 0.0006, + "reward": 3.324143171310425, + "reward_std": 0.04724482260644436, + "rewards/final_reward": 0.8673007401404467, + "rewards/mask_iou_reward": 0.43365037007022333, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.324143260717392, + "rewards/thk_ans_format_reward": 1.0, + "step": 878, + "think_completion_length": 6.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.7291717529297, + "epoch": 2.9679595278246205, + "grad_norm": 13.072328707508886, + "kl": 0.5078125, + "learning_rate": 7.525337837837838e-07, + "loss": 0.0005, + "reward": 3.472671866416931, + "reward_std": 0.11400551535189152, + "rewards/final_reward": 1.63759505136584, + "rewards/mask_iou_reward": 0.81879752568292, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4726718068122864, + "rewards/thk_ans_format_reward": 1.0, + "step": 879, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.64583587646484, + "epoch": 2.9713322091062393, + "grad_norm": 17.45267800201795, + "kl": 0.57421875, + "learning_rate": 7.522522522522522e-07, + "loss": 0.0006, + "reward": 3.4755067825317383, + "reward_std": 0.14281757548451424, + "rewards/final_reward": 1.3163042838611094, + "rewards/mask_iou_reward": 0.6581521419305547, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.475506603717804, + "rewards/thk_ans_format_reward": 1.0, + "step": 880, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.42708587646484, + "epoch": 2.9747048903878586, + "grad_norm": 21.418213924625412, + "kl": 0.708984375, + "learning_rate": 7.519707207207207e-07, + "loss": 0.0007, + "reward": 3.1879799365997314, + "reward_std": 0.05884386505931616, + "rewards/final_reward": 1.182249554893902, + "rewards/mask_iou_reward": 0.591124777446951, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1879799962043762, + "rewards/thk_ans_format_reward": 1.0, + "step": 881, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.31250381469727, + "epoch": 2.9780775716694774, + "grad_norm": 20.640043092169407, + "kl": 0.578125, + "learning_rate": 7.516891891891891e-07, + "loss": 0.0006, + "reward": 3.0795818567276, + "reward_std": 0.10945974290370941, + "rewards/final_reward": 0.5469563823219387, + "rewards/mask_iou_reward": 0.27347819116096933, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0795818269252777, + "rewards/thk_ans_format_reward": 1.0, + "step": 882, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.18750381469727, + "epoch": 2.9814502529510962, + "grad_norm": 11.876012262077612, + "kl": 0.591796875, + "learning_rate": 7.514076576576577e-07, + "loss": 0.0006, + "reward": 3.385973572731018, + "reward_std": 0.05023301625624299, + "rewards/final_reward": 1.2205781767959176, + "rewards/mask_iou_reward": 0.6102890883979588, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3859735131263733, + "rewards/thk_ans_format_reward": 1.0, + "step": 883, + "think_completion_length": 5.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.19791793823242, + "epoch": 2.984822934232715, + "grad_norm": 14.764258051685124, + "kl": 0.544921875, + "learning_rate": 7.511261261261262e-07, + "loss": 0.0006, + "reward": 3.135034918785095, + "reward_std": 0.18100818619132042, + "rewards/final_reward": 1.6384016507086034, + "rewards/mask_iou_reward": 0.8192008253543017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1350347995758057, + "rewards/thk_ans_format_reward": 1.0, + "step": 884, + "think_completion_length": 6.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.32291793823242, + "epoch": 2.988195615514334, + "grad_norm": 18.135963045966133, + "kl": 0.59765625, + "learning_rate": 7.508445945945946e-07, + "loss": 0.0006, + "reward": 3.519152283668518, + "reward_std": 0.13311653956770897, + "rewards/final_reward": 1.8226950428889377, + "rewards/mask_iou_reward": 0.9113475214444688, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5191519260406494, + "rewards/thk_ans_format_reward": 1.0, + "step": 885, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.25, + "epoch": 2.9915682967959527, + "grad_norm": 12.289949298946091, + "kl": 0.5625, + "learning_rate": 7.505630630630631e-07, + "loss": 0.0006, + "reward": 3.38117778301239, + "reward_std": 0.05419469904154539, + "rewards/final_reward": 1.1812705416065237, + "rewards/mask_iou_reward": 0.5906352708032618, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3811779618263245, + "rewards/thk_ans_format_reward": 1.0, + "step": 886, + "think_completion_length": 5.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.62500381469727, + "epoch": 2.9949409780775715, + "grad_norm": 14.89633635692612, + "kl": 0.57421875, + "learning_rate": 7.502815315315316e-07, + "loss": 0.0006, + "reward": 3.042057514190674, + "reward_std": 0.16031931340694427, + "rewards/final_reward": 1.0657722241196894, + "rewards/mask_iou_reward": 0.5328861120598447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.042057454586029, + "rewards/thk_ans_format_reward": 1.0, + "step": 887, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.10526657104492, + "epoch": 2.998313659359191, + "grad_norm": 16.25986106299666, + "kl": 2.525390625, + "learning_rate": 7.5e-07, + "loss": 0.0025, + "reward": 3.4078052043914795, + "reward_std": 0.08364403434097767, + "rewards/final_reward": 1.5097456033230692, + "rewards/mask_iou_reward": 0.7548728016615346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4078055024147034, + "rewards/thk_ans_format_reward": 1.0, + "step": 888, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.72917175292969, + "epoch": 3.003372681281619, + "grad_norm": 12.515595643382929, + "kl": 0.56640625, + "learning_rate": 7.497184684684684e-07, + "loss": 0.0006, + "reward": 3.089647054672241, + "reward_std": 0.08703098073601723, + "rewards/final_reward": 1.5181027098126436, + "rewards/mask_iou_reward": 0.7590513549063218, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0896472930908203, + "rewards/thk_ans_format_reward": 1.0, + "step": 889, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.18750381469727, + "epoch": 3.0067453625632377, + "grad_norm": 8.349129573087078, + "kl": 0.63671875, + "learning_rate": 7.494369369369368e-07, + "loss": 0.0006, + "reward": 3.489098906517029, + "reward_std": 0.047829316928982735, + "rewards/final_reward": 1.3113581184064553, + "rewards/mask_iou_reward": 0.6556790592032277, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4890989661216736, + "rewards/thk_ans_format_reward": 1.0, + "step": 890, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.59375, + "epoch": 3.0101180438448565, + "grad_norm": 13.899706919226007, + "kl": 0.58984375, + "learning_rate": 7.491554054054053e-07, + "loss": 0.0006, + "reward": 3.176019310951233, + "reward_std": 0.07747252658009529, + "rewards/final_reward": 0.5975464560543442, + "rewards/mask_iou_reward": 0.2987732280271721, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1760192811489105, + "rewards/thk_ans_format_reward": 1.0, + "step": 891, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.66667175292969, + "epoch": 3.0134907251264758, + "grad_norm": 11.626946310302078, + "kl": 0.595703125, + "learning_rate": 7.488738738738738e-07, + "loss": 0.0006, + "reward": 3.317869186401367, + "reward_std": 0.08332300186157227, + "rewards/final_reward": 1.464982306688062, + "rewards/mask_iou_reward": 0.732491153344031, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3178690671920776, + "rewards/thk_ans_format_reward": 1.0, + "step": 892, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.82292175292969, + "epoch": 3.0168634064080946, + "grad_norm": 15.906736400086396, + "kl": 0.70703125, + "learning_rate": 7.485923423423422e-07, + "loss": 0.0007, + "reward": 3.5675476789474487, + "reward_std": 0.07752594165503979, + "rewards/final_reward": 1.9184772041573201, + "rewards/mask_iou_reward": 0.9592386020786601, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5675475597381592, + "rewards/thk_ans_format_reward": 1.0, + "step": 893, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.07292175292969, + "epoch": 3.0202360876897134, + "grad_norm": 7.317486836759224, + "kl": 0.6796875, + "learning_rate": 7.483108108108108e-07, + "loss": 0.0007, + "reward": 2.8391889333724976, + "reward_std": 0.12115252763032913, + "rewards/final_reward": 1.2024703756275028, + "rewards/mask_iou_reward": 0.6012351878137514, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8391889035701752, + "rewards/thk_ans_format_reward": 1.0, + "step": 894, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.17708587646484, + "epoch": 3.0236087689713322, + "grad_norm": 7.041223797807325, + "kl": 0.6171875, + "learning_rate": 7.480292792792792e-07, + "loss": 0.0006, + "reward": 3.25685453414917, + "reward_std": 0.05167396366596222, + "rewards/final_reward": 1.5520342513306697, + "rewards/mask_iou_reward": 0.7760171256653349, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2568546533584595, + "rewards/thk_ans_format_reward": 1.0, + "step": 895, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.96875381469727, + "epoch": 3.026981450252951, + "grad_norm": 21.643386059658464, + "kl": 0.64453125, + "learning_rate": 7.477477477477477e-07, + "loss": 0.0007, + "reward": 3.4809720516204834, + "reward_std": 0.16744033992290497, + "rewards/final_reward": 1.7740240475182896, + "rewards/mask_iou_reward": 0.8870120237591448, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4809719920158386, + "rewards/thk_ans_format_reward": 1.0, + "step": 896, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.375, + "epoch": 3.03035413153457, + "grad_norm": 12.520660533843897, + "kl": 0.5859375, + "learning_rate": 7.474662162162162e-07, + "loss": 0.0006, + "reward": 3.8004626035690308, + "reward_std": 0.04519801028072834, + "rewards/final_reward": 1.9572329694899626, + "rewards/mask_iou_reward": 0.9786164847449813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8004624843597412, + "rewards/thk_ans_format_reward": 1.0, + "step": 897, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.80208969116211, + "epoch": 3.0337268128161887, + "grad_norm": 8.19496444222374, + "kl": 0.642578125, + "learning_rate": 7.471846846846846e-07, + "loss": 0.0006, + "reward": 3.576119303703308, + "reward_std": 0.09716996923089027, + "rewards/final_reward": 1.809313079459582, + "rewards/mask_iou_reward": 0.904656539729791, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5761194229125977, + "rewards/thk_ans_format_reward": 1.0, + "step": 898, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.42708587646484, + "epoch": 3.0370994940978076, + "grad_norm": 15.121604716993511, + "kl": 0.646484375, + "learning_rate": 7.469031531531531e-07, + "loss": 0.0007, + "reward": 3.381696939468384, + "reward_std": 0.03125000186264515, + "rewards/final_reward": 1.098972567366859, + "rewards/mask_iou_reward": 0.5494862836834296, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.381696879863739, + "rewards/thk_ans_format_reward": 1.0, + "step": 899, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.39583587646484, + "epoch": 3.040472175379427, + "grad_norm": 16.861506466251836, + "kl": 0.6953125, + "learning_rate": 7.466216216216215e-07, + "loss": 0.0007, + "reward": 3.390763521194458, + "reward_std": 0.14983459934592247, + "rewards/final_reward": 0.7988987902225801, + "rewards/mask_iou_reward": 0.3994493951112901, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3907636404037476, + "rewards/thk_ans_format_reward": 1.0, + "step": 900, + "think_completion_length": 6.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.01041793823242, + "epoch": 3.0438448566610457, + "grad_norm": 7.578870543907528, + "kl": 2.359375, + "learning_rate": 7.4634009009009e-07, + "loss": 0.0025, + "reward": 3.7292349338531494, + "reward_std": 0.05419635772705078, + "rewards/final_reward": 1.5353630167973926, + "rewards/mask_iou_reward": 0.7676815083986963, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7292349934577942, + "rewards/thk_ans_format_reward": 1.0, + "step": 901, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.26041793823242, + "epoch": 3.0472175379426645, + "grad_norm": 7.03828666042413, + "kl": 0.6171875, + "learning_rate": 7.460585585585585e-07, + "loss": 0.0006, + "reward": 3.1864256858825684, + "reward_std": 0.11836714297533035, + "rewards/final_reward": 1.870425437434307, + "rewards/mask_iou_reward": 0.9352127187171535, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1864253878593445, + "rewards/thk_ans_format_reward": 1.0, + "step": 902, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.31250381469727, + "epoch": 3.0505902192242833, + "grad_norm": 78.39294728322437, + "kl": 0.595703125, + "learning_rate": 7.457770270270269e-07, + "loss": 0.0006, + "reward": 3.6524301767349243, + "reward_std": 0.05959741398692131, + "rewards/final_reward": 1.6786844417748537, + "rewards/mask_iou_reward": 0.8393422208874268, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6524302959442139, + "rewards/thk_ans_format_reward": 1.0, + "step": 903, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.60417175292969, + "epoch": 3.053962900505902, + "grad_norm": 20.847676302517282, + "kl": 0.6171875, + "learning_rate": 7.454954954954955e-07, + "loss": 0.0006, + "reward": 3.5041786432266235, + "reward_std": 0.02324836002662778, + "rewards/final_reward": 1.0827496827285863, + "rewards/mask_iou_reward": 0.5413748413642931, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5041785836219788, + "rewards/thk_ans_format_reward": 1.0, + "step": 904, + "think_completion_length": 6.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.53125762939453, + "epoch": 3.057335581787521, + "grad_norm": 20.33434168015281, + "kl": 1.6171875, + "learning_rate": 7.45213963963964e-07, + "loss": 0.0016, + "reward": 3.3115906715393066, + "reward_std": 0.1272381842136383, + "rewards/final_reward": 0.7400105838265411, + "rewards/mask_iou_reward": 0.37000529191327053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3115903735160828, + "rewards/thk_ans_format_reward": 1.0, + "step": 905, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.5, + "epoch": 3.06070826306914, + "grad_norm": 11.339915806504091, + "kl": 0.60546875, + "learning_rate": 7.449324324324324e-07, + "loss": 0.0006, + "reward": 3.45768141746521, + "reward_std": 0.03249655629042536, + "rewards/final_reward": 1.942120311669283, + "rewards/mask_iou_reward": 0.9710601558346414, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4576812982559204, + "rewards/thk_ans_format_reward": 1.0, + "step": 906, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.10416793823242, + "epoch": 3.064080944350759, + "grad_norm": 126.51719502876821, + "kl": 0.64453125, + "learning_rate": 7.446509009009009e-07, + "loss": 0.0006, + "reward": 3.493595004081726, + "reward_std": 0.14895956590771675, + "rewards/final_reward": 1.4613195587593473, + "rewards/mask_iou_reward": 0.7306597793796736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4935947060585022, + "rewards/thk_ans_format_reward": 1.0, + "step": 907, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.86458587646484, + "epoch": 3.067453625632378, + "grad_norm": 10.176054229968825, + "kl": 0.671875, + "learning_rate": 7.443693693693693e-07, + "loss": 0.0007, + "reward": 3.5275977849960327, + "reward_std": 0.06466570496559143, + "rewards/final_reward": 1.871962030287786, + "rewards/mask_iou_reward": 0.935981015143893, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.527597725391388, + "rewards/thk_ans_format_reward": 1.0, + "step": 908, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.18750762939453, + "epoch": 3.0708263069139967, + "grad_norm": 139.48223009045432, + "kl": 0.55859375, + "learning_rate": 7.440878378378378e-07, + "loss": 0.0006, + "reward": 3.551697611808777, + "reward_std": 0.1385558396577835, + "rewards/final_reward": 1.7636108345880088, + "rewards/mask_iou_reward": 0.8818054172940044, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.551697313785553, + "rewards/thk_ans_format_reward": 1.0, + "step": 909, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.07291793823242, + "epoch": 3.0741989881956155, + "grad_norm": 59.71015922108442, + "kl": 0.634765625, + "learning_rate": 7.438063063063063e-07, + "loss": 0.0007, + "reward": 3.5257649421691895, + "reward_std": 0.04206752963364124, + "rewards/final_reward": 1.8485441979398265, + "rewards/mask_iou_reward": 0.9242720989699132, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5257649421691895, + "rewards/thk_ans_format_reward": 1.0, + "step": 910, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.09375, + "epoch": 3.0775716694772344, + "grad_norm": 9.506682856346176, + "kl": 0.57421875, + "learning_rate": 7.435247747747747e-07, + "loss": 0.0006, + "reward": 3.1859865188598633, + "reward_std": 0.20397471636533737, + "rewards/final_reward": 1.104930686579807, + "rewards/mask_iou_reward": 0.5524653432899035, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.185986340045929, + "rewards/thk_ans_format_reward": 1.0, + "step": 911, + "think_completion_length": 6.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.58333587646484, + "epoch": 3.080944350758853, + "grad_norm": 92.45653407457144, + "kl": 0.52734375, + "learning_rate": 7.432432432432432e-07, + "loss": 0.0005, + "reward": 3.264729380607605, + "reward_std": 0.11243878304958344, + "rewards/final_reward": 1.5774552687450192, + "rewards/mask_iou_reward": 0.7887276343725096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2647295594215393, + "rewards/thk_ans_format_reward": 1.0, + "step": 912, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.96875381469727, + "epoch": 3.084317032040472, + "grad_norm": 166.42384825356794, + "kl": 0.62890625, + "learning_rate": 7.429617117117116e-07, + "loss": 0.0006, + "reward": 3.5839874744415283, + "reward_std": 0.08542875573039055, + "rewards/final_reward": 1.7537484169410895, + "rewards/mask_iou_reward": 0.8768742084705448, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5839874744415283, + "rewards/thk_ans_format_reward": 1.0, + "step": 913, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5416717529297, + "epoch": 3.087689713322091, + "grad_norm": 13.859197836267999, + "kl": 0.689453125, + "learning_rate": 7.426801801801802e-07, + "loss": 0.0007, + "reward": 3.61310076713562, + "reward_std": 0.09716634452342987, + "rewards/final_reward": 1.817545375249206, + "rewards/mask_iou_reward": 0.908772687624603, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6131007075309753, + "rewards/thk_ans_format_reward": 1.0, + "step": 914, + "think_completion_length": 6.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.9375, + "epoch": 3.09106239460371, + "grad_norm": 24.634255031261517, + "kl": 0.677734375, + "learning_rate": 7.423986486486487e-07, + "loss": 0.0007, + "reward": 3.1513859033584595, + "reward_std": 0.07639642804861069, + "rewards/final_reward": 1.7290901915677561, + "rewards/mask_iou_reward": 0.8645450957838781, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1513857245445251, + "rewards/thk_ans_format_reward": 1.0, + "step": 915, + "think_completion_length": 6.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.01041793823242, + "epoch": 3.094435075885329, + "grad_norm": 36.117144199009914, + "kl": 0.626953125, + "learning_rate": 7.421171171171171e-07, + "loss": 0.0006, + "reward": 3.2969553470611572, + "reward_std": 0.027974323369562626, + "rewards/final_reward": 1.5465697682341046, + "rewards/mask_iou_reward": 0.7732848841170523, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.296955168247223, + "rewards/thk_ans_format_reward": 1.0, + "step": 916, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.01041793823242, + "epoch": 3.097807757166948, + "grad_norm": 8.421180282732553, + "kl": 0.630859375, + "learning_rate": 7.418355855855856e-07, + "loss": 0.0006, + "reward": 3.229664444923401, + "reward_std": 0.10771491751074791, + "rewards/final_reward": 1.196235040485815, + "rewards/mask_iou_reward": 0.5981175202429075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2296642065048218, + "rewards/thk_ans_format_reward": 1.0, + "step": 917, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.95833587646484, + "epoch": 3.1011804384485666, + "grad_norm": 7.4188141354688275, + "kl": 0.546875, + "learning_rate": 7.41554054054054e-07, + "loss": 0.0006, + "reward": 3.500945806503296, + "reward_std": 0.0665590912103653, + "rewards/final_reward": 1.8405408670810295, + "rewards/mask_iou_reward": 0.9202704335405147, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5009459853172302, + "rewards/thk_ans_format_reward": 1.0, + "step": 918, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.75000381469727, + "epoch": 3.1045531197301854, + "grad_norm": 55.03653096258315, + "kl": 0.5859375, + "learning_rate": 7.412725225225225e-07, + "loss": 0.0006, + "reward": 3.723360061645508, + "reward_std": 0.0918300710618496, + "rewards/final_reward": 1.5982669399787868, + "rewards/mask_iou_reward": 0.7991334699893934, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7233601808547974, + "rewards/thk_ans_format_reward": 1.0, + "step": 919, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.71875, + "epoch": 3.1079258010118043, + "grad_norm": 12.853547166556982, + "kl": 0.5380859375, + "learning_rate": 7.40990990990991e-07, + "loss": 0.0005, + "reward": 3.3267405033111572, + "reward_std": 0.07052680477499962, + "rewards/final_reward": 1.8134163801056866, + "rewards/mask_iou_reward": 0.9067081900528433, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3267402052879333, + "rewards/thk_ans_format_reward": 1.0, + "step": 920, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.58333587646484, + "epoch": 3.111298482293423, + "grad_norm": 8.187308407871758, + "kl": 0.65234375, + "learning_rate": 7.407094594594594e-07, + "loss": 0.0007, + "reward": 3.596097469329834, + "reward_std": 0.06756597012281418, + "rewards/final_reward": 1.6684026304190667, + "rewards/mask_iou_reward": 0.8342013152095333, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.596097707748413, + "rewards/thk_ans_format_reward": 1.0, + "step": 921, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.12500381469727, + "epoch": 3.1146711635750424, + "grad_norm": 52.74813869912448, + "kl": 0.607421875, + "learning_rate": 7.404279279279279e-07, + "loss": 0.0006, + "reward": 3.3573185205459595, + "reward_std": 0.09697642922401428, + "rewards/final_reward": 1.0614244569828557, + "rewards/mask_iou_reward": 0.5307122284914279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3573182821273804, + "rewards/thk_ans_format_reward": 1.0, + "step": 922, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.90625381469727, + "epoch": 3.118043844856661, + "grad_norm": 8.819321791565677, + "kl": 0.71875, + "learning_rate": 7.401463963963964e-07, + "loss": 0.0007, + "reward": 3.456843376159668, + "reward_std": 0.17637907341122627, + "rewards/final_reward": 1.6175144350984227, + "rewards/mask_iou_reward": 0.8087572175492114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4568431377410889, + "rewards/thk_ans_format_reward": 1.0, + "step": 923, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.38541793823242, + "epoch": 3.12141652613828, + "grad_norm": 91.36087855306776, + "kl": 0.76171875, + "learning_rate": 7.398648648648649e-07, + "loss": 0.0008, + "reward": 3.151221990585327, + "reward_std": 0.07979770191013813, + "rewards/final_reward": 1.3401678123640988, + "rewards/mask_iou_reward": 0.6700839061820494, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1512219905853271, + "rewards/thk_ans_format_reward": 1.0, + "step": 924, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.90625381469727, + "epoch": 3.124789207419899, + "grad_norm": 19.614225208295004, + "kl": 0.595703125, + "learning_rate": 7.395833333333334e-07, + "loss": 0.0006, + "reward": 3.4868357181549072, + "reward_std": 0.13209467381238937, + "rewards/final_reward": 1.517233627020157, + "rewards/mask_iou_reward": 0.7586168135100785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4868356585502625, + "rewards/thk_ans_format_reward": 1.0, + "step": 925, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.69791793823242, + "epoch": 3.1281618887015177, + "grad_norm": 122.10948460669458, + "kl": 138.37109375, + "learning_rate": 7.393018018018018e-07, + "loss": 0.1394, + "reward": 3.5085551738739014, + "reward_std": 0.0744461640715599, + "rewards/final_reward": 1.2090095717241383, + "rewards/mask_iou_reward": 0.6045047858620691, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5085551142692566, + "rewards/thk_ans_format_reward": 1.0, + "step": 926, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.85417175292969, + "epoch": 3.1315345699831365, + "grad_norm": 12.186450381014977, + "kl": 0.599609375, + "learning_rate": 7.390202702702703e-07, + "loss": 0.0006, + "reward": 3.471701979637146, + "reward_std": 0.12539499625563622, + "rewards/final_reward": 1.2408520506021146, + "rewards/mask_iou_reward": 0.6204260253010573, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4717020392417908, + "rewards/thk_ans_format_reward": 1.0, + "step": 927, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.02083587646484, + "epoch": 3.1349072512647553, + "grad_norm": 8.400877279407904, + "kl": 0.5634765625, + "learning_rate": 7.387387387387387e-07, + "loss": 0.0006, + "reward": 3.4186493158340454, + "reward_std": 0.12011561915278435, + "rewards/final_reward": 1.0084027343217206, + "rewards/mask_iou_reward": 0.5042013671608603, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4290658235549927, + "rewards/thk_ans_format_reward": 1.0, + "step": 928, + "think_completion_length": 5.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.00000381469727, + "epoch": 3.138279932546374, + "grad_norm": 4.785202894611176, + "kl": 0.66796875, + "learning_rate": 7.384572072072071e-07, + "loss": 0.0007, + "reward": 3.502060651779175, + "reward_std": 0.03595947311259806, + "rewards/final_reward": 1.7552573719457105, + "rewards/mask_iou_reward": 0.8776286859728553, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.50206059217453, + "rewards/thk_ans_format_reward": 1.0, + "step": 929, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.46875, + "epoch": 3.1416526138279934, + "grad_norm": 48.073277858921884, + "kl": 0.75390625, + "learning_rate": 7.381756756756756e-07, + "loss": 0.0008, + "reward": 3.3581387996673584, + "reward_std": 0.08590967021882534, + "rewards/final_reward": 1.7207684166968091, + "rewards/mask_iou_reward": 0.8603842083484046, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3581388592720032, + "rewards/thk_ans_format_reward": 1.0, + "step": 930, + "think_completion_length": 5.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.79166793823242, + "epoch": 3.1450252951096123, + "grad_norm": 42.756840812915485, + "kl": 2.42578125, + "learning_rate": 7.37894144144144e-07, + "loss": 0.0024, + "reward": 3.279011845588684, + "reward_std": 0.08761481195688248, + "rewards/final_reward": 1.0889703858570359, + "rewards/mask_iou_reward": 0.5444851929285179, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2790116667747498, + "rewards/thk_ans_format_reward": 1.0, + "step": 931, + "think_completion_length": 5.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.40625, + "epoch": 3.148397976391231, + "grad_norm": 13.101321531658755, + "kl": 0.568359375, + "learning_rate": 7.376126126126125e-07, + "loss": 0.0006, + "reward": 3.520051956176758, + "reward_std": 0.10048893839120865, + "rewards/final_reward": 1.7624207074205627, + "rewards/mask_iou_reward": 0.8812103537102813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.520051896572113, + "rewards/thk_ans_format_reward": 1.0, + "step": 932, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.48958587646484, + "epoch": 3.15177065767285, + "grad_norm": 17.5324248573886, + "kl": 0.572265625, + "learning_rate": 7.37331081081081e-07, + "loss": 0.0006, + "reward": 3.2548365592956543, + "reward_std": 0.049117712303996086, + "rewards/final_reward": 0.8347675929608372, + "rewards/mask_iou_reward": 0.4173837964804186, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2548364996910095, + "rewards/thk_ans_format_reward": 1.0, + "step": 933, + "think_completion_length": 5.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.09375381469727, + "epoch": 3.1551433389544687, + "grad_norm": 8.456217280680285, + "kl": 0.533203125, + "learning_rate": 7.370495495495495e-07, + "loss": 0.0005, + "reward": 3.2601990699768066, + "reward_std": 0.11271973326802254, + "rewards/final_reward": 1.0942927702331247, + "rewards/mask_iou_reward": 0.5471463851165623, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2601988911628723, + "rewards/thk_ans_format_reward": 1.0, + "step": 934, + "think_completion_length": 5.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.32291793823242, + "epoch": 3.1585160202360876, + "grad_norm": 28.327635527597177, + "kl": 0.615234375, + "learning_rate": 7.36768018018018e-07, + "loss": 0.0006, + "reward": 3.3950055837631226, + "reward_std": 0.12466976046562195, + "rewards/final_reward": 1.5702693201072733, + "rewards/mask_iou_reward": 0.7851346600536366, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3950055241584778, + "rewards/thk_ans_format_reward": 1.0, + "step": 935, + "think_completion_length": 5.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.17708587646484, + "epoch": 3.1618887015177064, + "grad_norm": 24.013934474599623, + "kl": 0.615234375, + "learning_rate": 7.364864864864864e-07, + "loss": 0.0006, + "reward": 2.972390651702881, + "reward_std": 0.20534201711416245, + "rewards/final_reward": 1.6523960960794286, + "rewards/mask_iou_reward": 0.8261980480397143, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9723906219005585, + "rewards/thk_ans_format_reward": 1.0, + "step": 936, + "think_completion_length": 5.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.40625381469727, + "epoch": 3.1652613827993257, + "grad_norm": 16.595506940383846, + "kl": 0.591796875, + "learning_rate": 7.362049549549549e-07, + "loss": 0.0006, + "reward": 3.3171249628067017, + "reward_std": 0.18518901616334915, + "rewards/final_reward": 1.7000440197250972, + "rewards/mask_iou_reward": 0.8500220098625486, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3171249628067017, + "rewards/thk_ans_format_reward": 1.0, + "step": 937, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.3645896911621, + "epoch": 3.1686340640809445, + "grad_norm": 8.610812224568315, + "kl": 0.65625, + "learning_rate": 7.359234234234234e-07, + "loss": 0.0007, + "reward": 2.8874831199645996, + "reward_std": 0.19021157920360565, + "rewards/final_reward": 0.7664530144380096, + "rewards/mask_iou_reward": 0.3832265072190048, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8874831795692444, + "rewards/thk_ans_format_reward": 1.0, + "step": 938, + "think_completion_length": 5.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.20833587646484, + "epoch": 3.1720067453625633, + "grad_norm": 10.624550680340164, + "kl": 0.578125, + "learning_rate": 7.356418918918918e-07, + "loss": 0.0006, + "reward": 3.331384778022766, + "reward_std": 0.33483661711215973, + "rewards/final_reward": 1.2137129383296097, + "rewards/mask_iou_reward": 0.6068564691648048, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.4147179126739502, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 939, + "think_completion_length": 5.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.54166793823242, + "epoch": 3.175379426644182, + "grad_norm": 13.798134393053907, + "kl": 0.7109375, + "learning_rate": 7.353603603603603e-07, + "loss": 0.0007, + "reward": 3.5317646265029907, + "reward_std": 0.12036162614822388, + "rewards/final_reward": 1.7559949168114533, + "rewards/mask_iou_reward": 0.8779974584057266, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5317646265029907, + "rewards/thk_ans_format_reward": 1.0, + "step": 940, + "think_completion_length": 6.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.60417175292969, + "epoch": 3.178752107925801, + "grad_norm": 8.133255831518898, + "kl": 0.56640625, + "learning_rate": 7.350788288288288e-07, + "loss": 0.0006, + "reward": 3.4287021160125732, + "reward_std": 0.07130642794072628, + "rewards/final_reward": 1.857115805332103, + "rewards/mask_iou_reward": 0.9285579026660515, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.428701937198639, + "rewards/thk_ans_format_reward": 1.0, + "step": 941, + "think_completion_length": 6.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.46875381469727, + "epoch": 3.18212478920742, + "grad_norm": 227.71057599835822, + "kl": 0.5048828125, + "learning_rate": 7.347972972972972e-07, + "loss": 0.0005, + "reward": 3.421720027923584, + "reward_std": 0.10221107676625252, + "rewards/final_reward": 0.7738278771749434, + "rewards/mask_iou_reward": 0.3869139385874717, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4217200875282288, + "rewards/thk_ans_format_reward": 1.0, + "step": 942, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.59375, + "epoch": 3.1854974704890386, + "grad_norm": 15.557246933598993, + "kl": 0.56640625, + "learning_rate": 7.345157657657657e-07, + "loss": 0.0006, + "reward": 3.083019495010376, + "reward_std": 0.10839825868606567, + "rewards/final_reward": 0.886000136338485, + "rewards/mask_iou_reward": 0.4430000681692425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0830194354057312, + "rewards/thk_ans_format_reward": 1.0, + "step": 943, + "think_completion_length": 5.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.94792175292969, + "epoch": 3.1888701517706575, + "grad_norm": 8.349077581952226, + "kl": 0.587890625, + "learning_rate": 7.342342342342342e-07, + "loss": 0.0006, + "reward": 3.22943913936615, + "reward_std": 0.20535418391227722, + "rewards/final_reward": 1.4362136554059581, + "rewards/mask_iou_reward": 0.7181068277029791, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2294391095638275, + "rewards/thk_ans_format_reward": 1.0, + "step": 944, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.23958587646484, + "epoch": 3.1922428330522767, + "grad_norm": 21.27652996901679, + "kl": 0.564453125, + "learning_rate": 7.339527027027027e-07, + "loss": 0.0006, + "reward": 3.469966769218445, + "reward_std": 0.08743277750909328, + "rewards/final_reward": 1.046480487366158, + "rewards/mask_iou_reward": 0.523240243683079, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4699668884277344, + "rewards/thk_ans_format_reward": 1.0, + "step": 945, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.2916717529297, + "epoch": 3.1956155143338956, + "grad_norm": 15.628271383029498, + "kl": 0.568359375, + "learning_rate": 7.336711711711712e-07, + "loss": 0.0006, + "reward": 3.507090926170349, + "reward_std": 0.090285774320364, + "rewards/final_reward": 1.6270001024893102, + "rewards/mask_iou_reward": 0.8135000512446551, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5070908665657043, + "rewards/thk_ans_format_reward": 1.0, + "step": 946, + "think_completion_length": 5.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.66667175292969, + "epoch": 3.1989881956155144, + "grad_norm": 11.767457004148945, + "kl": 0.607421875, + "learning_rate": 7.333896396396396e-07, + "loss": 0.0006, + "reward": 3.5310587882995605, + "reward_std": 0.12183480244129896, + "rewards/final_reward": 1.6213213793067296, + "rewards/mask_iou_reward": 0.8106606896533648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.531058669090271, + "rewards/thk_ans_format_reward": 1.0, + "step": 947, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.82291793823242, + "epoch": 3.2023608768971332, + "grad_norm": 31.23548501325163, + "kl": 0.904296875, + "learning_rate": 7.331081081081081e-07, + "loss": 0.0009, + "reward": 3.2420685291290283, + "reward_std": 0.14567308127880096, + "rewards/final_reward": 1.282041882933168, + "rewards/mask_iou_reward": 0.641020941466584, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2420682311058044, + "rewards/thk_ans_format_reward": 1.0, + "step": 948, + "think_completion_length": 5.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.13541793823242, + "epoch": 3.205733558178752, + "grad_norm": 23.86354633860368, + "kl": 0.5703125, + "learning_rate": 7.328265765765765e-07, + "loss": 0.0006, + "reward": 3.7221736907958984, + "reward_std": 0.06484057754278183, + "rewards/final_reward": 1.404249844720991, + "rewards/mask_iou_reward": 0.7021249223604955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.722173810005188, + "rewards/thk_ans_format_reward": 1.0, + "step": 949, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.1145896911621, + "epoch": 3.209106239460371, + "grad_norm": 12.380395095394663, + "kl": 0.57421875, + "learning_rate": 7.32545045045045e-07, + "loss": 0.0006, + "reward": 3.3760533332824707, + "reward_std": 0.1979294866323471, + "rewards/final_reward": 1.7005252617653546, + "rewards/mask_iou_reward": 0.8502626308826773, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3968866467475891, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 950, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.21875, + "epoch": 3.2124789207419897, + "grad_norm": 8.26952523545033, + "kl": 0.576171875, + "learning_rate": 7.322635135135135e-07, + "loss": 0.0006, + "reward": 3.1699079275131226, + "reward_std": 0.07622763887047768, + "rewards/final_reward": 1.3101456169278447, + "rewards/mask_iou_reward": 0.6550728084639224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.169907808303833, + "rewards/thk_ans_format_reward": 1.0, + "step": 951, + "think_completion_length": 5.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.95833587646484, + "epoch": 3.2158516020236085, + "grad_norm": 9.638521530190893, + "kl": 0.4833984375, + "learning_rate": 7.319819819819819e-07, + "loss": 0.0005, + "reward": 3.4880365133285522, + "reward_std": 0.1652812361717224, + "rewards/final_reward": 1.8128116128729428, + "rewards/mask_iou_reward": 0.9064058064364714, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5088698863983154, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 952, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.25000762939453, + "epoch": 3.219224283305228, + "grad_norm": 8.410396514523777, + "kl": 0.5703125, + "learning_rate": 7.317004504504504e-07, + "loss": 0.0006, + "reward": 3.0509506464004517, + "reward_std": 0.08591302763670683, + "rewards/final_reward": 1.466382150338418, + "rewards/mask_iou_reward": 0.733191075169209, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0509507060050964, + "rewards/thk_ans_format_reward": 1.0, + "step": 953, + "think_completion_length": 5.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.68750381469727, + "epoch": 3.2225969645868466, + "grad_norm": 21.489909003804172, + "kl": 0.658203125, + "learning_rate": 7.31418918918919e-07, + "loss": 0.0007, + "reward": 3.298169493675232, + "reward_std": 0.1636265590786934, + "rewards/final_reward": 0.8282513182543959, + "rewards/mask_iou_reward": 0.41412565912719795, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3190029561519623, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 954, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.50000381469727, + "epoch": 3.2259696458684655, + "grad_norm": 5.470747692734715, + "kl": 0.517578125, + "learning_rate": 7.311373873873874e-07, + "loss": 0.0005, + "reward": 3.7121076583862305, + "reward_std": 0.09200547635555267, + "rewards/final_reward": 1.6347616027676106, + "rewards/mask_iou_reward": 0.8173808013838053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7121077179908752, + "rewards/thk_ans_format_reward": 1.0, + "step": 955, + "think_completion_length": 5.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.03125, + "epoch": 3.2293423271500843, + "grad_norm": 10.76266896794945, + "kl": 0.53515625, + "learning_rate": 7.308558558558559e-07, + "loss": 0.0005, + "reward": 3.611359715461731, + "reward_std": 0.2015197928994894, + "rewards/final_reward": 1.4773672950030936, + "rewards/mask_iou_reward": 0.7386836475015468, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6321927905082703, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 956, + "think_completion_length": 5.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.3854217529297, + "epoch": 3.232715008431703, + "grad_norm": 25.410441482857728, + "kl": 0.4921875, + "learning_rate": 7.305743243243243e-07, + "loss": 0.0005, + "reward": 3.1916489601135254, + "reward_std": 0.4065837115049362, + "rewards/final_reward": 1.5027361413481117, + "rewards/mask_iou_reward": 0.7513680706740559, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.2333155870437622, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 957, + "think_completion_length": 5.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.47916793823242, + "epoch": 3.236087689713322, + "grad_norm": 6.036564560288615, + "kl": 0.498046875, + "learning_rate": 7.302927927927928e-07, + "loss": 0.0005, + "reward": 3.3439793586730957, + "reward_std": 0.2639719545841217, + "rewards/final_reward": 1.7893254228602058, + "rewards/mask_iou_reward": 0.8946627114301029, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3648127317428589, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 958, + "think_completion_length": 6.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.30208587646484, + "epoch": 3.2394603709949408, + "grad_norm": 9.480511697981585, + "kl": 0.56640625, + "learning_rate": 7.300112612612613e-07, + "loss": 0.0006, + "reward": 3.5378222465515137, + "reward_std": 0.06817605718970299, + "rewards/final_reward": 1.8195322300541235, + "rewards/mask_iou_reward": 0.9097661150270617, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.537822186946869, + "rewards/thk_ans_format_reward": 1.0, + "step": 959, + "think_completion_length": 5.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.59375762939453, + "epoch": 3.24283305227656, + "grad_norm": 21.98608140049359, + "kl": 0.626953125, + "learning_rate": 7.297297297297297e-07, + "loss": 0.0006, + "reward": 3.4865334033966064, + "reward_std": 0.1169372908771038, + "rewards/final_reward": 0.8317292459713557, + "rewards/mask_iou_reward": 0.41586462298567783, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.486533284187317, + "rewards/thk_ans_format_reward": 1.0, + "step": 960, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.96875, + "epoch": 3.246205733558179, + "grad_norm": 22.098295435474142, + "kl": 0.626953125, + "learning_rate": 7.294481981981982e-07, + "loss": 0.0006, + "reward": 3.630619168281555, + "reward_std": 0.10067232511937618, + "rewards/final_reward": 1.8147314003863455, + "rewards/mask_iou_reward": 0.9073657001931728, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6306187510490417, + "rewards/thk_ans_format_reward": 1.0, + "step": 961, + "think_completion_length": 6.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.14583587646484, + "epoch": 3.2495784148397977, + "grad_norm": 56.289192010391524, + "kl": 0.5048828125, + "learning_rate": 7.291666666666666e-07, + "loss": 0.0005, + "reward": 3.6313655376434326, + "reward_std": 0.312417708337307, + "rewards/final_reward": 1.5549074475717943, + "rewards/mask_iou_reward": 0.7774537237858972, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.6938652992248535, + "rewards/thk_ans_format_reward": 0.96875, + "step": 962, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.40625381469727, + "epoch": 3.2529510961214165, + "grad_norm": 10.269311605816217, + "kl": 0.529296875, + "learning_rate": 7.288851351351351e-07, + "loss": 0.0005, + "reward": 3.4660444259643555, + "reward_std": 0.17312489449977875, + "rewards/final_reward": 1.5843367482599913, + "rewards/mask_iou_reward": 0.7921683741299956, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4764612317085266, + "rewards/thk_ans_format_reward": 1.0, + "step": 963, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.29166793823242, + "epoch": 3.2563237774030354, + "grad_norm": 9.743631276599844, + "kl": 0.619140625, + "learning_rate": 7.286036036036037e-07, + "loss": 0.0006, + "reward": 3.665140748023987, + "reward_std": 0.07369671761989594, + "rewards/final_reward": 1.884653197648869, + "rewards/mask_iou_reward": 0.9423265988244345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6651407480239868, + "rewards/thk_ans_format_reward": 1.0, + "step": 964, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.54166793823242, + "epoch": 3.259696458684654, + "grad_norm": 6.260349384279472, + "kl": 0.62109375, + "learning_rate": 7.283220720720721e-07, + "loss": 0.0006, + "reward": 3.545573592185974, + "reward_std": 0.0774066224694252, + "rewards/final_reward": 1.257693189516263, + "rewards/mask_iou_reward": 0.6288465947581315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5455738306045532, + "rewards/thk_ans_format_reward": 1.0, + "step": 965, + "think_completion_length": 6.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.67708587646484, + "epoch": 3.263069139966273, + "grad_norm": 10.940295050921081, + "kl": 0.560546875, + "learning_rate": 7.280405405405406e-07, + "loss": 0.0006, + "reward": 3.335355520248413, + "reward_std": 0.3451412171125412, + "rewards/final_reward": 1.4840171773306559, + "rewards/mask_iou_reward": 0.7420085886653279, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.3770219087600708, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 966, + "think_completion_length": 6.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.90625, + "epoch": 3.2664418212478923, + "grad_norm": 18.922494525837497, + "kl": 0.60546875, + "learning_rate": 7.27759009009009e-07, + "loss": 0.0006, + "reward": 3.4567893743515015, + "reward_std": 0.19448533281683922, + "rewards/final_reward": 1.2659538781637694, + "rewards/mask_iou_reward": 0.6329769390818847, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4672061204910278, + "rewards/thk_ans_format_reward": 1.0, + "step": 967, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.4791717529297, + "epoch": 3.269814502529511, + "grad_norm": 14.231900921928126, + "kl": 0.5859375, + "learning_rate": 7.274774774774774e-07, + "loss": 0.0006, + "reward": 3.3344634771347046, + "reward_std": 0.286946564912796, + "rewards/final_reward": 1.7930482971354982, + "rewards/mask_iou_reward": 0.8965241485677491, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.3761300444602966, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 968, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.1770896911621, + "epoch": 3.27318718381113, + "grad_norm": 6.237807951070738, + "kl": 0.56640625, + "learning_rate": 7.271959459459459e-07, + "loss": 0.0006, + "reward": 3.3685081005096436, + "reward_std": 0.1493084542453289, + "rewards/final_reward": 0.5514383027772634, + "rewards/mask_iou_reward": 0.2757191513886317, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3789244294166565, + "rewards/thk_ans_format_reward": 1.0, + "step": 969, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.6875, + "epoch": 3.2765598650927488, + "grad_norm": 37.178831343629014, + "kl": 0.607421875, + "learning_rate": 7.269144144144143e-07, + "loss": 0.0006, + "reward": 3.1892114877700806, + "reward_std": 0.07886414229869843, + "rewards/final_reward": 1.2344537926142465, + "rewards/mask_iou_reward": 0.6172268963071232, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1892114281654358, + "rewards/thk_ans_format_reward": 1.0, + "step": 970, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.20833587646484, + "epoch": 3.2799325463743676, + "grad_norm": 30.71626399986299, + "kl": 0.7421875, + "learning_rate": 7.266328828828828e-07, + "loss": 0.0007, + "reward": 3.5415124893188477, + "reward_std": 0.16267375275492668, + "rewards/final_reward": 1.4044140215816594, + "rewards/mask_iou_reward": 0.7022070107908297, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5415124297142029, + "rewards/thk_ans_format_reward": 1.0, + "step": 971, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.0104217529297, + "epoch": 3.2833052276559864, + "grad_norm": 42.712795178679805, + "kl": 0.572265625, + "learning_rate": 7.263513513513512e-07, + "loss": 0.0006, + "reward": 3.5569279193878174, + "reward_std": 0.16379550099372864, + "rewards/final_reward": 1.6601145498407242, + "rewards/mask_iou_reward": 0.8300572749203621, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5569279789924622, + "rewards/thk_ans_format_reward": 1.0, + "step": 972, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0104217529297, + "epoch": 3.2866779089376053, + "grad_norm": 19.441170666993003, + "kl": 0.5546875, + "learning_rate": 7.260698198198197e-07, + "loss": 0.0006, + "reward": 3.3968396186828613, + "reward_std": 0.23672273010015488, + "rewards/final_reward": 1.2417896327478177, + "rewards/mask_iou_reward": 0.6208948163739089, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.438506305217743, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 973, + "think_completion_length": 5.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.6979217529297, + "epoch": 3.2900505902192245, + "grad_norm": 13.760331090171656, + "kl": 0.5078125, + "learning_rate": 7.257882882882883e-07, + "loss": 0.0005, + "reward": 3.2817405462265015, + "reward_std": 0.2441714182496071, + "rewards/final_reward": 1.0549440408050355, + "rewards/mask_iou_reward": 0.5274720204025177, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2817405462265015, + "rewards/thk_ans_format_reward": 1.0, + "step": 974, + "think_completion_length": 6.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.55208587646484, + "epoch": 3.2934232715008434, + "grad_norm": 18.820609699132905, + "kl": 0.58203125, + "learning_rate": 7.255067567567567e-07, + "loss": 0.0006, + "reward": 3.1479690074920654, + "reward_std": 0.18045621365308762, + "rewards/final_reward": 1.0056742193732744, + "rewards/mask_iou_reward": 0.5028371096866372, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.147968977689743, + "rewards/thk_ans_format_reward": 1.0, + "step": 975, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.09375381469727, + "epoch": 3.296795952782462, + "grad_norm": 9.561561905297891, + "kl": 0.71875, + "learning_rate": 7.252252252252252e-07, + "loss": 0.0007, + "reward": 3.4808413982391357, + "reward_std": 0.045143453404307365, + "rewards/final_reward": 1.7793298953695453, + "rewards/mask_iou_reward": 0.8896649476847727, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4808412790298462, + "rewards/thk_ans_format_reward": 1.0, + "step": 976, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.4583396911621, + "epoch": 3.300168634064081, + "grad_norm": 25.168918736528095, + "kl": 0.5029296875, + "learning_rate": 7.249436936936937e-07, + "loss": 0.0005, + "reward": 3.6008447408676147, + "reward_std": 0.15011634677648544, + "rewards/final_reward": 1.8015549346745128, + "rewards/mask_iou_reward": 0.9007774673372564, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.60084468126297, + "rewards/thk_ans_format_reward": 1.0, + "step": 977, + "think_completion_length": 5.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.65625, + "epoch": 3.3035413153457, + "grad_norm": 11.306943843758415, + "kl": 0.615234375, + "learning_rate": 7.246621621621621e-07, + "loss": 0.0006, + "reward": 3.3849350214004517, + "reward_std": 0.07456074655056, + "rewards/final_reward": 1.1514664823950427, + "rewards/mask_iou_reward": 0.5757332411975213, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3849350214004517, + "rewards/thk_ans_format_reward": 1.0, + "step": 978, + "think_completion_length": 5.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.93750762939453, + "epoch": 3.3069139966273187, + "grad_norm": 10.21767200683652, + "kl": 0.654296875, + "learning_rate": 7.243806306306306e-07, + "loss": 0.0007, + "reward": 3.547337055206299, + "reward_std": 0.17310263961553574, + "rewards/final_reward": 1.8143662268646201, + "rewards/mask_iou_reward": 0.9071831134323101, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5473372340202332, + "rewards/thk_ans_format_reward": 1.0, + "step": 979, + "think_completion_length": 5.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.69791793823242, + "epoch": 3.3102866779089375, + "grad_norm": 11.932914959574404, + "kl": 1.396484375, + "learning_rate": 7.24099099099099e-07, + "loss": 0.0014, + "reward": 3.4563785791397095, + "reward_std": 0.20236939936876297, + "rewards/final_reward": 1.5273377877499197, + "rewards/mask_iou_reward": 0.7636688938749598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.456378698348999, + "rewards/thk_ans_format_reward": 1.0, + "step": 980, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.21875, + "epoch": 3.3136593591905563, + "grad_norm": 18.065931443302414, + "kl": 0.56640625, + "learning_rate": 7.238175675675675e-07, + "loss": 0.0006, + "reward": 3.3971447944641113, + "reward_std": 0.1286163404583931, + "rewards/final_reward": 1.5317366674830568, + "rewards/mask_iou_reward": 0.7658683337415284, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3971449732780457, + "rewards/thk_ans_format_reward": 1.0, + "step": 981, + "think_completion_length": 4.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.17708587646484, + "epoch": 3.317032040472175, + "grad_norm": 11.931088848754653, + "kl": 0.541015625, + "learning_rate": 7.23536036036036e-07, + "loss": 0.0006, + "reward": 3.5437206029891968, + "reward_std": 0.09862393140792847, + "rewards/final_reward": 1.673236912252018, + "rewards/mask_iou_reward": 0.836618456126009, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5437206029891968, + "rewards/thk_ans_format_reward": 1.0, + "step": 982, + "think_completion_length": 5.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.1979217529297, + "epoch": 3.3204047217537944, + "grad_norm": 14.687787310829892, + "kl": 0.55859375, + "learning_rate": 7.232545045045044e-07, + "loss": 0.0006, + "reward": 3.3935028314590454, + "reward_std": 0.09046576172113419, + "rewards/final_reward": 1.3983763624905539, + "rewards/mask_iou_reward": 0.6991881812452769, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3935027718544006, + "rewards/thk_ans_format_reward": 1.0, + "step": 983, + "think_completion_length": 5.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0729217529297, + "epoch": 3.3237774030354132, + "grad_norm": 15.914576651462694, + "kl": 0.578125, + "learning_rate": 7.22972972972973e-07, + "loss": 0.0006, + "reward": 3.3592745065689087, + "reward_std": 0.2667535990476608, + "rewards/final_reward": 1.5896124561677536, + "rewards/mask_iou_reward": 0.7948062280838768, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3801075220108032, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 984, + "think_completion_length": 6.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.88541793823242, + "epoch": 3.327150084317032, + "grad_norm": 16.2544577755076, + "kl": 0.615234375, + "learning_rate": 7.226914414414414e-07, + "loss": 0.0006, + "reward": 3.4338074922561646, + "reward_std": 0.18879379332065582, + "rewards/final_reward": 1.4561126090209395, + "rewards/mask_iou_reward": 0.7280563045104698, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4338074922561646, + "rewards/thk_ans_format_reward": 1.0, + "step": 985, + "think_completion_length": 5.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.7291717529297, + "epoch": 3.330522765598651, + "grad_norm": 24.925539285671643, + "kl": 0.51953125, + "learning_rate": 7.224099099099099e-07, + "loss": 0.0005, + "reward": 3.032238721847534, + "reward_std": 0.1719028726220131, + "rewards/final_reward": 0.9244178777787244, + "rewards/mask_iou_reward": 0.4622089388893622, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0426553189754486, + "rewards/thk_ans_format_reward": 1.0, + "step": 986, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.30208587646484, + "epoch": 3.3338954468802697, + "grad_norm": 12.043070314100586, + "kl": 0.583984375, + "learning_rate": 7.221283783783784e-07, + "loss": 0.0006, + "reward": 3.320284605026245, + "reward_std": 0.17223292589187622, + "rewards/final_reward": 1.2791357239045653, + "rewards/mask_iou_reward": 0.6395678619522827, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3202844858169556, + "rewards/thk_ans_format_reward": 1.0, + "step": 987, + "think_completion_length": 5.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.00000762939453, + "epoch": 3.3372681281618886, + "grad_norm": 58.09821409600536, + "kl": 0.591796875, + "learning_rate": 7.218468468468468e-07, + "loss": 0.0007, + "reward": 2.9446089267730713, + "reward_std": 0.19425636157393456, + "rewards/final_reward": 0.6028217155810873, + "rewards/mask_iou_reward": 0.30141085779054366, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9446086883544922, + "rewards/thk_ans_format_reward": 1.0, + "step": 988, + "think_completion_length": 6.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.46875, + "epoch": 3.3406408094435074, + "grad_norm": 25.87132248920914, + "kl": 2.58984375, + "learning_rate": 7.215653153153153e-07, + "loss": 0.0026, + "reward": 3.2142951488494873, + "reward_std": 0.08715818449854851, + "rewards/final_reward": 0.05117792402181443, + "rewards/mask_iou_reward": 0.025588962010907216, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2142953276634216, + "rewards/thk_ans_format_reward": 1.0, + "step": 989, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.56250381469727, + "epoch": 3.3440134907251267, + "grad_norm": 29.103910507604148, + "kl": 0.67578125, + "learning_rate": 7.212837837837837e-07, + "loss": 0.0007, + "reward": 3.138665556907654, + "reward_std": 0.13922191970050335, + "rewards/final_reward": 0.9125553380184775, + "rewards/mask_iou_reward": 0.4562776690092388, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1386656165122986, + "rewards/thk_ans_format_reward": 1.0, + "step": 990, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.48958587646484, + "epoch": 3.3473861720067455, + "grad_norm": 76.55762479280577, + "kl": 0.5703125, + "learning_rate": 7.210022522522522e-07, + "loss": 0.0006, + "reward": 3.4942747354507446, + "reward_std": 0.050971828401088715, + "rewards/final_reward": 1.619205718547097, + "rewards/mask_iou_reward": 0.8096028592735485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4942745566368103, + "rewards/thk_ans_format_reward": 1.0, + "step": 991, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.48958587646484, + "epoch": 3.3507588532883643, + "grad_norm": 8.513639306430193, + "kl": 0.5546875, + "learning_rate": 7.207207207207207e-07, + "loss": 0.0006, + "reward": 3.1507691144943237, + "reward_std": 0.13545623049139977, + "rewards/final_reward": 1.7922970168952874, + "rewards/mask_iou_reward": 0.8961485084476437, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1716025471687317, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 992, + "think_completion_length": 5.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.14583587646484, + "epoch": 3.354131534569983, + "grad_norm": 14.357147390481792, + "kl": 0.552734375, + "learning_rate": 7.204391891891891e-07, + "loss": 0.0006, + "reward": 3.3162766695022583, + "reward_std": 0.19133785367012024, + "rewards/final_reward": 1.6321412486895803, + "rewards/mask_iou_reward": 0.8160706243447902, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.3579435348510742, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 993, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.5729217529297, + "epoch": 3.357504215851602, + "grad_norm": 16.092738065915093, + "kl": 0.6015625, + "learning_rate": 7.201576576576577e-07, + "loss": 0.0006, + "reward": 3.2174041271209717, + "reward_std": 0.1023724116384983, + "rewards/final_reward": 1.379500604239666, + "rewards/mask_iou_reward": 0.689750302119833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2174039483070374, + "rewards/thk_ans_format_reward": 1.0, + "step": 994, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.45833587646484, + "epoch": 3.360876897133221, + "grad_norm": 6.582151617464147, + "kl": 0.580078125, + "learning_rate": 7.198761261261262e-07, + "loss": 0.0006, + "reward": 3.3776434659957886, + "reward_std": 0.18915196508169174, + "rewards/final_reward": 1.344115784489828, + "rewards/mask_iou_reward": 0.672057892244914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3776432871818542, + "rewards/thk_ans_format_reward": 1.0, + "step": 995, + "think_completion_length": 4.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.53125381469727, + "epoch": 3.3642495784148396, + "grad_norm": 745.984834898941, + "kl": 0.603515625, + "learning_rate": 7.195945945945946e-07, + "loss": 0.0006, + "reward": 3.356271982192993, + "reward_std": 0.10604298114776611, + "rewards/final_reward": 0.7671502588916607, + "rewards/mask_iou_reward": 0.38357512944583033, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3562718629837036, + "rewards/thk_ans_format_reward": 1.0, + "step": 996, + "think_completion_length": 7.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.72916793823242, + "epoch": 3.367622259696459, + "grad_norm": 9.526246336319616, + "kl": 1.103515625, + "learning_rate": 7.193130630630631e-07, + "loss": 0.0011, + "reward": 3.502687931060791, + "reward_std": 0.11498154327273369, + "rewards/final_reward": 1.8288208638873686, + "rewards/mask_iou_reward": 0.9144104319436843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5026878118515015, + "rewards/thk_ans_format_reward": 1.0, + "step": 997, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.39583587646484, + "epoch": 3.3709949409780777, + "grad_norm": 14.226556765416788, + "kl": 0.70703125, + "learning_rate": 7.190315315315315e-07, + "loss": 0.0007, + "reward": 3.314034104347229, + "reward_std": 0.1424049399793148, + "rewards/final_reward": 1.7067345169496768, + "rewards/mask_iou_reward": 0.8533672584748384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3140341639518738, + "rewards/thk_ans_format_reward": 1.0, + "step": 998, + "think_completion_length": 5.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.9583396911621, + "epoch": 3.3743676222596966, + "grad_norm": 23.291971729599407, + "kl": 0.560546875, + "learning_rate": 7.1875e-07, + "loss": 0.0006, + "reward": 3.2234108448028564, + "reward_std": 0.12070946767926216, + "rewards/final_reward": 1.400489735686926, + "rewards/mask_iou_reward": 0.700244867843463, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2234107553958893, + "rewards/thk_ans_format_reward": 1.0, + "step": 999, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.73958587646484, + "epoch": 3.3777403035413154, + "grad_norm": 14.459397846292022, + "kl": 0.689453125, + "learning_rate": 7.184684684684685e-07, + "loss": 0.0007, + "reward": 3.5166208744049072, + "reward_std": 0.19298768788576126, + "rewards/final_reward": 1.4085889481186704, + "rewards/mask_iou_reward": 0.7042944740593352, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5166207551956177, + "rewards/thk_ans_format_reward": 1.0, + "step": 1000, + "think_completion_length": 5.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.41667175292969, + "epoch": 3.381112984822934, + "grad_norm": 39.82344513333924, + "kl": 0.552734375, + "learning_rate": 7.181869369369369e-07, + "loss": 0.0006, + "reward": 3.3766207695007324, + "reward_std": 0.1761082075536251, + "rewards/final_reward": 1.035476774600391, + "rewards/mask_iou_reward": 0.5177383873001955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3766207098960876, + "rewards/thk_ans_format_reward": 1.0, + "step": 1001, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.48958587646484, + "epoch": 3.384485666104553, + "grad_norm": 26.368496651256653, + "kl": 0.654296875, + "learning_rate": 7.179054054054054e-07, + "loss": 0.0007, + "reward": 3.17868971824646, + "reward_std": 0.1292325034737587, + "rewards/final_reward": 1.2441194261216724, + "rewards/mask_iou_reward": 0.6220597130608362, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1786896586418152, + "rewards/thk_ans_format_reward": 1.0, + "step": 1002, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.62500381469727, + "epoch": 3.387858347386172, + "grad_norm": 16.701541199494095, + "kl": 0.583984375, + "learning_rate": 7.176238738738738e-07, + "loss": 0.0006, + "reward": 3.057352304458618, + "reward_std": 0.32341183722019196, + "rewards/final_reward": 1.515199413691938, + "rewards/mask_iou_reward": 0.757599706845969, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.057352364063263, + "rewards/thk_ans_format_reward": 1.0, + "step": 1003, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.09375, + "epoch": 3.391231028667791, + "grad_norm": 10.720874146170441, + "kl": 0.78515625, + "learning_rate": 7.173423423423423e-07, + "loss": 0.0008, + "reward": 2.9834378957748413, + "reward_std": 0.18407592922449112, + "rewards/final_reward": 1.285521686629659, + "rewards/mask_iou_reward": 0.6427608433148295, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9834379553794861, + "rewards/thk_ans_format_reward": 1.0, + "step": 1004, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.78125, + "epoch": 3.39460370994941, + "grad_norm": 12.028385144223307, + "kl": 0.53125, + "learning_rate": 7.170608108108109e-07, + "loss": 0.0005, + "reward": 3.208145260810852, + "reward_std": 0.22115997970104218, + "rewards/final_reward": 1.1253257130810643, + "rewards/mask_iou_reward": 0.5626628565405322, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2081450521945953, + "rewards/thk_ans_format_reward": 1.0, + "step": 1005, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.27083587646484, + "epoch": 3.397976391231029, + "grad_norm": 294.64644819140955, + "kl": 0.916015625, + "learning_rate": 7.167792792792793e-07, + "loss": 0.0009, + "reward": 3.362850785255432, + "reward_std": 0.12509119138121605, + "rewards/final_reward": 1.4450654677791082, + "rewards/mask_iou_reward": 0.7225327338895541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3628507852554321, + "rewards/thk_ans_format_reward": 1.0, + "step": 1006, + "think_completion_length": 6.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.90625762939453, + "epoch": 3.4013490725126476, + "grad_norm": 11.539631893789023, + "kl": 0.58203125, + "learning_rate": 7.164977477477477e-07, + "loss": 0.0006, + "reward": 3.2372653484344482, + "reward_std": 0.2422248274087906, + "rewards/final_reward": 1.3411681388721064, + "rewards/mask_iou_reward": 0.6705840694360532, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2372653186321259, + "rewards/thk_ans_format_reward": 1.0, + "step": 1007, + "think_completion_length": 4.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.00000381469727, + "epoch": 3.4047217537942664, + "grad_norm": 14.000104300475737, + "kl": 0.6171875, + "learning_rate": 7.162162162162161e-07, + "loss": 0.0006, + "reward": 3.285541296005249, + "reward_std": 0.16645950078964233, + "rewards/final_reward": 0.8122575552678174, + "rewards/mask_iou_reward": 0.4061287776339087, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2855412364006042, + "rewards/thk_ans_format_reward": 1.0, + "step": 1008, + "think_completion_length": 5.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.14583587646484, + "epoch": 3.4080944350758853, + "grad_norm": 6.440987836858945, + "kl": 0.583984375, + "learning_rate": 7.159346846846846e-07, + "loss": 0.0006, + "reward": 3.5113813877105713, + "reward_std": 0.17040079832077026, + "rewards/final_reward": 1.3076065447677623, + "rewards/mask_iou_reward": 0.6538032723838811, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5113813281059265, + "rewards/thk_ans_format_reward": 1.0, + "step": 1009, + "think_completion_length": 5.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.46875381469727, + "epoch": 3.411467116357504, + "grad_norm": 77.78402545991266, + "kl": 0.591796875, + "learning_rate": 7.156531531531531e-07, + "loss": 0.0006, + "reward": 3.265267848968506, + "reward_std": 0.0866379663348198, + "rewards/final_reward": 1.559751116566344, + "rewards/mask_iou_reward": 0.779875558283172, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2652679085731506, + "rewards/thk_ans_format_reward": 1.0, + "step": 1010, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.38541793823242, + "epoch": 3.414839797639123, + "grad_norm": 9.63407648262111, + "kl": 0.671875, + "learning_rate": 7.153716216216215e-07, + "loss": 0.0007, + "reward": 3.1705085039138794, + "reward_std": 0.1311897709965706, + "rewards/final_reward": 0.8278068040052939, + "rewards/mask_iou_reward": 0.41390340200264697, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1705085337162018, + "rewards/thk_ans_format_reward": 1.0, + "step": 1011, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.19791793823242, + "epoch": 3.4182124789207418, + "grad_norm": 10.822874300002365, + "kl": 0.5703125, + "learning_rate": 7.1509009009009e-07, + "loss": 0.0006, + "reward": 2.9343758821487427, + "reward_std": 0.1505995076149702, + "rewards/final_reward": 0.016859356044498678, + "rewards/mask_iou_reward": 0.008429678022249339, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9343758225440979, + "rewards/thk_ans_format_reward": 1.0, + "step": 1012, + "think_completion_length": 5.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.15625, + "epoch": 3.421585160202361, + "grad_norm": 30.516206363772966, + "kl": 0.65234375, + "learning_rate": 7.148085585585584e-07, + "loss": 0.0007, + "reward": 3.3128777742385864, + "reward_std": 0.09216344356536865, + "rewards/final_reward": 1.3309258266346848, + "rewards/mask_iou_reward": 0.6654629133173424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3128776550292969, + "rewards/thk_ans_format_reward": 1.0, + "step": 1013, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.69791793823242, + "epoch": 3.42495784148398, + "grad_norm": 9.172659732253166, + "kl": 0.619140625, + "learning_rate": 7.145270270270269e-07, + "loss": 0.0006, + "reward": 3.099686861038208, + "reward_std": 0.16987330839037895, + "rewards/final_reward": 0.5487296181785621, + "rewards/mask_iou_reward": 0.27436480908928107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.099686861038208, + "rewards/thk_ans_format_reward": 1.0, + "step": 1014, + "think_completion_length": 5.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.57291793823242, + "epoch": 3.4283305227655987, + "grad_norm": 14.378581603595407, + "kl": 0.583984375, + "learning_rate": 7.142454954954955e-07, + "loss": 0.0006, + "reward": 3.4818849563598633, + "reward_std": 0.09820106998085976, + "rewards/final_reward": 0.8706115831652791, + "rewards/mask_iou_reward": 0.43530579158263955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.481884777545929, + "rewards/thk_ans_format_reward": 1.0, + "step": 1015, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.18750381469727, + "epoch": 3.4317032040472175, + "grad_norm": 81.49210537692733, + "kl": 0.578125, + "learning_rate": 7.139639639639639e-07, + "loss": 0.0006, + "reward": 3.278176784515381, + "reward_std": 0.17319020628929138, + "rewards/final_reward": 1.311001176552672, + "rewards/mask_iou_reward": 0.655500588276336, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.278176724910736, + "rewards/thk_ans_format_reward": 1.0, + "step": 1016, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.10416793823242, + "epoch": 3.4350758853288363, + "grad_norm": 9.026460969823404, + "kl": 0.72265625, + "learning_rate": 7.136824324324324e-07, + "loss": 0.0007, + "reward": 3.395389676094055, + "reward_std": 0.24909770116209984, + "rewards/final_reward": 1.7078976175332112, + "rewards/mask_iou_reward": 0.8539488087666056, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.447472870349884, + "rewards/thk_ans_format_reward": 1.0, + "step": 1017, + "think_completion_length": 5.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.09375762939453, + "epoch": 3.438448566610455, + "grad_norm": 15.858132519987185, + "kl": 0.595703125, + "learning_rate": 7.134009009009009e-07, + "loss": 0.0006, + "reward": 3.220812678337097, + "reward_std": 0.048954208847135305, + "rewards/final_reward": 1.0691695640976275, + "rewards/mask_iou_reward": 0.5345847820488138, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2208125591278076, + "rewards/thk_ans_format_reward": 1.0, + "step": 1018, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.25, + "epoch": 3.441821247892074, + "grad_norm": 17.508912895515763, + "kl": 0.54296875, + "learning_rate": 7.131193693693693e-07, + "loss": 0.0005, + "reward": 3.421711802482605, + "reward_std": 0.08426211401820183, + "rewards/final_reward": 1.4825259952470544, + "rewards/mask_iou_reward": 0.7412629976235272, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4217117428779602, + "rewards/thk_ans_format_reward": 1.0, + "step": 1019, + "think_completion_length": 6.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.70833587646484, + "epoch": 3.4451939291736933, + "grad_norm": 9.819818344541755, + "kl": 0.5859375, + "learning_rate": 7.128378378378378e-07, + "loss": 0.0006, + "reward": 3.5098577737808228, + "reward_std": 0.0756399855017662, + "rewards/final_reward": 1.8016748094536283, + "rewards/mask_iou_reward": 0.9008374047268142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.509857714176178, + "rewards/thk_ans_format_reward": 1.0, + "step": 1020, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.62500381469727, + "epoch": 3.448566610455312, + "grad_norm": 9.870525631678277, + "kl": 0.59765625, + "learning_rate": 7.125563063063062e-07, + "loss": 0.0006, + "reward": 3.2804505825042725, + "reward_std": 0.044149222783744335, + "rewards/final_reward": 1.4287736457226252, + "rewards/mask_iou_reward": 0.7143868228613126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.280450701713562, + "rewards/thk_ans_format_reward": 1.0, + "step": 1021, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.01041793823242, + "epoch": 3.451939291736931, + "grad_norm": 14.709563001580337, + "kl": 0.58984375, + "learning_rate": 7.122747747747747e-07, + "loss": 0.0006, + "reward": 3.495166778564453, + "reward_std": 0.08700309973210096, + "rewards/final_reward": 1.4906810415064982, + "rewards/mask_iou_reward": 0.7453405207532491, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4951667785644531, + "rewards/thk_ans_format_reward": 1.0, + "step": 1022, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.00000381469727, + "epoch": 3.4553119730185498, + "grad_norm": 11.39047362652609, + "kl": 0.58203125, + "learning_rate": 7.119932432432432e-07, + "loss": 0.0006, + "reward": 3.120210886001587, + "reward_std": 0.11821487359702587, + "rewards/final_reward": 0.7625099604747464, + "rewards/mask_iou_reward": 0.3812549802373732, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1202109456062317, + "rewards/thk_ans_format_reward": 1.0, + "step": 1023, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.14583587646484, + "epoch": 3.4586846543001686, + "grad_norm": 12.194540716632517, + "kl": 0.640625, + "learning_rate": 7.117117117117116e-07, + "loss": 0.0006, + "reward": 3.422505021095276, + "reward_std": 0.06610206328332424, + "rewards/final_reward": 1.3190821987653705, + "rewards/mask_iou_reward": 0.6595410993826852, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4225048422813416, + "rewards/thk_ans_format_reward": 1.0, + "step": 1024, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.20833587646484, + "epoch": 3.4620573355817874, + "grad_norm": 27.511552738652743, + "kl": 0.59765625, + "learning_rate": 7.114301801801802e-07, + "loss": 0.0006, + "reward": 3.3963574171066284, + "reward_std": 0.05695942044258118, + "rewards/final_reward": 1.834786710391498, + "rewards/mask_iou_reward": 0.917393355195749, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3963574171066284, + "rewards/thk_ans_format_reward": 1.0, + "step": 1025, + "think_completion_length": 6.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.625, + "epoch": 3.4654300168634062, + "grad_norm": 32.633397899339734, + "kl": 0.625, + "learning_rate": 7.111486486486487e-07, + "loss": 0.0006, + "reward": 3.3901573419570923, + "reward_std": 0.3244527727365494, + "rewards/final_reward": 1.6744121917113473, + "rewards/mask_iou_reward": 0.8372060958556736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3901574611663818, + "rewards/thk_ans_format_reward": 1.0, + "step": 1026, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.47916793823242, + "epoch": 3.4688026981450255, + "grad_norm": 11.68248831433917, + "kl": 0.578125, + "learning_rate": 7.108671171171171e-07, + "loss": 0.0006, + "reward": 3.42316472530365, + "reward_std": 0.1372026950120926, + "rewards/final_reward": 1.2456748072284278, + "rewards/mask_iou_reward": 0.6228374036142139, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4231645464897156, + "rewards/thk_ans_format_reward": 1.0, + "step": 1027, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.57291793823242, + "epoch": 3.4721753794266443, + "grad_norm": 15.197451678878082, + "kl": 1.07421875, + "learning_rate": 7.105855855855856e-07, + "loss": 0.0011, + "reward": 3.4628374576568604, + "reward_std": 0.14769380167126656, + "rewards/final_reward": 1.2430186680061066, + "rewards/mask_iou_reward": 0.6215093340030533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4628373980522156, + "rewards/thk_ans_format_reward": 1.0, + "step": 1028, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.56250381469727, + "epoch": 3.475548060708263, + "grad_norm": 8.819992248258746, + "kl": 0.623046875, + "learning_rate": 7.10304054054054e-07, + "loss": 0.0006, + "reward": 3.103990077972412, + "reward_std": 0.17439210042357445, + "rewards/final_reward": 0.11795010824929059, + "rewards/mask_iou_reward": 0.05897505412464529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1039898991584778, + "rewards/thk_ans_format_reward": 1.0, + "step": 1029, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.91666793823242, + "epoch": 3.478920741989882, + "grad_norm": 34.69363820910185, + "kl": 0.83203125, + "learning_rate": 7.100225225225225e-07, + "loss": 0.0008, + "reward": 3.3977935314178467, + "reward_std": 0.08490690216422081, + "rewards/final_reward": 1.6032253984319262, + "rewards/mask_iou_reward": 0.8016126992159631, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3977934122085571, + "rewards/thk_ans_format_reward": 1.0, + "step": 1030, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.34375381469727, + "epoch": 3.482293423271501, + "grad_norm": 27.013938649093514, + "kl": 0.5703125, + "learning_rate": 7.09740990990991e-07, + "loss": 0.0006, + "reward": 3.4430354833602905, + "reward_std": 0.10244200751185417, + "rewards/final_reward": 1.7051466739701793, + "rewards/mask_iou_reward": 0.8525733369850896, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4430354833602905, + "rewards/thk_ans_format_reward": 1.0, + "step": 1031, + "think_completion_length": 5.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.48958587646484, + "epoch": 3.4856661045531196, + "grad_norm": 12.142628291268021, + "kl": 0.65625, + "learning_rate": 7.094594594594594e-07, + "loss": 0.0007, + "reward": 3.4855599403381348, + "reward_std": 0.14212478697299957, + "rewards/final_reward": 1.9155205822997325, + "rewards/mask_iou_reward": 0.9577602911498663, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4855600595474243, + "rewards/thk_ans_format_reward": 1.0, + "step": 1032, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.60416793823242, + "epoch": 3.4890387858347385, + "grad_norm": 36.40473198415941, + "kl": 0.6171875, + "learning_rate": 7.091779279279279e-07, + "loss": 0.0006, + "reward": 3.26971971988678, + "reward_std": 0.11622913181781769, + "rewards/final_reward": 1.2890900177746465, + "rewards/mask_iou_reward": 0.6445450088873232, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2697195410728455, + "rewards/thk_ans_format_reward": 1.0, + "step": 1033, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.01041793823242, + "epoch": 3.4924114671163577, + "grad_norm": 8.404072392453646, + "kl": 0.61328125, + "learning_rate": 7.088963963963963e-07, + "loss": 0.0007, + "reward": 3.4270265102386475, + "reward_std": 0.10212980210781097, + "rewards/final_reward": 1.8368501315873327, + "rewards/mask_iou_reward": 0.9184250657936663, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4270267486572266, + "rewards/thk_ans_format_reward": 1.0, + "step": 1034, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.04166793823242, + "epoch": 3.4957841483979766, + "grad_norm": 7.301922297356568, + "kl": 0.587890625, + "learning_rate": 7.086148648648649e-07, + "loss": 0.0006, + "reward": 3.2225120067596436, + "reward_std": 0.18624672293663025, + "rewards/final_reward": 1.7009970313208873, + "rewards/mask_iou_reward": 0.8504985156604437, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.222511887550354, + "rewards/thk_ans_format_reward": 1.0, + "step": 1035, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.02083587646484, + "epoch": 3.4991568296795954, + "grad_norm": 11.427769416824685, + "kl": 0.560546875, + "learning_rate": 7.083333333333334e-07, + "loss": 0.0006, + "reward": 3.1986045837402344, + "reward_std": 0.16096675768494606, + "rewards/final_reward": 0.6870702507692115, + "rewards/mask_iou_reward": 0.34353512538460573, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1986045241355896, + "rewards/thk_ans_format_reward": 1.0, + "step": 1036, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.78125381469727, + "epoch": 3.5025295109612142, + "grad_norm": 30.010243571681038, + "kl": 0.564453125, + "learning_rate": 7.080518018018018e-07, + "loss": 0.0006, + "reward": 3.338430643081665, + "reward_std": 0.1133829839527607, + "rewards/final_reward": 1.564505636833713, + "rewards/mask_iou_reward": 0.7822528184168565, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.338430643081665, + "rewards/thk_ans_format_reward": 1.0, + "step": 1037, + "think_completion_length": 5.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.76042175292969, + "epoch": 3.505902192242833, + "grad_norm": 16.436476631009363, + "kl": 0.55859375, + "learning_rate": 7.077702702702703e-07, + "loss": 0.0006, + "reward": 3.476973533630371, + "reward_std": 0.14361018687486649, + "rewards/final_reward": 1.3896836175215372, + "rewards/mask_iou_reward": 0.6948418087607686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.476973533630371, + "rewards/thk_ans_format_reward": 1.0, + "step": 1038, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.56250381469727, + "epoch": 3.509274873524452, + "grad_norm": 24.14126315202132, + "kl": 0.587890625, + "learning_rate": 7.074887387387387e-07, + "loss": 0.0006, + "reward": 3.375579357147217, + "reward_std": 0.057378935627639294, + "rewards/final_reward": 1.8330912942769158, + "rewards/mask_iou_reward": 0.9165456471384579, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3755793571472168, + "rewards/thk_ans_format_reward": 1.0, + "step": 1039, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.67708587646484, + "epoch": 3.5126475548060707, + "grad_norm": 11.93388249586833, + "kl": 0.6484375, + "learning_rate": 7.072072072072072e-07, + "loss": 0.0007, + "reward": 3.459592580795288, + "reward_std": 0.10880524665117264, + "rewards/final_reward": 1.161511180571614, + "rewards/mask_iou_reward": 0.580755590285807, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4595925211906433, + "rewards/thk_ans_format_reward": 1.0, + "step": 1040, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.89583587646484, + "epoch": 3.51602023608769, + "grad_norm": 11.0119929277027, + "kl": 0.669921875, + "learning_rate": 7.069256756756757e-07, + "loss": 0.0007, + "reward": 2.977562189102173, + "reward_std": 0.2377164401113987, + "rewards/final_reward": 0.13724020798525324, + "rewards/mask_iou_reward": 0.06862010399262662, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9775621891021729, + "rewards/thk_ans_format_reward": 1.0, + "step": 1041, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.52083587646484, + "epoch": 3.5193929173693084, + "grad_norm": 10.690119770965087, + "kl": 0.6015625, + "learning_rate": 7.066441441441441e-07, + "loss": 0.0006, + "reward": 3.548872470855713, + "reward_std": 0.05538544990122318, + "rewards/final_reward": 1.0239648673086363, + "rewards/mask_iou_reward": 0.5119824336543182, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.548872470855713, + "rewards/thk_ans_format_reward": 1.0, + "step": 1042, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.15625, + "epoch": 3.5227655986509276, + "grad_norm": 13.157116461404055, + "kl": 0.544921875, + "learning_rate": 7.063626126126126e-07, + "loss": 0.0005, + "reward": 3.264909267425537, + "reward_std": 0.08749586343765259, + "rewards/final_reward": 1.3016585640148213, + "rewards/mask_iou_reward": 0.6508292820074106, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2649091482162476, + "rewards/thk_ans_format_reward": 1.0, + "step": 1043, + "think_completion_length": 5.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.58333587646484, + "epoch": 3.5261382799325465, + "grad_norm": 32.2281674012685, + "kl": 2.115234375, + "learning_rate": 7.06081081081081e-07, + "loss": 0.0021, + "reward": 3.306381583213806, + "reward_std": 0.1866590976715088, + "rewards/final_reward": 1.010364329315494, + "rewards/mask_iou_reward": 0.505182164657747, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.3376315236091614, + "rewards/thk_ans_format_reward": 1.0, + "step": 1044, + "think_completion_length": 6.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.04166793823242, + "epoch": 3.5295109612141653, + "grad_norm": 25.27852220793088, + "kl": 0.671875, + "learning_rate": 7.057995495495496e-07, + "loss": 0.0007, + "reward": 3.2020857334136963, + "reward_std": 0.07453176006674767, + "rewards/final_reward": 1.422622440794243, + "rewards/mask_iou_reward": 0.7113112203971215, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2020857334136963, + "rewards/thk_ans_format_reward": 1.0, + "step": 1045, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.01042175292969, + "epoch": 3.532883642495784, + "grad_norm": 21.604898654705316, + "kl": 0.703125, + "learning_rate": 7.055180180180181e-07, + "loss": 0.0007, + "reward": 3.4886926412582397, + "reward_std": 0.18602124601602554, + "rewards/final_reward": 0.9582341538163569, + "rewards/mask_iou_reward": 0.47911707690817845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4886927604675293, + "rewards/thk_ans_format_reward": 1.0, + "step": 1046, + "think_completion_length": 5.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.2916717529297, + "epoch": 3.536256323777403, + "grad_norm": 15.95223692471976, + "kl": 0.69140625, + "learning_rate": 7.052364864864864e-07, + "loss": 0.0007, + "reward": 3.167095899581909, + "reward_std": 0.20952048152685165, + "rewards/final_reward": 1.5740200985301716, + "rewards/mask_iou_reward": 0.7870100492650858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1670955419540405, + "rewards/thk_ans_format_reward": 1.0, + "step": 1047, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.97916793823242, + "epoch": 3.539629005059022, + "grad_norm": 17.810160005926072, + "kl": 0.556640625, + "learning_rate": 7.049549549549549e-07, + "loss": 0.0006, + "reward": 3.306037187576294, + "reward_std": 0.2319406047463417, + "rewards/final_reward": 1.0558346546299109, + "rewards/mask_iou_reward": 0.5279173273149554, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.306037187576294, + "rewards/thk_ans_format_reward": 1.0, + "step": 1048, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.87500381469727, + "epoch": 3.5430016863406406, + "grad_norm": 163.3696580898773, + "kl": 0.609375, + "learning_rate": 7.046734234234234e-07, + "loss": 0.0006, + "reward": 3.6224377155303955, + "reward_std": 0.013898211065679789, + "rewards/final_reward": 1.6734318576156038, + "rewards/mask_iou_reward": 0.8367159288078019, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6224374771118164, + "rewards/thk_ans_format_reward": 1.0, + "step": 1049, + "think_completion_length": 5.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.61458587646484, + "epoch": 3.54637436762226, + "grad_norm": 18.906848115177475, + "kl": 0.666015625, + "learning_rate": 7.043918918918918e-07, + "loss": 0.0007, + "reward": 3.389710783958435, + "reward_std": 0.23390305787324905, + "rewards/final_reward": 1.5296034337359203, + "rewards/mask_iou_reward": 0.7648017168679602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3897106647491455, + "rewards/thk_ans_format_reward": 1.0, + "step": 1050, + "think_completion_length": 6.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.37500381469727, + "epoch": 3.5497470489038787, + "grad_norm": 14.33713455543594, + "kl": 0.580078125, + "learning_rate": 7.041103603603603e-07, + "loss": 0.0006, + "reward": 2.87336528301239, + "reward_std": 0.24747732281684875, + "rewards/final_reward": 1.3500116230362549, + "rewards/mask_iou_reward": 0.6750058115181274, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 0.946281909942627, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1051, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.94791793823242, + "epoch": 3.5531197301854975, + "grad_norm": 9.87205102185769, + "kl": 0.658203125, + "learning_rate": 7.038288288288287e-07, + "loss": 0.0007, + "reward": 3.4340622425079346, + "reward_std": 0.11470721662044525, + "rewards/final_reward": 1.6583001580940349, + "rewards/mask_iou_reward": 0.8291500790470174, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4340623021125793, + "rewards/thk_ans_format_reward": 1.0, + "step": 1052, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.08333587646484, + "epoch": 3.5564924114671164, + "grad_norm": 9.62732499087711, + "kl": 0.587890625, + "learning_rate": 7.035472972972972e-07, + "loss": 0.0006, + "reward": 3.484450578689575, + "reward_std": 0.11886984389275312, + "rewards/final_reward": 1.9086974388312166, + "rewards/mask_iou_reward": 0.9543487194156083, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4844504594802856, + "rewards/thk_ans_format_reward": 1.0, + "step": 1053, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.83333587646484, + "epoch": 3.559865092748735, + "grad_norm": 14.8438354944717, + "kl": 0.775390625, + "learning_rate": 7.032657657657657e-07, + "loss": 0.0008, + "reward": 3.4443854093551636, + "reward_std": 0.2593641094863415, + "rewards/final_reward": 1.3469927356049844, + "rewards/mask_iou_reward": 0.6734963678024922, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4443853497505188, + "rewards/thk_ans_format_reward": 1.0, + "step": 1054, + "think_completion_length": 5.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.56250381469727, + "epoch": 3.563237774030354, + "grad_norm": 11.380540635575645, + "kl": 0.6015625, + "learning_rate": 7.029842342342342e-07, + "loss": 0.0006, + "reward": 3.0353925228118896, + "reward_std": 0.15303052216768265, + "rewards/final_reward": 1.38494825478689, + "rewards/mask_iou_reward": 0.692474127393445, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0353924930095673, + "rewards/thk_ans_format_reward": 1.0, + "step": 1055, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.71875762939453, + "epoch": 3.566610455311973, + "grad_norm": 8.725713225667999, + "kl": 0.765625, + "learning_rate": 7.027027027027027e-07, + "loss": 0.0008, + "reward": 3.3467824459075928, + "reward_std": 0.3739045560359955, + "rewards/final_reward": 1.7465660805194694, + "rewards/mask_iou_reward": 0.8732830402597347, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.3780324459075928, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1056, + "think_completion_length": 6.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.22916793823242, + "epoch": 3.569983136593592, + "grad_norm": 15.459697290041083, + "kl": 0.62109375, + "learning_rate": 7.024211711711711e-07, + "loss": 0.0006, + "reward": 3.3566296100616455, + "reward_std": 0.14935403689742088, + "rewards/final_reward": 1.384715234534162, + "rewards/mask_iou_reward": 0.692357617267081, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3566296696662903, + "rewards/thk_ans_format_reward": 1.0, + "step": 1057, + "think_completion_length": 6.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.39583587646484, + "epoch": 3.573355817875211, + "grad_norm": 12.131914905442958, + "kl": 0.83203125, + "learning_rate": 7.021396396396396e-07, + "loss": 0.0008, + "reward": 3.395006537437439, + "reward_std": 0.12985088303685188, + "rewards/final_reward": 1.3578735883213409, + "rewards/mask_iou_reward": 0.6789367941606704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4054231643676758, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1058, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.85417175292969, + "epoch": 3.5767284991568298, + "grad_norm": 6.470768374066208, + "kl": 0.599609375, + "learning_rate": 7.018581081081081e-07, + "loss": 0.0006, + "reward": 3.2740269899368286, + "reward_std": 0.16072769463062286, + "rewards/final_reward": 1.8859906667640165, + "rewards/mask_iou_reward": 0.9429953333820082, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2740269899368286, + "rewards/thk_ans_format_reward": 1.0, + "step": 1059, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.96875381469727, + "epoch": 3.5801011804384486, + "grad_norm": 10.983155260058247, + "kl": 0.580078125, + "learning_rate": 7.015765765765765e-07, + "loss": 0.0006, + "reward": 3.5100547075271606, + "reward_std": 0.11096128076314926, + "rewards/final_reward": 1.2615163082914997, + "rewards/mask_iou_reward": 0.6307581541457499, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5100546479225159, + "rewards/thk_ans_format_reward": 1.0, + "step": 1060, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.09375762939453, + "epoch": 3.5834738617200674, + "grad_norm": 8.844336705792847, + "kl": 0.826171875, + "learning_rate": 7.01295045045045e-07, + "loss": 0.0008, + "reward": 3.2920873165130615, + "reward_std": 0.09549107030034065, + "rewards/final_reward": 1.6379837260078178, + "rewards/mask_iou_reward": 0.8189918630039089, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2920872569084167, + "rewards/thk_ans_format_reward": 1.0, + "step": 1061, + "think_completion_length": 5.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.60416793823242, + "epoch": 3.5868465430016863, + "grad_norm": 16.903653696816175, + "kl": 0.708984375, + "learning_rate": 7.010135135135134e-07, + "loss": 0.0007, + "reward": 3.3538187742233276, + "reward_std": 0.24481885135173798, + "rewards/final_reward": 1.5848143437602702, + "rewards/mask_iou_reward": 0.7924071718801351, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3538187742233276, + "rewards/thk_ans_format_reward": 1.0, + "step": 1062, + "think_completion_length": 5.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.91666793823242, + "epoch": 3.590219224283305, + "grad_norm": 27.439916704869667, + "kl": 0.572265625, + "learning_rate": 7.007319819819819e-07, + "loss": 0.0006, + "reward": 3.5271008014678955, + "reward_std": 0.05490931309759617, + "rewards/final_reward": 1.4191140366165103, + "rewards/mask_iou_reward": 0.7095570183082551, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.527100920677185, + "rewards/thk_ans_format_reward": 1.0, + "step": 1063, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.62500381469727, + "epoch": 3.5935919055649244, + "grad_norm": 13.85268564742067, + "kl": 0.55078125, + "learning_rate": 7.004504504504504e-07, + "loss": 0.0006, + "reward": 3.6005152463912964, + "reward_std": 0.12097604386508465, + "rewards/final_reward": 1.4686288065436273, + "rewards/mask_iou_reward": 0.7343144032718136, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6005152463912964, + "rewards/thk_ans_format_reward": 1.0, + "step": 1064, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.55208587646484, + "epoch": 3.5969645868465427, + "grad_norm": 19.073402183690888, + "kl": 0.677734375, + "learning_rate": 7.001689189189189e-07, + "loss": 0.0007, + "reward": 2.8053646087646484, + "reward_std": 0.062431491911411285, + "rewards/final_reward": 0.9842462252401399, + "rewards/mask_iou_reward": 0.49212311262006997, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.8157810866832733, + "rewards/thk_ans_format_reward": 1.0, + "step": 1065, + "think_completion_length": 5.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.34375381469727, + "epoch": 3.600337268128162, + "grad_norm": 12.129261872140868, + "kl": 0.609375, + "learning_rate": 6.998873873873874e-07, + "loss": 0.0006, + "reward": 3.1443766355514526, + "reward_std": 0.04327939311042428, + "rewards/final_reward": 0.2489097145314306, + "rewards/mask_iou_reward": 0.1244548572657153, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.144376516342163, + "rewards/thk_ans_format_reward": 1.0, + "step": 1066, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.65625381469727, + "epoch": 3.603709949409781, + "grad_norm": 90.03169091556349, + "kl": 0.62890625, + "learning_rate": 6.996058558558559e-07, + "loss": 0.0006, + "reward": 3.5203282833099365, + "reward_std": 0.09074808657169342, + "rewards/final_reward": 1.6439105325893757, + "rewards/mask_iou_reward": 0.8219552662946878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5203283429145813, + "rewards/thk_ans_format_reward": 1.0, + "step": 1067, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.76041793823242, + "epoch": 3.6070826306913997, + "grad_norm": 9.523252458140334, + "kl": 0.67578125, + "learning_rate": 6.993243243243243e-07, + "loss": 0.0007, + "reward": 3.134290933609009, + "reward_std": 0.1437622308731079, + "rewards/final_reward": 1.7129957587646047, + "rewards/mask_iou_reward": 0.8564978793823024, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.144707441329956, + "rewards/thk_ans_format_reward": 1.0, + "step": 1068, + "think_completion_length": 6.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.59375381469727, + "epoch": 3.6104553119730185, + "grad_norm": 34.119749728566035, + "kl": 0.59765625, + "learning_rate": 6.990427927927928e-07, + "loss": 0.0006, + "reward": 3.7921831607818604, + "reward_std": 0.05658973567187786, + "rewards/final_reward": 1.8885518493785414, + "rewards/mask_iou_reward": 0.9442759246892707, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7921829223632812, + "rewards/thk_ans_format_reward": 1.0, + "step": 1069, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.58333587646484, + "epoch": 3.6138279932546373, + "grad_norm": 16.675311982963155, + "kl": 0.587890625, + "learning_rate": 6.987612612612612e-07, + "loss": 0.0006, + "reward": 3.5710495710372925, + "reward_std": 0.14011128805577755, + "rewards/final_reward": 1.092474775438216, + "rewards/mask_iou_reward": 0.546237387719108, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.571049690246582, + "rewards/thk_ans_format_reward": 1.0, + "step": 1070, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.54167175292969, + "epoch": 3.6172006745362566, + "grad_norm": 28.795790674378512, + "kl": 0.677734375, + "learning_rate": 6.984797297297297e-07, + "loss": 0.0007, + "reward": 3.082141876220703, + "reward_std": 0.22064311429858208, + "rewards/final_reward": 0.9153156300068633, + "rewards/mask_iou_reward": 0.45765781500343167, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0821418762207031, + "rewards/thk_ans_format_reward": 1.0, + "step": 1071, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.59375, + "epoch": 3.620573355817875, + "grad_norm": 11.444374570788154, + "kl": 0.810546875, + "learning_rate": 6.981981981981982e-07, + "loss": 0.0008, + "reward": 3.4599640369415283, + "reward_std": 0.1495438888669014, + "rewards/final_reward": 1.1625894442026803, + "rewards/mask_iou_reward": 0.5812947221013401, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4599639773368835, + "rewards/thk_ans_format_reward": 1.0, + "step": 1072, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.16666793823242, + "epoch": 3.6239460370994943, + "grad_norm": 12.101685454634838, + "kl": 0.658203125, + "learning_rate": 6.979166666666666e-07, + "loss": 0.0006, + "reward": 3.401353597640991, + "reward_std": 0.10515595600008965, + "rewards/final_reward": 0.9484960177048483, + "rewards/mask_iou_reward": 0.47424800885242413, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4013535976409912, + "rewards/thk_ans_format_reward": 1.0, + "step": 1073, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.63542175292969, + "epoch": 3.627318718381113, + "grad_norm": 14.46073592297596, + "kl": 0.82421875, + "learning_rate": 6.976351351351351e-07, + "loss": 0.0008, + "reward": 3.64565372467041, + "reward_std": 0.12325317412614822, + "rewards/final_reward": 1.910804574900559, + "rewards/mask_iou_reward": 0.9554022874502796, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6456536650657654, + "rewards/thk_ans_format_reward": 1.0, + "step": 1074, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.06250381469727, + "epoch": 3.630691399662732, + "grad_norm": 80.55259372670554, + "kl": 0.7421875, + "learning_rate": 6.973536036036036e-07, + "loss": 0.0008, + "reward": 3.5288643836975098, + "reward_std": 0.2130519635975361, + "rewards/final_reward": 1.7231965755366625, + "rewards/mask_iou_reward": 0.8615982877683313, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.549697756767273, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1075, + "think_completion_length": 5.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.72916793823242, + "epoch": 3.6340640809443507, + "grad_norm": 6.664651113291088, + "kl": 0.76953125, + "learning_rate": 6.970720720720721e-07, + "loss": 0.0008, + "reward": 3.5804613828659058, + "reward_std": 0.07663201168179512, + "rewards/final_reward": 1.1199998042376362, + "rewards/mask_iou_reward": 0.5599999021188181, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5804613828659058, + "rewards/thk_ans_format_reward": 1.0, + "step": 1076, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.85417175292969, + "epoch": 3.6374367622259696, + "grad_norm": 7.411611992399397, + "kl": 0.611328125, + "learning_rate": 6.967905405405406e-07, + "loss": 0.0006, + "reward": 3.0753601789474487, + "reward_std": 0.22175676375627518, + "rewards/final_reward": 1.190157277543239, + "rewards/mask_iou_reward": 0.5950786387716195, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0753599405288696, + "rewards/thk_ans_format_reward": 1.0, + "step": 1077, + "think_completion_length": 5.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.4479217529297, + "epoch": 3.6408094435075884, + "grad_norm": 16.851340992880754, + "kl": 0.826171875, + "learning_rate": 6.96509009009009e-07, + "loss": 0.0009, + "reward": 3.230084538459778, + "reward_std": 0.26128628849983215, + "rewards/final_reward": 0.0064829542036839855, + "rewards/mask_iou_reward": 0.0032414771018419927, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.2613343596458435, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1078, + "think_completion_length": 6.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.06250762939453, + "epoch": 3.6441821247892072, + "grad_norm": 24.82890569936366, + "kl": 0.671875, + "learning_rate": 6.962274774774775e-07, + "loss": 0.0007, + "reward": 3.4247443675994873, + "reward_std": 0.21899319719523191, + "rewards/final_reward": 1.5726510366981412, + "rewards/mask_iou_reward": 0.7863255183490706, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.455994427204132, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1079, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.11458587646484, + "epoch": 3.6475548060708265, + "grad_norm": 11.50113447748046, + "kl": 0.607421875, + "learning_rate": 6.95945945945946e-07, + "loss": 0.0006, + "reward": 3.3249796628952026, + "reward_std": 0.15425102412700653, + "rewards/final_reward": 1.3915881672936048, + "rewards/mask_iou_reward": 0.6957940836468024, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.324979841709137, + "rewards/thk_ans_format_reward": 1.0, + "step": 1080, + "think_completion_length": 5.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.95833587646484, + "epoch": 3.6509274873524453, + "grad_norm": 9.449392867624718, + "kl": 0.625, + "learning_rate": 6.956644144144144e-07, + "loss": 0.0006, + "reward": 3.43624210357666, + "reward_std": 0.01355983130633831, + "rewards/final_reward": 0.9758531922051863, + "rewards/mask_iou_reward": 0.48792659610259315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4362419247627258, + "rewards/thk_ans_format_reward": 1.0, + "step": 1081, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.88541793823242, + "epoch": 3.654300168634064, + "grad_norm": 9.488940964063072, + "kl": 0.666015625, + "learning_rate": 6.953828828828829e-07, + "loss": 0.0007, + "reward": 3.3464990854263306, + "reward_std": 0.30547909438610077, + "rewards/final_reward": 1.5841524038260197, + "rewards/mask_iou_reward": 0.7920762019130099, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3464989066123962, + "rewards/thk_ans_format_reward": 1.0, + "step": 1082, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.42708587646484, + "epoch": 3.657672849915683, + "grad_norm": 12.788530897249785, + "kl": 0.59375, + "learning_rate": 6.951013513513513e-07, + "loss": 0.0006, + "reward": 3.588488817214966, + "reward_std": 0.07584836706519127, + "rewards/final_reward": 1.5972509743272374, + "rewards/mask_iou_reward": 0.7986254871636187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5884888768196106, + "rewards/thk_ans_format_reward": 1.0, + "step": 1083, + "think_completion_length": 6.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.10416793823242, + "epoch": 3.661045531197302, + "grad_norm": 14.276790072775468, + "kl": 0.591796875, + "learning_rate": 6.948198198198198e-07, + "loss": 0.0006, + "reward": 3.343292236328125, + "reward_std": 0.1573808193206787, + "rewards/final_reward": 0.9708995355436782, + "rewards/mask_iou_reward": 0.4854497677718391, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3432920575141907, + "rewards/thk_ans_format_reward": 1.0, + "step": 1084, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.77083587646484, + "epoch": 3.6644182124789206, + "grad_norm": 13.768661675669374, + "kl": 0.70703125, + "learning_rate": 6.945382882882884e-07, + "loss": 0.0007, + "reward": 3.697332262992859, + "reward_std": 0.03931210841983557, + "rewards/final_reward": 1.948831729093463, + "rewards/mask_iou_reward": 0.9744158645467315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6973322629928589, + "rewards/thk_ans_format_reward": 1.0, + "step": 1085, + "think_completion_length": 5.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.03125381469727, + "epoch": 3.6677908937605395, + "grad_norm": 19.0330526800938, + "kl": 0.6484375, + "learning_rate": 6.942567567567568e-07, + "loss": 0.0006, + "reward": 3.194105863571167, + "reward_std": 0.17677438259124756, + "rewards/final_reward": 1.039079084824662, + "rewards/mask_iou_reward": 0.519539542412331, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1941059827804565, + "rewards/thk_ans_format_reward": 1.0, + "step": 1086, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.07291793823242, + "epoch": 3.6711635750421587, + "grad_norm": 15.123282614981838, + "kl": 0.625, + "learning_rate": 6.939752252252252e-07, + "loss": 0.0006, + "reward": 3.3549336194992065, + "reward_std": 0.0925322026014328, + "rewards/final_reward": 1.3952954506377155, + "rewards/mask_iou_reward": 0.6976477253188578, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.354933738708496, + "rewards/thk_ans_format_reward": 1.0, + "step": 1087, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.98959350585938, + "epoch": 3.6745362563237776, + "grad_norm": 24.47213890889503, + "kl": 0.55859375, + "learning_rate": 6.936936936936936e-07, + "loss": 0.0006, + "reward": 3.6780649423599243, + "reward_std": 0.15190696716308594, + "rewards/final_reward": 1.8513842679316488, + "rewards/mask_iou_reward": 0.9256921339658244, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6884815692901611, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1088, + "think_completion_length": 6.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.59375762939453, + "epoch": 3.6779089376053964, + "grad_norm": 7.552053296826617, + "kl": 0.595703125, + "learning_rate": 6.934121621621621e-07, + "loss": 0.0006, + "reward": 3.3988085985183716, + "reward_std": 0.21870959550142288, + "rewards/final_reward": 0.8550849830494813, + "rewards/mask_iou_reward": 0.42754249152474066, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3988087177276611, + "rewards/thk_ans_format_reward": 1.0, + "step": 1089, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.62500381469727, + "epoch": 3.681281618887015, + "grad_norm": 36.284952682642654, + "kl": 0.59765625, + "learning_rate": 6.931306306306306e-07, + "loss": 0.0006, + "reward": 3.5274451971054077, + "reward_std": 0.1046636514365673, + "rewards/final_reward": 1.7440145405320384, + "rewards/mask_iou_reward": 0.8720072702660192, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.527445137500763, + "rewards/thk_ans_format_reward": 1.0, + "step": 1090, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.83333587646484, + "epoch": 3.684654300168634, + "grad_norm": 10.87900442779485, + "kl": 0.6484375, + "learning_rate": 6.92849099099099e-07, + "loss": 0.0006, + "reward": 3.2674392461776733, + "reward_std": 0.2030637189745903, + "rewards/final_reward": 1.2155159261571356, + "rewards/mask_iou_reward": 0.6077579630785678, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.267439216375351, + "rewards/thk_ans_format_reward": 1.0, + "step": 1091, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.72916793823242, + "epoch": 3.688026981450253, + "grad_norm": 11.530580162530784, + "kl": 0.712890625, + "learning_rate": 6.925675675675675e-07, + "loss": 0.0007, + "reward": 3.406046509742737, + "reward_std": 0.15012041572481394, + "rewards/final_reward": 1.4677173188719124, + "rewards/mask_iou_reward": 0.7338586594359562, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4060462713241577, + "rewards/thk_ans_format_reward": 1.0, + "step": 1092, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.23958587646484, + "epoch": 3.6913996627318717, + "grad_norm": 8.836161855161931, + "kl": 0.62890625, + "learning_rate": 6.922860360360359e-07, + "loss": 0.0006, + "reward": 3.214850425720215, + "reward_std": 0.09776772558689117, + "rewards/final_reward": 1.3247986783728343, + "rewards/mask_iou_reward": 0.6623993391864171, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2148504257202148, + "rewards/thk_ans_format_reward": 1.0, + "step": 1093, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.40625, + "epoch": 3.694772344013491, + "grad_norm": 5.76477120156763, + "kl": 0.748046875, + "learning_rate": 6.920045045045044e-07, + "loss": 0.0008, + "reward": 3.4666571617126465, + "reward_std": 0.2039150409400463, + "rewards/final_reward": 1.340760683680751, + "rewards/mask_iou_reward": 0.6703803418403755, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4874904155731201, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1094, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.8229217529297, + "epoch": 3.6981450252951094, + "grad_norm": 9.585543183381706, + "kl": 0.587890625, + "learning_rate": 6.91722972972973e-07, + "loss": 0.0006, + "reward": 3.43148934841156, + "reward_std": 0.09372329898178577, + "rewards/final_reward": 0.9576623108718747, + "rewards/mask_iou_reward": 0.47883115543593735, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4314891397953033, + "rewards/thk_ans_format_reward": 1.0, + "step": 1095, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.21875, + "epoch": 3.7015177065767286, + "grad_norm": 10.038236194020055, + "kl": 0.791015625, + "learning_rate": 6.914414414414414e-07, + "loss": 0.0008, + "reward": 3.4386956691741943, + "reward_std": 0.08693969808518887, + "rewards/final_reward": 1.8579690886791975, + "rewards/mask_iou_reward": 0.9289845443395988, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4386956095695496, + "rewards/thk_ans_format_reward": 1.0, + "step": 1096, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.84375762939453, + "epoch": 3.7048903878583475, + "grad_norm": 109.9158361563393, + "kl": 0.537109375, + "learning_rate": 6.911599099099099e-07, + "loss": 0.0005, + "reward": 3.5106626749038696, + "reward_std": 0.051349299028515816, + "rewards/final_reward": 1.7499927058187152, + "rewards/mask_iou_reward": 0.8749963529093576, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5106624960899353, + "rewards/thk_ans_format_reward": 1.0, + "step": 1097, + "think_completion_length": 6.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.02083587646484, + "epoch": 3.7082630691399663, + "grad_norm": 10.121253017002285, + "kl": 0.66796875, + "learning_rate": 6.908783783783783e-07, + "loss": 0.0007, + "reward": 3.3785040378570557, + "reward_std": 0.2980985939502716, + "rewards/final_reward": 1.6061714274948828, + "rewards/mask_iou_reward": 0.8030857137474414, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.399337112903595, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1098, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.52083587646484, + "epoch": 3.711635750421585, + "grad_norm": 9.155132777155144, + "kl": 0.587890625, + "learning_rate": 6.905968468468468e-07, + "loss": 0.0006, + "reward": 3.364309310913086, + "reward_std": 0.1669931337237358, + "rewards/final_reward": 1.3568525625508892, + "rewards/mask_iou_reward": 0.6784262812754446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3643090724945068, + "rewards/thk_ans_format_reward": 1.0, + "step": 1099, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75000381469727, + "epoch": 3.715008431703204, + "grad_norm": 9.480165526801581, + "kl": 0.59375, + "learning_rate": 6.903153153153153e-07, + "loss": 0.0006, + "reward": 3.4969812631607056, + "reward_std": 0.16354385018348694, + "rewards/final_reward": 1.4303749695856478, + "rewards/mask_iou_reward": 0.7151874847928239, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.496981143951416, + "rewards/thk_ans_format_reward": 1.0, + "step": 1100, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.62500381469727, + "epoch": 3.718381112984823, + "grad_norm": 19.621153860163872, + "kl": 0.849609375, + "learning_rate": 6.900337837837837e-07, + "loss": 0.0009, + "reward": 3.537114977836609, + "reward_std": 0.07870265282690525, + "rewards/final_reward": 1.685322757336444, + "rewards/mask_iou_reward": 0.842661378668222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5371148586273193, + "rewards/thk_ans_format_reward": 1.0, + "step": 1101, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.55208587646484, + "epoch": 3.7217537942664416, + "grad_norm": 10.553586034745063, + "kl": 0.7578125, + "learning_rate": 6.897522522522522e-07, + "loss": 0.0008, + "reward": 3.653933048248291, + "reward_std": 0.10681610554456711, + "rewards/final_reward": 1.5648624304704355, + "rewards/mask_iou_reward": 0.7824312152352177, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6539331674575806, + "rewards/thk_ans_format_reward": 1.0, + "step": 1102, + "think_completion_length": 6.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.91666793823242, + "epoch": 3.725126475548061, + "grad_norm": 7.397855296650245, + "kl": 0.5859375, + "learning_rate": 6.894707207207207e-07, + "loss": 0.0006, + "reward": 3.233435034751892, + "reward_std": 0.25034917145967484, + "rewards/final_reward": 0.9141035635760388, + "rewards/mask_iou_reward": 0.4570517817880194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2334350645542145, + "rewards/thk_ans_format_reward": 1.0, + "step": 1103, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.81250762939453, + "epoch": 3.7284991568296797, + "grad_norm": 37.53188467605128, + "kl": 0.5380859375, + "learning_rate": 6.891891891891891e-07, + "loss": 0.0005, + "reward": 3.3896223306655884, + "reward_std": 0.2384193167090416, + "rewards/final_reward": 1.4327673652494468, + "rewards/mask_iou_reward": 0.7163836826247234, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4104554653167725, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1104, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.95833587646484, + "epoch": 3.7318718381112985, + "grad_norm": 9.254473164975776, + "kl": 0.556640625, + "learning_rate": 6.889076576576577e-07, + "loss": 0.0006, + "reward": 3.3896210193634033, + "reward_std": 0.13112148270010948, + "rewards/final_reward": 1.5128234298816556, + "rewards/mask_iou_reward": 0.7564117149408278, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3896209001541138, + "rewards/thk_ans_format_reward": 1.0, + "step": 1105, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.89583587646484, + "epoch": 3.7352445193929174, + "grad_norm": 23.84978331881755, + "kl": 0.56640625, + "learning_rate": 6.886261261261261e-07, + "loss": 0.0006, + "reward": 3.1603639125823975, + "reward_std": 0.07676676660776138, + "rewards/final_reward": 0.890607477137064, + "rewards/mask_iou_reward": 0.445303738568532, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.160364031791687, + "rewards/thk_ans_format_reward": 1.0, + "step": 1106, + "think_completion_length": 6.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.4791717529297, + "epoch": 3.738617200674536, + "grad_norm": 46.65639431978475, + "kl": 0.5625, + "learning_rate": 6.883445945945946e-07, + "loss": 0.0005, + "reward": 3.095438241958618, + "reward_std": 0.2738206684589386, + "rewards/final_reward": 0.7180541243761372, + "rewards/mask_iou_reward": 0.3590270621880686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.095438003540039, + "rewards/thk_ans_format_reward": 1.0, + "step": 1107, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.32292175292969, + "epoch": 3.741989881956155, + "grad_norm": 31.345126429693313, + "kl": 1.119140625, + "learning_rate": 6.880630630630631e-07, + "loss": 0.0011, + "reward": 3.4316246509552, + "reward_std": 0.08794242702424526, + "rewards/final_reward": 1.6411358926009716, + "rewards/mask_iou_reward": 0.8205679463004858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4316245317459106, + "rewards/thk_ans_format_reward": 1.0, + "step": 1108, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.71875, + "epoch": 3.745362563237774, + "grad_norm": 12.541312240734433, + "kl": 0.62109375, + "learning_rate": 6.877815315315315e-07, + "loss": 0.0006, + "reward": 3.412677526473999, + "reward_std": 0.2318500354886055, + "rewards/final_reward": 1.5392361667886947, + "rewards/mask_iou_reward": 0.7696180833943473, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.412677526473999, + "rewards/thk_ans_format_reward": 1.0, + "step": 1109, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.8541717529297, + "epoch": 3.748735244519393, + "grad_norm": 15.281198773897037, + "kl": 0.8828125, + "learning_rate": 6.875e-07, + "loss": 0.0009, + "reward": 3.305709719657898, + "reward_std": 0.31281551718711853, + "rewards/final_reward": 1.7111558294875617, + "rewards/mask_iou_reward": 0.8555779147437809, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.3682093620300293, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1110, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.1354217529297, + "epoch": 3.752107925801012, + "grad_norm": 18.422191479114865, + "kl": 0.423828125, + "learning_rate": 6.872184684684684e-07, + "loss": 0.0004, + "reward": 3.6392204761505127, + "reward_std": 0.04167993552982807, + "rewards/final_reward": 1.9108645855743962, + "rewards/mask_iou_reward": 0.9554322927871981, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.639220416545868, + "rewards/thk_ans_format_reward": 1.0, + "step": 1111, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.2291717529297, + "epoch": 3.7554806070826308, + "grad_norm": 12.32647367095304, + "kl": 0.6015625, + "learning_rate": 6.869369369369369e-07, + "loss": 0.0006, + "reward": 3.3169971704483032, + "reward_std": 0.07655757665634155, + "rewards/final_reward": 1.1560575467479777, + "rewards/mask_iou_reward": 0.5780287733739888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.316997230052948, + "rewards/thk_ans_format_reward": 1.0, + "step": 1112, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.50000762939453, + "epoch": 3.7588532883642496, + "grad_norm": 16.190413962214407, + "kl": 0.5869140625, + "learning_rate": 6.866554054054054e-07, + "loss": 0.0006, + "reward": 3.449712872505188, + "reward_std": 0.16217003017663956, + "rewards/final_reward": 1.5921062061509579, + "rewards/mask_iou_reward": 0.7960531030754789, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4497130513191223, + "rewards/thk_ans_format_reward": 1.0, + "step": 1113, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25000762939453, + "epoch": 3.7622259696458684, + "grad_norm": 22.800503624529753, + "kl": 0.560546875, + "learning_rate": 6.863738738738738e-07, + "loss": 0.0005, + "reward": 3.677310585975647, + "reward_std": 0.05152285099029541, + "rewards/final_reward": 1.4880345993776913, + "rewards/mask_iou_reward": 0.7440172996888457, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6773105263710022, + "rewards/thk_ans_format_reward": 1.0, + "step": 1114, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.625, + "epoch": 3.7655986509274872, + "grad_norm": 19.32782592827037, + "kl": 0.658203125, + "learning_rate": 6.860923423423423e-07, + "loss": 0.0007, + "reward": 3.0054088830947876, + "reward_std": 0.04576574079692364, + "rewards/final_reward": 0.54155883498149, + "rewards/mask_iou_reward": 0.270779417490745, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0054087042808533, + "rewards/thk_ans_format_reward": 1.0, + "step": 1115, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.9479217529297, + "epoch": 3.768971332209106, + "grad_norm": 8.843701911133191, + "kl": 0.5390625, + "learning_rate": 6.858108108108109e-07, + "loss": 0.0005, + "reward": 3.573105573654175, + "reward_std": 0.1294175200164318, + "rewards/final_reward": 1.6244050962650063, + "rewards/mask_iou_reward": 0.8122025481325031, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5731057524681091, + "rewards/thk_ans_format_reward": 1.0, + "step": 1116, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.25000762939453, + "epoch": 3.7723440134907253, + "grad_norm": 83.32854232345196, + "kl": 0.533203125, + "learning_rate": 6.855292792792793e-07, + "loss": 0.0005, + "reward": 3.2631919384002686, + "reward_std": 0.10688711702823639, + "rewards/final_reward": 1.4339104671690461, + "rewards/mask_iou_reward": 0.7169552335845231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2631917893886566, + "rewards/thk_ans_format_reward": 1.0, + "step": 1117, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.54167175292969, + "epoch": 3.775716694772344, + "grad_norm": 9.126275736652579, + "kl": 0.59375, + "learning_rate": 6.852477477477478e-07, + "loss": 0.0006, + "reward": 3.5523250102996826, + "reward_std": 0.1462704762816429, + "rewards/final_reward": 1.4037499565724185, + "rewards/mask_iou_reward": 0.7018749782862093, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5523249506950378, + "rewards/thk_ans_format_reward": 1.0, + "step": 1118, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.27083587646484, + "epoch": 3.779089376053963, + "grad_norm": 9.693226489065822, + "kl": 0.5087890625, + "learning_rate": 6.849662162162162e-07, + "loss": 0.0005, + "reward": 3.363922119140625, + "reward_std": 0.0753621906042099, + "rewards/final_reward": 1.296634945318167, + "rewards/mask_iou_reward": 0.6483174726590835, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3639219999313354, + "rewards/thk_ans_format_reward": 1.0, + "step": 1119, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0104217529297, + "epoch": 3.782462057335582, + "grad_norm": 15.448804342717906, + "kl": 0.56640625, + "learning_rate": 6.846846846846847e-07, + "loss": 0.0006, + "reward": 3.552070379257202, + "reward_std": 0.3433392718434334, + "rewards/final_reward": 1.423108607564154, + "rewards/mask_iou_reward": 0.711554303782077, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.593737006187439, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1120, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.65625762939453, + "epoch": 3.7858347386172007, + "grad_norm": 13.346373731247276, + "kl": 0.453125, + "learning_rate": 6.844031531531532e-07, + "loss": 0.0004, + "reward": 3.4322853088378906, + "reward_std": 0.5301657021045685, + "rewards/final_reward": 1.3762657362921482, + "rewards/mask_iou_reward": 0.6881328681460741, + "rewards/sam_format_reward": 0.9375000298023224, + "rewards/sam_reward_func_ultra": 1.5572853684425354, + "rewards/thk_ans_format_reward": 0.9375000298023224, + "step": 1121, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.53125381469727, + "epoch": 3.7892074198988195, + "grad_norm": 31.74734560119016, + "kl": 0.4892578125, + "learning_rate": 6.841216216216216e-07, + "loss": 0.0005, + "reward": 3.7686702013015747, + "reward_std": 0.025162406265735626, + "rewards/final_reward": 1.9371686420295529, + "rewards/mask_iou_reward": 0.9685843210147764, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7686700820922852, + "rewards/thk_ans_format_reward": 1.0, + "step": 1122, + "think_completion_length": 7.166666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.70833587646484, + "epoch": 3.7925801011804383, + "grad_norm": 22.51580592703938, + "kl": 0.548828125, + "learning_rate": 6.838400900900901e-07, + "loss": 0.0006, + "reward": 2.8940176963806152, + "reward_std": 0.4165959060192108, + "rewards/final_reward": 1.1415240073547133, + "rewards/mask_iou_reward": 0.5707620036773566, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 0.9981842637062073, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 1123, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.9479217529297, + "epoch": 3.7959527824620576, + "grad_norm": 14.567123366233686, + "kl": 0.5703125, + "learning_rate": 6.835585585585585e-07, + "loss": 0.0006, + "reward": 3.60901141166687, + "reward_std": 0.22202441096305847, + "rewards/final_reward": 1.7287610565966587, + "rewards/mask_iou_reward": 0.8643805282983293, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6298444271087646, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1124, + "think_completion_length": 7.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.93750381469727, + "epoch": 3.799325463743676, + "grad_norm": 13.533824935426894, + "kl": 0.75390625, + "learning_rate": 6.832770270270269e-07, + "loss": 0.0008, + "reward": 3.3092023134231567, + "reward_std": 0.11465185135602951, + "rewards/final_reward": 1.5839463106212694, + "rewards/mask_iou_reward": 0.7919731553106347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3092021346092224, + "rewards/thk_ans_format_reward": 1.0, + "step": 1125, + "think_completion_length": 10.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.09375381469727, + "epoch": 3.8026981450252952, + "grad_norm": 6.412786524388538, + "kl": 0.59765625, + "learning_rate": 6.829954954954955e-07, + "loss": 0.0006, + "reward": 3.368514060974121, + "reward_std": 0.20392443984746933, + "rewards/final_reward": 1.5844456702050551, + "rewards/mask_iou_reward": 0.7922228351025276, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3893473744392395, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1126, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.47916793823242, + "epoch": 3.806070826306914, + "grad_norm": 13.208236486503731, + "kl": 0.71484375, + "learning_rate": 6.827139639639639e-07, + "loss": 0.0007, + "reward": 3.238366961479187, + "reward_std": 0.18330181390047073, + "rewards/final_reward": 0.8698805224904762, + "rewards/mask_iou_reward": 0.4349402612452381, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.238366961479187, + "rewards/thk_ans_format_reward": 1.0, + "step": 1127, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.86458587646484, + "epoch": 3.809443507588533, + "grad_norm": 137.55010232326637, + "kl": 0.642578125, + "learning_rate": 6.824324324324324e-07, + "loss": 0.0006, + "reward": 3.424420475959778, + "reward_std": 0.15352077782154083, + "rewards/final_reward": 1.3288265654138773, + "rewards/mask_iou_reward": 0.6644132827069387, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4244202971458435, + "rewards/thk_ans_format_reward": 1.0, + "step": 1128, + "think_completion_length": 9.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.46875381469727, + "epoch": 3.8128161888701517, + "grad_norm": 16.05562555438284, + "kl": 0.544921875, + "learning_rate": 6.821509009009008e-07, + "loss": 0.0005, + "reward": 3.5405032634735107, + "reward_std": 0.09723260626196861, + "rewards/final_reward": 1.4643371725248082, + "rewards/mask_iou_reward": 0.7321685862624041, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.540503203868866, + "rewards/thk_ans_format_reward": 1.0, + "step": 1129, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.59375381469727, + "epoch": 3.8161888701517706, + "grad_norm": 17.866431008873636, + "kl": 0.59375, + "learning_rate": 6.818693693693693e-07, + "loss": 0.0006, + "reward": 3.3343998193740845, + "reward_std": 0.20225829631090164, + "rewards/final_reward": 1.8135839218908014, + "rewards/mask_iou_reward": 0.9067919609454007, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3343996405601501, + "rewards/thk_ans_format_reward": 1.0, + "step": 1130, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.55208587646484, + "epoch": 3.8195615514333894, + "grad_norm": 12.75502597179819, + "kl": 0.599609375, + "learning_rate": 6.815878378378378e-07, + "loss": 0.0006, + "reward": 3.1991249322891235, + "reward_std": 0.11675117909908295, + "rewards/final_reward": 1.029759909468699, + "rewards/mask_iou_reward": 0.5148799547343496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.199124813079834, + "rewards/thk_ans_format_reward": 1.0, + "step": 1131, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.95833587646484, + "epoch": 3.822934232715008, + "grad_norm": 25.249214949208973, + "kl": 0.60546875, + "learning_rate": 6.813063063063062e-07, + "loss": 0.0006, + "reward": 3.5785523653030396, + "reward_std": 0.03312146570533514, + "rewards/final_reward": 1.8504287888354827, + "rewards/mask_iou_reward": 0.9252143944177413, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5785521268844604, + "rewards/thk_ans_format_reward": 1.0, + "step": 1132, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.6875, + "epoch": 3.8263069139966275, + "grad_norm": 10.199327918569953, + "kl": 0.80078125, + "learning_rate": 6.810247747747747e-07, + "loss": 0.0008, + "reward": 3.3085036277770996, + "reward_std": 0.22457706183195114, + "rewards/final_reward": 1.6140251674854684, + "rewards/mask_iou_reward": 0.8070125837427342, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3293370604515076, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1133, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.25, + "epoch": 3.8296795952782463, + "grad_norm": 10.183729612731424, + "kl": 0.8359375, + "learning_rate": 6.807432432432431e-07, + "loss": 0.0008, + "reward": 3.356192469596863, + "reward_std": 0.20595254004001617, + "rewards/final_reward": 1.705807672602365, + "rewards/mask_iou_reward": 0.8529038363011825, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.356192409992218, + "rewards/thk_ans_format_reward": 1.0, + "step": 1134, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.73958587646484, + "epoch": 3.833052276559865, + "grad_norm": 25.931712730614123, + "kl": 0.607421875, + "learning_rate": 6.804617117117116e-07, + "loss": 0.0006, + "reward": 3.551330327987671, + "reward_std": 0.11808786168694496, + "rewards/final_reward": 1.6091930003779598, + "rewards/mask_iou_reward": 0.8045965001889799, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.551330327987671, + "rewards/thk_ans_format_reward": 1.0, + "step": 1135, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.8645896911621, + "epoch": 3.836424957841484, + "grad_norm": 12.010928338808759, + "kl": 0.564453125, + "learning_rate": 6.801801801801802e-07, + "loss": 0.0006, + "reward": 3.350279211997986, + "reward_std": 0.2695094048976898, + "rewards/final_reward": 1.469435863031313, + "rewards/mask_iou_reward": 0.7347179315156565, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.4336124062538147, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 1136, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.36458587646484, + "epoch": 3.839797639123103, + "grad_norm": 10.129838501473719, + "kl": 0.580078125, + "learning_rate": 6.798986486486486e-07, + "loss": 0.0006, + "reward": 3.593467593193054, + "reward_std": 0.1148192435503006, + "rewards/final_reward": 1.7003924569798057, + "rewards/mask_iou_reward": 0.8501962284899028, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.614300787448883, + "rewards/thk_ans_format_reward": 1.0, + "step": 1137, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.48958587646484, + "epoch": 3.8431703204047216, + "grad_norm": 12.924062041974949, + "kl": 0.58984375, + "learning_rate": 6.796171171171171e-07, + "loss": 0.0006, + "reward": 3.5154197216033936, + "reward_std": 0.09570467099547386, + "rewards/final_reward": 1.6768668541562382, + "rewards/mask_iou_reward": 0.8384334270781191, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5154194235801697, + "rewards/thk_ans_format_reward": 1.0, + "step": 1138, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.0104217529297, + "epoch": 3.8465430016863404, + "grad_norm": 11.279230400648318, + "kl": 0.525390625, + "learning_rate": 6.793355855855856e-07, + "loss": 0.0005, + "reward": 3.1914998292922974, + "reward_std": 0.2051123920828104, + "rewards/final_reward": 1.6003443347049353, + "rewards/mask_iou_reward": 0.8001721673524677, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.212332844734192, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1139, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.59375381469727, + "epoch": 3.8499156829679597, + "grad_norm": 33.25053767645448, + "kl": 0.619140625, + "learning_rate": 6.79054054054054e-07, + "loss": 0.0006, + "reward": 3.5866551399230957, + "reward_std": 0.10260298103094101, + "rewards/final_reward": 1.4827233010826073, + "rewards/mask_iou_reward": 0.7413616505413037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.586655080318451, + "rewards/thk_ans_format_reward": 1.0, + "step": 1140, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.68750762939453, + "epoch": 3.8532883642495785, + "grad_norm": 14.008249722826953, + "kl": 0.5419921875, + "learning_rate": 6.787725225225225e-07, + "loss": 0.0005, + "reward": 3.2890868186950684, + "reward_std": 0.273948322981596, + "rewards/final_reward": 1.393363380868561, + "rewards/mask_iou_reward": 0.6966816904342805, + "rewards/sam_format_reward": 0.90625, + "rewards/sam_reward_func_ultra": 1.4765866994857788, + "rewards/thk_ans_format_reward": 0.90625, + "step": 1141, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.95833587646484, + "epoch": 3.8566610455311974, + "grad_norm": 10.579622025176663, + "kl": 0.59765625, + "learning_rate": 6.784909909909909e-07, + "loss": 0.0006, + "reward": 3.4296700954437256, + "reward_std": 0.18933508545160294, + "rewards/final_reward": 1.7752847816498374, + "rewards/mask_iou_reward": 0.8876423908249187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4296702146530151, + "rewards/thk_ans_format_reward": 1.0, + "step": 1142, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.83333587646484, + "epoch": 3.860033726812816, + "grad_norm": 17.479380467691797, + "kl": 0.654296875, + "learning_rate": 6.782094594594594e-07, + "loss": 0.0007, + "reward": 3.2466046810150146, + "reward_std": 0.2565118744969368, + "rewards/final_reward": 1.645865671176606, + "rewards/mask_iou_reward": 0.822932835588303, + "rewards/sam_format_reward": 0.90625, + "rewards/sam_reward_func_ultra": 1.4341047406196594, + "rewards/thk_ans_format_reward": 0.90625, + "step": 1143, + "think_completion_length": 10.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.7604217529297, + "epoch": 3.863406408094435, + "grad_norm": 20.262099881784287, + "kl": 0.564453125, + "learning_rate": 6.779279279279279e-07, + "loss": 0.0006, + "reward": 3.3922996520996094, + "reward_std": 0.3344555199146271, + "rewards/final_reward": 1.0211181931399709, + "rewards/mask_iou_reward": 0.5105590965699854, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.4652163982391357, + "rewards/thk_ans_format_reward": 0.9687500298023224, + "step": 1144, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.77083587646484, + "epoch": 3.866779089376054, + "grad_norm": 12.989026006240904, + "kl": 0.73046875, + "learning_rate": 6.776463963963963e-07, + "loss": 0.0007, + "reward": 3.320478320121765, + "reward_std": 0.25647711753845215, + "rewards/final_reward": 0.9190338004926744, + "rewards/mask_iou_reward": 0.4595169002463372, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.32047837972641, + "rewards/thk_ans_format_reward": 1.0, + "step": 1145, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.95833587646484, + "epoch": 3.8701517706576727, + "grad_norm": 10.956469240134854, + "kl": 0.5625, + "learning_rate": 6.773648648648649e-07, + "loss": 0.0006, + "reward": 3.03474223613739, + "reward_std": 0.12496957927942276, + "rewards/final_reward": 0.44723209186124996, + "rewards/mask_iou_reward": 0.22361604593062498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.034742295742035, + "rewards/thk_ans_format_reward": 1.0, + "step": 1146, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.0625, + "epoch": 3.873524451939292, + "grad_norm": 7.76556935584102, + "kl": 0.61328125, + "learning_rate": 6.770833333333333e-07, + "loss": 0.0006, + "reward": 3.484253764152527, + "reward_std": 0.09405850991606712, + "rewards/final_reward": 1.3491039962728166, + "rewards/mask_iou_reward": 0.6745519981364083, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4842538833618164, + "rewards/thk_ans_format_reward": 1.0, + "step": 1147, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.17708587646484, + "epoch": 3.876897133220911, + "grad_norm": 16.52118909760354, + "kl": 0.5859375, + "learning_rate": 6.768018018018018e-07, + "loss": 0.0006, + "reward": 3.336282253265381, + "reward_std": 0.23352781683206558, + "rewards/final_reward": 1.4455606382828008, + "rewards/mask_iou_reward": 0.7227803191414004, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.377948820590973, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1148, + "think_completion_length": 10.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.6666717529297, + "epoch": 3.8802698145025296, + "grad_norm": 26.974443259321365, + "kl": 0.623046875, + "learning_rate": 6.765202702702703e-07, + "loss": 0.0006, + "reward": 3.422439694404602, + "reward_std": 0.20345600694417953, + "rewards/final_reward": 1.8651120869182578, + "rewards/mask_iou_reward": 0.9325560434591289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4224395751953125, + "rewards/thk_ans_format_reward": 1.0, + "step": 1149, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.21875381469727, + "epoch": 3.8836424957841484, + "grad_norm": 8.892420371329939, + "kl": 0.587890625, + "learning_rate": 6.762387387387387e-07, + "loss": 0.0006, + "reward": 3.4350632429122925, + "reward_std": 0.11019621044397354, + "rewards/final_reward": 1.9606659313791421, + "rewards/mask_iou_reward": 0.9803329656895711, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.435063362121582, + "rewards/thk_ans_format_reward": 1.0, + "step": 1150, + "think_completion_length": 10.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.4791717529297, + "epoch": 3.8870151770657673, + "grad_norm": 10.598498090981675, + "kl": 0.669921875, + "learning_rate": 6.759572072072072e-07, + "loss": 0.0007, + "reward": 3.4017633199691772, + "reward_std": 0.05746646970510483, + "rewards/final_reward": 1.1165454395140448, + "rewards/mask_iou_reward": 0.5582727197570224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4017632603645325, + "rewards/thk_ans_format_reward": 1.0, + "step": 1151, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.9166717529297, + "epoch": 3.890387858347386, + "grad_norm": 8.414786648638504, + "kl": 0.556640625, + "learning_rate": 6.756756756756756e-07, + "loss": 0.0006, + "reward": 3.472022771835327, + "reward_std": 0.09950246475636959, + "rewards/final_reward": 1.4637774236557763, + "rewards/mask_iou_reward": 0.7318887118278882, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4720227718353271, + "rewards/thk_ans_format_reward": 1.0, + "step": 1152, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.32292175292969, + "epoch": 3.893760539629005, + "grad_norm": 8.27522608916324, + "kl": 0.630859375, + "learning_rate": 6.753941441441441e-07, + "loss": 0.0006, + "reward": 3.805325150489807, + "reward_std": 0.0928366631269455, + "rewards/final_reward": 1.8977951290927875, + "rewards/mask_iou_reward": 0.9488975645463937, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8053249716758728, + "rewards/thk_ans_format_reward": 1.0, + "step": 1153, + "think_completion_length": 9.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.625, + "epoch": 3.897133220910624, + "grad_norm": 22.119312379954955, + "kl": 0.642578125, + "learning_rate": 6.751126126126126e-07, + "loss": 0.0007, + "reward": 3.390862822532654, + "reward_std": 0.2109134942293167, + "rewards/final_reward": 1.815931880147895, + "rewards/mask_iou_reward": 0.9079659400739475, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4116960167884827, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1154, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.8854217529297, + "epoch": 3.9005059021922426, + "grad_norm": 37.14682144067229, + "kl": 0.572265625, + "learning_rate": 6.74831081081081e-07, + "loss": 0.0006, + "reward": 3.4662243127822876, + "reward_std": 0.09864114969968796, + "rewards/final_reward": 1.7343714224455264, + "rewards/mask_iou_reward": 0.8671857112227632, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4662241339683533, + "rewards/thk_ans_format_reward": 1.0, + "step": 1155, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.58333587646484, + "epoch": 3.903878583473862, + "grad_norm": 10.005914797429432, + "kl": 0.52734375, + "learning_rate": 6.745495495495496e-07, + "loss": 0.0005, + "reward": 3.347672939300537, + "reward_std": 0.11771451123058796, + "rewards/final_reward": 1.4089836477985758, + "rewards/mask_iou_reward": 0.7044918238992879, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3580895364284515, + "rewards/thk_ans_format_reward": 1.0, + "step": 1156, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.55208587646484, + "epoch": 3.9072512647554807, + "grad_norm": 9.853558177575996, + "kl": 0.599609375, + "learning_rate": 6.742680180180181e-07, + "loss": 0.0006, + "reward": 3.4117908477783203, + "reward_std": 0.0272050928324461, + "rewards/final_reward": 1.1120983698610167, + "rewards/mask_iou_reward": 0.5560491849305084, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4117907881736755, + "rewards/thk_ans_format_reward": 1.0, + "step": 1157, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.14583587646484, + "epoch": 3.9106239460370995, + "grad_norm": 11.277941537448397, + "kl": 0.548828125, + "learning_rate": 6.739864864864865e-07, + "loss": 0.0006, + "reward": 3.5751309394836426, + "reward_std": 0.08195750042796135, + "rewards/final_reward": 1.4355375639291883, + "rewards/mask_iou_reward": 0.7177687819645941, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5751309394836426, + "rewards/thk_ans_format_reward": 1.0, + "step": 1158, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.5729217529297, + "epoch": 3.9139966273187183, + "grad_norm": 9.054297786946076, + "kl": 0.640625, + "learning_rate": 6.73704954954955e-07, + "loss": 0.0006, + "reward": 3.4315420389175415, + "reward_std": 0.17010553926229477, + "rewards/final_reward": 1.4478197334246705, + "rewards/mask_iou_reward": 0.7239098667123353, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4523754119873047, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1159, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.53125762939453, + "epoch": 3.917369308600337, + "grad_norm": 31.29774946350612, + "kl": 0.57421875, + "learning_rate": 6.734234234234234e-07, + "loss": 0.0006, + "reward": 3.2284148931503296, + "reward_std": 0.22033357620239258, + "rewards/final_reward": 1.4817521564526, + "rewards/mask_iou_reward": 0.7408760782263, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.249248206615448, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1160, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.40625, + "epoch": 3.920741989881956, + "grad_norm": 10.40583238132195, + "kl": 0.580078125, + "learning_rate": 6.731418918918919e-07, + "loss": 0.0006, + "reward": 3.0834414958953857, + "reward_std": 0.29399073868989944, + "rewards/final_reward": 1.1915881620589188, + "rewards/mask_iou_reward": 0.5957940810294594, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.2084414660930634, + "rewards/thk_ans_format_reward": 0.9375, + "step": 1161, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.65625381469727, + "epoch": 3.924114671163575, + "grad_norm": 160.03568478542434, + "kl": 0.537109375, + "learning_rate": 6.728603603603604e-07, + "loss": 0.0005, + "reward": 3.5287028551101685, + "reward_std": 0.06423872895538807, + "rewards/final_reward": 0.5818589507508326, + "rewards/mask_iou_reward": 0.2909294753754163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5287025570869446, + "rewards/thk_ans_format_reward": 1.0, + "step": 1162, + "think_completion_length": 9.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.28125381469727, + "epoch": 3.927487352445194, + "grad_norm": 9.822659849234423, + "kl": 1.068359375, + "learning_rate": 6.725788288288288e-07, + "loss": 0.0011, + "reward": 3.2519426345825195, + "reward_std": 0.10252987593412399, + "rewards/final_reward": 0.06949677758991413, + "rewards/mask_iou_reward": 0.034748388794957064, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2519423365592957, + "rewards/thk_ans_format_reward": 1.0, + "step": 1163, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.70833587646484, + "epoch": 3.930860033726813, + "grad_norm": 10.798828216838599, + "kl": 0.744140625, + "learning_rate": 6.722972972972972e-07, + "loss": 0.0008, + "reward": 3.3033626079559326, + "reward_std": 0.07960717380046844, + "rewards/final_reward": 1.6421366016422263, + "rewards/mask_iou_reward": 0.8210683008211132, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3033625483512878, + "rewards/thk_ans_format_reward": 1.0, + "step": 1164, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.65625381469727, + "epoch": 3.9342327150084317, + "grad_norm": 16.320809642833645, + "kl": 0.669921875, + "learning_rate": 6.720157657657656e-07, + "loss": 0.0007, + "reward": 3.606690526008606, + "reward_std": 0.24704181402921677, + "rewards/final_reward": 1.3225213951972898, + "rewards/mask_iou_reward": 0.6612606975986449, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.6900236010551453, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 1165, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.29166793823242, + "epoch": 3.9376053962900506, + "grad_norm": 35.63580913868573, + "kl": 0.615234375, + "learning_rate": 6.717342342342342e-07, + "loss": 0.0006, + "reward": 3.183878540992737, + "reward_std": 0.1320716105401516, + "rewards/final_reward": 1.3518959103213104, + "rewards/mask_iou_reward": 0.6759479551606552, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1838783025741577, + "rewards/thk_ans_format_reward": 1.0, + "step": 1166, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.32291793823242, + "epoch": 3.9409780775716694, + "grad_norm": 16.859281176433807, + "kl": 0.638671875, + "learning_rate": 6.714527027027027e-07, + "loss": 0.0006, + "reward": 3.1818827390670776, + "reward_std": 0.09869653731584549, + "rewards/final_reward": 1.4249743612102743, + "rewards/mask_iou_reward": 0.7124871806051372, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1818827390670776, + "rewards/thk_ans_format_reward": 1.0, + "step": 1167, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.15625, + "epoch": 3.9443507588532882, + "grad_norm": 11.903269690933106, + "kl": 0.63671875, + "learning_rate": 6.711711711711711e-07, + "loss": 0.0006, + "reward": 3.1614683866500854, + "reward_std": 0.14515875279903412, + "rewards/final_reward": 1.5640988689443378, + "rewards/mask_iou_reward": 0.7820494344721689, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1614683866500854, + "rewards/thk_ans_format_reward": 1.0, + "step": 1168, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.52083587646484, + "epoch": 3.947723440134907, + "grad_norm": 18.45401892617275, + "kl": 0.546875, + "learning_rate": 6.708896396396396e-07, + "loss": 0.0005, + "reward": 3.2433966398239136, + "reward_std": 0.06726586446166039, + "rewards/final_reward": 1.529449017547845, + "rewards/mask_iou_reward": 0.7647245087739225, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2433964610099792, + "rewards/thk_ans_format_reward": 1.0, + "step": 1169, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.69792175292969, + "epoch": 3.9510961214165263, + "grad_norm": 28.4186280214568, + "kl": 0.63671875, + "learning_rate": 6.70608108108108e-07, + "loss": 0.0006, + "reward": 3.447712779045105, + "reward_std": 0.09034018777310848, + "rewards/final_reward": 1.7980894564169296, + "rewards/mask_iou_reward": 0.8990447282084648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4477129578590393, + "rewards/thk_ans_format_reward": 1.0, + "step": 1170, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.95833969116211, + "epoch": 3.954468802698145, + "grad_norm": 18.817439024173634, + "kl": 0.537109375, + "learning_rate": 6.703265765765765e-07, + "loss": 0.0005, + "reward": 3.0330607891082764, + "reward_std": 0.08898946642875671, + "rewards/final_reward": 0.7484329408679943, + "rewards/mask_iou_reward": 0.37421647043399714, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0330607891082764, + "rewards/thk_ans_format_reward": 1.0, + "step": 1171, + "think_completion_length": 5.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.2187614440918, + "epoch": 3.957841483979764, + "grad_norm": 10.101315100221358, + "kl": 0.56640625, + "learning_rate": 6.70045045045045e-07, + "loss": 0.0006, + "reward": 3.3744516372680664, + "reward_std": 0.2819393612444401, + "rewards/final_reward": 1.3392306228334026, + "rewards/mask_iou_reward": 0.6696153114167013, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.499451756477356, + "rewards/thk_ans_format_reward": 0.9375, + "step": 1172, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.08333587646484, + "epoch": 3.961214165261383, + "grad_norm": 8.184313790203422, + "kl": 0.583984375, + "learning_rate": 6.697635135135134e-07, + "loss": 0.0006, + "reward": 3.3349242210388184, + "reward_std": 0.1057177446782589, + "rewards/final_reward": 1.6384289076635299, + "rewards/mask_iou_reward": 0.8192144538317649, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3349245190620422, + "rewards/thk_ans_format_reward": 1.0, + "step": 1173, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.87500381469727, + "epoch": 3.9645868465430016, + "grad_norm": 16.448266125331322, + "kl": 0.57421875, + "learning_rate": 6.694819819819819e-07, + "loss": 0.0006, + "reward": 3.4961599111557007, + "reward_std": 0.09959017485380173, + "rewards/final_reward": 1.3022567383415606, + "rewards/mask_iou_reward": 0.6511283691707803, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.496160089969635, + "rewards/thk_ans_format_reward": 1.0, + "step": 1174, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.13541793823242, + "epoch": 3.9679595278246205, + "grad_norm": 10.897067411417096, + "kl": 0.63671875, + "learning_rate": 6.692004504504503e-07, + "loss": 0.0006, + "reward": 3.00723659992218, + "reward_std": 0.07285760110244155, + "rewards/final_reward": 0.502330371989074, + "rewards/mask_iou_reward": 0.251165185994537, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0072364211082458, + "rewards/thk_ans_format_reward": 1.0, + "step": 1175, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.47916793823242, + "epoch": 3.9713322091062393, + "grad_norm": 20.446752251210203, + "kl": 0.619140625, + "learning_rate": 6.689189189189189e-07, + "loss": 0.0006, + "reward": 3.4216020107269287, + "reward_std": 0.1204568762332201, + "rewards/final_reward": 1.6239261922127697, + "rewards/mask_iou_reward": 0.8119630961063848, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4216020107269287, + "rewards/thk_ans_format_reward": 1.0, + "step": 1176, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.83333587646484, + "epoch": 3.9747048903878586, + "grad_norm": 10.013549314476817, + "kl": 0.75, + "learning_rate": 6.686373873873874e-07, + "loss": 0.0008, + "reward": 3.0989030599594116, + "reward_std": 0.10697172209620476, + "rewards/final_reward": 0.9797569255011491, + "rewards/mask_iou_reward": 0.48987846275057456, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0989029705524445, + "rewards/thk_ans_format_reward": 1.0, + "step": 1177, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.34375381469727, + "epoch": 3.9780775716694774, + "grad_norm": 11.631293297558877, + "kl": 0.90234375, + "learning_rate": 6.683558558558558e-07, + "loss": 0.0009, + "reward": 3.0180094242095947, + "reward_std": 0.12886736541986465, + "rewards/final_reward": 1.0945630209917057, + "rewards/mask_iou_reward": 0.5472815104958528, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0180092453956604, + "rewards/thk_ans_format_reward": 1.0, + "step": 1178, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.9375, + "epoch": 3.9814502529510962, + "grad_norm": 10.991482902331736, + "kl": 0.55859375, + "learning_rate": 6.680743243243243e-07, + "loss": 0.0006, + "reward": 3.6170132160186768, + "reward_std": 0.1302567794919014, + "rewards/final_reward": 1.4590375447159973, + "rewards/mask_iou_reward": 0.7295187723579987, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6274299025535583, + "rewards/thk_ans_format_reward": 1.0, + "step": 1179, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0208396911621, + "epoch": 3.984822934232715, + "grad_norm": 6.800394388342199, + "kl": 0.65625, + "learning_rate": 6.677927927927928e-07, + "loss": 0.0007, + "reward": 3.363734722137451, + "reward_std": 0.015621137339621782, + "rewards/final_reward": 1.6003017140478155, + "rewards/mask_iou_reward": 0.8001508570239078, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3637345433235168, + "rewards/thk_ans_format_reward": 1.0, + "step": 1180, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.60416793823242, + "epoch": 3.988195615514334, + "grad_norm": 19.201489375835788, + "kl": 0.6796875, + "learning_rate": 6.675112612612612e-07, + "loss": 0.0007, + "reward": 3.5164124965667725, + "reward_std": 0.12926794216036797, + "rewards/final_reward": 1.8086767008232651, + "rewards/mask_iou_reward": 0.9043383504116326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.516412377357483, + "rewards/thk_ans_format_reward": 1.0, + "step": 1181, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.03125381469727, + "epoch": 3.9915682967959527, + "grad_norm": 15.91782373980905, + "kl": 0.646484375, + "learning_rate": 6.672297297297297e-07, + "loss": 0.0006, + "reward": 3.5790776014328003, + "reward_std": 0.07250684313476086, + "rewards/final_reward": 1.217705241658552, + "rewards/mask_iou_reward": 0.608852620829276, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5790774822235107, + "rewards/thk_ans_format_reward": 1.0, + "step": 1182, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.0729217529297, + "epoch": 3.9949409780775715, + "grad_norm": 11.389974166745496, + "kl": 0.486328125, + "learning_rate": 6.669481981981981e-07, + "loss": 0.0006, + "reward": 3.757065773010254, + "reward_std": 0.0599273145198822, + "rewards/final_reward": 1.9625319193708353, + "rewards/mask_iou_reward": 0.9812659596854176, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.75706547498703, + "rewards/thk_ans_format_reward": 1.0, + "step": 1183, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.6315803527832, + "epoch": 3.998313659359191, + "grad_norm": 15.93861202741278, + "kl": 0.5546875, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0006, + "reward": 3.3559396266937256, + "reward_std": 0.02718531433492899, + "rewards/final_reward": 1.5773362977098602, + "rewards/mask_iou_reward": 0.7886681488549301, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3559398651123047, + "rewards/thk_ans_format_reward": 1.0, + "step": 1184, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.12500381469727, + "epoch": 4.003372681281619, + "grad_norm": 10.554016252070584, + "kl": 1.111328125, + "learning_rate": 6.663851351351351e-07, + "loss": 0.0011, + "reward": 3.439687967300415, + "reward_std": 0.08648747950792313, + "rewards/final_reward": 1.4580134691598818, + "rewards/mask_iou_reward": 0.7290067345799409, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.439687967300415, + "rewards/thk_ans_format_reward": 1.0, + "step": 1185, + "think_completion_length": 6.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.08333587646484, + "epoch": 4.006745362563238, + "grad_norm": 10.95857025554743, + "kl": 0.4609375, + "learning_rate": 6.661036036036036e-07, + "loss": 0.0005, + "reward": 3.328967809677124, + "reward_std": 0.03493300452828407, + "rewards/final_reward": 1.8086746194133259, + "rewards/mask_iou_reward": 0.9043373097066629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3289676308631897, + "rewards/thk_ans_format_reward": 1.0, + "step": 1186, + "think_completion_length": 6.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.22916793823242, + "epoch": 4.010118043844857, + "grad_norm": 6.343836801717278, + "kl": 0.67578125, + "learning_rate": 6.658220720720721e-07, + "loss": 0.0008, + "reward": 3.5031098127365112, + "reward_std": 0.033121745102107525, + "rewards/final_reward": 1.483336888613724, + "rewards/mask_iou_reward": 0.741668444306862, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.503109633922577, + "rewards/thk_ans_format_reward": 1.0, + "step": 1187, + "think_completion_length": 5.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8645896911621, + "epoch": 4.013490725126475, + "grad_norm": 11.33780507114698, + "kl": 0.6640625, + "learning_rate": 6.655405405405405e-07, + "loss": 0.0007, + "reward": 3.250043511390686, + "reward_std": 0.08231607265770435, + "rewards/final_reward": 1.5900309584094408, + "rewards/mask_iou_reward": 0.7950154792047204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2500435709953308, + "rewards/thk_ans_format_reward": 1.0, + "step": 1188, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.2916717529297, + "epoch": 4.016863406408095, + "grad_norm": 15.958992045750374, + "kl": 0.591796875, + "learning_rate": 6.65259009009009e-07, + "loss": 0.0006, + "reward": 3.6597955226898193, + "reward_std": 0.16362156346440315, + "rewards/final_reward": 1.5883938304816358, + "rewards/mask_iou_reward": 0.7941969152408179, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6806288361549377, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1189, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.45833587646484, + "epoch": 4.020236087689713, + "grad_norm": 29.774903611078646, + "kl": 0.6328125, + "learning_rate": 6.649774774774775e-07, + "loss": 0.0006, + "reward": 3.4126813411712646, + "reward_std": 0.035650059347972274, + "rewards/final_reward": 1.5868418605872399, + "rewards/mask_iou_reward": 0.7934209302936199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4126812815666199, + "rewards/thk_ans_format_reward": 1.0, + "step": 1190, + "think_completion_length": 6.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.77083587646484, + "epoch": 4.023608768971332, + "grad_norm": 13.734640247347274, + "kl": 0.53515625, + "learning_rate": 6.646959459459459e-07, + "loss": 0.0005, + "reward": 3.3605130910873413, + "reward_std": 0.05231809243559837, + "rewards/final_reward": 0.5977116399046374, + "rewards/mask_iou_reward": 0.2988558199523187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3605130910873413, + "rewards/thk_ans_format_reward": 1.0, + "step": 1191, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.01041793823242, + "epoch": 4.0269814502529515, + "grad_norm": 37.86017681085706, + "kl": 0.5703125, + "learning_rate": 6.644144144144144e-07, + "loss": 0.0006, + "reward": 3.414387583732605, + "reward_std": 0.18529635295271873, + "rewards/final_reward": 1.1384239630747104, + "rewards/mask_iou_reward": 0.5692119815373552, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.435220718383789, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1192, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.82291793823242, + "epoch": 4.03035413153457, + "grad_norm": 20.219296538614426, + "kl": 0.4677734375, + "learning_rate": 6.641328828828829e-07, + "loss": 0.0005, + "reward": 3.0895376205444336, + "reward_std": 0.11390053480863571, + "rewards/final_reward": 0.8502073905759294, + "rewards/mask_iou_reward": 0.4251036952879647, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0895376205444336, + "rewards/thk_ans_format_reward": 1.0, + "step": 1193, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.00000381469727, + "epoch": 4.033726812816189, + "grad_norm": 31.58834779008439, + "kl": 0.41796875, + "learning_rate": 6.638513513513513e-07, + "loss": 0.0004, + "reward": 3.637164354324341, + "reward_std": 0.08336159586906433, + "rewards/final_reward": 1.7560355742195841, + "rewards/mask_iou_reward": 0.8780177871097921, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.63716459274292, + "rewards/thk_ans_format_reward": 1.0, + "step": 1194, + "think_completion_length": 6.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.875, + "epoch": 4.0370994940978076, + "grad_norm": 8.293597312434683, + "kl": 0.4716796875, + "learning_rate": 6.635698198198198e-07, + "loss": 0.0005, + "reward": 3.3773438930511475, + "reward_std": 0.07199937477707863, + "rewards/final_reward": 1.4250182773570836, + "rewards/mask_iou_reward": 0.7125091386785418, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.377343773841858, + "rewards/thk_ans_format_reward": 1.0, + "step": 1195, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.27083587646484, + "epoch": 4.040472175379427, + "grad_norm": 22.502011229703147, + "kl": 0.5283203125, + "learning_rate": 6.632882882882883e-07, + "loss": 0.0005, + "reward": 3.2654829025268555, + "reward_std": 0.06824944447726011, + "rewards/final_reward": 1.5511016928185446, + "rewards/mask_iou_reward": 0.7755508464092723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2654826641082764, + "rewards/thk_ans_format_reward": 1.0, + "step": 1196, + "think_completion_length": 7.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.2604217529297, + "epoch": 4.043844856661045, + "grad_norm": 6.811462987308554, + "kl": 0.724609375, + "learning_rate": 6.630067567567568e-07, + "loss": 0.0008, + "reward": 3.615772247314453, + "reward_std": 0.10267575038596988, + "rewards/final_reward": 1.6120674931977677, + "rewards/mask_iou_reward": 0.8060337465988838, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6261889934539795, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1197, + "think_completion_length": 5.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.11458587646484, + "epoch": 4.0472175379426645, + "grad_norm": 17.715372589544845, + "kl": 0.6015625, + "learning_rate": 6.627252252252253e-07, + "loss": 0.0006, + "reward": 3.3741610050201416, + "reward_std": 0.052045850083231926, + "rewards/final_reward": 0.9774066456483272, + "rewards/mask_iou_reward": 0.4887033228241636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.374160885810852, + "rewards/thk_ans_format_reward": 1.0, + "step": 1198, + "think_completion_length": 6.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.14583587646484, + "epoch": 4.050590219224283, + "grad_norm": 22.80800540898344, + "kl": 0.91015625, + "learning_rate": 6.624436936936937e-07, + "loss": 0.0009, + "reward": 3.4368356466293335, + "reward_std": 0.09212891571223736, + "rewards/final_reward": 1.471651074709624, + "rewards/mask_iou_reward": 0.735825537354812, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4368354678153992, + "rewards/thk_ans_format_reward": 1.0, + "step": 1199, + "think_completion_length": 7.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.7395896911621, + "epoch": 4.053962900505902, + "grad_norm": 41.39021177493442, + "kl": 0.4228515625, + "learning_rate": 6.621621621621622e-07, + "loss": 0.0004, + "reward": 3.3039404153823853, + "reward_std": 0.11558372527360916, + "rewards/final_reward": 1.532649382911797, + "rewards/mask_iou_reward": 0.7663246914558985, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.303940236568451, + "rewards/thk_ans_format_reward": 1.0, + "step": 1200, + "think_completion_length": 5.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.375, + "epoch": 4.057335581787521, + "grad_norm": 43.35919146778135, + "kl": 0.494140625, + "learning_rate": 6.618806306306306e-07, + "loss": 0.0005, + "reward": 3.4193031787872314, + "reward_std": 0.09287399984896183, + "rewards/final_reward": 1.1264470755571945, + "rewards/mask_iou_reward": 0.5632235377785972, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.419303297996521, + "rewards/thk_ans_format_reward": 1.0, + "step": 1201, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.3645896911621, + "epoch": 4.06070826306914, + "grad_norm": 13.071128890946966, + "kl": 0.478515625, + "learning_rate": 6.615990990990991e-07, + "loss": 0.0005, + "reward": 3.0676461458206177, + "reward_std": 0.15790517255663872, + "rewards/final_reward": 1.685211982017151, + "rewards/mask_iou_reward": 0.8426059910085755, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0884793996810913, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1202, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.1979217529297, + "epoch": 4.064080944350759, + "grad_norm": 8.496155118140667, + "kl": 0.404296875, + "learning_rate": 6.613175675675676e-07, + "loss": 0.0004, + "reward": 3.4975868463516235, + "reward_std": 0.08468229323625565, + "rewards/final_reward": 1.3996250295911348, + "rewards/mask_iou_reward": 0.6998125147955674, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4975868463516235, + "rewards/thk_ans_format_reward": 1.0, + "step": 1203, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.28125381469727, + "epoch": 4.0674536256323774, + "grad_norm": 15.478406315143841, + "kl": 0.580078125, + "learning_rate": 6.610360360360359e-07, + "loss": 0.0006, + "reward": 3.4111239910125732, + "reward_std": 0.15900106355547905, + "rewards/final_reward": 1.6024432224024738, + "rewards/mask_iou_reward": 0.8012216112012369, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.411124050617218, + "rewards/thk_ans_format_reward": 1.0, + "step": 1204, + "think_completion_length": 6.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.20833587646484, + "epoch": 4.070826306913997, + "grad_norm": 12.152397641597725, + "kl": 0.5, + "learning_rate": 6.607545045045044e-07, + "loss": 0.0005, + "reward": 3.2357778549194336, + "reward_std": 0.07916467823088169, + "rewards/final_reward": 1.2065086567678471, + "rewards/mask_iou_reward": 0.6032543283839236, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.235777735710144, + "rewards/thk_ans_format_reward": 1.0, + "step": 1205, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.63541793823242, + "epoch": 4.074198988195615, + "grad_norm": 14.537589525952134, + "kl": 0.4755859375, + "learning_rate": 6.60472972972973e-07, + "loss": 0.0005, + "reward": 3.667311668395996, + "reward_std": 0.06388841196894646, + "rewards/final_reward": 1.8121912599107288, + "rewards/mask_iou_reward": 0.9060956299553644, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6673114895820618, + "rewards/thk_ans_format_reward": 1.0, + "step": 1206, + "think_completion_length": 5.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.14583587646484, + "epoch": 4.077571669477234, + "grad_norm": 15.088706799391279, + "kl": 0.4306640625, + "learning_rate": 6.601914414414414e-07, + "loss": 0.0006, + "reward": 3.3942922353744507, + "reward_std": 0.1983342319726944, + "rewards/final_reward": 1.5898807602430456, + "rewards/mask_iou_reward": 0.7949403801215228, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3942922353744507, + "rewards/thk_ans_format_reward": 1.0, + "step": 1207, + "think_completion_length": 7.166666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.28125, + "epoch": 4.080944350758854, + "grad_norm": 27.33584550590293, + "kl": 0.4384765625, + "learning_rate": 6.599099099099099e-07, + "loss": 0.0005, + "reward": 3.223801851272583, + "reward_std": 0.02942474838346243, + "rewards/final_reward": 1.6642409626734604, + "rewards/mask_iou_reward": 0.8321204813367302, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2238017916679382, + "rewards/thk_ans_format_reward": 1.0, + "step": 1208, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.15625, + "epoch": 4.084317032040472, + "grad_norm": 5.586636533993314, + "kl": 0.4365234375, + "learning_rate": 6.596283783783783e-07, + "loss": 0.0004, + "reward": 3.6259137392044067, + "reward_std": 0.14700846886262298, + "rewards/final_reward": 1.4255063237605718, + "rewards/mask_iou_reward": 0.7127531618802859, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6467470526695251, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1209, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.51041793823242, + "epoch": 4.087689713322091, + "grad_norm": 10.176513900275186, + "kl": 0.43359375, + "learning_rate": 6.593468468468468e-07, + "loss": 0.0004, + "reward": 3.4102810621261597, + "reward_std": 0.09019586816430092, + "rewards/final_reward": 0.6894702596620408, + "rewards/mask_iou_reward": 0.3447351298310204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.410281240940094, + "rewards/thk_ans_format_reward": 1.0, + "step": 1210, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.04166793823242, + "epoch": 4.09106239460371, + "grad_norm": 17.637085645134885, + "kl": 0.419921875, + "learning_rate": 6.590653153153153e-07, + "loss": 0.0004, + "reward": 3.303520679473877, + "reward_std": 0.19280150532722473, + "rewards/final_reward": 1.1647785308102354, + "rewards/mask_iou_reward": 0.5823892654051177, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.303520679473877, + "rewards/thk_ans_format_reward": 1.0, + "step": 1211, + "think_completion_length": 7.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.83333587646484, + "epoch": 4.094435075885329, + "grad_norm": 15.695428734834808, + "kl": 1.30078125, + "learning_rate": 6.587837837837837e-07, + "loss": 0.0013, + "reward": 3.429495096206665, + "reward_std": 0.12937488220632076, + "rewards/final_reward": 1.789398878602503, + "rewards/mask_iou_reward": 0.8946994393012515, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4294949769973755, + "rewards/thk_ans_format_reward": 1.0, + "step": 1212, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.10416793823242, + "epoch": 4.097807757166947, + "grad_norm": 11.875679375139654, + "kl": 0.4580078125, + "learning_rate": 6.585022522522522e-07, + "loss": 0.0005, + "reward": 3.1290459632873535, + "reward_std": 0.13048794120550156, + "rewards/final_reward": 0.8777940433255967, + "rewards/mask_iou_reward": 0.43889702166279837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.139462798833847, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1213, + "think_completion_length": 5.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.30208587646484, + "epoch": 4.101180438448567, + "grad_norm": 14.024431690361592, + "kl": 0.5234375, + "learning_rate": 6.582207207207206e-07, + "loss": 0.0005, + "reward": 3.3400204181671143, + "reward_std": 0.02741351444274187, + "rewards/final_reward": 1.4409633928774201, + "rewards/mask_iou_reward": 0.7204816964387101, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3400205373764038, + "rewards/thk_ans_format_reward": 1.0, + "step": 1214, + "think_completion_length": 5.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.98958587646484, + "epoch": 4.104553119730186, + "grad_norm": 23.58731569658534, + "kl": 0.638671875, + "learning_rate": 6.579391891891891e-07, + "loss": 0.0006, + "reward": 3.4039725065231323, + "reward_std": 0.04642016626894474, + "rewards/final_reward": 1.8063808337832992, + "rewards/mask_iou_reward": 0.9031904168916496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.403972327709198, + "rewards/thk_ans_format_reward": 1.0, + "step": 1215, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.92708587646484, + "epoch": 4.107925801011804, + "grad_norm": 16.70364403368003, + "kl": 0.44921875, + "learning_rate": 6.576576576576577e-07, + "loss": 0.0004, + "reward": 3.491950511932373, + "reward_std": 0.03475194610655308, + "rewards/final_reward": 1.8578538951172, + "rewards/mask_iou_reward": 0.9289269475586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4919501543045044, + "rewards/thk_ans_format_reward": 1.0, + "step": 1216, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.27083969116211, + "epoch": 4.1112984822934235, + "grad_norm": 20.916392811665588, + "kl": 0.458984375, + "learning_rate": 6.573761261261261e-07, + "loss": 0.0005, + "reward": 3.676409959793091, + "reward_std": 0.05732338689267635, + "rewards/final_reward": 1.8801769016381307, + "rewards/mask_iou_reward": 0.9400884508190653, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6764100193977356, + "rewards/thk_ans_format_reward": 1.0, + "step": 1217, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.68750381469727, + "epoch": 4.114671163575042, + "grad_norm": 22.28216496691829, + "kl": 0.5947265625, + "learning_rate": 6.570945945945946e-07, + "loss": 0.0006, + "reward": 3.378030300140381, + "reward_std": 0.10985900834202766, + "rewards/final_reward": 1.514454998998649, + "rewards/mask_iou_reward": 0.7572274994993246, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3780303597450256, + "rewards/thk_ans_format_reward": 1.0, + "step": 1218, + "think_completion_length": 5.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.22916793823242, + "epoch": 4.118043844856661, + "grad_norm": 21.727131304379803, + "kl": 0.4677734375, + "learning_rate": 6.56813063063063e-07, + "loss": 0.0005, + "reward": 3.600069284439087, + "reward_std": 0.13599379733204842, + "rewards/final_reward": 1.6915890647441116, + "rewards/mask_iou_reward": 0.8457945323720558, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.600069284439087, + "rewards/thk_ans_format_reward": 1.0, + "step": 1219, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.95833587646484, + "epoch": 4.12141652613828, + "grad_norm": 15.270943087396427, + "kl": 1.1640625, + "learning_rate": 6.565315315315315e-07, + "loss": 0.0012, + "reward": 3.3276935815811157, + "reward_std": 0.061925821006298065, + "rewards/final_reward": 1.5652405764330064, + "rewards/mask_iou_reward": 0.7826202882165032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.327693521976471, + "rewards/thk_ans_format_reward": 1.0, + "step": 1220, + "think_completion_length": 6.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.23958587646484, + "epoch": 4.124789207419899, + "grad_norm": 9.675363166519526, + "kl": 0.5361328125, + "learning_rate": 6.5625e-07, + "loss": 0.0006, + "reward": 3.3326576948165894, + "reward_std": 0.09473956376314163, + "rewards/final_reward": 1.5967528080432538, + "rewards/mask_iou_reward": 0.7983764040216269, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3326576948165894, + "rewards/thk_ans_format_reward": 1.0, + "step": 1221, + "think_completion_length": 6.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.6354217529297, + "epoch": 4.128161888701518, + "grad_norm": 16.7886905879466, + "kl": 0.474609375, + "learning_rate": 6.559684684684684e-07, + "loss": 0.0005, + "reward": 3.4593290090560913, + "reward_std": 0.05802651774138212, + "rewards/final_reward": 1.692901515639981, + "rewards/mask_iou_reward": 0.8464507578199905, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4593292474746704, + "rewards/thk_ans_format_reward": 1.0, + "step": 1222, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.6979217529297, + "epoch": 4.1315345699831365, + "grad_norm": 8.029491788908325, + "kl": 0.568359375, + "learning_rate": 6.556869369369369e-07, + "loss": 0.0006, + "reward": 3.5112451314926147, + "reward_std": 0.120102159678936, + "rewards/final_reward": 1.1520923960479097, + "rewards/mask_iou_reward": 0.5760461980239548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5112449526786804, + "rewards/thk_ans_format_reward": 1.0, + "step": 1223, + "think_completion_length": 6.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.63542175292969, + "epoch": 4.134907251264756, + "grad_norm": 9.309653332724258, + "kl": 0.41015625, + "learning_rate": 6.554054054054053e-07, + "loss": 0.0004, + "reward": 3.714353322982788, + "reward_std": 0.018177752383053303, + "rewards/final_reward": 1.531489051594929, + "rewards/mask_iou_reward": 0.7657445257974645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7143531441688538, + "rewards/thk_ans_format_reward": 1.0, + "step": 1224, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.01042175292969, + "epoch": 4.138279932546374, + "grad_norm": 18.742822358444663, + "kl": 0.5947265625, + "learning_rate": 6.551238738738738e-07, + "loss": 0.0007, + "reward": 3.322489023208618, + "reward_std": 0.08387609012424946, + "rewards/final_reward": 1.3946590371260241, + "rewards/mask_iou_reward": 0.6973295185630121, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3224887251853943, + "rewards/thk_ans_format_reward": 1.0, + "step": 1225, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.92708587646484, + "epoch": 4.141652613827993, + "grad_norm": 35.009143337678275, + "kl": 0.4609375, + "learning_rate": 6.548423423423423e-07, + "loss": 0.0005, + "reward": 3.5057541131973267, + "reward_std": 0.09459779784083366, + "rewards/final_reward": 1.7178362719872866, + "rewards/mask_iou_reward": 0.8589181359936433, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5057538747787476, + "rewards/thk_ans_format_reward": 1.0, + "step": 1226, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.96875381469727, + "epoch": 4.145025295109612, + "grad_norm": 56.043396569771154, + "kl": 0.4541015625, + "learning_rate": 6.545608108108108e-07, + "loss": 0.0005, + "reward": 3.2239954471588135, + "reward_std": 0.10730738565325737, + "rewards/final_reward": 0.4325340813971122, + "rewards/mask_iou_reward": 0.2162670406985561, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2239955067634583, + "rewards/thk_ans_format_reward": 1.0, + "step": 1227, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.1041717529297, + "epoch": 4.148397976391231, + "grad_norm": 12.000564031917897, + "kl": 0.474609375, + "learning_rate": 6.542792792792793e-07, + "loss": 0.0005, + "reward": 3.229053258895874, + "reward_std": 0.10904721170663834, + "rewards/final_reward": 1.35390044326304, + "rewards/mask_iou_reward": 0.67695022163152, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2290531396865845, + "rewards/thk_ans_format_reward": 1.0, + "step": 1228, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.61458587646484, + "epoch": 4.15177065767285, + "grad_norm": 10.836795281979196, + "kl": 1.314453125, + "learning_rate": 6.539977477477478e-07, + "loss": 0.0013, + "reward": 3.5818368196487427, + "reward_std": 0.16170379519462585, + "rewards/final_reward": 1.723583934220796, + "rewards/mask_iou_reward": 0.861791967110398, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5818366408348083, + "rewards/thk_ans_format_reward": 1.0, + "step": 1229, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.38541793823242, + "epoch": 4.155143338954469, + "grad_norm": 84.69549949187494, + "kl": 0.560546875, + "learning_rate": 6.537162162162162e-07, + "loss": 0.0006, + "reward": 3.1797205209732056, + "reward_std": 0.027921637054532766, + "rewards/final_reward": 0.01804651799242533, + "rewards/mask_iou_reward": 0.009023258996212665, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.179720401763916, + "rewards/thk_ans_format_reward": 1.0, + "step": 1230, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.82292175292969, + "epoch": 4.158516020236088, + "grad_norm": 11.292030777174597, + "kl": 0.55078125, + "learning_rate": 6.534346846846847e-07, + "loss": 0.0006, + "reward": 3.534628748893738, + "reward_std": 0.165926992893219, + "rewards/final_reward": 1.533198233303391, + "rewards/mask_iou_reward": 0.7665991166516954, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5450453758239746, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1231, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.34375, + "epoch": 4.161888701517706, + "grad_norm": 7.4161788941658235, + "kl": 0.5703125, + "learning_rate": 6.531531531531531e-07, + "loss": 0.0006, + "reward": 2.960986852645874, + "reward_std": 0.21262395568192005, + "rewards/final_reward": 1.2512099620712724, + "rewards/mask_iou_reward": 0.6256049810356362, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.0234868824481964, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1232, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.28125762939453, + "epoch": 4.165261382799326, + "grad_norm": 11.150973269362533, + "kl": 0.4033203125, + "learning_rate": 6.528716216216216e-07, + "loss": 0.0004, + "reward": 3.5004022121429443, + "reward_std": 0.04145765211433172, + "rewards/final_reward": 1.497370642903789, + "rewards/mask_iou_reward": 0.7486853214518945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5004018545150757, + "rewards/thk_ans_format_reward": 1.0, + "step": 1233, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.62500762939453, + "epoch": 4.168634064080944, + "grad_norm": 9.505429834089737, + "kl": 1.185546875, + "learning_rate": 6.525900900900901e-07, + "loss": 0.0012, + "reward": 3.405234456062317, + "reward_std": 0.11318844556808472, + "rewards/final_reward": 1.910689967757608, + "rewards/mask_iou_reward": 0.955344983878804, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4052343964576721, + "rewards/thk_ans_format_reward": 1.0, + "step": 1234, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.65625, + "epoch": 4.172006745362563, + "grad_norm": 9.054992454458855, + "kl": 0.3935546875, + "learning_rate": 6.523085585585585e-07, + "loss": 0.0004, + "reward": 3.6547305583953857, + "reward_std": 0.04657788202166557, + "rewards/final_reward": 1.819914800489872, + "rewards/mask_iou_reward": 0.909957400244936, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6547306180000305, + "rewards/thk_ans_format_reward": 1.0, + "step": 1235, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.2395896911621, + "epoch": 4.175379426644182, + "grad_norm": 19.602709244183345, + "kl": 0.4765625, + "learning_rate": 6.52027027027027e-07, + "loss": 0.0005, + "reward": 3.5941241979599, + "reward_std": 0.07574337627738714, + "rewards/final_reward": 1.5954112549798443, + "rewards/mask_iou_reward": 0.7977056274899221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5941241383552551, + "rewards/thk_ans_format_reward": 1.0, + "step": 1236, + "think_completion_length": 5.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.73958587646484, + "epoch": 4.178752107925801, + "grad_norm": 13.188752737208313, + "kl": 0.541015625, + "learning_rate": 6.517454954954955e-07, + "loss": 0.0005, + "reward": 3.6106929779052734, + "reward_std": 0.05192565359175205, + "rewards/final_reward": 1.5023569538959587, + "rewards/mask_iou_reward": 0.7511784769479793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6106928586959839, + "rewards/thk_ans_format_reward": 1.0, + "step": 1237, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75000381469727, + "epoch": 4.18212478920742, + "grad_norm": 20.146829558184656, + "kl": 0.4345703125, + "learning_rate": 6.51463963963964e-07, + "loss": 0.0004, + "reward": 3.3862054347991943, + "reward_std": 0.1351282075047493, + "rewards/final_reward": 1.103903692786539, + "rewards/mask_iou_reward": 0.5519518463932696, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3862053751945496, + "rewards/thk_ans_format_reward": 1.0, + "step": 1238, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.69791793823242, + "epoch": 4.185497470489039, + "grad_norm": 10.390083694801291, + "kl": 0.46875, + "learning_rate": 6.511824324324325e-07, + "loss": 0.0006, + "reward": 3.443265676498413, + "reward_std": 0.07178937830030918, + "rewards/final_reward": 1.8056116902199864, + "rewards/mask_iou_reward": 0.9028058451099932, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4432653784751892, + "rewards/thk_ans_format_reward": 1.0, + "step": 1239, + "think_completion_length": 7.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.36458587646484, + "epoch": 4.188870151770658, + "grad_norm": 6.478982019410555, + "kl": 0.533203125, + "learning_rate": 6.509009009009009e-07, + "loss": 0.0005, + "reward": 3.445013165473938, + "reward_std": 0.17272941768169403, + "rewards/final_reward": 1.452560436189965, + "rewards/mask_iou_reward": 0.7262802180949826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4450132846832275, + "rewards/thk_ans_format_reward": 1.0, + "step": 1240, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.125, + "epoch": 4.192242833052276, + "grad_norm": 8.789730060732012, + "kl": 0.484375, + "learning_rate": 6.506193693693694e-07, + "loss": 0.0005, + "reward": 3.440682053565979, + "reward_std": 0.09131734818220139, + "rewards/final_reward": 1.649042618797143, + "rewards/mask_iou_reward": 0.8245213093985715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4406821131706238, + "rewards/thk_ans_format_reward": 1.0, + "step": 1241, + "think_completion_length": 5.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.77083587646484, + "epoch": 4.195615514333896, + "grad_norm": 15.601941559299878, + "kl": 0.435546875, + "learning_rate": 6.503378378378378e-07, + "loss": 0.0005, + "reward": 3.457811713218689, + "reward_std": 0.23156233131885529, + "rewards/final_reward": 1.834207483629815, + "rewards/mask_iou_reward": 0.9171037418149075, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4994783997535706, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1242, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.50000762939453, + "epoch": 4.198988195615514, + "grad_norm": 88.22556284587016, + "kl": 0.46875, + "learning_rate": 6.500563063063062e-07, + "loss": 0.0005, + "reward": 3.180579423904419, + "reward_std": 0.12762167118489742, + "rewards/final_reward": 0.7456849647915553, + "rewards/mask_iou_reward": 0.37284248239577766, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1805793642997742, + "rewards/thk_ans_format_reward": 1.0, + "step": 1243, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.52083587646484, + "epoch": 4.202360876897133, + "grad_norm": 23.92275189355112, + "kl": 0.470703125, + "learning_rate": 6.497747747747747e-07, + "loss": 0.0005, + "reward": 3.466734290122986, + "reward_std": 0.0954241082072258, + "rewards/final_reward": 1.2845573218016204, + "rewards/mask_iou_reward": 0.6422786609008102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4667341113090515, + "rewards/thk_ans_format_reward": 1.0, + "step": 1244, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.1979217529297, + "epoch": 4.2057335581787525, + "grad_norm": 12.833793769275767, + "kl": 0.4365234375, + "learning_rate": 6.494932432432431e-07, + "loss": 0.0004, + "reward": 3.397306203842163, + "reward_std": 0.06800971738994122, + "rewards/final_reward": 1.8772874987421284, + "rewards/mask_iou_reward": 0.9386437493710642, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3973060846328735, + "rewards/thk_ans_format_reward": 1.0, + "step": 1245, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11459350585938, + "epoch": 4.209106239460371, + "grad_norm": 13.385538785948139, + "kl": 0.4296875, + "learning_rate": 6.492117117117116e-07, + "loss": 0.0004, + "reward": 3.5194268226623535, + "reward_std": 0.25978654250502586, + "rewards/final_reward": 1.5930146855113068, + "rewards/mask_iou_reward": 0.7965073427556534, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5402601957321167, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1246, + "think_completion_length": 5.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.375, + "epoch": 4.21247892074199, + "grad_norm": 8.409320177212509, + "kl": 0.623046875, + "learning_rate": 6.489301801801802e-07, + "loss": 0.0006, + "reward": 3.4033043384552, + "reward_std": 0.1465322431176901, + "rewards/final_reward": 0.9809823485167375, + "rewards/mask_iou_reward": 0.49049117425836875, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4033044576644897, + "rewards/thk_ans_format_reward": 1.0, + "step": 1247, + "think_completion_length": 5.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.5104217529297, + "epoch": 4.2158516020236085, + "grad_norm": 60.95788374408251, + "kl": 0.4169921875, + "learning_rate": 6.486486486486486e-07, + "loss": 0.0003, + "reward": 3.5154601335525513, + "reward_std": 0.18371056020259857, + "rewards/final_reward": 1.4486695819317195, + "rewards/mask_iou_reward": 0.7243347909658597, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5362934470176697, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1248, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.20833587646484, + "epoch": 4.219224283305228, + "grad_norm": 18.31368359785162, + "kl": 0.439453125, + "learning_rate": 6.483671171171171e-07, + "loss": 0.0004, + "reward": 3.470528244972229, + "reward_std": 0.21738072484731674, + "rewards/final_reward": 1.4415366605300044, + "rewards/mask_iou_reward": 0.7207683302650022, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.5330281853675842, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1249, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.54167175292969, + "epoch": 4.222596964586846, + "grad_norm": 13.461789249031524, + "kl": 0.41015625, + "learning_rate": 6.480855855855855e-07, + "loss": 0.0004, + "reward": 3.3450616598129272, + "reward_std": 0.07049424014985561, + "rewards/final_reward": 1.0090393338430488, + "rewards/mask_iou_reward": 0.5045196669215244, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3450616598129272, + "rewards/thk_ans_format_reward": 1.0, + "step": 1250, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.86458587646484, + "epoch": 4.2259696458684655, + "grad_norm": 12.45960904221618, + "kl": 0.39453125, + "learning_rate": 6.47804054054054e-07, + "loss": 0.0004, + "reward": 3.5850772857666016, + "reward_std": 0.08815484121441841, + "rewards/final_reward": 1.1730620960273108, + "rewards/mask_iou_reward": 0.5865310480136554, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5850771069526672, + "rewards/thk_ans_format_reward": 1.0, + "step": 1251, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.52083587646484, + "epoch": 4.229342327150085, + "grad_norm": 27.854007977705187, + "kl": 0.44140625, + "learning_rate": 6.475225225225225e-07, + "loss": 0.0005, + "reward": 3.6427247524261475, + "reward_std": 0.06672817096114159, + "rewards/final_reward": 1.2199035694148141, + "rewards/mask_iou_reward": 0.6099517847074071, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6427247524261475, + "rewards/thk_ans_format_reward": 1.0, + "step": 1252, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.5520896911621, + "epoch": 4.232715008431703, + "grad_norm": 29.167713475717548, + "kl": 0.396484375, + "learning_rate": 6.472409909909909e-07, + "loss": 0.0004, + "reward": 3.5975239276885986, + "reward_std": 0.03818482160568237, + "rewards/final_reward": 1.666841807357827, + "rewards/mask_iou_reward": 0.8334209036789135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5975240468978882, + "rewards/thk_ans_format_reward": 1.0, + "step": 1253, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.20834350585938, + "epoch": 4.236087689713322, + "grad_norm": 27.32010950165432, + "kl": 0.5859375, + "learning_rate": 6.469594594594594e-07, + "loss": 0.0006, + "reward": 3.4661457538604736, + "reward_std": 0.09239022061228752, + "rewards/final_reward": 1.206821584190531, + "rewards/mask_iou_reward": 0.6034107920952655, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4661457538604736, + "rewards/thk_ans_format_reward": 1.0, + "step": 1254, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.08333587646484, + "epoch": 4.239460370994941, + "grad_norm": 21.433106992655098, + "kl": 0.5380859375, + "learning_rate": 6.466779279279278e-07, + "loss": 0.0005, + "reward": 3.370681881904602, + "reward_std": 0.05036402679979801, + "rewards/final_reward": 1.8078184431682338, + "rewards/mask_iou_reward": 0.9039092215841169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3706818222999573, + "rewards/thk_ans_format_reward": 1.0, + "step": 1255, + "think_completion_length": 6.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.67708587646484, + "epoch": 4.24283305227656, + "grad_norm": 11.401224674392935, + "kl": 0.400390625, + "learning_rate": 6.463963963963963e-07, + "loss": 0.0004, + "reward": 3.387624502182007, + "reward_std": 0.05631537130102515, + "rewards/final_reward": 1.8458147221685657, + "rewards/mask_iou_reward": 0.9229073610842828, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3876244127750397, + "rewards/thk_ans_format_reward": 1.0, + "step": 1256, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.2604217529297, + "epoch": 4.246205733558178, + "grad_norm": 8.2827018516746, + "kl": 0.400390625, + "learning_rate": 6.461148648648649e-07, + "loss": 0.0005, + "reward": 3.403711438179016, + "reward_std": 0.06762434728443623, + "rewards/final_reward": 1.920771464048671, + "rewards/mask_iou_reward": 0.9603857320243355, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4037113785743713, + "rewards/thk_ans_format_reward": 1.0, + "step": 1257, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.56250762939453, + "epoch": 4.249578414839798, + "grad_norm": 9.468059780611297, + "kl": 0.3837890625, + "learning_rate": 6.458333333333333e-07, + "loss": 0.0004, + "reward": 3.150208592414856, + "reward_std": 0.10077772289514542, + "rewards/final_reward": 1.0828550566399318, + "rewards/mask_iou_reward": 0.5414275283199659, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1502084136009216, + "rewards/thk_ans_format_reward": 1.0, + "step": 1258, + "think_completion_length": 6.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.85417175292969, + "epoch": 4.252951096121416, + "grad_norm": 28.135678363113584, + "kl": 0.404296875, + "learning_rate": 6.455518018018018e-07, + "loss": 0.0004, + "reward": 3.407992720603943, + "reward_std": 0.014361603185534477, + "rewards/final_reward": 1.6231428704540085, + "rewards/mask_iou_reward": 0.8115714352270043, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4079925417900085, + "rewards/thk_ans_format_reward": 1.0, + "step": 1259, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.2604217529297, + "epoch": 4.256323777403035, + "grad_norm": 8.01168528846507, + "kl": 0.4619140625, + "learning_rate": 6.452702702702702e-07, + "loss": 0.0004, + "reward": 3.174490809440613, + "reward_std": 0.14233126863837242, + "rewards/final_reward": 1.0650623705048627, + "rewards/mask_iou_reward": 0.5325311852524314, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1953240036964417, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1260, + "think_completion_length": 5.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.35417175292969, + "epoch": 4.259696458684655, + "grad_norm": 10.994190019550278, + "kl": 0.55859375, + "learning_rate": 6.449887387387387e-07, + "loss": 0.0006, + "reward": 3.624230146408081, + "reward_std": 0.08744936436414719, + "rewards/final_reward": 1.5999830015346452, + "rewards/mask_iou_reward": 0.7999915007673226, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6242303252220154, + "rewards/thk_ans_format_reward": 1.0, + "step": 1261, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.73958587646484, + "epoch": 4.263069139966273, + "grad_norm": 10.331129254992351, + "kl": 0.45703125, + "learning_rate": 6.447072072072072e-07, + "loss": 0.0005, + "reward": 3.3891860246658325, + "reward_std": 0.09818197600543499, + "rewards/final_reward": 1.2423954959032273, + "rewards/mask_iou_reward": 0.6211977479516136, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3891857862472534, + "rewards/thk_ans_format_reward": 1.0, + "step": 1262, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.23958587646484, + "epoch": 4.266441821247892, + "grad_norm": 21.318030836117394, + "kl": 0.52734375, + "learning_rate": 6.444256756756756e-07, + "loss": 0.0006, + "reward": 3.590371608734131, + "reward_std": 0.03496438264846802, + "rewards/final_reward": 1.0998050889015756, + "rewards/mask_iou_reward": 0.5499025444507878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5903714895248413, + "rewards/thk_ans_format_reward": 1.0, + "step": 1263, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.68750762939453, + "epoch": 4.269814502529511, + "grad_norm": 11.324690008730549, + "kl": 0.4638671875, + "learning_rate": 6.441441441441441e-07, + "loss": 0.0005, + "reward": 3.3355226516723633, + "reward_std": 0.07569969445466995, + "rewards/final_reward": 1.583805177521902, + "rewards/mask_iou_reward": 0.791902588760951, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3355226516723633, + "rewards/thk_ans_format_reward": 1.0, + "step": 1264, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.29166793823242, + "epoch": 4.27318718381113, + "grad_norm": 56.87368885988965, + "kl": 0.419921875, + "learning_rate": 6.438626126126126e-07, + "loss": 0.0004, + "reward": 3.379696846008301, + "reward_std": 0.09507296234369278, + "rewards/final_reward": 1.3212376749816899, + "rewards/mask_iou_reward": 0.6606188374908449, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3796967267990112, + "rewards/thk_ans_format_reward": 1.0, + "step": 1265, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.46875, + "epoch": 4.276559865092748, + "grad_norm": 15.725288435528828, + "kl": 0.44921875, + "learning_rate": 6.43581081081081e-07, + "loss": 0.0005, + "reward": 3.3752676248550415, + "reward_std": 0.09868980012834072, + "rewards/final_reward": 1.2282691318397194, + "rewards/mask_iou_reward": 0.6141345659198597, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3856841921806335, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1266, + "think_completion_length": 5.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.9791717529297, + "epoch": 4.279932546374368, + "grad_norm": 15.177451072912048, + "kl": 0.4609375, + "learning_rate": 6.432995495495496e-07, + "loss": 0.0005, + "reward": 3.4971253871917725, + "reward_std": 0.04681009333580732, + "rewards/final_reward": 0.8785843499122169, + "rewards/mask_iou_reward": 0.43929217495610845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4971250295639038, + "rewards/thk_ans_format_reward": 1.0, + "step": 1267, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.40625762939453, + "epoch": 4.283305227655987, + "grad_norm": 11.990275841882601, + "kl": 0.453125, + "learning_rate": 6.43018018018018e-07, + "loss": 0.0005, + "reward": 3.2263606786727905, + "reward_std": 0.04418545961380005, + "rewards/final_reward": 0.4898923797029757, + "rewards/mask_iou_reward": 0.24494618985148786, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2263606786727905, + "rewards/thk_ans_format_reward": 1.0, + "step": 1268, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.42708587646484, + "epoch": 4.286677908937605, + "grad_norm": 90.20677125257647, + "kl": 1.072265625, + "learning_rate": 6.427364864864865e-07, + "loss": 0.0011, + "reward": 3.1920053958892822, + "reward_std": 0.2112291418015957, + "rewards/final_reward": 0.7658425978021275, + "rewards/mask_iou_reward": 0.38292129890106374, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1920052468776703, + "rewards/thk_ans_format_reward": 1.0, + "step": 1269, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.9583396911621, + "epoch": 4.2900505902192245, + "grad_norm": 12.851434274016533, + "kl": 0.408203125, + "learning_rate": 6.42454954954955e-07, + "loss": 0.0004, + "reward": 3.476992130279541, + "reward_std": 0.24782110750675201, + "rewards/final_reward": 1.6170369546662968, + "rewards/mask_iou_reward": 0.8085184773331484, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5186585187911987, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1270, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.2916717529297, + "epoch": 4.293423271500843, + "grad_norm": 11.311288767429877, + "kl": 0.3720703125, + "learning_rate": 6.421734234234234e-07, + "loss": 0.0004, + "reward": 3.519341826438904, + "reward_std": 0.07771720364689827, + "rewards/final_reward": 1.6035272777919118, + "rewards/mask_iou_reward": 0.8017636388959559, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5193416476249695, + "rewards/thk_ans_format_reward": 1.0, + "step": 1271, + "think_completion_length": 5.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.86458587646484, + "epoch": 4.296795952782462, + "grad_norm": 14.915353679554494, + "kl": 0.486328125, + "learning_rate": 6.418918918918919e-07, + "loss": 0.0006, + "reward": 3.6986976861953735, + "reward_std": 0.024407205171883106, + "rewards/final_reward": 1.8355800602343713, + "rewards/mask_iou_reward": 0.9177900301171856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6986975073814392, + "rewards/thk_ans_format_reward": 1.0, + "step": 1272, + "think_completion_length": 5.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.8229217529297, + "epoch": 4.300168634064081, + "grad_norm": 22.386881153592117, + "kl": 0.501953125, + "learning_rate": 6.416103603603603e-07, + "loss": 0.0005, + "reward": 3.482214570045471, + "reward_std": 0.07844844087958336, + "rewards/final_reward": 1.8699623549921272, + "rewards/mask_iou_reward": 0.9349811774960636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4822145104408264, + "rewards/thk_ans_format_reward": 1.0, + "step": 1273, + "think_completion_length": 5.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.09375762939453, + "epoch": 4.3035413153457, + "grad_norm": 11.281865408345398, + "kl": 0.4189453125, + "learning_rate": 6.413288288288288e-07, + "loss": 0.0004, + "reward": 3.472105860710144, + "reward_std": 0.19757422804832458, + "rewards/final_reward": 1.609137134690231, + "rewards/mask_iou_reward": 0.8045685673451155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.472105860710144, + "rewards/thk_ans_format_reward": 1.0, + "step": 1274, + "think_completion_length": 5.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.88542175292969, + "epoch": 4.306913996627319, + "grad_norm": 10.133217004993806, + "kl": 0.5869140625, + "learning_rate": 6.410472972972973e-07, + "loss": 0.0006, + "reward": 3.6658592224121094, + "reward_std": 0.17215874418616295, + "rewards/final_reward": 1.944337073758779, + "rewards/mask_iou_reward": 0.9721685368793895, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.665859341621399, + "rewards/thk_ans_format_reward": 1.0, + "step": 1275, + "think_completion_length": 5.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.8854217529297, + "epoch": 4.3102866779089375, + "grad_norm": 39.645176419264196, + "kl": 0.4033203125, + "learning_rate": 6.407657657657657e-07, + "loss": 0.0004, + "reward": 3.4578869342803955, + "reward_std": 0.05655907094478607, + "rewards/final_reward": 1.2396400380269288, + "rewards/mask_iou_reward": 0.6198200190134644, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.457886815071106, + "rewards/thk_ans_format_reward": 1.0, + "step": 1276, + "think_completion_length": 6.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.4583396911621, + "epoch": 4.313659359190557, + "grad_norm": 12.543109846229324, + "kl": 0.390625, + "learning_rate": 6.404842342342343e-07, + "loss": 0.0004, + "reward": 3.536691188812256, + "reward_std": 0.12136990204453468, + "rewards/final_reward": 1.310790119763332, + "rewards/mask_iou_reward": 0.655395059881666, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5366910696029663, + "rewards/thk_ans_format_reward": 1.0, + "step": 1277, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.09375762939453, + "epoch": 4.317032040472175, + "grad_norm": 15.233310117516359, + "kl": 0.53515625, + "learning_rate": 6.402027027027028e-07, + "loss": 0.0005, + "reward": 3.1646311283111572, + "reward_std": 0.10930739156901836, + "rewards/final_reward": 0.5577201157215373, + "rewards/mask_iou_reward": 0.2788600578607687, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1646308898925781, + "rewards/thk_ans_format_reward": 1.0, + "step": 1278, + "think_completion_length": 6.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.44792175292969, + "epoch": 4.320404721753794, + "grad_norm": 7.540695635548057, + "kl": 0.4638671875, + "learning_rate": 6.399211711711712e-07, + "loss": 0.0005, + "reward": 3.6288951635360718, + "reward_std": 0.07137066684663296, + "rewards/final_reward": 1.7092706493317622, + "rewards/mask_iou_reward": 0.8546353246658811, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6288952827453613, + "rewards/thk_ans_format_reward": 1.0, + "step": 1279, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.3229217529297, + "epoch": 4.323777403035413, + "grad_norm": 11.45759006501233, + "kl": 0.4765625, + "learning_rate": 6.396396396396397e-07, + "loss": 0.0005, + "reward": 3.403845429420471, + "reward_std": 0.15239424258470535, + "rewards/final_reward": 1.4765252724296953, + "rewards/mask_iou_reward": 0.7382626362148477, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4038452506065369, + "rewards/thk_ans_format_reward": 1.0, + "step": 1280, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.75, + "epoch": 4.327150084317032, + "grad_norm": 12.659566610037718, + "kl": 0.4296875, + "learning_rate": 6.393581081081081e-07, + "loss": 0.0004, + "reward": 3.546845316886902, + "reward_std": 0.1074911393225193, + "rewards/final_reward": 1.731462015738102, + "rewards/mask_iou_reward": 0.865731007869051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5468451976776123, + "rewards/thk_ans_format_reward": 1.0, + "step": 1281, + "think_completion_length": 5.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.6458396911621, + "epoch": 4.330522765598651, + "grad_norm": 8.705391227331303, + "kl": 0.37890625, + "learning_rate": 6.390765765765766e-07, + "loss": 0.0004, + "reward": 3.6814361810684204, + "reward_std": 0.027851653285324574, + "rewards/final_reward": 1.8943658471287275, + "rewards/mask_iou_reward": 0.9471829235643637, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6814362406730652, + "rewards/thk_ans_format_reward": 1.0, + "step": 1282, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.08333587646484, + "epoch": 4.33389544688027, + "grad_norm": 7.4392061951139645, + "kl": 0.427734375, + "learning_rate": 6.38795045045045e-07, + "loss": 0.0004, + "reward": 3.3992159366607666, + "reward_std": 0.07810256537050009, + "rewards/final_reward": 1.6026946067459202, + "rewards/mask_iou_reward": 0.8013473033729601, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3992160558700562, + "rewards/thk_ans_format_reward": 1.0, + "step": 1283, + "think_completion_length": 5.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.47917175292969, + "epoch": 4.337268128161889, + "grad_norm": 9.69670778371472, + "kl": 0.4384765625, + "learning_rate": 6.385135135135134e-07, + "loss": 0.0004, + "reward": 3.6353808641433716, + "reward_std": 0.08136463444679976, + "rewards/final_reward": 1.7156594083437389, + "rewards/mask_iou_reward": 0.8578297041718694, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6353808045387268, + "rewards/thk_ans_format_reward": 1.0, + "step": 1284, + "think_completion_length": 4.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0729217529297, + "epoch": 4.340640809443507, + "grad_norm": 12.762926580219476, + "kl": 0.462890625, + "learning_rate": 6.382319819819819e-07, + "loss": 0.0005, + "reward": 3.420608162879944, + "reward_std": 0.18191301077604294, + "rewards/final_reward": 1.348219115977413, + "rewards/mask_iou_reward": 0.6741095579887065, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4414416551589966, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1285, + "think_completion_length": 5.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.98958587646484, + "epoch": 4.344013490725127, + "grad_norm": 17.299410660768007, + "kl": 0.4404296875, + "learning_rate": 6.379504504504503e-07, + "loss": 0.0004, + "reward": 3.370743989944458, + "reward_std": 0.1613960862159729, + "rewards/final_reward": 1.7371300769846425, + "rewards/mask_iou_reward": 0.8685650384923213, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3915773034095764, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1286, + "think_completion_length": 5.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.57292175292969, + "epoch": 4.347386172006745, + "grad_norm": 25.087388286333983, + "kl": 0.59765625, + "learning_rate": 6.376689189189189e-07, + "loss": 0.0006, + "reward": 3.4849319458007812, + "reward_std": 0.09557798132300377, + "rewards/final_reward": 1.4741651557191244, + "rewards/mask_iou_reward": 0.7370825778595622, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4849318265914917, + "rewards/thk_ans_format_reward": 1.0, + "step": 1287, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.95833587646484, + "epoch": 4.350758853288364, + "grad_norm": 19.652298591259633, + "kl": 0.48828125, + "learning_rate": 6.373873873873874e-07, + "loss": 0.0005, + "reward": 3.369731068611145, + "reward_std": 0.24395397305488586, + "rewards/final_reward": 1.849694773799825, + "rewards/mask_iou_reward": 0.9248473868999125, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4113975167274475, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1288, + "think_completion_length": 6.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.2916717529297, + "epoch": 4.354131534569984, + "grad_norm": 9.013563522241386, + "kl": 0.41796875, + "learning_rate": 6.371058558558558e-07, + "loss": 0.0004, + "reward": 3.424813151359558, + "reward_std": 0.0731990858912468, + "rewards/final_reward": 1.9547891026851874, + "rewards/mask_iou_reward": 0.9773945513425937, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4248132109642029, + "rewards/thk_ans_format_reward": 1.0, + "step": 1289, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.87500762939453, + "epoch": 4.357504215851602, + "grad_norm": 6.721369378846812, + "kl": 0.4375, + "learning_rate": 6.368243243243243e-07, + "loss": 0.0004, + "reward": 3.436237096786499, + "reward_std": 0.08576931431889534, + "rewards/final_reward": 1.467071944155215, + "rewards/mask_iou_reward": 0.7335359720776075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.436237096786499, + "rewards/thk_ans_format_reward": 1.0, + "step": 1290, + "think_completion_length": 5.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.29167175292969, + "epoch": 4.360876897133221, + "grad_norm": 7.362840756798237, + "kl": 0.5146484375, + "learning_rate": 6.365427927927927e-07, + "loss": 0.0006, + "reward": 3.4356051683425903, + "reward_std": 0.16319414228200912, + "rewards/final_reward": 1.925304111768032, + "rewards/mask_iou_reward": 0.962652055884016, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4356051683425903, + "rewards/thk_ans_format_reward": 1.0, + "step": 1291, + "think_completion_length": 5.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.7916717529297, + "epoch": 4.36424957841484, + "grad_norm": 185.28901934065232, + "kl": 0.37890625, + "learning_rate": 6.362612612612612e-07, + "loss": 0.0004, + "reward": 3.370970845222473, + "reward_std": 0.33027198910713196, + "rewards/final_reward": 1.114027995257803, + "rewards/mask_iou_reward": 0.5570139976289015, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4126374125480652, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1292, + "think_completion_length": 6.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.7916717529297, + "epoch": 4.367622259696459, + "grad_norm": 6.847292096928454, + "kl": 0.3837890625, + "learning_rate": 6.359797297297297e-07, + "loss": 0.0004, + "reward": 3.2700021266937256, + "reward_std": 0.3615192845463753, + "rewards/final_reward": 1.1498082631709108, + "rewards/mask_iou_reward": 0.5749041315854554, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.3429186344146729, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 1293, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.83333587646484, + "epoch": 4.370994940978077, + "grad_norm": 8.913277865445579, + "kl": 0.552734375, + "learning_rate": 6.356981981981981e-07, + "loss": 0.0006, + "reward": 3.4847034215927124, + "reward_std": 0.273854024708271, + "rewards/final_reward": 1.7407927712937312, + "rewards/mask_iou_reward": 0.8703963856468656, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.505536675453186, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1294, + "think_completion_length": 6.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.20833587646484, + "epoch": 4.3743676222596966, + "grad_norm": 6.8652638016315475, + "kl": 0.58203125, + "learning_rate": 6.354166666666666e-07, + "loss": 0.0006, + "reward": 3.5416958332061768, + "reward_std": 0.2061656340956688, + "rewards/final_reward": 1.8498229882408612, + "rewards/mask_iou_reward": 0.9249114941204306, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5625290870666504, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1295, + "think_completion_length": 6.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.46875762939453, + "epoch": 4.377740303541315, + "grad_norm": 17.815113300408775, + "kl": 0.58203125, + "learning_rate": 6.35135135135135e-07, + "loss": 0.0006, + "reward": 3.021910071372986, + "reward_std": 0.131107859313488, + "rewards/final_reward": 0.8340113627958121, + "rewards/mask_iou_reward": 0.41700568139790606, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0219102799892426, + "rewards/thk_ans_format_reward": 1.0, + "step": 1296, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.82291793823242, + "epoch": 4.381112984822934, + "grad_norm": 42.24876306015295, + "kl": 0.4775390625, + "learning_rate": 6.348536036036036e-07, + "loss": 0.0005, + "reward": 3.639923334121704, + "reward_std": 0.07676161080598831, + "rewards/final_reward": 1.888736375245885, + "rewards/mask_iou_reward": 0.9443681876229425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6399233937263489, + "rewards/thk_ans_format_reward": 1.0, + "step": 1297, + "think_completion_length": 6.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.33333587646484, + "epoch": 4.3844856661045535, + "grad_norm": 17.35495373942962, + "kl": 0.40234375, + "learning_rate": 6.345720720720721e-07, + "loss": 0.0004, + "reward": 3.204251527786255, + "reward_std": 0.03969000466167927, + "rewards/final_reward": 1.0037151846973167, + "rewards/mask_iou_reward": 0.5018575923486583, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.20425146818161, + "rewards/thk_ans_format_reward": 1.0, + "step": 1298, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.6979217529297, + "epoch": 4.387858347386172, + "grad_norm": 20.977587134561993, + "kl": 0.4111328125, + "learning_rate": 6.342905405405405e-07, + "loss": 0.0004, + "reward": 3.381840944290161, + "reward_std": 0.07051502913236618, + "rewards/final_reward": 0.9913485842903418, + "rewards/mask_iou_reward": 0.4956742921451709, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3818408250808716, + "rewards/thk_ans_format_reward": 1.0, + "step": 1299, + "think_completion_length": 5.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.46875762939453, + "epoch": 4.391231028667791, + "grad_norm": 28.793575846253628, + "kl": 0.51953125, + "learning_rate": 6.34009009009009e-07, + "loss": 0.0005, + "reward": 3.0043944120407104, + "reward_std": 0.12208867445588112, + "rewards/final_reward": 1.4536093123239269, + "rewards/mask_iou_reward": 0.7268046561619634, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0043944120407104, + "rewards/thk_ans_format_reward": 1.0, + "step": 1300, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.3541717529297, + "epoch": 4.3946037099494095, + "grad_norm": 10.88683265592209, + "kl": 0.494140625, + "learning_rate": 6.337274774774775e-07, + "loss": 0.0005, + "reward": 3.437264561653137, + "reward_std": 0.12238720059394836, + "rewards/final_reward": 1.0413577423258804, + "rewards/mask_iou_reward": 0.5206788711629402, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4372643828392029, + "rewards/thk_ans_format_reward": 1.0, + "step": 1301, + "think_completion_length": 5.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.54166793823242, + "epoch": 4.397976391231029, + "grad_norm": 13.458149194489804, + "kl": 0.76171875, + "learning_rate": 6.334459459459459e-07, + "loss": 0.0008, + "reward": 3.532782554626465, + "reward_std": 0.1034855768084526, + "rewards/final_reward": 1.2482119400414557, + "rewards/mask_iou_reward": 0.6241059700207279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5327824354171753, + "rewards/thk_ans_format_reward": 1.0, + "step": 1302, + "think_completion_length": 5.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.95833587646484, + "epoch": 4.401349072512647, + "grad_norm": 29.113274377786073, + "kl": 0.8466796875, + "learning_rate": 6.331644144144144e-07, + "loss": 0.0009, + "reward": 3.2336429357528687, + "reward_std": 0.10814763605594635, + "rewards/final_reward": 1.2469824715224955, + "rewards/mask_iou_reward": 0.6234912357612478, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2336428463459015, + "rewards/thk_ans_format_reward": 1.0, + "step": 1303, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5104217529297, + "epoch": 4.4047217537942664, + "grad_norm": 10.438587288068794, + "kl": 0.4140625, + "learning_rate": 6.328828828828828e-07, + "loss": 0.0004, + "reward": 3.369965076446533, + "reward_std": 0.2397690787911415, + "rewards/final_reward": 1.3658386005190497, + "rewards/mask_iou_reward": 0.6829193002595249, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3907981514930725, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1304, + "think_completion_length": 6.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.875, + "epoch": 4.408094435075886, + "grad_norm": 9.686637334920153, + "kl": 0.4208984375, + "learning_rate": 6.326013513513513e-07, + "loss": 0.0005, + "reward": 3.1423239707946777, + "reward_std": 0.10288365464657545, + "rewards/final_reward": 1.4988730307858993, + "rewards/mask_iou_reward": 0.7494365153929496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.142323911190033, + "rewards/thk_ans_format_reward": 1.0, + "step": 1305, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.48959350585938, + "epoch": 4.411467116357504, + "grad_norm": 9.02812056455549, + "kl": 0.5029296875, + "learning_rate": 6.323198198198198e-07, + "loss": 0.0005, + "reward": 3.1927590370178223, + "reward_std": 0.2850857675075531, + "rewards/final_reward": 1.5902582021310268, + "rewards/mask_iou_reward": 0.7951291010655134, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.2552589774131775, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1306, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.62500762939453, + "epoch": 4.414839797639123, + "grad_norm": 6.8909826541799974, + "kl": 0.38671875, + "learning_rate": 6.320382882882883e-07, + "loss": 0.0004, + "reward": 3.4335436820983887, + "reward_std": 0.18952222168445587, + "rewards/final_reward": 1.7601613200196897, + "rewards/mask_iou_reward": 0.8800806600098449, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4543770551681519, + "rewards/thk_ans_format_reward": 1.0, + "step": 1307, + "think_completion_length": 5.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.19792175292969, + "epoch": 4.418212478920742, + "grad_norm": 12.016535270077375, + "kl": 0.447265625, + "learning_rate": 6.317567567567568e-07, + "loss": 0.0005, + "reward": 3.2483325004577637, + "reward_std": 0.08590874914079905, + "rewards/final_reward": 1.1596071175494251, + "rewards/mask_iou_reward": 0.5798035587747126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2483325004577637, + "rewards/thk_ans_format_reward": 1.0, + "step": 1308, + "think_completion_length": 6.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.9270896911621, + "epoch": 4.421585160202361, + "grad_norm": 8.452737652006032, + "kl": 0.51171875, + "learning_rate": 6.314752252252252e-07, + "loss": 0.0005, + "reward": 3.5426175594329834, + "reward_std": 0.11622235551476479, + "rewards/final_reward": 1.5961111244796113, + "rewards/mask_iou_reward": 0.7980555622398057, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5426174998283386, + "rewards/thk_ans_format_reward": 1.0, + "step": 1309, + "think_completion_length": 6.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.39583587646484, + "epoch": 4.424957841483979, + "grad_norm": 13.0609759023184, + "kl": 0.4267578125, + "learning_rate": 6.311936936936937e-07, + "loss": 0.0004, + "reward": 3.4041415452957153, + "reward_std": 0.19920283555984497, + "rewards/final_reward": 1.66883441199862, + "rewards/mask_iou_reward": 0.83441720599931, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4041413068771362, + "rewards/thk_ans_format_reward": 1.0, + "step": 1310, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.75000381469727, + "epoch": 4.428330522765599, + "grad_norm": 90.23292436090381, + "kl": 0.416015625, + "learning_rate": 6.309121621621622e-07, + "loss": 0.0004, + "reward": 2.907870292663574, + "reward_std": 0.10285294055938721, + "rewards/final_reward": 0.7910406349387594, + "rewards/mask_iou_reward": 0.3955203174693797, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9078702032566071, + "rewards/thk_ans_format_reward": 1.0, + "step": 1311, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.42708587646484, + "epoch": 4.431703204047217, + "grad_norm": 17.978114646607587, + "kl": 0.412109375, + "learning_rate": 6.306306306306306e-07, + "loss": 0.0005, + "reward": 3.2035940885543823, + "reward_std": 0.17428059503436089, + "rewards/final_reward": 1.4809223040645283, + "rewards/mask_iou_reward": 0.7404611520322641, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2035939693450928, + "rewards/thk_ans_format_reward": 1.0, + "step": 1312, + "think_completion_length": 5.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.03125762939453, + "epoch": 4.435075885328836, + "grad_norm": 9.933008430647831, + "kl": 0.486328125, + "learning_rate": 6.303490990990991e-07, + "loss": 0.0005, + "reward": 3.2731047868728638, + "reward_std": 0.14033827558159828, + "rewards/final_reward": 1.1736736908276793, + "rewards/mask_iou_reward": 0.5868368454138396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2731046676635742, + "rewards/thk_ans_format_reward": 1.0, + "step": 1313, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.55208587646484, + "epoch": 4.438448566610456, + "grad_norm": 31.427049740289576, + "kl": 0.4716796875, + "learning_rate": 6.300675675675675e-07, + "loss": 0.0005, + "reward": 3.422204375267029, + "reward_std": 0.1572865154594183, + "rewards/final_reward": 1.091553492553749, + "rewards/mask_iou_reward": 0.5457767462768744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4222044348716736, + "rewards/thk_ans_format_reward": 1.0, + "step": 1314, + "think_completion_length": 5.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5729217529297, + "epoch": 4.441821247892074, + "grad_norm": 20.019555623602535, + "kl": 0.431640625, + "learning_rate": 6.29786036036036e-07, + "loss": 0.0004, + "reward": 3.2552809715270996, + "reward_std": 0.26685456931591034, + "rewards/final_reward": 1.2028960215663902, + "rewards/mask_iou_reward": 0.6014480107831951, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2865309417247772, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1315, + "think_completion_length": 6.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.40625, + "epoch": 4.445193929173693, + "grad_norm": 8.662922043418455, + "kl": 0.46875, + "learning_rate": 6.295045045045045e-07, + "loss": 0.0005, + "reward": 3.4265466928482056, + "reward_std": 0.05025552585721016, + "rewards/final_reward": 0.9454525025156142, + "rewards/mask_iou_reward": 0.4727262512578071, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.42654687166214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1316, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.8333396911621, + "epoch": 4.448566610455312, + "grad_norm": 36.21317582943972, + "kl": 0.501953125, + "learning_rate": 6.29222972972973e-07, + "loss": 0.0005, + "reward": 3.3867627382278442, + "reward_std": 0.11366377770900726, + "rewards/final_reward": 1.4993498673055825, + "rewards/mask_iou_reward": 0.7496749336527913, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.386762797832489, + "rewards/thk_ans_format_reward": 1.0, + "step": 1317, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.88541793823242, + "epoch": 4.451939291736931, + "grad_norm": 16.897114106900812, + "kl": 0.498046875, + "learning_rate": 6.289414414414415e-07, + "loss": 0.0005, + "reward": 3.1020525693893433, + "reward_std": 0.14741721376776695, + "rewards/final_reward": 0.520114738677751, + "rewards/mask_iou_reward": 0.2600573693388755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1020523607730865, + "rewards/thk_ans_format_reward": 1.0, + "step": 1318, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.64583587646484, + "epoch": 4.455311973018549, + "grad_norm": 26.997113281837546, + "kl": 0.591796875, + "learning_rate": 6.2865990990991e-07, + "loss": 0.0006, + "reward": 3.3456215858459473, + "reward_std": 0.07093912735581398, + "rewards/final_reward": 1.579614805626064, + "rewards/mask_iou_reward": 0.789807402813032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3456215262413025, + "rewards/thk_ans_format_reward": 1.0, + "step": 1319, + "think_completion_length": 6.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.6666717529297, + "epoch": 4.458684654300169, + "grad_norm": 8.579021486082592, + "kl": 0.4130859375, + "learning_rate": 6.283783783783784e-07, + "loss": 0.0004, + "reward": 3.255575180053711, + "reward_std": 0.4096851944923401, + "rewards/final_reward": 1.7250021070975252, + "rewards/mask_iou_reward": 0.8625010535487626, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.3597416281700134, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 1320, + "think_completion_length": 7.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.67708587646484, + "epoch": 4.462057335581788, + "grad_norm": 11.564712852567604, + "kl": 0.4521484375, + "learning_rate": 6.280968468468469e-07, + "loss": 0.0005, + "reward": 3.681838870048523, + "reward_std": 0.05845123156905174, + "rewards/final_reward": 1.5727482671470696, + "rewards/mask_iou_reward": 0.7863741335735348, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.681838870048523, + "rewards/thk_ans_format_reward": 1.0, + "step": 1321, + "think_completion_length": 5.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5833396911621, + "epoch": 4.465430016863406, + "grad_norm": 10.570742234389874, + "kl": 0.4306640625, + "learning_rate": 6.278153153153153e-07, + "loss": 0.0004, + "reward": 3.0406211614608765, + "reward_std": 0.1231082808226347, + "rewards/final_reward": 1.0502326924803995, + "rewards/mask_iou_reward": 0.5251163462401998, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0406211614608765, + "rewards/thk_ans_format_reward": 1.0, + "step": 1322, + "think_completion_length": 5.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.65625, + "epoch": 4.4688026981450255, + "grad_norm": 48.40569580023704, + "kl": 0.51953125, + "learning_rate": 6.275337837837837e-07, + "loss": 0.0005, + "reward": 3.138838768005371, + "reward_std": 0.06746555864810944, + "rewards/final_reward": 0.777066802693318, + "rewards/mask_iou_reward": 0.388533401346659, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1388386487960815, + "rewards/thk_ans_format_reward": 1.0, + "step": 1323, + "think_completion_length": 7.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.23958587646484, + "epoch": 4.472175379426644, + "grad_norm": 10.429145624903171, + "kl": 0.44140625, + "learning_rate": 6.272522522522522e-07, + "loss": 0.0005, + "reward": 3.3782697916030884, + "reward_std": 0.07179485633969307, + "rewards/final_reward": 1.6552169970137809, + "rewards/mask_iou_reward": 0.8276084985068904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3782697319984436, + "rewards/thk_ans_format_reward": 1.0, + "step": 1324, + "think_completion_length": 7.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.53125, + "epoch": 4.475548060708263, + "grad_norm": 17.4447687740572, + "kl": 0.400390625, + "learning_rate": 6.269707207207206e-07, + "loss": 0.0004, + "reward": 3.270228624343872, + "reward_std": 0.04852992668747902, + "rewards/final_reward": 1.771550318762778, + "rewards/mask_iou_reward": 0.885775159381389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2702285051345825, + "rewards/thk_ans_format_reward": 1.0, + "step": 1325, + "think_completion_length": 5.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.36458587646484, + "epoch": 4.4789207419898815, + "grad_norm": 9.387307475222283, + "kl": 0.439453125, + "learning_rate": 6.266891891891891e-07, + "loss": 0.0004, + "reward": 3.5763256549835205, + "reward_std": 0.11707734689116478, + "rewards/final_reward": 1.6630674648210952, + "rewards/mask_iou_reward": 0.8315337324105476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.576325535774231, + "rewards/thk_ans_format_reward": 1.0, + "step": 1326, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.89584350585938, + "epoch": 4.482293423271501, + "grad_norm": 7.513275324902006, + "kl": 0.3955078125, + "learning_rate": 6.264076576576576e-07, + "loss": 0.0004, + "reward": 3.507278084754944, + "reward_std": 0.08537270268425345, + "rewards/final_reward": 1.874773920447942, + "rewards/mask_iou_reward": 0.937386960223971, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.507278025150299, + "rewards/thk_ans_format_reward": 1.0, + "step": 1327, + "think_completion_length": 5.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.0416717529297, + "epoch": 4.48566610455312, + "grad_norm": 11.916571704103813, + "kl": 0.4365234375, + "learning_rate": 6.261261261261261e-07, + "loss": 0.0004, + "reward": 3.7991596460342407, + "reward_std": 0.07272768579423428, + "rewards/final_reward": 1.9099894526726762, + "rewards/mask_iou_reward": 0.9549947263363381, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7991596460342407, + "rewards/thk_ans_format_reward": 1.0, + "step": 1328, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.71875, + "epoch": 4.4890387858347385, + "grad_norm": 24.874831175357656, + "kl": 0.419921875, + "learning_rate": 6.258445945945946e-07, + "loss": 0.0004, + "reward": 3.59783673286438, + "reward_std": 0.06748372502624989, + "rewards/final_reward": 1.6861565863299195, + "rewards/mask_iou_reward": 0.8430782931649597, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5978366136550903, + "rewards/thk_ans_format_reward": 1.0, + "step": 1329, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.40625, + "epoch": 4.492411467116358, + "grad_norm": 28.278413944207397, + "kl": 0.4697265625, + "learning_rate": 6.25563063063063e-07, + "loss": 0.0005, + "reward": 3.4091928005218506, + "reward_std": 0.18366630002856255, + "rewards/final_reward": 1.167090611647462, + "rewards/mask_iou_reward": 0.583545305823731, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4300260543823242, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1330, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.67708587646484, + "epoch": 4.495784148397976, + "grad_norm": 9.278625107287716, + "kl": 0.3818359375, + "learning_rate": 6.252815315315315e-07, + "loss": 0.0004, + "reward": 2.7817625999450684, + "reward_std": 0.47148652374744415, + "rewards/final_reward": 0.9396521678417691, + "rewards/mask_iou_reward": 0.46982608392088454, + "rewards/sam_format_reward": 0.8750000298023224, + "rewards/sam_reward_func_ultra": 1.031762421131134, + "rewards/thk_ans_format_reward": 0.8750000298023224, + "step": 1331, + "think_completion_length": 5.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0729217529297, + "epoch": 4.499156829679595, + "grad_norm": 13.909337670026057, + "kl": 0.46484375, + "learning_rate": 6.249999999999999e-07, + "loss": 0.0005, + "reward": 3.4207370281219482, + "reward_std": 0.1648988574743271, + "rewards/final_reward": 1.8079818096558335, + "rewards/mask_iou_reward": 0.9039909048279168, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4207369089126587, + "rewards/thk_ans_format_reward": 1.0, + "step": 1332, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.56250762939453, + "epoch": 4.502529510961214, + "grad_norm": 12.323677907036371, + "kl": 0.4287109375, + "learning_rate": 6.247184684684684e-07, + "loss": 0.0004, + "reward": 3.5210859775543213, + "reward_std": 0.14288469403982162, + "rewards/final_reward": 1.5541338166677165, + "rewards/mask_iou_reward": 0.7770669083338583, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5210858583450317, + "rewards/thk_ans_format_reward": 1.0, + "step": 1333, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.07291793823242, + "epoch": 4.505902192242833, + "grad_norm": 13.204691731923932, + "kl": 0.4228515625, + "learning_rate": 6.244369369369369e-07, + "loss": 0.0004, + "reward": 3.0131133794784546, + "reward_std": 0.10102058947086334, + "rewards/final_reward": 1.1227487359589867, + "rewards/mask_iou_reward": 0.5613743679794934, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0235299468040466, + "rewards/thk_ans_format_reward": 1.0, + "step": 1334, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.375, + "epoch": 4.509274873524452, + "grad_norm": 10.704382763607255, + "kl": 0.4423828125, + "learning_rate": 6.241554054054053e-07, + "loss": 0.0004, + "reward": 3.5165557861328125, + "reward_std": 0.1345103308558464, + "rewards/final_reward": 1.8046991433654505, + "rewards/mask_iou_reward": 0.9023495716827252, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5165559649467468, + "rewards/thk_ans_format_reward": 1.0, + "step": 1335, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.70833587646484, + "epoch": 4.512647554806071, + "grad_norm": 31.840986007738767, + "kl": 0.4755859375, + "learning_rate": 6.238738738738738e-07, + "loss": 0.0005, + "reward": 3.4962968826293945, + "reward_std": 0.15561959147453308, + "rewards/final_reward": 1.4510761611293062, + "rewards/mask_iou_reward": 0.7255380805646531, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4962967038154602, + "rewards/thk_ans_format_reward": 1.0, + "step": 1336, + "think_completion_length": 6.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.47916793823242, + "epoch": 4.51602023608769, + "grad_norm": 10.79087014899047, + "kl": 0.4814453125, + "learning_rate": 6.235923423423422e-07, + "loss": 0.0005, + "reward": 3.343773126602173, + "reward_std": 0.031131713651120663, + "rewards/final_reward": 0.8331768862643368, + "rewards/mask_iou_reward": 0.4165884431321684, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3437729477882385, + "rewards/thk_ans_format_reward": 1.0, + "step": 1337, + "think_completion_length": 7.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.6875, + "epoch": 4.519392917369308, + "grad_norm": 7.266833367822084, + "kl": 0.44921875, + "learning_rate": 6.233108108108108e-07, + "loss": 0.0004, + "reward": 3.615713596343994, + "reward_std": 0.08736434578895569, + "rewards/final_reward": 1.7009432890917302, + "rewards/mask_iou_reward": 0.8504716445458651, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.615713357925415, + "rewards/thk_ans_format_reward": 1.0, + "step": 1338, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.63541793823242, + "epoch": 4.522765598650928, + "grad_norm": 36.82887927583875, + "kl": 0.451171875, + "learning_rate": 6.230292792792793e-07, + "loss": 0.0005, + "reward": 3.28295361995697, + "reward_std": 0.2342146709561348, + "rewards/final_reward": 1.0344168897945436, + "rewards/mask_iou_reward": 0.5172084448972718, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.345453679561615, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1339, + "think_completion_length": 6.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.67708587646484, + "epoch": 4.526138279932546, + "grad_norm": 10.016939819024225, + "kl": 0.451171875, + "learning_rate": 6.227477477477477e-07, + "loss": 0.0005, + "reward": 3.27169930934906, + "reward_std": 0.14648236706852913, + "rewards/final_reward": 1.2877338441051873, + "rewards/mask_iou_reward": 0.6438669220525937, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2716993689537048, + "rewards/thk_ans_format_reward": 1.0, + "step": 1340, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.73958587646484, + "epoch": 4.529510961214165, + "grad_norm": 10.741459866609754, + "kl": 0.5107421875, + "learning_rate": 6.224662162162162e-07, + "loss": 0.0005, + "reward": 3.593325614929199, + "reward_std": 0.1311676874756813, + "rewards/final_reward": 1.751861304958525, + "rewards/mask_iou_reward": 0.8759306524792625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5933258533477783, + "rewards/thk_ans_format_reward": 1.0, + "step": 1341, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.81250762939453, + "epoch": 4.532883642495785, + "grad_norm": 18.01725136665382, + "kl": 1.0830078125, + "learning_rate": 6.221846846846847e-07, + "loss": 0.0011, + "reward": 3.5322015285491943, + "reward_std": 0.21869247313588858, + "rewards/final_reward": 1.128467963366134, + "rewards/mask_iou_reward": 0.564233981683067, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5530345439910889, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1342, + "think_completion_length": 6.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.86458587646484, + "epoch": 4.536256323777403, + "grad_norm": 31.487359076929582, + "kl": 0.4267578125, + "learning_rate": 6.219031531531531e-07, + "loss": 0.0004, + "reward": 3.56356418132782, + "reward_std": 0.06110507994890213, + "rewards/final_reward": 1.6827129801909355, + "rewards/mask_iou_reward": 0.8413564900954678, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5635643601417542, + "rewards/thk_ans_format_reward": 1.0, + "step": 1343, + "think_completion_length": 6.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.45833587646484, + "epoch": 4.539629005059022, + "grad_norm": 11.837423389388698, + "kl": 0.4384765625, + "learning_rate": 6.216216216216216e-07, + "loss": 0.0004, + "reward": 3.5896217823028564, + "reward_std": 0.1191295669414103, + "rewards/final_reward": 1.7449539403166883, + "rewards/mask_iou_reward": 0.8724769701583441, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.589621901512146, + "rewards/thk_ans_format_reward": 1.0, + "step": 1344, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.58333587646484, + "epoch": 4.543001686340641, + "grad_norm": 9.16782009417388, + "kl": 0.4365234375, + "learning_rate": 6.2134009009009e-07, + "loss": 0.0004, + "reward": 3.5739206075668335, + "reward_std": 0.03104757610708475, + "rewards/final_reward": 1.2965923239817678, + "rewards/mask_iou_reward": 0.6482961619908839, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5739206671714783, + "rewards/thk_ans_format_reward": 1.0, + "step": 1345, + "think_completion_length": 6.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.95833587646484, + "epoch": 4.54637436762226, + "grad_norm": 12.876924496797981, + "kl": 0.41796875, + "learning_rate": 6.210585585585585e-07, + "loss": 0.0004, + "reward": 3.4988181591033936, + "reward_std": 0.1466284692287445, + "rewards/final_reward": 1.6481023773213026, + "rewards/mask_iou_reward": 0.8240511886606513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4988181591033936, + "rewards/thk_ans_format_reward": 1.0, + "step": 1346, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.80208587646484, + "epoch": 4.549747048903878, + "grad_norm": 8.330779387661776, + "kl": 0.41796875, + "learning_rate": 6.20777027027027e-07, + "loss": 0.0004, + "reward": 3.434284806251526, + "reward_std": 0.1272013932466507, + "rewards/final_reward": 1.7430539413468273, + "rewards/mask_iou_reward": 0.8715269706734137, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4342848062515259, + "rewards/thk_ans_format_reward": 1.0, + "step": 1347, + "think_completion_length": 6.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.30208587646484, + "epoch": 4.5531197301854975, + "grad_norm": 11.60438500151872, + "kl": 0.392578125, + "learning_rate": 6.204954954954955e-07, + "loss": 0.0004, + "reward": 3.43938946723938, + "reward_std": 0.10462233331054449, + "rewards/final_reward": 1.2083157667735134, + "rewards/mask_iou_reward": 0.6041578833867567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4393895864486694, + "rewards/thk_ans_format_reward": 1.0, + "step": 1348, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.23958587646484, + "epoch": 4.556492411467117, + "grad_norm": 8.767250500131237, + "kl": 0.5, + "learning_rate": 6.20213963963964e-07, + "loss": 0.0005, + "reward": 3.2020862102508545, + "reward_std": 0.15238425135612488, + "rewards/final_reward": 1.4537183116413868, + "rewards/mask_iou_reward": 0.7268591558206934, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2020861506462097, + "rewards/thk_ans_format_reward": 1.0, + "step": 1349, + "think_completion_length": 7.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.52083587646484, + "epoch": 4.559865092748735, + "grad_norm": 9.766717118278587, + "kl": 0.5546875, + "learning_rate": 6.199324324324324e-07, + "loss": 0.0006, + "reward": 3.612934708595276, + "reward_std": 0.06734197214245796, + "rewards/final_reward": 1.7688350117457257, + "rewards/mask_iou_reward": 0.8844175058728628, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.612934648990631, + "rewards/thk_ans_format_reward": 1.0, + "step": 1350, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.85416793823242, + "epoch": 4.5632377740303545, + "grad_norm": 8.70242430872768, + "kl": 0.51171875, + "learning_rate": 6.196509009009009e-07, + "loss": 0.0005, + "reward": 3.0413984060287476, + "reward_std": 0.26274920254945755, + "rewards/final_reward": 1.7419171839168266, + "rewards/mask_iou_reward": 0.8709585919584133, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0622316002845764, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1351, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.37500762939453, + "epoch": 4.566610455311973, + "grad_norm": 11.112925150548403, + "kl": 0.4404296875, + "learning_rate": 6.193693693693694e-07, + "loss": 0.0004, + "reward": 3.1636022329330444, + "reward_std": 0.1556643471121788, + "rewards/final_reward": 1.5595026733749058, + "rewards/mask_iou_reward": 0.7797513366874529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.163602352142334, + "rewards/thk_ans_format_reward": 1.0, + "step": 1352, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5104217529297, + "epoch": 4.569983136593592, + "grad_norm": 20.78715812576491, + "kl": 0.41796875, + "learning_rate": 6.190878378378378e-07, + "loss": 0.0004, + "reward": 3.53654682636261, + "reward_std": 0.1526901237666607, + "rewards/final_reward": 1.8200043833348483, + "rewards/mask_iou_reward": 0.9100021916674241, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5365468263626099, + "rewards/thk_ans_format_reward": 1.0, + "step": 1353, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.4479217529297, + "epoch": 4.5733558178752105, + "grad_norm": 17.193569910426874, + "kl": 0.4052734375, + "learning_rate": 6.188063063063063e-07, + "loss": 0.0004, + "reward": 3.4392212629318237, + "reward_std": 0.15608344972133636, + "rewards/final_reward": 1.1345150048448946, + "rewards/mask_iou_reward": 0.5672575024224473, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4392213821411133, + "rewards/thk_ans_format_reward": 1.0, + "step": 1354, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.53125762939453, + "epoch": 4.57672849915683, + "grad_norm": 10.423335221856972, + "kl": 0.453125, + "learning_rate": 6.185247747747748e-07, + "loss": 0.0005, + "reward": 3.3039698600769043, + "reward_std": 0.11607521027326584, + "rewards/final_reward": 1.2098033741665402, + "rewards/mask_iou_reward": 0.6049016870832701, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3039699792861938, + "rewards/thk_ans_format_reward": 1.0, + "step": 1355, + "think_completion_length": 7.166666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.59375762939453, + "epoch": 4.580101180438449, + "grad_norm": 9.238720863822698, + "kl": 0.4248046875, + "learning_rate": 6.182432432432432e-07, + "loss": 0.0004, + "reward": 3.4512531757354736, + "reward_std": 0.10461808368563652, + "rewards/final_reward": 1.6735856737578172, + "rewards/mask_iou_reward": 0.8367928368789086, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4512531161308289, + "rewards/thk_ans_format_reward": 1.0, + "step": 1356, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.40625381469727, + "epoch": 4.583473861720067, + "grad_norm": 17.215769281190024, + "kl": 0.4697265625, + "learning_rate": 6.179617117117117e-07, + "loss": 0.0005, + "reward": 3.7004971504211426, + "reward_std": 0.05688786879181862, + "rewards/final_reward": 1.8174990980498769, + "rewards/mask_iou_reward": 0.9087495490249384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7004972100257874, + "rewards/thk_ans_format_reward": 1.0, + "step": 1357, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.33333587646484, + "epoch": 4.586846543001687, + "grad_norm": 23.536469707220405, + "kl": 0.517578125, + "learning_rate": 6.176801801801802e-07, + "loss": 0.0005, + "reward": 3.5313466787338257, + "reward_std": 0.1645505577325821, + "rewards/final_reward": 1.4728273831129242, + "rewards/mask_iou_reward": 0.7364136915564621, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5313466787338257, + "rewards/thk_ans_format_reward": 1.0, + "step": 1358, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.83333587646484, + "epoch": 4.590219224283305, + "grad_norm": 26.026138897910474, + "kl": 0.462890625, + "learning_rate": 6.173986486486487e-07, + "loss": 0.0005, + "reward": 3.2835291624069214, + "reward_std": 0.4738253206014633, + "rewards/final_reward": 1.3618498747858259, + "rewards/mask_iou_reward": 0.6809249373929129, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.3251959085464478, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1359, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.8229217529297, + "epoch": 4.593591905564924, + "grad_norm": 9.427231475053144, + "kl": 0.4375, + "learning_rate": 6.171171171171172e-07, + "loss": 0.0005, + "reward": 3.4778225421905518, + "reward_std": 0.34040002152323723, + "rewards/final_reward": 1.142595404825373, + "rewards/mask_iou_reward": 0.5712977024126865, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.581989347934723, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 1360, + "think_completion_length": 6.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.91666793823242, + "epoch": 4.596964586846543, + "grad_norm": 20.106595600779748, + "kl": 0.4150390625, + "learning_rate": 6.168355855855856e-07, + "loss": 0.0004, + "reward": 3.325606107711792, + "reward_std": 0.12346979975700378, + "rewards/final_reward": 1.5796197811707622, + "rewards/mask_iou_reward": 0.7898098905853811, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.325605869293213, + "rewards/thk_ans_format_reward": 1.0, + "step": 1361, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.89583587646484, + "epoch": 4.600337268128162, + "grad_norm": 8.00573726219941, + "kl": 0.515625, + "learning_rate": 6.16554054054054e-07, + "loss": 0.0005, + "reward": 3.4754496812820435, + "reward_std": 0.07985536009073257, + "rewards/final_reward": 0.9789360400835958, + "rewards/mask_iou_reward": 0.4894680200417979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4754494428634644, + "rewards/thk_ans_format_reward": 1.0, + "step": 1362, + "think_completion_length": 9.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.64583587646484, + "epoch": 4.60370994940978, + "grad_norm": 22.410320780410302, + "kl": 0.416015625, + "learning_rate": 6.162725225225224e-07, + "loss": 0.0004, + "reward": 3.6258490085601807, + "reward_std": 0.09473934583365917, + "rewards/final_reward": 1.558775344475041, + "rewards/mask_iou_reward": 0.7793876722375205, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6258489489555359, + "rewards/thk_ans_format_reward": 1.0, + "step": 1363, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.50000762939453, + "epoch": 4.6070826306914, + "grad_norm": 124.86281572745158, + "kl": 0.3779296875, + "learning_rate": 6.159909909909909e-07, + "loss": 0.0004, + "reward": 3.177658796310425, + "reward_std": 0.26666849851608276, + "rewards/final_reward": 1.3526290204311469, + "rewards/mask_iou_reward": 0.6763145102155734, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.198492169380188, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1364, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.5104217529297, + "epoch": 4.610455311973018, + "grad_norm": 26.843313276077073, + "kl": 0.4287109375, + "learning_rate": 6.157094594594594e-07, + "loss": 0.0004, + "reward": 3.3961726427078247, + "reward_std": 0.2304963506758213, + "rewards/final_reward": 1.4802080106953877, + "rewards/mask_iou_reward": 0.7401040053476938, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.45867258310318, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1365, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.55209350585938, + "epoch": 4.613827993254637, + "grad_norm": 9.086084177990507, + "kl": 0.390625, + "learning_rate": 6.154279279279278e-07, + "loss": 0.0004, + "reward": 3.3631885051727295, + "reward_std": 0.16303710266947746, + "rewards/final_reward": 1.048821753167892, + "rewards/mask_iou_reward": 0.524410876583946, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.36318838596344, + "rewards/thk_ans_format_reward": 1.0, + "step": 1366, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.7916717529297, + "epoch": 4.617200674536257, + "grad_norm": 5.668347516194788, + "kl": 0.4736328125, + "learning_rate": 6.151463963963963e-07, + "loss": 0.0005, + "reward": 3.458433151245117, + "reward_std": 0.13166548311710358, + "rewards/final_reward": 1.3981328072953145, + "rewards/mask_iou_reward": 0.6990664036476573, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4584329724311829, + "rewards/thk_ans_format_reward": 1.0, + "step": 1367, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.23959350585938, + "epoch": 4.620573355817875, + "grad_norm": 9.115177765492065, + "kl": 0.435546875, + "learning_rate": 6.148648648648648e-07, + "loss": 0.0004, + "reward": 3.657396078109741, + "reward_std": 0.12225788831710815, + "rewards/final_reward": 1.6637135937798875, + "rewards/mask_iou_reward": 0.8318567968899437, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6573959589004517, + "rewards/thk_ans_format_reward": 1.0, + "step": 1368, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.5729217529297, + "epoch": 4.623946037099494, + "grad_norm": 13.399152863153292, + "kl": 0.373046875, + "learning_rate": 6.145833333333333e-07, + "loss": 0.0004, + "reward": 3.6571751832962036, + "reward_std": 0.15783963352441788, + "rewards/final_reward": 1.6372354588546512, + "rewards/mask_iou_reward": 0.8186177294273256, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6780081987380981, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1369, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.56250762939453, + "epoch": 4.627318718381113, + "grad_norm": 9.441000443753211, + "kl": 0.39453125, + "learning_rate": 6.143018018018018e-07, + "loss": 0.0004, + "reward": 3.1151891946792603, + "reward_std": 0.34237323701381683, + "rewards/final_reward": 1.4576075154407924, + "rewards/mask_iou_reward": 0.7288037577203962, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.156855821609497, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1370, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.13542938232422, + "epoch": 4.630691399662732, + "grad_norm": 6.958805381363635, + "kl": 0.3974609375, + "learning_rate": 6.140202702702702e-07, + "loss": 0.0004, + "reward": 3.1723281145095825, + "reward_std": 0.41160689294338226, + "rewards/final_reward": 1.6349061756987167, + "rewards/mask_iou_reward": 0.8174530878493583, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.2348282039165497, + "rewards/thk_ans_format_reward": 0.9687500298023224, + "step": 1371, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.09375, + "epoch": 4.63406408094435, + "grad_norm": 5.81861981981799, + "kl": 0.3837890625, + "learning_rate": 6.137387387387387e-07, + "loss": 0.0004, + "reward": 3.5201971530914307, + "reward_std": 0.17057428415864706, + "rewards/final_reward": 1.792878767361338, + "rewards/mask_iou_reward": 0.896439383680669, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5410303473472595, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1372, + "think_completion_length": 6.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.44792938232422, + "epoch": 4.63743676222597, + "grad_norm": 22.718158938368305, + "kl": 0.4345703125, + "learning_rate": 6.134572072072072e-07, + "loss": 0.0004, + "reward": 3.308037281036377, + "reward_std": 0.3673863261938095, + "rewards/final_reward": 1.1550472984914382, + "rewards/mask_iou_reward": 0.5775236492457191, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.3705371618270874, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1373, + "think_completion_length": 6.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.1979217529297, + "epoch": 4.640809443507589, + "grad_norm": 9.528830517463504, + "kl": 0.412109375, + "learning_rate": 6.131756756756756e-07, + "loss": 0.0004, + "reward": 3.2841968536376953, + "reward_std": 0.3351758047938347, + "rewards/final_reward": 0.7085192624043266, + "rewards/mask_iou_reward": 0.3542596312021633, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.3675304651260376, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 1374, + "think_completion_length": 9.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.4895896911621, + "epoch": 4.644182124789207, + "grad_norm": 13.657284467532623, + "kl": 0.490234375, + "learning_rate": 6.128941441441441e-07, + "loss": 0.0005, + "reward": 3.5196319818496704, + "reward_std": 0.383151039481163, + "rewards/final_reward": 1.7381896307966391, + "rewards/mask_iou_reward": 0.8690948153983196, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5612985491752625, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1375, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.7291717529297, + "epoch": 4.6475548060708265, + "grad_norm": 7.919045895049307, + "kl": 0.4375, + "learning_rate": 6.126126126126125e-07, + "loss": 0.0004, + "reward": 3.3381824493408203, + "reward_std": 0.4271131902933121, + "rewards/final_reward": 1.0904384314160225, + "rewards/mask_iou_reward": 0.5452192157080112, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.379849135875702, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1376, + "think_completion_length": 6.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.21875762939453, + "epoch": 4.650927487352445, + "grad_norm": 8.13795156602028, + "kl": 0.41015625, + "learning_rate": 6.12331081081081e-07, + "loss": 0.0004, + "reward": 3.506591320037842, + "reward_std": 0.096873689442873, + "rewards/final_reward": 1.4166667266640771, + "rewards/mask_iou_reward": 0.7083333633320386, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5170079469680786, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1377, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.05208587646484, + "epoch": 4.654300168634064, + "grad_norm": 11.229457774273127, + "kl": 0.3623046875, + "learning_rate": 6.120495495495496e-07, + "loss": 0.0004, + "reward": 3.238800048828125, + "reward_std": 0.4140184000134468, + "rewards/final_reward": 1.082652828058263, + "rewards/mask_iou_reward": 0.5413264140291315, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.3429667353630066, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 1378, + "think_completion_length": 7.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.4791717529297, + "epoch": 4.6576728499156825, + "grad_norm": 10.954054113328358, + "kl": 0.44140625, + "learning_rate": 6.11768018018018e-07, + "loss": 0.0005, + "reward": 3.3927810192108154, + "reward_std": 0.27320830151438713, + "rewards/final_reward": 1.2286505609184124, + "rewards/mask_iou_reward": 0.6143252804592062, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4240307211875916, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1379, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.61459350585938, + "epoch": 4.661045531197302, + "grad_norm": 23.253338505109735, + "kl": 0.4228515625, + "learning_rate": 6.114864864864865e-07, + "loss": 0.0004, + "reward": 3.422883987426758, + "reward_std": 0.08235886693000793, + "rewards/final_reward": 1.3982151428079277, + "rewards/mask_iou_reward": 0.6991075714039638, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4228841662406921, + "rewards/thk_ans_format_reward": 1.0, + "step": 1380, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.30208587646484, + "epoch": 4.664418212478921, + "grad_norm": 9.892617377063814, + "kl": 0.4013671875, + "learning_rate": 6.112049549549549e-07, + "loss": 0.0004, + "reward": 3.4144232273101807, + "reward_std": 0.18493741005659103, + "rewards/final_reward": 1.6812816005396218, + "rewards/mask_iou_reward": 0.8406408002698109, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4352566003799438, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1381, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.68750762939453, + "epoch": 4.6677908937605395, + "grad_norm": 7.174720082473362, + "kl": 0.416015625, + "learning_rate": 6.109234234234234e-07, + "loss": 0.0004, + "reward": 3.52528715133667, + "reward_std": 0.04167925659567118, + "rewards/final_reward": 1.6017496577825279, + "rewards/mask_iou_reward": 0.8008748288912639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5252870917320251, + "rewards/thk_ans_format_reward": 1.0, + "step": 1382, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9791717529297, + "epoch": 4.671163575042159, + "grad_norm": 6.330696699691036, + "kl": 0.396484375, + "learning_rate": 6.106418918918919e-07, + "loss": 0.0004, + "reward": 3.408154010772705, + "reward_std": 0.10704836994409561, + "rewards/final_reward": 1.7884134848846656, + "rewards/mask_iou_reward": 0.8942067424423328, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4081538915634155, + "rewards/thk_ans_format_reward": 1.0, + "step": 1383, + "think_completion_length": 10.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.0416717529297, + "epoch": 4.674536256323777, + "grad_norm": 10.35051839748518, + "kl": 0.3828125, + "learning_rate": 6.103603603603603e-07, + "loss": 0.0004, + "reward": 3.3026130199432373, + "reward_std": 0.36125922203063965, + "rewards/final_reward": 1.2660540193293603, + "rewards/mask_iou_reward": 0.6330270096646802, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.3651129007339478, + "rewards/thk_ans_format_reward": 0.9687500298023224, + "step": 1384, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.2604217529297, + "epoch": 4.677908937605396, + "grad_norm": 8.441998141784056, + "kl": 0.400390625, + "learning_rate": 6.100788288288288e-07, + "loss": 0.0004, + "reward": 3.605415463447571, + "reward_std": 0.15792207419872284, + "rewards/final_reward": 1.3898948118203842, + "rewards/mask_iou_reward": 0.6949474059101921, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.626248836517334, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1385, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.1041717529297, + "epoch": 4.681281618887015, + "grad_norm": 7.202620273596892, + "kl": 0.5478515625, + "learning_rate": 6.097972972972972e-07, + "loss": 0.0006, + "reward": 3.121170401573181, + "reward_std": 0.3351920619606972, + "rewards/final_reward": 1.4420571140304157, + "rewards/mask_iou_reward": 0.7210285570152078, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.1836704313755035, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1386, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.1979217529297, + "epoch": 4.684654300168634, + "grad_norm": 10.040654884372035, + "kl": 0.4248046875, + "learning_rate": 6.095157657657657e-07, + "loss": 0.0004, + "reward": 3.5814239978790283, + "reward_std": 0.05522888898849487, + "rewards/final_reward": 1.634387054315411, + "rewards/mask_iou_reward": 0.8171935271577055, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5814239978790283, + "rewards/thk_ans_format_reward": 1.0, + "step": 1387, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.61458587646484, + "epoch": 4.688026981450253, + "grad_norm": 12.756624503560145, + "kl": 0.4873046875, + "learning_rate": 6.092342342342343e-07, + "loss": 0.0005, + "reward": 3.7236673831939697, + "reward_std": 0.1432779412716627, + "rewards/final_reward": 1.6886858155096605, + "rewards/mask_iou_reward": 0.8443429077548302, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7236673831939697, + "rewards/thk_ans_format_reward": 1.0, + "step": 1388, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.1666717529297, + "epoch": 4.691399662731872, + "grad_norm": 54.60026001446657, + "kl": 0.4697265625, + "learning_rate": 6.089527027027027e-07, + "loss": 0.0005, + "reward": 3.6017699241638184, + "reward_std": 0.12201762199401855, + "rewards/final_reward": 1.8574840466847982, + "rewards/mask_iou_reward": 0.9287420233423991, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.601769745349884, + "rewards/thk_ans_format_reward": 1.0, + "step": 1389, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.42708587646484, + "epoch": 4.694772344013491, + "grad_norm": 5.655602631361176, + "kl": 0.40234375, + "learning_rate": 6.086711711711712e-07, + "loss": 0.0004, + "reward": 3.3374961614608765, + "reward_std": 0.231519166380167, + "rewards/final_reward": 0.9501356463935893, + "rewards/mask_iou_reward": 0.47506782319679464, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.379162847995758, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1390, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.75000762939453, + "epoch": 4.698145025295109, + "grad_norm": 13.136877331272428, + "kl": 0.587890625, + "learning_rate": 6.083896396396397e-07, + "loss": 0.0006, + "reward": 3.544087290763855, + "reward_std": 0.07956103049218655, + "rewards/final_reward": 1.8201977647482337, + "rewards/mask_iou_reward": 0.9100988823741168, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5440873503684998, + "rewards/thk_ans_format_reward": 1.0, + "step": 1391, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.21875, + "epoch": 4.701517706576729, + "grad_norm": 9.365444962009972, + "kl": 0.3564453125, + "learning_rate": 6.081081081081081e-07, + "loss": 0.0004, + "reward": 3.349295735359192, + "reward_std": 0.09699325263500214, + "rewards/final_reward": 1.2572575228387097, + "rewards/mask_iou_reward": 0.6286287614193549, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3492956757545471, + "rewards/thk_ans_format_reward": 1.0, + "step": 1392, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0729217529297, + "epoch": 4.704890387858347, + "grad_norm": 18.814682442066502, + "kl": 0.62109375, + "learning_rate": 6.078265765765766e-07, + "loss": 0.0006, + "reward": 3.586809515953064, + "reward_std": 0.20244847238063812, + "rewards/final_reward": 1.6999612887206061, + "rewards/mask_iou_reward": 0.8499806443603031, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6076428890228271, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1393, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.5, + "epoch": 4.708263069139966, + "grad_norm": 12.611804363395612, + "kl": 0.513671875, + "learning_rate": 6.07545045045045e-07, + "loss": 0.0005, + "reward": 3.6310946941375732, + "reward_std": 0.1718064285814762, + "rewards/final_reward": 1.7734070202279941, + "rewards/mask_iou_reward": 0.8867035101139971, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6519279479980469, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1394, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.55208587646484, + "epoch": 4.7116357504215856, + "grad_norm": 12.701339132671743, + "kl": 0.400390625, + "learning_rate": 6.072635135135135e-07, + "loss": 0.0004, + "reward": 2.95338237285614, + "reward_std": 0.30559471249580383, + "rewards/final_reward": 1.1044576161481927, + "rewards/mask_iou_reward": 0.5522288080740964, + "rewards/sam_format_reward": 0.9270833432674408, + "rewards/sam_reward_func_ultra": 1.1096324920654297, + "rewards/thk_ans_format_reward": 0.9166666865348816, + "step": 1395, + "think_completion_length": 6.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.11458587646484, + "epoch": 4.715008431703204, + "grad_norm": 6.719133591335739, + "kl": 0.470703125, + "learning_rate": 6.06981981981982e-07, + "loss": 0.0005, + "reward": 3.0169700384140015, + "reward_std": 0.20718349143862724, + "rewards/final_reward": 0.8760415241116719, + "rewards/mask_iou_reward": 0.43802076205583595, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0169699788093567, + "rewards/thk_ans_format_reward": 1.0, + "step": 1396, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.93750762939453, + "epoch": 4.718381112984823, + "grad_norm": 13.94165033038778, + "kl": 0.484375, + "learning_rate": 6.067004504504504e-07, + "loss": 0.0005, + "reward": 3.592189908027649, + "reward_std": 0.10593907162547112, + "rewards/final_reward": 1.8382220583082103, + "rewards/mask_iou_reward": 0.9191110291541051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.592189610004425, + "rewards/thk_ans_format_reward": 1.0, + "step": 1397, + "think_completion_length": 10.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.59375, + "epoch": 4.721753794266442, + "grad_norm": 11.529880285191956, + "kl": 0.40234375, + "learning_rate": 6.06418918918919e-07, + "loss": 0.0004, + "reward": 3.289812684059143, + "reward_std": 0.07469776272773743, + "rewards/final_reward": 1.5201540594804328, + "rewards/mask_iou_reward": 0.7600770297402164, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2898123264312744, + "rewards/thk_ans_format_reward": 1.0, + "step": 1398, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.3854217529297, + "epoch": 4.725126475548061, + "grad_norm": 14.133682200224834, + "kl": 0.6181640625, + "learning_rate": 6.061373873873874e-07, + "loss": 0.0006, + "reward": 3.644963264465332, + "reward_std": 0.04531935974955559, + "rewards/final_reward": 1.778519984668355, + "rewards/mask_iou_reward": 0.8892599923341775, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6449633240699768, + "rewards/thk_ans_format_reward": 1.0, + "step": 1399, + "think_completion_length": 6.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.23958587646484, + "epoch": 4.728499156829679, + "grad_norm": 9.726824492131778, + "kl": 0.47265625, + "learning_rate": 6.058558558558559e-07, + "loss": 0.0005, + "reward": 3.299925208091736, + "reward_std": 0.08626305125653744, + "rewards/final_reward": 0.9017449776055927, + "rewards/mask_iou_reward": 0.45087248880279635, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2999252676963806, + "rewards/thk_ans_format_reward": 1.0, + "step": 1400, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.2291717529297, + "epoch": 4.7318718381112985, + "grad_norm": 9.027558807675197, + "kl": 0.4716796875, + "learning_rate": 6.055743243243244e-07, + "loss": 0.0005, + "reward": 3.4581342935562134, + "reward_std": 0.25569941103458405, + "rewards/final_reward": 1.6504690959386323, + "rewards/mask_iou_reward": 0.8252345479693162, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4789676666259766, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1401, + "think_completion_length": 5.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.1354217529297, + "epoch": 4.735244519392918, + "grad_norm": 19.171285365895933, + "kl": 0.55859375, + "learning_rate": 6.052927927927927e-07, + "loss": 0.0006, + "reward": 3.3989903926849365, + "reward_std": 0.04316495731472969, + "rewards/final_reward": 1.8268622792643263, + "rewards/mask_iou_reward": 0.9134311396321632, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.398990273475647, + "rewards/thk_ans_format_reward": 1.0, + "step": 1402, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.40625, + "epoch": 4.738617200674536, + "grad_norm": 10.350478375974465, + "kl": 0.431640625, + "learning_rate": 6.050112612612612e-07, + "loss": 0.0004, + "reward": 2.90904438495636, + "reward_std": 0.5360357463359833, + "rewards/final_reward": 0.6511499265661819, + "rewards/mask_iou_reward": 0.32557496328309093, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.0027942955493927, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 1403, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.46875, + "epoch": 4.7419898819561555, + "grad_norm": 12.387022468737703, + "kl": 0.4169921875, + "learning_rate": 6.047297297297296e-07, + "loss": 0.0004, + "reward": 3.4975666999816895, + "reward_std": 0.19361478835344315, + "rewards/final_reward": 1.6649348209872064, + "rewards/mask_iou_reward": 0.8324674104936032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.507983148097992, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1404, + "think_completion_length": 9.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.2291717529297, + "epoch": 4.745362563237774, + "grad_norm": 11.760509932057095, + "kl": 0.41796875, + "learning_rate": 6.044481981981981e-07, + "loss": 0.0005, + "reward": 3.5798988342285156, + "reward_std": 0.01169863436371088, + "rewards/final_reward": 1.5996546826614297, + "rewards/mask_iou_reward": 0.7998273413307149, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5798988342285156, + "rewards/thk_ans_format_reward": 1.0, + "step": 1405, + "think_completion_length": 11.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.21875762939453, + "epoch": 4.748735244519393, + "grad_norm": 11.042883825916984, + "kl": 0.4013671875, + "learning_rate": 6.041666666666666e-07, + "loss": 0.0004, + "reward": 3.4733498096466064, + "reward_std": 0.08175930939614773, + "rewards/final_reward": 1.1120204333436654, + "rewards/mask_iou_reward": 0.5560102166718327, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4733495712280273, + "rewards/thk_ans_format_reward": 1.0, + "step": 1406, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.17709350585938, + "epoch": 4.7521079258010115, + "grad_norm": 16.189002753636895, + "kl": 0.3818359375, + "learning_rate": 6.03885135135135e-07, + "loss": 0.0004, + "reward": 3.2562655210494995, + "reward_std": 0.13706044666469097, + "rewards/final_reward": 0.955771945037772, + "rewards/mask_iou_reward": 0.477885972518886, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.3083489537239075, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1407, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.17708587646484, + "epoch": 4.755480607082631, + "grad_norm": 10.681538589372371, + "kl": 0.5, + "learning_rate": 6.036036036036036e-07, + "loss": 0.0005, + "reward": 2.9784927368164062, + "reward_std": 0.16897240281105042, + "rewards/final_reward": 0.5949273793559162, + "rewards/mask_iou_reward": 0.2974636896779581, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.9889095425605774, + "rewards/thk_ans_format_reward": 1.0, + "step": 1408, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.20834350585938, + "epoch": 4.75885328836425, + "grad_norm": 7.624113833794622, + "kl": 0.380859375, + "learning_rate": 6.03322072072072e-07, + "loss": 0.0004, + "reward": 3.5433114767074585, + "reward_std": 0.17104174941778183, + "rewards/final_reward": 1.7751413765493096, + "rewards/mask_iou_reward": 0.8875706882746548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5433114767074585, + "rewards/thk_ans_format_reward": 1.0, + "step": 1409, + "think_completion_length": 6.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.84375762939453, + "epoch": 4.762225969645868, + "grad_norm": 5.271956909636253, + "kl": 0.470703125, + "learning_rate": 6.030405405405405e-07, + "loss": 0.0005, + "reward": 3.537356972694397, + "reward_std": 0.15852384641766548, + "rewards/final_reward": 1.7470706569995778, + "rewards/mask_iou_reward": 0.8735353284997889, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.537356972694397, + "rewards/thk_ans_format_reward": 1.0, + "step": 1410, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.52084350585938, + "epoch": 4.765598650927488, + "grad_norm": 60.182323202213084, + "kl": 0.451171875, + "learning_rate": 6.02759009009009e-07, + "loss": 0.0005, + "reward": 3.4663702249526978, + "reward_std": 0.39432157576084137, + "rewards/final_reward": 1.5362254960852642, + "rewards/mask_iou_reward": 0.7681127480426321, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5080366730690002, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1411, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.1041717529297, + "epoch": 4.768971332209106, + "grad_norm": 6.119217135940791, + "kl": 0.4111328125, + "learning_rate": 6.024774774774774e-07, + "loss": 0.0004, + "reward": 3.2781678438186646, + "reward_std": 0.1051796767860651, + "rewards/final_reward": 1.2576380901643076, + "rewards/mask_iou_reward": 0.6288190450821538, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2781679034233093, + "rewards/thk_ans_format_reward": 1.0, + "step": 1412, + "think_completion_length": 10.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.48959350585938, + "epoch": 4.772344013490725, + "grad_norm": 26.195928703726985, + "kl": 0.4921875, + "learning_rate": 6.021959459459459e-07, + "loss": 0.0005, + "reward": 3.382575273513794, + "reward_std": 0.12796253710985184, + "rewards/final_reward": 1.0492009290654098, + "rewards/mask_iou_reward": 0.5246004645327049, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3825750946998596, + "rewards/thk_ans_format_reward": 1.0, + "step": 1413, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.90625, + "epoch": 4.775716694772344, + "grad_norm": 8.068286894497055, + "kl": 0.361328125, + "learning_rate": 6.019144144144144e-07, + "loss": 0.0004, + "reward": 3.5492480993270874, + "reward_std": 0.13272245228290558, + "rewards/final_reward": 1.8132955229145629, + "rewards/mask_iou_reward": 0.9066477614572814, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5492480993270874, + "rewards/thk_ans_format_reward": 1.0, + "step": 1414, + "think_completion_length": 6.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.8854217529297, + "epoch": 4.779089376053963, + "grad_norm": 15.414406295867698, + "kl": 0.3779296875, + "learning_rate": 6.016328828828828e-07, + "loss": 0.0004, + "reward": 3.150847315788269, + "reward_std": 0.22631582617759705, + "rewards/final_reward": 1.752640437267114, + "rewards/mask_iou_reward": 0.876320218633557, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.182097315788269, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1415, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.68750762939453, + "epoch": 4.782462057335582, + "grad_norm": 11.22454762434086, + "kl": 0.4189453125, + "learning_rate": 6.013513513513513e-07, + "loss": 0.0004, + "reward": 3.4703203439712524, + "reward_std": 0.23415010422468185, + "rewards/final_reward": 1.4415937930558345, + "rewards/mask_iou_reward": 0.7207968965279172, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.5015702843666077, + "rewards/thk_ans_format_reward": 1.0, + "step": 1416, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.89583587646484, + "epoch": 4.785834738617201, + "grad_norm": 7.733062562592615, + "kl": 0.4580078125, + "learning_rate": 6.010698198198197e-07, + "loss": 0.0005, + "reward": 3.6019084453582764, + "reward_std": 0.07121942192316055, + "rewards/final_reward": 0.7838488290370744, + "rewards/mask_iou_reward": 0.3919244145185372, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6019084453582764, + "rewards/thk_ans_format_reward": 1.0, + "step": 1417, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.63542175292969, + "epoch": 4.78920741989882, + "grad_norm": 8.28171036478787, + "kl": 0.4697265625, + "learning_rate": 6.007882882882883e-07, + "loss": 0.0005, + "reward": 3.7588918209075928, + "reward_std": 0.015361388213932514, + "rewards/final_reward": 1.5930138037725348, + "rewards/mask_iou_reward": 0.7965069018862674, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7588915824890137, + "rewards/thk_ans_format_reward": 1.0, + "step": 1418, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0416717529297, + "epoch": 4.792580101180438, + "grad_norm": 7.398839807072086, + "kl": 0.431640625, + "learning_rate": 6.005067567567568e-07, + "loss": 0.0004, + "reward": 3.1072298288345337, + "reward_std": 0.22314369678497314, + "rewards/final_reward": 0.765689319467859, + "rewards/mask_iou_reward": 0.3828446597339295, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.1280630826950073, + "rewards/thk_ans_format_reward": 1.0, + "step": 1419, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.4166717529297, + "epoch": 4.795952782462058, + "grad_norm": 15.551402360333316, + "kl": 0.45703125, + "learning_rate": 6.002252252252252e-07, + "loss": 0.0005, + "reward": 3.691345453262329, + "reward_std": 0.0775423776358366, + "rewards/final_reward": 1.7338988018583557, + "rewards/mask_iou_reward": 0.8669494009291778, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.691345453262329, + "rewards/thk_ans_format_reward": 1.0, + "step": 1420, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.27083587646484, + "epoch": 4.799325463743676, + "grad_norm": 175.63575080043208, + "kl": 0.419921875, + "learning_rate": 5.999436936936937e-07, + "loss": 0.0004, + "reward": 3.6315362453460693, + "reward_std": 0.0602062102407217, + "rewards/final_reward": 1.7756131187543025, + "rewards/mask_iou_reward": 0.8878065593771512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6315361857414246, + "rewards/thk_ans_format_reward": 1.0, + "step": 1421, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.6666717529297, + "epoch": 4.802698145025295, + "grad_norm": 22.810287856480006, + "kl": 0.548828125, + "learning_rate": 5.996621621621621e-07, + "loss": 0.0005, + "reward": 3.733266234397888, + "reward_std": 0.06191633269190788, + "rewards/final_reward": 1.4032215032311224, + "rewards/mask_iou_reward": 0.7016107516155612, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7332662343978882, + "rewards/thk_ans_format_reward": 1.0, + "step": 1422, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.23958587646484, + "epoch": 4.806070826306914, + "grad_norm": 6.424184368298101, + "kl": 0.400390625, + "learning_rate": 5.993806306306306e-07, + "loss": 0.0004, + "reward": 3.5381908416748047, + "reward_std": 0.1660333201289177, + "rewards/final_reward": 1.5462451945725153, + "rewards/mask_iou_reward": 0.7731225972862577, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.548607885837555, + "rewards/thk_ans_format_reward": 1.0, + "step": 1423, + "think_completion_length": 9.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.58333587646484, + "epoch": 4.809443507588533, + "grad_norm": 10.449873295033866, + "kl": 0.560546875, + "learning_rate": 5.990990990990991e-07, + "loss": 0.0006, + "reward": 3.2715214490890503, + "reward_std": 0.05700042471289635, + "rewards/final_reward": 1.455750249986035, + "rewards/mask_iou_reward": 0.7278751249930175, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2715213894844055, + "rewards/thk_ans_format_reward": 1.0, + "step": 1424, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.05208587646484, + "epoch": 4.812816188870151, + "grad_norm": 13.190764748599955, + "kl": 0.513671875, + "learning_rate": 5.988175675675675e-07, + "loss": 0.0005, + "reward": 3.528395652770996, + "reward_std": 0.07729190587997437, + "rewards/final_reward": 1.8860675174581736, + "rewards/mask_iou_reward": 0.9430337587290868, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5283954739570618, + "rewards/thk_ans_format_reward": 1.0, + "step": 1425, + "think_completion_length": 10.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.34375, + "epoch": 4.8161888701517706, + "grad_norm": 5.8108007501621755, + "kl": 0.4501953125, + "learning_rate": 5.98536036036036e-07, + "loss": 0.0004, + "reward": 3.5327943563461304, + "reward_std": 0.14703011512756348, + "rewards/final_reward": 1.2986196006218722, + "rewards/mask_iou_reward": 0.6493098003109361, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5432112216949463, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1426, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.80208587646484, + "epoch": 4.81956155143339, + "grad_norm": 19.684142950430235, + "kl": 0.48046875, + "learning_rate": 5.982545045045045e-07, + "loss": 0.0005, + "reward": 3.745737075805664, + "reward_std": 0.04851808398962021, + "rewards/final_reward": 1.7769890462739044, + "rewards/mask_iou_reward": 0.8884945231369522, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7457371950149536, + "rewards/thk_ans_format_reward": 1.0, + "step": 1427, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.23959350585938, + "epoch": 4.822934232715008, + "grad_norm": 15.89792351747273, + "kl": 0.494140625, + "learning_rate": 5.97972972972973e-07, + "loss": 0.0005, + "reward": 3.3736536502838135, + "reward_std": 0.03965951129794121, + "rewards/final_reward": 0.9939795966808459, + "rewards/mask_iou_reward": 0.49698979834042295, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3736535906791687, + "rewards/thk_ans_format_reward": 1.0, + "step": 1428, + "think_completion_length": 10.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.3541717529297, + "epoch": 4.8263069139966275, + "grad_norm": 13.482933812720312, + "kl": 0.369140625, + "learning_rate": 5.976914414414415e-07, + "loss": 0.0004, + "reward": 3.585512638092041, + "reward_std": 0.06997106038033962, + "rewards/final_reward": 1.7870791411846856, + "rewards/mask_iou_reward": 0.8935395705923428, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5855128169059753, + "rewards/thk_ans_format_reward": 1.0, + "step": 1429, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.08333587646484, + "epoch": 4.829679595278246, + "grad_norm": 19.937100574525502, + "kl": 0.4462890625, + "learning_rate": 5.974099099099099e-07, + "loss": 0.0005, + "reward": 3.270692229270935, + "reward_std": 0.1783260926604271, + "rewards/final_reward": 1.8939412539997733, + "rewards/mask_iou_reward": 0.9469706269998867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.270692229270935, + "rewards/thk_ans_format_reward": 1.0, + "step": 1430, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.27084350585938, + "epoch": 4.833052276559865, + "grad_norm": 13.139160373627659, + "kl": 0.5029296875, + "learning_rate": 5.971283783783784e-07, + "loss": 0.0005, + "reward": 3.3824121952056885, + "reward_std": 0.21076303720474243, + "rewards/final_reward": 1.3698366206645685, + "rewards/mask_iou_reward": 0.6849183103322842, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3824121952056885, + "rewards/thk_ans_format_reward": 1.0, + "step": 1431, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.17708587646484, + "epoch": 4.8364249578414835, + "grad_norm": 8.548335447624954, + "kl": 0.47265625, + "learning_rate": 5.968468468468469e-07, + "loss": 0.0005, + "reward": 3.6033719778060913, + "reward_std": 0.20962823927402496, + "rewards/final_reward": 1.5506067036857636, + "rewards/mask_iou_reward": 0.7753033518428818, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6033719778060913, + "rewards/thk_ans_format_reward": 1.0, + "step": 1432, + "think_completion_length": 9.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5416717529297, + "epoch": 4.839797639123103, + "grad_norm": 7.23649023928491, + "kl": 0.380859375, + "learning_rate": 5.965653153153153e-07, + "loss": 0.0004, + "reward": 3.7130861282348633, + "reward_std": 0.1642559003084898, + "rewards/final_reward": 1.9265573826772417, + "rewards/mask_iou_reward": 0.9632786913386209, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.7339193224906921, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1433, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.80208587646484, + "epoch": 4.843170320404722, + "grad_norm": 12.446966451587427, + "kl": 0.38671875, + "learning_rate": 5.962837837837838e-07, + "loss": 0.0004, + "reward": 3.2934367656707764, + "reward_std": 0.1459517478942871, + "rewards/final_reward": 0.9782698315033347, + "rewards/mask_iou_reward": 0.48913491575166734, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2934367060661316, + "rewards/thk_ans_format_reward": 1.0, + "step": 1434, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.17708587646484, + "epoch": 4.8465430016863404, + "grad_norm": 8.99444927904533, + "kl": 0.4326171875, + "learning_rate": 5.960022522522522e-07, + "loss": 0.0004, + "reward": 3.3557018041610718, + "reward_std": 0.13160298392176628, + "rewards/final_reward": 1.91063537233244, + "rewards/mask_iou_reward": 0.95531768616622, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3557018637657166, + "rewards/thk_ans_format_reward": 1.0, + "step": 1435, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.34375762939453, + "epoch": 4.84991568296796, + "grad_norm": 8.347854214205052, + "kl": 0.3916015625, + "learning_rate": 5.957207207207207e-07, + "loss": 0.0004, + "reward": 3.199750542640686, + "reward_std": 0.10312426835298538, + "rewards/final_reward": 1.1158206020736225, + "rewards/mask_iou_reward": 0.5579103010368113, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.199750542640686, + "rewards/thk_ans_format_reward": 1.0, + "step": 1436, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.3854217529297, + "epoch": 4.853288364249578, + "grad_norm": 10.778873977904322, + "kl": 0.5888671875, + "learning_rate": 5.954391891891892e-07, + "loss": 0.0006, + "reward": 3.339724063873291, + "reward_std": 0.09653700515627861, + "rewards/final_reward": 1.8371860152172692, + "rewards/mask_iou_reward": 0.9185930076086346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3397240042686462, + "rewards/thk_ans_format_reward": 1.0, + "step": 1437, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.73958587646484, + "epoch": 4.856661045531197, + "grad_norm": 12.160043758137181, + "kl": 0.44921875, + "learning_rate": 5.951576576576577e-07, + "loss": 0.0005, + "reward": 3.5388941764831543, + "reward_std": 0.15586276352405548, + "rewards/final_reward": 1.532367128831696, + "rewards/mask_iou_reward": 0.766183564415848, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5388941764831543, + "rewards/thk_ans_format_reward": 1.0, + "step": 1438, + "think_completion_length": 9.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.48958587646484, + "epoch": 4.860033726812816, + "grad_norm": 10.08293778091881, + "kl": 0.4091796875, + "learning_rate": 5.948761261261262e-07, + "loss": 0.0004, + "reward": 3.389430522918701, + "reward_std": 0.09414727240800858, + "rewards/final_reward": 1.8392757522290495, + "rewards/mask_iou_reward": 0.9196378761145247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3894306421279907, + "rewards/thk_ans_format_reward": 1.0, + "step": 1439, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.80208587646484, + "epoch": 4.863406408094435, + "grad_norm": 7.986576988318299, + "kl": 0.51171875, + "learning_rate": 5.945945945945947e-07, + "loss": 0.0005, + "reward": 3.439391613006592, + "reward_std": 0.21128800511360168, + "rewards/final_reward": 1.0754574248509696, + "rewards/mask_iou_reward": 0.5377287124254848, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4393916130065918, + "rewards/thk_ans_format_reward": 1.0, + "step": 1440, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.2604217529297, + "epoch": 4.866779089376054, + "grad_norm": 17.852470988682573, + "kl": 0.4482421875, + "learning_rate": 5.94313063063063e-07, + "loss": 0.0005, + "reward": 3.288352608680725, + "reward_std": 0.12657052278518677, + "rewards/final_reward": 1.0126949471567626, + "rewards/mask_iou_reward": 0.5063474735783813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2883524894714355, + "rewards/thk_ans_format_reward": 1.0, + "step": 1441, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.36458587646484, + "epoch": 4.870151770657673, + "grad_norm": 5.582480495011308, + "kl": 0.48828125, + "learning_rate": 5.940315315315315e-07, + "loss": 0.0005, + "reward": 3.4604064226150513, + "reward_std": 0.053551677614450455, + "rewards/final_reward": 0.9912932618618167, + "rewards/mask_iou_reward": 0.4956466309309083, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.460406243801117, + "rewards/thk_ans_format_reward": 1.0, + "step": 1442, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.11458587646484, + "epoch": 4.873524451939292, + "grad_norm": 10.726826988763884, + "kl": 0.4052734375, + "learning_rate": 5.937499999999999e-07, + "loss": 0.0004, + "reward": 3.3676618337631226, + "reward_std": 0.10566180571913719, + "rewards/final_reward": 1.4990256896295167, + "rewards/mask_iou_reward": 0.7495128448147583, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3676618337631226, + "rewards/thk_ans_format_reward": 1.0, + "step": 1443, + "think_completion_length": 7.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.97916793823242, + "epoch": 4.87689713322091, + "grad_norm": 48.36969289130845, + "kl": 0.482421875, + "learning_rate": 5.934684684684684e-07, + "loss": 0.0005, + "reward": 3.831332206726074, + "reward_std": 0.05491393432021141, + "rewards/final_reward": 1.74433423874409, + "rewards/mask_iou_reward": 0.872167119372045, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.83133202791214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1444, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.48958587646484, + "epoch": 4.88026981450253, + "grad_norm": 8.382209616122617, + "kl": 0.44140625, + "learning_rate": 5.931869369369368e-07, + "loss": 0.0005, + "reward": 3.520848274230957, + "reward_std": 0.060753241181373596, + "rewards/final_reward": 1.5039464577605335, + "rewards/mask_iou_reward": 0.7519732288802667, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5208481550216675, + "rewards/thk_ans_format_reward": 1.0, + "step": 1445, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.73958587646484, + "epoch": 4.883642495784148, + "grad_norm": 14.90449864621606, + "kl": 0.435546875, + "learning_rate": 5.929054054054053e-07, + "loss": 0.0004, + "reward": 3.542003035545349, + "reward_std": 0.07900802604854107, + "rewards/final_reward": 1.5796984591925156, + "rewards/mask_iou_reward": 0.7898492295962578, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5420030355453491, + "rewards/thk_ans_format_reward": 1.0, + "step": 1446, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.125, + "epoch": 4.887015177065767, + "grad_norm": 17.873395622327013, + "kl": 0.439453125, + "learning_rate": 5.926238738738738e-07, + "loss": 0.0004, + "reward": 3.635874629020691, + "reward_std": 0.08757461607456207, + "rewards/final_reward": 1.907717467305979, + "rewards/mask_iou_reward": 0.9538587336529895, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6358747482299805, + "rewards/thk_ans_format_reward": 1.0, + "step": 1447, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.54166793823242, + "epoch": 4.8903878583473865, + "grad_norm": 9.895515957426772, + "kl": 0.4208984375, + "learning_rate": 5.923423423423422e-07, + "loss": 0.0004, + "reward": 3.433847188949585, + "reward_std": 0.06330831721425056, + "rewards/final_reward": 1.2834112597618437, + "rewards/mask_iou_reward": 0.6417056298809218, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4338471293449402, + "rewards/thk_ans_format_reward": 1.0, + "step": 1448, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.625, + "epoch": 4.893760539629005, + "grad_norm": 13.782892970677487, + "kl": 0.5009765625, + "learning_rate": 5.920608108108108e-07, + "loss": 0.0005, + "reward": 3.507854461669922, + "reward_std": 0.06417267397046089, + "rewards/final_reward": 1.227488244552132, + "rewards/mask_iou_reward": 0.613744122276066, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5078545808792114, + "rewards/thk_ans_format_reward": 1.0, + "step": 1449, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.1979217529297, + "epoch": 4.897133220910624, + "grad_norm": 11.388583621848147, + "kl": 0.486328125, + "learning_rate": 5.917792792792793e-07, + "loss": 0.0005, + "reward": 3.4151848554611206, + "reward_std": 0.3494800329208374, + "rewards/final_reward": 1.294598148507474, + "rewards/mask_iou_reward": 0.647299074253737, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.4881016612052917, + "rewards/thk_ans_format_reward": 0.96875, + "step": 1450, + "think_completion_length": 6.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.90625, + "epoch": 4.900505902192243, + "grad_norm": 13.374894758145272, + "kl": 0.400390625, + "learning_rate": 5.914977477477477e-07, + "loss": 0.0004, + "reward": 3.179789900779724, + "reward_std": 0.19759593158960342, + "rewards/final_reward": 1.0805071135765294, + "rewards/mask_iou_reward": 0.5402535567882647, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2006232142448425, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1451, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.84375, + "epoch": 4.903878583473862, + "grad_norm": 35.74475640711918, + "kl": 0.5029296875, + "learning_rate": 5.912162162162162e-07, + "loss": 0.0005, + "reward": 3.46237576007843, + "reward_std": 0.08721278607845306, + "rewards/final_reward": 1.8576137448561374, + "rewards/mask_iou_reward": 0.9288068724280687, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.462375819683075, + "rewards/thk_ans_format_reward": 1.0, + "step": 1452, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.11459350585938, + "epoch": 4.90725126475548, + "grad_norm": 10.470587615041929, + "kl": 0.431640625, + "learning_rate": 5.909346846846846e-07, + "loss": 0.0004, + "reward": 2.773538589477539, + "reward_std": 0.09802227839827538, + "rewards/final_reward": 0.6910804862719137, + "rewards/mask_iou_reward": 0.3455402431359568, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.7735386490821838, + "rewards/thk_ans_format_reward": 1.0, + "step": 1453, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.6354217529297, + "epoch": 4.9106239460370995, + "grad_norm": 9.188969334508629, + "kl": 0.4609375, + "learning_rate": 5.906531531531531e-07, + "loss": 0.0005, + "reward": 3.2502578496932983, + "reward_std": 0.19383827969431877, + "rewards/final_reward": 1.5927118283877038, + "rewards/mask_iou_reward": 0.7963559141938519, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2502577900886536, + "rewards/thk_ans_format_reward": 1.0, + "step": 1454, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.55209350585938, + "epoch": 4.913996627318719, + "grad_norm": 7.67389208349097, + "kl": 0.376953125, + "learning_rate": 5.903716216216216e-07, + "loss": 0.0004, + "reward": 3.3607468605041504, + "reward_std": 0.19103029370307922, + "rewards/final_reward": 1.5983448402015643, + "rewards/mask_iou_reward": 0.7991724201007822, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3607465624809265, + "rewards/thk_ans_format_reward": 1.0, + "step": 1455, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.1354217529297, + "epoch": 4.917369308600337, + "grad_norm": 9.04516507270446, + "kl": 0.4609375, + "learning_rate": 5.9009009009009e-07, + "loss": 0.0005, + "reward": 3.50595223903656, + "reward_std": 0.0903414785861969, + "rewards/final_reward": 1.373421515151684, + "rewards/mask_iou_reward": 0.686710757575842, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5059520602226257, + "rewards/thk_ans_format_reward": 1.0, + "step": 1456, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.80208587646484, + "epoch": 4.920741989881956, + "grad_norm": 15.530085309166886, + "kl": 0.4208984375, + "learning_rate": 5.898085585585585e-07, + "loss": 0.0004, + "reward": 3.2726866006851196, + "reward_std": 0.21423480100929737, + "rewards/final_reward": 0.6115067894808941, + "rewards/mask_iou_reward": 0.30575339474044705, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.293519675731659, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1457, + "think_completion_length": 10.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.89584350585938, + "epoch": 4.924114671163575, + "grad_norm": 14.647227093297408, + "kl": 0.3671875, + "learning_rate": 5.895270270270269e-07, + "loss": 0.0004, + "reward": 3.3387356996536255, + "reward_std": 0.21982141956686974, + "rewards/final_reward": 1.7117802352980065, + "rewards/mask_iou_reward": 0.8558901176490032, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3595690727233887, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1458, + "think_completion_length": 10.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.1979217529297, + "epoch": 4.927487352445194, + "grad_norm": 26.526587616046104, + "kl": 0.82421875, + "learning_rate": 5.892454954954955e-07, + "loss": 0.0008, + "reward": 3.442506194114685, + "reward_std": 0.19030030816793442, + "rewards/final_reward": 1.678046948788117, + "rewards/mask_iou_reward": 0.8390234743940586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4425063133239746, + "rewards/thk_ans_format_reward": 1.0, + "step": 1459, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.3125, + "epoch": 4.9308600337268125, + "grad_norm": 12.92104118288375, + "kl": 0.3779296875, + "learning_rate": 5.88963963963964e-07, + "loss": 0.0004, + "reward": 3.5892350673675537, + "reward_std": 0.15997378155589104, + "rewards/final_reward": 1.3809674531930873, + "rewards/mask_iou_reward": 0.6904837265965437, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.589235007762909, + "rewards/thk_ans_format_reward": 1.0, + "step": 1460, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.1979217529297, + "epoch": 4.934232715008432, + "grad_norm": 10.699530552849511, + "kl": 0.46484375, + "learning_rate": 5.886824324324324e-07, + "loss": 0.0005, + "reward": 3.7261245250701904, + "reward_std": 0.0693025141954422, + "rewards/final_reward": 1.8081499676675472, + "rewards/mask_iou_reward": 0.9040749838337736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7261244654655457, + "rewards/thk_ans_format_reward": 1.0, + "step": 1461, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.3541717529297, + "epoch": 4.937605396290051, + "grad_norm": 15.085039386224084, + "kl": 0.357421875, + "learning_rate": 5.884009009009009e-07, + "loss": 0.0004, + "reward": 3.5327210426330566, + "reward_std": 0.16988344490528107, + "rewards/final_reward": 1.0301080061032815, + "rewards/mask_iou_reward": 0.5150540030516407, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5327209830284119, + "rewards/thk_ans_format_reward": 1.0, + "step": 1462, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.12500762939453, + "epoch": 4.940978077571669, + "grad_norm": 33.75329709138452, + "kl": 0.572265625, + "learning_rate": 5.881193693693694e-07, + "loss": 0.0006, + "reward": 3.8426930904388428, + "reward_std": 0.07315381523221731, + "rewards/final_reward": 1.9806365523159024, + "rewards/mask_iou_reward": 0.9903182761579512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.842693030834198, + "rewards/thk_ans_format_reward": 1.0, + "step": 1463, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.6354217529297, + "epoch": 4.944350758853289, + "grad_norm": 8.529144022685584, + "kl": 0.3935546875, + "learning_rate": 5.878378378378378e-07, + "loss": 0.0004, + "reward": 3.2548428773880005, + "reward_std": 0.257319413125515, + "rewards/final_reward": 1.4339055670117853, + "rewards/mask_iou_reward": 0.7169527835058926, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2756760120391846, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1464, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.77084350585938, + "epoch": 4.947723440134907, + "grad_norm": 23.35727370278157, + "kl": 0.435546875, + "learning_rate": 5.875563063063063e-07, + "loss": 0.0004, + "reward": 3.2339917421340942, + "reward_std": 0.3385896082036197, + "rewards/final_reward": 1.5851923339722218, + "rewards/mask_iou_reward": 0.7925961669861109, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.3173249959945679, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 1465, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.1666717529297, + "epoch": 4.951096121416526, + "grad_norm": 10.193660842015277, + "kl": 0.37890625, + "learning_rate": 5.872747747747747e-07, + "loss": 0.0004, + "reward": 3.423218607902527, + "reward_std": 0.21466679126024246, + "rewards/final_reward": 1.043962518203048, + "rewards/mask_iou_reward": 0.521981259101524, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.464885175228119, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1466, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.92708587646484, + "epoch": 4.954468802698145, + "grad_norm": 10.053228747894176, + "kl": 0.4296875, + "learning_rate": 5.869932432432432e-07, + "loss": 0.0004, + "reward": 3.3755905628204346, + "reward_std": 0.48930785059928894, + "rewards/final_reward": 1.3053080176855865, + "rewards/mask_iou_reward": 0.6526540088427932, + "rewards/sam_format_reward": 0.9583333730697632, + "rewards/sam_reward_func_ultra": 1.458924114704132, + "rewards/thk_ans_format_reward": 0.9583333730697632, + "step": 1467, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.45834350585938, + "epoch": 4.957841483979764, + "grad_norm": 9.13263919322423, + "kl": 0.3623046875, + "learning_rate": 5.867117117117117e-07, + "loss": 0.0004, + "reward": 3.380911111831665, + "reward_std": 0.14448396861553192, + "rewards/final_reward": 0.9460272971516208, + "rewards/mask_iou_reward": 0.4730136485758104, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3809112310409546, + "rewards/thk_ans_format_reward": 1.0, + "step": 1468, + "think_completion_length": 9.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.64583587646484, + "epoch": 4.961214165261383, + "grad_norm": 12.851598480616993, + "kl": 0.548828125, + "learning_rate": 5.864301801801802e-07, + "loss": 0.0006, + "reward": 3.380996346473694, + "reward_std": 0.0686973761767149, + "rewards/final_reward": 1.735709440736965, + "rewards/mask_iou_reward": 0.8678547203684825, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.380996286869049, + "rewards/thk_ans_format_reward": 1.0, + "step": 1469, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.52084350585938, + "epoch": 4.964586846543002, + "grad_norm": 6.908240979975726, + "kl": 0.40234375, + "learning_rate": 5.861486486486487e-07, + "loss": 0.0004, + "reward": 3.3168845176696777, + "reward_std": 0.33022212237119675, + "rewards/final_reward": 1.392443548241194, + "rewards/mask_iou_reward": 0.696221774120597, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.4002178311347961, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 1470, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.40625, + "epoch": 4.967959527824621, + "grad_norm": 5.682180474450704, + "kl": 0.44921875, + "learning_rate": 5.858671171171171e-07, + "loss": 0.0004, + "reward": 3.4397724866867065, + "reward_std": 0.2822958081960678, + "rewards/final_reward": 1.8190369445658745, + "rewards/mask_iou_reward": 0.9095184722829373, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4814391732215881, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1471, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.62500762939453, + "epoch": 4.971332209106239, + "grad_norm": 4.219765484758162, + "kl": 0.427734375, + "learning_rate": 5.855855855855856e-07, + "loss": 0.0004, + "reward": 2.8198740482330322, + "reward_std": 0.29118700325489044, + "rewards/final_reward": 1.2281644554676672, + "rewards/mask_iou_reward": 0.6140822277338336, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 0.8407072424888611, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1472, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.50000762939453, + "epoch": 4.974704890387859, + "grad_norm": 17.90019864721094, + "kl": 0.3623046875, + "learning_rate": 5.853040540540541e-07, + "loss": 0.0004, + "reward": 3.3107402324676514, + "reward_std": 0.3046105355024338, + "rewards/final_reward": 1.2522745466299563, + "rewards/mask_iou_reward": 0.6261372733149781, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.3628233671188354, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1473, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.53125762939453, + "epoch": 4.978077571669477, + "grad_norm": 24.712536897200188, + "kl": 0.373046875, + "learning_rate": 5.850225225225225e-07, + "loss": 0.0004, + "reward": 3.2573235034942627, + "reward_std": 0.2703036963939667, + "rewards/final_reward": 1.5502094876731096, + "rewards/mask_iou_reward": 0.7751047438365548, + "rewards/sam_format_reward": 0.8750000298023224, + "rewards/sam_reward_func_ultra": 1.4864901304244995, + "rewards/thk_ans_format_reward": 0.8958333432674408, + "step": 1474, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.6979217529297, + "epoch": 4.981450252951096, + "grad_norm": 14.704284009815126, + "kl": 0.455078125, + "learning_rate": 5.84740990990991e-07, + "loss": 0.0005, + "reward": 3.2285962104797363, + "reward_std": 0.24824640899896622, + "rewards/final_reward": 1.611938276483454, + "rewards/mask_iou_reward": 0.805969138241727, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.2702626585960388, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1475, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.8854217529297, + "epoch": 4.9848229342327155, + "grad_norm": 6.630361314100409, + "kl": 0.4365234375, + "learning_rate": 5.844594594594594e-07, + "loss": 0.0004, + "reward": 3.642152428627014, + "reward_std": 0.11073607206344604, + "rewards/final_reward": 1.795592902342593, + "rewards/mask_iou_reward": 0.8977964511712965, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6421523690223694, + "rewards/thk_ans_format_reward": 1.0, + "step": 1476, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.1979217529297, + "epoch": 4.988195615514334, + "grad_norm": 13.41912847632758, + "kl": 0.544921875, + "learning_rate": 5.841779279279279e-07, + "loss": 0.0006, + "reward": 3.583902359008789, + "reward_std": 0.09717679023742676, + "rewards/final_reward": 1.813288793658495, + "rewards/mask_iou_reward": 0.9066443968292475, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.58390212059021, + "rewards/thk_ans_format_reward": 1.0, + "step": 1477, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.20833587646484, + "epoch": 4.991568296795953, + "grad_norm": 11.096277828348555, + "kl": 0.400390625, + "learning_rate": 5.838963963963964e-07, + "loss": 0.0004, + "reward": 3.486131191253662, + "reward_std": 0.14123235642910004, + "rewards/final_reward": 1.5401189557808541, + "rewards/mask_iou_reward": 0.7700594778904271, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4861310720443726, + "rewards/thk_ans_format_reward": 1.0, + "step": 1478, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.53125762939453, + "epoch": 4.9949409780775715, + "grad_norm": 31.457315141376977, + "kl": 0.41796875, + "learning_rate": 5.836148648648649e-07, + "loss": 0.0004, + "reward": 3.2034597396850586, + "reward_std": 0.222567617893219, + "rewards/final_reward": 1.3724964809920568, + "rewards/mask_iou_reward": 0.6862482404960284, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2138763666152954, + "rewards/thk_ans_format_reward": 1.0, + "step": 1479, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.15789794921875, + "epoch": 4.998313659359191, + "grad_norm": 16.960939494270033, + "kl": 0.431640625, + "learning_rate": 5.833333333333334e-07, + "loss": 0.0004, + "reward": 3.2109347581863403, + "reward_std": 0.05437912791967392, + "rewards/final_reward": 0.9108779879936213, + "rewards/mask_iou_reward": 0.45543899399681065, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2109346687793732, + "rewards/thk_ans_format_reward": 1.0, + "step": 1480, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.83333587646484, + "epoch": 5.003372681281619, + "grad_norm": 6.790820122613479, + "kl": 0.4619140625, + "learning_rate": 5.830518018018017e-07, + "loss": 0.0005, + "reward": 3.7827104330062866, + "reward_std": 0.08347970061004162, + "rewards/final_reward": 1.7737243143901735, + "rewards/mask_iou_reward": 0.8868621571950868, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.782710313796997, + "rewards/thk_ans_format_reward": 1.0, + "step": 1481, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.06250762939453, + "epoch": 5.006745362563238, + "grad_norm": 7.175459226342949, + "kl": 0.4384765625, + "learning_rate": 5.827702702702702e-07, + "loss": 0.0005, + "reward": 3.3402684926986694, + "reward_std": 0.05066767521202564, + "rewards/final_reward": 1.6801718841758047, + "rewards/mask_iou_reward": 0.8400859420879023, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.340268611907959, + "rewards/thk_ans_format_reward": 1.0, + "step": 1482, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.23958587646484, + "epoch": 5.010118043844857, + "grad_norm": 9.831226190656961, + "kl": 0.458984375, + "learning_rate": 5.824887387387387e-07, + "loss": 0.0005, + "reward": 3.557790994644165, + "reward_std": 0.07639824971556664, + "rewards/final_reward": 1.5340993469521953, + "rewards/mask_iou_reward": 0.7670496734760976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5577911138534546, + "rewards/thk_ans_format_reward": 1.0, + "step": 1483, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.61458587646484, + "epoch": 5.013490725126475, + "grad_norm": 10.464371874039816, + "kl": 0.447265625, + "learning_rate": 5.822072072072071e-07, + "loss": 0.0004, + "reward": 3.669014811515808, + "reward_std": 0.1482251062989235, + "rewards/final_reward": 1.6119222329711107, + "rewards/mask_iou_reward": 0.8059611164855554, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6690149903297424, + "rewards/thk_ans_format_reward": 1.0, + "step": 1484, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.87500762939453, + "epoch": 5.016863406408095, + "grad_norm": 26.869203809282517, + "kl": 0.482421875, + "learning_rate": 5.819256756756756e-07, + "loss": 0.0005, + "reward": 3.3169851303100586, + "reward_std": 0.13613472506403923, + "rewards/final_reward": 1.4533780060265058, + "rewards/mask_iou_reward": 0.7266890030132529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3274016380310059, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1485, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.21875381469727, + "epoch": 5.020236087689713, + "grad_norm": 8.257518991856715, + "kl": 0.4091796875, + "learning_rate": 5.816441441441441e-07, + "loss": 0.0004, + "reward": 3.219054937362671, + "reward_std": 0.12979388982057571, + "rewards/final_reward": 0.9981974144377622, + "rewards/mask_iou_reward": 0.4990987072188811, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.219054639339447, + "rewards/thk_ans_format_reward": 1.0, + "step": 1486, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.23958587646484, + "epoch": 5.023608768971332, + "grad_norm": 9.23841667282252, + "kl": 0.53515625, + "learning_rate": 5.813626126126125e-07, + "loss": 0.0005, + "reward": 3.609062910079956, + "reward_std": 0.024191563948988914, + "rewards/final_reward": 1.0073205239101837, + "rewards/mask_iou_reward": 0.5036602619550918, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6090629696846008, + "rewards/thk_ans_format_reward": 1.0, + "step": 1487, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.46875, + "epoch": 5.0269814502529515, + "grad_norm": 13.825371287240923, + "kl": 0.4609375, + "learning_rate": 5.81081081081081e-07, + "loss": 0.0005, + "reward": 3.4536292552948, + "reward_std": 0.28554829210042953, + "rewards/final_reward": 1.3948383521925767, + "rewards/mask_iou_reward": 0.6974191760962883, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4848793745040894, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1488, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0416717529297, + "epoch": 5.03035413153457, + "grad_norm": 12.392904541863194, + "kl": 0.4384765625, + "learning_rate": 5.807995495495495e-07, + "loss": 0.0005, + "reward": 3.346392273902893, + "reward_std": 0.13830474764108658, + "rewards/final_reward": 1.0681874939877285, + "rewards/mask_iou_reward": 0.5340937469938642, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.346392273902893, + "rewards/thk_ans_format_reward": 1.0, + "step": 1489, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.12500762939453, + "epoch": 5.033726812816189, + "grad_norm": 17.686435335007896, + "kl": 0.4306640625, + "learning_rate": 5.80518018018018e-07, + "loss": 0.0004, + "reward": 3.561290979385376, + "reward_std": 0.1316972728818655, + "rewards/final_reward": 1.6434079249000093, + "rewards/mask_iou_reward": 0.8217039624500047, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.571707546710968, + "rewards/thk_ans_format_reward": 1.0, + "step": 1490, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.4791717529297, + "epoch": 5.0370994940978076, + "grad_norm": 7.365495976354105, + "kl": 0.447265625, + "learning_rate": 5.802364864864865e-07, + "loss": 0.0005, + "reward": 3.3517701625823975, + "reward_std": 0.08861459605395794, + "rewards/final_reward": 1.3835822174002097, + "rewards/mask_iou_reward": 0.6917911087001049, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.351770043373108, + "rewards/thk_ans_format_reward": 1.0, + "step": 1491, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.54166793823242, + "epoch": 5.040472175379427, + "grad_norm": 6.787027590541496, + "kl": 0.3955078125, + "learning_rate": 5.799549549549549e-07, + "loss": 0.0004, + "reward": 3.3317559957504272, + "reward_std": 0.12025662325322628, + "rewards/final_reward": 1.7357438439015276, + "rewards/mask_iou_reward": 0.8678719219507638, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.331756055355072, + "rewards/thk_ans_format_reward": 1.0, + "step": 1492, + "think_completion_length": 6.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.53125762939453, + "epoch": 5.043844856661045, + "grad_norm": 12.941794978698631, + "kl": 0.82421875, + "learning_rate": 5.796734234234234e-07, + "loss": 0.0008, + "reward": 3.5921987295150757, + "reward_std": 0.1429782472550869, + "rewards/final_reward": 1.4563671336692763, + "rewards/mask_iou_reward": 0.7281835668346381, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5921986103057861, + "rewards/thk_ans_format_reward": 1.0, + "step": 1493, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.6875, + "epoch": 5.0472175379426645, + "grad_norm": 5.387916165116724, + "kl": 0.4296875, + "learning_rate": 5.793918918918918e-07, + "loss": 0.0004, + "reward": 3.305783987045288, + "reward_std": 0.1328853741288185, + "rewards/final_reward": 1.6526029425179214, + "rewards/mask_iou_reward": 0.8263014712589607, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3162005543708801, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1494, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.8541717529297, + "epoch": 5.050590219224283, + "grad_norm": 13.444814680608228, + "kl": 0.71875, + "learning_rate": 5.791103603603603e-07, + "loss": 0.0007, + "reward": 3.694078803062439, + "reward_std": 0.07504570484161377, + "rewards/final_reward": 1.7440251935785165, + "rewards/mask_iou_reward": 0.8720125967892582, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6940789222717285, + "rewards/thk_ans_format_reward": 1.0, + "step": 1495, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.79167175292969, + "epoch": 5.053962900505902, + "grad_norm": 9.212872385754208, + "kl": 0.4521484375, + "learning_rate": 5.788288288288288e-07, + "loss": 0.0005, + "reward": 3.4492886066436768, + "reward_std": 0.034415675327181816, + "rewards/final_reward": 1.0015886302556327, + "rewards/mask_iou_reward": 0.5007943151278164, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.449288547039032, + "rewards/thk_ans_format_reward": 1.0, + "step": 1496, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.31250762939453, + "epoch": 5.057335581787521, + "grad_norm": 10.653258799760168, + "kl": 0.4365234375, + "learning_rate": 5.785472972972972e-07, + "loss": 0.0004, + "reward": 3.4376277923583984, + "reward_std": 0.20355669409036636, + "rewards/final_reward": 1.239369106554923, + "rewards/mask_iou_reward": 0.6196845532774615, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.437627911567688, + "rewards/thk_ans_format_reward": 1.0, + "step": 1497, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.14583587646484, + "epoch": 5.06070826306914, + "grad_norm": 12.34666190314828, + "kl": 0.3916015625, + "learning_rate": 5.782657657657657e-07, + "loss": 0.0004, + "reward": 3.16068696975708, + "reward_std": 0.1016513011418283, + "rewards/final_reward": 1.4004916296674885, + "rewards/mask_iou_reward": 0.7002458148337443, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1606866717338562, + "rewards/thk_ans_format_reward": 1.0, + "step": 1498, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.88541793823242, + "epoch": 5.064080944350759, + "grad_norm": 7.722374138852124, + "kl": 0.427734375, + "learning_rate": 5.779842342342343e-07, + "loss": 0.0005, + "reward": 3.562274694442749, + "reward_std": 0.030085250735282898, + "rewards/final_reward": 1.356150771831553, + "rewards/mask_iou_reward": 0.6780753859157765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5622745752334595, + "rewards/thk_ans_format_reward": 1.0, + "step": 1499, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.27083587646484, + "epoch": 5.0674536256323774, + "grad_norm": 10.427274697363368, + "kl": 0.5087890625, + "learning_rate": 5.777027027027027e-07, + "loss": 0.0005, + "reward": 3.7217488288879395, + "reward_std": 0.09539500810205936, + "rewards/final_reward": 1.7929246014720737, + "rewards/mask_iou_reward": 0.8964623007360368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7217488288879395, + "rewards/thk_ans_format_reward": 1.0, + "step": 1500, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.62500762939453, + "epoch": 5.070826306913997, + "grad_norm": 14.243323096545671, + "kl": 0.4658203125, + "learning_rate": 5.774211711711712e-07, + "loss": 0.0005, + "reward": 3.5437698364257812, + "reward_std": 0.044545894488692284, + "rewards/final_reward": 1.3813278328414316, + "rewards/mask_iou_reward": 0.6906639164207158, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5437697768211365, + "rewards/thk_ans_format_reward": 1.0, + "step": 1501, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.71875762939453, + "epoch": 5.074198988195615, + "grad_norm": 45.828657304525194, + "kl": 0.4345703125, + "learning_rate": 5.771396396396396e-07, + "loss": 0.0004, + "reward": 3.3739442825317383, + "reward_std": 0.25980713963508606, + "rewards/final_reward": 1.3380072058453065, + "rewards/mask_iou_reward": 0.6690036029226533, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.384360909461975, + "rewards/thk_ans_format_reward": 1.0, + "step": 1502, + "think_completion_length": 10.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.04166793823242, + "epoch": 5.077571669477234, + "grad_norm": 11.483672966459704, + "kl": 0.4501953125, + "learning_rate": 5.768581081081081e-07, + "loss": 0.0005, + "reward": 3.358354330062866, + "reward_std": 0.06946107372641563, + "rewards/final_reward": 1.7414760491673258, + "rewards/mask_iou_reward": 0.8707380245836629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3583542108535767, + "rewards/thk_ans_format_reward": 1.0, + "step": 1503, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.2291717529297, + "epoch": 5.080944350758854, + "grad_norm": 12.504538881398963, + "kl": 0.3515625, + "learning_rate": 5.765765765765766e-07, + "loss": 0.0004, + "reward": 3.381642699241638, + "reward_std": 0.10796273127198219, + "rewards/final_reward": 1.477920895873604, + "rewards/mask_iou_reward": 0.738960447936802, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3816425800323486, + "rewards/thk_ans_format_reward": 1.0, + "step": 1504, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.73958587646484, + "epoch": 5.084317032040472, + "grad_norm": 37.92833349633823, + "kl": 0.435546875, + "learning_rate": 5.76295045045045e-07, + "loss": 0.0004, + "reward": 3.5506197214126587, + "reward_std": 0.11413155496120453, + "rewards/final_reward": 1.8354678340080102, + "rewards/mask_iou_reward": 0.9177339170040051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5506197214126587, + "rewards/thk_ans_format_reward": 1.0, + "step": 1505, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.76042175292969, + "epoch": 5.087689713322091, + "grad_norm": 23.119291373909714, + "kl": 0.5078125, + "learning_rate": 5.760135135135135e-07, + "loss": 0.0005, + "reward": 3.324196457862854, + "reward_std": 0.02308501861989498, + "rewards/final_reward": 0.940729508191458, + "rewards/mask_iou_reward": 0.470364754095729, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3241963982582092, + "rewards/thk_ans_format_reward": 1.0, + "step": 1506, + "think_completion_length": 10.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.42708587646484, + "epoch": 5.09106239460371, + "grad_norm": 8.347456566306933, + "kl": 0.4541015625, + "learning_rate": 5.757319819819819e-07, + "loss": 0.0005, + "reward": 3.4063055515289307, + "reward_std": 0.05413071811199188, + "rewards/final_reward": 1.8625934872891068, + "rewards/mask_iou_reward": 0.9312967436445534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4063054919242859, + "rewards/thk_ans_format_reward": 1.0, + "step": 1507, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.31250381469727, + "epoch": 5.094435075885329, + "grad_norm": 12.280488648254183, + "kl": 0.3994140625, + "learning_rate": 5.754504504504504e-07, + "loss": 0.0004, + "reward": 3.319761037826538, + "reward_std": 0.15890631824731827, + "rewards/final_reward": 1.2424586590711777, + "rewards/mask_iou_reward": 0.6212293295355888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3197609186172485, + "rewards/thk_ans_format_reward": 1.0, + "step": 1508, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.46875381469727, + "epoch": 5.097807757166947, + "grad_norm": 8.065481225765806, + "kl": 0.4130859375, + "learning_rate": 5.75168918918919e-07, + "loss": 0.0004, + "reward": 3.604817509651184, + "reward_std": 0.0404562558978796, + "rewards/final_reward": 1.7894732818901535, + "rewards/mask_iou_reward": 0.8947366409450768, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6048176884651184, + "rewards/thk_ans_format_reward": 1.0, + "step": 1509, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.58333587646484, + "epoch": 5.101180438448567, + "grad_norm": 14.184383237985477, + "kl": 0.412109375, + "learning_rate": 5.748873873873874e-07, + "loss": 0.0005, + "reward": 3.7557284832000732, + "reward_std": 0.0582825830206275, + "rewards/final_reward": 1.81418842535148, + "rewards/mask_iou_reward": 0.90709421267574, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.755728542804718, + "rewards/thk_ans_format_reward": 1.0, + "step": 1510, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.09375762939453, + "epoch": 5.104553119730186, + "grad_norm": 12.679298662198153, + "kl": 0.4423828125, + "learning_rate": 5.746058558558559e-07, + "loss": 0.0004, + "reward": 3.424263119697571, + "reward_std": 0.09918565303087234, + "rewards/final_reward": 1.429559166461599, + "rewards/mask_iou_reward": 0.7147795832307995, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4242631793022156, + "rewards/thk_ans_format_reward": 1.0, + "step": 1511, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.375, + "epoch": 5.107925801011804, + "grad_norm": 11.672708605208836, + "kl": 0.400390625, + "learning_rate": 5.743243243243243e-07, + "loss": 0.0004, + "reward": 3.4523242712020874, + "reward_std": 0.096237538382411, + "rewards/final_reward": 1.4747727593325761, + "rewards/mask_iou_reward": 0.7373863796662881, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4523241519927979, + "rewards/thk_ans_format_reward": 1.0, + "step": 1512, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.94791793823242, + "epoch": 5.1112984822934235, + "grad_norm": 8.775201882775074, + "kl": 0.4853515625, + "learning_rate": 5.740427927927928e-07, + "loss": 0.0005, + "reward": 3.6957037448883057, + "reward_std": 0.06799108721315861, + "rewards/final_reward": 1.7922714004424427, + "rewards/mask_iou_reward": 0.8961357002212214, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6957036256790161, + "rewards/thk_ans_format_reward": 1.0, + "step": 1513, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.67708587646484, + "epoch": 5.114671163575042, + "grad_norm": 9.70304061676113, + "kl": 0.4052734375, + "learning_rate": 5.737612612612613e-07, + "loss": 0.0005, + "reward": 3.1605879068374634, + "reward_std": 0.1523672752082348, + "rewards/final_reward": 1.2100002109274715, + "rewards/mask_iou_reward": 0.6050001054637357, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.171004593372345, + "rewards/thk_ans_format_reward": 1.0, + "step": 1514, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.83333969116211, + "epoch": 5.118043844856661, + "grad_norm": 19.815276506284814, + "kl": 0.46875, + "learning_rate": 5.734797297297297e-07, + "loss": 0.0005, + "reward": 3.6572550535202026, + "reward_std": 0.12338948994874954, + "rewards/final_reward": 1.8912148086599259, + "rewards/mask_iou_reward": 0.9456074043299629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6572552919387817, + "rewards/thk_ans_format_reward": 1.0, + "step": 1515, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.84375762939453, + "epoch": 5.12141652613828, + "grad_norm": 7.3037408566604975, + "kl": 0.4365234375, + "learning_rate": 5.731981981981982e-07, + "loss": 0.0004, + "reward": 3.1100199222564697, + "reward_std": 0.13631337881088257, + "rewards/final_reward": 0.6195152690810087, + "rewards/mask_iou_reward": 0.30975763454050437, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.110019862651825, + "rewards/thk_ans_format_reward": 1.0, + "step": 1516, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.09375, + "epoch": 5.124789207419899, + "grad_norm": 8.928813460588515, + "kl": 0.3798828125, + "learning_rate": 5.729166666666667e-07, + "loss": 0.0005, + "reward": 3.3842055797576904, + "reward_std": 0.03953359508886933, + "rewards/final_reward": 1.3692225022258984, + "rewards/mask_iou_reward": 0.6846112511129492, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3842054903507233, + "rewards/thk_ans_format_reward": 1.0, + "step": 1517, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.90625762939453, + "epoch": 5.128161888701518, + "grad_norm": 18.589885478274518, + "kl": 0.4638671875, + "learning_rate": 5.726351351351351e-07, + "loss": 0.0005, + "reward": 3.3922992944717407, + "reward_std": 0.24772672355175018, + "rewards/final_reward": 1.7797736823794406, + "rewards/mask_iou_reward": 0.8898868411897203, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4131325483322144, + "rewards/thk_ans_format_reward": 1.0, + "step": 1518, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.75, + "epoch": 5.1315345699831365, + "grad_norm": 12.183344084823798, + "kl": 0.435546875, + "learning_rate": 5.723536036036037e-07, + "loss": 0.0004, + "reward": 3.2745349407196045, + "reward_std": 0.051657652482390404, + "rewards/final_reward": 1.6897714946454898, + "rewards/mask_iou_reward": 0.8448857473227449, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2745347321033478, + "rewards/thk_ans_format_reward": 1.0, + "step": 1519, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.71875381469727, + "epoch": 5.134907251264756, + "grad_norm": 11.902841576797009, + "kl": 0.482421875, + "learning_rate": 5.720720720720721e-07, + "loss": 0.0005, + "reward": 3.6553049087524414, + "reward_std": 0.11148537695407867, + "rewards/final_reward": 1.5538138012328986, + "rewards/mask_iou_reward": 0.7769069006164493, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6553047895431519, + "rewards/thk_ans_format_reward": 1.0, + "step": 1520, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.8229217529297, + "epoch": 5.138279932546374, + "grad_norm": 16.633666084642627, + "kl": 0.42578125, + "learning_rate": 5.717905405405405e-07, + "loss": 0.0004, + "reward": 3.3416396379470825, + "reward_std": 0.10212976112961769, + "rewards/final_reward": 1.1319948253465024, + "rewards/mask_iou_reward": 0.5659974126732512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.341639518737793, + "rewards/thk_ans_format_reward": 1.0, + "step": 1521, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.57291793823242, + "epoch": 5.141652613827993, + "grad_norm": 10.582573874462401, + "kl": 0.4140625, + "learning_rate": 5.71509009009009e-07, + "loss": 0.0004, + "reward": 3.6125001907348633, + "reward_std": 0.09932881966233253, + "rewards/final_reward": 1.7526732780315073, + "rewards/mask_iou_reward": 0.8763366390157536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6125000715255737, + "rewards/thk_ans_format_reward": 1.0, + "step": 1522, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.54167175292969, + "epoch": 5.145025295109612, + "grad_norm": 7.333966642905111, + "kl": 0.423828125, + "learning_rate": 5.712274774774774e-07, + "loss": 0.0004, + "reward": 3.6571160554885864, + "reward_std": 0.09418375790119171, + "rewards/final_reward": 1.449667431419821, + "rewards/mask_iou_reward": 0.7248337157099105, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6571159362792969, + "rewards/thk_ans_format_reward": 1.0, + "step": 1523, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.15625762939453, + "epoch": 5.148397976391231, + "grad_norm": 12.455752046502456, + "kl": 0.400390625, + "learning_rate": 5.709459459459459e-07, + "loss": 0.0004, + "reward": 3.4153331518173218, + "reward_std": 0.08316674456000328, + "rewards/final_reward": 1.1740201997786626, + "rewards/mask_iou_reward": 0.5870100998893313, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.415333330631256, + "rewards/thk_ans_format_reward": 1.0, + "step": 1524, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.34375381469727, + "epoch": 5.15177065767285, + "grad_norm": 13.553438063454204, + "kl": 0.421875, + "learning_rate": 5.706644144144143e-07, + "loss": 0.0004, + "reward": 3.599404454231262, + "reward_std": 0.052878640592098236, + "rewards/final_reward": 1.3837828129699297, + "rewards/mask_iou_reward": 0.6918914064849648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5994043350219727, + "rewards/thk_ans_format_reward": 1.0, + "step": 1525, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.89583587646484, + "epoch": 5.155143338954469, + "grad_norm": 15.193464949430728, + "kl": 0.4140625, + "learning_rate": 5.703828828828828e-07, + "loss": 0.0004, + "reward": 3.3660353422164917, + "reward_std": 0.04505654610693455, + "rewards/final_reward": 1.9256488497998228, + "rewards/mask_iou_reward": 0.9628244248999114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3660351634025574, + "rewards/thk_ans_format_reward": 1.0, + "step": 1526, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.96875381469727, + "epoch": 5.158516020236088, + "grad_norm": 13.55265426027934, + "kl": 0.494140625, + "learning_rate": 5.701013513513513e-07, + "loss": 0.0005, + "reward": 3.4424548149108887, + "reward_std": 0.07838378101587296, + "rewards/final_reward": 1.836514503370909, + "rewards/mask_iou_reward": 0.9182572516854545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.442454755306244, + "rewards/thk_ans_format_reward": 1.0, + "step": 1527, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.75000381469727, + "epoch": 5.161888701517706, + "grad_norm": 18.989102473316567, + "kl": 0.5283203125, + "learning_rate": 5.698198198198197e-07, + "loss": 0.0005, + "reward": 3.1408311128616333, + "reward_std": 0.09010594710707664, + "rewards/final_reward": 1.893888014220626, + "rewards/mask_iou_reward": 0.946944007110313, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1408310234546661, + "rewards/thk_ans_format_reward": 1.0, + "step": 1528, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.80208587646484, + "epoch": 5.165261382799326, + "grad_norm": 65.82344999454938, + "kl": 0.50390625, + "learning_rate": 5.695382882882883e-07, + "loss": 0.0005, + "reward": 3.3837637901306152, + "reward_std": 0.05148115009069443, + "rewards/final_reward": 1.5014640082567738, + "rewards/mask_iou_reward": 0.7507320041283869, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.383763611316681, + "rewards/thk_ans_format_reward": 1.0, + "step": 1529, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.22916793823242, + "epoch": 5.168634064080944, + "grad_norm": 14.530501175090082, + "kl": 0.6435546875, + "learning_rate": 5.692567567567567e-07, + "loss": 0.0006, + "reward": 3.2932307720184326, + "reward_std": 0.2251843735575676, + "rewards/final_reward": 1.7689563760134575, + "rewards/mask_iou_reward": 0.8844781880067287, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3036476373672485, + "rewards/thk_ans_format_reward": 1.0, + "step": 1530, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.71875762939453, + "epoch": 5.172006745362563, + "grad_norm": 13.65500874924387, + "kl": 0.521484375, + "learning_rate": 5.689752252252252e-07, + "loss": 0.0006, + "reward": 3.4125406742095947, + "reward_std": 0.05845703464001417, + "rewards/final_reward": 0.88716912268446, + "rewards/mask_iou_reward": 0.44358456134223, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4125406742095947, + "rewards/thk_ans_format_reward": 1.0, + "step": 1531, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.27083587646484, + "epoch": 5.175379426644182, + "grad_norm": 12.40091004140753, + "kl": 0.9296875, + "learning_rate": 5.686936936936937e-07, + "loss": 0.0009, + "reward": 3.339430570602417, + "reward_std": 0.12155294232070446, + "rewards/final_reward": 1.3781666494968288, + "rewards/mask_iou_reward": 0.6890833247484144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3394304513931274, + "rewards/thk_ans_format_reward": 1.0, + "step": 1532, + "think_completion_length": 9.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.96875, + "epoch": 5.178752107925801, + "grad_norm": 11.079528878346789, + "kl": 0.5380859375, + "learning_rate": 5.684121621621621e-07, + "loss": 0.0006, + "reward": 3.565992593765259, + "reward_std": 0.17389854416251183, + "rewards/final_reward": 1.696084288793232, + "rewards/mask_iou_reward": 0.848042144396616, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5659927725791931, + "rewards/thk_ans_format_reward": 1.0, + "step": 1533, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.63542175292969, + "epoch": 5.18212478920742, + "grad_norm": 14.322200453536906, + "kl": 0.525390625, + "learning_rate": 5.681306306306306e-07, + "loss": 0.0005, + "reward": 3.643845319747925, + "reward_std": 0.06971078272908926, + "rewards/final_reward": 1.7220737777035455, + "rewards/mask_iou_reward": 0.8610368888517728, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6438450813293457, + "rewards/thk_ans_format_reward": 1.0, + "step": 1534, + "think_completion_length": 7.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.16667175292969, + "epoch": 5.185497470489039, + "grad_norm": 18.093270113097123, + "kl": 0.5693359375, + "learning_rate": 5.67849099099099e-07, + "loss": 0.0006, + "reward": 3.6941890716552734, + "reward_std": 0.3119666241109371, + "rewards/final_reward": 1.6503372209016867, + "rewards/mask_iou_reward": 0.8251686104508433, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6941890716552734, + "rewards/thk_ans_format_reward": 1.0, + "step": 1535, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.28125381469727, + "epoch": 5.188870151770658, + "grad_norm": 11.133834519601262, + "kl": 0.5888671875, + "learning_rate": 5.675675675675675e-07, + "loss": 0.0006, + "reward": 3.6762256622314453, + "reward_std": 0.10237638652324677, + "rewards/final_reward": 1.773332255036778, + "rewards/mask_iou_reward": 0.886666127518389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6762259006500244, + "rewards/thk_ans_format_reward": 1.0, + "step": 1536, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.66666793823242, + "epoch": 5.192242833052276, + "grad_norm": 10.01209463545946, + "kl": 0.55859375, + "learning_rate": 5.67286036036036e-07, + "loss": 0.0006, + "reward": 3.216148257255554, + "reward_std": 0.14021393656730652, + "rewards/final_reward": 1.220889013408144, + "rewards/mask_iou_reward": 0.610444506704072, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2161482572555542, + "rewards/thk_ans_format_reward": 1.0, + "step": 1537, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.36458587646484, + "epoch": 5.195615514333896, + "grad_norm": 22.298105118905134, + "kl": 0.453125, + "learning_rate": 5.670045045045044e-07, + "loss": 0.0004, + "reward": 3.0634961128234863, + "reward_std": 0.07166947051882744, + "rewards/final_reward": 0.3871498869639922, + "rewards/mask_iou_reward": 0.1935749434819961, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0634959936141968, + "rewards/thk_ans_format_reward": 1.0, + "step": 1538, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.05208587646484, + "epoch": 5.198988195615514, + "grad_norm": 16.616373424300722, + "kl": 0.49609375, + "learning_rate": 5.66722972972973e-07, + "loss": 0.0005, + "reward": 3.328110694885254, + "reward_std": 0.037119604647159576, + "rewards/final_reward": 0.7062539012306082, + "rewards/mask_iou_reward": 0.3531269506153041, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3281108736991882, + "rewards/thk_ans_format_reward": 1.0, + "step": 1539, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.4375, + "epoch": 5.202360876897133, + "grad_norm": 11.18647739665039, + "kl": 0.4912109375, + "learning_rate": 5.664414414414415e-07, + "loss": 0.0005, + "reward": 3.4070080518722534, + "reward_std": 0.03997157700359821, + "rewards/final_reward": 1.84481949763112, + "rewards/mask_iou_reward": 0.92240974881556, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4070080518722534, + "rewards/thk_ans_format_reward": 1.0, + "step": 1540, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5104217529297, + "epoch": 5.2057335581787525, + "grad_norm": 29.37877858635454, + "kl": 0.412109375, + "learning_rate": 5.661599099099099e-07, + "loss": 0.0004, + "reward": 3.5974971055984497, + "reward_std": 0.14647838473320007, + "rewards/final_reward": 1.7596759500227992, + "rewards/mask_iou_reward": 0.8798379750113996, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5974968671798706, + "rewards/thk_ans_format_reward": 1.0, + "step": 1541, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.54167175292969, + "epoch": 5.209106239460371, + "grad_norm": 6.73445212638002, + "kl": 0.4443359375, + "learning_rate": 5.658783783783784e-07, + "loss": 0.0005, + "reward": 3.4951417446136475, + "reward_std": 0.052374981343746185, + "rewards/final_reward": 1.64993442480199, + "rewards/mask_iou_reward": 0.824967212400995, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.495141625404358, + "rewards/thk_ans_format_reward": 1.0, + "step": 1542, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.11459350585938, + "epoch": 5.21247892074199, + "grad_norm": 19.1238333669945, + "kl": 0.4033203125, + "learning_rate": 5.655968468468468e-07, + "loss": 0.0004, + "reward": 2.794515371322632, + "reward_std": 0.2504820667090826, + "rewards/final_reward": 0.959297504557755, + "rewards/mask_iou_reward": 0.4796487522788775, + "rewards/sam_format_reward": 0.9270833432674408, + "rewards/sam_reward_func_ultra": 0.9299318194389343, + "rewards/thk_ans_format_reward": 0.9375, + "step": 1543, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.21875, + "epoch": 5.2158516020236085, + "grad_norm": 73.96611340269645, + "kl": 0.5205078125, + "learning_rate": 5.653153153153153e-07, + "loss": 0.0005, + "reward": 3.3519543409347534, + "reward_std": 0.1191567312926054, + "rewards/final_reward": 1.5246337668641923, + "rewards/mask_iou_reward": 0.7623168834320961, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3519543409347534, + "rewards/thk_ans_format_reward": 1.0, + "step": 1544, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.98958587646484, + "epoch": 5.219224283305228, + "grad_norm": 8.202103381403266, + "kl": 0.45703125, + "learning_rate": 5.650337837837838e-07, + "loss": 0.0005, + "reward": 3.384859323501587, + "reward_std": 0.0341465137898922, + "rewards/final_reward": 1.4621985641984132, + "rewards/mask_iou_reward": 0.7310992820992066, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3848593831062317, + "rewards/thk_ans_format_reward": 1.0, + "step": 1545, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.35416793823242, + "epoch": 5.222596964586846, + "grad_norm": 14.785637935429422, + "kl": 0.591796875, + "learning_rate": 5.647522522522522e-07, + "loss": 0.0006, + "reward": 3.454978108406067, + "reward_std": 0.11388104408979416, + "rewards/final_reward": 1.659704569059731, + "rewards/mask_iou_reward": 0.8298522845298655, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4549779295921326, + "rewards/thk_ans_format_reward": 1.0, + "step": 1546, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.27083587646484, + "epoch": 5.2259696458684655, + "grad_norm": 10.723802312148857, + "kl": 0.4326171875, + "learning_rate": 5.644707207207207e-07, + "loss": 0.0005, + "reward": 3.6628739833831787, + "reward_std": 0.09409919008612633, + "rewards/final_reward": 1.8867463281451706, + "rewards/mask_iou_reward": 0.9433731640725853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6628742218017578, + "rewards/thk_ans_format_reward": 1.0, + "step": 1547, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.32291793823242, + "epoch": 5.229342327150085, + "grad_norm": 31.792109466430873, + "kl": 0.3984375, + "learning_rate": 5.641891891891891e-07, + "loss": 0.0004, + "reward": 3.559706926345825, + "reward_std": 0.1302800141274929, + "rewards/final_reward": 1.3238463009011596, + "rewards/mask_iou_reward": 0.6619231504505798, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.55970698595047, + "rewards/thk_ans_format_reward": 1.0, + "step": 1548, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.1979217529297, + "epoch": 5.232715008431703, + "grad_norm": 23.73896689821121, + "kl": 0.5048828125, + "learning_rate": 5.639076576576577e-07, + "loss": 0.0005, + "reward": 3.1021581888198853, + "reward_std": 0.14375893399119377, + "rewards/final_reward": 0.6169975867768962, + "rewards/mask_iou_reward": 0.3084987933884481, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1021581292152405, + "rewards/thk_ans_format_reward": 1.0, + "step": 1549, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.72916793823242, + "epoch": 5.236087689713322, + "grad_norm": 21.491525250003136, + "kl": 0.4072265625, + "learning_rate": 5.636261261261262e-07, + "loss": 0.0004, + "reward": 3.6628674268722534, + "reward_std": 0.059761207550764084, + "rewards/final_reward": 1.7522080608301884, + "rewards/mask_iou_reward": 0.8761040304150942, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6628673076629639, + "rewards/thk_ans_format_reward": 1.0, + "step": 1550, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.02083587646484, + "epoch": 5.239460370994941, + "grad_norm": 7.610443172810163, + "kl": 0.4619140625, + "learning_rate": 5.633445945945946e-07, + "loss": 0.0005, + "reward": 3.4855546951293945, + "reward_std": 0.12474964559078217, + "rewards/final_reward": 1.6593411417752995, + "rewards/mask_iou_reward": 0.8296705708876497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4855546355247498, + "rewards/thk_ans_format_reward": 1.0, + "step": 1551, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.46875762939453, + "epoch": 5.24283305227656, + "grad_norm": 14.118379922011169, + "kl": 0.4345703125, + "learning_rate": 5.630630630630631e-07, + "loss": 0.0004, + "reward": 3.29573655128479, + "reward_std": 0.13444262370467186, + "rewards/final_reward": 1.6338925846834953, + "rewards/mask_iou_reward": 0.8169462923417476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.295736312866211, + "rewards/thk_ans_format_reward": 1.0, + "step": 1552, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.6666717529297, + "epoch": 5.246205733558178, + "grad_norm": 47.09683397724249, + "kl": 0.4892578125, + "learning_rate": 5.627815315315316e-07, + "loss": 0.0005, + "reward": 3.719792604446411, + "reward_std": 0.03036335203796625, + "rewards/final_reward": 1.713939308841118, + "rewards/mask_iou_reward": 0.856969654420559, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7197925448417664, + "rewards/thk_ans_format_reward": 1.0, + "step": 1553, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.92708587646484, + "epoch": 5.249578414839798, + "grad_norm": 10.575688870182061, + "kl": 0.744140625, + "learning_rate": 5.625e-07, + "loss": 0.0008, + "reward": 3.4308345317840576, + "reward_std": 0.10887641087174416, + "rewards/final_reward": 1.6010893689639407, + "rewards/mask_iou_reward": 0.8005446844819704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4308344721794128, + "rewards/thk_ans_format_reward": 1.0, + "step": 1554, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.5416717529297, + "epoch": 5.252951096121416, + "grad_norm": 21.612969798866093, + "kl": 0.55078125, + "learning_rate": 5.622184684684685e-07, + "loss": 0.0006, + "reward": 3.6511470079421997, + "reward_std": 0.05324476957321167, + "rewards/final_reward": 1.768552601739545, + "rewards/mask_iou_reward": 0.8842763008697725, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6511470079421997, + "rewards/thk_ans_format_reward": 1.0, + "step": 1555, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.34375762939453, + "epoch": 5.256323777403035, + "grad_norm": 25.791714296136618, + "kl": 0.5859375, + "learning_rate": 5.619369369369369e-07, + "loss": 0.0006, + "reward": 3.3546031713485718, + "reward_std": 0.04129011929035187, + "rewards/final_reward": 1.5378495910974854, + "rewards/mask_iou_reward": 0.7689247955487427, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3546032309532166, + "rewards/thk_ans_format_reward": 1.0, + "step": 1556, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.2291717529297, + "epoch": 5.259696458684655, + "grad_norm": 13.404011319795831, + "kl": 0.435546875, + "learning_rate": 5.616554054054054e-07, + "loss": 0.0004, + "reward": 3.324510097503662, + "reward_std": 0.13226917386054993, + "rewards/final_reward": 0.8674480571987822, + "rewards/mask_iou_reward": 0.4337240285993911, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3245099782943726, + "rewards/thk_ans_format_reward": 1.0, + "step": 1557, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.88542175292969, + "epoch": 5.263069139966273, + "grad_norm": 15.12979386432097, + "kl": 1.296875, + "learning_rate": 5.613738738738739e-07, + "loss": 0.0013, + "reward": 3.581545352935791, + "reward_std": 0.04847773676738143, + "rewards/final_reward": 1.8063348684698242, + "rewards/mask_iou_reward": 0.9031674342349121, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5815452933311462, + "rewards/thk_ans_format_reward": 1.0, + "step": 1558, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.42708587646484, + "epoch": 5.266441821247892, + "grad_norm": 42.19614693932147, + "kl": 0.4794921875, + "learning_rate": 5.610923423423422e-07, + "loss": 0.0005, + "reward": 3.4823594093322754, + "reward_std": 0.13475025445222855, + "rewards/final_reward": 1.7472569945766088, + "rewards/mask_iou_reward": 0.8736284972883044, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4823591113090515, + "rewards/thk_ans_format_reward": 1.0, + "step": 1559, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.28125381469727, + "epoch": 5.269814502529511, + "grad_norm": 83.2498651086656, + "kl": 0.5546875, + "learning_rate": 5.608108108108108e-07, + "loss": 0.0006, + "reward": 3.675955653190613, + "reward_std": 0.09048607014119625, + "rewards/final_reward": 1.8582812308316892, + "rewards/mask_iou_reward": 0.9291406154158446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6759557723999023, + "rewards/thk_ans_format_reward": 1.0, + "step": 1560, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5416717529297, + "epoch": 5.27318718381113, + "grad_norm": 22.738778486995905, + "kl": 0.52734375, + "learning_rate": 5.605292792792792e-07, + "loss": 0.0005, + "reward": 3.5443469285964966, + "reward_std": 0.13854551687836647, + "rewards/final_reward": 1.565121317219153, + "rewards/mask_iou_reward": 0.7825606586095765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5443468689918518, + "rewards/thk_ans_format_reward": 1.0, + "step": 1561, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.46875381469727, + "epoch": 5.276559865092748, + "grad_norm": 29.120362052579313, + "kl": 0.4072265625, + "learning_rate": 5.602477477477477e-07, + "loss": 0.0004, + "reward": 3.2357590198516846, + "reward_std": 0.16141557320952415, + "rewards/final_reward": 1.0046967644421276, + "rewards/mask_iou_reward": 0.5023483822210638, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.235758900642395, + "rewards/thk_ans_format_reward": 1.0, + "step": 1562, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.34375762939453, + "epoch": 5.279932546374368, + "grad_norm": 16.034146725438514, + "kl": 0.43359375, + "learning_rate": 5.599662162162162e-07, + "loss": 0.0004, + "reward": 3.4278730154037476, + "reward_std": 0.0456274077296257, + "rewards/final_reward": 1.5122349537878514, + "rewards/mask_iou_reward": 0.7561174768939257, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4278731942176819, + "rewards/thk_ans_format_reward": 1.0, + "step": 1563, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.1145896911621, + "epoch": 5.283305227655987, + "grad_norm": 34.61046567649571, + "kl": 0.5732421875, + "learning_rate": 5.596846846846846e-07, + "loss": 0.0006, + "reward": 3.431596279144287, + "reward_std": 0.07788949087262154, + "rewards/final_reward": 1.3865765962860417, + "rewards/mask_iou_reward": 0.6932882981430208, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4315963983535767, + "rewards/thk_ans_format_reward": 1.0, + "step": 1564, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0729217529297, + "epoch": 5.286677908937605, + "grad_norm": 11.795224139319926, + "kl": 0.4521484375, + "learning_rate": 5.594031531531531e-07, + "loss": 0.0005, + "reward": 3.494480848312378, + "reward_std": 0.16206956654787064, + "rewards/final_reward": 1.2220150396439693, + "rewards/mask_iou_reward": 0.6110075198219846, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4944809079170227, + "rewards/thk_ans_format_reward": 1.0, + "step": 1565, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.52083587646484, + "epoch": 5.2900505902192245, + "grad_norm": 10.969810119969456, + "kl": 0.3984375, + "learning_rate": 5.591216216216215e-07, + "loss": 0.0004, + "reward": 3.6718339920043945, + "reward_std": 0.045063115656375885, + "rewards/final_reward": 1.452622763087667, + "rewards/mask_iou_reward": 0.7263113815438335, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6718339323997498, + "rewards/thk_ans_format_reward": 1.0, + "step": 1566, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0729217529297, + "epoch": 5.293423271500843, + "grad_norm": 33.29662349353952, + "kl": 0.4619140625, + "learning_rate": 5.5884009009009e-07, + "loss": 0.0005, + "reward": 3.5407893657684326, + "reward_std": 0.07485915347933769, + "rewards/final_reward": 1.6468581700055525, + "rewards/mask_iou_reward": 0.8234290850027762, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5407893657684326, + "rewards/thk_ans_format_reward": 1.0, + "step": 1567, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.72916793823242, + "epoch": 5.296795952782462, + "grad_norm": 14.017071819423824, + "kl": 0.423828125, + "learning_rate": 5.585585585585585e-07, + "loss": 0.0004, + "reward": 3.55793297290802, + "reward_std": 0.02568998374044895, + "rewards/final_reward": 1.838847874711516, + "rewards/mask_iou_reward": 0.919423937355758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.55793297290802, + "rewards/thk_ans_format_reward": 1.0, + "step": 1568, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.20833587646484, + "epoch": 5.300168634064081, + "grad_norm": 8.392709690302171, + "kl": 0.5263671875, + "learning_rate": 5.582770270270269e-07, + "loss": 0.0005, + "reward": 3.6737680435180664, + "reward_std": 0.06741153821349144, + "rewards/final_reward": 1.8004864144588608, + "rewards/mask_iou_reward": 0.9002432072294304, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.673767864704132, + "rewards/thk_ans_format_reward": 1.0, + "step": 1569, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.8541717529297, + "epoch": 5.3035413153457, + "grad_norm": 10.569948867581923, + "kl": 0.41796875, + "learning_rate": 5.579954954954955e-07, + "loss": 0.0004, + "reward": 3.4037704467773438, + "reward_std": 0.09090332314372063, + "rewards/final_reward": 0.8551043103711985, + "rewards/mask_iou_reward": 0.42755215518559925, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4037703275680542, + "rewards/thk_ans_format_reward": 1.0, + "step": 1570, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.10416793823242, + "epoch": 5.306913996627319, + "grad_norm": 13.326835721795648, + "kl": 1.677734375, + "learning_rate": 5.57713963963964e-07, + "loss": 0.0017, + "reward": 3.740381360054016, + "reward_std": 0.056756491772830486, + "rewards/final_reward": 1.4973907536173645, + "rewards/mask_iou_reward": 0.7486953768086823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7403812408447266, + "rewards/thk_ans_format_reward": 1.0, + "step": 1571, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.96875381469727, + "epoch": 5.3102866779089375, + "grad_norm": 10.72925580096195, + "kl": 0.4931640625, + "learning_rate": 5.574324324324324e-07, + "loss": 0.0005, + "reward": 3.5163180828094482, + "reward_std": 0.0905131883919239, + "rewards/final_reward": 1.856243706221524, + "rewards/mask_iou_reward": 0.928121853110762, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5163179636001587, + "rewards/thk_ans_format_reward": 1.0, + "step": 1572, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.14583587646484, + "epoch": 5.313659359190557, + "grad_norm": 11.226461152562312, + "kl": 0.416015625, + "learning_rate": 5.571509009009009e-07, + "loss": 0.0004, + "reward": 3.228415608406067, + "reward_std": 0.07192541658878326, + "rewards/final_reward": 1.3220572332200686, + "rewards/mask_iou_reward": 0.6610286166100343, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2284154891967773, + "rewards/thk_ans_format_reward": 1.0, + "step": 1573, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.04166793823242, + "epoch": 5.317032040472175, + "grad_norm": 8.230439038432156, + "kl": 0.3759765625, + "learning_rate": 5.568693693693693e-07, + "loss": 0.0004, + "reward": 3.8010900020599365, + "reward_std": 0.018786365166306496, + "rewards/final_reward": 1.7107739670918538, + "rewards/mask_iou_reward": 0.8553869835459269, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8010901808738708, + "rewards/thk_ans_format_reward": 1.0, + "step": 1574, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.33333587646484, + "epoch": 5.320404721753794, + "grad_norm": 5.757461841734188, + "kl": 0.42578125, + "learning_rate": 5.565878378378378e-07, + "loss": 0.0004, + "reward": 3.414375066757202, + "reward_std": 0.05427007144317031, + "rewards/final_reward": 1.4775362898487359, + "rewards/mask_iou_reward": 0.7387681449243679, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.414375126361847, + "rewards/thk_ans_format_reward": 1.0, + "step": 1575, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.19791793823242, + "epoch": 5.323777403035413, + "grad_norm": 36.58365647963653, + "kl": 0.419921875, + "learning_rate": 5.563063063063063e-07, + "loss": 0.0004, + "reward": 3.325410485267639, + "reward_std": 0.10779011994600296, + "rewards/final_reward": 1.0423662379730816, + "rewards/mask_iou_reward": 0.5211831189865408, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3254103660583496, + "rewards/thk_ans_format_reward": 1.0, + "step": 1576, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.32291793823242, + "epoch": 5.327150084317032, + "grad_norm": 10.107335069790686, + "kl": 0.7216796875, + "learning_rate": 5.560247747747747e-07, + "loss": 0.0008, + "reward": 3.2031502723693848, + "reward_std": 0.0661102794110775, + "rewards/final_reward": 1.0763216198767998, + "rewards/mask_iou_reward": 0.5381608099383999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2031500935554504, + "rewards/thk_ans_format_reward": 1.0, + "step": 1577, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.36458969116211, + "epoch": 5.330522765598651, + "grad_norm": 10.322027034328684, + "kl": 0.47265625, + "learning_rate": 5.557432432432432e-07, + "loss": 0.0005, + "reward": 3.5067999362945557, + "reward_std": 0.1160063948482275, + "rewards/final_reward": 0.977022600833603, + "rewards/mask_iou_reward": 0.4885113004168015, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5067998170852661, + "rewards/thk_ans_format_reward": 1.0, + "step": 1578, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.51041793823242, + "epoch": 5.33389544688027, + "grad_norm": 15.348517850938887, + "kl": 0.5, + "learning_rate": 5.554617117117116e-07, + "loss": 0.0005, + "reward": 3.456921339035034, + "reward_std": 0.06033678911626339, + "rewards/final_reward": 1.3159359344971464, + "rewards/mask_iou_reward": 0.6579679672485732, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4569213390350342, + "rewards/thk_ans_format_reward": 1.0, + "step": 1579, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.55208587646484, + "epoch": 5.337268128161889, + "grad_norm": 15.585442503496923, + "kl": 0.4501953125, + "learning_rate": 5.551801801801802e-07, + "loss": 0.0005, + "reward": 3.488860607147217, + "reward_std": 0.09073191322386265, + "rewards/final_reward": 1.2704860859565354, + "rewards/mask_iou_reward": 0.6352430429782677, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.488860547542572, + "rewards/thk_ans_format_reward": 1.0, + "step": 1580, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.97917175292969, + "epoch": 5.340640809443507, + "grad_norm": 27.820073001383008, + "kl": 0.751953125, + "learning_rate": 5.548986486486487e-07, + "loss": 0.0008, + "reward": 3.639863133430481, + "reward_std": 0.06889799144119024, + "rewards/final_reward": 1.796841014307425, + "rewards/mask_iou_reward": 0.8984205071537125, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6398632526397705, + "rewards/thk_ans_format_reward": 1.0, + "step": 1581, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.83333587646484, + "epoch": 5.344013490725127, + "grad_norm": 11.68937389204564, + "kl": 0.44140625, + "learning_rate": 5.546171171171171e-07, + "loss": 0.0004, + "reward": 3.3747018575668335, + "reward_std": 0.07148859463632107, + "rewards/final_reward": 1.0186688008845346, + "rewards/mask_iou_reward": 0.5093344004422673, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3747016787528992, + "rewards/thk_ans_format_reward": 1.0, + "step": 1582, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.19792175292969, + "epoch": 5.347386172006745, + "grad_norm": 9.387634937829326, + "kl": 0.48046875, + "learning_rate": 5.543355855855856e-07, + "loss": 0.0005, + "reward": 3.5327770709991455, + "reward_std": 0.10092796385288239, + "rewards/final_reward": 1.823864692806687, + "rewards/mask_iou_reward": 0.9119323464033435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5327771306037903, + "rewards/thk_ans_format_reward": 1.0, + "step": 1583, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.14583587646484, + "epoch": 5.350758853288364, + "grad_norm": 12.118552830379011, + "kl": 0.4482421875, + "learning_rate": 5.54054054054054e-07, + "loss": 0.0005, + "reward": 3.5782933235168457, + "reward_std": 0.09411728754639626, + "rewards/final_reward": 1.2275416819809903, + "rewards/mask_iou_reward": 0.6137708409904952, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5782932043075562, + "rewards/thk_ans_format_reward": 1.0, + "step": 1584, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.22916793823242, + "epoch": 5.354131534569984, + "grad_norm": 276.16617115162836, + "kl": 0.54296875, + "learning_rate": 5.537725225225225e-07, + "loss": 0.0005, + "reward": 3.5092782974243164, + "reward_std": 0.12828397750854492, + "rewards/final_reward": 1.4079524650829063, + "rewards/mask_iou_reward": 0.7039762325414531, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5092784762382507, + "rewards/thk_ans_format_reward": 1.0, + "step": 1585, + "think_completion_length": 9.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.09375762939453, + "epoch": 5.357504215851602, + "grad_norm": 15.448136244261567, + "kl": 1.51171875, + "learning_rate": 5.53490990990991e-07, + "loss": 0.0015, + "reward": 3.139350414276123, + "reward_std": 0.08229007199406624, + "rewards/final_reward": 1.1157618654892065, + "rewards/mask_iou_reward": 0.5578809327446033, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1393502354621887, + "rewards/thk_ans_format_reward": 1.0, + "step": 1586, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0729217529297, + "epoch": 5.360876897133221, + "grad_norm": 7.589524801705542, + "kl": 0.439453125, + "learning_rate": 5.532094594594594e-07, + "loss": 0.0004, + "reward": 3.525477886199951, + "reward_std": 0.12513011507689953, + "rewards/final_reward": 1.7533777641824975, + "rewards/mask_iou_reward": 0.8766888820912487, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5254778265953064, + "rewards/thk_ans_format_reward": 1.0, + "step": 1587, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.37500762939453, + "epoch": 5.36424957841484, + "grad_norm": 6.570301057011809, + "kl": 0.4521484375, + "learning_rate": 5.529279279279279e-07, + "loss": 0.0005, + "reward": 3.304585814476013, + "reward_std": 0.07918488210998476, + "rewards/final_reward": 1.792820508585685, + "rewards/mask_iou_reward": 0.8964102542928425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.304585576057434, + "rewards/thk_ans_format_reward": 1.0, + "step": 1588, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.875, + "epoch": 5.367622259696459, + "grad_norm": 9.062400524300918, + "kl": 0.537109375, + "learning_rate": 5.526463963963963e-07, + "loss": 0.0005, + "reward": 3.5815329551696777, + "reward_std": 0.2140875719487667, + "rewards/final_reward": 1.1094297889036784, + "rewards/mask_iou_reward": 0.5547148944518392, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5815329551696777, + "rewards/thk_ans_format_reward": 1.0, + "step": 1589, + "think_completion_length": 10.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.03125762939453, + "epoch": 5.370994940978077, + "grad_norm": 9.15405908451973, + "kl": 0.4638671875, + "learning_rate": 5.523648648648649e-07, + "loss": 0.0005, + "reward": 3.3573790788650513, + "reward_std": 0.12218708544969559, + "rewards/final_reward": 1.2496978881252572, + "rewards/mask_iou_reward": 0.6248489440626286, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3573788404464722, + "rewards/thk_ans_format_reward": 1.0, + "step": 1590, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.91666793823242, + "epoch": 5.3743676222596966, + "grad_norm": 16.06898917110685, + "kl": 0.5869140625, + "learning_rate": 5.520833333333334e-07, + "loss": 0.0006, + "reward": 3.674129366874695, + "reward_std": 0.045963745564222336, + "rewards/final_reward": 1.7854430448313332, + "rewards/mask_iou_reward": 0.8927215224156666, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6741293668746948, + "rewards/thk_ans_format_reward": 1.0, + "step": 1591, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.54166793823242, + "epoch": 5.377740303541315, + "grad_norm": 13.666515737584314, + "kl": 0.435546875, + "learning_rate": 5.518018018018018e-07, + "loss": 0.0004, + "reward": 3.001492738723755, + "reward_std": 0.18107537552714348, + "rewards/final_reward": 0.45554146425085257, + "rewards/mask_iou_reward": 0.22777073212542628, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0014926195144653, + "rewards/thk_ans_format_reward": 1.0, + "step": 1592, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.8333396911621, + "epoch": 5.381112984822934, + "grad_norm": 16.562808165815945, + "kl": 0.654296875, + "learning_rate": 5.515202702702703e-07, + "loss": 0.0007, + "reward": 3.5894731283187866, + "reward_std": 0.060048991814255714, + "rewards/final_reward": 1.2045376616534607, + "rewards/mask_iou_reward": 0.6022688308267303, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5894731283187866, + "rewards/thk_ans_format_reward": 1.0, + "step": 1593, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.23958587646484, + "epoch": 5.3844856661045535, + "grad_norm": 7.900805230696725, + "kl": 0.4208984375, + "learning_rate": 5.512387387387388e-07, + "loss": 0.0004, + "reward": 3.297197103500366, + "reward_std": 0.050192068330943584, + "rewards/final_reward": 1.5728006310996099, + "rewards/mask_iou_reward": 0.7864003155498049, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2971969246864319, + "rewards/thk_ans_format_reward": 1.0, + "step": 1594, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.60416793823242, + "epoch": 5.387858347386172, + "grad_norm": 16.85455125388022, + "kl": 0.4453125, + "learning_rate": 5.509572072072072e-07, + "loss": 0.0004, + "reward": 3.0963913202285767, + "reward_std": 0.2514451891183853, + "rewards/final_reward": 1.3276669146343856, + "rewards/mask_iou_reward": 0.6638334573171928, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0963911414146423, + "rewards/thk_ans_format_reward": 1.0, + "step": 1595, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.14583587646484, + "epoch": 5.391231028667791, + "grad_norm": 8.657383821568352, + "kl": 0.4677734375, + "learning_rate": 5.506756756756757e-07, + "loss": 0.0005, + "reward": 3.521684169769287, + "reward_std": 0.2204669639468193, + "rewards/final_reward": 1.7213135550727179, + "rewards/mask_iou_reward": 0.8606567775363589, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5216842293739319, + "rewards/thk_ans_format_reward": 1.0, + "step": 1596, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.7604217529297, + "epoch": 5.3946037099494095, + "grad_norm": 16.606444003821277, + "kl": 0.4267578125, + "learning_rate": 5.503941441441441e-07, + "loss": 0.0004, + "reward": 3.491199016571045, + "reward_std": 0.0791405662894249, + "rewards/final_reward": 1.380018995825186, + "rewards/mask_iou_reward": 0.690009497912593, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4911989569664001, + "rewards/thk_ans_format_reward": 1.0, + "step": 1597, + "think_completion_length": 7.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.08333969116211, + "epoch": 5.397976391231029, + "grad_norm": 18.573848879245762, + "kl": 0.5126953125, + "learning_rate": 5.501126126126125e-07, + "loss": 0.0005, + "reward": 3.5595691204071045, + "reward_std": 0.05301516316831112, + "rewards/final_reward": 1.7548742102506711, + "rewards/mask_iou_reward": 0.8774371051253356, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5595690608024597, + "rewards/thk_ans_format_reward": 1.0, + "step": 1598, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.77083587646484, + "epoch": 5.401349072512647, + "grad_norm": 13.677851860701725, + "kl": 0.4833984375, + "learning_rate": 5.49831081081081e-07, + "loss": 0.0005, + "reward": 3.4730799198150635, + "reward_std": 0.09135781228542328, + "rewards/final_reward": 1.5527164072778548, + "rewards/mask_iou_reward": 0.7763582036389274, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4730800986289978, + "rewards/thk_ans_format_reward": 1.0, + "step": 1599, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.88542175292969, + "epoch": 5.4047217537942664, + "grad_norm": 59.41510375735123, + "kl": 0.4638671875, + "learning_rate": 5.495495495495495e-07, + "loss": 0.0005, + "reward": 3.6761807203292847, + "reward_std": 0.10935474932193756, + "rewards/final_reward": 1.7209638783991454, + "rewards/mask_iou_reward": 0.8604819391995727, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.676180899143219, + "rewards/thk_ans_format_reward": 1.0, + "step": 1600, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.42708587646484, + "epoch": 5.408094435075886, + "grad_norm": 11.48604334360361, + "kl": 0.400390625, + "learning_rate": 5.49268018018018e-07, + "loss": 0.0004, + "reward": 3.5194766521453857, + "reward_std": 0.06819850951433182, + "rewards/final_reward": 1.7190072709272624, + "rewards/mask_iou_reward": 0.8595036354636312, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.519476592540741, + "rewards/thk_ans_format_reward": 1.0, + "step": 1601, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.0, + "epoch": 5.411467116357504, + "grad_norm": 11.328987027276094, + "kl": 0.44921875, + "learning_rate": 5.489864864864864e-07, + "loss": 0.0005, + "reward": 3.3820735216140747, + "reward_std": 0.05752043426036835, + "rewards/final_reward": 1.6397381276838643, + "rewards/mask_iou_reward": 0.8198690638419321, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3820736408233643, + "rewards/thk_ans_format_reward": 1.0, + "step": 1602, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.98958587646484, + "epoch": 5.414839797639123, + "grad_norm": 15.086284072459184, + "kl": 0.443359375, + "learning_rate": 5.487049549549549e-07, + "loss": 0.0005, + "reward": 3.315527081489563, + "reward_std": 0.1428496576845646, + "rewards/final_reward": 1.7716332694488184, + "rewards/mask_iou_reward": 0.8858166347244092, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3155272006988525, + "rewards/thk_ans_format_reward": 1.0, + "step": 1603, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0416717529297, + "epoch": 5.418212478920742, + "grad_norm": 14.191826769982049, + "kl": 0.4296875, + "learning_rate": 5.484234234234234e-07, + "loss": 0.0004, + "reward": 3.36587655544281, + "reward_std": 0.08461336139589548, + "rewards/final_reward": 1.6119604684506845, + "rewards/mask_iou_reward": 0.8059802342253423, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.365876317024231, + "rewards/thk_ans_format_reward": 1.0, + "step": 1604, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.51041793823242, + "epoch": 5.421585160202361, + "grad_norm": 7.723156096040805, + "kl": 0.4619140625, + "learning_rate": 5.481418918918918e-07, + "loss": 0.0005, + "reward": 3.455763339996338, + "reward_std": 0.13234110176563263, + "rewards/final_reward": 1.3475519159373448, + "rewards/mask_iou_reward": 0.6737759579686724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4557632803916931, + "rewards/thk_ans_format_reward": 1.0, + "step": 1605, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.03125381469727, + "epoch": 5.424957841483979, + "grad_norm": 17.843660716465195, + "kl": 0.4775390625, + "learning_rate": 5.478603603603603e-07, + "loss": 0.0005, + "reward": 3.515939950942993, + "reward_std": 0.04447547905147076, + "rewards/final_reward": 0.7696596498514374, + "rewards/mask_iou_reward": 0.3848298249257187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5159401893615723, + "rewards/thk_ans_format_reward": 1.0, + "step": 1606, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.2291717529297, + "epoch": 5.428330522765599, + "grad_norm": 8.32019114382189, + "kl": 0.4150390625, + "learning_rate": 5.475788288288287e-07, + "loss": 0.0004, + "reward": 3.0867764949798584, + "reward_std": 0.1152360737323761, + "rewards/final_reward": 1.6006940530922766, + "rewards/mask_iou_reward": 0.8003470265461383, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0867764353752136, + "rewards/thk_ans_format_reward": 1.0, + "step": 1607, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.6875, + "epoch": 5.431703204047217, + "grad_norm": 18.9909129285575, + "kl": 0.4521484375, + "learning_rate": 5.472972972972972e-07, + "loss": 0.0005, + "reward": 3.5866810083389282, + "reward_std": 0.10227518156170845, + "rewards/final_reward": 1.8427148180003425, + "rewards/mask_iou_reward": 0.9213574090001713, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5866807699203491, + "rewards/thk_ans_format_reward": 1.0, + "step": 1608, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.1041717529297, + "epoch": 5.435075885328836, + "grad_norm": 14.657295509107488, + "kl": 0.62109375, + "learning_rate": 5.470157657657657e-07, + "loss": 0.0006, + "reward": 3.316844344139099, + "reward_std": 0.07849056646227837, + "rewards/final_reward": 1.6149113626967542, + "rewards/mask_iou_reward": 0.8074556813483771, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3168442845344543, + "rewards/thk_ans_format_reward": 1.0, + "step": 1609, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.46875, + "epoch": 5.438448566610456, + "grad_norm": 40.969177808345904, + "kl": 0.607421875, + "learning_rate": 5.467342342342342e-07, + "loss": 0.0006, + "reward": 3.4543533325195312, + "reward_std": 0.10421181283891201, + "rewards/final_reward": 1.8084816161777553, + "rewards/mask_iou_reward": 0.9042408080888776, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4543529748916626, + "rewards/thk_ans_format_reward": 1.0, + "step": 1610, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.43750762939453, + "epoch": 5.441821247892074, + "grad_norm": 10.93391540750094, + "kl": 0.412109375, + "learning_rate": 5.464527027027027e-07, + "loss": 0.0004, + "reward": 3.3363726139068604, + "reward_std": 0.08054106682538986, + "rewards/final_reward": 0.9038268503309029, + "rewards/mask_iou_reward": 0.45191342516545147, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3363724946975708, + "rewards/thk_ans_format_reward": 1.0, + "step": 1611, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.40625762939453, + "epoch": 5.445193929173693, + "grad_norm": 9.121135289305188, + "kl": 0.474609375, + "learning_rate": 5.461711711711712e-07, + "loss": 0.0005, + "reward": 3.646440267562866, + "reward_std": 0.06063675507903099, + "rewards/final_reward": 1.8498281476593796, + "rewards/mask_iou_reward": 0.9249140738296898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.646440327167511, + "rewards/thk_ans_format_reward": 1.0, + "step": 1612, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.17708587646484, + "epoch": 5.448566610455312, + "grad_norm": 9.347691834498171, + "kl": 0.4990234375, + "learning_rate": 5.458896396396396e-07, + "loss": 0.0005, + "reward": 3.603909730911255, + "reward_std": 0.059759557247161865, + "rewards/final_reward": 1.7791889589015653, + "rewards/mask_iou_reward": 0.8895944794507826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6039097905158997, + "rewards/thk_ans_format_reward": 1.0, + "step": 1613, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.95833587646484, + "epoch": 5.451939291736931, + "grad_norm": 64.78327486793128, + "kl": 0.3916015625, + "learning_rate": 5.456081081081081e-07, + "loss": 0.0004, + "reward": 3.368232846260071, + "reward_std": 0.04645315185189247, + "rewards/final_reward": 1.1973084912773273, + "rewards/mask_iou_reward": 0.5986542456386637, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.368232548236847, + "rewards/thk_ans_format_reward": 1.0, + "step": 1614, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.1875, + "epoch": 5.455311973018549, + "grad_norm": 6.750624391879805, + "kl": 0.40234375, + "learning_rate": 5.453265765765765e-07, + "loss": 0.0004, + "reward": 3.5145528316497803, + "reward_std": 0.03645121678709984, + "rewards/final_reward": 1.694978467397088, + "rewards/mask_iou_reward": 0.847489233698544, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5145527124404907, + "rewards/thk_ans_format_reward": 1.0, + "step": 1615, + "think_completion_length": 7.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.7604217529297, + "epoch": 5.458684654300169, + "grad_norm": 16.16205090571039, + "kl": 1.986328125, + "learning_rate": 5.45045045045045e-07, + "loss": 0.002, + "reward": 3.627463221549988, + "reward_std": 0.04613169934600592, + "rewards/final_reward": 1.9425661984513574, + "rewards/mask_iou_reward": 0.9712830992256787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6274632215499878, + "rewards/thk_ans_format_reward": 1.0, + "step": 1616, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.00000762939453, + "epoch": 5.462057335581788, + "grad_norm": 10.333683537542921, + "kl": 0.412109375, + "learning_rate": 5.447635135135135e-07, + "loss": 0.0005, + "reward": 3.6512891054153442, + "reward_std": 0.10130597651004791, + "rewards/final_reward": 1.7270932594670367, + "rewards/mask_iou_reward": 0.8635466297335184, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6617057919502258, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1617, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.59375381469727, + "epoch": 5.465430016863406, + "grad_norm": 11.288309283682565, + "kl": 0.576171875, + "learning_rate": 5.444819819819819e-07, + "loss": 0.0006, + "reward": 3.6216951608657837, + "reward_std": 0.036016141064465046, + "rewards/final_reward": 1.9431474565908906, + "rewards/mask_iou_reward": 0.9715737282954453, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6216949224472046, + "rewards/thk_ans_format_reward": 1.0, + "step": 1618, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.7916717529297, + "epoch": 5.4688026981450255, + "grad_norm": 11.645954279749585, + "kl": 0.3896484375, + "learning_rate": 5.442004504504504e-07, + "loss": 0.0004, + "reward": 3.32417094707489, + "reward_std": 0.11718141287565231, + "rewards/final_reward": 1.2097389401016168, + "rewards/mask_iou_reward": 0.6048694700508084, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3241708874702454, + "rewards/thk_ans_format_reward": 1.0, + "step": 1619, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.44791793823242, + "epoch": 5.472175379426644, + "grad_norm": 10.510371694596317, + "kl": 0.4541015625, + "learning_rate": 5.43918918918919e-07, + "loss": 0.0005, + "reward": 3.4671072959899902, + "reward_std": 0.09320265799760818, + "rewards/final_reward": 1.7891415823759047, + "rewards/mask_iou_reward": 0.8945707911879524, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4671071767807007, + "rewards/thk_ans_format_reward": 1.0, + "step": 1620, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.70833587646484, + "epoch": 5.475548060708263, + "grad_norm": 10.57340206050586, + "kl": 1.16796875, + "learning_rate": 5.436373873873874e-07, + "loss": 0.0012, + "reward": 3.784354567527771, + "reward_std": 0.05950320092961192, + "rewards/final_reward": 1.6432683415503035, + "rewards/mask_iou_reward": 0.8216341707751518, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.784354567527771, + "rewards/thk_ans_format_reward": 1.0, + "step": 1621, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.34375381469727, + "epoch": 5.4789207419898815, + "grad_norm": 14.473689193550852, + "kl": 0.423828125, + "learning_rate": 5.433558558558559e-07, + "loss": 0.0004, + "reward": 3.405122399330139, + "reward_std": 0.11872344464063644, + "rewards/final_reward": 1.1827626419877142, + "rewards/mask_iou_reward": 0.5913813209938571, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4051221013069153, + "rewards/thk_ans_format_reward": 1.0, + "step": 1622, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.8645896911621, + "epoch": 5.482293423271501, + "grad_norm": 43.06217139028088, + "kl": 0.3828125, + "learning_rate": 5.430743243243243e-07, + "loss": 0.0004, + "reward": 3.340269446372986, + "reward_std": 0.20309398137032986, + "rewards/final_reward": 1.2307757543990752, + "rewards/mask_iou_reward": 0.6153878771995376, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3506861925125122, + "rewards/thk_ans_format_reward": 1.0, + "step": 1623, + "think_completion_length": 6.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.36459350585938, + "epoch": 5.48566610455312, + "grad_norm": 7.01983725004176, + "kl": 0.451171875, + "learning_rate": 5.427927927927928e-07, + "loss": 0.0005, + "reward": 3.6813403367996216, + "reward_std": 0.04475306533277035, + "rewards/final_reward": 1.8279500199920133, + "rewards/mask_iou_reward": 0.9139750099960067, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6813403367996216, + "rewards/thk_ans_format_reward": 1.0, + "step": 1624, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.78125, + "epoch": 5.4890387858347385, + "grad_norm": 10.716033971911102, + "kl": 0.85546875, + "learning_rate": 5.425112612612613e-07, + "loss": 0.0008, + "reward": 3.4271167516708374, + "reward_std": 0.12328441441059113, + "rewards/final_reward": 1.671780505812558, + "rewards/mask_iou_reward": 0.835890252906279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4271168112754822, + "rewards/thk_ans_format_reward": 1.0, + "step": 1625, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.84375, + "epoch": 5.492411467116358, + "grad_norm": 18.256151332877142, + "kl": 0.490234375, + "learning_rate": 5.422297297297297e-07, + "loss": 0.0005, + "reward": 3.758134603500366, + "reward_std": 0.026996027678251266, + "rewards/final_reward": 1.5473002885407978, + "rewards/mask_iou_reward": 0.7736501442703989, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7581345438957214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1626, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.66666793823242, + "epoch": 5.495784148397976, + "grad_norm": 14.495247198790699, + "kl": 0.3984375, + "learning_rate": 5.419481981981982e-07, + "loss": 0.0004, + "reward": 3.59829044342041, + "reward_std": 0.05195681378245354, + "rewards/final_reward": 1.8786183719312315, + "rewards/mask_iou_reward": 0.9393091859656157, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5982903838157654, + "rewards/thk_ans_format_reward": 1.0, + "step": 1627, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.36458587646484, + "epoch": 5.499156829679595, + "grad_norm": 9.939043586294236, + "kl": 0.443359375, + "learning_rate": 5.416666666666666e-07, + "loss": 0.0005, + "reward": 3.37367844581604, + "reward_std": 0.032843963243067265, + "rewards/final_reward": 1.2515881682879417, + "rewards/mask_iou_reward": 0.6257940841439709, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3736785650253296, + "rewards/thk_ans_format_reward": 1.0, + "step": 1628, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.95833587646484, + "epoch": 5.502529510961214, + "grad_norm": 9.954367021666135, + "kl": 0.3984375, + "learning_rate": 5.413851351351351e-07, + "loss": 0.0004, + "reward": 3.2100558280944824, + "reward_std": 0.02408889401704073, + "rewards/final_reward": 1.5521448941685216, + "rewards/mask_iou_reward": 0.7760724470842608, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2100557088851929, + "rewards/thk_ans_format_reward": 1.0, + "step": 1629, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.7916717529297, + "epoch": 5.505902192242833, + "grad_norm": 25.744694096144258, + "kl": 0.435546875, + "learning_rate": 5.411036036036037e-07, + "loss": 0.0004, + "reward": 3.306903600692749, + "reward_std": 0.1038884948939085, + "rewards/final_reward": 0.9442666685268137, + "rewards/mask_iou_reward": 0.47213333426340687, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3069035410881042, + "rewards/thk_ans_format_reward": 1.0, + "step": 1630, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.87500381469727, + "epoch": 5.509274873524452, + "grad_norm": 114.37127200407646, + "kl": 0.52734375, + "learning_rate": 5.408220720720721e-07, + "loss": 0.0005, + "reward": 3.312256336212158, + "reward_std": 0.0768951065838337, + "rewards/final_reward": 0.6544148646849897, + "rewards/mask_iou_reward": 0.32720743234249483, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3122565150260925, + "rewards/thk_ans_format_reward": 1.0, + "step": 1631, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.23958587646484, + "epoch": 5.512647554806071, + "grad_norm": 17.052226307928535, + "kl": 0.4052734375, + "learning_rate": 5.405405405405406e-07, + "loss": 0.0004, + "reward": 3.6516486406326294, + "reward_std": 0.06326550245285034, + "rewards/final_reward": 1.8457776060042868, + "rewards/mask_iou_reward": 0.9228888030021434, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6516485810279846, + "rewards/thk_ans_format_reward": 1.0, + "step": 1632, + "think_completion_length": 7.166666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.93750381469727, + "epoch": 5.51602023608769, + "grad_norm": 14.431876285263879, + "kl": 0.4482421875, + "learning_rate": 5.40259009009009e-07, + "loss": 0.0005, + "reward": 3.2408528327941895, + "reward_std": 0.043673787266016006, + "rewards/final_reward": 0.9847618306461534, + "rewards/mask_iou_reward": 0.4923809153230767, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2408528923988342, + "rewards/thk_ans_format_reward": 1.0, + "step": 1633, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.55209350585938, + "epoch": 5.519392917369308, + "grad_norm": 17.26001494408382, + "kl": 0.4228515625, + "learning_rate": 5.399774774774775e-07, + "loss": 0.0004, + "reward": 3.347735643386841, + "reward_std": 0.3271710202097893, + "rewards/final_reward": 1.4704092045759678, + "rewards/mask_iou_reward": 0.7352046022879839, + "rewards/sam_format_reward": 0.9270833432674408, + "rewards/sam_reward_func_ultra": 1.493568778038025, + "rewards/thk_ans_format_reward": 0.9270833432674408, + "step": 1634, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.32291793823242, + "epoch": 5.522765598650928, + "grad_norm": 8.97053405491925, + "kl": 0.45703125, + "learning_rate": 5.39695945945946e-07, + "loss": 0.0005, + "reward": 3.441559910774231, + "reward_std": 0.18121246993541718, + "rewards/final_reward": 1.500605036354436, + "rewards/mask_iou_reward": 0.750302518177218, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4415597915649414, + "rewards/thk_ans_format_reward": 1.0, + "step": 1635, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.84375, + "epoch": 5.526138279932546, + "grad_norm": 7.933652110506884, + "kl": 0.4609375, + "learning_rate": 5.394144144144144e-07, + "loss": 0.0005, + "reward": 3.5340306758880615, + "reward_std": 0.07737600617110729, + "rewards/final_reward": 1.5301011994802551, + "rewards/mask_iou_reward": 0.7650505997401276, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5340306758880615, + "rewards/thk_ans_format_reward": 1.0, + "step": 1636, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.82291793823242, + "epoch": 5.529510961214165, + "grad_norm": 21.625197898356575, + "kl": 0.53125, + "learning_rate": 5.391328828828829e-07, + "loss": 0.0005, + "reward": 3.2599620819091797, + "reward_std": 0.1290854811668396, + "rewards/final_reward": 0.9122858420071565, + "rewards/mask_iou_reward": 0.45614292100357823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2599619626998901, + "rewards/thk_ans_format_reward": 1.0, + "step": 1637, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.50000381469727, + "epoch": 5.532883642495785, + "grad_norm": 14.189035764609386, + "kl": 0.439453125, + "learning_rate": 5.388513513513512e-07, + "loss": 0.0004, + "reward": 3.785208821296692, + "reward_std": 0.10167535580694675, + "rewards/final_reward": 1.5567690849466458, + "rewards/mask_iou_reward": 0.7783845424733229, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7852087020874023, + "rewards/thk_ans_format_reward": 1.0, + "step": 1638, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.25000762939453, + "epoch": 5.536256323777403, + "grad_norm": 17.561204439467147, + "kl": 0.4267578125, + "learning_rate": 5.385698198198197e-07, + "loss": 0.0004, + "reward": 3.3584929704666138, + "reward_std": 0.15081000700592995, + "rewards/final_reward": 1.350205189405206, + "rewards/mask_iou_reward": 0.675102594702603, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.35849267244339, + "rewards/thk_ans_format_reward": 1.0, + "step": 1639, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.45833587646484, + "epoch": 5.539629005059022, + "grad_norm": 13.441524313971499, + "kl": 0.5224609375, + "learning_rate": 5.382882882882883e-07, + "loss": 0.0005, + "reward": 3.2099350690841675, + "reward_std": 0.12646333128213882, + "rewards/final_reward": 1.0815409952337722, + "rewards/mask_iou_reward": 0.5407704976168861, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2099350094795227, + "rewards/thk_ans_format_reward": 1.0, + "step": 1640, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.7395896911621, + "epoch": 5.543001686340641, + "grad_norm": 11.448017304941155, + "kl": 0.4208984375, + "learning_rate": 5.380067567567567e-07, + "loss": 0.0004, + "reward": 3.6662003993988037, + "reward_std": 0.025301916524767876, + "rewards/final_reward": 1.728385868170287, + "rewards/mask_iou_reward": 0.8641929340851435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6662004590034485, + "rewards/thk_ans_format_reward": 1.0, + "step": 1641, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.83333587646484, + "epoch": 5.54637436762226, + "grad_norm": 11.684922782454798, + "kl": 0.5966796875, + "learning_rate": 5.377252252252252e-07, + "loss": 0.0006, + "reward": 3.4820899963378906, + "reward_std": 0.05957669019699097, + "rewards/final_reward": 1.1023617770398222, + "rewards/mask_iou_reward": 0.5511808885199111, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4820902347564697, + "rewards/thk_ans_format_reward": 1.0, + "step": 1642, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.87500381469727, + "epoch": 5.549747048903878, + "grad_norm": 10.7985345793467, + "kl": 0.541015625, + "learning_rate": 5.374436936936936e-07, + "loss": 0.0006, + "reward": 3.5354676246643066, + "reward_std": 0.09456159453839064, + "rewards/final_reward": 1.2728535625444786, + "rewards/mask_iou_reward": 0.6364267812722393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.535467505455017, + "rewards/thk_ans_format_reward": 1.0, + "step": 1643, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.7291717529297, + "epoch": 5.5531197301854975, + "grad_norm": 16.472570929374434, + "kl": 0.400390625, + "learning_rate": 5.371621621621621e-07, + "loss": 0.0004, + "reward": 3.267836332321167, + "reward_std": 0.10154848545789719, + "rewards/final_reward": 0.9260682476262471, + "rewards/mask_iou_reward": 0.46303412381312353, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.267836332321167, + "rewards/thk_ans_format_reward": 1.0, + "step": 1644, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.62500381469727, + "epoch": 5.556492411467117, + "grad_norm": 11.550943423319882, + "kl": 0.662109375, + "learning_rate": 5.368806306306306e-07, + "loss": 0.0007, + "reward": 3.6972213983535767, + "reward_std": 0.05988108739256859, + "rewards/final_reward": 1.817311461596225, + "rewards/mask_iou_reward": 0.9086557307981125, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6972213387489319, + "rewards/thk_ans_format_reward": 1.0, + "step": 1645, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.55208587646484, + "epoch": 5.559865092748735, + "grad_norm": 12.56189111590616, + "kl": 0.416015625, + "learning_rate": 5.36599099099099e-07, + "loss": 0.0005, + "reward": 3.3453147411346436, + "reward_std": 0.09329931810498238, + "rewards/final_reward": 1.5583423431680927, + "rewards/mask_iou_reward": 0.7791711715840464, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3453145623207092, + "rewards/thk_ans_format_reward": 1.0, + "step": 1646, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.4166717529297, + "epoch": 5.5632377740303545, + "grad_norm": 22.44312055578235, + "kl": 0.4697265625, + "learning_rate": 5.363175675675675e-07, + "loss": 0.0005, + "reward": 3.079059362411499, + "reward_std": 0.08573806285858154, + "rewards/final_reward": 0.784900331719663, + "rewards/mask_iou_reward": 0.3924501658598315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0790594518184662, + "rewards/thk_ans_format_reward": 1.0, + "step": 1647, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.1041717529297, + "epoch": 5.566610455311973, + "grad_norm": 18.880687882286963, + "kl": 0.5234375, + "learning_rate": 5.36036036036036e-07, + "loss": 0.0005, + "reward": 3.4205507040023804, + "reward_std": 0.13985726051032543, + "rewards/final_reward": 1.363954579065279, + "rewards/mask_iou_reward": 0.6819772895326395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4205505847930908, + "rewards/thk_ans_format_reward": 1.0, + "step": 1648, + "think_completion_length": 9.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.6354217529297, + "epoch": 5.569983136593592, + "grad_norm": 32.78135655581178, + "kl": 0.4423828125, + "learning_rate": 5.357545045045044e-07, + "loss": 0.0004, + "reward": 3.25208842754364, + "reward_std": 0.05463777109980583, + "rewards/final_reward": 0.8885717828254704, + "rewards/mask_iou_reward": 0.4442858914127352, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2520886063575745, + "rewards/thk_ans_format_reward": 1.0, + "step": 1649, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.9375, + "epoch": 5.5733558178752105, + "grad_norm": 12.839953502051491, + "kl": 0.80078125, + "learning_rate": 5.35472972972973e-07, + "loss": 0.0008, + "reward": 3.2596405744552612, + "reward_std": 0.0540030263364315, + "rewards/final_reward": 1.8060929050230472, + "rewards/mask_iou_reward": 0.9030464525115236, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2596407532691956, + "rewards/thk_ans_format_reward": 1.0, + "step": 1650, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.53125381469727, + "epoch": 5.57672849915683, + "grad_norm": 11.968290135687798, + "kl": 0.5263671875, + "learning_rate": 5.351914414414414e-07, + "loss": 0.0005, + "reward": 3.6679221391677856, + "reward_std": 0.041471182368695736, + "rewards/final_reward": 1.8406348878109202, + "rewards/mask_iou_reward": 0.9203174439054601, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.667921781539917, + "rewards/thk_ans_format_reward": 1.0, + "step": 1651, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.96875381469727, + "epoch": 5.580101180438449, + "grad_norm": 42.447179584452854, + "kl": 0.4755859375, + "learning_rate": 5.349099099099099e-07, + "loss": 0.0005, + "reward": 3.6466050148010254, + "reward_std": 0.059364247135818005, + "rewards/final_reward": 1.5773073296975162, + "rewards/mask_iou_reward": 0.7886536648487581, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6466050148010254, + "rewards/thk_ans_format_reward": 1.0, + "step": 1652, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.63542175292969, + "epoch": 5.583473861720067, + "grad_norm": 40.10948190147107, + "kl": 0.44921875, + "learning_rate": 5.346283783783784e-07, + "loss": 0.0004, + "reward": 3.4712870121002197, + "reward_std": 0.08134759962558746, + "rewards/final_reward": 1.6431637920767832, + "rewards/mask_iou_reward": 0.8215818960383916, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4712870717048645, + "rewards/thk_ans_format_reward": 1.0, + "step": 1653, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.86458587646484, + "epoch": 5.586846543001687, + "grad_norm": 13.079819261870021, + "kl": 0.44921875, + "learning_rate": 5.343468468468468e-07, + "loss": 0.0004, + "reward": 3.5946396589279175, + "reward_std": 0.13511600345373154, + "rewards/final_reward": 1.3123857157778915, + "rewards/mask_iou_reward": 0.6561928578889458, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.605056345462799, + "rewards/thk_ans_format_reward": 1.0, + "step": 1654, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.9270896911621, + "epoch": 5.590219224283305, + "grad_norm": 12.470984977069678, + "kl": 2.4609375, + "learning_rate": 5.340653153153153e-07, + "loss": 0.0025, + "reward": 3.556732416152954, + "reward_std": 0.1202475274913013, + "rewards/final_reward": 1.564926728070922, + "rewards/mask_iou_reward": 0.782463364035461, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.556732177734375, + "rewards/thk_ans_format_reward": 1.0, + "step": 1655, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.60416793823242, + "epoch": 5.593591905564924, + "grad_norm": 241.62352953992772, + "kl": 0.533203125, + "learning_rate": 5.337837837837837e-07, + "loss": 0.0005, + "reward": 3.4673640727996826, + "reward_std": 0.05588648747652769, + "rewards/final_reward": 1.3350874708452558, + "rewards/mask_iou_reward": 0.6675437354226279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4673641324043274, + "rewards/thk_ans_format_reward": 1.0, + "step": 1656, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.71875381469727, + "epoch": 5.596964586846543, + "grad_norm": 8.440196304786275, + "kl": 0.435546875, + "learning_rate": 5.335022522522522e-07, + "loss": 0.0004, + "reward": 3.678599715232849, + "reward_std": 0.09616643656045198, + "rewards/final_reward": 1.2519775876713632, + "rewards/mask_iou_reward": 0.6259887938356816, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6785999536514282, + "rewards/thk_ans_format_reward": 1.0, + "step": 1657, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.79167175292969, + "epoch": 5.600337268128162, + "grad_norm": 20.773866582057902, + "kl": 0.41015625, + "learning_rate": 5.332207207207207e-07, + "loss": 0.0004, + "reward": 3.4142476320266724, + "reward_std": 0.12623858451843262, + "rewards/final_reward": 1.590514677557561, + "rewards/mask_iou_reward": 0.7952573387787805, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4246641993522644, + "rewards/thk_ans_format_reward": 1.0, + "step": 1658, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.35416793823242, + "epoch": 5.60370994940978, + "grad_norm": 13.944861340918969, + "kl": 0.482421875, + "learning_rate": 5.329391891891891e-07, + "loss": 0.0005, + "reward": 3.3524316549301147, + "reward_std": 0.0691116601228714, + "rewards/final_reward": 1.4367779985159759, + "rewards/mask_iou_reward": 0.7183889992579879, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.35243159532547, + "rewards/thk_ans_format_reward": 1.0, + "step": 1659, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.57292175292969, + "epoch": 5.6070826306914, + "grad_norm": 21.92530932871542, + "kl": 0.50390625, + "learning_rate": 5.326576576576577e-07, + "loss": 0.0005, + "reward": 3.5524903535842896, + "reward_std": 0.1243749912828207, + "rewards/final_reward": 1.56479281877437, + "rewards/mask_iou_reward": 0.782396409387185, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.552490234375, + "rewards/thk_ans_format_reward": 1.0, + "step": 1660, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.64583587646484, + "epoch": 5.610455311973018, + "grad_norm": 32.03033808136419, + "kl": 0.474609375, + "learning_rate": 5.323761261261262e-07, + "loss": 0.0005, + "reward": 3.106863260269165, + "reward_std": 0.08488507196307182, + "rewards/final_reward": 1.1713338661055392, + "rewards/mask_iou_reward": 0.5856669330527696, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1068630814552307, + "rewards/thk_ans_format_reward": 1.0, + "step": 1661, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.26042938232422, + "epoch": 5.613827993254637, + "grad_norm": 11.539811281420858, + "kl": 0.494140625, + "learning_rate": 5.320945945945946e-07, + "loss": 0.0005, + "reward": 3.4144234657287598, + "reward_std": 0.1457614228129387, + "rewards/final_reward": 1.3577875336470997, + "rewards/mask_iou_reward": 0.6788937668235498, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4248398542404175, + "rewards/thk_ans_format_reward": 1.0, + "step": 1662, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.53125, + "epoch": 5.617200674536257, + "grad_norm": 8.60480920359381, + "kl": 0.4521484375, + "learning_rate": 5.318130630630631e-07, + "loss": 0.0005, + "reward": 3.43615460395813, + "reward_std": 0.09291991218924522, + "rewards/final_reward": 1.0773759939667544, + "rewards/mask_iou_reward": 0.5386879969833772, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4361546635627747, + "rewards/thk_ans_format_reward": 1.0, + "step": 1663, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.45833587646484, + "epoch": 5.620573355817875, + "grad_norm": 12.209485569856556, + "kl": 0.6083984375, + "learning_rate": 5.315315315315315e-07, + "loss": 0.0006, + "reward": 3.3641871213912964, + "reward_std": 0.062420524656772614, + "rewards/final_reward": 0.591343764088626, + "rewards/mask_iou_reward": 0.295671882044313, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.364187240600586, + "rewards/thk_ans_format_reward": 1.0, + "step": 1664, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.42708969116211, + "epoch": 5.623946037099494, + "grad_norm": 21.161549557227392, + "kl": 0.41796875, + "learning_rate": 5.3125e-07, + "loss": 0.0004, + "reward": 3.2564741373062134, + "reward_std": 0.10630857944488525, + "rewards/final_reward": 1.307717137972623, + "rewards/mask_iou_reward": 0.6538585689863115, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2564740180969238, + "rewards/thk_ans_format_reward": 1.0, + "step": 1665, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.64583587646484, + "epoch": 5.627318718381113, + "grad_norm": 7.549720713456712, + "kl": 0.375, + "learning_rate": 5.309684684684685e-07, + "loss": 0.0004, + "reward": 3.6098986864089966, + "reward_std": 0.08896861225366592, + "rewards/final_reward": 1.8692268517240347, + "rewards/mask_iou_reward": 0.9346134258620173, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.609898328781128, + "rewards/thk_ans_format_reward": 1.0, + "step": 1666, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.59375381469727, + "epoch": 5.630691399662732, + "grad_norm": 6.874837411521086, + "kl": 0.572265625, + "learning_rate": 5.306869369369369e-07, + "loss": 0.0006, + "reward": 3.6495296955108643, + "reward_std": 0.020325029268860817, + "rewards/final_reward": 1.8610696951488621, + "rewards/mask_iou_reward": 0.9305348475744311, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6495293974876404, + "rewards/thk_ans_format_reward": 1.0, + "step": 1667, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.95833587646484, + "epoch": 5.63406408094435, + "grad_norm": 15.487206766913037, + "kl": 0.4453125, + "learning_rate": 5.304054054054054e-07, + "loss": 0.0004, + "reward": 3.2291752099990845, + "reward_std": 0.0427558608353138, + "rewards/final_reward": 1.6678412198822345, + "rewards/mask_iou_reward": 0.8339206099411173, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.229175090789795, + "rewards/thk_ans_format_reward": 1.0, + "step": 1668, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.07292175292969, + "epoch": 5.63743676222597, + "grad_norm": 11.987338469465488, + "kl": 0.490234375, + "learning_rate": 5.301238738738738e-07, + "loss": 0.0005, + "reward": 3.36657178401947, + "reward_std": 0.06892849691212177, + "rewards/final_reward": 1.483850377035424, + "rewards/mask_iou_reward": 0.741925188517712, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3665716052055359, + "rewards/thk_ans_format_reward": 1.0, + "step": 1669, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.81250762939453, + "epoch": 5.640809443507589, + "grad_norm": 8.52808794702646, + "kl": 0.4130859375, + "learning_rate": 5.298423423423423e-07, + "loss": 0.0004, + "reward": 3.2042828798294067, + "reward_std": 0.04842074401676655, + "rewards/final_reward": 1.0179814834202083, + "rewards/mask_iou_reward": 0.5089907417101042, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2042827904224396, + "rewards/thk_ans_format_reward": 1.0, + "step": 1670, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.20833587646484, + "epoch": 5.644182124789207, + "grad_norm": 11.609123162744043, + "kl": 0.49609375, + "learning_rate": 5.295608108108109e-07, + "loss": 0.0005, + "reward": 3.619337797164917, + "reward_std": 0.10815814649686217, + "rewards/final_reward": 1.6707233426072208, + "rewards/mask_iou_reward": 0.8353616713036104, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.619337797164917, + "rewards/thk_ans_format_reward": 1.0, + "step": 1671, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.75000381469727, + "epoch": 5.6475548060708265, + "grad_norm": 9.05456878348143, + "kl": 0.466796875, + "learning_rate": 5.292792792792793e-07, + "loss": 0.0005, + "reward": 3.596415400505066, + "reward_std": 0.04145562183111906, + "rewards/final_reward": 1.4119979903838544, + "rewards/mask_iou_reward": 0.7059989951919272, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5964152216911316, + "rewards/thk_ans_format_reward": 1.0, + "step": 1672, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.36458587646484, + "epoch": 5.650927487352445, + "grad_norm": 7.830828364844946, + "kl": 0.4765625, + "learning_rate": 5.289977477477478e-07, + "loss": 0.0005, + "reward": 3.470069169998169, + "reward_std": 0.07297424972057343, + "rewards/final_reward": 1.0570247474625651, + "rewards/mask_iou_reward": 0.5285123737312826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4700689315795898, + "rewards/thk_ans_format_reward": 1.0, + "step": 1673, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.60416793823242, + "epoch": 5.654300168634064, + "grad_norm": 8.478112364771983, + "kl": 0.638671875, + "learning_rate": 5.287162162162162e-07, + "loss": 0.0006, + "reward": 3.4935306310653687, + "reward_std": 0.07750003226101398, + "rewards/final_reward": 1.1903071669456566, + "rewards/mask_iou_reward": 0.5951535834728283, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.493530511856079, + "rewards/thk_ans_format_reward": 1.0, + "step": 1674, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.86458587646484, + "epoch": 5.6576728499156825, + "grad_norm": 13.125329872620233, + "kl": 0.884765625, + "learning_rate": 5.284346846846847e-07, + "loss": 0.0009, + "reward": 3.6382278203964233, + "reward_std": 0.1006748378276825, + "rewards/final_reward": 1.789235596962822, + "rewards/mask_iou_reward": 0.894617798481411, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6382275819778442, + "rewards/thk_ans_format_reward": 1.0, + "step": 1675, + "think_completion_length": 10.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.77083969116211, + "epoch": 5.661045531197302, + "grad_norm": 107.66558914496177, + "kl": 0.45703125, + "learning_rate": 5.281531531531532e-07, + "loss": 0.0005, + "reward": 3.3801087141036987, + "reward_std": 0.10969089716672897, + "rewards/final_reward": 1.0466964469001674, + "rewards/mask_iou_reward": 0.5233482234500837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3801087141036987, + "rewards/thk_ans_format_reward": 1.0, + "step": 1676, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.37500381469727, + "epoch": 5.664418212478921, + "grad_norm": 10.73940996989274, + "kl": 0.4755859375, + "learning_rate": 5.278716216216215e-07, + "loss": 0.0005, + "reward": 3.7271050214767456, + "reward_std": 0.08030565828084946, + "rewards/final_reward": 1.5374772492864222, + "rewards/mask_iou_reward": 0.7687386246432111, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7271050214767456, + "rewards/thk_ans_format_reward": 1.0, + "step": 1677, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.1875, + "epoch": 5.6677908937605395, + "grad_norm": 6.539633603292251, + "kl": 0.4658203125, + "learning_rate": 5.2759009009009e-07, + "loss": 0.0005, + "reward": 3.651551127433777, + "reward_std": 0.06802648678421974, + "rewards/final_reward": 1.3033550017231739, + "rewards/mask_iou_reward": 0.6516775008615869, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6515511274337769, + "rewards/thk_ans_format_reward": 1.0, + "step": 1678, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.50000381469727, + "epoch": 5.671163575042159, + "grad_norm": 8.418325679063077, + "kl": 0.4365234375, + "learning_rate": 5.273085585585584e-07, + "loss": 0.0004, + "reward": 3.2904953956604004, + "reward_std": 0.11499625258147717, + "rewards/final_reward": 1.4031736868913716, + "rewards/mask_iou_reward": 0.7015868434456858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2904953360557556, + "rewards/thk_ans_format_reward": 1.0, + "step": 1679, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.84375, + "epoch": 5.674536256323777, + "grad_norm": 23.618377937796875, + "kl": 0.576171875, + "learning_rate": 5.270270270270269e-07, + "loss": 0.0006, + "reward": 3.468711018562317, + "reward_std": 0.18310219049453735, + "rewards/final_reward": 1.1853710656067125, + "rewards/mask_iou_reward": 0.5926855328033562, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4687109589576721, + "rewards/thk_ans_format_reward": 1.0, + "step": 1680, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.11458587646484, + "epoch": 5.677908937605396, + "grad_norm": 7.811614773663714, + "kl": 0.474609375, + "learning_rate": 5.267454954954955e-07, + "loss": 0.0005, + "reward": 3.676987886428833, + "reward_std": 0.07566726952791214, + "rewards/final_reward": 1.582874192705009, + "rewards/mask_iou_reward": 0.7914370963525045, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.676987886428833, + "rewards/thk_ans_format_reward": 1.0, + "step": 1681, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.19791793823242, + "epoch": 5.681281618887015, + "grad_norm": 9.22818287374541, + "kl": 0.4150390625, + "learning_rate": 5.264639639639639e-07, + "loss": 0.0004, + "reward": 3.2872849702835083, + "reward_std": 0.1653279960155487, + "rewards/final_reward": 1.6721876660408017, + "rewards/mask_iou_reward": 0.8360938330204009, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2872849702835083, + "rewards/thk_ans_format_reward": 1.0, + "step": 1682, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.2291717529297, + "epoch": 5.684654300168634, + "grad_norm": 13.911973370238616, + "kl": 0.4345703125, + "learning_rate": 5.261824324324324e-07, + "loss": 0.0004, + "reward": 3.442698359489441, + "reward_std": 0.07027308642864227, + "rewards/final_reward": 1.656888151695507, + "rewards/mask_iou_reward": 0.8284440758477535, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.442698359489441, + "rewards/thk_ans_format_reward": 1.0, + "step": 1683, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.83333587646484, + "epoch": 5.688026981450253, + "grad_norm": 9.683213301253808, + "kl": 0.435546875, + "learning_rate": 5.259009009009009e-07, + "loss": 0.0004, + "reward": 3.4510587453842163, + "reward_std": 0.12935582548379898, + "rewards/final_reward": 1.524409884196702, + "rewards/mask_iou_reward": 0.762204942098351, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4510589241981506, + "rewards/thk_ans_format_reward": 1.0, + "step": 1684, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.58333587646484, + "epoch": 5.691399662731872, + "grad_norm": 9.08676964608566, + "kl": 0.5107421875, + "learning_rate": 5.256193693693693e-07, + "loss": 0.0005, + "reward": 2.989185094833374, + "reward_std": 0.08790277317166328, + "rewards/final_reward": 1.412420054225461, + "rewards/mask_iou_reward": 0.7062100271127305, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9891849160194397, + "rewards/thk_ans_format_reward": 1.0, + "step": 1685, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.0416717529297, + "epoch": 5.694772344013491, + "grad_norm": 15.651366383255017, + "kl": 0.421875, + "learning_rate": 5.253378378378378e-07, + "loss": 0.0004, + "reward": 3.205936312675476, + "reward_std": 0.10460496693849564, + "rewards/final_reward": 1.316913765441078, + "rewards/mask_iou_reward": 0.658456882720539, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.205936223268509, + "rewards/thk_ans_format_reward": 1.0, + "step": 1686, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.11458587646484, + "epoch": 5.698145025295109, + "grad_norm": 9.987256106162631, + "kl": 0.4052734375, + "learning_rate": 5.250563063063062e-07, + "loss": 0.0004, + "reward": 3.3900359869003296, + "reward_std": 0.09033327549695969, + "rewards/final_reward": 0.7899245332137992, + "rewards/mask_iou_reward": 0.3949622666068996, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3900358080863953, + "rewards/thk_ans_format_reward": 1.0, + "step": 1687, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.42708587646484, + "epoch": 5.701517706576729, + "grad_norm": 32.114201494541796, + "kl": 0.4638671875, + "learning_rate": 5.247747747747747e-07, + "loss": 0.0005, + "reward": 3.728898286819458, + "reward_std": 0.03537928406149149, + "rewards/final_reward": 1.4416144211001236, + "rewards/mask_iou_reward": 0.7208072105500618, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7288981676101685, + "rewards/thk_ans_format_reward": 1.0, + "step": 1688, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.82291793823242, + "epoch": 5.704890387858347, + "grad_norm": 15.12992837384189, + "kl": 0.5078125, + "learning_rate": 5.244932432432432e-07, + "loss": 0.0005, + "reward": 3.466995120048523, + "reward_std": 0.11427108105272055, + "rewards/final_reward": 1.6574982774473515, + "rewards/mask_iou_reward": 0.8287491387236757, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4669951796531677, + "rewards/thk_ans_format_reward": 1.0, + "step": 1689, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.23958587646484, + "epoch": 5.708263069139966, + "grad_norm": 15.175398411998135, + "kl": 0.4794921875, + "learning_rate": 5.242117117117116e-07, + "loss": 0.0005, + "reward": 3.348542332649231, + "reward_std": 0.12049789726734161, + "rewards/final_reward": 1.367889439033234, + "rewards/mask_iou_reward": 0.683944719516617, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3485422134399414, + "rewards/thk_ans_format_reward": 1.0, + "step": 1690, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.98958587646484, + "epoch": 5.7116357504215856, + "grad_norm": 10.96788286776064, + "kl": 0.515625, + "learning_rate": 5.239301801801802e-07, + "loss": 0.0005, + "reward": 3.5892138481140137, + "reward_std": 0.10701700299978256, + "rewards/final_reward": 1.3613337813454496, + "rewards/mask_iou_reward": 0.6806668906727248, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5892136096954346, + "rewards/thk_ans_format_reward": 1.0, + "step": 1691, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.25000381469727, + "epoch": 5.715008431703204, + "grad_norm": 19.324395640798016, + "kl": 0.408203125, + "learning_rate": 5.236486486486486e-07, + "loss": 0.0004, + "reward": 3.7077428102493286, + "reward_std": 0.036984759382903576, + "rewards/final_reward": 1.5949915653025974, + "rewards/mask_iou_reward": 0.7974957826512987, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7077427506446838, + "rewards/thk_ans_format_reward": 1.0, + "step": 1692, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.94791793823242, + "epoch": 5.718381112984823, + "grad_norm": 31.703385874009, + "kl": 0.544921875, + "learning_rate": 5.233671171171171e-07, + "loss": 0.0006, + "reward": 3.311197876930237, + "reward_std": 0.07203967496752739, + "rewards/final_reward": 0.9592490981961042, + "rewards/mask_iou_reward": 0.4796245490980521, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3111979365348816, + "rewards/thk_ans_format_reward": 1.0, + "step": 1693, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.61459350585938, + "epoch": 5.721753794266442, + "grad_norm": 18.58018766643279, + "kl": 0.505859375, + "learning_rate": 5.230855855855856e-07, + "loss": 0.0005, + "reward": 3.0218993425369263, + "reward_std": 0.052796896547079086, + "rewards/final_reward": 1.4534910951146789, + "rewards/mask_iou_reward": 0.7267455475573394, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0218994319438934, + "rewards/thk_ans_format_reward": 1.0, + "step": 1694, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.90625381469727, + "epoch": 5.725126475548061, + "grad_norm": 68.33497978649426, + "kl": 0.484375, + "learning_rate": 5.22804054054054e-07, + "loss": 0.0005, + "reward": 3.3459736108779907, + "reward_std": 0.13298944756388664, + "rewards/final_reward": 1.8743874890546413, + "rewards/mask_iou_reward": 0.9371937445273206, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3459736108779907, + "rewards/thk_ans_format_reward": 1.0, + "step": 1695, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.1979217529297, + "epoch": 5.728499156829679, + "grad_norm": 18.308639001399047, + "kl": 0.4443359375, + "learning_rate": 5.225225225225225e-07, + "loss": 0.0005, + "reward": 3.351117730140686, + "reward_std": 0.09872744139283895, + "rewards/final_reward": 1.233155712943284, + "rewards/mask_iou_reward": 0.616577856471642, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3511175513267517, + "rewards/thk_ans_format_reward": 1.0, + "step": 1696, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.0, + "epoch": 5.7318718381112985, + "grad_norm": 11.213363187180152, + "kl": 0.443359375, + "learning_rate": 5.22240990990991e-07, + "loss": 0.0004, + "reward": 3.452345371246338, + "reward_std": 0.1489114686846733, + "rewards/final_reward": 1.5670607741123705, + "rewards/mask_iou_reward": 0.7835303870561853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4523451924324036, + "rewards/thk_ans_format_reward": 1.0, + "step": 1697, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.53125, + "epoch": 5.735244519392918, + "grad_norm": 14.89224352148172, + "kl": 0.3994140625, + "learning_rate": 5.219594594594594e-07, + "loss": 0.0004, + "reward": 3.5091474056243896, + "reward_std": 0.1592223308980465, + "rewards/final_reward": 1.7831663927632908, + "rewards/mask_iou_reward": 0.8915831963816454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5091473460197449, + "rewards/thk_ans_format_reward": 1.0, + "step": 1698, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.80208587646484, + "epoch": 5.738617200674536, + "grad_norm": 23.0003418459873, + "kl": 0.4375, + "learning_rate": 5.216779279279279e-07, + "loss": 0.0004, + "reward": 3.3121293783187866, + "reward_std": 0.13283708691596985, + "rewards/final_reward": 1.2491971488263538, + "rewards/mask_iou_reward": 0.6245985744131769, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3121294975280762, + "rewards/thk_ans_format_reward": 1.0, + "step": 1699, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.50000381469727, + "epoch": 5.7419898819561555, + "grad_norm": 45.87529807068098, + "kl": 0.4248046875, + "learning_rate": 5.213963963963963e-07, + "loss": 0.0004, + "reward": 3.6261146068573, + "reward_std": 0.08257642015814781, + "rewards/final_reward": 1.6618618352805021, + "rewards/mask_iou_reward": 0.8309309176402511, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6261144876480103, + "rewards/thk_ans_format_reward": 1.0, + "step": 1700, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.11458587646484, + "epoch": 5.745362563237774, + "grad_norm": 18.868106450012085, + "kl": 0.6328125, + "learning_rate": 5.211148648648649e-07, + "loss": 0.0006, + "reward": 3.6011475324630737, + "reward_std": 0.1154952123761177, + "rewards/final_reward": 1.844404480008979, + "rewards/mask_iou_reward": 0.9222022400044895, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6011476516723633, + "rewards/thk_ans_format_reward": 1.0, + "step": 1701, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.39583587646484, + "epoch": 5.748735244519393, + "grad_norm": 9.694625454651934, + "kl": 0.53515625, + "learning_rate": 5.208333333333334e-07, + "loss": 0.0005, + "reward": 3.4521749019622803, + "reward_std": 0.10139688663184643, + "rewards/final_reward": 0.945008864001477, + "rewards/mask_iou_reward": 0.4725044320007385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4521750807762146, + "rewards/thk_ans_format_reward": 1.0, + "step": 1702, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.66666793823242, + "epoch": 5.7521079258010115, + "grad_norm": 17.253212109876063, + "kl": 0.53125, + "learning_rate": 5.205518018018018e-07, + "loss": 0.0005, + "reward": 3.2960422039031982, + "reward_std": 0.1437181867659092, + "rewards/final_reward": 0.952370236450588, + "rewards/mask_iou_reward": 0.476185118225294, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3064589500427246, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1703, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.50000762939453, + "epoch": 5.755480607082631, + "grad_norm": 17.31363700736517, + "kl": 0.423828125, + "learning_rate": 5.202702702702703e-07, + "loss": 0.0004, + "reward": 3.463135838508606, + "reward_std": 0.04686561040580273, + "rewards/final_reward": 1.8182827151669658, + "rewards/mask_iou_reward": 0.9091413575834829, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4631354808807373, + "rewards/thk_ans_format_reward": 1.0, + "step": 1704, + "think_completion_length": 9.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.5, + "epoch": 5.75885328836425, + "grad_norm": 10.915200571014212, + "kl": 0.5595703125, + "learning_rate": 5.199887387387387e-07, + "loss": 0.0006, + "reward": 3.3400657176971436, + "reward_std": 0.12699758261442184, + "rewards/final_reward": 0.9774009059955321, + "rewards/mask_iou_reward": 0.48870045299776604, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.340065598487854, + "rewards/thk_ans_format_reward": 1.0, + "step": 1705, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.79166793823242, + "epoch": 5.762225969645868, + "grad_norm": 10.499187119948228, + "kl": 0.4296875, + "learning_rate": 5.197072072072072e-07, + "loss": 0.0004, + "reward": 3.5650423765182495, + "reward_std": 0.18133790232241154, + "rewards/final_reward": 1.4786204764762632, + "rewards/mask_iou_reward": 0.7393102382381316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5650423765182495, + "rewards/thk_ans_format_reward": 1.0, + "step": 1706, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.42708587646484, + "epoch": 5.765598650927488, + "grad_norm": 19.482838286261735, + "kl": 0.5107421875, + "learning_rate": 5.194256756756757e-07, + "loss": 0.0005, + "reward": 3.2994097471237183, + "reward_std": 0.10507571697235107, + "rewards/final_reward": 0.855652438364876, + "rewards/mask_iou_reward": 0.427826219182438, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.299409806728363, + "rewards/thk_ans_format_reward": 1.0, + "step": 1707, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.85416793823242, + "epoch": 5.768971332209106, + "grad_norm": 10.232377939016123, + "kl": 0.484375, + "learning_rate": 5.191441441441441e-07, + "loss": 0.0005, + "reward": 3.475585460662842, + "reward_std": 0.10696535930037498, + "rewards/final_reward": 1.4330362248927848, + "rewards/mask_iou_reward": 0.7165181124463924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.475585401058197, + "rewards/thk_ans_format_reward": 1.0, + "step": 1708, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.16666793823242, + "epoch": 5.772344013490725, + "grad_norm": 8.795494606383011, + "kl": 0.4638671875, + "learning_rate": 5.188626126126126e-07, + "loss": 0.0005, + "reward": 3.252189874649048, + "reward_std": 0.116634551435709, + "rewards/final_reward": 1.8850582728807999, + "rewards/mask_iou_reward": 0.9425291364403999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2521897554397583, + "rewards/thk_ans_format_reward": 1.0, + "step": 1709, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.46875, + "epoch": 5.775716694772344, + "grad_norm": 11.388991934526885, + "kl": 0.603515625, + "learning_rate": 5.18581081081081e-07, + "loss": 0.0006, + "reward": 3.1848024129867554, + "reward_std": 0.07361265271902084, + "rewards/final_reward": 1.122927262485939, + "rewards/mask_iou_reward": 0.5614636312429695, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1848024129867554, + "rewards/thk_ans_format_reward": 1.0, + "step": 1710, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.0625, + "epoch": 5.779089376053963, + "grad_norm": 21.639140996155447, + "kl": 0.455078125, + "learning_rate": 5.182995495495496e-07, + "loss": 0.0005, + "reward": 3.5467482805252075, + "reward_std": 0.08291709423065186, + "rewards/final_reward": 1.4941731254870554, + "rewards/mask_iou_reward": 0.7470865627435277, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5467482209205627, + "rewards/thk_ans_format_reward": 1.0, + "step": 1711, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.73958587646484, + "epoch": 5.782462057335582, + "grad_norm": 22.735884919407788, + "kl": 0.61328125, + "learning_rate": 5.180180180180181e-07, + "loss": 0.0006, + "reward": 2.988006830215454, + "reward_std": 0.13545718044042587, + "rewards/final_reward": 0.8691919661445637, + "rewards/mask_iou_reward": 0.43459598307228187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9880067706108093, + "rewards/thk_ans_format_reward": 1.0, + "step": 1712, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.68750762939453, + "epoch": 5.785834738617201, + "grad_norm": 7.582875420681268, + "kl": 0.4755859375, + "learning_rate": 5.177364864864865e-07, + "loss": 0.0005, + "reward": 3.2828404903411865, + "reward_std": 0.07186983339488506, + "rewards/final_reward": 1.1207400699644596, + "rewards/mask_iou_reward": 0.5603700349822298, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2828404903411865, + "rewards/thk_ans_format_reward": 1.0, + "step": 1713, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.79167175292969, + "epoch": 5.78920741989882, + "grad_norm": 11.276241793600565, + "kl": 0.486328125, + "learning_rate": 5.17454954954955e-07, + "loss": 0.0005, + "reward": 3.64591646194458, + "reward_std": 0.10193854942917824, + "rewards/final_reward": 1.731438612762751, + "rewards/mask_iou_reward": 0.8657193063813755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6459165215492249, + "rewards/thk_ans_format_reward": 1.0, + "step": 1714, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.6666717529297, + "epoch": 5.792580101180438, + "grad_norm": 8.613140831131751, + "kl": 0.701171875, + "learning_rate": 5.171734234234235e-07, + "loss": 0.0007, + "reward": 3.4341585636138916, + "reward_std": 0.10433509945869446, + "rewards/final_reward": 1.3357545861748017, + "rewards/mask_iou_reward": 0.6678772930874008, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4341585636138916, + "rewards/thk_ans_format_reward": 1.0, + "step": 1715, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.16666793823242, + "epoch": 5.795952782462058, + "grad_norm": 11.510790702330484, + "kl": 0.5400390625, + "learning_rate": 5.168918918918919e-07, + "loss": 0.0005, + "reward": 3.6309818029403687, + "reward_std": 0.07807623594999313, + "rewards/final_reward": 1.3849225081200518, + "rewards/mask_iou_reward": 0.6924612540600259, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6309821009635925, + "rewards/thk_ans_format_reward": 1.0, + "step": 1716, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.16666793823242, + "epoch": 5.799325463743676, + "grad_norm": 54.070522534119505, + "kl": 0.439453125, + "learning_rate": 5.166103603603603e-07, + "loss": 0.0004, + "reward": 3.3230055570602417, + "reward_std": 0.11343681067228317, + "rewards/final_reward": 1.5856585419995561, + "rewards/mask_iou_reward": 0.7928292709997781, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3230054378509521, + "rewards/thk_ans_format_reward": 1.0, + "step": 1717, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.53125, + "epoch": 5.802698145025295, + "grad_norm": 23.76981236983568, + "kl": 0.525390625, + "learning_rate": 5.163288288288287e-07, + "loss": 0.0005, + "reward": 3.3168094158172607, + "reward_std": 0.1799912080168724, + "rewards/final_reward": 1.5219490770111228, + "rewards/mask_iou_reward": 0.7609745385055614, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3168094754219055, + "rewards/thk_ans_format_reward": 1.0, + "step": 1718, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.60416793823242, + "epoch": 5.806070826306914, + "grad_norm": 10.974777498998844, + "kl": 0.4794921875, + "learning_rate": 5.160472972972972e-07, + "loss": 0.0005, + "reward": 3.6152318716049194, + "reward_std": 0.10453075915575027, + "rewards/final_reward": 1.632990182382099, + "rewards/mask_iou_reward": 0.8164950911910495, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6152318120002747, + "rewards/thk_ans_format_reward": 1.0, + "step": 1719, + "think_completion_length": 10.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.50000381469727, + "epoch": 5.809443507588533, + "grad_norm": 8.54597729635777, + "kl": 0.478515625, + "learning_rate": 5.157657657657657e-07, + "loss": 0.0005, + "reward": 3.0575523376464844, + "reward_std": 0.040778761729598045, + "rewards/final_reward": 0.7077429081727465, + "rewards/mask_iou_reward": 0.35387145408637327, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.05755215883255, + "rewards/thk_ans_format_reward": 1.0, + "step": 1720, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.5104217529297, + "epoch": 5.812816188870151, + "grad_norm": 8.132198049021852, + "kl": 0.48046875, + "learning_rate": 5.154842342342342e-07, + "loss": 0.0005, + "reward": 3.6106003522872925, + "reward_std": 0.08156681805849075, + "rewards/final_reward": 1.3410413718043763, + "rewards/mask_iou_reward": 0.6705206859021882, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6106004118919373, + "rewards/thk_ans_format_reward": 1.0, + "step": 1721, + "think_completion_length": 10.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.6041717529297, + "epoch": 5.8161888701517706, + "grad_norm": 15.388878973584676, + "kl": 0.470703125, + "learning_rate": 5.152027027027027e-07, + "loss": 0.0005, + "reward": 3.5082924365997314, + "reward_std": 0.2979699335992336, + "rewards/final_reward": 1.8881755293728573, + "rewards/mask_iou_reward": 0.9440877646864286, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5291257500648499, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1722, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.13542175292969, + "epoch": 5.81956155143339, + "grad_norm": 98.13463787790987, + "kl": 0.572265625, + "learning_rate": 5.149211711711711e-07, + "loss": 0.0006, + "reward": 3.3044140338897705, + "reward_std": 0.05266350507736206, + "rewards/final_reward": 0.9294198038171804, + "rewards/mask_iou_reward": 0.4647099019085902, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3044140934944153, + "rewards/thk_ans_format_reward": 1.0, + "step": 1723, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.34375, + "epoch": 5.822934232715008, + "grad_norm": 17.488677944153867, + "kl": 0.49609375, + "learning_rate": 5.146396396396396e-07, + "loss": 0.0005, + "reward": 3.245222806930542, + "reward_std": 0.14908801019191742, + "rewards/final_reward": 1.455564582508996, + "rewards/mask_iou_reward": 0.727782291254498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2452226877212524, + "rewards/thk_ans_format_reward": 1.0, + "step": 1724, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.62500762939453, + "epoch": 5.8263069139966275, + "grad_norm": 7.789892829358596, + "kl": 0.443359375, + "learning_rate": 5.143581081081081e-07, + "loss": 0.0004, + "reward": 3.2473161220550537, + "reward_std": 0.12342843785881996, + "rewards/final_reward": 1.5933870322320047, + "rewards/mask_iou_reward": 0.7966935161160024, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2473162412643433, + "rewards/thk_ans_format_reward": 1.0, + "step": 1725, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.61458587646484, + "epoch": 5.829679595278246, + "grad_norm": 13.595597903885679, + "kl": 0.462890625, + "learning_rate": 5.140765765765765e-07, + "loss": 0.0005, + "reward": 3.493627429008484, + "reward_std": 0.10147467255592346, + "rewards/final_reward": 1.446620973518145, + "rewards/mask_iou_reward": 0.7233104867590725, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4936274290084839, + "rewards/thk_ans_format_reward": 1.0, + "step": 1726, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.94791793823242, + "epoch": 5.833052276559865, + "grad_norm": 11.792450197444229, + "kl": 1.236328125, + "learning_rate": 5.13795045045045e-07, + "loss": 0.0012, + "reward": 3.316351890563965, + "reward_std": 0.07703639194369316, + "rewards/final_reward": 1.0956512096704052, + "rewards/mask_iou_reward": 0.5478256048352026, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3163517713546753, + "rewards/thk_ans_format_reward": 1.0, + "step": 1727, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.7604217529297, + "epoch": 5.8364249578414835, + "grad_norm": 8.292767744572648, + "kl": 0.3994140625, + "learning_rate": 5.135135135135134e-07, + "loss": 0.0004, + "reward": 3.33501935005188, + "reward_std": 0.2470620460808277, + "rewards/final_reward": 1.5175090174757648, + "rewards/mask_iou_reward": 0.7587545087378824, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3558525443077087, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1728, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.21875, + "epoch": 5.839797639123103, + "grad_norm": 8.741281688283356, + "kl": 0.568359375, + "learning_rate": 5.132319819819819e-07, + "loss": 0.0006, + "reward": 3.7741293907165527, + "reward_std": 0.06825266778469086, + "rewards/final_reward": 1.4154810710394847, + "rewards/mask_iou_reward": 0.7077405355197424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.774129331111908, + "rewards/thk_ans_format_reward": 1.0, + "step": 1729, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.61458587646484, + "epoch": 5.843170320404722, + "grad_norm": 19.962365291750043, + "kl": 0.361328125, + "learning_rate": 5.129504504504504e-07, + "loss": 0.0004, + "reward": 3.6556034088134766, + "reward_std": 0.0850059799849987, + "rewards/final_reward": 1.8285352974407516, + "rewards/mask_iou_reward": 0.9142676487203758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.655603289604187, + "rewards/thk_ans_format_reward": 1.0, + "step": 1730, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.71875, + "epoch": 5.8465430016863404, + "grad_norm": 18.535866700500076, + "kl": 0.4091796875, + "learning_rate": 5.126689189189189e-07, + "loss": 0.0004, + "reward": 3.645634651184082, + "reward_std": 0.08320539817214012, + "rewards/final_reward": 1.8012216379238128, + "rewards/mask_iou_reward": 0.9006108189619064, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.645634651184082, + "rewards/thk_ans_format_reward": 1.0, + "step": 1731, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.08333587646484, + "epoch": 5.84991568296796, + "grad_norm": 10.25741967867522, + "kl": 0.3798828125, + "learning_rate": 5.123873873873874e-07, + "loss": 0.0004, + "reward": 3.1230812072753906, + "reward_std": 0.12471498548984528, + "rewards/final_reward": 1.1030794339834928, + "rewards/mask_iou_reward": 0.5515397169917464, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1230809688568115, + "rewards/thk_ans_format_reward": 1.0, + "step": 1732, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.0729217529297, + "epoch": 5.853288364249578, + "grad_norm": 11.29392121179074, + "kl": 0.6533203125, + "learning_rate": 5.121058558558559e-07, + "loss": 0.0007, + "reward": 3.413060188293457, + "reward_std": 0.09217966627329588, + "rewards/final_reward": 1.7253575871750448, + "rewards/mask_iou_reward": 0.8626787935875224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4130603075027466, + "rewards/thk_ans_format_reward": 1.0, + "step": 1733, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.27083587646484, + "epoch": 5.856661045531197, + "grad_norm": 11.747910346406071, + "kl": 0.466796875, + "learning_rate": 5.118243243243243e-07, + "loss": 0.0005, + "reward": 3.640872836112976, + "reward_std": 0.03201808128505945, + "rewards/final_reward": 1.9409818975883342, + "rewards/mask_iou_reward": 0.9704909487941671, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6408729553222656, + "rewards/thk_ans_format_reward": 1.0, + "step": 1734, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.23958587646484, + "epoch": 5.860033726812816, + "grad_norm": 6.658730706855383, + "kl": 0.42578125, + "learning_rate": 5.115427927927928e-07, + "loss": 0.0004, + "reward": 3.450482726097107, + "reward_std": 0.38272392749786377, + "rewards/final_reward": 1.4120153309699566, + "rewards/mask_iou_reward": 0.7060076654849783, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.512982964515686, + "rewards/thk_ans_format_reward": 0.9687500298023224, + "step": 1735, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.7916717529297, + "epoch": 5.863406408094435, + "grad_norm": 22.93808254518162, + "kl": 0.4140625, + "learning_rate": 5.112612612612612e-07, + "loss": 0.0004, + "reward": 3.6187745332717896, + "reward_std": 0.09785094857215881, + "rewards/final_reward": 1.4834057744913518, + "rewards/mask_iou_reward": 0.7417028872456759, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6187745928764343, + "rewards/thk_ans_format_reward": 1.0, + "step": 1736, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.2916717529297, + "epoch": 5.866779089376054, + "grad_norm": 10.396671513600031, + "kl": 0.4033203125, + "learning_rate": 5.109797297297297e-07, + "loss": 0.0004, + "reward": 3.3890039920806885, + "reward_std": 0.11911951750516891, + "rewards/final_reward": 0.9812503136409951, + "rewards/mask_iou_reward": 0.49062515682049757, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3890039920806885, + "rewards/thk_ans_format_reward": 1.0, + "step": 1737, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.65625, + "epoch": 5.870151770657673, + "grad_norm": 29.160739711927683, + "kl": 0.5849609375, + "learning_rate": 5.106981981981982e-07, + "loss": 0.0006, + "reward": 3.5928313732147217, + "reward_std": 0.05474974773824215, + "rewards/final_reward": 1.9592838294684447, + "rewards/mask_iou_reward": 0.9796419147342224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5928313732147217, + "rewards/thk_ans_format_reward": 1.0, + "step": 1738, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.73959350585938, + "epoch": 5.873524451939292, + "grad_norm": 17.75064820845106, + "kl": 0.4208984375, + "learning_rate": 5.104166666666666e-07, + "loss": 0.0004, + "reward": 3.7334030866622925, + "reward_std": 0.05087855085730553, + "rewards/final_reward": 1.7924745983336359, + "rewards/mask_iou_reward": 0.8962372991668179, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7334030866622925, + "rewards/thk_ans_format_reward": 1.0, + "step": 1739, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.11458587646484, + "epoch": 5.87689713322091, + "grad_norm": 18.717009916786903, + "kl": 0.546875, + "learning_rate": 5.101351351351351e-07, + "loss": 0.0006, + "reward": 3.5742392539978027, + "reward_std": 0.09535662084817886, + "rewards/final_reward": 1.240059957912595, + "rewards/mask_iou_reward": 0.6200299789562975, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5742393732070923, + "rewards/thk_ans_format_reward": 1.0, + "step": 1740, + "think_completion_length": 9.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.1875, + "epoch": 5.88026981450253, + "grad_norm": 16.3173306666386, + "kl": 0.4306640625, + "learning_rate": 5.098536036036036e-07, + "loss": 0.0004, + "reward": 3.5463132858276367, + "reward_std": 0.06176626309752464, + "rewards/final_reward": 1.612073242086272, + "rewards/mask_iou_reward": 0.806036621043136, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5463131666183472, + "rewards/thk_ans_format_reward": 1.0, + "step": 1741, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.86458587646484, + "epoch": 5.883642495784148, + "grad_norm": 15.342405431064343, + "kl": 0.46875, + "learning_rate": 5.095720720720721e-07, + "loss": 0.0005, + "reward": 3.6642171144485474, + "reward_std": 0.10495152324438095, + "rewards/final_reward": 1.5927597251292855, + "rewards/mask_iou_reward": 0.7963798625646428, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6642171740531921, + "rewards/thk_ans_format_reward": 1.0, + "step": 1742, + "think_completion_length": 10.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.39583587646484, + "epoch": 5.887015177065767, + "grad_norm": 6.6510036765395535, + "kl": 0.5302734375, + "learning_rate": 5.092905405405406e-07, + "loss": 0.0005, + "reward": 3.4673802852630615, + "reward_std": 0.0501430481672287, + "rewards/final_reward": 1.4812380552366138, + "rewards/mask_iou_reward": 0.7406190276183069, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4673802852630615, + "rewards/thk_ans_format_reward": 1.0, + "step": 1743, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.89583587646484, + "epoch": 5.8903878583473865, + "grad_norm": 24.165763442619365, + "kl": 0.4189453125, + "learning_rate": 5.09009009009009e-07, + "loss": 0.0004, + "reward": 3.6040745973587036, + "reward_std": 0.06931154802441597, + "rewards/final_reward": 1.5248657320782466, + "rewards/mask_iou_reward": 0.7624328660391233, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.604074478149414, + "rewards/thk_ans_format_reward": 1.0, + "step": 1744, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5729217529297, + "epoch": 5.893760539629005, + "grad_norm": 40.57122292570664, + "kl": 0.4365234375, + "learning_rate": 5.087274774774775e-07, + "loss": 0.0004, + "reward": 3.054714322090149, + "reward_std": 0.08430700935423374, + "rewards/final_reward": 1.288889173740179, + "rewards/mask_iou_reward": 0.6444445868700895, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0547142326831818, + "rewards/thk_ans_format_reward": 1.0, + "step": 1745, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.20833587646484, + "epoch": 5.897133220910624, + "grad_norm": 11.861157554661235, + "kl": 0.423828125, + "learning_rate": 5.084459459459459e-07, + "loss": 0.0004, + "reward": 3.5562628507614136, + "reward_std": 0.10923858545720577, + "rewards/final_reward": 1.3463834622729496, + "rewards/mask_iou_reward": 0.6731917311364748, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5562628507614136, + "rewards/thk_ans_format_reward": 1.0, + "step": 1746, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.36459350585938, + "epoch": 5.900505902192243, + "grad_norm": 7.507762898919182, + "kl": 0.4873046875, + "learning_rate": 5.081644144144144e-07, + "loss": 0.0005, + "reward": 3.6194422245025635, + "reward_std": 0.27009348571300507, + "rewards/final_reward": 1.5039307968904405, + "rewards/mask_iou_reward": 0.7519653984452203, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.6611087322235107, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1747, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.7916717529297, + "epoch": 5.903878583473862, + "grad_norm": 17.826037032061887, + "kl": 0.40234375, + "learning_rate": 5.078828828828829e-07, + "loss": 0.0004, + "reward": 3.407109498977661, + "reward_std": 0.08853336982429028, + "rewards/final_reward": 1.2309654718579899, + "rewards/mask_iou_reward": 0.6154827359289949, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4071094989776611, + "rewards/thk_ans_format_reward": 1.0, + "step": 1748, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.1354217529297, + "epoch": 5.90725126475548, + "grad_norm": 39.26605245968703, + "kl": 0.4814453125, + "learning_rate": 5.076013513513513e-07, + "loss": 0.0005, + "reward": 3.35912024974823, + "reward_std": 0.03402594896033406, + "rewards/final_reward": 1.236561535022871, + "rewards/mask_iou_reward": 0.6182807675114355, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3591201305389404, + "rewards/thk_ans_format_reward": 1.0, + "step": 1749, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.83333587646484, + "epoch": 5.9106239460370995, + "grad_norm": 18.73426457604382, + "kl": 0.400390625, + "learning_rate": 5.073198198198198e-07, + "loss": 0.0004, + "reward": 3.7030093669891357, + "reward_std": 0.06464430969208479, + "rewards/final_reward": 1.4769883342304442, + "rewards/mask_iou_reward": 0.7384941671152221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7030091881752014, + "rewards/thk_ans_format_reward": 1.0, + "step": 1750, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.7916717529297, + "epoch": 5.913996627318719, + "grad_norm": 13.299711018916073, + "kl": 0.41015625, + "learning_rate": 5.070382882882884e-07, + "loss": 0.0004, + "reward": 3.3841612339019775, + "reward_std": 0.08488386124372482, + "rewards/final_reward": 1.4041756477925396, + "rewards/mask_iou_reward": 0.7020878238962698, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.384161114692688, + "rewards/thk_ans_format_reward": 1.0, + "step": 1751, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.23958587646484, + "epoch": 5.917369308600337, + "grad_norm": 9.5260628825992, + "kl": 0.412109375, + "learning_rate": 5.067567567567568e-07, + "loss": 0.0004, + "reward": 3.6380069255828857, + "reward_std": 0.04551572538912296, + "rewards/final_reward": 1.8168987853889136, + "rewards/mask_iou_reward": 0.9084493926944568, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.638006865978241, + "rewards/thk_ans_format_reward": 1.0, + "step": 1752, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.05208587646484, + "epoch": 5.920741989881956, + "grad_norm": 6.56819437673071, + "kl": 0.375, + "learning_rate": 5.064752252252253e-07, + "loss": 0.0004, + "reward": 3.733555316925049, + "reward_std": 0.10838979762047529, + "rewards/final_reward": 1.9458549362643849, + "rewards/mask_iou_reward": 0.9729274681321924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7335551381111145, + "rewards/thk_ans_format_reward": 1.0, + "step": 1753, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.85416793823242, + "epoch": 5.924114671163575, + "grad_norm": 17.981830604106577, + "kl": 0.3955078125, + "learning_rate": 5.061936936936937e-07, + "loss": 0.0004, + "reward": 3.5746283531188965, + "reward_std": 0.0435329545289278, + "rewards/final_reward": 1.8869617940125476, + "rewards/mask_iou_reward": 0.9434808970062738, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5746282935142517, + "rewards/thk_ans_format_reward": 1.0, + "step": 1754, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5729217529297, + "epoch": 5.927487352445194, + "grad_norm": 17.18872625472572, + "kl": 0.802734375, + "learning_rate": 5.059121621621622e-07, + "loss": 0.0008, + "reward": 3.631569027900696, + "reward_std": 0.0754023939371109, + "rewards/final_reward": 1.5373684318831864, + "rewards/mask_iou_reward": 0.7686842159415932, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6315687894821167, + "rewards/thk_ans_format_reward": 1.0, + "step": 1755, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.68750762939453, + "epoch": 5.9308600337268125, + "grad_norm": 10.750206089182589, + "kl": 0.505859375, + "learning_rate": 5.056306306306307e-07, + "loss": 0.0005, + "reward": 3.2980340719223022, + "reward_std": 0.09991785138845444, + "rewards/final_reward": 1.2133367748293613, + "rewards/mask_iou_reward": 0.6066683874146807, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.298033893108368, + "rewards/thk_ans_format_reward": 1.0, + "step": 1756, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.75, + "epoch": 5.934232715008432, + "grad_norm": 8.232534124957862, + "kl": 0.408203125, + "learning_rate": 5.05349099099099e-07, + "loss": 0.0004, + "reward": 3.4839136600494385, + "reward_std": 0.1400267817080021, + "rewards/final_reward": 1.68557964538632, + "rewards/mask_iou_reward": 0.84278982269316, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4839134216308594, + "rewards/thk_ans_format_reward": 1.0, + "step": 1757, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.86458587646484, + "epoch": 5.937605396290051, + "grad_norm": 24.402661559828726, + "kl": 0.4072265625, + "learning_rate": 5.050675675675675e-07, + "loss": 0.0004, + "reward": 3.389898419380188, + "reward_std": 0.10899049788713455, + "rewards/final_reward": 1.6848548729834658, + "rewards/mask_iou_reward": 0.8424274364917329, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3898981809616089, + "rewards/thk_ans_format_reward": 1.0, + "step": 1758, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.33333587646484, + "epoch": 5.940978077571669, + "grad_norm": 5.222034550827015, + "kl": 0.392578125, + "learning_rate": 5.047860360360359e-07, + "loss": 0.0004, + "reward": 3.622321844100952, + "reward_std": 0.03675311338156462, + "rewards/final_reward": 1.9151095000401712, + "rewards/mask_iou_reward": 0.9575547500200856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6223217844963074, + "rewards/thk_ans_format_reward": 1.0, + "step": 1759, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.59375762939453, + "epoch": 5.944350758853289, + "grad_norm": 27.666519901031105, + "kl": 0.3837890625, + "learning_rate": 5.045045045045044e-07, + "loss": 0.0004, + "reward": 3.654398560523987, + "reward_std": 0.06969969533383846, + "rewards/final_reward": 1.6697516701864739, + "rewards/mask_iou_reward": 0.8348758350932369, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.654398500919342, + "rewards/thk_ans_format_reward": 1.0, + "step": 1760, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.5625, + "epoch": 5.947723440134907, + "grad_norm": 7.459759500944451, + "kl": 0.4111328125, + "learning_rate": 5.04222972972973e-07, + "loss": 0.0004, + "reward": 3.2948083877563477, + "reward_std": 0.16588781774044037, + "rewards/final_reward": 1.6697245311694944, + "rewards/mask_iou_reward": 0.8348622655847472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2948083281517029, + "rewards/thk_ans_format_reward": 1.0, + "step": 1761, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.52083587646484, + "epoch": 5.951096121416526, + "grad_norm": 10.70021845117529, + "kl": 0.71484375, + "learning_rate": 5.039414414414414e-07, + "loss": 0.0007, + "reward": 3.3829610347747803, + "reward_std": 0.12868967279791832, + "rewards/final_reward": 1.9008267771256642, + "rewards/mask_iou_reward": 0.9504133885628321, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3933777213096619, + "rewards/thk_ans_format_reward": 1.0, + "step": 1762, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.05208587646484, + "epoch": 5.954468802698145, + "grad_norm": 21.624002001456407, + "kl": 0.4658203125, + "learning_rate": 5.036599099099099e-07, + "loss": 0.0005, + "reward": 3.522372007369995, + "reward_std": 0.060267508029937744, + "rewards/final_reward": 1.941477651115174, + "rewards/mask_iou_reward": 0.970738825557587, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.522372305393219, + "rewards/thk_ans_format_reward": 1.0, + "step": 1763, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75000762939453, + "epoch": 5.957841483979764, + "grad_norm": 12.43829962090804, + "kl": 0.4326171875, + "learning_rate": 5.033783783783783e-07, + "loss": 0.0004, + "reward": 3.623276948928833, + "reward_std": 0.0512046292424202, + "rewards/final_reward": 1.2826587931602818, + "rewards/mask_iou_reward": 0.6413293965801409, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6232768297195435, + "rewards/thk_ans_format_reward": 1.0, + "step": 1764, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.19791793823242, + "epoch": 5.961214165261383, + "grad_norm": 17.285948867539556, + "kl": 0.60546875, + "learning_rate": 5.030968468468468e-07, + "loss": 0.0006, + "reward": 3.4323031902313232, + "reward_std": 0.17292555421590805, + "rewards/final_reward": 1.7967404303750492, + "rewards/mask_iou_reward": 0.8983702151875246, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.432303249835968, + "rewards/thk_ans_format_reward": 1.0, + "step": 1765, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.59375762939453, + "epoch": 5.964586846543002, + "grad_norm": 7.939744549301504, + "kl": 0.58984375, + "learning_rate": 5.028153153153153e-07, + "loss": 0.0005, + "reward": 3.7313038110733032, + "reward_std": 0.06714446656405926, + "rewards/final_reward": 1.5971374551194422, + "rewards/mask_iou_reward": 0.7985687275597211, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.731303632259369, + "rewards/thk_ans_format_reward": 1.0, + "step": 1766, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8229217529297, + "epoch": 5.967959527824621, + "grad_norm": 86.11679710344998, + "kl": 0.4453125, + "learning_rate": 5.025337837837837e-07, + "loss": 0.0004, + "reward": 3.4167131185531616, + "reward_std": 0.07535018771886826, + "rewards/final_reward": 1.813674784198955, + "rewards/mask_iou_reward": 0.9068373920994774, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4167132377624512, + "rewards/thk_ans_format_reward": 1.0, + "step": 1767, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.4791717529297, + "epoch": 5.971332209106239, + "grad_norm": 12.385200771779376, + "kl": 0.4775390625, + "learning_rate": 5.022522522522522e-07, + "loss": 0.0005, + "reward": 3.4023174047470093, + "reward_std": 0.12072728388011456, + "rewards/final_reward": 0.8421889742507591, + "rewards/mask_iou_reward": 0.42109448712537956, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4023174047470093, + "rewards/thk_ans_format_reward": 1.0, + "step": 1768, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.03125, + "epoch": 5.974704890387859, + "grad_norm": 53.45386993073585, + "kl": 0.47265625, + "learning_rate": 5.019707207207206e-07, + "loss": 0.0005, + "reward": 3.238593816757202, + "reward_std": 0.07144520059227943, + "rewards/final_reward": 1.6239609793989143, + "rewards/mask_iou_reward": 0.8119804896994571, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2385937571525574, + "rewards/thk_ans_format_reward": 1.0, + "step": 1769, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.96875762939453, + "epoch": 5.978077571669477, + "grad_norm": 14.787089708404682, + "kl": 0.443359375, + "learning_rate": 5.016891891891891e-07, + "loss": 0.0005, + "reward": 3.7221224308013916, + "reward_std": 0.053310368210077286, + "rewards/final_reward": 1.821725056904525, + "rewards/mask_iou_reward": 0.9108625284522625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7221226692199707, + "rewards/thk_ans_format_reward": 1.0, + "step": 1770, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0104217529297, + "epoch": 5.981450252951096, + "grad_norm": 9.746981068060926, + "kl": 0.453125, + "learning_rate": 5.014076576576577e-07, + "loss": 0.0005, + "reward": 3.554728627204895, + "reward_std": 0.024302124045789242, + "rewards/final_reward": 1.364493583967756, + "rewards/mask_iou_reward": 0.682246791983878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5547285079956055, + "rewards/thk_ans_format_reward": 1.0, + "step": 1771, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.89583587646484, + "epoch": 5.9848229342327155, + "grad_norm": 15.870396692550974, + "kl": 0.732421875, + "learning_rate": 5.011261261261261e-07, + "loss": 0.0007, + "reward": 3.3367944955825806, + "reward_std": 0.05224468186497688, + "rewards/final_reward": 1.6297709031076986, + "rewards/mask_iou_reward": 0.8148854515538493, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3367944955825806, + "rewards/thk_ans_format_reward": 1.0, + "step": 1772, + "think_completion_length": 10.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.56250381469727, + "epoch": 5.988195615514334, + "grad_norm": 14.485932527886074, + "kl": 0.484375, + "learning_rate": 5.008445945945946e-07, + "loss": 0.0005, + "reward": 3.652013063430786, + "reward_std": 0.033195996191352606, + "rewards/final_reward": 1.3546507063087527, + "rewards/mask_iou_reward": 0.6773253531543764, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6520132422447205, + "rewards/thk_ans_format_reward": 1.0, + "step": 1773, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.30208587646484, + "epoch": 5.991568296795953, + "grad_norm": 19.43956413548473, + "kl": 0.431640625, + "learning_rate": 5.005630630630631e-07, + "loss": 0.0004, + "reward": 3.2097524404525757, + "reward_std": 0.06278990767896175, + "rewards/final_reward": 1.227444801034187, + "rewards/mask_iou_reward": 0.6137224005170935, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.209752380847931, + "rewards/thk_ans_format_reward": 1.0, + "step": 1774, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.4583396911621, + "epoch": 5.9949409780775715, + "grad_norm": 14.53640635351357, + "kl": 0.41015625, + "learning_rate": 5.002815315315315e-07, + "loss": 0.0004, + "reward": 3.418621063232422, + "reward_std": 0.17865736782550812, + "rewards/final_reward": 1.7044326001778476, + "rewards/mask_iou_reward": 0.8522163000889238, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.439454197883606, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1775, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.8684196472168, + "epoch": 5.998313659359191, + "grad_norm": 15.114676747335935, + "kl": 0.423828125, + "learning_rate": 5e-07, + "loss": 0.0004, + "reward": 3.6168004274368286, + "reward_std": 0.01466382760554552, + "rewards/final_reward": 1.718274332119896, + "rewards/mask_iou_reward": 0.859137166059948, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.616800308227539, + "rewards/thk_ans_format_reward": 1.0, + "step": 1776, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.81250762939453, + "epoch": 6.003372681281619, + "grad_norm": 11.445548876199032, + "kl": 0.4482421875, + "learning_rate": 4.997184684684684e-07, + "loss": 0.0005, + "reward": 3.3490960597991943, + "reward_std": 0.22792461514472961, + "rewards/final_reward": 1.3445919555141737, + "rewards/mask_iou_reward": 0.6722959777570868, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.369929313659668, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1777, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.78125, + "epoch": 6.006745362563238, + "grad_norm": 15.350160021168787, + "kl": 1.03125, + "learning_rate": 4.994369369369369e-07, + "loss": 0.001, + "reward": 3.6605095863342285, + "reward_std": 0.1871098130941391, + "rewards/final_reward": 1.486225992428576, + "rewards/mask_iou_reward": 0.743112996214288, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6813429594039917, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1778, + "think_completion_length": 7.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.95833587646484, + "epoch": 6.010118043844857, + "grad_norm": 19.652957475920775, + "kl": 0.4345703125, + "learning_rate": 4.991554054054054e-07, + "loss": 0.0004, + "reward": 3.4463671445846558, + "reward_std": 0.11494097299873829, + "rewards/final_reward": 1.5364096322411285, + "rewards/mask_iou_reward": 0.7682048161205642, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4463672041893005, + "rewards/thk_ans_format_reward": 1.0, + "step": 1779, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.87500381469727, + "epoch": 6.013490725126475, + "grad_norm": 21.56330468855678, + "kl": 0.6611328125, + "learning_rate": 4.988738738738738e-07, + "loss": 0.0006, + "reward": 3.5298094749450684, + "reward_std": 0.03876837342977524, + "rewards/final_reward": 1.9804720364517627, + "rewards/mask_iou_reward": 0.9902360182258814, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5298094153404236, + "rewards/thk_ans_format_reward": 1.0, + "step": 1780, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.70833587646484, + "epoch": 6.016863406408095, + "grad_norm": 14.309020021037526, + "kl": 0.458984375, + "learning_rate": 4.985923423423423e-07, + "loss": 0.0005, + "reward": 3.177926540374756, + "reward_std": 0.0898869875818491, + "rewards/final_reward": 1.3632918010967072, + "rewards/mask_iou_reward": 0.6816459005483536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1779264211654663, + "rewards/thk_ans_format_reward": 1.0, + "step": 1781, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.00000762939453, + "epoch": 6.020236087689713, + "grad_norm": 72.04324946469048, + "kl": 0.423828125, + "learning_rate": 4.983108108108107e-07, + "loss": 0.0004, + "reward": 3.5047526359558105, + "reward_std": 0.05716628208756447, + "rewards/final_reward": 1.6035955601823277, + "rewards/mask_iou_reward": 0.8017977800911639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5047527551651, + "rewards/thk_ans_format_reward": 1.0, + "step": 1782, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.44791793823242, + "epoch": 6.023608768971332, + "grad_norm": 10.273570408753594, + "kl": 0.435546875, + "learning_rate": 4.980292792792792e-07, + "loss": 0.0004, + "reward": 3.215430498123169, + "reward_std": 0.05078008770942688, + "rewards/final_reward": 1.5356546696469242, + "rewards/mask_iou_reward": 0.7678273348234621, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2154302597045898, + "rewards/thk_ans_format_reward": 1.0, + "step": 1783, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.55208587646484, + "epoch": 6.0269814502529515, + "grad_norm": 13.941282005436875, + "kl": 0.4521484375, + "learning_rate": 4.977477477477478e-07, + "loss": 0.0005, + "reward": 3.353714108467102, + "reward_std": 0.13728094846010208, + "rewards/final_reward": 1.1914722189040807, + "rewards/mask_iou_reward": 0.5957361094520404, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3537142276763916, + "rewards/thk_ans_format_reward": 1.0, + "step": 1784, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.3541717529297, + "epoch": 6.03035413153457, + "grad_norm": 13.967535257238211, + "kl": 0.4609375, + "learning_rate": 4.974662162162162e-07, + "loss": 0.0005, + "reward": 3.2387622594833374, + "reward_std": 0.1281859129667282, + "rewards/final_reward": 1.4602011399614638, + "rewards/mask_iou_reward": 0.7301005699807319, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2387624382972717, + "rewards/thk_ans_format_reward": 1.0, + "step": 1785, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.31250381469727, + "epoch": 6.033726812816189, + "grad_norm": 16.134563017526045, + "kl": 0.4423828125, + "learning_rate": 4.971846846846847e-07, + "loss": 0.0005, + "reward": 3.3565257787704468, + "reward_std": 0.11713682115077972, + "rewards/final_reward": 1.6746956553420935, + "rewards/mask_iou_reward": 0.8373478276710468, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3565258383750916, + "rewards/thk_ans_format_reward": 1.0, + "step": 1786, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.43750762939453, + "epoch": 6.0370994940978076, + "grad_norm": 13.881967660391403, + "kl": 0.4853515625, + "learning_rate": 4.969031531531532e-07, + "loss": 0.0005, + "reward": 3.572825312614441, + "reward_std": 0.13529992662370205, + "rewards/final_reward": 1.8447336471667763, + "rewards/mask_iou_reward": 0.9223668235833882, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.572825312614441, + "rewards/thk_ans_format_reward": 1.0, + "step": 1787, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.33333587646484, + "epoch": 6.040472175379427, + "grad_norm": 11.643033696015314, + "kl": 0.404296875, + "learning_rate": 4.966216216216216e-07, + "loss": 0.0004, + "reward": 3.598806142807007, + "reward_std": 0.2787330001592636, + "rewards/final_reward": 1.6966626586449314, + "rewards/mask_iou_reward": 0.8483313293224657, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6196394562721252, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1788, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.3854217529297, + "epoch": 6.043844856661045, + "grad_norm": 12.237340492732953, + "kl": 0.494140625, + "learning_rate": 4.963400900900901e-07, + "loss": 0.0005, + "reward": 3.351323366165161, + "reward_std": 0.04036957677453756, + "rewards/final_reward": 1.2045851596437673, + "rewards/mask_iou_reward": 0.6022925798218837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3513233065605164, + "rewards/thk_ans_format_reward": 1.0, + "step": 1789, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.64583587646484, + "epoch": 6.0472175379426645, + "grad_norm": 5.86025306631662, + "kl": 0.5556640625, + "learning_rate": 4.960585585585585e-07, + "loss": 0.0006, + "reward": 2.9825971126556396, + "reward_std": 0.12656350433826447, + "rewards/final_reward": 0.9906582387276571, + "rewards/mask_iou_reward": 0.49532911936382856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9825969934463501, + "rewards/thk_ans_format_reward": 1.0, + "step": 1790, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.69791793823242, + "epoch": 6.050590219224283, + "grad_norm": 8.881479831274108, + "kl": 0.4951171875, + "learning_rate": 4.95777027027027e-07, + "loss": 0.0005, + "reward": 3.5861589908599854, + "reward_std": 0.036995792761445045, + "rewards/final_reward": 1.681130699757016, + "rewards/mask_iou_reward": 0.840565349878508, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5861589312553406, + "rewards/thk_ans_format_reward": 1.0, + "step": 1791, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.77083587646484, + "epoch": 6.053962900505902, + "grad_norm": 15.07407534685162, + "kl": 0.4453125, + "learning_rate": 4.954954954954955e-07, + "loss": 0.0004, + "reward": 3.5831758975982666, + "reward_std": 0.1213915403932333, + "rewards/final_reward": 1.7150435711100571, + "rewards/mask_iou_reward": 0.8575217855550286, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5831758975982666, + "rewards/thk_ans_format_reward": 1.0, + "step": 1792, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3854217529297, + "epoch": 6.057335581787521, + "grad_norm": 7.6668102116418355, + "kl": 0.4091796875, + "learning_rate": 4.952139639639639e-07, + "loss": 0.0004, + "reward": 3.604040026664734, + "reward_std": 0.054723722860217094, + "rewards/final_reward": 1.5824915907787216, + "rewards/mask_iou_reward": 0.7912457953893608, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6040397882461548, + "rewards/thk_ans_format_reward": 1.0, + "step": 1793, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.9375, + "epoch": 6.06070826306914, + "grad_norm": 17.07492983564493, + "kl": 0.4384765625, + "learning_rate": 4.949324324324325e-07, + "loss": 0.0004, + "reward": 3.750004529953003, + "reward_std": 0.07825981266796589, + "rewards/final_reward": 1.9201976174507362, + "rewards/mask_iou_reward": 0.9600988087253681, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7500044107437134, + "rewards/thk_ans_format_reward": 1.0, + "step": 1794, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.10417175292969, + "epoch": 6.064080944350759, + "grad_norm": 15.468223744682641, + "kl": 0.419921875, + "learning_rate": 4.946509009009009e-07, + "loss": 0.0004, + "reward": 3.6026484966278076, + "reward_std": 0.10279983654618263, + "rewards/final_reward": 1.36152362689057, + "rewards/mask_iou_reward": 0.680761813445285, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6026484370231628, + "rewards/thk_ans_format_reward": 1.0, + "step": 1795, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.27083587646484, + "epoch": 6.0674536256323774, + "grad_norm": 18.180630844455536, + "kl": 0.5810546875, + "learning_rate": 4.943693693693693e-07, + "loss": 0.0005, + "reward": 3.5730003118515015, + "reward_std": 0.01847125869244337, + "rewards/final_reward": 1.909989832297998, + "rewards/mask_iou_reward": 0.954994916148999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5730002522468567, + "rewards/thk_ans_format_reward": 1.0, + "step": 1796, + "think_completion_length": 10.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.1354217529297, + "epoch": 6.070826306913997, + "grad_norm": 8.679508837091305, + "kl": 0.4267578125, + "learning_rate": 4.940878378378378e-07, + "loss": 0.0004, + "reward": 3.4946954250335693, + "reward_std": 0.12109193205833435, + "rewards/final_reward": 1.3505949481761967, + "rewards/mask_iou_reward": 0.6752974740880984, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4946955442428589, + "rewards/thk_ans_format_reward": 1.0, + "step": 1797, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.9166717529297, + "epoch": 6.074198988195615, + "grad_norm": 8.79628738011936, + "kl": 0.447265625, + "learning_rate": 4.938063063063062e-07, + "loss": 0.0005, + "reward": 3.360843539237976, + "reward_std": 0.038767154794186354, + "rewards/final_reward": 1.0506880923424922, + "rewards/mask_iou_reward": 0.5253440461712461, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3608436584472656, + "rewards/thk_ans_format_reward": 1.0, + "step": 1798, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.9791717529297, + "epoch": 6.077571669477234, + "grad_norm": 13.827782140220531, + "kl": 0.44921875, + "learning_rate": 4.935247747747748e-07, + "loss": 0.0005, + "reward": 3.5027761459350586, + "reward_std": 0.07696177158504725, + "rewards/final_reward": 1.4222419690109462, + "rewards/mask_iou_reward": 0.7111209845054731, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5027759671211243, + "rewards/thk_ans_format_reward": 1.0, + "step": 1799, + "think_completion_length": 7.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.20833587646484, + "epoch": 6.080944350758854, + "grad_norm": 9.89407607249062, + "kl": 0.474609375, + "learning_rate": 4.932432432432432e-07, + "loss": 0.0005, + "reward": 3.4364618062973022, + "reward_std": 0.032252633944153786, + "rewards/final_reward": 1.814366455937597, + "rewards/mask_iou_reward": 0.9071832279687985, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4364619255065918, + "rewards/thk_ans_format_reward": 1.0, + "step": 1800, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.18750762939453, + "epoch": 6.084317032040472, + "grad_norm": 12.036125325350469, + "kl": 0.46484375, + "learning_rate": 4.929617117117117e-07, + "loss": 0.0005, + "reward": 3.497703790664673, + "reward_std": 0.07510556373745203, + "rewards/final_reward": 1.375586700424734, + "rewards/mask_iou_reward": 0.687793350212367, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.497703731060028, + "rewards/thk_ans_format_reward": 1.0, + "step": 1801, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.18750381469727, + "epoch": 6.087689713322091, + "grad_norm": 9.272346546953209, + "kl": 2.04296875, + "learning_rate": 4.926801801801802e-07, + "loss": 0.002, + "reward": 3.246990919113159, + "reward_std": 0.07481374405324459, + "rewards/final_reward": 1.2698713054500153, + "rewards/mask_iou_reward": 0.6349356527250076, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2469908595085144, + "rewards/thk_ans_format_reward": 1.0, + "step": 1802, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.25000762939453, + "epoch": 6.09106239460371, + "grad_norm": 7.763042794101035, + "kl": 0.455078125, + "learning_rate": 4.923986486486486e-07, + "loss": 0.0005, + "reward": 3.592580199241638, + "reward_std": 0.1305740661919117, + "rewards/final_reward": 1.6972252048345693, + "rewards/mask_iou_reward": 0.8486126024172846, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6029969453811646, + "rewards/thk_ans_format_reward": 1.0, + "step": 1803, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.64583587646484, + "epoch": 6.094435075885329, + "grad_norm": 7.8167428206231016, + "kl": 0.4716796875, + "learning_rate": 4.921171171171171e-07, + "loss": 0.0005, + "reward": 3.5702860355377197, + "reward_std": 0.1579833161085844, + "rewards/final_reward": 1.2851122394080043, + "rewards/mask_iou_reward": 0.6425561197040022, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5702861547470093, + "rewards/thk_ans_format_reward": 1.0, + "step": 1804, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.64583587646484, + "epoch": 6.097807757166947, + "grad_norm": 8.273854917771985, + "kl": 0.650390625, + "learning_rate": 4.918355855855855e-07, + "loss": 0.0007, + "reward": 3.735241651535034, + "reward_std": 0.07285539992153645, + "rewards/final_reward": 1.7158941135165968, + "rewards/mask_iou_reward": 0.8579470567582984, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7352415323257446, + "rewards/thk_ans_format_reward": 1.0, + "step": 1805, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.5833396911621, + "epoch": 6.101180438448567, + "grad_norm": 8.050910452950912, + "kl": 0.42578125, + "learning_rate": 4.91554054054054e-07, + "loss": 0.0005, + "reward": 3.4020386934280396, + "reward_std": 0.02658071694895625, + "rewards/final_reward": 1.9605694783898593, + "rewards/mask_iou_reward": 0.9802847391949296, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4020384550094604, + "rewards/thk_ans_format_reward": 1.0, + "step": 1806, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.08333587646484, + "epoch": 6.104553119730186, + "grad_norm": 12.275695310256692, + "kl": 0.5146484375, + "learning_rate": 4.912725225225225e-07, + "loss": 0.0005, + "reward": 3.641546368598938, + "reward_std": 0.1854504942893982, + "rewards/final_reward": 1.712221254437532, + "rewards/mask_iou_reward": 0.856110627218766, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6623798608779907, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1807, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.8854217529297, + "epoch": 6.107925801011804, + "grad_norm": 9.725615667917536, + "kl": 0.419921875, + "learning_rate": 4.909909909909909e-07, + "loss": 0.0004, + "reward": 3.6741241216659546, + "reward_std": 0.04602981638163328, + "rewards/final_reward": 1.4570953259034565, + "rewards/mask_iou_reward": 0.7285476629517282, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6741241216659546, + "rewards/thk_ans_format_reward": 1.0, + "step": 1808, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6979217529297, + "epoch": 6.1112984822934235, + "grad_norm": 15.884101211242106, + "kl": 0.484375, + "learning_rate": 4.907094594594595e-07, + "loss": 0.0005, + "reward": 3.8114129304885864, + "reward_std": 0.045267632231116295, + "rewards/final_reward": 1.8431219354271338, + "rewards/mask_iou_reward": 0.9215609677135669, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8114128708839417, + "rewards/thk_ans_format_reward": 1.0, + "step": 1809, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.65625, + "epoch": 6.114671163575042, + "grad_norm": 25.81386622474354, + "kl": 0.46875, + "learning_rate": 4.90427927927928e-07, + "loss": 0.0005, + "reward": 3.6401002407073975, + "reward_std": 0.18938226997852325, + "rewards/final_reward": 1.9438146284433242, + "rewards/mask_iou_reward": 0.9719073142216621, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.650516927242279, + "rewards/thk_ans_format_reward": 1.0, + "step": 1810, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.35416793823242, + "epoch": 6.118043844856661, + "grad_norm": 8.497806317951227, + "kl": 0.427734375, + "learning_rate": 4.901463963963964e-07, + "loss": 0.0005, + "reward": 3.3357125520706177, + "reward_std": 0.10076085850596428, + "rewards/final_reward": 1.4866574546484423, + "rewards/mask_iou_reward": 0.7433287273242212, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3357125520706177, + "rewards/thk_ans_format_reward": 1.0, + "step": 1811, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.42708587646484, + "epoch": 6.12141652613828, + "grad_norm": 11.40831085565751, + "kl": 0.572265625, + "learning_rate": 4.898648648648649e-07, + "loss": 0.0006, + "reward": 3.5201451778411865, + "reward_std": 0.07543889572843909, + "rewards/final_reward": 1.2276516786559781, + "rewards/mask_iou_reward": 0.6138258393279891, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.520145058631897, + "rewards/thk_ans_format_reward": 1.0, + "step": 1812, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5625, + "epoch": 6.124789207419899, + "grad_norm": 16.476859110128636, + "kl": 0.521484375, + "learning_rate": 4.895833333333333e-07, + "loss": 0.0005, + "reward": 3.0899579524993896, + "reward_std": 0.07404950819909573, + "rewards/final_reward": 1.4622259397212147, + "rewards/mask_iou_reward": 0.7311129698606074, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0899578332901, + "rewards/thk_ans_format_reward": 1.0, + "step": 1813, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.4791717529297, + "epoch": 6.128161888701518, + "grad_norm": 7.32903798149775, + "kl": 0.486328125, + "learning_rate": 4.893018018018018e-07, + "loss": 0.0005, + "reward": 3.2776451110839844, + "reward_std": 0.04616658762097359, + "rewards/final_reward": 1.4818470056431212, + "rewards/mask_iou_reward": 0.7409235028215606, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.277645230293274, + "rewards/thk_ans_format_reward": 1.0, + "step": 1814, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.8229217529297, + "epoch": 6.1315345699831365, + "grad_norm": 10.56676495216597, + "kl": 0.4208984375, + "learning_rate": 4.890202702702703e-07, + "loss": 0.0005, + "reward": 3.4688678979873657, + "reward_std": 0.07710606977343559, + "rewards/final_reward": 1.9705928437719198, + "rewards/mask_iou_reward": 0.9852964218859599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4688677787780762, + "rewards/thk_ans_format_reward": 1.0, + "step": 1815, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.61458587646484, + "epoch": 6.134907251264756, + "grad_norm": 6.168549350134606, + "kl": 0.4765625, + "learning_rate": 4.887387387387387e-07, + "loss": 0.0005, + "reward": 3.066379189491272, + "reward_std": 0.07491825148463249, + "rewards/final_reward": 1.4189378907902062, + "rewards/mask_iou_reward": 0.7094689453951031, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.066379189491272, + "rewards/thk_ans_format_reward": 1.0, + "step": 1816, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.13541793823242, + "epoch": 6.138279932546374, + "grad_norm": 46.181273262335935, + "kl": 0.419921875, + "learning_rate": 4.884572072072072e-07, + "loss": 0.0004, + "reward": 3.6587594747543335, + "reward_std": 0.1114624422043562, + "rewards/final_reward": 1.6354522765770105, + "rewards/mask_iou_reward": 0.8177261382885053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6587591767311096, + "rewards/thk_ans_format_reward": 1.0, + "step": 1817, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.40625381469727, + "epoch": 6.141652613827993, + "grad_norm": 9.338732537826253, + "kl": 0.54296875, + "learning_rate": 4.881756756756756e-07, + "loss": 0.0006, + "reward": 3.492391347885132, + "reward_std": 0.058474089950323105, + "rewards/final_reward": 1.5647377878528934, + "rewards/mask_iou_reward": 0.7823688939264467, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4923916459083557, + "rewards/thk_ans_format_reward": 1.0, + "step": 1818, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.8541717529297, + "epoch": 6.145025295109612, + "grad_norm": 14.939731342568743, + "kl": 0.43359375, + "learning_rate": 4.878941441441441e-07, + "loss": 0.0004, + "reward": 3.352776527404785, + "reward_std": 0.0804421491920948, + "rewards/final_reward": 1.2421818179755024, + "rewards/mask_iou_reward": 0.6210909089877512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.35277658700943, + "rewards/thk_ans_format_reward": 1.0, + "step": 1819, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.4166717529297, + "epoch": 6.148397976391231, + "grad_norm": 7.270457639937909, + "kl": 0.5, + "learning_rate": 4.876126126126126e-07, + "loss": 0.0005, + "reward": 3.645534873008728, + "reward_std": 0.05726535618305206, + "rewards/final_reward": 1.2087856690062433, + "rewards/mask_iou_reward": 0.6043928345031216, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6455351114273071, + "rewards/thk_ans_format_reward": 1.0, + "step": 1820, + "think_completion_length": 10.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.6041717529297, + "epoch": 6.15177065767285, + "grad_norm": 21.78725379694707, + "kl": 0.6484375, + "learning_rate": 4.87331081081081e-07, + "loss": 0.0006, + "reward": 3.240267753601074, + "reward_std": 0.09761350601911545, + "rewards/final_reward": 0.3889431804937442, + "rewards/mask_iou_reward": 0.1944715902468721, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2402676343917847, + "rewards/thk_ans_format_reward": 1.0, + "step": 1821, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.70834350585938, + "epoch": 6.155143338954469, + "grad_norm": 9.80529265640279, + "kl": 0.474609375, + "learning_rate": 4.870495495495495e-07, + "loss": 0.0005, + "reward": 3.245134949684143, + "reward_std": 0.11889784410595894, + "rewards/final_reward": 1.0686943146111323, + "rewards/mask_iou_reward": 0.5343471573055661, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2451348900794983, + "rewards/thk_ans_format_reward": 1.0, + "step": 1822, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.01042175292969, + "epoch": 6.158516020236088, + "grad_norm": 20.528644385672013, + "kl": 0.759765625, + "learning_rate": 4.867680180180179e-07, + "loss": 0.0008, + "reward": 3.2267754077911377, + "reward_std": 0.03484675846993923, + "rewards/final_reward": 1.4529543439692785, + "rewards/mask_iou_reward": 0.7264771719846392, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2267754077911377, + "rewards/thk_ans_format_reward": 1.0, + "step": 1823, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.7083396911621, + "epoch": 6.161888701517706, + "grad_norm": 6.527277155267125, + "kl": 0.5185546875, + "learning_rate": 4.864864864864865e-07, + "loss": 0.0005, + "reward": 3.5386557579040527, + "reward_std": 0.15575581789016724, + "rewards/final_reward": 1.7940223206754031, + "rewards/mask_iou_reward": 0.8970111603377016, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5386556386947632, + "rewards/thk_ans_format_reward": 1.0, + "step": 1824, + "think_completion_length": 10.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.30208587646484, + "epoch": 6.165261382799326, + "grad_norm": 7.014600074912451, + "kl": 0.4580078125, + "learning_rate": 4.86204954954955e-07, + "loss": 0.0005, + "reward": 3.5652071237564087, + "reward_std": 0.03770854417234659, + "rewards/final_reward": 1.817075835921095, + "rewards/mask_iou_reward": 0.9085379179605475, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5652071237564087, + "rewards/thk_ans_format_reward": 1.0, + "step": 1825, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.20834350585938, + "epoch": 6.168634064080944, + "grad_norm": 16.387909639859263, + "kl": 3.076171875, + "learning_rate": 4.859234234234234e-07, + "loss": 0.0031, + "reward": 3.3169403076171875, + "reward_std": 0.06754343025386333, + "rewards/final_reward": 1.1096263378606444, + "rewards/mask_iou_reward": 0.5548131689303222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3169403672218323, + "rewards/thk_ans_format_reward": 1.0, + "step": 1826, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.46875762939453, + "epoch": 6.172006745362563, + "grad_norm": 8.58167030233964, + "kl": 0.4384765625, + "learning_rate": 4.856418918918919e-07, + "loss": 0.0004, + "reward": 3.648140788078308, + "reward_std": 0.08551261574029922, + "rewards/final_reward": 1.7414726240826537, + "rewards/mask_iou_reward": 0.8707363120413268, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6481409072875977, + "rewards/thk_ans_format_reward": 1.0, + "step": 1827, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.98959350585938, + "epoch": 6.175379426644182, + "grad_norm": 13.283475018178068, + "kl": 0.4248046875, + "learning_rate": 4.853603603603604e-07, + "loss": 0.0004, + "reward": 3.440650701522827, + "reward_std": 0.2208097279071808, + "rewards/final_reward": 1.6704628419619407, + "rewards/mask_iou_reward": 0.8352314209809704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4406505823135376, + "rewards/thk_ans_format_reward": 1.0, + "step": 1828, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.17708587646484, + "epoch": 6.178752107925801, + "grad_norm": 24.21518870673125, + "kl": 0.7392578125, + "learning_rate": 4.850788288288288e-07, + "loss": 0.0007, + "reward": 3.4418188333511353, + "reward_std": 0.035075574181973934, + "rewards/final_reward": 1.6447764419543716, + "rewards/mask_iou_reward": 0.8223882209771858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4418187141418457, + "rewards/thk_ans_format_reward": 1.0, + "step": 1829, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.52084350585938, + "epoch": 6.18212478920742, + "grad_norm": 12.568516663657245, + "kl": 0.4658203125, + "learning_rate": 4.847972972972973e-07, + "loss": 0.0005, + "reward": 3.697560429573059, + "reward_std": 0.05124947056174278, + "rewards/final_reward": 1.4155713679870419, + "rewards/mask_iou_reward": 0.7077856839935209, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.697560429573059, + "rewards/thk_ans_format_reward": 1.0, + "step": 1830, + "think_completion_length": 6.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.73958587646484, + "epoch": 6.185497470489039, + "grad_norm": 6.373846714068581, + "kl": 0.455078125, + "learning_rate": 4.845157657657657e-07, + "loss": 0.0005, + "reward": 3.2864041328430176, + "reward_std": 0.056804947555065155, + "rewards/final_reward": 1.3386153594496895, + "rewards/mask_iou_reward": 0.6693076797248447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.286404013633728, + "rewards/thk_ans_format_reward": 1.0, + "step": 1831, + "think_completion_length": 7.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.71875381469727, + "epoch": 6.188870151770658, + "grad_norm": 11.950140564771095, + "kl": 0.48828125, + "learning_rate": 4.842342342342342e-07, + "loss": 0.0005, + "reward": 3.6336305141448975, + "reward_std": 0.054893579334020615, + "rewards/final_reward": 1.8206187602156358, + "rewards/mask_iou_reward": 0.9103093801078179, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6336302757263184, + "rewards/thk_ans_format_reward": 1.0, + "step": 1832, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.30209350585938, + "epoch": 6.192242833052276, + "grad_norm": 8.144789812330119, + "kl": 0.408203125, + "learning_rate": 4.839527027027027e-07, + "loss": 0.0004, + "reward": 3.1093939542770386, + "reward_std": 0.2279842160642147, + "rewards/final_reward": 0.9082491008078164, + "rewards/mask_iou_reward": 0.4541245504039082, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.130227416753769, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1833, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5416717529297, + "epoch": 6.195615514333896, + "grad_norm": 13.15715666098289, + "kl": 0.421875, + "learning_rate": 4.836711711711711e-07, + "loss": 0.0005, + "reward": 3.1555248498916626, + "reward_std": 0.1126946210861206, + "rewards/final_reward": 1.6677855768724603, + "rewards/mask_iou_reward": 0.8338927884362302, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1555247902870178, + "rewards/thk_ans_format_reward": 1.0, + "step": 1834, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.73958587646484, + "epoch": 6.198988195615514, + "grad_norm": 10.09920999100286, + "kl": 0.4697265625, + "learning_rate": 4.833896396396397e-07, + "loss": 0.0005, + "reward": 3.560391664505005, + "reward_std": 0.036461517214775085, + "rewards/final_reward": 1.5740383488315124, + "rewards/mask_iou_reward": 0.7870191744157562, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.56039160490036, + "rewards/thk_ans_format_reward": 1.0, + "step": 1835, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.70833587646484, + "epoch": 6.202360876897133, + "grad_norm": 21.17541065383814, + "kl": 0.4384765625, + "learning_rate": 4.83108108108108e-07, + "loss": 0.0004, + "reward": 3.2930556535720825, + "reward_std": 0.1490391530096531, + "rewards/final_reward": 0.8221827094035479, + "rewards/mask_iou_reward": 0.41109135470177394, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2930553555488586, + "rewards/thk_ans_format_reward": 1.0, + "step": 1836, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.50000762939453, + "epoch": 6.2057335581787525, + "grad_norm": 39.0265227970619, + "kl": 0.5498046875, + "learning_rate": 4.828265765765765e-07, + "loss": 0.0006, + "reward": 3.4005260467529297, + "reward_std": 0.04964868910610676, + "rewards/final_reward": 1.012190933699076, + "rewards/mask_iou_reward": 0.506095466849538, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4005258679389954, + "rewards/thk_ans_format_reward": 1.0, + "step": 1837, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.2916717529297, + "epoch": 6.209106239460371, + "grad_norm": 10.174527346336742, + "kl": 0.431640625, + "learning_rate": 4.82545045045045e-07, + "loss": 0.0004, + "reward": 3.533667206764221, + "reward_std": 0.07525857351720333, + "rewards/final_reward": 1.5471510406717721, + "rewards/mask_iou_reward": 0.7735755203358861, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5336670279502869, + "rewards/thk_ans_format_reward": 1.0, + "step": 1838, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.64583587646484, + "epoch": 6.21247892074199, + "grad_norm": 16.030419351543703, + "kl": 0.4560546875, + "learning_rate": 4.822635135135134e-07, + "loss": 0.0004, + "reward": 3.734760046005249, + "reward_std": 0.07690603472292423, + "rewards/final_reward": 1.6794847684644445, + "rewards/mask_iou_reward": 0.8397423842322222, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7347601056098938, + "rewards/thk_ans_format_reward": 1.0, + "step": 1839, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.33333587646484, + "epoch": 6.2158516020236085, + "grad_norm": 129.07331392254122, + "kl": 0.3955078125, + "learning_rate": 4.81981981981982e-07, + "loss": 0.0004, + "reward": 3.5038132667541504, + "reward_std": 0.10739928111433983, + "rewards/final_reward": 1.4776798330855296, + "rewards/mask_iou_reward": 0.7388399165427648, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5038131475448608, + "rewards/thk_ans_format_reward": 1.0, + "step": 1840, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.47917175292969, + "epoch": 6.219224283305228, + "grad_norm": 14.35637587106052, + "kl": 0.98828125, + "learning_rate": 4.817004504504505e-07, + "loss": 0.001, + "reward": 3.8405778408050537, + "reward_std": 0.015699880197644234, + "rewards/final_reward": 1.9513990856142902, + "rewards/mask_iou_reward": 0.9756995428071451, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.840577781200409, + "rewards/thk_ans_format_reward": 1.0, + "step": 1841, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.67709350585938, + "epoch": 6.222596964586846, + "grad_norm": 13.455911390668044, + "kl": 0.4951171875, + "learning_rate": 4.814189189189189e-07, + "loss": 0.0005, + "reward": 3.352190852165222, + "reward_std": 0.049657109659165144, + "rewards/final_reward": 0.9730023664458212, + "rewards/mask_iou_reward": 0.4865011832229106, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.352190911769867, + "rewards/thk_ans_format_reward": 1.0, + "step": 1842, + "think_completion_length": 7.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.52084350585938, + "epoch": 6.2259696458684655, + "grad_norm": 15.863423153445565, + "kl": 0.5087890625, + "learning_rate": 4.811373873873874e-07, + "loss": 0.0005, + "reward": 3.609791874885559, + "reward_std": 0.03911227732896805, + "rewards/final_reward": 1.574771011400429, + "rewards/mask_iou_reward": 0.7873855057002145, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6097919940948486, + "rewards/thk_ans_format_reward": 1.0, + "step": 1843, + "think_completion_length": 7.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.18750762939453, + "epoch": 6.229342327150085, + "grad_norm": 10.155964433901605, + "kl": 0.37890625, + "learning_rate": 4.808558558558558e-07, + "loss": 0.0004, + "reward": 3.828355550765991, + "reward_std": 0.015464604832231998, + "rewards/final_reward": 1.914407570926103, + "rewards/mask_iou_reward": 0.9572037854630515, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8283554911613464, + "rewards/thk_ans_format_reward": 1.0, + "step": 1844, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.11458587646484, + "epoch": 6.232715008431703, + "grad_norm": 19.461364205169517, + "kl": 0.52734375, + "learning_rate": 4.805743243243243e-07, + "loss": 0.0005, + "reward": 3.383014440536499, + "reward_std": 0.16253596171736717, + "rewards/final_reward": 1.2477217644360832, + "rewards/mask_iou_reward": 0.6238608822180416, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3830143809318542, + "rewards/thk_ans_format_reward": 1.0, + "step": 1845, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.17708587646484, + "epoch": 6.236087689713322, + "grad_norm": 11.087827878884447, + "kl": 0.41796875, + "learning_rate": 4.802927927927928e-07, + "loss": 0.0004, + "reward": 3.2912451028823853, + "reward_std": 0.1132066361606121, + "rewards/final_reward": 1.7735398501636395, + "rewards/mask_iou_reward": 0.8867699250818197, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2912451028823853, + "rewards/thk_ans_format_reward": 1.0, + "step": 1846, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.15625, + "epoch": 6.239460370994941, + "grad_norm": 17.893362894514222, + "kl": 0.7080078125, + "learning_rate": 4.800112612612612e-07, + "loss": 0.0008, + "reward": 3.4652419090270996, + "reward_std": 0.046267539262771606, + "rewards/final_reward": 1.3597863364412965, + "rewards/mask_iou_reward": 0.6798931682206483, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4652420282363892, + "rewards/thk_ans_format_reward": 1.0, + "step": 1847, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.625, + "epoch": 6.24283305227656, + "grad_norm": 9.73142051628865, + "kl": 0.40234375, + "learning_rate": 4.797297297297297e-07, + "loss": 0.0005, + "reward": 3.4128910303115845, + "reward_std": 0.07327094860374928, + "rewards/final_reward": 1.433672808060011, + "rewards/mask_iou_reward": 0.7168364040300055, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4128910303115845, + "rewards/thk_ans_format_reward": 1.0, + "step": 1848, + "think_completion_length": 6.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.6875, + "epoch": 6.246205733558178, + "grad_norm": 13.676519283489288, + "kl": 0.427734375, + "learning_rate": 4.794481981981981e-07, + "loss": 0.0005, + "reward": 3.3033007383346558, + "reward_std": 0.09086661785840988, + "rewards/final_reward": 1.643985369919946, + "rewards/mask_iou_reward": 0.821992684959973, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3033005595207214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1849, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.39583587646484, + "epoch": 6.249578414839798, + "grad_norm": 8.076127289231117, + "kl": 0.4140625, + "learning_rate": 4.791666666666667e-07, + "loss": 0.0004, + "reward": 3.0996118783950806, + "reward_std": 0.16868788562715054, + "rewards/final_reward": 1.5549581997171695, + "rewards/mask_iou_reward": 0.7774790998585848, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1204451322555542, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1850, + "think_completion_length": 6.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.72916793823242, + "epoch": 6.252951096121416, + "grad_norm": 31.480636549002742, + "kl": 0.4677734375, + "learning_rate": 4.788851351351352e-07, + "loss": 0.0005, + "reward": 3.263098955154419, + "reward_std": 0.21037384122610092, + "rewards/final_reward": 1.3115423029288782, + "rewards/mask_iou_reward": 0.6557711514644391, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2630988359451294, + "rewards/thk_ans_format_reward": 1.0, + "step": 1851, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.20833587646484, + "epoch": 6.256323777403035, + "grad_norm": 6.937391694748509, + "kl": 0.501953125, + "learning_rate": 4.786036036036036e-07, + "loss": 0.0005, + "reward": 3.13068687915802, + "reward_std": 0.20532716810703278, + "rewards/final_reward": 0.978316601376308, + "rewards/mask_iou_reward": 0.489158300688154, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.1515201926231384, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1852, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.12500762939453, + "epoch": 6.259696458684655, + "grad_norm": 9.857491201068333, + "kl": 0.3994140625, + "learning_rate": 4.783220720720721e-07, + "loss": 0.0004, + "reward": 3.5886178016662598, + "reward_std": 0.07222697883844376, + "rewards/final_reward": 1.4276639751242879, + "rewards/mask_iou_reward": 0.7138319875621439, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5886176824569702, + "rewards/thk_ans_format_reward": 1.0, + "step": 1853, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.30208587646484, + "epoch": 6.263069139966273, + "grad_norm": 19.50421363346234, + "kl": 0.400390625, + "learning_rate": 4.780405405405405e-07, + "loss": 0.0004, + "reward": 3.641382932662964, + "reward_std": 0.0448097325861454, + "rewards/final_reward": 1.6495557187238357, + "rewards/mask_iou_reward": 0.8247778593619178, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6413828134536743, + "rewards/thk_ans_format_reward": 1.0, + "step": 1854, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.2291717529297, + "epoch": 6.266441821247892, + "grad_norm": 7.351446937843033, + "kl": 0.580078125, + "learning_rate": 4.77759009009009e-07, + "loss": 0.0006, + "reward": 3.594027280807495, + "reward_std": 0.26565699838101864, + "rewards/final_reward": 1.9030951882988434, + "rewards/mask_iou_reward": 0.9515475941494217, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.6356940865516663, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 1855, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.2916717529297, + "epoch": 6.269814502529511, + "grad_norm": 22.137513934687362, + "kl": 0.478515625, + "learning_rate": 4.774774774774775e-07, + "loss": 0.0005, + "reward": 3.6100287437438965, + "reward_std": 0.025507054291665554, + "rewards/final_reward": 1.6572004694700335, + "rewards/mask_iou_reward": 0.8286002347350168, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6100287437438965, + "rewards/thk_ans_format_reward": 1.0, + "step": 1856, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.375, + "epoch": 6.27318718381113, + "grad_norm": 10.17366204243886, + "kl": 0.4677734375, + "learning_rate": 4.771959459459459e-07, + "loss": 0.0005, + "reward": 3.527943730354309, + "reward_std": 0.25231462717056274, + "rewards/final_reward": 1.6537278172949341, + "rewards/mask_iou_reward": 0.8268639086474671, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5279436111450195, + "rewards/thk_ans_format_reward": 1.0, + "step": 1857, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.6666717529297, + "epoch": 6.276559865092748, + "grad_norm": 5.4924519172347885, + "kl": 0.4638671875, + "learning_rate": 4.769144144144144e-07, + "loss": 0.0005, + "reward": 3.417281150817871, + "reward_std": 0.08728579431772232, + "rewards/final_reward": 1.5202317602031523, + "rewards/mask_iou_reward": 0.7601158801015762, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4172809720039368, + "rewards/thk_ans_format_reward": 1.0, + "step": 1858, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.9479217529297, + "epoch": 6.279932546374368, + "grad_norm": 6.81588308994651, + "kl": 0.412109375, + "learning_rate": 4.7663288288288285e-07, + "loss": 0.0004, + "reward": 3.592257022857666, + "reward_std": 0.049894423224031925, + "rewards/final_reward": 1.3763441086826136, + "rewards/mask_iou_reward": 0.6881720543413068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5922568440437317, + "rewards/thk_ans_format_reward": 1.0, + "step": 1859, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.55209350585938, + "epoch": 6.283305227655987, + "grad_norm": 6.815516689412823, + "kl": 0.484375, + "learning_rate": 4.7635135135135136e-07, + "loss": 0.0005, + "reward": 3.6291333436965942, + "reward_std": 0.11306917294859886, + "rewards/final_reward": 1.5164182235209045, + "rewards/mask_iou_reward": 0.7582091117604522, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6291332244873047, + "rewards/thk_ans_format_reward": 1.0, + "step": 1860, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.21875762939453, + "epoch": 6.286677908937605, + "grad_norm": 11.211526253372272, + "kl": 0.357421875, + "learning_rate": 4.760698198198198e-07, + "loss": 0.0004, + "reward": 3.6165082454681396, + "reward_std": 0.12074577808380127, + "rewards/final_reward": 1.597264458972782, + "rewards/mask_iou_reward": 0.798632229486391, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.61650812625885, + "rewards/thk_ans_format_reward": 1.0, + "step": 1861, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.18750762939453, + "epoch": 6.2900505902192245, + "grad_norm": 17.466570040418652, + "kl": 0.4365234375, + "learning_rate": 4.757882882882883e-07, + "loss": 0.0004, + "reward": 3.407384157180786, + "reward_std": 0.06713058799505234, + "rewards/final_reward": 1.872860084905434, + "rewards/mask_iou_reward": 0.936430042452717, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.407383918762207, + "rewards/thk_ans_format_reward": 1.0, + "step": 1862, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.6979217529297, + "epoch": 6.293423271500843, + "grad_norm": 10.368101080790261, + "kl": 0.4150390625, + "learning_rate": 4.7550675675675674e-07, + "loss": 0.0004, + "reward": 3.6531848907470703, + "reward_std": 0.1504236189648509, + "rewards/final_reward": 1.3807319749969587, + "rewards/mask_iou_reward": 0.6903659874984793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.653184711933136, + "rewards/thk_ans_format_reward": 1.0, + "step": 1863, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5416717529297, + "epoch": 6.296795952782462, + "grad_norm": 12.365053252610293, + "kl": 0.4267578125, + "learning_rate": 4.752252252252252e-07, + "loss": 0.0006, + "reward": 3.647891640663147, + "reward_std": 0.07979295030236244, + "rewards/final_reward": 1.739375300114676, + "rewards/mask_iou_reward": 0.869687650057338, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.647891640663147, + "rewards/thk_ans_format_reward": 1.0, + "step": 1864, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11458587646484, + "epoch": 6.300168634064081, + "grad_norm": 12.789096356831873, + "kl": 0.681640625, + "learning_rate": 4.749436936936937e-07, + "loss": 0.0007, + "reward": 3.50308358669281, + "reward_std": 0.039963416289538145, + "rewards/final_reward": 1.1487614871984597, + "rewards/mask_iou_reward": 0.5743807435992299, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5030834078788757, + "rewards/thk_ans_format_reward": 1.0, + "step": 1865, + "think_completion_length": 6.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.52083587646484, + "epoch": 6.3035413153457, + "grad_norm": 6.530863942545822, + "kl": 0.4521484375, + "learning_rate": 4.746621621621621e-07, + "loss": 0.0005, + "reward": 3.501235008239746, + "reward_std": 0.15554272197186947, + "rewards/final_reward": 1.7192413322312214, + "rewards/mask_iou_reward": 0.8596206661156107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5012351870536804, + "rewards/thk_ans_format_reward": 1.0, + "step": 1866, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1354217529297, + "epoch": 6.306913996627319, + "grad_norm": 13.245851374127852, + "kl": 0.3955078125, + "learning_rate": 4.743806306306306e-07, + "loss": 0.0004, + "reward": 3.6863157749176025, + "reward_std": 0.07255137898027897, + "rewards/final_reward": 1.7820136042567227, + "rewards/mask_iou_reward": 0.8910068021283614, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6863157153129578, + "rewards/thk_ans_format_reward": 1.0, + "step": 1867, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.3854217529297, + "epoch": 6.3102866779089375, + "grad_norm": 17.41876990858359, + "kl": 0.47265625, + "learning_rate": 4.7409909909909905e-07, + "loss": 0.0005, + "reward": 3.3488051891326904, + "reward_std": 0.04395863972604275, + "rewards/final_reward": 1.1569589159476008, + "rewards/mask_iou_reward": 0.5784794579738004, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3488048911094666, + "rewards/thk_ans_format_reward": 1.0, + "step": 1868, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.2916717529297, + "epoch": 6.313659359190557, + "grad_norm": 7.877274635062242, + "kl": 0.408203125, + "learning_rate": 4.738175675675675e-07, + "loss": 0.0005, + "reward": 3.414574384689331, + "reward_std": 0.03488452360033989, + "rewards/final_reward": 1.345949744631015, + "rewards/mask_iou_reward": 0.6729748723155075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4145742654800415, + "rewards/thk_ans_format_reward": 1.0, + "step": 1869, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0520896911621, + "epoch": 6.317032040472175, + "grad_norm": 20.015154532440828, + "kl": 0.640625, + "learning_rate": 4.73536036036036e-07, + "loss": 0.0006, + "reward": 3.530336856842041, + "reward_std": 0.08547847159206867, + "rewards/final_reward": 1.0988999298593032, + "rewards/mask_iou_reward": 0.5494499649296516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5303367376327515, + "rewards/thk_ans_format_reward": 1.0, + "step": 1870, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.70833587646484, + "epoch": 6.320404721753794, + "grad_norm": 44.35928207835712, + "kl": 0.4873046875, + "learning_rate": 4.732545045045045e-07, + "loss": 0.0005, + "reward": 3.468393325805664, + "reward_std": 0.05352478846907616, + "rewards/final_reward": 1.684244809139015, + "rewards/mask_iou_reward": 0.8421224045695075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4683933854103088, + "rewards/thk_ans_format_reward": 1.0, + "step": 1871, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.8229217529297, + "epoch": 6.323777403035413, + "grad_norm": 25.40471040471904, + "kl": 0.4287109375, + "learning_rate": 4.7297297297297294e-07, + "loss": 0.0005, + "reward": 3.4976236820220947, + "reward_std": 0.08486808463931084, + "rewards/final_reward": 1.3047796045588043, + "rewards/mask_iou_reward": 0.6523898022794021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.49762362241745, + "rewards/thk_ans_format_reward": 1.0, + "step": 1872, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.34375, + "epoch": 6.327150084317032, + "grad_norm": 6.886490971181804, + "kl": 0.712890625, + "learning_rate": 4.726914414414414e-07, + "loss": 0.0007, + "reward": 3.532763361930847, + "reward_std": 0.06538549810647964, + "rewards/final_reward": 1.7415548988047154, + "rewards/mask_iou_reward": 0.8707774494023577, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.532763421535492, + "rewards/thk_ans_format_reward": 1.0, + "step": 1873, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.2291717529297, + "epoch": 6.330522765598651, + "grad_norm": 9.355042908067066, + "kl": 0.423828125, + "learning_rate": 4.7240990990990986e-07, + "loss": 0.0004, + "reward": 3.6574703454971313, + "reward_std": 0.04328635986894369, + "rewards/final_reward": 1.9130759923228222, + "rewards/mask_iou_reward": 0.9565379961614111, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6574699878692627, + "rewards/thk_ans_format_reward": 1.0, + "step": 1874, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.4479217529297, + "epoch": 6.33389544688027, + "grad_norm": 9.49022115875892, + "kl": 0.5078125, + "learning_rate": 4.721283783783784e-07, + "loss": 0.0005, + "reward": 3.5508744716644287, + "reward_std": 0.07801926881074905, + "rewards/final_reward": 1.5653615315107303, + "rewards/mask_iou_reward": 0.7826807657553652, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.550874412059784, + "rewards/thk_ans_format_reward": 1.0, + "step": 1875, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.25000762939453, + "epoch": 6.337268128161889, + "grad_norm": 6.088105787863321, + "kl": 0.400390625, + "learning_rate": 4.7184684684684684e-07, + "loss": 0.0004, + "reward": 3.5852304697036743, + "reward_std": 0.12043560296297073, + "rewards/final_reward": 1.7741760352887033, + "rewards/mask_iou_reward": 0.8870880176443516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5852304100990295, + "rewards/thk_ans_format_reward": 1.0, + "step": 1876, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.4166717529297, + "epoch": 6.340640809443507, + "grad_norm": 5.801410379375544, + "kl": 0.3720703125, + "learning_rate": 4.715653153153153e-07, + "loss": 0.0004, + "reward": 3.80169677734375, + "reward_std": 0.035175224766135216, + "rewards/final_reward": 1.8897999246439883, + "rewards/mask_iou_reward": 0.9448999623219941, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8016967177391052, + "rewards/thk_ans_format_reward": 1.0, + "step": 1877, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.53125762939453, + "epoch": 6.344013490725127, + "grad_norm": 10.847386218011383, + "kl": 0.451171875, + "learning_rate": 4.7128378378378376e-07, + "loss": 0.0005, + "reward": 3.548642635345459, + "reward_std": 0.07939281314611435, + "rewards/final_reward": 1.7258872880340754, + "rewards/mask_iou_reward": 0.8629436440170377, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.548642635345459, + "rewards/thk_ans_format_reward": 1.0, + "step": 1878, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.63542938232422, + "epoch": 6.347386172006745, + "grad_norm": 5.9551648312298715, + "kl": 0.3994140625, + "learning_rate": 4.710022522522522e-07, + "loss": 0.0004, + "reward": 3.187057852745056, + "reward_std": 0.19463208317756653, + "rewards/final_reward": 1.2669394950542276, + "rewards/mask_iou_reward": 0.6334697475271138, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2078912556171417, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1879, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.1354217529297, + "epoch": 6.350758853288364, + "grad_norm": 12.090427610462893, + "kl": 0.4150390625, + "learning_rate": 4.7072072072072073e-07, + "loss": 0.0004, + "reward": 3.521798610687256, + "reward_std": 0.05686133913695812, + "rewards/final_reward": 1.0955721458005767, + "rewards/mask_iou_reward": 0.5477860729002884, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5217987298965454, + "rewards/thk_ans_format_reward": 1.0, + "step": 1880, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.89583587646484, + "epoch": 6.354131534569984, + "grad_norm": 6.817187114704969, + "kl": 0.4169921875, + "learning_rate": 4.704391891891892e-07, + "loss": 0.0004, + "reward": 3.7434085607528687, + "reward_std": 0.17366989701986313, + "rewards/final_reward": 1.7364503374356408, + "rewards/mask_iou_reward": 0.8682251687178204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7434085607528687, + "rewards/thk_ans_format_reward": 1.0, + "step": 1881, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.02083587646484, + "epoch": 6.357504215851602, + "grad_norm": 11.502887174757745, + "kl": 0.4736328125, + "learning_rate": 4.7015765765765766e-07, + "loss": 0.0005, + "reward": 3.6568949222564697, + "reward_std": 0.060632091015577316, + "rewards/final_reward": 1.8699796659595362, + "rewards/mask_iou_reward": 0.9349898329797681, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6568948030471802, + "rewards/thk_ans_format_reward": 1.0, + "step": 1882, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.14583587646484, + "epoch": 6.360876897133221, + "grad_norm": 9.859580321040402, + "kl": 0.4111328125, + "learning_rate": 4.698761261261261e-07, + "loss": 0.0004, + "reward": 3.5974520444869995, + "reward_std": 0.10273704305291176, + "rewards/final_reward": 1.7107119595052134, + "rewards/mask_iou_reward": 0.8553559797526067, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5974522829055786, + "rewards/thk_ans_format_reward": 1.0, + "step": 1883, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.67708587646484, + "epoch": 6.36424957841484, + "grad_norm": 9.131603356854935, + "kl": 0.404296875, + "learning_rate": 4.695945945945946e-07, + "loss": 0.0004, + "reward": 3.458780288696289, + "reward_std": 0.08173859491944313, + "rewards/final_reward": 1.5234535005155414, + "rewards/mask_iou_reward": 0.7617267502577707, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4587804675102234, + "rewards/thk_ans_format_reward": 1.0, + "step": 1884, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.98959350585938, + "epoch": 6.367622259696459, + "grad_norm": 7.846051525792356, + "kl": 0.43359375, + "learning_rate": 4.6931306306306304e-07, + "loss": 0.0004, + "reward": 3.477925419807434, + "reward_std": 0.056161317974328995, + "rewards/final_reward": 1.7337655025580978, + "rewards/mask_iou_reward": 0.8668827512790489, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4779254794120789, + "rewards/thk_ans_format_reward": 1.0, + "step": 1885, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.50000762939453, + "epoch": 6.370994940978077, + "grad_norm": 9.447334787633611, + "kl": 0.4921875, + "learning_rate": 4.690315315315315e-07, + "loss": 0.0005, + "reward": 3.6862441301345825, + "reward_std": 0.07632257603108883, + "rewards/final_reward": 1.539230532497822, + "rewards/mask_iou_reward": 0.769615266248911, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6862438917160034, + "rewards/thk_ans_format_reward": 1.0, + "step": 1886, + "think_completion_length": 10.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.45834350585938, + "epoch": 6.3743676222596966, + "grad_norm": 7.634914287544993, + "kl": 0.388671875, + "learning_rate": 4.6874999999999996e-07, + "loss": 0.0004, + "reward": 3.2322871685028076, + "reward_std": 0.3254256844520569, + "rewards/final_reward": 0.8302558515962044, + "rewards/mask_iou_reward": 0.4151279257981022, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2531203627586365, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1887, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.39584350585938, + "epoch": 6.377740303541315, + "grad_norm": 9.849923513152714, + "kl": 0.5419921875, + "learning_rate": 4.684684684684684e-07, + "loss": 0.0005, + "reward": 3.6471699476242065, + "reward_std": 0.11751040071249008, + "rewards/final_reward": 1.6158547055186272, + "rewards/mask_iou_reward": 0.8079273527593136, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.647170066833496, + "rewards/thk_ans_format_reward": 1.0, + "step": 1888, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.3541717529297, + "epoch": 6.381112984822934, + "grad_norm": 14.533013973880257, + "kl": 0.419921875, + "learning_rate": 4.681869369369369e-07, + "loss": 0.0004, + "reward": 3.430299997329712, + "reward_std": 0.10118568316102028, + "rewards/final_reward": 1.7194286437301751, + "rewards/mask_iou_reward": 0.8597143218650876, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4302998781204224, + "rewards/thk_ans_format_reward": 1.0, + "step": 1889, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.53125762939453, + "epoch": 6.3844856661045535, + "grad_norm": 8.55568300191631, + "kl": 0.55078125, + "learning_rate": 4.679054054054054e-07, + "loss": 0.0006, + "reward": 3.460633873939514, + "reward_std": 0.25002913177013397, + "rewards/final_reward": 1.8770856025200167, + "rewards/mask_iou_reward": 0.9385428012600083, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4814671277999878, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1890, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.62500762939453, + "epoch": 6.387858347386172, + "grad_norm": 11.682304101407496, + "kl": 0.4931640625, + "learning_rate": 4.6762387387387385e-07, + "loss": 0.0005, + "reward": 3.004297971725464, + "reward_std": 0.11261074617505074, + "rewards/final_reward": 0.4484286897518397, + "rewards/mask_iou_reward": 0.22421434487591985, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.004297912120819, + "rewards/thk_ans_format_reward": 1.0, + "step": 1891, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.71875762939453, + "epoch": 6.391231028667791, + "grad_norm": 7.919820661858854, + "kl": 0.470703125, + "learning_rate": 4.673423423423423e-07, + "loss": 0.0005, + "reward": 3.617723226547241, + "reward_std": 0.06559170037508011, + "rewards/final_reward": 1.4914626275770577, + "rewards/mask_iou_reward": 0.7457313137885289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.617722988128662, + "rewards/thk_ans_format_reward": 1.0, + "step": 1892, + "think_completion_length": 7.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.1979217529297, + "epoch": 6.3946037099494095, + "grad_norm": 16.955529628182386, + "kl": 0.41015625, + "learning_rate": 4.670608108108108e-07, + "loss": 0.0004, + "reward": 3.0826051235198975, + "reward_std": 0.14307872019708157, + "rewards/final_reward": 0.46960478747577494, + "rewards/mask_iou_reward": 0.23480239373788747, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.0930215120315552, + "rewards/thk_ans_format_reward": 1.0, + "step": 1893, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.46875762939453, + "epoch": 6.397976391231029, + "grad_norm": 9.479392818196867, + "kl": 0.478515625, + "learning_rate": 4.6677927927927924e-07, + "loss": 0.0005, + "reward": 3.517951011657715, + "reward_std": 0.16193577647209167, + "rewards/final_reward": 1.0184101228734423, + "rewards/mask_iou_reward": 0.5092050614367212, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5179506540298462, + "rewards/thk_ans_format_reward": 1.0, + "step": 1894, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.92708587646484, + "epoch": 6.401349072512647, + "grad_norm": 20.372484305072078, + "kl": 0.416015625, + "learning_rate": 4.6649774774774775e-07, + "loss": 0.0004, + "reward": 3.436389207839966, + "reward_std": 0.11014799401164055, + "rewards/final_reward": 1.2207106790456062, + "rewards/mask_iou_reward": 0.6103553395228031, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4363892078399658, + "rewards/thk_ans_format_reward": 1.0, + "step": 1895, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.08333587646484, + "epoch": 6.4047217537942664, + "grad_norm": 15.893878645545087, + "kl": 0.494140625, + "learning_rate": 4.662162162162162e-07, + "loss": 0.0005, + "reward": 3.601964235305786, + "reward_std": 0.16152212023735046, + "rewards/final_reward": 1.766780638428799, + "rewards/mask_iou_reward": 0.8833903192143995, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.601963996887207, + "rewards/thk_ans_format_reward": 1.0, + "step": 1896, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.2604217529297, + "epoch": 6.408094435075886, + "grad_norm": 8.736031495192677, + "kl": 0.3828125, + "learning_rate": 4.6593468468468467e-07, + "loss": 0.0004, + "reward": 3.5205795764923096, + "reward_std": 0.20784608274698257, + "rewards/final_reward": 1.4668057131209964, + "rewards/mask_iou_reward": 0.7334028565604982, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.52057945728302, + "rewards/thk_ans_format_reward": 1.0, + "step": 1897, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.18750762939453, + "epoch": 6.411467116357504, + "grad_norm": 7.594879038415985, + "kl": 0.669921875, + "learning_rate": 4.6565315315315313e-07, + "loss": 0.0007, + "reward": 3.405119299888611, + "reward_std": 0.12414194270968437, + "rewards/final_reward": 1.162574556350045, + "rewards/mask_iou_reward": 0.5812872781750225, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.40511953830719, + "rewards/thk_ans_format_reward": 1.0, + "step": 1898, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.2291717529297, + "epoch": 6.414839797639123, + "grad_norm": 29.348037382618, + "kl": 0.37890625, + "learning_rate": 4.653716216216216e-07, + "loss": 0.0003, + "reward": 3.5290740728378296, + "reward_std": 0.07987385988235474, + "rewards/final_reward": 1.2127857770770258, + "rewards/mask_iou_reward": 0.6063928885385129, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.529074251651764, + "rewards/thk_ans_format_reward": 1.0, + "step": 1899, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.46875762939453, + "epoch": 6.418212478920742, + "grad_norm": 31.070338674714865, + "kl": 0.54296875, + "learning_rate": 4.650900900900901e-07, + "loss": 0.0006, + "reward": 3.3826311826705933, + "reward_std": 0.049534888938069344, + "rewards/final_reward": 1.8532369135906288, + "rewards/mask_iou_reward": 0.9266184567953144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3826313018798828, + "rewards/thk_ans_format_reward": 1.0, + "step": 1900, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.3229217529297, + "epoch": 6.421585160202361, + "grad_norm": 9.508395337814294, + "kl": 0.40234375, + "learning_rate": 4.6480855855855857e-07, + "loss": 0.0004, + "reward": 3.3927754163742065, + "reward_std": 0.07938742637634277, + "rewards/final_reward": 1.1924416076254536, + "rewards/mask_iou_reward": 0.5962208038127268, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3927754163742065, + "rewards/thk_ans_format_reward": 1.0, + "step": 1901, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.8854217529297, + "epoch": 6.424957841483979, + "grad_norm": 11.1582006576339, + "kl": 0.4482421875, + "learning_rate": 4.6452702702702703e-07, + "loss": 0.0005, + "reward": 3.4296629428863525, + "reward_std": 0.11983692087233067, + "rewards/final_reward": 1.087414950859162, + "rewards/mask_iou_reward": 0.543707475429581, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4296630024909973, + "rewards/thk_ans_format_reward": 1.0, + "step": 1902, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.30209350585938, + "epoch": 6.428330522765599, + "grad_norm": 12.465830782713155, + "kl": 0.501953125, + "learning_rate": 4.642454954954955e-07, + "loss": 0.0005, + "reward": 3.1689430475234985, + "reward_std": 0.07689309120178223, + "rewards/final_reward": 1.5112853224431064, + "rewards/mask_iou_reward": 0.7556426612215532, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.168942928314209, + "rewards/thk_ans_format_reward": 1.0, + "step": 1903, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.65625762939453, + "epoch": 6.431703204047217, + "grad_norm": 9.971794799370729, + "kl": 0.4296875, + "learning_rate": 4.639639639639639e-07, + "loss": 0.0004, + "reward": 3.2337831258773804, + "reward_std": 0.10983862727880478, + "rewards/final_reward": 1.3039498349252785, + "rewards/mask_iou_reward": 0.6519749174626392, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.233783334493637, + "rewards/thk_ans_format_reward": 1.0, + "step": 1904, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.26041793823242, + "epoch": 6.435075885328836, + "grad_norm": 18.680989287171347, + "kl": 0.607421875, + "learning_rate": 4.636824324324324e-07, + "loss": 0.0006, + "reward": 3.4857401847839355, + "reward_std": 0.122207872569561, + "rewards/final_reward": 1.6436075325939328, + "rewards/mask_iou_reward": 0.8218037662969664, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.485740065574646, + "rewards/thk_ans_format_reward": 1.0, + "step": 1905, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.90625, + "epoch": 6.438448566610456, + "grad_norm": 24.15840867971098, + "kl": 0.4111328125, + "learning_rate": 4.6340090090090087e-07, + "loss": 0.0004, + "reward": 3.565677046775818, + "reward_std": 0.05232588015496731, + "rewards/final_reward": 1.7861149132567309, + "rewards/mask_iou_reward": 0.8930574566283654, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5656769275665283, + "rewards/thk_ans_format_reward": 1.0, + "step": 1906, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.0520896911621, + "epoch": 6.441821247892074, + "grad_norm": 9.748880341775454, + "kl": 0.392578125, + "learning_rate": 4.6311936936936933e-07, + "loss": 0.0004, + "reward": 3.5896493196487427, + "reward_std": 0.06489380449056625, + "rewards/final_reward": 1.803237968390866, + "rewards/mask_iou_reward": 0.901618984195433, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5896490812301636, + "rewards/thk_ans_format_reward": 1.0, + "step": 1907, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.21875, + "epoch": 6.445193929173693, + "grad_norm": 8.003405234992922, + "kl": 0.453125, + "learning_rate": 4.628378378378378e-07, + "loss": 0.0005, + "reward": 3.381407618522644, + "reward_std": 0.106148362159729, + "rewards/final_reward": 1.3149248627705654, + "rewards/mask_iou_reward": 0.6574624313852827, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.381407618522644, + "rewards/thk_ans_format_reward": 1.0, + "step": 1908, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.46875, + "epoch": 6.448566610455312, + "grad_norm": 7.960417542259144, + "kl": 0.4375, + "learning_rate": 4.6255630630630625e-07, + "loss": 0.0004, + "reward": 3.6770023107528687, + "reward_std": 0.11613703519105911, + "rewards/final_reward": 1.7991855951978626, + "rewards/mask_iou_reward": 0.8995927975989313, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6770024299621582, + "rewards/thk_ans_format_reward": 1.0, + "step": 1909, + "think_completion_length": 9.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.05208587646484, + "epoch": 6.451939291736931, + "grad_norm": 13.076750082591246, + "kl": 0.38671875, + "learning_rate": 4.6227477477477477e-07, + "loss": 0.0004, + "reward": 3.450620651245117, + "reward_std": 0.12947594933211803, + "rewards/final_reward": 1.7108386702374498, + "rewards/mask_iou_reward": 0.8554193351187249, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.450620412826538, + "rewards/thk_ans_format_reward": 1.0, + "step": 1910, + "think_completion_length": 7.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.59375762939453, + "epoch": 6.455311973018549, + "grad_norm": 12.562946407758929, + "kl": 0.462890625, + "learning_rate": 4.6199324324324323e-07, + "loss": 0.0005, + "reward": 3.538373827934265, + "reward_std": 0.052979251369833946, + "rewards/final_reward": 1.9293922026045054, + "rewards/mask_iou_reward": 0.9646961013022527, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5383738279342651, + "rewards/thk_ans_format_reward": 1.0, + "step": 1911, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.25000762939453, + "epoch": 6.458684654300169, + "grad_norm": 27.17672682917496, + "kl": 0.443359375, + "learning_rate": 4.617117117117117e-07, + "loss": 0.0004, + "reward": 3.4632757902145386, + "reward_std": 0.18749287351965904, + "rewards/final_reward": 1.8631885229559808, + "rewards/mask_iou_reward": 0.9315942614779904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4632756114006042, + "rewards/thk_ans_format_reward": 1.0, + "step": 1912, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.50000762939453, + "epoch": 6.462057335581788, + "grad_norm": 68.01765666204261, + "kl": 0.8388671875, + "learning_rate": 4.6143018018018015e-07, + "loss": 0.0008, + "reward": 3.3502947092056274, + "reward_std": 0.15701918303966522, + "rewards/final_reward": 1.287749956469415, + "rewards/mask_iou_reward": 0.6438749782347075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3502946496009827, + "rewards/thk_ans_format_reward": 1.0, + "step": 1913, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.93750762939453, + "epoch": 6.465430016863406, + "grad_norm": 12.465496225956764, + "kl": 0.4345703125, + "learning_rate": 4.611486486486486e-07, + "loss": 0.0004, + "reward": 3.6096237897872925, + "reward_std": 0.11629275232553482, + "rewards/final_reward": 1.6158974661798904, + "rewards/mask_iou_reward": 0.8079487330899452, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6096238493919373, + "rewards/thk_ans_format_reward": 1.0, + "step": 1914, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.1875, + "epoch": 6.4688026981450255, + "grad_norm": 8.953988368314507, + "kl": 0.41015625, + "learning_rate": 4.608671171171171e-07, + "loss": 0.0004, + "reward": 3.6579357385635376, + "reward_std": 0.08615681529045105, + "rewards/final_reward": 1.8738261382499415, + "rewards/mask_iou_reward": 0.9369130691249707, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6579357981681824, + "rewards/thk_ans_format_reward": 1.0, + "step": 1915, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.29166793823242, + "epoch": 6.472175379426644, + "grad_norm": 8.530074182730772, + "kl": 0.419921875, + "learning_rate": 4.605855855855856e-07, + "loss": 0.0004, + "reward": 3.500860333442688, + "reward_std": 0.06479554157704115, + "rewards/final_reward": 1.7721428409723283, + "rewards/mask_iou_reward": 0.8860714204861642, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5008601546287537, + "rewards/thk_ans_format_reward": 1.0, + "step": 1916, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.09375762939453, + "epoch": 6.475548060708263, + "grad_norm": 9.642455716005925, + "kl": 0.439453125, + "learning_rate": 4.6030405405405404e-07, + "loss": 0.0005, + "reward": 3.2050682306289673, + "reward_std": 0.2021598145365715, + "rewards/final_reward": 0.5997128136106451, + "rewards/mask_iou_reward": 0.29985640680532255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2050682306289673, + "rewards/thk_ans_format_reward": 1.0, + "step": 1917, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.9791717529297, + "epoch": 6.4789207419898815, + "grad_norm": 7.830276530642089, + "kl": 0.626953125, + "learning_rate": 4.600225225225225e-07, + "loss": 0.0006, + "reward": 3.629697561264038, + "reward_std": 0.036790769547224045, + "rewards/final_reward": 1.4997423734748434, + "rewards/mask_iou_reward": 0.7498711867374217, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6296973824501038, + "rewards/thk_ans_format_reward": 1.0, + "step": 1918, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.8854217529297, + "epoch": 6.482293423271501, + "grad_norm": 8.236699940973251, + "kl": 0.5322265625, + "learning_rate": 4.5974099099099097e-07, + "loss": 0.0005, + "reward": 3.3659796714782715, + "reward_std": 0.08384433016180992, + "rewards/final_reward": 1.8747773736141915, + "rewards/mask_iou_reward": 0.9373886868070958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.365979552268982, + "rewards/thk_ans_format_reward": 1.0, + "step": 1919, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.45833587646484, + "epoch": 6.48566610455312, + "grad_norm": 7.399897831772437, + "kl": 0.576171875, + "learning_rate": 4.594594594594595e-07, + "loss": 0.0006, + "reward": 3.5207263231277466, + "reward_std": 0.0641837865114212, + "rewards/final_reward": 1.9002956424366944, + "rewards/mask_iou_reward": 0.9501478212183472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5207264423370361, + "rewards/thk_ans_format_reward": 1.0, + "step": 1920, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.42708587646484, + "epoch": 6.4890387858347385, + "grad_norm": 9.934190133486409, + "kl": 0.5322265625, + "learning_rate": 4.5917792792792794e-07, + "loss": 0.0005, + "reward": 3.6050784587860107, + "reward_std": 0.05036386847496033, + "rewards/final_reward": 1.4844076510515394, + "rewards/mask_iou_reward": 0.7422038255257697, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6050782203674316, + "rewards/thk_ans_format_reward": 1.0, + "step": 1921, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.61458587646484, + "epoch": 6.492411467116358, + "grad_norm": 17.45409532746466, + "kl": 0.501953125, + "learning_rate": 4.588963963963964e-07, + "loss": 0.0006, + "reward": 3.6852781772613525, + "reward_std": 0.14414148032665253, + "rewards/final_reward": 1.8175316750528594, + "rewards/mask_iou_reward": 0.9087658375264297, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6852782368659973, + "rewards/thk_ans_format_reward": 1.0, + "step": 1922, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.67708587646484, + "epoch": 6.495784148397976, + "grad_norm": 8.365698590732766, + "kl": 0.494140625, + "learning_rate": 4.5861486486486486e-07, + "loss": 0.0005, + "reward": 3.620155453681946, + "reward_std": 0.10641103237867355, + "rewards/final_reward": 1.5902591801469743, + "rewards/mask_iou_reward": 0.7951295900734872, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6201554536819458, + "rewards/thk_ans_format_reward": 1.0, + "step": 1923, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.64583587646484, + "epoch": 6.499156829679595, + "grad_norm": 8.407844695773075, + "kl": 0.4052734375, + "learning_rate": 4.5833333333333327e-07, + "loss": 0.0004, + "reward": 3.578555703163147, + "reward_std": 0.040246653370559216, + "rewards/final_reward": 1.0063336036209627, + "rewards/mask_iou_reward": 0.5031668018104813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5785555839538574, + "rewards/thk_ans_format_reward": 1.0, + "step": 1924, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.34375762939453, + "epoch": 6.502529510961214, + "grad_norm": 26.172801271228387, + "kl": 0.421875, + "learning_rate": 4.580518018018018e-07, + "loss": 0.0004, + "reward": 3.4072368144989014, + "reward_std": 0.07461421936750412, + "rewards/final_reward": 1.2929097656984843, + "rewards/mask_iou_reward": 0.6464548828492421, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4072368144989014, + "rewards/thk_ans_format_reward": 1.0, + "step": 1925, + "think_completion_length": 6.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.73958587646484, + "epoch": 6.505902192242833, + "grad_norm": 12.119896301081399, + "kl": 0.4287109375, + "learning_rate": 4.5777027027027024e-07, + "loss": 0.0004, + "reward": 3.206121563911438, + "reward_std": 0.09432797785848379, + "rewards/final_reward": 0.8982187139512381, + "rewards/mask_iou_reward": 0.44910935697561905, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2061215043067932, + "rewards/thk_ans_format_reward": 1.0, + "step": 1926, + "think_completion_length": 10.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.21875, + "epoch": 6.509274873524452, + "grad_norm": 415.2726487286936, + "kl": 0.423828125, + "learning_rate": 4.574887387387387e-07, + "loss": 0.0004, + "reward": 3.6168497800827026, + "reward_std": 0.10688769817352295, + "rewards/final_reward": 1.6747035288263004, + "rewards/mask_iou_reward": 0.8373517644131502, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6272663474082947, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1927, + "think_completion_length": 9.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.5729217529297, + "epoch": 6.512647554806071, + "grad_norm": 19.920648554833924, + "kl": 1.8330078125, + "learning_rate": 4.5720720720720716e-07, + "loss": 0.0018, + "reward": 3.4919700622558594, + "reward_std": 0.17838171124458313, + "rewards/final_reward": 1.3544193334299761, + "rewards/mask_iou_reward": 0.6772096667149881, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.491970181465149, + "rewards/thk_ans_format_reward": 1.0, + "step": 1928, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.73959350585938, + "epoch": 6.51602023608769, + "grad_norm": 5.874339600504554, + "kl": 0.41796875, + "learning_rate": 4.569256756756756e-07, + "loss": 0.0004, + "reward": 3.5194711685180664, + "reward_std": 0.1812281534075737, + "rewards/final_reward": 1.7720225438373989, + "rewards/mask_iou_reward": 0.8860112719186994, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5194711089134216, + "rewards/thk_ans_format_reward": 1.0, + "step": 1929, + "think_completion_length": 9.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.14583587646484, + "epoch": 6.519392917369308, + "grad_norm": 71.2249777632913, + "kl": 0.5859375, + "learning_rate": 4.5664414414414414e-07, + "loss": 0.0006, + "reward": 3.513147711753845, + "reward_std": 0.0648888386785984, + "rewards/final_reward": 1.1092570722526767, + "rewards/mask_iou_reward": 0.5546285361263383, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5131479501724243, + "rewards/thk_ans_format_reward": 1.0, + "step": 1930, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.46875, + "epoch": 6.522765598650928, + "grad_norm": 10.954163481582695, + "kl": 0.3837890625, + "learning_rate": 4.563626126126126e-07, + "loss": 0.0004, + "reward": 3.397698402404785, + "reward_std": 0.0887177549302578, + "rewards/final_reward": 1.8347744826708012, + "rewards/mask_iou_reward": 0.9173872413354006, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3976984024047852, + "rewards/thk_ans_format_reward": 1.0, + "step": 1931, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.53125762939453, + "epoch": 6.526138279932546, + "grad_norm": 7.545325626890654, + "kl": 0.5244140625, + "learning_rate": 4.5608108108108106e-07, + "loss": 0.0005, + "reward": 3.047685384750366, + "reward_std": 0.022525336127728224, + "rewards/final_reward": 1.05814256808342, + "rewards/mask_iou_reward": 0.52907128404171, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0476853251457214, + "rewards/thk_ans_format_reward": 1.0, + "step": 1932, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.20833587646484, + "epoch": 6.529510961214165, + "grad_norm": 33.660125151640464, + "kl": 0.44921875, + "learning_rate": 4.557995495495495e-07, + "loss": 0.0005, + "reward": 3.55889630317688, + "reward_std": 0.051030886359512806, + "rewards/final_reward": 1.5346261785456772, + "rewards/mask_iou_reward": 0.7673130892728386, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5588963627815247, + "rewards/thk_ans_format_reward": 1.0, + "step": 1933, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.05208587646484, + "epoch": 6.532883642495785, + "grad_norm": 8.120695624403554, + "kl": 0.431640625, + "learning_rate": 4.55518018018018e-07, + "loss": 0.0004, + "reward": 3.450679898262024, + "reward_std": 0.10136326961219311, + "rewards/final_reward": 0.9703960820165738, + "rewards/mask_iou_reward": 0.4851980410082869, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4506798386573792, + "rewards/thk_ans_format_reward": 1.0, + "step": 1934, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.8229217529297, + "epoch": 6.536256323777403, + "grad_norm": 12.669245787632661, + "kl": 4.64453125, + "learning_rate": 4.552364864864865e-07, + "loss": 0.0047, + "reward": 3.7870538234710693, + "reward_std": 0.07818649988621473, + "rewards/final_reward": 1.5274776588059071, + "rewards/mask_iou_reward": 0.7637388294029536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7870538234710693, + "rewards/thk_ans_format_reward": 1.0, + "step": 1935, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.90625762939453, + "epoch": 6.539629005059022, + "grad_norm": 11.033899027393797, + "kl": 0.416015625, + "learning_rate": 4.5495495495495496e-07, + "loss": 0.0004, + "reward": 3.4292943477630615, + "reward_std": 0.12554167211055756, + "rewards/final_reward": 1.6428870075377782, + "rewards/mask_iou_reward": 0.8214435037688891, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.429294228553772, + "rewards/thk_ans_format_reward": 1.0, + "step": 1936, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.2916717529297, + "epoch": 6.543001686340641, + "grad_norm": 8.971625066042417, + "kl": 0.5234375, + "learning_rate": 4.546734234234234e-07, + "loss": 0.0006, + "reward": 3.288731336593628, + "reward_std": 0.13752873055636883, + "rewards/final_reward": 1.865159786427796, + "rewards/mask_iou_reward": 0.932579893213898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2887312769889832, + "rewards/thk_ans_format_reward": 1.0, + "step": 1937, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.8854217529297, + "epoch": 6.54637436762226, + "grad_norm": 13.524320563229523, + "kl": 0.423828125, + "learning_rate": 4.543918918918919e-07, + "loss": 0.0005, + "reward": 3.1908024549484253, + "reward_std": 0.04454457201063633, + "rewards/final_reward": 1.530183020227454, + "rewards/mask_iou_reward": 0.765091510113727, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1908025741577148, + "rewards/thk_ans_format_reward": 1.0, + "step": 1938, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.48958587646484, + "epoch": 6.549747048903878, + "grad_norm": 7.095076055591257, + "kl": 0.54296875, + "learning_rate": 4.5411036036036034e-07, + "loss": 0.0006, + "reward": 3.755676031112671, + "reward_std": 0.05841661896556616, + "rewards/final_reward": 1.6603783972345676, + "rewards/mask_iou_reward": 0.8301891986172838, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7556763291358948, + "rewards/thk_ans_format_reward": 1.0, + "step": 1939, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.03125762939453, + "epoch": 6.5531197301854975, + "grad_norm": 19.24255297558277, + "kl": 0.443359375, + "learning_rate": 4.5382882882882885e-07, + "loss": 0.0004, + "reward": 3.671309232711792, + "reward_std": 0.07891538739204407, + "rewards/final_reward": 1.6045817813514578, + "rewards/mask_iou_reward": 0.8022908906757289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6713091135025024, + "rewards/thk_ans_format_reward": 1.0, + "step": 1940, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.12500762939453, + "epoch": 6.556492411467117, + "grad_norm": 21.61772380542658, + "kl": 0.44140625, + "learning_rate": 4.535472972972973e-07, + "loss": 0.0005, + "reward": 3.2939724922180176, + "reward_std": 0.07393957488238811, + "rewards/final_reward": 1.8902274489083017, + "rewards/mask_iou_reward": 0.9451137244541509, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2939725518226624, + "rewards/thk_ans_format_reward": 1.0, + "step": 1941, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.70833587646484, + "epoch": 6.559865092748735, + "grad_norm": 8.304061072522082, + "kl": 0.3974609375, + "learning_rate": 4.5326576576576577e-07, + "loss": 0.0004, + "reward": 3.7287017107009888, + "reward_std": 0.0779542843811214, + "rewards/final_reward": 1.565529514841001, + "rewards/mask_iou_reward": 0.7827647574205004, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7287017703056335, + "rewards/thk_ans_format_reward": 1.0, + "step": 1942, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.5729217529297, + "epoch": 6.5632377740303545, + "grad_norm": 29.07655724939814, + "kl": 0.4384765625, + "learning_rate": 4.5298423423423423e-07, + "loss": 0.0004, + "reward": 3.7415881156921387, + "reward_std": 0.11560340598225594, + "rewards/final_reward": 1.9170531111047713, + "rewards/mask_iou_reward": 0.9585265555523856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7415879368782043, + "rewards/thk_ans_format_reward": 1.0, + "step": 1943, + "think_completion_length": 10.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.61458587646484, + "epoch": 6.566610455311973, + "grad_norm": 6.927386222217618, + "kl": 0.47265625, + "learning_rate": 4.5270270270270264e-07, + "loss": 0.0005, + "reward": 3.2021960020065308, + "reward_std": 0.12428174912929535, + "rewards/final_reward": 0.5200134386180482, + "rewards/mask_iou_reward": 0.2600067193090241, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2021958231925964, + "rewards/thk_ans_format_reward": 1.0, + "step": 1944, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.09375762939453, + "epoch": 6.569983136593592, + "grad_norm": 9.047643638152596, + "kl": 0.4072265625, + "learning_rate": 4.524211711711711e-07, + "loss": 0.0005, + "reward": 3.5492023229599, + "reward_std": 0.052342869341373444, + "rewards/final_reward": 1.9278371578029603, + "rewards/mask_iou_reward": 0.9639185789014801, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5492025017738342, + "rewards/thk_ans_format_reward": 1.0, + "step": 1945, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.67708587646484, + "epoch": 6.5733558178752105, + "grad_norm": 8.075763010287291, + "kl": 0.4912109375, + "learning_rate": 4.521396396396396e-07, + "loss": 0.0006, + "reward": 3.7009243965148926, + "reward_std": 0.04675254225730896, + "rewards/final_reward": 1.7781464676900498, + "rewards/mask_iou_reward": 0.8890732338450249, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7009243369102478, + "rewards/thk_ans_format_reward": 1.0, + "step": 1946, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5625, + "epoch": 6.57672849915683, + "grad_norm": 15.369144475444557, + "kl": 0.5537109375, + "learning_rate": 4.518581081081081e-07, + "loss": 0.0005, + "reward": 3.627153992652893, + "reward_std": 0.11572860553860664, + "rewards/final_reward": 1.614166593708518, + "rewards/mask_iou_reward": 0.807083296854259, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6271539330482483, + "rewards/thk_ans_format_reward": 1.0, + "step": 1947, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.39583587646484, + "epoch": 6.580101180438449, + "grad_norm": 84.51339562057268, + "kl": 0.669921875, + "learning_rate": 4.5157657657657654e-07, + "loss": 0.0007, + "reward": 3.5069133043289185, + "reward_std": 0.15810798108577728, + "rewards/final_reward": 1.6061496162495597, + "rewards/mask_iou_reward": 0.8030748081247798, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5069132447242737, + "rewards/thk_ans_format_reward": 1.0, + "step": 1948, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.42709350585938, + "epoch": 6.583473861720067, + "grad_norm": 12.173912335159075, + "kl": 0.4990234375, + "learning_rate": 4.51295045045045e-07, + "loss": 0.0005, + "reward": 3.458880066871643, + "reward_std": 0.079188940115273, + "rewards/final_reward": 1.3770498445233732, + "rewards/mask_iou_reward": 0.6885249222616866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4588798880577087, + "rewards/thk_ans_format_reward": 1.0, + "step": 1949, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.47916793823242, + "epoch": 6.586846543001687, + "grad_norm": 17.453147272178725, + "kl": 0.541015625, + "learning_rate": 4.5101351351351346e-07, + "loss": 0.0005, + "reward": 3.465920329093933, + "reward_std": 0.07390345633029938, + "rewards/final_reward": 1.7473652437753469, + "rewards/mask_iou_reward": 0.8736826218876734, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.465920329093933, + "rewards/thk_ans_format_reward": 1.0, + "step": 1950, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.67708587646484, + "epoch": 6.590219224283305, + "grad_norm": 7.769345536825688, + "kl": 0.4599609375, + "learning_rate": 4.5073198198198197e-07, + "loss": 0.0005, + "reward": 3.3470152616500854, + "reward_std": 0.08661656081676483, + "rewards/final_reward": 1.3208495764728003, + "rewards/mask_iou_reward": 0.6604247882364002, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3470152616500854, + "rewards/thk_ans_format_reward": 1.0, + "step": 1951, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.34375762939453, + "epoch": 6.593591905564924, + "grad_norm": 15.386128922804302, + "kl": 0.53125, + "learning_rate": 4.5045045045045043e-07, + "loss": 0.0006, + "reward": 3.5289965867996216, + "reward_std": 0.14280862733721733, + "rewards/final_reward": 1.461553967923843, + "rewards/mask_iou_reward": 0.7307769839619215, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5289965271949768, + "rewards/thk_ans_format_reward": 1.0, + "step": 1952, + "think_completion_length": 7.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.67708587646484, + "epoch": 6.596964586846543, + "grad_norm": 26.095973849490772, + "kl": 0.474609375, + "learning_rate": 4.501689189189189e-07, + "loss": 0.0005, + "reward": 3.497464895248413, + "reward_std": 0.1548975557088852, + "rewards/final_reward": 1.8610477691556193, + "rewards/mask_iou_reward": 0.9305238845778097, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4974648356437683, + "rewards/thk_ans_format_reward": 1.0, + "step": 1953, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.6041717529297, + "epoch": 6.600337268128162, + "grad_norm": 18.14612727739541, + "kl": 3.826171875, + "learning_rate": 4.4988738738738735e-07, + "loss": 0.0038, + "reward": 3.464960217475891, + "reward_std": 0.168898306787014, + "rewards/final_reward": 1.8793782250237863, + "rewards/mask_iou_reward": 0.9396891125118931, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4857934713363647, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1954, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.53125762939453, + "epoch": 6.60370994940978, + "grad_norm": 46.67923195576741, + "kl": 0.703125, + "learning_rate": 4.496058558558558e-07, + "loss": 0.0007, + "reward": 3.451904535293579, + "reward_std": 0.11900551989674568, + "rewards/final_reward": 1.68860443712021, + "rewards/mask_iou_reward": 0.844302218560105, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4519044756889343, + "rewards/thk_ans_format_reward": 1.0, + "step": 1955, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.10417175292969, + "epoch": 6.6070826306914, + "grad_norm": 9.579470561626394, + "kl": 0.4453125, + "learning_rate": 4.4932432432432433e-07, + "loss": 0.0005, + "reward": 3.6845006942749023, + "reward_std": 0.04711965471506119, + "rewards/final_reward": 1.8025177070464735, + "rewards/mask_iou_reward": 0.9012588535232368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.684500515460968, + "rewards/thk_ans_format_reward": 1.0, + "step": 1956, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.0104217529297, + "epoch": 6.610455311973018, + "grad_norm": 11.62271298127058, + "kl": 0.4140625, + "learning_rate": 4.490427927927928e-07, + "loss": 0.0004, + "reward": 3.4423630237579346, + "reward_std": 0.06707348302006721, + "rewards/final_reward": 1.4323155140969193, + "rewards/mask_iou_reward": 0.7161577570484596, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4423629641532898, + "rewards/thk_ans_format_reward": 1.0, + "step": 1957, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.375, + "epoch": 6.613827993254637, + "grad_norm": 5.29781925562583, + "kl": 0.56640625, + "learning_rate": 4.4876126126126125e-07, + "loss": 0.0005, + "reward": 3.433348774909973, + "reward_std": 0.03969069384038448, + "rewards/final_reward": 1.8051425197216238, + "rewards/mask_iou_reward": 0.9025712598608119, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4333484768867493, + "rewards/thk_ans_format_reward": 1.0, + "step": 1958, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.92708587646484, + "epoch": 6.617200674536257, + "grad_norm": 8.482364662005988, + "kl": 0.5, + "learning_rate": 4.484797297297297e-07, + "loss": 0.0005, + "reward": 3.334537982940674, + "reward_std": 0.2071598581969738, + "rewards/final_reward": 1.401921329152926, + "rewards/mask_iou_reward": 0.700960664576463, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3553712964057922, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1959, + "think_completion_length": 7.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.65625762939453, + "epoch": 6.620573355817875, + "grad_norm": 9.109768103898793, + "kl": 0.548828125, + "learning_rate": 4.4819819819819817e-07, + "loss": 0.0005, + "reward": 3.436391592025757, + "reward_std": 0.09884997457265854, + "rewards/final_reward": 1.448345198392761, + "rewards/mask_iou_reward": 0.7241725991963806, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4363916516304016, + "rewards/thk_ans_format_reward": 1.0, + "step": 1960, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.09375, + "epoch": 6.623946037099494, + "grad_norm": 7.618736225368571, + "kl": 0.521484375, + "learning_rate": 4.479166666666667e-07, + "loss": 0.0005, + "reward": 3.4175766706466675, + "reward_std": 0.19502199813723564, + "rewards/final_reward": 1.3908411124500102, + "rewards/mask_iou_reward": 0.6954205562250051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4175763726234436, + "rewards/thk_ans_format_reward": 1.0, + "step": 1961, + "think_completion_length": 9.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.34375762939453, + "epoch": 6.627318718381113, + "grad_norm": 9.619492441340693, + "kl": 0.4921875, + "learning_rate": 4.4763513513513514e-07, + "loss": 0.0005, + "reward": 3.8224265575408936, + "reward_std": 0.07703239098191261, + "rewards/final_reward": 1.9362953050973177, + "rewards/mask_iou_reward": 0.9681476525486589, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8224263787269592, + "rewards/thk_ans_format_reward": 1.0, + "step": 1962, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.71875381469727, + "epoch": 6.630691399662732, + "grad_norm": 7.710027079651332, + "kl": 0.5009765625, + "learning_rate": 4.473536036036036e-07, + "loss": 0.0005, + "reward": 3.3381128311157227, + "reward_std": 0.10878269374370575, + "rewards/final_reward": 1.7846295429377987, + "rewards/mask_iou_reward": 0.8923147714688994, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.338112473487854, + "rewards/thk_ans_format_reward": 1.0, + "step": 1963, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.21875381469727, + "epoch": 6.63406408094435, + "grad_norm": 11.144801235946936, + "kl": 0.498046875, + "learning_rate": 4.47072072072072e-07, + "loss": 0.0005, + "reward": 3.5592269897460938, + "reward_std": 0.06369103118777275, + "rewards/final_reward": 1.1932721632080656, + "rewards/mask_iou_reward": 0.5966360816040328, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.559226930141449, + "rewards/thk_ans_format_reward": 1.0, + "step": 1964, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.16667175292969, + "epoch": 6.63743676222597, + "grad_norm": 7.580881113025555, + "kl": 0.419921875, + "learning_rate": 4.467905405405405e-07, + "loss": 0.0004, + "reward": 3.5747615098953247, + "reward_std": 0.037079617381095886, + "rewards/final_reward": 1.4820962960826627, + "rewards/mask_iou_reward": 0.7410481480413313, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5747616291046143, + "rewards/thk_ans_format_reward": 1.0, + "step": 1965, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.98958587646484, + "epoch": 6.640809443507589, + "grad_norm": 25.18062527412311, + "kl": 0.384765625, + "learning_rate": 4.46509009009009e-07, + "loss": 0.0004, + "reward": 3.6321656703948975, + "reward_std": 0.09968181699514389, + "rewards/final_reward": 1.7989874917960231, + "rewards/mask_iou_reward": 0.8994937458980116, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6321657299995422, + "rewards/thk_ans_format_reward": 1.0, + "step": 1966, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.03125381469727, + "epoch": 6.644182124789207, + "grad_norm": 23.268402345738316, + "kl": 0.4365234375, + "learning_rate": 4.4622747747747745e-07, + "loss": 0.0004, + "reward": 3.722867250442505, + "reward_std": 0.07552542351186275, + "rewards/final_reward": 1.9138519713931477, + "rewards/mask_iou_reward": 0.9569259856965738, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7228673696517944, + "rewards/thk_ans_format_reward": 1.0, + "step": 1967, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.31250381469727, + "epoch": 6.6475548060708265, + "grad_norm": 9.668951180153915, + "kl": 0.46875, + "learning_rate": 4.459459459459459e-07, + "loss": 0.0005, + "reward": 3.4191720485687256, + "reward_std": 0.09570085257291794, + "rewards/final_reward": 1.1168326682335907, + "rewards/mask_iou_reward": 0.5584163341167954, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4191720485687256, + "rewards/thk_ans_format_reward": 1.0, + "step": 1968, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.84375, + "epoch": 6.650927487352445, + "grad_norm": 12.650533772674278, + "kl": 0.4365234375, + "learning_rate": 4.4566441441441437e-07, + "loss": 0.0004, + "reward": 3.7565109729766846, + "reward_std": 0.034082308411598206, + "rewards/final_reward": 1.794588959040616, + "rewards/mask_iou_reward": 0.897294479520308, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7565112113952637, + "rewards/thk_ans_format_reward": 1.0, + "step": 1969, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.20833587646484, + "epoch": 6.654300168634064, + "grad_norm": 9.648538208038131, + "kl": 0.42578125, + "learning_rate": 4.4538288288288283e-07, + "loss": 0.0004, + "reward": 3.744592070579529, + "reward_std": 0.051872748881578445, + "rewards/final_reward": 1.8309302844557134, + "rewards/mask_iou_reward": 0.9154651422278567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7445920705795288, + "rewards/thk_ans_format_reward": 1.0, + "step": 1970, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 112.89583587646484, + "epoch": 6.6576728499156825, + "grad_norm": 7.015618502596955, + "kl": 0.615234375, + "learning_rate": 4.4510135135135134e-07, + "loss": 0.0006, + "reward": 2.9393208026885986, + "reward_std": 0.16016435716301203, + "rewards/final_reward": 0.18500248755062323, + "rewards/mask_iou_reward": 0.09250124377531162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9393208026885986, + "rewards/thk_ans_format_reward": 1.0, + "step": 1971, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.32291793823242, + "epoch": 6.661045531197302, + "grad_norm": 20.898709748720457, + "kl": 0.4794921875, + "learning_rate": 4.448198198198198e-07, + "loss": 0.0005, + "reward": 3.446476101875305, + "reward_std": 0.05248234234750271, + "rewards/final_reward": 1.1091014816709772, + "rewards/mask_iou_reward": 0.5545507408354886, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4464759826660156, + "rewards/thk_ans_format_reward": 1.0, + "step": 1972, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.15625381469727, + "epoch": 6.664418212478921, + "grad_norm": 14.035078577908726, + "kl": 0.478515625, + "learning_rate": 4.4453828828828826e-07, + "loss": 0.0005, + "reward": 3.5463058948516846, + "reward_std": 0.12094194442033768, + "rewards/final_reward": 1.6668437508848952, + "rewards/mask_iou_reward": 0.8334218754424476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5463060140609741, + "rewards/thk_ans_format_reward": 1.0, + "step": 1973, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.62500762939453, + "epoch": 6.6677908937605395, + "grad_norm": 12.934268271191499, + "kl": 0.4150390625, + "learning_rate": 4.442567567567567e-07, + "loss": 0.0004, + "reward": 3.3734281063079834, + "reward_std": 0.06955000199377537, + "rewards/final_reward": 1.6885538225511096, + "rewards/mask_iou_reward": 0.8442769112755548, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.373428225517273, + "rewards/thk_ans_format_reward": 1.0, + "step": 1974, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.6770896911621, + "epoch": 6.671163575042159, + "grad_norm": 8.430511842274917, + "kl": 0.537109375, + "learning_rate": 4.439752252252252e-07, + "loss": 0.0005, + "reward": 3.0608272552490234, + "reward_std": 0.09121683984994888, + "rewards/final_reward": 1.2410802690536467, + "rewards/mask_iou_reward": 0.6205401345268233, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0608273148536682, + "rewards/thk_ans_format_reward": 1.0, + "step": 1975, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.1145896911621, + "epoch": 6.674536256323777, + "grad_norm": 8.93189961549658, + "kl": 0.4951171875, + "learning_rate": 4.436936936936937e-07, + "loss": 0.0005, + "reward": 3.746822953224182, + "reward_std": 0.060973282903432846, + "rewards/final_reward": 1.8544490342566362, + "rewards/mask_iou_reward": 0.9272245171283181, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7468228340148926, + "rewards/thk_ans_format_reward": 1.0, + "step": 1976, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.68750762939453, + "epoch": 6.677908937605396, + "grad_norm": 10.003023701820984, + "kl": 0.4462890625, + "learning_rate": 4.4341216216216216e-07, + "loss": 0.0004, + "reward": 3.589956760406494, + "reward_std": 0.11278185062110424, + "rewards/final_reward": 1.6687520003064154, + "rewards/mask_iou_reward": 0.8343760001532077, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5899565815925598, + "rewards/thk_ans_format_reward": 1.0, + "step": 1977, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.44791793823242, + "epoch": 6.681281618887015, + "grad_norm": 16.828960754253966, + "kl": 0.4638671875, + "learning_rate": 4.431306306306306e-07, + "loss": 0.0005, + "reward": 3.5644259452819824, + "reward_std": 0.060715802013874054, + "rewards/final_reward": 1.7956880990376614, + "rewards/mask_iou_reward": 0.8978440495188307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5644258856773376, + "rewards/thk_ans_format_reward": 1.0, + "step": 1978, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.6041717529297, + "epoch": 6.684654300168634, + "grad_norm": 10.676107641246627, + "kl": 0.41015625, + "learning_rate": 4.428490990990991e-07, + "loss": 0.0004, + "reward": 3.4321783781051636, + "reward_std": 0.23373743519186974, + "rewards/final_reward": 1.3625278164093109, + "rewards/mask_iou_reward": 0.6812639082046554, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4530117511749268, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 1979, + "think_completion_length": 7.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.87500762939453, + "epoch": 6.688026981450253, + "grad_norm": 13.405735230042936, + "kl": 0.4658203125, + "learning_rate": 4.4256756756756754e-07, + "loss": 0.0005, + "reward": 3.292783737182617, + "reward_std": 0.06245427392423153, + "rewards/final_reward": 1.5012419177457694, + "rewards/mask_iou_reward": 0.7506209588728847, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2927836179733276, + "rewards/thk_ans_format_reward": 1.0, + "step": 1980, + "think_completion_length": 6.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.93750762939453, + "epoch": 6.691399662731872, + "grad_norm": 13.934015511756387, + "kl": 0.4453125, + "learning_rate": 4.4228603603603606e-07, + "loss": 0.0005, + "reward": 3.5601495504379272, + "reward_std": 0.11545858904719353, + "rewards/final_reward": 1.9275234496275209, + "rewards/mask_iou_reward": 0.9637617248137604, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.560149371623993, + "rewards/thk_ans_format_reward": 1.0, + "step": 1981, + "think_completion_length": 7.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.28125, + "epoch": 6.694772344013491, + "grad_norm": 22.647171475849067, + "kl": 0.404296875, + "learning_rate": 4.420045045045045e-07, + "loss": 0.0004, + "reward": 3.3105775117874146, + "reward_std": 0.06604907289147377, + "rewards/final_reward": 0.9340656949871835, + "rewards/mask_iou_reward": 0.46703284749359175, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.310577392578125, + "rewards/thk_ans_format_reward": 1.0, + "step": 1982, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.61458587646484, + "epoch": 6.698145025295109, + "grad_norm": 9.748655133158199, + "kl": 0.4072265625, + "learning_rate": 4.41722972972973e-07, + "loss": 0.0005, + "reward": 3.4918267726898193, + "reward_std": 0.06488988548517227, + "rewards/final_reward": 1.7892597610591587, + "rewards/mask_iou_reward": 0.8946298805295794, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4918268322944641, + "rewards/thk_ans_format_reward": 1.0, + "step": 1983, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.1770896911621, + "epoch": 6.701517706576729, + "grad_norm": 7.5516474807955705, + "kl": 0.4306640625, + "learning_rate": 4.414414414414414e-07, + "loss": 0.0005, + "reward": 3.610625743865967, + "reward_std": 0.03432144969701767, + "rewards/final_reward": 1.8832431743295208, + "rewards/mask_iou_reward": 0.9416215871647604, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.610625684261322, + "rewards/thk_ans_format_reward": 1.0, + "step": 1984, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.27084350585938, + "epoch": 6.704890387858347, + "grad_norm": 9.755324488231713, + "kl": 0.4091796875, + "learning_rate": 4.4115990990990985e-07, + "loss": 0.0004, + "reward": 3.649802088737488, + "reward_std": 0.09982002340257168, + "rewards/final_reward": 1.787384414540992, + "rewards/mask_iou_reward": 0.893692207270496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6498020887374878, + "rewards/thk_ans_format_reward": 1.0, + "step": 1985, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.95833587646484, + "epoch": 6.708263069139966, + "grad_norm": 6.872406296871829, + "kl": 0.4326171875, + "learning_rate": 4.4087837837837836e-07, + "loss": 0.0004, + "reward": 3.651780366897583, + "reward_std": 0.03141362592577934, + "rewards/final_reward": 1.8310194196700014, + "rewards/mask_iou_reward": 0.9155097098350007, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.651780366897583, + "rewards/thk_ans_format_reward": 1.0, + "step": 1986, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.50000762939453, + "epoch": 6.7116357504215856, + "grad_norm": 6.035689301729314, + "kl": 0.767578125, + "learning_rate": 4.405968468468468e-07, + "loss": 0.0008, + "reward": 3.4294917583465576, + "reward_std": 0.05013443436473608, + "rewards/final_reward": 0.8621415224398494, + "rewards/mask_iou_reward": 0.4310707612199247, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4294917583465576, + "rewards/thk_ans_format_reward": 1.0, + "step": 1987, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.62500762939453, + "epoch": 6.715008431703204, + "grad_norm": 10.28762324119743, + "kl": 0.41796875, + "learning_rate": 4.403153153153153e-07, + "loss": 0.0004, + "reward": 3.6641100645065308, + "reward_std": 0.06396713852882385, + "rewards/final_reward": 1.8392503553519635, + "rewards/mask_iou_reward": 0.9196251776759817, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6641101241111755, + "rewards/thk_ans_format_reward": 1.0, + "step": 1988, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.02083587646484, + "epoch": 6.718381112984823, + "grad_norm": 10.948623290376469, + "kl": 0.453125, + "learning_rate": 4.4003378378378374e-07, + "loss": 0.0004, + "reward": 3.5362966060638428, + "reward_std": 0.07892968133091927, + "rewards/final_reward": 1.5511359942392953, + "rewards/mask_iou_reward": 0.7755679971196476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5362964868545532, + "rewards/thk_ans_format_reward": 1.0, + "step": 1989, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.1875, + "epoch": 6.721753794266442, + "grad_norm": 54.9128726538673, + "kl": 0.439453125, + "learning_rate": 4.397522522522522e-07, + "loss": 0.0005, + "reward": 3.337985396385193, + "reward_std": 0.10868017747998238, + "rewards/final_reward": 1.6815334552677537, + "rewards/mask_iou_reward": 0.8407667276338768, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.337985336780548, + "rewards/thk_ans_format_reward": 1.0, + "step": 1990, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.95833587646484, + "epoch": 6.725126475548061, + "grad_norm": 18.053724543999564, + "kl": 0.416015625, + "learning_rate": 4.394707207207207e-07, + "loss": 0.0004, + "reward": 3.48260235786438, + "reward_std": 0.044008538126945496, + "rewards/final_reward": 1.215805165491985, + "rewards/mask_iou_reward": 0.6079025827459925, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4826021194458008, + "rewards/thk_ans_format_reward": 1.0, + "step": 1991, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.9166717529297, + "epoch": 6.728499156829679, + "grad_norm": 16.723041509589873, + "kl": 0.4111328125, + "learning_rate": 4.391891891891892e-07, + "loss": 0.0004, + "reward": 3.580541491508484, + "reward_std": 0.029503321275115013, + "rewards/final_reward": 0.8807408224852558, + "rewards/mask_iou_reward": 0.4403704112426279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5805413722991943, + "rewards/thk_ans_format_reward": 1.0, + "step": 1992, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.03125, + "epoch": 6.7318718381112985, + "grad_norm": 9.532645094987426, + "kl": 0.564453125, + "learning_rate": 4.3890765765765764e-07, + "loss": 0.0006, + "reward": 3.3507272005081177, + "reward_std": 0.16045157611370087, + "rewards/final_reward": 1.4624383649971802, + "rewards/mask_iou_reward": 0.7312191824985901, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.350727140903473, + "rewards/thk_ans_format_reward": 1.0, + "step": 1993, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.71875762939453, + "epoch": 6.735244519392918, + "grad_norm": 103.84941218533088, + "kl": 0.4140625, + "learning_rate": 4.386261261261261e-07, + "loss": 0.0004, + "reward": 3.5490139722824097, + "reward_std": 0.11374928709119558, + "rewards/final_reward": 1.794110779942215, + "rewards/mask_iou_reward": 0.8970553899711075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.549013912677765, + "rewards/thk_ans_format_reward": 1.0, + "step": 1994, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.59375762939453, + "epoch": 6.738617200674536, + "grad_norm": 7.3091119459144895, + "kl": 0.447265625, + "learning_rate": 4.3834459459459456e-07, + "loss": 0.0005, + "reward": 3.6078637838363647, + "reward_std": 0.04512942023575306, + "rewards/final_reward": 1.2506563243584963, + "rewards/mask_iou_reward": 0.6253281621792481, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6078636050224304, + "rewards/thk_ans_format_reward": 1.0, + "step": 1995, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.11458587646484, + "epoch": 6.7419898819561555, + "grad_norm": 9.110154205303866, + "kl": 0.4443359375, + "learning_rate": 4.3806306306306307e-07, + "loss": 0.0004, + "reward": 3.3878947496414185, + "reward_std": 0.09348580799996853, + "rewards/final_reward": 0.8257914149734332, + "rewards/mask_iou_reward": 0.4128957074867166, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3878947496414185, + "rewards/thk_ans_format_reward": 1.0, + "step": 1996, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.48958587646484, + "epoch": 6.745362563237774, + "grad_norm": 25.63321763421153, + "kl": 0.404296875, + "learning_rate": 4.3778153153153153e-07, + "loss": 0.0004, + "reward": 3.3954946994781494, + "reward_std": 0.11219822522252798, + "rewards/final_reward": 1.9292324835472883, + "rewards/mask_iou_reward": 0.9646162417736441, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3954947590827942, + "rewards/thk_ans_format_reward": 1.0, + "step": 1997, + "think_completion_length": 6.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.6979217529297, + "epoch": 6.748735244519393, + "grad_norm": 6.710029564547177, + "kl": 0.541015625, + "learning_rate": 4.375e-07, + "loss": 0.0005, + "reward": 3.6499743461608887, + "reward_std": 0.11773128435015678, + "rewards/final_reward": 1.974263432550682, + "rewards/mask_iou_reward": 0.987131716275341, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6499744653701782, + "rewards/thk_ans_format_reward": 1.0, + "step": 1998, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.35416793823242, + "epoch": 6.7521079258010115, + "grad_norm": 10.378393283198248, + "kl": 0.607421875, + "learning_rate": 4.3721846846846845e-07, + "loss": 0.0006, + "reward": 3.4805933237075806, + "reward_std": 0.02605645265430212, + "rewards/final_reward": 1.6452655520916593, + "rewards/mask_iou_reward": 0.8226327760458296, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4805933833122253, + "rewards/thk_ans_format_reward": 1.0, + "step": 1999, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.36458587646484, + "epoch": 6.755480607082631, + "grad_norm": 10.707522224413472, + "kl": 0.4482421875, + "learning_rate": 4.369369369369369e-07, + "loss": 0.0005, + "reward": 3.482789993286133, + "reward_std": 0.077627994120121, + "rewards/final_reward": 1.5862446101483059, + "rewards/mask_iou_reward": 0.7931223050741529, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4827898144721985, + "rewards/thk_ans_format_reward": 1.0, + "step": 2000, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.59375381469727, + "epoch": 6.75885328836425, + "grad_norm": 10.464256908306714, + "kl": 0.490234375, + "learning_rate": 4.3665540540540543e-07, + "loss": 0.0005, + "reward": 3.395174980163574, + "reward_std": 0.18283828347921371, + "rewards/final_reward": 1.5661229109850008, + "rewards/mask_iou_reward": 0.7830614554925004, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3951751589775085, + "rewards/thk_ans_format_reward": 1.0, + "step": 2001, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.6666717529297, + "epoch": 6.762225969645868, + "grad_norm": 8.871161573522318, + "kl": 0.478515625, + "learning_rate": 4.363738738738739e-07, + "loss": 0.0005, + "reward": 3.6884692907333374, + "reward_std": 0.10596386343240738, + "rewards/final_reward": 1.7219217855060478, + "rewards/mask_iou_reward": 0.8609608927530239, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6884692907333374, + "rewards/thk_ans_format_reward": 1.0, + "step": 2002, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.78125381469727, + "epoch": 6.765598650927488, + "grad_norm": 11.590469988382258, + "kl": 0.4375, + "learning_rate": 4.360923423423423e-07, + "loss": 0.0005, + "reward": 3.7286834716796875, + "reward_std": 0.017819946398958564, + "rewards/final_reward": 1.8638369676242008, + "rewards/mask_iou_reward": 0.9319184838121004, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7286832928657532, + "rewards/thk_ans_format_reward": 1.0, + "step": 2003, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.7916717529297, + "epoch": 6.768971332209106, + "grad_norm": 8.86489481149611, + "kl": 0.4755859375, + "learning_rate": 4.3581081081081076e-07, + "loss": 0.0005, + "reward": 3.4955180883407593, + "reward_std": 0.05199388600885868, + "rewards/final_reward": 1.7048009412676646, + "rewards/mask_iou_reward": 0.8524004706338323, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4955180883407593, + "rewards/thk_ans_format_reward": 1.0, + "step": 2004, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.6770896911621, + "epoch": 6.772344013490725, + "grad_norm": 9.166938127616588, + "kl": 0.4013671875, + "learning_rate": 4.355292792792792e-07, + "loss": 0.0004, + "reward": 3.5893832445144653, + "reward_std": 0.12393485009670258, + "rewards/final_reward": 1.8329345116802633, + "rewards/mask_iou_reward": 0.9164672558401317, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5893832445144653, + "rewards/thk_ans_format_reward": 1.0, + "step": 2005, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.92708587646484, + "epoch": 6.775716694772344, + "grad_norm": 14.396280349467032, + "kl": 0.4501953125, + "learning_rate": 4.3524774774774773e-07, + "loss": 0.0005, + "reward": 3.6871821880340576, + "reward_std": 0.04286697134375572, + "rewards/final_reward": 1.803451205269603, + "rewards/mask_iou_reward": 0.9017256026348015, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6871821880340576, + "rewards/thk_ans_format_reward": 1.0, + "step": 2006, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.6666717529297, + "epoch": 6.779089376053963, + "grad_norm": 13.29191324189456, + "kl": 0.4033203125, + "learning_rate": 4.349662162162162e-07, + "loss": 0.0005, + "reward": 3.649420142173767, + "reward_std": 0.06181117706000805, + "rewards/final_reward": 1.3635650259031187, + "rewards/mask_iou_reward": 0.6817825129515593, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6494203209877014, + "rewards/thk_ans_format_reward": 1.0, + "step": 2007, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.7291717529297, + "epoch": 6.782462057335582, + "grad_norm": 10.68491866592069, + "kl": 0.435546875, + "learning_rate": 4.3468468468468465e-07, + "loss": 0.0004, + "reward": 3.775758147239685, + "reward_std": 0.17000571638345718, + "rewards/final_reward": 1.8791206054269782, + "rewards/mask_iou_reward": 0.9395603027134891, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.796591579914093, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2008, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.25000762939453, + "epoch": 6.785834738617201, + "grad_norm": 28.604722545822455, + "kl": 0.439453125, + "learning_rate": 4.344031531531531e-07, + "loss": 0.0004, + "reward": 3.454217791557312, + "reward_std": 0.046439859084784985, + "rewards/final_reward": 1.6288204235372983, + "rewards/mask_iou_reward": 0.8144102117686491, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.454217791557312, + "rewards/thk_ans_format_reward": 1.0, + "step": 2009, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.89583587646484, + "epoch": 6.78920741989882, + "grad_norm": 5.868265505091315, + "kl": 0.4248046875, + "learning_rate": 4.341216216216216e-07, + "loss": 0.0004, + "reward": 3.2674766778945923, + "reward_std": 0.029944440349936485, + "rewards/final_reward": 1.4306260788189844, + "rewards/mask_iou_reward": 0.7153130394094922, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.267476499080658, + "rewards/thk_ans_format_reward": 1.0, + "step": 2010, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.65625, + "epoch": 6.792580101180438, + "grad_norm": 19.454490820847745, + "kl": 0.33984375, + "learning_rate": 4.338400900900901e-07, + "loss": 0.0003, + "reward": 3.4577724933624268, + "reward_std": 0.18022079020738602, + "rewards/final_reward": 1.252627623997265, + "rewards/mask_iou_reward": 0.6263138119986325, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4786058068275452, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2011, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.73958587646484, + "epoch": 6.795952782462058, + "grad_norm": 9.842950941312992, + "kl": 0.4677734375, + "learning_rate": 4.3355855855855855e-07, + "loss": 0.0005, + "reward": 3.304463267326355, + "reward_std": 0.1052134744822979, + "rewards/final_reward": 1.713373917512853, + "rewards/mask_iou_reward": 0.8566869587564265, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3044632077217102, + "rewards/thk_ans_format_reward": 1.0, + "step": 2012, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.4479217529297, + "epoch": 6.799325463743676, + "grad_norm": 11.935152460455082, + "kl": 0.3857421875, + "learning_rate": 4.33277027027027e-07, + "loss": 0.0004, + "reward": 3.6137847900390625, + "reward_std": 0.12032023817300797, + "rewards/final_reward": 1.956927414618658, + "rewards/mask_iou_reward": 0.978463707309329, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.613784670829773, + "rewards/thk_ans_format_reward": 1.0, + "step": 2013, + "think_completion_length": 6.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.5729217529297, + "epoch": 6.802698145025295, + "grad_norm": 11.123120156843346, + "kl": 0.529296875, + "learning_rate": 4.3299549549549547e-07, + "loss": 0.0005, + "reward": 3.1668895483016968, + "reward_std": 0.16719982773065567, + "rewards/final_reward": 0.6149814804918514, + "rewards/mask_iou_reward": 0.3074907402459257, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1668896079063416, + "rewards/thk_ans_format_reward": 1.0, + "step": 2014, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.28125381469727, + "epoch": 6.806070826306914, + "grad_norm": 7.836398907878883, + "kl": 0.5341796875, + "learning_rate": 4.3271396396396393e-07, + "loss": 0.0006, + "reward": 3.5650718212127686, + "reward_std": 0.10769028216600418, + "rewards/final_reward": 1.439123274000237, + "rewards/mask_iou_reward": 0.7195616370001185, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5650717616081238, + "rewards/thk_ans_format_reward": 1.0, + "step": 2015, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.8229217529297, + "epoch": 6.809443507588533, + "grad_norm": 6.754528250710224, + "kl": 0.390625, + "learning_rate": 4.3243243243243244e-07, + "loss": 0.0004, + "reward": 3.625916361808777, + "reward_std": 0.051243921276181936, + "rewards/final_reward": 1.0752049657908953, + "rewards/mask_iou_reward": 0.5376024828954477, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6259164214134216, + "rewards/thk_ans_format_reward": 1.0, + "step": 2016, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.40625762939453, + "epoch": 6.812816188870151, + "grad_norm": 8.338203379557932, + "kl": 0.587890625, + "learning_rate": 4.321509009009009e-07, + "loss": 0.0006, + "reward": 3.6156070232391357, + "reward_std": 0.045968128368258476, + "rewards/final_reward": 1.7577103244250059, + "rewards/mask_iou_reward": 0.8788551622125029, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6156070828437805, + "rewards/thk_ans_format_reward": 1.0, + "step": 2017, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.87500762939453, + "epoch": 6.8161888701517706, + "grad_norm": 14.692569916989621, + "kl": 0.404296875, + "learning_rate": 4.3186936936936937e-07, + "loss": 0.0004, + "reward": 3.511265277862549, + "reward_std": 0.11939521878957748, + "rewards/final_reward": 1.8363637454485904, + "rewards/mask_iou_reward": 0.9181818727242952, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.511265218257904, + "rewards/thk_ans_format_reward": 1.0, + "step": 2018, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.06250762939453, + "epoch": 6.81956155143339, + "grad_norm": 21.775539312569634, + "kl": 0.4091796875, + "learning_rate": 4.315878378378378e-07, + "loss": 0.0004, + "reward": 3.3648746013641357, + "reward_std": 0.04116539843380451, + "rewards/final_reward": 1.5359148831507619, + "rewards/mask_iou_reward": 0.7679574415753809, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.364874541759491, + "rewards/thk_ans_format_reward": 1.0, + "step": 2019, + "think_completion_length": 7.166666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.78125, + "epoch": 6.822934232715008, + "grad_norm": 19.70188938671734, + "kl": 0.419921875, + "learning_rate": 4.313063063063063e-07, + "loss": 0.0004, + "reward": 3.721093535423279, + "reward_std": 0.07094984129071236, + "rewards/final_reward": 1.6374509550249228, + "rewards/mask_iou_reward": 0.8187254775124614, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7210937142372131, + "rewards/thk_ans_format_reward": 1.0, + "step": 2020, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.28125762939453, + "epoch": 6.8263069139966275, + "grad_norm": 27.009543335856872, + "kl": 0.361328125, + "learning_rate": 4.310247747747748e-07, + "loss": 0.0004, + "reward": 3.712403893470764, + "reward_std": 0.25884455256164074, + "rewards/final_reward": 1.7145527885472487, + "rewards/mask_iou_reward": 0.8572763942736243, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.7436540722846985, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2021, + "think_completion_length": 6.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.03125762939453, + "epoch": 6.829679595278246, + "grad_norm": 9.488178524727436, + "kl": 0.462890625, + "learning_rate": 4.3074324324324326e-07, + "loss": 0.0005, + "reward": 3.418555974960327, + "reward_std": 0.11346096568740904, + "rewards/final_reward": 1.8226451478927959, + "rewards/mask_iou_reward": 0.9113225739463979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4185560941696167, + "rewards/thk_ans_format_reward": 1.0, + "step": 2022, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.23958587646484, + "epoch": 6.833052276559865, + "grad_norm": 8.097256330169499, + "kl": 0.486328125, + "learning_rate": 4.3046171171171167e-07, + "loss": 0.0005, + "reward": 3.4910701513290405, + "reward_std": 0.11701249331235886, + "rewards/final_reward": 1.4391594572018165, + "rewards/mask_iou_reward": 0.7195797286009082, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4910700917243958, + "rewards/thk_ans_format_reward": 1.0, + "step": 2023, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.1666717529297, + "epoch": 6.8364249578414835, + "grad_norm": 8.493282353877067, + "kl": 0.453125, + "learning_rate": 4.3018018018018013e-07, + "loss": 0.0005, + "reward": 3.4912160634994507, + "reward_std": 0.09391393139958382, + "rewards/final_reward": 1.438580935765031, + "rewards/mask_iou_reward": 0.7192904678825155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4912160634994507, + "rewards/thk_ans_format_reward": 1.0, + "step": 2024, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.02084350585938, + "epoch": 6.839797639123103, + "grad_norm": 13.795074515453015, + "kl": 0.380859375, + "learning_rate": 4.298986486486486e-07, + "loss": 0.0004, + "reward": 3.5047991275787354, + "reward_std": 0.06105640344321728, + "rewards/final_reward": 1.382774638880558, + "rewards/mask_iou_reward": 0.691387319440279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5047988891601562, + "rewards/thk_ans_format_reward": 1.0, + "step": 2025, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.3541717529297, + "epoch": 6.843170320404722, + "grad_norm": 26.800221203820207, + "kl": 0.453125, + "learning_rate": 4.296171171171171e-07, + "loss": 0.0005, + "reward": 3.1508926153182983, + "reward_std": 0.1210801713168621, + "rewards/final_reward": 1.5852870872997817, + "rewards/mask_iou_reward": 0.7926435436498909, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1508926749229431, + "rewards/thk_ans_format_reward": 1.0, + "step": 2026, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.1979217529297, + "epoch": 6.8465430016863404, + "grad_norm": 11.371052034781563, + "kl": 0.4189453125, + "learning_rate": 4.2933558558558556e-07, + "loss": 0.0004, + "reward": 3.4964561462402344, + "reward_std": 0.12025372684001923, + "rewards/final_reward": 1.6872832059981004, + "rewards/mask_iou_reward": 0.8436416029990502, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4964563250541687, + "rewards/thk_ans_format_reward": 1.0, + "step": 2027, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.125, + "epoch": 6.84991568296796, + "grad_norm": 16.273760978509856, + "kl": 0.3974609375, + "learning_rate": 4.29054054054054e-07, + "loss": 0.0004, + "reward": 3.4314554929733276, + "reward_std": 0.05612679943442345, + "rewards/final_reward": 1.9202637571991077, + "rewards/mask_iou_reward": 0.9601318785995538, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4314554333686829, + "rewards/thk_ans_format_reward": 1.0, + "step": 2028, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.78125, + "epoch": 6.853288364249578, + "grad_norm": 9.154995082158772, + "kl": 0.3916015625, + "learning_rate": 4.287725225225225e-07, + "loss": 0.0004, + "reward": 3.800302505493164, + "reward_std": 0.01082283305004239, + "rewards/final_reward": 1.8819881384430022, + "rewards/mask_iou_reward": 0.9409940692215011, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8003026843070984, + "rewards/thk_ans_format_reward": 1.0, + "step": 2029, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.68750762939453, + "epoch": 6.856661045531197, + "grad_norm": 10.432191975171135, + "kl": 0.4560546875, + "learning_rate": 4.2849099099099095e-07, + "loss": 0.0004, + "reward": 3.822153329849243, + "reward_std": 0.02899275626987219, + "rewards/final_reward": 1.9186536417083446, + "rewards/mask_iou_reward": 0.9593268208541723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8221532106399536, + "rewards/thk_ans_format_reward": 1.0, + "step": 2030, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.71875762939453, + "epoch": 6.860033726812816, + "grad_norm": 9.532886988046716, + "kl": 0.45703125, + "learning_rate": 4.2820945945945946e-07, + "loss": 0.0005, + "reward": 3.441614031791687, + "reward_std": 0.18240241333842278, + "rewards/final_reward": 1.3654729554678962, + "rewards/mask_iou_reward": 0.6827364777339481, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4520307183265686, + "rewards/thk_ans_format_reward": 1.0, + "step": 2031, + "think_completion_length": 7.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.77083587646484, + "epoch": 6.863406408094435, + "grad_norm": 9.693404018286879, + "kl": 0.462890625, + "learning_rate": 4.279279279279279e-07, + "loss": 0.0005, + "reward": 3.097666382789612, + "reward_std": 0.1792236566543579, + "rewards/final_reward": 0.7095118455592196, + "rewards/mask_iou_reward": 0.3547559227796098, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0976662635803223, + "rewards/thk_ans_format_reward": 1.0, + "step": 2032, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.9791717529297, + "epoch": 6.866779089376054, + "grad_norm": 6.292404625002513, + "kl": 0.5263671875, + "learning_rate": 4.276463963963964e-07, + "loss": 0.0005, + "reward": 3.4150606393814087, + "reward_std": 0.08291511330753565, + "rewards/final_reward": 1.8709507809260542, + "rewards/mask_iou_reward": 0.9354753904630271, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4150602221488953, + "rewards/thk_ans_format_reward": 1.0, + "step": 2033, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.87500762939453, + "epoch": 6.870151770657673, + "grad_norm": 25.59875786307666, + "kl": 0.427734375, + "learning_rate": 4.2736486486486484e-07, + "loss": 0.0004, + "reward": 3.5064250230789185, + "reward_std": 0.32452040165662766, + "rewards/final_reward": 1.3838483755187423, + "rewards/mask_iou_reward": 0.6919241877593711, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5480916500091553, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2034, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.4791717529297, + "epoch": 6.873524451939292, + "grad_norm": 16.83175076438212, + "kl": 0.607421875, + "learning_rate": 4.270833333333333e-07, + "loss": 0.0006, + "reward": 3.4737848043441772, + "reward_std": 0.10801901668310165, + "rewards/final_reward": 1.1984207695179063, + "rewards/mask_iou_reward": 0.5992103847589532, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.473784863948822, + "rewards/thk_ans_format_reward": 1.0, + "step": 2035, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.58333587646484, + "epoch": 6.87689713322091, + "grad_norm": 23.664794856462887, + "kl": 0.4716796875, + "learning_rate": 4.268018018018018e-07, + "loss": 0.0005, + "reward": 3.7692642211914062, + "reward_std": 0.06182833015918732, + "rewards/final_reward": 1.8681330269338021, + "rewards/mask_iou_reward": 0.9340665134669011, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7692642211914062, + "rewards/thk_ans_format_reward": 1.0, + "step": 2036, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.5416717529297, + "epoch": 6.88026981450253, + "grad_norm": 4.613817355642268, + "kl": 0.4453125, + "learning_rate": 4.265202702702703e-07, + "loss": 0.0005, + "reward": 3.53538978099823, + "reward_std": 0.09306821972131729, + "rewards/final_reward": 1.388622126765915, + "rewards/mask_iou_reward": 0.6943110633829574, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5353897213935852, + "rewards/thk_ans_format_reward": 1.0, + "step": 2037, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.0833396911621, + "epoch": 6.883642495784148, + "grad_norm": 7.311229770984157, + "kl": 0.580078125, + "learning_rate": 4.2623873873873874e-07, + "loss": 0.0006, + "reward": 3.4610273838043213, + "reward_std": 0.14541162177920341, + "rewards/final_reward": 1.5367615667478312, + "rewards/mask_iou_reward": 0.7683807833739156, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.461027443408966, + "rewards/thk_ans_format_reward": 1.0, + "step": 2038, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.50000762939453, + "epoch": 6.887015177065767, + "grad_norm": 28.29948794364883, + "kl": 1.51171875, + "learning_rate": 4.259572072072072e-07, + "loss": 0.0015, + "reward": 3.4711467027664185, + "reward_std": 0.14650648832321167, + "rewards/final_reward": 1.0551088381709772, + "rewards/mask_iou_reward": 0.5275544190854886, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4711466431617737, + "rewards/thk_ans_format_reward": 1.0, + "step": 2039, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.61459350585938, + "epoch": 6.8903878583473865, + "grad_norm": 6.459950948608125, + "kl": 0.4189453125, + "learning_rate": 4.2567567567567566e-07, + "loss": 0.0004, + "reward": 3.627356767654419, + "reward_std": 0.2032754383981228, + "rewards/final_reward": 1.5017048117281278, + "rewards/mask_iou_reward": 0.7508524058640639, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6481899619102478, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2040, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.23958587646484, + "epoch": 6.893760539629005, + "grad_norm": 11.212679981209691, + "kl": 0.4052734375, + "learning_rate": 4.2539414414414417e-07, + "loss": 0.0004, + "reward": 3.6127889156341553, + "reward_std": 0.05332676135003567, + "rewards/final_reward": 1.7987581374357826, + "rewards/mask_iou_reward": 0.8993790687178913, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6127886772155762, + "rewards/thk_ans_format_reward": 1.0, + "step": 2041, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.9791717529297, + "epoch": 6.897133220910624, + "grad_norm": 15.159521023445356, + "kl": 0.4052734375, + "learning_rate": 4.2511261261261263e-07, + "loss": 0.0004, + "reward": 3.6380761861801147, + "reward_std": 0.048528952058404684, + "rewards/final_reward": 1.9370697683883562, + "rewards/mask_iou_reward": 0.9685348841941781, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6380762457847595, + "rewards/thk_ans_format_reward": 1.0, + "step": 2042, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.96875762939453, + "epoch": 6.900505902192243, + "grad_norm": 8.981046099187886, + "kl": 0.369140625, + "learning_rate": 4.2483108108108104e-07, + "loss": 0.0004, + "reward": 3.67224383354187, + "reward_std": 0.11439605057239532, + "rewards/final_reward": 1.8130440471669833, + "rewards/mask_iou_reward": 0.9065220235834917, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6722437143325806, + "rewards/thk_ans_format_reward": 1.0, + "step": 2043, + "think_completion_length": 7.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.9791717529297, + "epoch": 6.903878583473862, + "grad_norm": 8.722632525029411, + "kl": 0.3701171875, + "learning_rate": 4.245495495495495e-07, + "loss": 0.0004, + "reward": 3.5095438957214355, + "reward_std": 0.25895553827285767, + "rewards/final_reward": 1.8349615960203192, + "rewards/mask_iou_reward": 0.9174807980101596, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5303771495819092, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2044, + "think_completion_length": 6.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.18750762939453, + "epoch": 6.90725126475548, + "grad_norm": 5.767256824016995, + "kl": 0.400390625, + "learning_rate": 4.2426801801801796e-07, + "loss": 0.0004, + "reward": 3.5061161518096924, + "reward_std": 0.023271950893104076, + "rewards/final_reward": 1.279184815264447, + "rewards/mask_iou_reward": 0.6395924076322235, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5061162114143372, + "rewards/thk_ans_format_reward": 1.0, + "step": 2045, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.59376525878906, + "epoch": 6.9106239460370995, + "grad_norm": 12.672614787199961, + "kl": 0.5947265625, + "learning_rate": 4.239864864864865e-07, + "loss": 0.0006, + "reward": 3.4643458127975464, + "reward_std": 0.06347063556313515, + "rewards/final_reward": 1.1290433724283133, + "rewards/mask_iou_reward": 0.5645216862141567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4643457531929016, + "rewards/thk_ans_format_reward": 1.0, + "step": 2046, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.17709350585938, + "epoch": 6.913996627318719, + "grad_norm": 50.46641016769619, + "kl": 0.3564453125, + "learning_rate": 4.2370495495495494e-07, + "loss": 0.0004, + "reward": 3.238726854324341, + "reward_std": 0.11985756456851959, + "rewards/final_reward": 1.1162085397945747, + "rewards/mask_iou_reward": 0.5581042698972873, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2387269139289856, + "rewards/thk_ans_format_reward": 1.0, + "step": 2047, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.50000762939453, + "epoch": 6.917369308600337, + "grad_norm": 22.18032940628456, + "kl": 0.4208984375, + "learning_rate": 4.234234234234234e-07, + "loss": 0.0004, + "reward": 3.5871880054473877, + "reward_std": 0.12817983329296112, + "rewards/final_reward": 1.545037883939106, + "rewards/mask_iou_reward": 0.772518941969553, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.587187945842743, + "rewards/thk_ans_format_reward": 1.0, + "step": 2048, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.1666717529297, + "epoch": 6.920741989881956, + "grad_norm": 28.427981286178998, + "kl": 0.4033203125, + "learning_rate": 4.2314189189189186e-07, + "loss": 0.0004, + "reward": 3.621571660041809, + "reward_std": 0.020552618894726038, + "rewards/final_reward": 1.3964029672048128, + "rewards/mask_iou_reward": 0.6982014836024064, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6215715408325195, + "rewards/thk_ans_format_reward": 1.0, + "step": 2049, + "think_completion_length": 6.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.83333587646484, + "epoch": 6.924114671163575, + "grad_norm": 9.362320711061372, + "kl": 0.4072265625, + "learning_rate": 4.228603603603603e-07, + "loss": 0.0004, + "reward": 3.4834201335906982, + "reward_std": 0.14708335511386395, + "rewards/final_reward": 1.827577067771653, + "rewards/mask_iou_reward": 0.9137885338858265, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4834200739860535, + "rewards/thk_ans_format_reward": 1.0, + "step": 2050, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.80208587646484, + "epoch": 6.927487352445194, + "grad_norm": 13.204004703332789, + "kl": 0.419921875, + "learning_rate": 4.2257882882882883e-07, + "loss": 0.0004, + "reward": 3.435231924057007, + "reward_std": 0.07278600335121155, + "rewards/final_reward": 1.6471412575629387, + "rewards/mask_iou_reward": 0.8235706287814694, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4352316856384277, + "rewards/thk_ans_format_reward": 1.0, + "step": 2051, + "think_completion_length": 7.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.90625381469727, + "epoch": 6.9308600337268125, + "grad_norm": 9.33496190379643, + "kl": 0.447265625, + "learning_rate": 4.222972972972973e-07, + "loss": 0.0005, + "reward": 3.709159255027771, + "reward_std": 0.03617256507277489, + "rewards/final_reward": 1.7899409602624758, + "rewards/mask_iou_reward": 0.8949704801312379, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.70915949344635, + "rewards/thk_ans_format_reward": 1.0, + "step": 2052, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.8854217529297, + "epoch": 6.934232715008432, + "grad_norm": 23.282238466862477, + "kl": 0.5009765625, + "learning_rate": 4.2201576576576575e-07, + "loss": 0.0005, + "reward": 3.610919237136841, + "reward_std": 0.044389775954186916, + "rewards/final_reward": 1.2384339013913657, + "rewards/mask_iou_reward": 0.6192169506956828, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6109193563461304, + "rewards/thk_ans_format_reward": 1.0, + "step": 2053, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.09375762939453, + "epoch": 6.937605396290051, + "grad_norm": 12.724463055365122, + "kl": 0.4013671875, + "learning_rate": 4.217342342342342e-07, + "loss": 0.0004, + "reward": 3.6631650924682617, + "reward_std": 0.13815235905349255, + "rewards/final_reward": 1.7990066628365704, + "rewards/mask_iou_reward": 0.8995033314182852, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.683998465538025, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2054, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.70833587646484, + "epoch": 6.940978077571669, + "grad_norm": 8.099159270808352, + "kl": 0.48046875, + "learning_rate": 4.214527027027027e-07, + "loss": 0.0005, + "reward": 3.4705790281295776, + "reward_std": 0.06602546386420727, + "rewards/final_reward": 1.3726603388502736, + "rewards/mask_iou_reward": 0.6863301694251368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4705789685249329, + "rewards/thk_ans_format_reward": 1.0, + "step": 2055, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.52084350585938, + "epoch": 6.944350758853289, + "grad_norm": 11.396772634783654, + "kl": 0.376953125, + "learning_rate": 4.2117117117117114e-07, + "loss": 0.0004, + "reward": 3.3403568267822266, + "reward_std": 0.12214644998311996, + "rewards/final_reward": 1.7037964388214435, + "rewards/mask_iou_reward": 0.8518982194107217, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3403565883636475, + "rewards/thk_ans_format_reward": 1.0, + "step": 2056, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.75, + "epoch": 6.947723440134907, + "grad_norm": 11.978637065897662, + "kl": 0.3828125, + "learning_rate": 4.2088963963963965e-07, + "loss": 0.0004, + "reward": 3.2751539945602417, + "reward_std": 0.173908993601799, + "rewards/final_reward": 1.8127977999330058, + "rewards/mask_iou_reward": 0.9063988999665029, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2959871888160706, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2057, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.58333587646484, + "epoch": 6.951096121416526, + "grad_norm": 11.433820469930387, + "kl": 0.6953125, + "learning_rate": 4.206081081081081e-07, + "loss": 0.0007, + "reward": 3.478344202041626, + "reward_std": 0.10730455070734024, + "rewards/final_reward": 1.515322556277861, + "rewards/mask_iou_reward": 0.7576612781389305, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4783442616462708, + "rewards/thk_ans_format_reward": 1.0, + "step": 2058, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.6979217529297, + "epoch": 6.954468802698145, + "grad_norm": 7.33352107841849, + "kl": 0.3896484375, + "learning_rate": 4.2032657657657657e-07, + "loss": 0.0004, + "reward": 3.4388267993927, + "reward_std": 0.004863133071921766, + "rewards/final_reward": 0.9464367668835442, + "rewards/mask_iou_reward": 0.4732183834417721, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4388265013694763, + "rewards/thk_ans_format_reward": 1.0, + "step": 2059, + "think_completion_length": 6.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.34375762939453, + "epoch": 6.957841483979764, + "grad_norm": 36.48687000543652, + "kl": 0.4052734375, + "learning_rate": 4.2004504504504503e-07, + "loss": 0.0004, + "reward": 3.280514121055603, + "reward_std": 0.16359587758779526, + "rewards/final_reward": 1.5899992914228696, + "rewards/mask_iou_reward": 0.7949996457114348, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.280514121055603, + "rewards/thk_ans_format_reward": 1.0, + "step": 2060, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.2604217529297, + "epoch": 6.961214165261383, + "grad_norm": 6.0776006598667545, + "kl": 0.36328125, + "learning_rate": 4.197635135135135e-07, + "loss": 0.0004, + "reward": 3.6308919191360474, + "reward_std": 0.06040792353451252, + "rewards/final_reward": 1.7377528025504996, + "rewards/mask_iou_reward": 0.8688764012752498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6308916807174683, + "rewards/thk_ans_format_reward": 1.0, + "step": 2061, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.6354217529297, + "epoch": 6.964586846543002, + "grad_norm": 14.250731643162881, + "kl": 0.509765625, + "learning_rate": 4.19481981981982e-07, + "loss": 0.0005, + "reward": 3.692965269088745, + "reward_std": 0.039796837605535984, + "rewards/final_reward": 1.9779923740620449, + "rewards/mask_iou_reward": 0.9889961870310224, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6929653882980347, + "rewards/thk_ans_format_reward": 1.0, + "step": 2062, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.64584350585938, + "epoch": 6.967959527824621, + "grad_norm": 25.0275801936448, + "kl": 0.4345703125, + "learning_rate": 4.192004504504504e-07, + "loss": 0.0004, + "reward": 3.548501491546631, + "reward_std": 0.12630417943000793, + "rewards/final_reward": 1.5421603575578033, + "rewards/mask_iou_reward": 0.7710801787789017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5485016107559204, + "rewards/thk_ans_format_reward": 1.0, + "step": 2063, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.07292938232422, + "epoch": 6.971332209106239, + "grad_norm": 9.025736068743793, + "kl": 0.470703125, + "learning_rate": 4.189189189189189e-07, + "loss": 0.0005, + "reward": 3.666631579399109, + "reward_std": 0.04170432314276695, + "rewards/final_reward": 1.6901646951117226, + "rewards/mask_iou_reward": 0.8450823475558613, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6666315197944641, + "rewards/thk_ans_format_reward": 1.0, + "step": 2064, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.67708587646484, + "epoch": 6.974704890387859, + "grad_norm": 12.870468548233324, + "kl": 0.4599609375, + "learning_rate": 4.1863738738738733e-07, + "loss": 0.0005, + "reward": 3.4917114973068237, + "reward_std": 0.12080695852637291, + "rewards/final_reward": 1.628104300970692, + "rewards/mask_iou_reward": 0.814052150485346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4917116165161133, + "rewards/thk_ans_format_reward": 1.0, + "step": 2065, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.05208587646484, + "epoch": 6.978077571669477, + "grad_norm": 6.25613270559228, + "kl": 0.41015625, + "learning_rate": 4.183558558558558e-07, + "loss": 0.0005, + "reward": 3.553234815597534, + "reward_std": 0.027653097175061703, + "rewards/final_reward": 1.6949132902041693, + "rewards/mask_iou_reward": 0.8474566451020846, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.553234577178955, + "rewards/thk_ans_format_reward": 1.0, + "step": 2066, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.625, + "epoch": 6.981450252951096, + "grad_norm": 10.920688669993615, + "kl": 0.4814453125, + "learning_rate": 4.180743243243243e-07, + "loss": 0.0005, + "reward": 3.600623846054077, + "reward_std": 0.11215956509113312, + "rewards/final_reward": 1.2300481204617575, + "rewards/mask_iou_reward": 0.6150240602308787, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6006236672401428, + "rewards/thk_ans_format_reward": 1.0, + "step": 2067, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.64583587646484, + "epoch": 6.9848229342327155, + "grad_norm": 14.28349519092112, + "kl": 0.630859375, + "learning_rate": 4.1779279279279277e-07, + "loss": 0.0006, + "reward": 3.2274869680404663, + "reward_std": 0.1608312577009201, + "rewards/final_reward": 1.013269042755018, + "rewards/mask_iou_reward": 0.506634521377509, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.227486789226532, + "rewards/thk_ans_format_reward": 1.0, + "step": 2068, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.82291793823242, + "epoch": 6.988195615514334, + "grad_norm": 8.99894758581553, + "kl": 0.45703125, + "learning_rate": 4.1751126126126123e-07, + "loss": 0.0005, + "reward": 3.7834309339523315, + "reward_std": 0.025212008506059647, + "rewards/final_reward": 1.6631238092734235, + "rewards/mask_iou_reward": 0.8315619046367118, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7834311127662659, + "rewards/thk_ans_format_reward": 1.0, + "step": 2069, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.20833587646484, + "epoch": 6.991568296795953, + "grad_norm": 47.632513811120354, + "kl": 0.521484375, + "learning_rate": 4.172297297297297e-07, + "loss": 0.0006, + "reward": 3.3873326778411865, + "reward_std": 0.28176888823509216, + "rewards/final_reward": 1.8443692369468496, + "rewards/mask_iou_reward": 0.9221846184734248, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3873324990272522, + "rewards/thk_ans_format_reward": 1.0, + "step": 2070, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.625, + "epoch": 6.9949409780775715, + "grad_norm": 7.669381095104932, + "kl": 0.458984375, + "learning_rate": 4.1694819819819815e-07, + "loss": 0.0005, + "reward": 3.062989354133606, + "reward_std": 0.08837875723838806, + "rewards/final_reward": 1.4319442864302787, + "rewards/mask_iou_reward": 0.7159721432151394, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0629891753196716, + "rewards/thk_ans_format_reward": 1.0, + "step": 2071, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.92105102539062, + "epoch": 6.998313659359191, + "grad_norm": 47.753976767913095, + "kl": 0.490234375, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.0005, + "reward": 3.8917770385742188, + "reward_std": 0.017438477370887995, + "rewards/final_reward": 1.8893658419395065, + "rewards/mask_iou_reward": 0.9446829209697533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8917770385742188, + "rewards/thk_ans_format_reward": 1.0, + "step": 2072, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.36458587646484, + "epoch": 7.003372681281619, + "grad_norm": 39.113360226616884, + "kl": 0.548828125, + "learning_rate": 4.163851351351351e-07, + "loss": 0.0006, + "reward": 3.5466045141220093, + "reward_std": 0.10832555405795574, + "rewards/final_reward": 1.4662659790531536, + "rewards/mask_iou_reward": 0.7331329895265768, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5466048121452332, + "rewards/thk_ans_format_reward": 1.0, + "step": 2073, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.4166717529297, + "epoch": 7.006745362563238, + "grad_norm": 8.053807818313302, + "kl": 0.4931640625, + "learning_rate": 4.161036036036036e-07, + "loss": 0.0005, + "reward": 3.5464266538619995, + "reward_std": 0.05879717133939266, + "rewards/final_reward": 1.8675432805313172, + "rewards/mask_iou_reward": 0.9337716402656586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.54642653465271, + "rewards/thk_ans_format_reward": 1.0, + "step": 2074, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.55208587646484, + "epoch": 7.010118043844857, + "grad_norm": 8.633699358022847, + "kl": 0.431640625, + "learning_rate": 4.1582207207207205e-07, + "loss": 0.0004, + "reward": 3.399770975112915, + "reward_std": 0.1902711447328329, + "rewards/final_reward": 1.408978737907396, + "rewards/mask_iou_reward": 0.704489368953698, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.420604407787323, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2075, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.0729217529297, + "epoch": 7.013490725126475, + "grad_norm": 12.270572132816845, + "kl": 0.439453125, + "learning_rate": 4.155405405405405e-07, + "loss": 0.0004, + "reward": 3.4145290851593018, + "reward_std": 0.03906204830855131, + "rewards/final_reward": 1.0129971435762222, + "rewards/mask_iou_reward": 0.5064985717881111, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4145292043685913, + "rewards/thk_ans_format_reward": 1.0, + "step": 2076, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.40625762939453, + "epoch": 7.016863406408095, + "grad_norm": 45.624948252596006, + "kl": 0.58203125, + "learning_rate": 4.15259009009009e-07, + "loss": 0.0006, + "reward": 3.3474459648132324, + "reward_std": 0.25892218202352524, + "rewards/final_reward": 1.5210914066116574, + "rewards/mask_iou_reward": 0.7605457033058287, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3474458456039429, + "rewards/thk_ans_format_reward": 1.0, + "step": 2077, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.1041717529297, + "epoch": 7.020236087689713, + "grad_norm": 8.878850724523403, + "kl": 0.4873046875, + "learning_rate": 4.149774774774775e-07, + "loss": 0.0005, + "reward": 3.6284542083740234, + "reward_std": 0.056491006165742874, + "rewards/final_reward": 1.72353412290081, + "rewards/mask_iou_reward": 0.861767061450405, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6284542083740234, + "rewards/thk_ans_format_reward": 1.0, + "step": 2078, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.50000762939453, + "epoch": 7.023608768971332, + "grad_norm": 7.75147011450966, + "kl": 0.4677734375, + "learning_rate": 4.1469594594594594e-07, + "loss": 0.0005, + "reward": 3.2908732891082764, + "reward_std": 0.09442063421010971, + "rewards/final_reward": 1.541150831055468, + "rewards/mask_iou_reward": 0.770575415527734, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2908731698989868, + "rewards/thk_ans_format_reward": 1.0, + "step": 2079, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.5625, + "epoch": 7.0269814502529515, + "grad_norm": 10.132125586188312, + "kl": 0.509765625, + "learning_rate": 4.144144144144144e-07, + "loss": 0.0005, + "reward": 3.430995225906372, + "reward_std": 0.07199489884078503, + "rewards/final_reward": 0.8899003687201831, + "rewards/mask_iou_reward": 0.44495018436009154, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.430995225906372, + "rewards/thk_ans_format_reward": 1.0, + "step": 2080, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.3854217529297, + "epoch": 7.03035413153457, + "grad_norm": 10.19011933825818, + "kl": 0.41796875, + "learning_rate": 4.1413288288288286e-07, + "loss": 0.0004, + "reward": 3.6006062030792236, + "reward_std": 0.14285754412412643, + "rewards/final_reward": 1.189594890267856, + "rewards/mask_iou_reward": 0.594797445133928, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.600606083869934, + "rewards/thk_ans_format_reward": 1.0, + "step": 2081, + "think_completion_length": 6.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.7395896911621, + "epoch": 7.033726812816189, + "grad_norm": 7.733210854177587, + "kl": 0.4814453125, + "learning_rate": 4.138513513513514e-07, + "loss": 0.0005, + "reward": 3.3528921604156494, + "reward_std": 0.09503498487174511, + "rewards/final_reward": 1.043915528277389, + "rewards/mask_iou_reward": 0.5219577641386945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3528921604156494, + "rewards/thk_ans_format_reward": 1.0, + "step": 2082, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.09375762939453, + "epoch": 7.0370994940978076, + "grad_norm": 19.94042549315522, + "kl": 0.8447265625, + "learning_rate": 4.135698198198198e-07, + "loss": 0.0008, + "reward": 3.5029793977737427, + "reward_std": 0.08090989291667938, + "rewards/final_reward": 1.3858858070205202, + "rewards/mask_iou_reward": 0.6929429035102601, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5029793977737427, + "rewards/thk_ans_format_reward": 1.0, + "step": 2083, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.41666793823242, + "epoch": 7.040472175379427, + "grad_norm": 10.718417213008133, + "kl": 0.4501953125, + "learning_rate": 4.1328828828828825e-07, + "loss": 0.0005, + "reward": 3.7191574573516846, + "reward_std": 0.05856491345912218, + "rewards/final_reward": 1.728180431981604, + "rewards/mask_iou_reward": 0.864090215990802, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7191572785377502, + "rewards/thk_ans_format_reward": 1.0, + "step": 2084, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.53125381469727, + "epoch": 7.043844856661045, + "grad_norm": 8.713561379718907, + "kl": 0.4443359375, + "learning_rate": 4.130067567567567e-07, + "loss": 0.0004, + "reward": 3.4879956245422363, + "reward_std": 0.1195925809442997, + "rewards/final_reward": 1.8511911442186961, + "rewards/mask_iou_reward": 0.9255955721093481, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.487995684146881, + "rewards/thk_ans_format_reward": 1.0, + "step": 2085, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.80208587646484, + "epoch": 7.0472175379426645, + "grad_norm": 10.716632561067437, + "kl": 0.4453125, + "learning_rate": 4.1272522522522517e-07, + "loss": 0.0004, + "reward": 3.579306960105896, + "reward_std": 0.1012413278222084, + "rewards/final_reward": 1.7639435770020846, + "rewards/mask_iou_reward": 0.8819717885010423, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5793068408966064, + "rewards/thk_ans_format_reward": 1.0, + "step": 2086, + "think_completion_length": 7.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.96875381469727, + "epoch": 7.050590219224283, + "grad_norm": 8.659125001587432, + "kl": 0.421875, + "learning_rate": 4.124436936936937e-07, + "loss": 0.0004, + "reward": 3.30793559551239, + "reward_std": 0.16385822743177414, + "rewards/final_reward": 1.435798409073462, + "rewards/mask_iou_reward": 0.717899204536731, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3079355955123901, + "rewards/thk_ans_format_reward": 1.0, + "step": 2087, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.25000762939453, + "epoch": 7.053962900505902, + "grad_norm": 10.376524327165193, + "kl": 0.4296875, + "learning_rate": 4.1216216216216214e-07, + "loss": 0.0004, + "reward": 3.631484866142273, + "reward_std": 0.037298865616321564, + "rewards/final_reward": 1.8119502353295949, + "rewards/mask_iou_reward": 0.9059751176647974, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6314847469329834, + "rewards/thk_ans_format_reward": 1.0, + "step": 2088, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.44791793823242, + "epoch": 7.057335581787521, + "grad_norm": 9.465415172309807, + "kl": 0.51953125, + "learning_rate": 4.118806306306306e-07, + "loss": 0.0005, + "reward": 3.8078432083129883, + "reward_std": 0.010188735090196133, + "rewards/final_reward": 1.8143336565431716, + "rewards/mask_iou_reward": 0.9071668282715858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8078433871269226, + "rewards/thk_ans_format_reward": 1.0, + "step": 2089, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.95834350585938, + "epoch": 7.06070826306914, + "grad_norm": 22.620576396536165, + "kl": 0.4453125, + "learning_rate": 4.1159909909909906e-07, + "loss": 0.0005, + "reward": 3.7463499307632446, + "reward_std": 0.047850754112005234, + "rewards/final_reward": 1.8501468568459538, + "rewards/mask_iou_reward": 0.9250734284229769, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7463498711585999, + "rewards/thk_ans_format_reward": 1.0, + "step": 2090, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.77083587646484, + "epoch": 7.064080944350759, + "grad_norm": 8.978667930484319, + "kl": 0.5849609375, + "learning_rate": 4.113175675675675e-07, + "loss": 0.0006, + "reward": 3.4429363012313843, + "reward_std": 0.021207381039857864, + "rewards/final_reward": 1.0751444982965108, + "rewards/mask_iou_reward": 0.5375722491482554, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4429363012313843, + "rewards/thk_ans_format_reward": 1.0, + "step": 2091, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.1875, + "epoch": 7.0674536256323774, + "grad_norm": 72.53993116517377, + "kl": 0.4384765625, + "learning_rate": 4.1103603603603604e-07, + "loss": 0.0005, + "reward": 3.5110002756118774, + "reward_std": 0.07376820594072342, + "rewards/final_reward": 0.7198397571572596, + "rewards/mask_iou_reward": 0.3599198785786298, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5110000371932983, + "rewards/thk_ans_format_reward": 1.0, + "step": 2092, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.04167175292969, + "epoch": 7.070826306913997, + "grad_norm": 6.7448324868525145, + "kl": 0.4599609375, + "learning_rate": 4.107545045045045e-07, + "loss": 0.0005, + "reward": 3.767333507537842, + "reward_std": 0.031111277639865875, + "rewards/final_reward": 1.6456698216256496, + "rewards/mask_iou_reward": 0.8228349108128248, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7673333883285522, + "rewards/thk_ans_format_reward": 1.0, + "step": 2093, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.67708587646484, + "epoch": 7.074198988195615, + "grad_norm": 31.687531572728126, + "kl": 0.4296875, + "learning_rate": 4.1047297297297296e-07, + "loss": 0.0004, + "reward": 3.6214983463287354, + "reward_std": 0.07041146233677864, + "rewards/final_reward": 1.50514327525262, + "rewards/mask_iou_reward": 0.75257163762631, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.621498167514801, + "rewards/thk_ans_format_reward": 1.0, + "step": 2094, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.3229217529297, + "epoch": 7.077571669477234, + "grad_norm": 71.86638065587701, + "kl": 0.53125, + "learning_rate": 4.101914414414414e-07, + "loss": 0.0005, + "reward": 2.968865752220154, + "reward_std": 0.08706454932689667, + "rewards/final_reward": 0.9521295059368908, + "rewards/mask_iou_reward": 0.4760647529684454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9688657224178314, + "rewards/thk_ans_format_reward": 1.0, + "step": 2095, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.65625, + "epoch": 7.080944350758854, + "grad_norm": 31.190633686564574, + "kl": 0.4873046875, + "learning_rate": 4.099099099099099e-07, + "loss": 0.0005, + "reward": 3.5734691619873047, + "reward_std": 0.06589183211326599, + "rewards/final_reward": 1.9099139483753502, + "rewards/mask_iou_reward": 0.9549569741876751, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5734692811965942, + "rewards/thk_ans_format_reward": 1.0, + "step": 2096, + "think_completion_length": 7.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.93750381469727, + "epoch": 7.084317032040472, + "grad_norm": 9.307520904937336, + "kl": 0.537109375, + "learning_rate": 4.096283783783784e-07, + "loss": 0.0006, + "reward": 3.633847951889038, + "reward_std": 0.1258330326527357, + "rewards/final_reward": 1.5853979707257067, + "rewards/mask_iou_reward": 0.7926989853628533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.633847951889038, + "rewards/thk_ans_format_reward": 1.0, + "step": 2097, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.5729217529297, + "epoch": 7.087689713322091, + "grad_norm": 10.44724731169259, + "kl": 0.501953125, + "learning_rate": 4.0934684684684685e-07, + "loss": 0.0005, + "reward": 3.5259032249450684, + "reward_std": 0.13000381737947464, + "rewards/final_reward": 1.8315799850789156, + "rewards/mask_iou_reward": 0.9157899925394578, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5259031057357788, + "rewards/thk_ans_format_reward": 1.0, + "step": 2098, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.48958587646484, + "epoch": 7.09106239460371, + "grad_norm": 13.639258532398406, + "kl": 0.46484375, + "learning_rate": 4.090653153153153e-07, + "loss": 0.0005, + "reward": 3.4919179677963257, + "reward_std": 0.05438784509897232, + "rewards/final_reward": 1.593466845640541, + "rewards/mask_iou_reward": 0.7967334228202705, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4919179677963257, + "rewards/thk_ans_format_reward": 1.0, + "step": 2099, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.83334350585938, + "epoch": 7.094435075885329, + "grad_norm": 8.851197616606413, + "kl": 0.62109375, + "learning_rate": 4.087837837837838e-07, + "loss": 0.0006, + "reward": 3.5763626098632812, + "reward_std": 0.11013033799827099, + "rewards/final_reward": 1.5660547621116407, + "rewards/mask_iou_reward": 0.7830273810558204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5763624906539917, + "rewards/thk_ans_format_reward": 1.0, + "step": 2100, + "think_completion_length": 7.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.79167938232422, + "epoch": 7.097807757166947, + "grad_norm": 8.102484022277629, + "kl": 0.5068359375, + "learning_rate": 4.0850225225225224e-07, + "loss": 0.0005, + "reward": 3.2421382665634155, + "reward_std": 0.032540466636419296, + "rewards/final_reward": 1.1041041897317303, + "rewards/mask_iou_reward": 0.5520520948658652, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2421382665634155, + "rewards/thk_ans_format_reward": 1.0, + "step": 2101, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.43750762939453, + "epoch": 7.101180438448567, + "grad_norm": 43.703709847710556, + "kl": 0.4111328125, + "learning_rate": 4.082207207207207e-07, + "loss": 0.0004, + "reward": 3.2417622804641724, + "reward_std": 0.04179760627448559, + "rewards/final_reward": 1.284407073054897, + "rewards/mask_iou_reward": 0.6422035365274485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2417622804641724, + "rewards/thk_ans_format_reward": 1.0, + "step": 2102, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.9375, + "epoch": 7.104553119730186, + "grad_norm": 15.530204865371694, + "kl": 0.45703125, + "learning_rate": 4.0793918918918916e-07, + "loss": 0.0005, + "reward": 3.7399545907974243, + "reward_std": 0.04067044984549284, + "rewards/final_reward": 1.7588847560236927, + "rewards/mask_iou_reward": 0.8794423780118463, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7399544715881348, + "rewards/thk_ans_format_reward": 1.0, + "step": 2103, + "think_completion_length": 6.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.9791717529297, + "epoch": 7.107925801011804, + "grad_norm": 17.465844081341633, + "kl": 0.60546875, + "learning_rate": 4.076576576576576e-07, + "loss": 0.0006, + "reward": 3.3154332637786865, + "reward_std": 0.046184979379177094, + "rewards/final_reward": 1.0295719726963035, + "rewards/mask_iou_reward": 0.5147859863481518, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3154330849647522, + "rewards/thk_ans_format_reward": 1.0, + "step": 2104, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.67708587646484, + "epoch": 7.1112984822934235, + "grad_norm": 11.463666020587564, + "kl": 0.376953125, + "learning_rate": 4.073761261261261e-07, + "loss": 0.0004, + "reward": 3.6639139652252197, + "reward_std": 0.06345823779702187, + "rewards/final_reward": 1.5036986046138827, + "rewards/mask_iou_reward": 0.7518493023069414, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6639137864112854, + "rewards/thk_ans_format_reward": 1.0, + "step": 2105, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.09375381469727, + "epoch": 7.114671163575042, + "grad_norm": 5.965543170545409, + "kl": 0.4853515625, + "learning_rate": 4.0709459459459454e-07, + "loss": 0.0005, + "reward": 3.514773368835449, + "reward_std": 0.02285183686763048, + "rewards/final_reward": 1.8121782155019046, + "rewards/mask_iou_reward": 0.9060891077509523, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5147733688354492, + "rewards/thk_ans_format_reward": 1.0, + "step": 2106, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.9479217529297, + "epoch": 7.118043844856661, + "grad_norm": 26.377210709034202, + "kl": 0.4423828125, + "learning_rate": 4.0681306306306305e-07, + "loss": 0.0004, + "reward": 3.6755313873291016, + "reward_std": 0.1012740321457386, + "rewards/final_reward": 1.9159749806838375, + "rewards/mask_iou_reward": 0.9579874903419188, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6755311489105225, + "rewards/thk_ans_format_reward": 1.0, + "step": 2107, + "think_completion_length": 6.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.95834350585938, + "epoch": 7.12141652613828, + "grad_norm": 9.271016248200919, + "kl": 0.478515625, + "learning_rate": 4.065315315315315e-07, + "loss": 0.0005, + "reward": 3.7043673992156982, + "reward_std": 0.03634229302406311, + "rewards/final_reward": 1.9564682308637495, + "rewards/mask_iou_reward": 0.9782341154318748, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7043673396110535, + "rewards/thk_ans_format_reward": 1.0, + "step": 2108, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.3229217529297, + "epoch": 7.124789207419899, + "grad_norm": 25.351969828606347, + "kl": 0.416015625, + "learning_rate": 4.0625e-07, + "loss": 0.0004, + "reward": 3.440119981765747, + "reward_std": 0.15209950879216194, + "rewards/final_reward": 1.4271350135374492, + "rewards/mask_iou_reward": 0.7135675067687246, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.440119981765747, + "rewards/thk_ans_format_reward": 1.0, + "step": 2109, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.92708587646484, + "epoch": 7.128161888701518, + "grad_norm": 16.288230773620306, + "kl": 0.4150390625, + "learning_rate": 4.0596846846846844e-07, + "loss": 0.0004, + "reward": 3.5552964210510254, + "reward_std": 0.04798351600766182, + "rewards/final_reward": 0.9282412127329586, + "rewards/mask_iou_reward": 0.4641206063664793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.555296540260315, + "rewards/thk_ans_format_reward": 1.0, + "step": 2110, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.5104217529297, + "epoch": 7.1315345699831365, + "grad_norm": 8.161303969531241, + "kl": 0.46484375, + "learning_rate": 4.056869369369369e-07, + "loss": 0.0005, + "reward": 3.3685712814331055, + "reward_std": 0.08867337927222252, + "rewards/final_reward": 0.8364246545390581, + "rewards/mask_iou_reward": 0.41821232726952906, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.368571400642395, + "rewards/thk_ans_format_reward": 1.0, + "step": 2111, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.4791717529297, + "epoch": 7.134907251264756, + "grad_norm": 9.576325538332526, + "kl": 0.3876953125, + "learning_rate": 4.054054054054054e-07, + "loss": 0.0004, + "reward": 3.5125324726104736, + "reward_std": 0.08074938133358955, + "rewards/final_reward": 1.516974694693673, + "rewards/mask_iou_reward": 0.7584873473468365, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5125325322151184, + "rewards/thk_ans_format_reward": 1.0, + "step": 2112, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.65625381469727, + "epoch": 7.138279932546374, + "grad_norm": 26.230253385846307, + "kl": 0.51171875, + "learning_rate": 4.0512387387387387e-07, + "loss": 0.0005, + "reward": 3.467235565185547, + "reward_std": 0.09212902188301086, + "rewards/final_reward": 1.2272662786796569, + "rewards/mask_iou_reward": 0.6136331393398284, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4672354459762573, + "rewards/thk_ans_format_reward": 1.0, + "step": 2113, + "think_completion_length": 6.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.03125381469727, + "epoch": 7.141652613827993, + "grad_norm": 20.715019889586777, + "kl": 2.5693359375, + "learning_rate": 4.0484234234234233e-07, + "loss": 0.0026, + "reward": 3.457432508468628, + "reward_std": 0.10950843244791031, + "rewards/final_reward": 1.429733310803956, + "rewards/mask_iou_reward": 0.714866655401978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4678492546081543, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2114, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.9166717529297, + "epoch": 7.145025295109612, + "grad_norm": 9.845124376157337, + "kl": 0.4306640625, + "learning_rate": 4.045608108108108e-07, + "loss": 0.0004, + "reward": 3.435527205467224, + "reward_std": 0.03152256831526756, + "rewards/final_reward": 1.55579422645883, + "rewards/mask_iou_reward": 0.777897113229415, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.435526967048645, + "rewards/thk_ans_format_reward": 1.0, + "step": 2115, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.58333587646484, + "epoch": 7.148397976391231, + "grad_norm": 94.28612906783492, + "kl": 0.4306640625, + "learning_rate": 4.0427927927927925e-07, + "loss": 0.0004, + "reward": 3.3601832389831543, + "reward_std": 0.08771881833672523, + "rewards/final_reward": 0.87061459389086, + "rewards/mask_iou_reward": 0.43530729694543, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3601831197738647, + "rewards/thk_ans_format_reward": 1.0, + "step": 2116, + "think_completion_length": 6.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.59375, + "epoch": 7.15177065767285, + "grad_norm": 10.221977457090283, + "kl": 0.4208984375, + "learning_rate": 4.0399774774774777e-07, + "loss": 0.0004, + "reward": 2.9706469774246216, + "reward_std": 0.047149766236543655, + "rewards/final_reward": 0.6018353282228965, + "rewards/mask_iou_reward": 0.30091766411144827, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9706469774246216, + "rewards/thk_ans_format_reward": 1.0, + "step": 2117, + "think_completion_length": 7.166666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.55208587646484, + "epoch": 7.155143338954469, + "grad_norm": 8.51580663824599, + "kl": 0.478515625, + "learning_rate": 4.0371621621621623e-07, + "loss": 0.0005, + "reward": 3.6264944076538086, + "reward_std": 0.1628369241952896, + "rewards/final_reward": 1.9376107862778913, + "rewards/mask_iou_reward": 0.9688053931389456, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6369110941886902, + "rewards/thk_ans_format_reward": 1.0, + "step": 2118, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.23958587646484, + "epoch": 7.158516020236088, + "grad_norm": 58.8540263103047, + "kl": 0.46875, + "learning_rate": 4.034346846846847e-07, + "loss": 0.0005, + "reward": 3.8285425901412964, + "reward_std": 0.021101244492456317, + "rewards/final_reward": 1.912153629289806, + "rewards/mask_iou_reward": 0.956076814644903, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.828542411327362, + "rewards/thk_ans_format_reward": 1.0, + "step": 2119, + "think_completion_length": 6.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.08334350585938, + "epoch": 7.161888701517706, + "grad_norm": 14.23377320752225, + "kl": 0.4228515625, + "learning_rate": 4.0315315315315315e-07, + "loss": 0.0004, + "reward": 3.592753767967224, + "reward_std": 0.06616166792809963, + "rewards/final_reward": 1.2861638111709919, + "rewards/mask_iou_reward": 0.6430819055854959, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5927537679672241, + "rewards/thk_ans_format_reward": 1.0, + "step": 2120, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.3229217529297, + "epoch": 7.165261382799326, + "grad_norm": 7.3252669737331075, + "kl": 0.5078125, + "learning_rate": 4.0287162162162156e-07, + "loss": 0.0005, + "reward": 3.5950053930282593, + "reward_std": 0.30001043528318405, + "rewards/final_reward": 1.5873910615638014, + "rewards/mask_iou_reward": 0.7936955307819007, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6158387660980225, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2121, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.5729217529297, + "epoch": 7.168634064080944, + "grad_norm": 7.638980910789786, + "kl": 0.4794921875, + "learning_rate": 4.0259009009009007e-07, + "loss": 0.0006, + "reward": 3.5733102560043335, + "reward_std": 0.0462705185636878, + "rewards/final_reward": 1.4853571579104514, + "rewards/mask_iou_reward": 0.7426785789552257, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5733102560043335, + "rewards/thk_ans_format_reward": 1.0, + "step": 2122, + "think_completion_length": 7.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.45833587646484, + "epoch": 7.172006745362563, + "grad_norm": 9.882482773527292, + "kl": 0.439453125, + "learning_rate": 4.0230855855855853e-07, + "loss": 0.0005, + "reward": 3.4895578622817993, + "reward_std": 0.0789448469877243, + "rewards/final_reward": 1.319527123891827, + "rewards/mask_iou_reward": 0.6597635619459135, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4895577430725098, + "rewards/thk_ans_format_reward": 1.0, + "step": 2123, + "think_completion_length": 6.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.3854217529297, + "epoch": 7.175379426644182, + "grad_norm": 39.426525505427065, + "kl": 0.427734375, + "learning_rate": 4.02027027027027e-07, + "loss": 0.0004, + "reward": 3.4777748584747314, + "reward_std": 0.1806269846856594, + "rewards/final_reward": 1.8703323689058926, + "rewards/mask_iou_reward": 0.9351661844529463, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4986081719398499, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2124, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.96875762939453, + "epoch": 7.178752107925801, + "grad_norm": 11.025901523498554, + "kl": 0.416015625, + "learning_rate": 4.0174549549549545e-07, + "loss": 0.0003, + "reward": 3.55265212059021, + "reward_std": 0.013125112280249596, + "rewards/final_reward": 0.9920675836555493, + "rewards/mask_iou_reward": 0.49603379182777463, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.55265212059021, + "rewards/thk_ans_format_reward": 1.0, + "step": 2125, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.68750762939453, + "epoch": 7.18212478920742, + "grad_norm": 12.019458429651026, + "kl": 0.4140625, + "learning_rate": 4.014639639639639e-07, + "loss": 0.0004, + "reward": 3.4704500436782837, + "reward_std": 0.07267389260232449, + "rewards/final_reward": 1.8845524357169587, + "rewards/mask_iou_reward": 0.9422762178584794, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.470449984073639, + "rewards/thk_ans_format_reward": 1.0, + "step": 2126, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.3854217529297, + "epoch": 7.185497470489039, + "grad_norm": 23.32054644884736, + "kl": 0.4140625, + "learning_rate": 4.011824324324324e-07, + "loss": 0.0004, + "reward": 3.5207202434539795, + "reward_std": 0.07270674407482147, + "rewards/final_reward": 1.5134364200643244, + "rewards/mask_iou_reward": 0.7567182100321622, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5207201838493347, + "rewards/thk_ans_format_reward": 1.0, + "step": 2127, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.96875762939453, + "epoch": 7.188870151770658, + "grad_norm": 233.4763662776818, + "kl": 0.447265625, + "learning_rate": 4.009009009009009e-07, + "loss": 0.0004, + "reward": 3.513588070869446, + "reward_std": 0.057372111827135086, + "rewards/final_reward": 1.375929576670419, + "rewards/mask_iou_reward": 0.6879647883352095, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5135878920555115, + "rewards/thk_ans_format_reward": 1.0, + "step": 2128, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.98958587646484, + "epoch": 7.192242833052276, + "grad_norm": 16.137783540866636, + "kl": 0.515625, + "learning_rate": 4.0061936936936935e-07, + "loss": 0.0005, + "reward": 3.594455122947693, + "reward_std": 0.05518577480688691, + "rewards/final_reward": 1.87773857431839, + "rewards/mask_iou_reward": 0.938869287159195, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.594455063343048, + "rewards/thk_ans_format_reward": 1.0, + "step": 2129, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.87500762939453, + "epoch": 7.195615514333896, + "grad_norm": 10.617842015866719, + "kl": 0.435546875, + "learning_rate": 4.003378378378378e-07, + "loss": 0.0004, + "reward": 3.1099244356155396, + "reward_std": 0.055913373827934265, + "rewards/final_reward": 0.5185217282650944, + "rewards/mask_iou_reward": 0.2592608641325472, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1099244356155396, + "rewards/thk_ans_format_reward": 1.0, + "step": 2130, + "think_completion_length": 6.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.84375762939453, + "epoch": 7.198988195615514, + "grad_norm": 13.662643697706343, + "kl": 0.484375, + "learning_rate": 4.0005630630630627e-07, + "loss": 0.0005, + "reward": 3.8299983739852905, + "reward_std": 0.01682877354323864, + "rewards/final_reward": 1.7913390549733599, + "rewards/mask_iou_reward": 0.8956695274866799, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8299983739852905, + "rewards/thk_ans_format_reward": 1.0, + "step": 2131, + "think_completion_length": 5.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.55208587646484, + "epoch": 7.202360876897133, + "grad_norm": 17.046292903262447, + "kl": 0.421875, + "learning_rate": 3.997747747747748e-07, + "loss": 0.0004, + "reward": 3.196842908859253, + "reward_std": 0.16826032102108002, + "rewards/final_reward": 1.912653454277757, + "rewards/mask_iou_reward": 0.9563267271388785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1968427896499634, + "rewards/thk_ans_format_reward": 1.0, + "step": 2132, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.75000762939453, + "epoch": 7.2057335581787525, + "grad_norm": 15.186937128391326, + "kl": 0.38671875, + "learning_rate": 3.9949324324324324e-07, + "loss": 0.0004, + "reward": 3.2341290712356567, + "reward_std": 0.026309030130505562, + "rewards/final_reward": 1.563177724085566, + "rewards/mask_iou_reward": 0.781588862042783, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2341291904449463, + "rewards/thk_ans_format_reward": 1.0, + "step": 2133, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0520896911621, + "epoch": 7.209106239460371, + "grad_norm": 29.4255008250624, + "kl": 0.7470703125, + "learning_rate": 3.992117117117117e-07, + "loss": 0.0007, + "reward": 3.400033712387085, + "reward_std": 0.17767321318387985, + "rewards/final_reward": 1.5393208560935134, + "rewards/mask_iou_reward": 0.7696604280467567, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4208670258522034, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2134, + "think_completion_length": 6.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.96875762939453, + "epoch": 7.21247892074199, + "grad_norm": 8.485399571137938, + "kl": 0.44921875, + "learning_rate": 3.9893018018018016e-07, + "loss": 0.0005, + "reward": 3.3921091556549072, + "reward_std": 0.08318794146180153, + "rewards/final_reward": 1.3257067431520768, + "rewards/mask_iou_reward": 0.6628533715760384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3921091556549072, + "rewards/thk_ans_format_reward": 1.0, + "step": 2135, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.11458587646484, + "epoch": 7.2158516020236085, + "grad_norm": 12.122435985688213, + "kl": 0.513671875, + "learning_rate": 3.986486486486486e-07, + "loss": 0.0005, + "reward": 3.521836519241333, + "reward_std": 0.17866870388388634, + "rewards/final_reward": 1.0408465030152805, + "rewards/mask_iou_reward": 0.5204232515076402, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.521836519241333, + "rewards/thk_ans_format_reward": 1.0, + "step": 2136, + "think_completion_length": 6.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.9479217529297, + "epoch": 7.219224283305228, + "grad_norm": 18.80734240520374, + "kl": 0.36328125, + "learning_rate": 3.9836711711711714e-07, + "loss": 0.0004, + "reward": 3.5676910877227783, + "reward_std": 0.03717435151338577, + "rewards/final_reward": 1.8321522375205679, + "rewards/mask_iou_reward": 0.9160761187602839, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5676909685134888, + "rewards/thk_ans_format_reward": 1.0, + "step": 2137, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.56250762939453, + "epoch": 7.222596964586846, + "grad_norm": 14.275363933619314, + "kl": 0.4375, + "learning_rate": 3.980855855855856e-07, + "loss": 0.0004, + "reward": 3.482014775276184, + "reward_std": 0.2985433340072632, + "rewards/final_reward": 1.349934065591685, + "rewards/mask_iou_reward": 0.6749670327958425, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5236812829971313, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2138, + "think_completion_length": 6.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.1041717529297, + "epoch": 7.2259696458684655, + "grad_norm": 9.048563062948114, + "kl": 0.4150390625, + "learning_rate": 3.9780405405405406e-07, + "loss": 0.0004, + "reward": 3.4160574674606323, + "reward_std": 0.10905380174517632, + "rewards/final_reward": 1.6702328220901432, + "rewards/mask_iou_reward": 0.8351164110450716, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4160576462745667, + "rewards/thk_ans_format_reward": 1.0, + "step": 2139, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.87500762939453, + "epoch": 7.229342327150085, + "grad_norm": 8.460530923785011, + "kl": 0.451171875, + "learning_rate": 3.975225225225225e-07, + "loss": 0.0005, + "reward": 3.340652108192444, + "reward_std": 0.09711728245019913, + "rewards/final_reward": 1.0464337627592921, + "rewards/mask_iou_reward": 0.5232168813796461, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3406521081924438, + "rewards/thk_ans_format_reward": 1.0, + "step": 2140, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.61458587646484, + "epoch": 7.232715008431703, + "grad_norm": 14.278147083487097, + "kl": 0.408203125, + "learning_rate": 3.9724099099099093e-07, + "loss": 0.0004, + "reward": 3.703104615211487, + "reward_std": 0.02983518410474062, + "rewards/final_reward": 1.3170696865282334, + "rewards/mask_iou_reward": 0.6585348432641167, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7031044960021973, + "rewards/thk_ans_format_reward": 1.0, + "step": 2141, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.98958587646484, + "epoch": 7.236087689713322, + "grad_norm": 9.425512098183246, + "kl": 0.4912109375, + "learning_rate": 3.9695945945945944e-07, + "loss": 0.0005, + "reward": 3.412560224533081, + "reward_std": 0.15079614520072937, + "rewards/final_reward": 1.7363874893385831, + "rewards/mask_iou_reward": 0.8681937446692916, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4125602841377258, + "rewards/thk_ans_format_reward": 1.0, + "step": 2142, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.6666717529297, + "epoch": 7.239460370994941, + "grad_norm": 6.200306150717977, + "kl": 0.4580078125, + "learning_rate": 3.966779279279279e-07, + "loss": 0.0005, + "reward": 3.399711489677429, + "reward_std": 0.15125693986192346, + "rewards/final_reward": 1.5797161181786068, + "rewards/mask_iou_reward": 0.7898580590893034, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4205447435379028, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2143, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.77083587646484, + "epoch": 7.24283305227656, + "grad_norm": 8.147060870604529, + "kl": 0.5302734375, + "learning_rate": 3.9639639639639636e-07, + "loss": 0.0005, + "reward": 3.6022841930389404, + "reward_std": 0.15134335309267044, + "rewards/final_reward": 1.8884136915778607, + "rewards/mask_iou_reward": 0.9442068457889303, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6022841930389404, + "rewards/thk_ans_format_reward": 1.0, + "step": 2144, + "think_completion_length": 6.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 117.85417175292969, + "epoch": 7.246205733558178, + "grad_norm": 149.03203489496107, + "kl": 0.650390625, + "learning_rate": 3.961148648648648e-07, + "loss": 0.0007, + "reward": 3.508164882659912, + "reward_std": 0.10794974863529205, + "rewards/final_reward": 1.9698042980004327, + "rewards/mask_iou_reward": 0.9849021490002163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.508164882659912, + "rewards/thk_ans_format_reward": 1.0, + "step": 2145, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.00000762939453, + "epoch": 7.249578414839798, + "grad_norm": 13.057972042386261, + "kl": 0.443359375, + "learning_rate": 3.958333333333333e-07, + "loss": 0.0004, + "reward": 3.5224772691726685, + "reward_std": 0.19207923859357834, + "rewards/final_reward": 1.4173489969961852, + "rewards/mask_iou_reward": 0.7086744984980926, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5224772691726685, + "rewards/thk_ans_format_reward": 1.0, + "step": 2146, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.39583587646484, + "epoch": 7.252951096121416, + "grad_norm": 4.5979033935736435, + "kl": 0.451171875, + "learning_rate": 3.955518018018018e-07, + "loss": 0.0005, + "reward": 3.186232328414917, + "reward_std": 0.17321348935365677, + "rewards/final_reward": 1.3516898725056445, + "rewards/mask_iou_reward": 0.6758449362528223, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1862324178218842, + "rewards/thk_ans_format_reward": 1.0, + "step": 2147, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.77083587646484, + "epoch": 7.256323777403035, + "grad_norm": 9.207451207124421, + "kl": 0.42578125, + "learning_rate": 3.9527027027027026e-07, + "loss": 0.0004, + "reward": 3.324433922767639, + "reward_std": 0.06282211095094681, + "rewards/final_reward": 1.2522952711881437, + "rewards/mask_iou_reward": 0.6261476355940718, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3244337439537048, + "rewards/thk_ans_format_reward": 1.0, + "step": 2148, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.78125, + "epoch": 7.259696458684655, + "grad_norm": 10.283281006635079, + "kl": 0.408203125, + "learning_rate": 3.949887387387387e-07, + "loss": 0.0004, + "reward": 3.7005112171173096, + "reward_std": 0.039139024913311005, + "rewards/final_reward": 1.5932102741295942, + "rewards/mask_iou_reward": 0.7966051370647971, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7005112767219543, + "rewards/thk_ans_format_reward": 1.0, + "step": 2149, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.7916717529297, + "epoch": 7.263069139966273, + "grad_norm": 38.4408050725588, + "kl": 0.392578125, + "learning_rate": 3.947072072072072e-07, + "loss": 0.0004, + "reward": 3.5593591928482056, + "reward_std": 0.08478840440511703, + "rewards/final_reward": 1.4710174033824241, + "rewards/mask_iou_reward": 0.7355087016912121, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5593591928482056, + "rewards/thk_ans_format_reward": 1.0, + "step": 2150, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.67708587646484, + "epoch": 7.266441821247892, + "grad_norm": 5.787815387830136, + "kl": 0.541015625, + "learning_rate": 3.9442567567567564e-07, + "loss": 0.0005, + "reward": 3.6677483320236206, + "reward_std": 0.12288782093673944, + "rewards/final_reward": 1.5742830388891997, + "rewards/mask_iou_reward": 0.7871415194445999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6677480936050415, + "rewards/thk_ans_format_reward": 1.0, + "step": 2151, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.5, + "epoch": 7.269814502529511, + "grad_norm": 10.878390354861022, + "kl": 0.4296875, + "learning_rate": 3.9414414414414415e-07, + "loss": 0.0004, + "reward": 3.606956720352173, + "reward_std": 0.040956467390060425, + "rewards/final_reward": 1.4903519318159777, + "rewards/mask_iou_reward": 0.7451759659079888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6069567799568176, + "rewards/thk_ans_format_reward": 1.0, + "step": 2152, + "think_completion_length": 6.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.0104217529297, + "epoch": 7.27318718381113, + "grad_norm": 13.98855206951791, + "kl": 0.3876953125, + "learning_rate": 3.938626126126126e-07, + "loss": 0.0004, + "reward": 3.5343345403671265, + "reward_std": 0.31789813563227654, + "rewards/final_reward": 1.2603681149803911, + "rewards/mask_iou_reward": 0.6301840574901956, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5760010480880737, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2153, + "think_completion_length": 6.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.1666717529297, + "epoch": 7.276559865092748, + "grad_norm": 15.301432109475098, + "kl": 0.4130859375, + "learning_rate": 3.935810810810811e-07, + "loss": 0.0004, + "reward": 3.613258957862854, + "reward_std": 0.21496530901640654, + "rewards/final_reward": 1.6013227848837182, + "rewards/mask_iou_reward": 0.8006613924418591, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.654925525188446, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2154, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.77084350585938, + "epoch": 7.279932546374368, + "grad_norm": 14.334921119171533, + "kl": 0.490234375, + "learning_rate": 3.9329954954954954e-07, + "loss": 0.0005, + "reward": 3.3268117904663086, + "reward_std": 0.11101927608251572, + "rewards/final_reward": 1.8440713944047213, + "rewards/mask_iou_reward": 0.9220356972023607, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.326811671257019, + "rewards/thk_ans_format_reward": 1.0, + "step": 2155, + "think_completion_length": 6.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.5729217529297, + "epoch": 7.283305227655987, + "grad_norm": 18.06605732574647, + "kl": 0.43359375, + "learning_rate": 3.93018018018018e-07, + "loss": 0.0004, + "reward": 3.5632468461990356, + "reward_std": 0.07951584458351135, + "rewards/final_reward": 1.1299945703341507, + "rewards/mask_iou_reward": 0.5649972851670754, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.563246726989746, + "rewards/thk_ans_format_reward": 1.0, + "step": 2156, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.4166717529297, + "epoch": 7.286677908937605, + "grad_norm": 7.086837200220286, + "kl": 0.556640625, + "learning_rate": 3.927364864864865e-07, + "loss": 0.0006, + "reward": 3.759609341621399, + "reward_std": 0.2072555348277092, + "rewards/final_reward": 1.8777304738504346, + "rewards/mask_iou_reward": 0.9388652369252173, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.7804425954818726, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2157, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.20834350585938, + "epoch": 7.2900505902192245, + "grad_norm": 19.968401086600352, + "kl": 0.4443359375, + "learning_rate": 3.9245495495495497e-07, + "loss": 0.0005, + "reward": 3.369117498397827, + "reward_std": 0.19751837849617004, + "rewards/final_reward": 1.1488828752023381, + "rewards/mask_iou_reward": 0.5744414376011691, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.389950692653656, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2158, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.65626525878906, + "epoch": 7.293423271500843, + "grad_norm": 12.387109597703848, + "kl": 0.4111328125, + "learning_rate": 3.9217342342342343e-07, + "loss": 0.0004, + "reward": 3.170549750328064, + "reward_std": 0.28758961241692305, + "rewards/final_reward": 1.3935270921524614, + "rewards/mask_iou_reward": 0.6967635460762307, + "rewards/sam_format_reward": 0.9270833432674408, + "rewards/sam_reward_func_ultra": 1.316383183002472, + "rewards/thk_ans_format_reward": 0.9270833432674408, + "step": 2159, + "think_completion_length": 6.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.04167938232422, + "epoch": 7.296795952782462, + "grad_norm": 19.28123203840753, + "kl": 0.423828125, + "learning_rate": 3.918918918918919e-07, + "loss": 0.0004, + "reward": 3.62705397605896, + "reward_std": 0.22706139460206032, + "rewards/final_reward": 1.6559225404737472, + "rewards/mask_iou_reward": 0.8279612702368736, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.647887110710144, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2160, + "think_completion_length": 6.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.09375, + "epoch": 7.300168634064081, + "grad_norm": 8.978004082516899, + "kl": 0.5009765625, + "learning_rate": 3.916103603603603e-07, + "loss": 0.0005, + "reward": 3.428421139717102, + "reward_std": 0.18285532295703888, + "rewards/final_reward": 1.617844638569244, + "rewards/mask_iou_reward": 0.808922319284622, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4700875282287598, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2161, + "think_completion_length": 6.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.40626525878906, + "epoch": 7.3035413153457, + "grad_norm": 16.80433491867078, + "kl": 0.36328125, + "learning_rate": 3.913288288288288e-07, + "loss": 0.0004, + "reward": 3.5047558546066284, + "reward_std": 0.30528346402570605, + "rewards/final_reward": 1.7800765282781192, + "rewards/mask_iou_reward": 0.8900382641390596, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.6089226603507996, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 2162, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.02084350585938, + "epoch": 7.306913996627319, + "grad_norm": 5.595963899834234, + "kl": 0.6728515625, + "learning_rate": 3.910472972972973e-07, + "loss": 0.0007, + "reward": 3.6005738973617554, + "reward_std": 0.09857448190450668, + "rewards/final_reward": 1.6690496045432348, + "rewards/mask_iou_reward": 0.8345248022716174, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6005739569664001, + "rewards/thk_ans_format_reward": 1.0, + "step": 2163, + "think_completion_length": 7.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.1041717529297, + "epoch": 7.3102866779089375, + "grad_norm": 16.19212872325748, + "kl": 0.365234375, + "learning_rate": 3.9076576576576574e-07, + "loss": 0.0004, + "reward": 3.5587165355682373, + "reward_std": 0.1438434375450015, + "rewards/final_reward": 1.6682048349112146, + "rewards/mask_iou_reward": 0.8341024174556073, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5795499086380005, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2164, + "think_completion_length": 7.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.71875762939453, + "epoch": 7.313659359190557, + "grad_norm": 7.803745253729495, + "kl": 0.443359375, + "learning_rate": 3.904842342342342e-07, + "loss": 0.0005, + "reward": 3.412447929382324, + "reward_std": 0.02094810316339135, + "rewards/final_reward": 1.3435670359208849, + "rewards/mask_iou_reward": 0.6717835179604424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4124478697776794, + "rewards/thk_ans_format_reward": 1.0, + "step": 2165, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.78125762939453, + "epoch": 7.317032040472175, + "grad_norm": 4.843633546989581, + "kl": 0.4716796875, + "learning_rate": 3.9020270270270266e-07, + "loss": 0.0005, + "reward": 3.5543458461761475, + "reward_std": 0.052722327411174774, + "rewards/final_reward": 1.8292358145000578, + "rewards/mask_iou_reward": 0.9146179072500289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.554345726966858, + "rewards/thk_ans_format_reward": 1.0, + "step": 2166, + "think_completion_length": 6.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.12500762939453, + "epoch": 7.320404721753794, + "grad_norm": 15.399198454003308, + "kl": 0.537109375, + "learning_rate": 3.899211711711711e-07, + "loss": 0.0005, + "reward": 3.5818214416503906, + "reward_std": 0.08301165979355574, + "rewards/final_reward": 1.163532358848403, + "rewards/mask_iou_reward": 0.5817661794242015, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5818213820457458, + "rewards/thk_ans_format_reward": 1.0, + "step": 2167, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.31251525878906, + "epoch": 7.323777403035413, + "grad_norm": 21.04994479043818, + "kl": 0.404296875, + "learning_rate": 3.8963963963963963e-07, + "loss": 0.0004, + "reward": 3.177867293357849, + "reward_std": 0.42192305624485016, + "rewards/final_reward": 1.5446818310936512, + "rewards/mask_iou_reward": 0.7723409155468256, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.2820341289043427, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 2168, + "think_completion_length": 5.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.1979217529297, + "epoch": 7.327150084317032, + "grad_norm": 7.782304979128652, + "kl": 0.345703125, + "learning_rate": 3.893581081081081e-07, + "loss": 0.0003, + "reward": 3.2318207025527954, + "reward_std": 0.41622330248355865, + "rewards/final_reward": 1.9512054078677687, + "rewards/mask_iou_reward": 0.9756027039338844, + "rewards/sam_format_reward": 0.9270833432674408, + "rewards/sam_reward_func_ultra": 1.3776538372039795, + "rewards/thk_ans_format_reward": 0.9270833432674408, + "step": 2169, + "think_completion_length": 6.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.5625, + "epoch": 7.330522765598651, + "grad_norm": 8.374995571382644, + "kl": 0.5, + "learning_rate": 3.8907657657657655e-07, + "loss": 0.0005, + "reward": 3.082065463066101, + "reward_std": 0.2468433976173401, + "rewards/final_reward": 1.6989625807648732, + "rewards/mask_iou_reward": 0.8494812903824366, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.102898895740509, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2170, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.06251525878906, + "epoch": 7.33389544688027, + "grad_norm": 9.203101037904963, + "kl": 0.3583984375, + "learning_rate": 3.88795045045045e-07, + "loss": 0.0004, + "reward": 3.2510664463043213, + "reward_std": 0.4821528196334839, + "rewards/final_reward": 1.641097839508744, + "rewards/mask_iou_reward": 0.820548919754372, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.3343997597694397, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 2171, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.25000762939453, + "epoch": 7.337268128161889, + "grad_norm": 12.01320772350456, + "kl": 0.380859375, + "learning_rate": 3.885135135135135e-07, + "loss": 0.0004, + "reward": 3.471616744995117, + "reward_std": 0.1700489092618227, + "rewards/final_reward": 1.6669278379681216, + "rewards/mask_iou_reward": 0.8334639189840608, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4924501180648804, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2172, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.625, + "epoch": 7.340640809443507, + "grad_norm": 8.85257565325863, + "kl": 0.3359375, + "learning_rate": 3.88231981981982e-07, + "loss": 0.0003, + "reward": 3.362300753593445, + "reward_std": 0.31437610648572445, + "rewards/final_reward": 1.7814382838099094, + "rewards/mask_iou_reward": 0.8907191419049547, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.4248005151748657, + "rewards/thk_ans_format_reward": 0.96875, + "step": 2173, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.96876525878906, + "epoch": 7.344013490725127, + "grad_norm": 6.91327388672393, + "kl": 0.3330078125, + "learning_rate": 3.8795045045045045e-07, + "loss": 0.0003, + "reward": 3.5213279724121094, + "reward_std": 0.14972331002354622, + "rewards/final_reward": 1.603927316493496, + "rewards/mask_iou_reward": 0.801963658246748, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5421611070632935, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2174, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.50001525878906, + "epoch": 7.347386172006745, + "grad_norm": 5.4068115963741175, + "kl": 0.3193359375, + "learning_rate": 3.876689189189189e-07, + "loss": 0.0003, + "reward": 3.0117900371551514, + "reward_std": 0.5588356852531433, + "rewards/final_reward": 1.7250185714794255, + "rewards/mask_iou_reward": 0.8625092857397127, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.1159567832946777, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 2175, + "think_completion_length": 7.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.56251525878906, + "epoch": 7.350758853288364, + "grad_norm": 8.37104926533639, + "kl": 0.318359375, + "learning_rate": 3.8738738738738737e-07, + "loss": 0.0003, + "reward": 3.1996771097183228, + "reward_std": 0.37156446278095245, + "rewards/final_reward": 1.4420773983333568, + "rewards/mask_iou_reward": 0.7210386991666784, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.2413436770439148, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2176, + "think_completion_length": 6.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.78125762939453, + "epoch": 7.354131534569984, + "grad_norm": 6.7895801457677045, + "kl": 0.3515625, + "learning_rate": 3.8710585585585583e-07, + "loss": 0.0004, + "reward": 3.296520471572876, + "reward_std": 0.18614527583122253, + "rewards/final_reward": 1.4495435536760706, + "rewards/mask_iou_reward": 0.7247717768380353, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.359020471572876, + "rewards/thk_ans_format_reward": 0.96875, + "step": 2177, + "think_completion_length": 7.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.8854217529297, + "epoch": 7.357504215851602, + "grad_norm": 6.552101407920925, + "kl": 0.365234375, + "learning_rate": 3.8682432432432434e-07, + "loss": 0.0004, + "reward": 3.418135404586792, + "reward_std": 0.39255285263061523, + "rewards/final_reward": 1.4095475051114925, + "rewards/mask_iou_reward": 0.7047737525557463, + "rewards/sam_format_reward": 0.9687500298023224, + "rewards/sam_reward_func_ultra": 1.480635404586792, + "rewards/thk_ans_format_reward": 0.9687500298023224, + "step": 2178, + "think_completion_length": 6.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.86458587646484, + "epoch": 7.360876897133221, + "grad_norm": 5.4216062546604835, + "kl": 0.509765625, + "learning_rate": 3.865427927927928e-07, + "loss": 0.0005, + "reward": 3.7206671237945557, + "reward_std": 0.03425286652054638, + "rewards/final_reward": 1.8160294503549825, + "rewards/mask_iou_reward": 0.9080147251774913, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7206671237945557, + "rewards/thk_ans_format_reward": 1.0, + "step": 2179, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.30209350585938, + "epoch": 7.36424957841484, + "grad_norm": 18.869959269884077, + "kl": 0.39453125, + "learning_rate": 3.8626126126126127e-07, + "loss": 0.0004, + "reward": 3.5714718103408813, + "reward_std": 0.22793716937303543, + "rewards/final_reward": 1.6498136826991323, + "rewards/mask_iou_reward": 0.8249068413495662, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5923051834106445, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2180, + "think_completion_length": 6.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.27084350585938, + "epoch": 7.367622259696459, + "grad_norm": 35.29660849574145, + "kl": 1.1455078125, + "learning_rate": 3.8597972972972967e-07, + "loss": 0.0011, + "reward": 3.8055763244628906, + "reward_std": 0.11059517413377762, + "rewards/final_reward": 1.932277467940826, + "rewards/mask_iou_reward": 0.966138733970413, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8055765628814697, + "rewards/thk_ans_format_reward": 1.0, + "step": 2181, + "think_completion_length": 6.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.39584350585938, + "epoch": 7.370994940978077, + "grad_norm": 5.726177243990948, + "kl": 0.3515625, + "learning_rate": 3.8569819819819813e-07, + "loss": 0.0004, + "reward": 3.4815937280654907, + "reward_std": 0.19145195186138153, + "rewards/final_reward": 1.6666713105364588, + "rewards/mask_iou_reward": 0.8333356552682294, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.502427101135254, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2182, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.48958587646484, + "epoch": 7.3743676222596966, + "grad_norm": 5.940061168135283, + "kl": 0.4697265625, + "learning_rate": 3.8541666666666665e-07, + "loss": 0.0005, + "reward": 3.6492291688919067, + "reward_std": 0.0633248221129179, + "rewards/final_reward": 1.7177834520208082, + "rewards/mask_iou_reward": 0.8588917260104041, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6492289304733276, + "rewards/thk_ans_format_reward": 1.0, + "step": 2183, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.6666717529297, + "epoch": 7.377740303541315, + "grad_norm": 5.7907618943106325, + "kl": 0.4248046875, + "learning_rate": 3.851351351351351e-07, + "loss": 0.0004, + "reward": 3.47650945186615, + "reward_std": 0.23418362438678741, + "rewards/final_reward": 1.8728345238525135, + "rewards/mask_iou_reward": 0.9364172619262567, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4973427653312683, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2184, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.78125, + "epoch": 7.381112984822934, + "grad_norm": 9.008928063551123, + "kl": 0.4111328125, + "learning_rate": 3.8485360360360357e-07, + "loss": 0.0004, + "reward": 3.35174298286438, + "reward_std": 0.32421646267175674, + "rewards/final_reward": 0.9672972781583278, + "rewards/mask_iou_reward": 0.4836486390791639, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.4142429828643799, + "rewards/thk_ans_format_reward": 0.96875, + "step": 2185, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.65625, + "epoch": 7.3844856661045535, + "grad_norm": 7.066288003515296, + "kl": 0.416015625, + "learning_rate": 3.8457207207207203e-07, + "loss": 0.0005, + "reward": 3.231791615486145, + "reward_std": 0.10090844705700874, + "rewards/final_reward": 0.919918053266102, + "rewards/mask_iou_reward": 0.459959026633051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2317915558815002, + "rewards/thk_ans_format_reward": 1.0, + "step": 2186, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.30208587646484, + "epoch": 7.387858347386172, + "grad_norm": 5.58767972116072, + "kl": 0.396484375, + "learning_rate": 3.842905405405405e-07, + "loss": 0.0004, + "reward": 3.4587671756744385, + "reward_std": 0.0885092574171722, + "rewards/final_reward": 1.8481281099755282, + "rewards/mask_iou_reward": 0.9240640549877641, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.458767294883728, + "rewards/thk_ans_format_reward": 1.0, + "step": 2187, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.06251525878906, + "epoch": 7.391231028667791, + "grad_norm": 11.352386821224222, + "kl": 0.3623046875, + "learning_rate": 3.84009009009009e-07, + "loss": 0.0004, + "reward": 3.7279186248779297, + "reward_std": 0.21404867619276047, + "rewards/final_reward": 1.7843203736840167, + "rewards/mask_iou_reward": 0.8921601868420084, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.7695854306221008, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2188, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.71875762939453, + "epoch": 7.3946037099494095, + "grad_norm": 13.41413439864965, + "kl": 0.390625, + "learning_rate": 3.8372747747747746e-07, + "loss": 0.0004, + "reward": 3.3822845220565796, + "reward_std": 0.11853938177227974, + "rewards/final_reward": 1.0435111055189241, + "rewards/mask_iou_reward": 0.5217555527594621, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3822844624519348, + "rewards/thk_ans_format_reward": 1.0, + "step": 2189, + "think_completion_length": 6.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.2916717529297, + "epoch": 7.397976391231029, + "grad_norm": 15.092434058069491, + "kl": 0.3916015625, + "learning_rate": 3.834459459459459e-07, + "loss": 0.0004, + "reward": 3.6769341230392456, + "reward_std": 0.19549360498785973, + "rewards/final_reward": 1.8140674795930374, + "rewards/mask_iou_reward": 0.9070337397965187, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6977673768997192, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2190, + "think_completion_length": 6.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.70833587646484, + "epoch": 7.401349072512647, + "grad_norm": 17.033820282190767, + "kl": 0.421875, + "learning_rate": 3.831644144144144e-07, + "loss": 0.0004, + "reward": 3.5693044662475586, + "reward_std": 0.035708085633814335, + "rewards/final_reward": 1.1148767645815578, + "rewards/mask_iou_reward": 0.5574383822907789, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.56930410861969, + "rewards/thk_ans_format_reward": 1.0, + "step": 2191, + "think_completion_length": 6.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.5625, + "epoch": 7.4047217537942664, + "grad_norm": 25.511794760633197, + "kl": 0.478515625, + "learning_rate": 3.8288288288288285e-07, + "loss": 0.0005, + "reward": 3.360031247138977, + "reward_std": 0.0737453605979681, + "rewards/final_reward": 1.5050937203409687, + "rewards/mask_iou_reward": 0.7525468601704843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.360031247138977, + "rewards/thk_ans_format_reward": 1.0, + "step": 2192, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.11458587646484, + "epoch": 7.408094435075886, + "grad_norm": 9.392724621007625, + "kl": 0.4892578125, + "learning_rate": 3.8260135135135136e-07, + "loss": 0.0005, + "reward": 3.3804363012313843, + "reward_std": 0.09317411482334137, + "rewards/final_reward": 1.1672647924376789, + "rewards/mask_iou_reward": 0.5836323962188394, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3804362416267395, + "rewards/thk_ans_format_reward": 1.0, + "step": 2193, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.32291793823242, + "epoch": 7.411467116357504, + "grad_norm": 40.91590308816363, + "kl": 0.6357421875, + "learning_rate": 3.823198198198198e-07, + "loss": 0.0007, + "reward": 3.6738728284835815, + "reward_std": 0.0640547089278698, + "rewards/final_reward": 1.4350410539701315, + "rewards/mask_iou_reward": 0.7175205269850657, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.673872709274292, + "rewards/thk_ans_format_reward": 1.0, + "step": 2194, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.70833587646484, + "epoch": 7.414839797639123, + "grad_norm": 17.155419273352084, + "kl": 0.568359375, + "learning_rate": 3.820382882882883e-07, + "loss": 0.0006, + "reward": 3.4904896020889282, + "reward_std": 0.046076007187366486, + "rewards/final_reward": 1.8197614421774921, + "rewards/mask_iou_reward": 0.9098807210887461, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4904893636703491, + "rewards/thk_ans_format_reward": 1.0, + "step": 2195, + "think_completion_length": 6.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.58333587646484, + "epoch": 7.418212478920742, + "grad_norm": 27.34766369146001, + "kl": 0.666015625, + "learning_rate": 3.8175675675675674e-07, + "loss": 0.0007, + "reward": 3.746211886405945, + "reward_std": 0.0641557164490223, + "rewards/final_reward": 1.628753586553516, + "rewards/mask_iou_reward": 0.814376793276758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7462120056152344, + "rewards/thk_ans_format_reward": 1.0, + "step": 2196, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0104217529297, + "epoch": 7.421585160202361, + "grad_norm": 15.070360604305545, + "kl": 0.4365234375, + "learning_rate": 3.814752252252252e-07, + "loss": 0.0004, + "reward": 3.628060817718506, + "reward_std": 0.14443704020231962, + "rewards/final_reward": 1.7625150773573142, + "rewards/mask_iou_reward": 0.8812575386786571, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6384775042533875, + "rewards/thk_ans_format_reward": 1.0, + "step": 2197, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.13541793823242, + "epoch": 7.424957841483979, + "grad_norm": 15.525017230063492, + "kl": 0.51953125, + "learning_rate": 3.811936936936937e-07, + "loss": 0.0005, + "reward": 3.5307661294937134, + "reward_std": 0.054470050148665905, + "rewards/final_reward": 1.3505440488513778, + "rewards/mask_iou_reward": 0.6752720244256889, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.530765950679779, + "rewards/thk_ans_format_reward": 1.0, + "step": 2198, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.4479217529297, + "epoch": 7.428330522765599, + "grad_norm": 14.559243133401148, + "kl": 0.4306640625, + "learning_rate": 3.809121621621622e-07, + "loss": 0.0004, + "reward": 3.709012985229492, + "reward_std": 0.04592567728832364, + "rewards/final_reward": 1.6783327267927772, + "rewards/mask_iou_reward": 0.8391663633963886, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7090131640434265, + "rewards/thk_ans_format_reward": 1.0, + "step": 2199, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.75, + "epoch": 7.431703204047217, + "grad_norm": 12.114197844295079, + "kl": 0.5166015625, + "learning_rate": 3.8063063063063064e-07, + "loss": 0.0005, + "reward": 3.7441108226776123, + "reward_std": 0.022463752888143063, + "rewards/final_reward": 1.6117690480853413, + "rewards/mask_iou_reward": 0.8058845240426706, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7441107034683228, + "rewards/thk_ans_format_reward": 1.0, + "step": 2200, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.22916793823242, + "epoch": 7.435075885328836, + "grad_norm": 6.981330545722498, + "kl": 0.451171875, + "learning_rate": 3.8034909909909904e-07, + "loss": 0.0005, + "reward": 3.562459111213684, + "reward_std": 0.07043910771608353, + "rewards/final_reward": 0.9374220019005385, + "rewards/mask_iou_reward": 0.46871100095026924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5624586939811707, + "rewards/thk_ans_format_reward": 1.0, + "step": 2201, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.73958587646484, + "epoch": 7.438448566610456, + "grad_norm": 12.743543224102556, + "kl": 0.5830078125, + "learning_rate": 3.800675675675675e-07, + "loss": 0.0006, + "reward": 3.7944655418395996, + "reward_std": 0.044870490208268166, + "rewards/final_reward": 1.4736335733550814, + "rewards/mask_iou_reward": 0.7368167866775407, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.794465720653534, + "rewards/thk_ans_format_reward": 1.0, + "step": 2202, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.9479217529297, + "epoch": 7.441821247892074, + "grad_norm": 7.429789500024564, + "kl": 0.6259765625, + "learning_rate": 3.79786036036036e-07, + "loss": 0.0007, + "reward": 3.661327838897705, + "reward_std": 0.0312417505774647, + "rewards/final_reward": 1.4290566956752653, + "rewards/mask_iou_reward": 0.7145283478376326, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6613278985023499, + "rewards/thk_ans_format_reward": 1.0, + "step": 2203, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.92708587646484, + "epoch": 7.445193929173693, + "grad_norm": 24.441487691143756, + "kl": 0.85546875, + "learning_rate": 3.795045045045045e-07, + "loss": 0.0009, + "reward": 3.331498861312866, + "reward_std": 0.040046393405646086, + "rewards/final_reward": 1.232488613551466, + "rewards/mask_iou_reward": 0.616244306775733, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3314987421035767, + "rewards/thk_ans_format_reward": 1.0, + "step": 2204, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.09375, + "epoch": 7.448566610455312, + "grad_norm": 14.548716922710366, + "kl": 0.4306640625, + "learning_rate": 3.7922297297297294e-07, + "loss": 0.0004, + "reward": 3.6373748779296875, + "reward_std": 0.09864117112010717, + "rewards/final_reward": 1.7490384938743897, + "rewards/mask_iou_reward": 0.8745192469371948, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6373746991157532, + "rewards/thk_ans_format_reward": 1.0, + "step": 2205, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.52083587646484, + "epoch": 7.451939291736931, + "grad_norm": 8.439231876587783, + "kl": 0.580078125, + "learning_rate": 3.789414414414414e-07, + "loss": 0.0006, + "reward": 3.783831477165222, + "reward_std": 0.06918147206306458, + "rewards/final_reward": 1.6706428733265701, + "rewards/mask_iou_reward": 0.8353214366632851, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7838313579559326, + "rewards/thk_ans_format_reward": 1.0, + "step": 2206, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.0416717529297, + "epoch": 7.455311973018549, + "grad_norm": 13.586478486878896, + "kl": 0.4736328125, + "learning_rate": 3.7865990990990986e-07, + "loss": 0.0005, + "reward": 3.2348625659942627, + "reward_std": 0.10381998401135206, + "rewards/final_reward": 1.3652094540702542, + "rewards/mask_iou_reward": 0.6826047270351271, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2348628044128418, + "rewards/thk_ans_format_reward": 1.0, + "step": 2207, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.28125762939453, + "epoch": 7.458684654300169, + "grad_norm": 8.91410597058018, + "kl": 0.408203125, + "learning_rate": 3.783783783783784e-07, + "loss": 0.0004, + "reward": 3.38576340675354, + "reward_std": 0.05992165021598339, + "rewards/final_reward": 1.5979352470525372, + "rewards/mask_iou_reward": 0.7989676235262686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3857632279396057, + "rewards/thk_ans_format_reward": 1.0, + "step": 2208, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.95833587646484, + "epoch": 7.462057335581788, + "grad_norm": 25.637237825695134, + "kl": 0.509765625, + "learning_rate": 3.7809684684684684e-07, + "loss": 0.0005, + "reward": 3.713690161705017, + "reward_std": 0.03531708940863609, + "rewards/final_reward": 1.3274247871289895, + "rewards/mask_iou_reward": 0.6637123935644947, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.713690161705017, + "rewards/thk_ans_format_reward": 1.0, + "step": 2209, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.93750762939453, + "epoch": 7.465430016863406, + "grad_norm": 28.405959846536, + "kl": 0.4443359375, + "learning_rate": 3.778153153153153e-07, + "loss": 0.0004, + "reward": 3.681239604949951, + "reward_std": 0.06277862191200256, + "rewards/final_reward": 1.5637615545390429, + "rewards/mask_iou_reward": 0.7818807772695214, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6812394857406616, + "rewards/thk_ans_format_reward": 1.0, + "step": 2210, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.42708587646484, + "epoch": 7.4688026981450255, + "grad_norm": 23.567063540409116, + "kl": 0.44921875, + "learning_rate": 3.7753378378378376e-07, + "loss": 0.0005, + "reward": 3.579433560371399, + "reward_std": 0.04429387301206589, + "rewards/final_reward": 1.4484530347445896, + "rewards/mask_iou_reward": 0.7242265173722948, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5794333815574646, + "rewards/thk_ans_format_reward": 1.0, + "step": 2211, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.46876525878906, + "epoch": 7.472175379426644, + "grad_norm": 13.340321865561812, + "kl": 0.6357421875, + "learning_rate": 3.772522522522522e-07, + "loss": 0.0006, + "reward": 3.567931294441223, + "reward_std": 0.0854704063385725, + "rewards/final_reward": 1.5191784787802172, + "rewards/mask_iou_reward": 0.7595892393901086, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5679312944412231, + "rewards/thk_ans_format_reward": 1.0, + "step": 2212, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.92708587646484, + "epoch": 7.475548060708263, + "grad_norm": 19.779998859375922, + "kl": 0.4296875, + "learning_rate": 3.7697072072072073e-07, + "loss": 0.0004, + "reward": 3.3631173372268677, + "reward_std": 0.09759453311562538, + "rewards/final_reward": 1.1787163009693873, + "rewards/mask_iou_reward": 0.5893581504846936, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3631173372268677, + "rewards/thk_ans_format_reward": 1.0, + "step": 2213, + "think_completion_length": 7.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.375, + "epoch": 7.4789207419898815, + "grad_norm": 11.414583103038034, + "kl": 0.5224609375, + "learning_rate": 3.766891891891892e-07, + "loss": 0.0005, + "reward": 3.526229500770569, + "reward_std": 0.09345915447920561, + "rewards/final_reward": 1.2993992131816308, + "rewards/mask_iou_reward": 0.6496996065908154, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.526229441165924, + "rewards/thk_ans_format_reward": 1.0, + "step": 2214, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.45833587646484, + "epoch": 7.482293423271501, + "grad_norm": 93.76734649917043, + "kl": 0.4921875, + "learning_rate": 3.7640765765765765e-07, + "loss": 0.0005, + "reward": 3.3889803886413574, + "reward_std": 0.05893061310052872, + "rewards/final_reward": 1.1675987035576778, + "rewards/mask_iou_reward": 0.5837993517788389, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.388980507850647, + "rewards/thk_ans_format_reward": 1.0, + "step": 2215, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.61459350585938, + "epoch": 7.48566610455312, + "grad_norm": 9.911396228124637, + "kl": 0.630859375, + "learning_rate": 3.761261261261261e-07, + "loss": 0.0006, + "reward": 3.6827415227890015, + "reward_std": 0.03132602386176586, + "rewards/final_reward": 1.7700213586978424, + "rewards/mask_iou_reward": 0.8850106793489212, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6827413439750671, + "rewards/thk_ans_format_reward": 1.0, + "step": 2216, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.06250762939453, + "epoch": 7.4890387858347385, + "grad_norm": 10.90871479195432, + "kl": 0.4384765625, + "learning_rate": 3.758445945945946e-07, + "loss": 0.0004, + "reward": 3.661782741546631, + "reward_std": 0.04503993829712272, + "rewards/final_reward": 1.3926323886832095, + "rewards/mask_iou_reward": 0.6963161943416047, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.661782443523407, + "rewards/thk_ans_format_reward": 1.0, + "step": 2217, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.2291717529297, + "epoch": 7.492411467116358, + "grad_norm": 11.99902117384724, + "kl": 0.544921875, + "learning_rate": 3.755630630630631e-07, + "loss": 0.0006, + "reward": 3.598073363304138, + "reward_std": 0.08071838691830635, + "rewards/final_reward": 1.4846668315120548, + "rewards/mask_iou_reward": 0.7423334157560274, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5980734825134277, + "rewards/thk_ans_format_reward": 1.0, + "step": 2218, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.23958587646484, + "epoch": 7.495784148397976, + "grad_norm": 8.12535465891368, + "kl": 0.48046875, + "learning_rate": 3.7528153153153155e-07, + "loss": 0.0005, + "reward": 3.6236926317214966, + "reward_std": 0.0668908916413784, + "rewards/final_reward": 1.5927653023144133, + "rewards/mask_iou_reward": 0.7963826511572066, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.623692274093628, + "rewards/thk_ans_format_reward": 1.0, + "step": 2219, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.98959350585938, + "epoch": 7.499156829679595, + "grad_norm": 6.679490208300435, + "kl": 0.515625, + "learning_rate": 3.75e-07, + "loss": 0.0005, + "reward": 3.4468973875045776, + "reward_std": 0.05854324251413345, + "rewards/final_reward": 1.732786586934808, + "rewards/mask_iou_reward": 0.866393293467404, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.446897566318512, + "rewards/thk_ans_format_reward": 1.0, + "step": 2220, + "think_completion_length": 7.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.89583587646484, + "epoch": 7.502529510961214, + "grad_norm": 45.747778673697404, + "kl": 0.4326171875, + "learning_rate": 3.747184684684684e-07, + "loss": 0.0004, + "reward": 3.609244465827942, + "reward_std": 0.06323170848190784, + "rewards/final_reward": 1.0002317736868498, + "rewards/mask_iou_reward": 0.5001158868434249, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6092444062232971, + "rewards/thk_ans_format_reward": 1.0, + "step": 2221, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.67708587646484, + "epoch": 7.505902192242833, + "grad_norm": 32.9355415535747, + "kl": 0.439453125, + "learning_rate": 3.744369369369369e-07, + "loss": 0.0004, + "reward": 3.7596893310546875, + "reward_std": 0.06496717035770416, + "rewards/final_reward": 1.7733527351530844, + "rewards/mask_iou_reward": 0.8866763675765422, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7596891522407532, + "rewards/thk_ans_format_reward": 1.0, + "step": 2222, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.81250762939453, + "epoch": 7.509274873524452, + "grad_norm": 18.282761151795988, + "kl": 0.4287109375, + "learning_rate": 3.741554054054054e-07, + "loss": 0.0004, + "reward": 3.25011944770813, + "reward_std": 0.1788785234093666, + "rewards/final_reward": 1.7937153522651648, + "rewards/mask_iou_reward": 0.8968576761325824, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.354286015033722, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 2223, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.48958587646484, + "epoch": 7.512647554806071, + "grad_norm": 11.784405940817232, + "kl": 0.5439453125, + "learning_rate": 3.7387387387387385e-07, + "loss": 0.0005, + "reward": 3.707550287246704, + "reward_std": 0.06585472077131271, + "rewards/final_reward": 1.5084785123889164, + "rewards/mask_iou_reward": 0.7542392561944582, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7075501680374146, + "rewards/thk_ans_format_reward": 1.0, + "step": 2224, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.2291717529297, + "epoch": 7.51602023608769, + "grad_norm": 517.8951212717884, + "kl": 0.548828125, + "learning_rate": 3.735923423423423e-07, + "loss": 0.0006, + "reward": 3.596574544906616, + "reward_std": 0.12409070134162903, + "rewards/final_reward": 1.5688267024199112, + "rewards/mask_iou_reward": 0.7844133512099556, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5965744256973267, + "rewards/thk_ans_format_reward": 1.0, + "step": 2225, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.1979217529297, + "epoch": 7.519392917369308, + "grad_norm": 10.548538981689129, + "kl": 0.3935546875, + "learning_rate": 3.7331081081081077e-07, + "loss": 0.0004, + "reward": 3.405883550643921, + "reward_std": 0.06507723964750767, + "rewards/final_reward": 1.6584953532713012, + "rewards/mask_iou_reward": 0.8292476766356506, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.405883550643921, + "rewards/thk_ans_format_reward": 1.0, + "step": 2226, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.22916793823242, + "epoch": 7.522765598650928, + "grad_norm": 7.929043652698762, + "kl": 0.4267578125, + "learning_rate": 3.7302927927927923e-07, + "loss": 0.0004, + "reward": 3.7863839864730835, + "reward_std": 0.017993359360843897, + "rewards/final_reward": 1.435606746177593, + "rewards/mask_iou_reward": 0.7178033730887965, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7863839864730835, + "rewards/thk_ans_format_reward": 1.0, + "step": 2227, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.90625, + "epoch": 7.526138279932546, + "grad_norm": 8.908029448338619, + "kl": 0.4423828125, + "learning_rate": 3.7274774774774775e-07, + "loss": 0.0004, + "reward": 3.4020566940307617, + "reward_std": 0.10786337032914162, + "rewards/final_reward": 1.4677611612201815, + "rewards/mask_iou_reward": 0.7338805806100908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4020565748214722, + "rewards/thk_ans_format_reward": 1.0, + "step": 2228, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.34375762939453, + "epoch": 7.529510961214165, + "grad_norm": 11.043197813990117, + "kl": 0.669921875, + "learning_rate": 3.724662162162162e-07, + "loss": 0.0007, + "reward": 3.0677175521850586, + "reward_std": 0.11405624449253082, + "rewards/final_reward": 1.335943431280947, + "rewards/mask_iou_reward": 0.6679717156404735, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0677174925804138, + "rewards/thk_ans_format_reward": 1.0, + "step": 2229, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.3541717529297, + "epoch": 7.532883642495785, + "grad_norm": 6.752553142615018, + "kl": 0.4365234375, + "learning_rate": 3.7218468468468467e-07, + "loss": 0.0005, + "reward": 3.2327537536621094, + "reward_std": 0.06896964088082314, + "rewards/final_reward": 1.8958117133631451, + "rewards/mask_iou_reward": 0.9479058566815726, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2327535152435303, + "rewards/thk_ans_format_reward": 1.0, + "step": 2230, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.7916717529297, + "epoch": 7.536256323777403, + "grad_norm": 5.607542004004971, + "kl": 0.4560546875, + "learning_rate": 3.7190315315315313e-07, + "loss": 0.0004, + "reward": 3.449102759361267, + "reward_std": 0.02281183283776045, + "rewards/final_reward": 1.764716877455844, + "rewards/mask_iou_reward": 0.882358438727922, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4491026401519775, + "rewards/thk_ans_format_reward": 1.0, + "step": 2231, + "think_completion_length": 6.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.25000762939453, + "epoch": 7.539629005059022, + "grad_norm": 16.474163586412292, + "kl": 0.4541015625, + "learning_rate": 3.716216216216216e-07, + "loss": 0.0005, + "reward": 3.6877329349517822, + "reward_std": 0.05621516332030296, + "rewards/final_reward": 1.5877485930388506, + "rewards/mask_iou_reward": 0.7938742965194253, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6877328157424927, + "rewards/thk_ans_format_reward": 1.0, + "step": 2232, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.65625762939453, + "epoch": 7.543001686340641, + "grad_norm": 85.02315344867714, + "kl": 0.431640625, + "learning_rate": 3.713400900900901e-07, + "loss": 0.0004, + "reward": 3.723360538482666, + "reward_std": 0.02233183290809393, + "rewards/final_reward": 1.5680750089532776, + "rewards/mask_iou_reward": 0.7840375044766388, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7233604192733765, + "rewards/thk_ans_format_reward": 1.0, + "step": 2233, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.4166717529297, + "epoch": 7.54637436762226, + "grad_norm": 9.119756514703937, + "kl": 0.3876953125, + "learning_rate": 3.7105855855855856e-07, + "loss": 0.0004, + "reward": 3.675152897834778, + "reward_std": 0.04308299534022808, + "rewards/final_reward": 1.6556049828306034, + "rewards/mask_iou_reward": 0.8278024914153017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.675152838230133, + "rewards/thk_ans_format_reward": 1.0, + "step": 2234, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.47917938232422, + "epoch": 7.549747048903878, + "grad_norm": 21.538090314289647, + "kl": 0.4794921875, + "learning_rate": 3.70777027027027e-07, + "loss": 0.0005, + "reward": 3.4606536626815796, + "reward_std": 0.2695749457925558, + "rewards/final_reward": 1.6050519190494144, + "rewards/mask_iou_reward": 0.8025259595247072, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.543986976146698, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 2235, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.2395935058594, + "epoch": 7.5531197301854975, + "grad_norm": 13.49060468083085, + "kl": 0.42578125, + "learning_rate": 3.704954954954955e-07, + "loss": 0.0004, + "reward": 3.408440351486206, + "reward_std": 0.4749724715948105, + "rewards/final_reward": 1.6407366601027256, + "rewards/mask_iou_reward": 0.8203683300513628, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.4917737245559692, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 2236, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.625, + "epoch": 7.556492411467117, + "grad_norm": 8.739897004231855, + "kl": 0.4140625, + "learning_rate": 3.7021396396396395e-07, + "loss": 0.0004, + "reward": 3.3929598331451416, + "reward_std": 0.11936522647738457, + "rewards/final_reward": 1.5154707608922555, + "rewards/mask_iou_reward": 0.7577353804461278, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3929598331451416, + "rewards/thk_ans_format_reward": 1.0, + "step": 2237, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.1041717529297, + "epoch": 7.559865092748735, + "grad_norm": 5.920106396877477, + "kl": 0.4296875, + "learning_rate": 3.6993243243243246e-07, + "loss": 0.0004, + "reward": 3.253367304801941, + "reward_std": 0.18102075904607773, + "rewards/final_reward": 0.6303426610613139, + "rewards/mask_iou_reward": 0.31517133053065693, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2533673644065857, + "rewards/thk_ans_format_reward": 1.0, + "step": 2238, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.55208587646484, + "epoch": 7.5632377740303545, + "grad_norm": 9.261427448407403, + "kl": 0.4267578125, + "learning_rate": 3.696509009009009e-07, + "loss": 0.0005, + "reward": 3.3558290004730225, + "reward_std": 0.1935933530330658, + "rewards/final_reward": 1.7297704820722961, + "rewards/mask_iou_reward": 0.8648852410361481, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.355828881263733, + "rewards/thk_ans_format_reward": 1.0, + "step": 2239, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.40625, + "epoch": 7.566610455311973, + "grad_norm": 8.221685272658684, + "kl": 0.4716796875, + "learning_rate": 3.6936936936936933e-07, + "loss": 0.0005, + "reward": 3.681155204772949, + "reward_std": 0.026889142580330372, + "rewards/final_reward": 1.117077239786284, + "rewards/mask_iou_reward": 0.558538619893142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6811550855636597, + "rewards/thk_ans_format_reward": 1.0, + "step": 2240, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.56250762939453, + "epoch": 7.569983136593592, + "grad_norm": 18.127114567662925, + "kl": 0.4677734375, + "learning_rate": 3.690878378378378e-07, + "loss": 0.0005, + "reward": 3.411197304725647, + "reward_std": 0.08188456669449806, + "rewards/final_reward": 1.3459034129007557, + "rewards/mask_iou_reward": 0.6729517064503778, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4111971855163574, + "rewards/thk_ans_format_reward": 1.0, + "step": 2241, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.89583587646484, + "epoch": 7.5733558178752105, + "grad_norm": 12.244729010105242, + "kl": 0.76171875, + "learning_rate": 3.6880630630630625e-07, + "loss": 0.0008, + "reward": 3.693773627281189, + "reward_std": 0.11770599335432053, + "rewards/final_reward": 1.858820377313907, + "rewards/mask_iou_reward": 0.9294101886569535, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6937735080718994, + "rewards/thk_ans_format_reward": 1.0, + "step": 2242, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.14583587646484, + "epoch": 7.57672849915683, + "grad_norm": 10.033573999591129, + "kl": 0.4765625, + "learning_rate": 3.6852477477477476e-07, + "loss": 0.0005, + "reward": 3.378702402114868, + "reward_std": 0.1148904599249363, + "rewards/final_reward": 1.559600441550097, + "rewards/mask_iou_reward": 0.7798002207750485, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3787025213241577, + "rewards/thk_ans_format_reward": 1.0, + "step": 2243, + "think_completion_length": 9.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.4166717529297, + "epoch": 7.580101180438449, + "grad_norm": 6.1941069990035835, + "kl": 0.453125, + "learning_rate": 3.682432432432432e-07, + "loss": 0.0004, + "reward": 3.464112401008606, + "reward_std": 0.08145070215687156, + "rewards/final_reward": 1.6657504073543956, + "rewards/mask_iou_reward": 0.8328752036771978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4641121625900269, + "rewards/thk_ans_format_reward": 1.0, + "step": 2244, + "think_completion_length": 6.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.6979217529297, + "epoch": 7.583473861720067, + "grad_norm": 30.140629304396885, + "kl": 0.4091796875, + "learning_rate": 3.679617117117117e-07, + "loss": 0.0004, + "reward": 3.6752060651779175, + "reward_std": 0.03587649203836918, + "rewards/final_reward": 1.2118360177049143, + "rewards/mask_iou_reward": 0.6059180088524572, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6752062439918518, + "rewards/thk_ans_format_reward": 1.0, + "step": 2245, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.1041717529297, + "epoch": 7.586846543001687, + "grad_norm": 7.843757555164772, + "kl": 0.5078125, + "learning_rate": 3.6768018018018015e-07, + "loss": 0.0005, + "reward": 3.5074230432510376, + "reward_std": 0.14821650087833405, + "rewards/final_reward": 1.4525049018818945, + "rewards/mask_iou_reward": 0.7262524509409473, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5074230432510376, + "rewards/thk_ans_format_reward": 1.0, + "step": 2246, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.87500762939453, + "epoch": 7.590219224283305, + "grad_norm": 37.07479306445186, + "kl": 0.4560546875, + "learning_rate": 3.673986486486486e-07, + "loss": 0.0005, + "reward": 3.6342554092407227, + "reward_std": 0.12161976844072342, + "rewards/final_reward": 1.6356422989189932, + "rewards/mask_iou_reward": 0.8178211494594966, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6342553496360779, + "rewards/thk_ans_format_reward": 1.0, + "step": 2247, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.45833587646484, + "epoch": 7.593591905564924, + "grad_norm": 13.198979193087293, + "kl": 0.486328125, + "learning_rate": 3.671171171171171e-07, + "loss": 0.0005, + "reward": 3.4941141605377197, + "reward_std": 0.12330342456698418, + "rewards/final_reward": 1.6368577190460187, + "rewards/mask_iou_reward": 0.8184288595230094, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4941142201423645, + "rewards/thk_ans_format_reward": 1.0, + "step": 2248, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.2604217529297, + "epoch": 7.596964586846543, + "grad_norm": 14.38327466152256, + "kl": 0.458984375, + "learning_rate": 3.668355855855856e-07, + "loss": 0.0005, + "reward": 3.3377726078033447, + "reward_std": 0.12106435745954514, + "rewards/final_reward": 1.4980455248753946, + "rewards/mask_iou_reward": 0.7490227624376973, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3377727270126343, + "rewards/thk_ans_format_reward": 1.0, + "step": 2249, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.89584350585938, + "epoch": 7.600337268128162, + "grad_norm": 17.91416919788979, + "kl": 0.763671875, + "learning_rate": 3.6655405405405404e-07, + "loss": 0.0008, + "reward": 3.4967925548553467, + "reward_std": 0.039044877514243126, + "rewards/final_reward": 1.9765416585709925, + "rewards/mask_iou_reward": 0.9882708292854963, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4967926740646362, + "rewards/thk_ans_format_reward": 1.0, + "step": 2250, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.31251525878906, + "epoch": 7.60370994940978, + "grad_norm": 7.692315822711692, + "kl": 0.4248046875, + "learning_rate": 3.662725225225225e-07, + "loss": 0.0004, + "reward": 3.5274364948272705, + "reward_std": 0.24068910256028175, + "rewards/final_reward": 1.848031531631317, + "rewards/mask_iou_reward": 0.9240157658156585, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.6107696294784546, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 2251, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.82291793823242, + "epoch": 7.6070826306914, + "grad_norm": 24.759656039202838, + "kl": 0.54296875, + "learning_rate": 3.6599099099099096e-07, + "loss": 0.0006, + "reward": 3.594699501991272, + "reward_std": 0.03100848849862814, + "rewards/final_reward": 1.705501867635372, + "rewards/mask_iou_reward": 0.852750933817686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5946993827819824, + "rewards/thk_ans_format_reward": 1.0, + "step": 2252, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.28125381469727, + "epoch": 7.610455311973018, + "grad_norm": 12.021081013771163, + "kl": 0.4404296875, + "learning_rate": 3.657094594594595e-07, + "loss": 0.0004, + "reward": 3.546002149581909, + "reward_std": 0.061210453510284424, + "rewards/final_reward": 1.1353577830401829, + "rewards/mask_iou_reward": 0.5676788915200914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5460022687911987, + "rewards/thk_ans_format_reward": 1.0, + "step": 2253, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.44791793823242, + "epoch": 7.613827993254637, + "grad_norm": 11.792513231710862, + "kl": 0.525390625, + "learning_rate": 3.6542792792792794e-07, + "loss": 0.0006, + "reward": 3.6590678691864014, + "reward_std": 0.03704315610229969, + "rewards/final_reward": 1.9048289909962175, + "rewards/mask_iou_reward": 0.9524144954981087, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6590678095817566, + "rewards/thk_ans_format_reward": 1.0, + "step": 2254, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.7291717529297, + "epoch": 7.617200674536257, + "grad_norm": 8.792350098249111, + "kl": 0.5498046875, + "learning_rate": 3.651463963963964e-07, + "loss": 0.0006, + "reward": 3.502922534942627, + "reward_std": 0.029376371763646603, + "rewards/final_reward": 1.2339888142029136, + "rewards/mask_iou_reward": 0.6169944071014568, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5029225945472717, + "rewards/thk_ans_format_reward": 1.0, + "step": 2255, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.2916717529297, + "epoch": 7.620573355817875, + "grad_norm": 16.987057100099697, + "kl": 0.583984375, + "learning_rate": 3.6486486486486486e-07, + "loss": 0.0007, + "reward": 3.826703906059265, + "reward_std": 0.06088973954319954, + "rewards/final_reward": 1.7263040467134716, + "rewards/mask_iou_reward": 0.8631520233567358, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8267039060592651, + "rewards/thk_ans_format_reward": 1.0, + "step": 2256, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.96875, + "epoch": 7.623946037099494, + "grad_norm": 23.764417563866544, + "kl": 0.4296875, + "learning_rate": 3.645833333333333e-07, + "loss": 0.0004, + "reward": 3.435308575630188, + "reward_std": 0.062496624886989594, + "rewards/final_reward": 1.8658596923240005, + "rewards/mask_iou_reward": 0.9329298461620003, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4353084564208984, + "rewards/thk_ans_format_reward": 1.0, + "step": 2257, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.7604217529297, + "epoch": 7.627318718381113, + "grad_norm": 10.45185473996097, + "kl": 0.53515625, + "learning_rate": 3.6430180180180183e-07, + "loss": 0.0005, + "reward": 3.4529281854629517, + "reward_std": 0.11622785404324532, + "rewards/final_reward": 1.047259343234454, + "rewards/mask_iou_reward": 0.523629671617227, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4529281854629517, + "rewards/thk_ans_format_reward": 1.0, + "step": 2258, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.76041793823242, + "epoch": 7.630691399662732, + "grad_norm": 9.249642151291475, + "kl": 0.896484375, + "learning_rate": 3.640202702702703e-07, + "loss": 0.0009, + "reward": 3.285198450088501, + "reward_std": 0.06478903815150261, + "rewards/final_reward": 0.8379335323225162, + "rewards/mask_iou_reward": 0.4189667661612581, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2851983308792114, + "rewards/thk_ans_format_reward": 1.0, + "step": 2259, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.93750381469727, + "epoch": 7.63406408094435, + "grad_norm": 10.101942600327556, + "kl": 0.46484375, + "learning_rate": 3.637387387387387e-07, + "loss": 0.0005, + "reward": 3.510848045349121, + "reward_std": 0.07667689025402069, + "rewards/final_reward": 1.0392154602383656, + "rewards/mask_iou_reward": 0.5196077301191828, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5108479857444763, + "rewards/thk_ans_format_reward": 1.0, + "step": 2260, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.71875, + "epoch": 7.63743676222597, + "grad_norm": 14.927874216703648, + "kl": 0.61328125, + "learning_rate": 3.6345720720720716e-07, + "loss": 0.0006, + "reward": 3.684826970100403, + "reward_std": 0.05714831594377756, + "rewards/final_reward": 1.916309884839134, + "rewards/mask_iou_reward": 0.958154942419567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6848266124725342, + "rewards/thk_ans_format_reward": 1.0, + "step": 2261, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.67708587646484, + "epoch": 7.640809443507589, + "grad_norm": 41.60265407330587, + "kl": 0.4482421875, + "learning_rate": 3.631756756756756e-07, + "loss": 0.0005, + "reward": 3.417069673538208, + "reward_std": 0.10894013848155737, + "rewards/final_reward": 1.7192058449284477, + "rewards/mask_iou_reward": 0.8596029224642239, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4274863004684448, + "rewards/thk_ans_format_reward": 1.0, + "step": 2262, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.3125, + "epoch": 7.644182124789207, + "grad_norm": 8.390634669842632, + "kl": 0.4814453125, + "learning_rate": 3.6289414414414414e-07, + "loss": 0.0005, + "reward": 3.4856581687927246, + "reward_std": 0.14437636360526085, + "rewards/final_reward": 1.4299188208280276, + "rewards/mask_iou_reward": 0.7149594104140138, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4856582283973694, + "rewards/thk_ans_format_reward": 1.0, + "step": 2263, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.80209350585938, + "epoch": 7.6475548060708265, + "grad_norm": 15.907862387941135, + "kl": 1.001953125, + "learning_rate": 3.626126126126126e-07, + "loss": 0.001, + "reward": 3.6028738021850586, + "reward_std": 0.0683306735008955, + "rewards/final_reward": 1.3892481459049233, + "rewards/mask_iou_reward": 0.6946240729524616, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6028736233711243, + "rewards/thk_ans_format_reward": 1.0, + "step": 2264, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.4479217529297, + "epoch": 7.650927487352445, + "grad_norm": 11.635154219046655, + "kl": 0.4150390625, + "learning_rate": 3.6233108108108106e-07, + "loss": 0.0004, + "reward": 3.636030435562134, + "reward_std": 0.054130956530570984, + "rewards/final_reward": 1.567502128195345, + "rewards/mask_iou_reward": 0.7837510640976725, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.636030375957489, + "rewards/thk_ans_format_reward": 1.0, + "step": 2265, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.02083587646484, + "epoch": 7.654300168634064, + "grad_norm": 13.225457282383763, + "kl": 0.529296875, + "learning_rate": 3.620495495495495e-07, + "loss": 0.0005, + "reward": 3.525128722190857, + "reward_std": 0.054051365703344345, + "rewards/final_reward": 1.942525280419543, + "rewards/mask_iou_reward": 0.9712626402097715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5251285433769226, + "rewards/thk_ans_format_reward": 1.0, + "step": 2266, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.9375, + "epoch": 7.6576728499156825, + "grad_norm": 7.194650640225918, + "kl": 0.583984375, + "learning_rate": 3.61768018018018e-07, + "loss": 0.0006, + "reward": 3.3762385845184326, + "reward_std": 0.1294691450893879, + "rewards/final_reward": 1.1971829463789943, + "rewards/mask_iou_reward": 0.5985914731894971, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3762387037277222, + "rewards/thk_ans_format_reward": 1.0, + "step": 2267, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.34375762939453, + "epoch": 7.661045531197302, + "grad_norm": 8.525852262401335, + "kl": 0.4208984375, + "learning_rate": 3.614864864864865e-07, + "loss": 0.0004, + "reward": 3.744857430458069, + "reward_std": 0.030906444415450096, + "rewards/final_reward": 1.4886834060598715, + "rewards/mask_iou_reward": 0.7443417030299357, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7448573112487793, + "rewards/thk_ans_format_reward": 1.0, + "step": 2268, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.18750762939453, + "epoch": 7.664418212478921, + "grad_norm": 16.675502796446537, + "kl": 0.458984375, + "learning_rate": 3.6120495495495495e-07, + "loss": 0.0004, + "reward": 3.4693796634674072, + "reward_std": 0.13334699161350727, + "rewards/final_reward": 1.0773209848085428, + "rewards/mask_iou_reward": 0.5386604924042714, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4693796038627625, + "rewards/thk_ans_format_reward": 1.0, + "step": 2269, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5729217529297, + "epoch": 7.6677908937605395, + "grad_norm": 8.558349809131897, + "kl": 0.419921875, + "learning_rate": 3.609234234234234e-07, + "loss": 0.0004, + "reward": 3.4815101623535156, + "reward_std": 0.07516926433891058, + "rewards/final_reward": 1.1391895798047658, + "rewards/mask_iou_reward": 0.5695947899023829, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4815101027488708, + "rewards/thk_ans_format_reward": 1.0, + "step": 2270, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.62500762939453, + "epoch": 7.671163575042159, + "grad_norm": 8.90441635462389, + "kl": 0.474609375, + "learning_rate": 3.606418918918919e-07, + "loss": 0.0005, + "reward": 3.58591091632843, + "reward_std": 0.052635351195931435, + "rewards/final_reward": 1.8259667105500643, + "rewards/mask_iou_reward": 0.9129833552750322, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5859108567237854, + "rewards/thk_ans_format_reward": 1.0, + "step": 2271, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.20833587646484, + "epoch": 7.674536256323777, + "grad_norm": 18.882738239796144, + "kl": 0.6806640625, + "learning_rate": 3.6036036036036033e-07, + "loss": 0.0007, + "reward": 3.450736880302429, + "reward_std": 0.12018753960728645, + "rewards/final_reward": 1.5825101953239598, + "rewards/mask_iou_reward": 0.7912550976619799, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4507368803024292, + "rewards/thk_ans_format_reward": 1.0, + "step": 2272, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.43750762939453, + "epoch": 7.677908937605396, + "grad_norm": 8.664488502796393, + "kl": 0.4912109375, + "learning_rate": 3.6007882882882885e-07, + "loss": 0.0005, + "reward": 3.5445717573165894, + "reward_std": 0.05564088374376297, + "rewards/final_reward": 1.3317285713213023, + "rewards/mask_iou_reward": 0.6658642856606511, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5445716381072998, + "rewards/thk_ans_format_reward": 1.0, + "step": 2273, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.58333587646484, + "epoch": 7.681281618887015, + "grad_norm": 34.33115400832227, + "kl": 0.4326171875, + "learning_rate": 3.597972972972973e-07, + "loss": 0.0005, + "reward": 3.6770211458206177, + "reward_std": 0.03603087249211967, + "rewards/final_reward": 1.1484784869257847, + "rewards/mask_iou_reward": 0.5742392434628923, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6770211458206177, + "rewards/thk_ans_format_reward": 1.0, + "step": 2274, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.65625, + "epoch": 7.684654300168634, + "grad_norm": 29.052257351781336, + "kl": 0.4716796875, + "learning_rate": 3.5951576576576577e-07, + "loss": 0.0005, + "reward": 3.1839051246643066, + "reward_std": 0.17020989954471588, + "rewards/final_reward": 1.1850262935917633, + "rewards/mask_iou_reward": 0.5925131467958816, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.183905005455017, + "rewards/thk_ans_format_reward": 1.0, + "step": 2275, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.09375, + "epoch": 7.688026981450253, + "grad_norm": 39.52562117914507, + "kl": 0.859375, + "learning_rate": 3.5923423423423423e-07, + "loss": 0.0009, + "reward": 3.5432727336883545, + "reward_std": 0.08223596028983593, + "rewards/final_reward": 1.9002105295025138, + "rewards/mask_iou_reward": 0.9501052647512569, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5432727932929993, + "rewards/thk_ans_format_reward": 1.0, + "step": 2276, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.6666717529297, + "epoch": 7.691399662731872, + "grad_norm": 16.67426749316441, + "kl": 0.529296875, + "learning_rate": 3.589527027027027e-07, + "loss": 0.0006, + "reward": 3.728038787841797, + "reward_std": 0.0844535268843174, + "rewards/final_reward": 1.647716684034463, + "rewards/mask_iou_reward": 0.8238583420172315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7280389070510864, + "rewards/thk_ans_format_reward": 1.0, + "step": 2277, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 126.72916793823242, + "epoch": 7.694772344013491, + "grad_norm": 7.016987101308896, + "kl": 0.623046875, + "learning_rate": 3.5867117117117115e-07, + "loss": 0.0006, + "reward": 3.1386752128601074, + "reward_std": 0.13225263357162476, + "rewards/final_reward": 0.19329672988784663, + "rewards/mask_iou_reward": 0.09664836494392332, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1386749744415283, + "rewards/thk_ans_format_reward": 1.0, + "step": 2278, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.5104217529297, + "epoch": 7.698145025295109, + "grad_norm": 8.83488013053066, + "kl": 0.58203125, + "learning_rate": 3.5838963963963967e-07, + "loss": 0.0006, + "reward": 3.4244601726531982, + "reward_std": 0.0693696178495884, + "rewards/final_reward": 0.7691967355950156, + "rewards/mask_iou_reward": 0.3845983677975078, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.424459993839264, + "rewards/thk_ans_format_reward": 1.0, + "step": 2279, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.3229217529297, + "epoch": 7.701517706576729, + "grad_norm": 17.366672119479563, + "kl": 0.5244140625, + "learning_rate": 3.5810810810810807e-07, + "loss": 0.0005, + "reward": 3.5201783180236816, + "reward_std": 0.07162079215049744, + "rewards/final_reward": 1.5854306102070996, + "rewards/mask_iou_reward": 0.7927153051035498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.520177960395813, + "rewards/thk_ans_format_reward": 1.0, + "step": 2280, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.3229217529297, + "epoch": 7.704890387858347, + "grad_norm": 11.9320751465435, + "kl": 0.4443359375, + "learning_rate": 3.5782657657657653e-07, + "loss": 0.0005, + "reward": 3.6812453269958496, + "reward_std": 0.037463925778865814, + "rewards/final_reward": 1.7041272989205765, + "rewards/mask_iou_reward": 0.8520636494602882, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.68124520778656, + "rewards/thk_ans_format_reward": 1.0, + "step": 2281, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.5625, + "epoch": 7.708263069139966, + "grad_norm": 27.22726471913835, + "kl": 0.494140625, + "learning_rate": 3.57545045045045e-07, + "loss": 0.0005, + "reward": 3.6677199602127075, + "reward_std": 0.07058407552540302, + "rewards/final_reward": 1.2181321919761752, + "rewards/mask_iou_reward": 0.6090660959880876, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6677199602127075, + "rewards/thk_ans_format_reward": 1.0, + "step": 2282, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.09375, + "epoch": 7.7116357504215856, + "grad_norm": 12.482225650157211, + "kl": 0.51171875, + "learning_rate": 3.5726351351351346e-07, + "loss": 0.0005, + "reward": 3.7093077898025513, + "reward_std": 0.06627171486616135, + "rewards/final_reward": 1.5520777488098996, + "rewards/mask_iou_reward": 0.7760388744049498, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7093076705932617, + "rewards/thk_ans_format_reward": 1.0, + "step": 2283, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.36459350585938, + "epoch": 7.715008431703204, + "grad_norm": 7.097840578621795, + "kl": 0.451171875, + "learning_rate": 3.5698198198198197e-07, + "loss": 0.0005, + "reward": 3.471312642097473, + "reward_std": 0.09290103241801262, + "rewards/final_reward": 1.309364779661415, + "rewards/mask_iou_reward": 0.6546823898307075, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4713128805160522, + "rewards/thk_ans_format_reward": 1.0, + "step": 2284, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.34375381469727, + "epoch": 7.718381112984823, + "grad_norm": 16.776373577928087, + "kl": 1.1015625, + "learning_rate": 3.5670045045045043e-07, + "loss": 0.0011, + "reward": 3.419792890548706, + "reward_std": 0.1175164058804512, + "rewards/final_reward": 1.0883838774256862, + "rewards/mask_iou_reward": 0.5441919387128431, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4197928309440613, + "rewards/thk_ans_format_reward": 1.0, + "step": 2285, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 131.1979217529297, + "epoch": 7.721753794266442, + "grad_norm": 9.676650940022329, + "kl": 0.509765625, + "learning_rate": 3.564189189189189e-07, + "loss": 0.0006, + "reward": 3.515068531036377, + "reward_std": 0.15917960554361343, + "rewards/final_reward": 1.503861021723187, + "rewards/mask_iou_reward": 0.7519305108615935, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5150684714317322, + "rewards/thk_ans_format_reward": 1.0, + "step": 2286, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.9479217529297, + "epoch": 7.725126475548061, + "grad_norm": 20.745195561069806, + "kl": 0.55859375, + "learning_rate": 3.5613738738738735e-07, + "loss": 0.0006, + "reward": 3.739573359489441, + "reward_std": 0.01828201860189438, + "rewards/final_reward": 1.9107846618437416, + "rewards/mask_iou_reward": 0.9553923309218708, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7395732998847961, + "rewards/thk_ans_format_reward": 1.0, + "step": 2287, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.50000762939453, + "epoch": 7.728499156829679, + "grad_norm": 14.222149311002784, + "kl": 0.419921875, + "learning_rate": 3.558558558558558e-07, + "loss": 0.0004, + "reward": 3.4065486192703247, + "reward_std": 0.08904951438307762, + "rewards/final_reward": 1.4015213427144408, + "rewards/mask_iou_reward": 0.7007606713572204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.40654855966568, + "rewards/thk_ans_format_reward": 1.0, + "step": 2288, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.4791717529297, + "epoch": 7.7318718381112985, + "grad_norm": 8.227714520886847, + "kl": 0.416015625, + "learning_rate": 3.555743243243243e-07, + "loss": 0.0004, + "reward": 3.526465058326721, + "reward_std": 0.037529608234763145, + "rewards/final_reward": 1.8010980479365568, + "rewards/mask_iou_reward": 0.9005490239682784, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5264650583267212, + "rewards/thk_ans_format_reward": 1.0, + "step": 2289, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.58333587646484, + "epoch": 7.735244519392918, + "grad_norm": 10.560115501176965, + "kl": 0.4404296875, + "learning_rate": 3.552927927927928e-07, + "loss": 0.0005, + "reward": 3.440025568008423, + "reward_std": 0.27941257879137993, + "rewards/final_reward": 1.1915034510081766, + "rewards/mask_iou_reward": 0.5957517255040883, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.565025508403778, + "rewards/thk_ans_format_reward": 0.9375, + "step": 2290, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.92708587646484, + "epoch": 7.738617200674536, + "grad_norm": 35.51630293724726, + "kl": 0.572265625, + "learning_rate": 3.5501126126126125e-07, + "loss": 0.0006, + "reward": 3.538991093635559, + "reward_std": 0.08621177216991782, + "rewards/final_reward": 1.6881863003564102, + "rewards/mask_iou_reward": 0.8440931501782051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.538991093635559, + "rewards/thk_ans_format_reward": 1.0, + "step": 2291, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.60416793823242, + "epoch": 7.7419898819561555, + "grad_norm": 9.505637948818205, + "kl": 0.46875, + "learning_rate": 3.547297297297297e-07, + "loss": 0.0005, + "reward": 3.70369029045105, + "reward_std": 0.08660473302006721, + "rewards/final_reward": 1.4734322024985178, + "rewards/mask_iou_reward": 0.7367161012492589, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7036903500556946, + "rewards/thk_ans_format_reward": 1.0, + "step": 2292, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.9895896911621, + "epoch": 7.745362563237774, + "grad_norm": 10.484695069513347, + "kl": 0.416015625, + "learning_rate": 3.5444819819819817e-07, + "loss": 0.0004, + "reward": 3.4020389318466187, + "reward_std": 0.07059112749993801, + "rewards/final_reward": 1.0682273746109578, + "rewards/mask_iou_reward": 0.5341136873054789, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.402038812637329, + "rewards/thk_ans_format_reward": 1.0, + "step": 2293, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.90625381469727, + "epoch": 7.748735244519393, + "grad_norm": 14.513575678757574, + "kl": 0.4931640625, + "learning_rate": 3.541666666666667e-07, + "loss": 0.0005, + "reward": 3.4654370546340942, + "reward_std": 0.048545608296990395, + "rewards/final_reward": 0.7577805668806391, + "rewards/mask_iou_reward": 0.37889028344031955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4654370546340942, + "rewards/thk_ans_format_reward": 1.0, + "step": 2294, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.50001525878906, + "epoch": 7.7521079258010115, + "grad_norm": 41.83666830833403, + "kl": 0.630859375, + "learning_rate": 3.5388513513513514e-07, + "loss": 0.0006, + "reward": 3.5073028802871704, + "reward_std": 0.12978895753622055, + "rewards/final_reward": 1.9100693476261814, + "rewards/mask_iou_reward": 0.9550346738130907, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5073029398918152, + "rewards/thk_ans_format_reward": 1.0, + "step": 2295, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.2916717529297, + "epoch": 7.755480607082631, + "grad_norm": 9.347916089328256, + "kl": 0.4599609375, + "learning_rate": 3.536036036036036e-07, + "loss": 0.0005, + "reward": 3.6417770385742188, + "reward_std": 0.02596164494752884, + "rewards/final_reward": 1.6588071187782987, + "rewards/mask_iou_reward": 0.8294035593891493, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.641777217388153, + "rewards/thk_ans_format_reward": 1.0, + "step": 2296, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.96875, + "epoch": 7.75885328836425, + "grad_norm": 26.11197213835604, + "kl": 0.451171875, + "learning_rate": 3.5332207207207206e-07, + "loss": 0.0005, + "reward": 3.4054592847824097, + "reward_std": 0.06146854721009731, + "rewards/final_reward": 1.8495260696406592, + "rewards/mask_iou_reward": 0.9247630348203296, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4054594039916992, + "rewards/thk_ans_format_reward": 1.0, + "step": 2297, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.9479217529297, + "epoch": 7.762225969645868, + "grad_norm": 6.92012139038751, + "kl": 0.458984375, + "learning_rate": 3.530405405405405e-07, + "loss": 0.0005, + "reward": 3.5028148889541626, + "reward_std": 0.036012555472552776, + "rewards/final_reward": 1.466303298975642, + "rewards/mask_iou_reward": 0.733151649487821, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5028148293495178, + "rewards/thk_ans_format_reward": 1.0, + "step": 2298, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.78125762939453, + "epoch": 7.765598650927488, + "grad_norm": 8.461246166243656, + "kl": 0.431640625, + "learning_rate": 3.5275900900900904e-07, + "loss": 0.0004, + "reward": 3.5506240129470825, + "reward_std": 0.07442787801846862, + "rewards/final_reward": 1.5512045862225976, + "rewards/mask_iou_reward": 0.7756022931112988, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5506239533424377, + "rewards/thk_ans_format_reward": 1.0, + "step": 2299, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.75000762939453, + "epoch": 7.768971332209106, + "grad_norm": 11.326303220722192, + "kl": 0.447265625, + "learning_rate": 3.5247747747747745e-07, + "loss": 0.0004, + "reward": 3.489335775375366, + "reward_std": 0.025043433532118797, + "rewards/final_reward": 1.799468701569588, + "rewards/mask_iou_reward": 0.899734350784794, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.489335834980011, + "rewards/thk_ans_format_reward": 1.0, + "step": 2300, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.67708587646484, + "epoch": 7.772344013490725, + "grad_norm": 23.87880415190321, + "kl": 0.447265625, + "learning_rate": 3.521959459459459e-07, + "loss": 0.0004, + "reward": 3.2864990234375, + "reward_std": 0.07297203643247485, + "rewards/final_reward": 0.8304928473778955, + "rewards/mask_iou_reward": 0.41524642368894776, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2864991426467896, + "rewards/thk_ans_format_reward": 1.0, + "step": 2301, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.64583587646484, + "epoch": 7.775716694772344, + "grad_norm": 90.40553412999691, + "kl": 0.521484375, + "learning_rate": 3.5191441441441437e-07, + "loss": 0.0006, + "reward": 3.360445976257324, + "reward_std": 0.05471951887011528, + "rewards/final_reward": 1.0188568995105611, + "rewards/mask_iou_reward": 0.5094284497552806, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.36044579744339, + "rewards/thk_ans_format_reward": 1.0, + "step": 2302, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.82291793823242, + "epoch": 7.779089376053963, + "grad_norm": 12.670352448393869, + "kl": 0.52734375, + "learning_rate": 3.5163288288288283e-07, + "loss": 0.0006, + "reward": 3.590890645980835, + "reward_std": 0.08140116557478905, + "rewards/final_reward": 1.5870530239041354, + "rewards/mask_iou_reward": 0.7935265119520677, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5908905267715454, + "rewards/thk_ans_format_reward": 1.0, + "step": 2303, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.6041717529297, + "epoch": 7.782462057335582, + "grad_norm": 16.46874007266962, + "kl": 0.4052734375, + "learning_rate": 3.5135135135135134e-07, + "loss": 0.0004, + "reward": 3.6431690454483032, + "reward_std": 0.04836719110608101, + "rewards/final_reward": 1.8580852428852772, + "rewards/mask_iou_reward": 0.9290426214426386, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6431688070297241, + "rewards/thk_ans_format_reward": 1.0, + "step": 2304, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.5416717529297, + "epoch": 7.785834738617201, + "grad_norm": 7.347159424364246, + "kl": 0.4130859375, + "learning_rate": 3.510698198198198e-07, + "loss": 0.0004, + "reward": 3.4472914934158325, + "reward_std": 0.020406564697623253, + "rewards/final_reward": 1.7516853229805487, + "rewards/mask_iou_reward": 0.8758426614902743, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4472914934158325, + "rewards/thk_ans_format_reward": 1.0, + "step": 2305, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.86458587646484, + "epoch": 7.78920741989882, + "grad_norm": 19.455204429708385, + "kl": 0.4296875, + "learning_rate": 3.5078828828828826e-07, + "loss": 0.0004, + "reward": 3.352233409881592, + "reward_std": 0.11754608154296875, + "rewards/final_reward": 1.8694441956962824, + "rewards/mask_iou_reward": 0.9347220978481412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3522332906723022, + "rewards/thk_ans_format_reward": 1.0, + "step": 2306, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.59375, + "epoch": 7.792580101180438, + "grad_norm": 13.13219609139006, + "kl": 0.9287109375, + "learning_rate": 3.505067567567567e-07, + "loss": 0.0009, + "reward": 3.4515668153762817, + "reward_std": 0.1543382704257965, + "rewards/final_reward": 1.317968822124398, + "rewards/mask_iou_reward": 0.658984411062199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4515668749809265, + "rewards/thk_ans_format_reward": 1.0, + "step": 2307, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.8229217529297, + "epoch": 7.795952782462058, + "grad_norm": 11.740865831015478, + "kl": 0.59765625, + "learning_rate": 3.502252252252252e-07, + "loss": 0.0006, + "reward": 3.5365203619003296, + "reward_std": 0.039987629279494286, + "rewards/final_reward": 1.9206840736768225, + "rewards/mask_iou_reward": 0.9603420368384112, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5365204811096191, + "rewards/thk_ans_format_reward": 1.0, + "step": 2308, + "think_completion_length": 9.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5104217529297, + "epoch": 7.799325463743676, + "grad_norm": 8.861505071959428, + "kl": 0.587890625, + "learning_rate": 3.499436936936937e-07, + "loss": 0.0006, + "reward": 3.3772459030151367, + "reward_std": 0.08232882246375084, + "rewards/final_reward": 1.546701372882615, + "rewards/mask_iou_reward": 0.7733506864413076, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.377245843410492, + "rewards/thk_ans_format_reward": 1.0, + "step": 2309, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.89583587646484, + "epoch": 7.802698145025295, + "grad_norm": 11.730335589806394, + "kl": 0.443359375, + "learning_rate": 3.4966216216216216e-07, + "loss": 0.0004, + "reward": 2.9785395860671997, + "reward_std": 0.08616751432418823, + "rewards/final_reward": 0.6032473239090221, + "rewards/mask_iou_reward": 0.30162366195451107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.9785396456718445, + "rewards/thk_ans_format_reward": 1.0, + "step": 2310, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.89584350585938, + "epoch": 7.806070826306914, + "grad_norm": 12.6283200499224, + "kl": 0.4033203125, + "learning_rate": 3.493806306306306e-07, + "loss": 0.0004, + "reward": 3.658359169960022, + "reward_std": 0.06291536800563335, + "rewards/final_reward": 1.621674238125308, + "rewards/mask_iou_reward": 0.810837119062654, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6583590507507324, + "rewards/thk_ans_format_reward": 1.0, + "step": 2311, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.21875762939453, + "epoch": 7.809443507588533, + "grad_norm": 6.784044716299772, + "kl": 0.4697265625, + "learning_rate": 3.490990990990991e-07, + "loss": 0.0005, + "reward": 3.6008437871932983, + "reward_std": 0.03371572960168123, + "rewards/final_reward": 1.9011566973384175, + "rewards/mask_iou_reward": 0.9505783486692088, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6008437871932983, + "rewards/thk_ans_format_reward": 1.0, + "step": 2312, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.30209350585938, + "epoch": 7.812816188870151, + "grad_norm": 27.340968120934896, + "kl": 0.6171875, + "learning_rate": 3.4881756756756754e-07, + "loss": 0.0006, + "reward": 3.232995629310608, + "reward_std": 0.08912499528378248, + "rewards/final_reward": 0.8706868650440512, + "rewards/mask_iou_reward": 0.4353434325220256, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2329955101013184, + "rewards/thk_ans_format_reward": 1.0, + "step": 2313, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.89584350585938, + "epoch": 7.8161888701517706, + "grad_norm": 8.13050302174962, + "kl": 0.4091796875, + "learning_rate": 3.4853603603603605e-07, + "loss": 0.0004, + "reward": 3.358641743659973, + "reward_std": 0.034882666543126106, + "rewards/final_reward": 1.7063491811286031, + "rewards/mask_iou_reward": 0.8531745905643016, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.358641803264618, + "rewards/thk_ans_format_reward": 1.0, + "step": 2314, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.46875762939453, + "epoch": 7.81956155143339, + "grad_norm": 15.113967431164683, + "kl": 0.4560546875, + "learning_rate": 3.482545045045045e-07, + "loss": 0.0005, + "reward": 3.599574327468872, + "reward_std": 0.0958711989223957, + "rewards/final_reward": 1.6495412259843159, + "rewards/mask_iou_reward": 0.8247706129921579, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5995741486549377, + "rewards/thk_ans_format_reward": 1.0, + "step": 2315, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.75000762939453, + "epoch": 7.822934232715008, + "grad_norm": 26.43129837171513, + "kl": 0.443359375, + "learning_rate": 3.47972972972973e-07, + "loss": 0.0004, + "reward": 3.4044047594070435, + "reward_std": 0.06186537444591522, + "rewards/final_reward": 1.8200695665581779, + "rewards/mask_iou_reward": 0.9100347832790889, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4044047594070435, + "rewards/thk_ans_format_reward": 1.0, + "step": 2316, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.98958587646484, + "epoch": 7.8263069139966275, + "grad_norm": 10.395822624159122, + "kl": 0.5234375, + "learning_rate": 3.4769144144144144e-07, + "loss": 0.0005, + "reward": 3.568729519844055, + "reward_std": 0.07372662238776684, + "rewards/final_reward": 1.661542540133941, + "rewards/mask_iou_reward": 0.8307712700669705, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5687294006347656, + "rewards/thk_ans_format_reward": 1.0, + "step": 2317, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.84375, + "epoch": 7.829679595278246, + "grad_norm": 18.712101724067143, + "kl": 0.583984375, + "learning_rate": 3.474099099099099e-07, + "loss": 0.0006, + "reward": 3.6224128007888794, + "reward_std": 0.06478509679436684, + "rewards/final_reward": 1.5744207148950102, + "rewards/mask_iou_reward": 0.7872103574475051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6224127411842346, + "rewards/thk_ans_format_reward": 1.0, + "step": 2318, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.12500762939453, + "epoch": 7.833052276559865, + "grad_norm": 249.85421546912946, + "kl": 0.443359375, + "learning_rate": 3.471283783783784e-07, + "loss": 0.0005, + "reward": 3.545035719871521, + "reward_std": 0.07090389914810658, + "rewards/final_reward": 1.1629425698182438, + "rewards/mask_iou_reward": 0.5814712849091219, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5450357794761658, + "rewards/thk_ans_format_reward": 1.0, + "step": 2319, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.15625, + "epoch": 7.8364249578414835, + "grad_norm": 9.60632427652472, + "kl": 0.439453125, + "learning_rate": 3.468468468468468e-07, + "loss": 0.0005, + "reward": 3.721726655960083, + "reward_std": 0.038542356342077255, + "rewards/final_reward": 1.8422141542666353, + "rewards/mask_iou_reward": 0.9211070771333176, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7217265963554382, + "rewards/thk_ans_format_reward": 1.0, + "step": 2320, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.02083587646484, + "epoch": 7.839797639123103, + "grad_norm": 8.201411040398995, + "kl": 0.43359375, + "learning_rate": 3.465653153153153e-07, + "loss": 0.0005, + "reward": 3.5359808206558228, + "reward_std": 0.07264266163110733, + "rewards/final_reward": 1.7804632002891472, + "rewards/mask_iou_reward": 0.8902316001445736, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5359808206558228, + "rewards/thk_ans_format_reward": 1.0, + "step": 2321, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.36458587646484, + "epoch": 7.843170320404722, + "grad_norm": 10.835979305273261, + "kl": 0.3759765625, + "learning_rate": 3.4628378378378374e-07, + "loss": 0.0004, + "reward": 3.568721055984497, + "reward_std": 0.04248126968741417, + "rewards/final_reward": 1.540176830992082, + "rewards/mask_iou_reward": 0.770088415496041, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5687209367752075, + "rewards/thk_ans_format_reward": 1.0, + "step": 2322, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.7604217529297, + "epoch": 7.8465430016863404, + "grad_norm": 8.787303455057133, + "kl": 0.38671875, + "learning_rate": 3.460022522522522e-07, + "loss": 0.0004, + "reward": 3.5989596843719482, + "reward_std": 0.09075170010328293, + "rewards/final_reward": 1.2405073472062438, + "rewards/mask_iou_reward": 0.6202536736031219, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5989595651626587, + "rewards/thk_ans_format_reward": 1.0, + "step": 2323, + "think_completion_length": 7.333333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.8854217529297, + "epoch": 7.84991568296796, + "grad_norm": 9.182053652495602, + "kl": 0.537109375, + "learning_rate": 3.457207207207207e-07, + "loss": 0.0006, + "reward": 3.511338710784912, + "reward_std": 0.07975371927022934, + "rewards/final_reward": 1.170090604027916, + "rewards/mask_iou_reward": 0.585045302013958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.511338710784912, + "rewards/thk_ans_format_reward": 1.0, + "step": 2324, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.87500762939453, + "epoch": 7.853288364249578, + "grad_norm": 13.68236372891141, + "kl": 0.52734375, + "learning_rate": 3.454391891891892e-07, + "loss": 0.0005, + "reward": 3.566367268562317, + "reward_std": 0.07451931945979595, + "rewards/final_reward": 0.9658934278265687, + "rewards/mask_iou_reward": 0.48294671391328436, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5663670301437378, + "rewards/thk_ans_format_reward": 1.0, + "step": 2325, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.92708587646484, + "epoch": 7.856661045531197, + "grad_norm": 7.843385990135139, + "kl": 0.4921875, + "learning_rate": 3.4515765765765763e-07, + "loss": 0.0005, + "reward": 3.4027241468429565, + "reward_std": 0.04318516911007464, + "rewards/final_reward": 1.8411135688835198, + "rewards/mask_iou_reward": 0.9205567844417599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4027240872383118, + "rewards/thk_ans_format_reward": 1.0, + "step": 2326, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.83333587646484, + "epoch": 7.860033726812816, + "grad_norm": 17.641573866474715, + "kl": 2.2646484375, + "learning_rate": 3.448761261261261e-07, + "loss": 0.0023, + "reward": 3.287583351135254, + "reward_std": 0.070755485445261, + "rewards/final_reward": 1.2760556416836235, + "rewards/mask_iou_reward": 0.6380278208418118, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2875832915306091, + "rewards/thk_ans_format_reward": 1.0, + "step": 2327, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.28125, + "epoch": 7.863406408094435, + "grad_norm": 11.039443903969092, + "kl": 0.404296875, + "learning_rate": 3.4459459459459456e-07, + "loss": 0.0004, + "reward": 3.6915663480758667, + "reward_std": 0.10650844499468803, + "rewards/final_reward": 1.8212546911576388, + "rewards/mask_iou_reward": 0.9106273455788194, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6915662288665771, + "rewards/thk_ans_format_reward": 1.0, + "step": 2328, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.93750762939453, + "epoch": 7.866779089376054, + "grad_norm": 45.22548356284776, + "kl": 0.46875, + "learning_rate": 3.4431306306306307e-07, + "loss": 0.0005, + "reward": 3.289353132247925, + "reward_std": 0.10025676898658276, + "rewards/final_reward": 0.9978360163727829, + "rewards/mask_iou_reward": 0.4989180081863914, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2893530130386353, + "rewards/thk_ans_format_reward": 1.0, + "step": 2329, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.33333587646484, + "epoch": 7.870151770657673, + "grad_norm": 10.64319600309174, + "kl": 0.3994140625, + "learning_rate": 3.4403153153153153e-07, + "loss": 0.0004, + "reward": 3.5440070629119873, + "reward_std": 0.03834127727895975, + "rewards/final_reward": 1.5940954495231145, + "rewards/mask_iou_reward": 0.7970477247615573, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5440071821212769, + "rewards/thk_ans_format_reward": 1.0, + "step": 2330, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.14583587646484, + "epoch": 7.873524451939292, + "grad_norm": 9.592694804363731, + "kl": 0.62890625, + "learning_rate": 3.4375e-07, + "loss": 0.0006, + "reward": 3.3139032125473022, + "reward_std": 0.1449567973613739, + "rewards/final_reward": 0.8608355036367921, + "rewards/mask_iou_reward": 0.43041775181839603, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3139033317565918, + "rewards/thk_ans_format_reward": 1.0, + "step": 2331, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.62500762939453, + "epoch": 7.87689713322091, + "grad_norm": 11.82364466612513, + "kl": 0.4453125, + "learning_rate": 3.4346846846846845e-07, + "loss": 0.0005, + "reward": 3.5688596963882446, + "reward_std": 0.08483656868338585, + "rewards/final_reward": 1.5885996944105767, + "rewards/mask_iou_reward": 0.7942998472052883, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.568859577178955, + "rewards/thk_ans_format_reward": 1.0, + "step": 2332, + "think_completion_length": 12.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.7604217529297, + "epoch": 7.88026981450253, + "grad_norm": 11.078307712657724, + "kl": 0.4521484375, + "learning_rate": 3.431869369369369e-07, + "loss": 0.0005, + "reward": 3.384488821029663, + "reward_std": 0.12777956575155258, + "rewards/final_reward": 0.9596014290310367, + "rewards/mask_iou_reward": 0.47980071451551837, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3844888806343079, + "rewards/thk_ans_format_reward": 1.0, + "step": 2333, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.6041717529297, + "epoch": 7.883642495784148, + "grad_norm": 13.09620604007018, + "kl": 0.544921875, + "learning_rate": 3.429054054054054e-07, + "loss": 0.0006, + "reward": 3.4537363052368164, + "reward_std": 0.1276137139648199, + "rewards/final_reward": 0.5759846275612032, + "rewards/mask_iou_reward": 0.2879923137806016, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4537363052368164, + "rewards/thk_ans_format_reward": 1.0, + "step": 2334, + "think_completion_length": 10.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.875, + "epoch": 7.887015177065767, + "grad_norm": 30.203500843574535, + "kl": 0.3544921875, + "learning_rate": 3.426238738738739e-07, + "loss": 0.0004, + "reward": 3.7364327907562256, + "reward_std": 0.03459780430421233, + "rewards/final_reward": 1.737842492664877, + "rewards/mask_iou_reward": 0.8689212463324385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7364325523376465, + "rewards/thk_ans_format_reward": 1.0, + "step": 2335, + "think_completion_length": 9.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.09375762939453, + "epoch": 7.8903878583473865, + "grad_norm": 12.063816563747155, + "kl": 0.5087890625, + "learning_rate": 3.4234234234234235e-07, + "loss": 0.0005, + "reward": 3.5211243629455566, + "reward_std": 0.04529313184320927, + "rewards/final_reward": 1.802233579994247, + "rewards/mask_iou_reward": 0.9011167899971235, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.521124243736267, + "rewards/thk_ans_format_reward": 1.0, + "step": 2336, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.61459350585938, + "epoch": 7.893760539629005, + "grad_norm": 7.861235953625273, + "kl": 0.568359375, + "learning_rate": 3.420608108108108e-07, + "loss": 0.0006, + "reward": 3.6086841821670532, + "reward_std": 0.03303397446870804, + "rewards/final_reward": 1.8981940915356716, + "rewards/mask_iou_reward": 0.9490970457678358, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6086838841438293, + "rewards/thk_ans_format_reward": 1.0, + "step": 2337, + "think_completion_length": 11.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.5625, + "epoch": 7.897133220910624, + "grad_norm": 17.37265934151549, + "kl": 0.4462890625, + "learning_rate": 3.4177927927927927e-07, + "loss": 0.0005, + "reward": 3.3253653049468994, + "reward_std": 0.1824297234416008, + "rewards/final_reward": 0.9615388733284267, + "rewards/mask_iou_reward": 0.48076943666421335, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.325365126132965, + "rewards/thk_ans_format_reward": 1.0, + "step": 2338, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.08334350585938, + "epoch": 7.900505902192243, + "grad_norm": 20.80358081412897, + "kl": 0.4697265625, + "learning_rate": 3.4149774774774773e-07, + "loss": 0.0005, + "reward": 3.2458066940307617, + "reward_std": 0.19842278212308884, + "rewards/final_reward": 1.1941345429684787, + "rewards/mask_iou_reward": 0.5970672714842393, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.2874733805656433, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2339, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.08333587646484, + "epoch": 7.903878583473862, + "grad_norm": 15.630954674126164, + "kl": 0.53125, + "learning_rate": 3.412162162162162e-07, + "loss": 0.0005, + "reward": 3.4466086626052856, + "reward_std": 0.11500649899244308, + "rewards/final_reward": 1.46870160960264, + "rewards/mask_iou_reward": 0.73435080480132, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4466084241867065, + "rewards/thk_ans_format_reward": 1.0, + "step": 2340, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.46875762939453, + "epoch": 7.90725126475548, + "grad_norm": 7.77783215283481, + "kl": 0.3798828125, + "learning_rate": 3.4093468468468465e-07, + "loss": 0.0004, + "reward": 3.701847553253174, + "reward_std": 0.044463444501161575, + "rewards/final_reward": 1.8407206420426043, + "rewards/mask_iou_reward": 0.9203603210213022, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7018474340438843, + "rewards/thk_ans_format_reward": 1.0, + "step": 2341, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.80208587646484, + "epoch": 7.9106239460370995, + "grad_norm": 7.17664934078942, + "kl": 0.4013671875, + "learning_rate": 3.406531531531531e-07, + "loss": 0.0004, + "reward": 3.54721200466156, + "reward_std": 0.04046872444450855, + "rewards/final_reward": 1.9069610779524413, + "rewards/mask_iou_reward": 0.9534805389762206, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5472122430801392, + "rewards/thk_ans_format_reward": 1.0, + "step": 2342, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.5625, + "epoch": 7.913996627318719, + "grad_norm": 14.106045514797199, + "kl": 0.3974609375, + "learning_rate": 3.4037162162162157e-07, + "loss": 0.0004, + "reward": 3.5020689964294434, + "reward_std": 0.10484147071838379, + "rewards/final_reward": 1.9132991048909953, + "rewards/mask_iou_reward": 0.9566495524454977, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5020688772201538, + "rewards/thk_ans_format_reward": 1.0, + "step": 2343, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.15625762939453, + "epoch": 7.917369308600337, + "grad_norm": 67.84555169518201, + "kl": 0.5234375, + "learning_rate": 3.400900900900901e-07, + "loss": 0.0005, + "reward": 3.628502130508423, + "reward_std": 0.040713533759117126, + "rewards/final_reward": 1.2533247159523904, + "rewards/mask_iou_reward": 0.6266623579761952, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6285019516944885, + "rewards/thk_ans_format_reward": 1.0, + "step": 2344, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.83334350585938, + "epoch": 7.920741989881956, + "grad_norm": 33.18071869400635, + "kl": 0.494140625, + "learning_rate": 3.3980855855855855e-07, + "loss": 0.0005, + "reward": 3.621130108833313, + "reward_std": 0.03444007970392704, + "rewards/final_reward": 1.7131870145275485, + "rewards/mask_iou_reward": 0.8565935072637743, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6211299896240234, + "rewards/thk_ans_format_reward": 1.0, + "step": 2345, + "think_completion_length": 11.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.4166717529297, + "epoch": 7.924114671163575, + "grad_norm": 21.952011179777173, + "kl": 0.4580078125, + "learning_rate": 3.39527027027027e-07, + "loss": 0.0005, + "reward": 3.218958258628845, + "reward_std": 0.07898985967040062, + "rewards/final_reward": 0.5945596088104014, + "rewards/mask_iou_reward": 0.2972798044052007, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2189582586288452, + "rewards/thk_ans_format_reward": 1.0, + "step": 2346, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.06250762939453, + "epoch": 7.927487352445194, + "grad_norm": 6.97931756465897, + "kl": 0.41796875, + "learning_rate": 3.3924549549549547e-07, + "loss": 0.0004, + "reward": 3.788905382156372, + "reward_std": 0.03340917080640793, + "rewards/final_reward": 1.8980523459951733, + "rewards/mask_iou_reward": 0.9490261729975866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7889053225517273, + "rewards/thk_ans_format_reward": 1.0, + "step": 2347, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.53125, + "epoch": 7.9308600337268125, + "grad_norm": 15.430277569447743, + "kl": 0.4609375, + "learning_rate": 3.3896396396396393e-07, + "loss": 0.0005, + "reward": 3.5355775356292725, + "reward_std": 0.04261211957782507, + "rewards/final_reward": 1.8832372240822794, + "rewards/mask_iou_reward": 0.9416186120411397, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.535577416419983, + "rewards/thk_ans_format_reward": 1.0, + "step": 2348, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.9270935058594, + "epoch": 7.934232715008432, + "grad_norm": 8.556554171502322, + "kl": 0.373046875, + "learning_rate": 3.3868243243243244e-07, + "loss": 0.0004, + "reward": 3.428499698638916, + "reward_std": 0.08785773441195488, + "rewards/final_reward": 1.4202347812155347, + "rewards/mask_iou_reward": 0.7101173906077674, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4284995198249817, + "rewards/thk_ans_format_reward": 1.0, + "step": 2349, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.14583587646484, + "epoch": 7.937605396290051, + "grad_norm": 20.645215597498492, + "kl": 0.50390625, + "learning_rate": 3.384009009009009e-07, + "loss": 0.0005, + "reward": 3.360377550125122, + "reward_std": 0.16720493882894516, + "rewards/final_reward": 1.595247778754447, + "rewards/mask_iou_reward": 0.7976238893772235, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3603774905204773, + "rewards/thk_ans_format_reward": 1.0, + "step": 2350, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.8854217529297, + "epoch": 7.940978077571669, + "grad_norm": 32.13190178817014, + "kl": 0.541015625, + "learning_rate": 3.3811936936936936e-07, + "loss": 0.0006, + "reward": 3.542497754096985, + "reward_std": 0.05872867442667484, + "rewards/final_reward": 1.6635123248896204, + "rewards/mask_iou_reward": 0.8317561624448102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.54249769449234, + "rewards/thk_ans_format_reward": 1.0, + "step": 2351, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.2916717529297, + "epoch": 7.944350758853289, + "grad_norm": 10.315882348214277, + "kl": 0.4404296875, + "learning_rate": 3.378378378378378e-07, + "loss": 0.0004, + "reward": 3.573422074317932, + "reward_std": 0.07881678268313408, + "rewards/final_reward": 1.6519748972757755, + "rewards/mask_iou_reward": 0.8259874486378878, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5734221935272217, + "rewards/thk_ans_format_reward": 1.0, + "step": 2352, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.5416793823242, + "epoch": 7.947723440134907, + "grad_norm": 7.729003667115754, + "kl": 0.732421875, + "learning_rate": 3.375563063063063e-07, + "loss": 0.0007, + "reward": 3.356159210205078, + "reward_std": 0.08650216832756996, + "rewards/final_reward": 1.3735585263480121, + "rewards/mask_iou_reward": 0.6867792631740061, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3561591506004333, + "rewards/thk_ans_format_reward": 1.0, + "step": 2353, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.52083587646484, + "epoch": 7.951096121416526, + "grad_norm": 5.855065824425488, + "kl": 0.4091796875, + "learning_rate": 3.372747747747748e-07, + "loss": 0.0006, + "reward": 3.6102631092071533, + "reward_std": 0.07107937522232533, + "rewards/final_reward": 1.53924420063235, + "rewards/mask_iou_reward": 0.769622100316175, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6102629899978638, + "rewards/thk_ans_format_reward": 1.0, + "step": 2354, + "think_completion_length": 9.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.0416717529297, + "epoch": 7.954468802698145, + "grad_norm": 22.629742151372223, + "kl": 0.41015625, + "learning_rate": 3.3699324324324326e-07, + "loss": 0.0004, + "reward": 2.8923134803771973, + "reward_std": 0.15287496149539948, + "rewards/final_reward": 0.804916964131339, + "rewards/mask_iou_reward": 0.4024584820656695, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.8923133313655853, + "rewards/thk_ans_format_reward": 1.0, + "step": 2355, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.17709350585938, + "epoch": 7.957841483979764, + "grad_norm": 9.726923091668409, + "kl": 0.4326171875, + "learning_rate": 3.367117117117117e-07, + "loss": 0.0004, + "reward": 3.207701802253723, + "reward_std": 0.061932358890771866, + "rewards/final_reward": 1.2355819249165891, + "rewards/mask_iou_reward": 0.6177909624582946, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2077017426490784, + "rewards/thk_ans_format_reward": 1.0, + "step": 2356, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.1041717529297, + "epoch": 7.961214165261383, + "grad_norm": 6.3255156854871, + "kl": 0.4150390625, + "learning_rate": 3.364301801801802e-07, + "loss": 0.0004, + "reward": 3.118018388748169, + "reward_std": 0.011369133368134499, + "rewards/final_reward": 0.4532947116263412, + "rewards/mask_iou_reward": 0.2266473558131706, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1180184483528137, + "rewards/thk_ans_format_reward": 1.0, + "step": 2357, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.6354217529297, + "epoch": 7.964586846543002, + "grad_norm": 6.205063191039621, + "kl": 0.443359375, + "learning_rate": 3.361486486486486e-07, + "loss": 0.0005, + "reward": 3.5695269107818604, + "reward_std": 0.14852624107152224, + "rewards/final_reward": 0.8570310423524563, + "rewards/mask_iou_reward": 0.42851552117622815, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5903602242469788, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2358, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.5416717529297, + "epoch": 7.967959527824621, + "grad_norm": 7.83157245622682, + "kl": 0.388671875, + "learning_rate": 3.358671171171171e-07, + "loss": 0.0004, + "reward": 3.6626497507095337, + "reward_std": 0.057364363223314285, + "rewards/final_reward": 1.9162213234292915, + "rewards/mask_iou_reward": 0.9581106617146458, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.662649691104889, + "rewards/thk_ans_format_reward": 1.0, + "step": 2359, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.84375762939453, + "epoch": 7.971332209106239, + "grad_norm": 44.272638856549584, + "kl": 0.4501953125, + "learning_rate": 3.3558558558558556e-07, + "loss": 0.0005, + "reward": 3.702326536178589, + "reward_std": 0.09001387841999531, + "rewards/final_reward": 1.7448316603806895, + "rewards/mask_iou_reward": 0.8724158301903447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7023264169692993, + "rewards/thk_ans_format_reward": 1.0, + "step": 2360, + "think_completion_length": 9.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.05209350585938, + "epoch": 7.974704890387859, + "grad_norm": 11.615592962528046, + "kl": 0.3505859375, + "learning_rate": 3.35304054054054e-07, + "loss": 0.0004, + "reward": 3.660933017730713, + "reward_std": 0.04490010812878609, + "rewards/final_reward": 1.7137576906076042, + "rewards/mask_iou_reward": 0.8568788453038021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6609334349632263, + "rewards/thk_ans_format_reward": 1.0, + "step": 2361, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.21875, + "epoch": 7.978077571669477, + "grad_norm": 21.061501022597145, + "kl": 0.39453125, + "learning_rate": 3.350225225225225e-07, + "loss": 0.0004, + "reward": 3.371769428253174, + "reward_std": 0.23365569114685059, + "rewards/final_reward": 1.7317698156014782, + "rewards/mask_iou_reward": 0.8658849078007391, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4134362936019897, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2362, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.0729217529297, + "epoch": 7.981450252951096, + "grad_norm": 18.606036025398936, + "kl": 0.3759765625, + "learning_rate": 3.3474099099099094e-07, + "loss": 0.0004, + "reward": 3.425795316696167, + "reward_std": 0.20433041267096996, + "rewards/final_reward": 1.7526200055850498, + "rewards/mask_iou_reward": 0.8763100027925249, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4466286897659302, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2363, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.96875762939453, + "epoch": 7.9848229342327155, + "grad_norm": 8.152504737474274, + "kl": 0.498046875, + "learning_rate": 3.3445945945945946e-07, + "loss": 0.0005, + "reward": 3.575013518333435, + "reward_std": 0.14411963429301977, + "rewards/final_reward": 1.5466455653104827, + "rewards/mask_iou_reward": 0.7733227826552413, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.595846951007843, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2364, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.6770935058594, + "epoch": 7.988195615514334, + "grad_norm": 10.160184176734472, + "kl": 0.390625, + "learning_rate": 3.341779279279279e-07, + "loss": 0.0004, + "reward": 3.7018014192581177, + "reward_std": 0.21830223500728607, + "rewards/final_reward": 1.923110668124186, + "rewards/mask_iou_reward": 0.961555334062093, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.7434679865837097, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2365, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.2604217529297, + "epoch": 7.991568296795953, + "grad_norm": 9.70498937587903, + "kl": 0.59765625, + "learning_rate": 3.338963963963964e-07, + "loss": 0.0006, + "reward": 3.6059956550598145, + "reward_std": 0.01825597556307912, + "rewards/final_reward": 1.8529482315714163, + "rewards/mask_iou_reward": 0.9264741157857082, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6059958934783936, + "rewards/thk_ans_format_reward": 1.0, + "step": 2366, + "think_completion_length": 12.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.6979217529297, + "epoch": 7.9949409780775715, + "grad_norm": 14.530691748123385, + "kl": 0.5234375, + "learning_rate": 3.3361486486486484e-07, + "loss": 0.0005, + "reward": 3.6428091526031494, + "reward_std": 0.07249030750244856, + "rewards/final_reward": 1.7027538774608706, + "rewards/mask_iou_reward": 0.8513769387304353, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6428090929985046, + "rewards/thk_ans_format_reward": 1.0, + "step": 2367, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.60526657104492, + "epoch": 7.998313659359191, + "grad_norm": 12.281907589005117, + "kl": 0.416015625, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0004, + "reward": 3.605751395225525, + "reward_std": 0.11550533585250378, + "rewards/final_reward": 1.4464381873724115, + "rewards/mask_iou_reward": 0.7232190936862057, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.605751395225525, + "rewards/thk_ans_format_reward": 1.0, + "step": 2368, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.7604217529297, + "epoch": 8.003372681281618, + "grad_norm": 23.744219327400796, + "kl": 0.392578125, + "learning_rate": 3.330518018018018e-07, + "loss": 0.0004, + "reward": 3.507601261138916, + "reward_std": 0.11289845686405897, + "rewards/final_reward": 1.228210963425049, + "rewards/mask_iou_reward": 0.6141054817125245, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5076011419296265, + "rewards/thk_ans_format_reward": 1.0, + "step": 2369, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.33333587646484, + "epoch": 8.006745362563239, + "grad_norm": 11.546192868792904, + "kl": 0.388671875, + "learning_rate": 3.327702702702703e-07, + "loss": 0.0004, + "reward": 3.7985188961029053, + "reward_std": 0.03749396279454231, + "rewards/final_reward": 1.867038907222471, + "rewards/mask_iou_reward": 0.9335194536112355, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7985187768936157, + "rewards/thk_ans_format_reward": 1.0, + "step": 2370, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.2708435058594, + "epoch": 8.010118043844857, + "grad_norm": 22.293278328225153, + "kl": 1.9775390625, + "learning_rate": 3.3248873873873874e-07, + "loss": 0.002, + "reward": 3.674692153930664, + "reward_std": 0.03796894662082195, + "rewards/final_reward": 1.643982822737366, + "rewards/mask_iou_reward": 0.821991411368683, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6746920347213745, + "rewards/thk_ans_format_reward": 1.0, + "step": 2371, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.9479217529297, + "epoch": 8.013490725126475, + "grad_norm": 26.55755490987274, + "kl": 0.4404296875, + "learning_rate": 3.322072072072072e-07, + "loss": 0.0004, + "reward": 3.422567129135132, + "reward_std": 0.06658890098333359, + "rewards/final_reward": 1.1253702176690892, + "rewards/mask_iou_reward": 0.5626851088345446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.422567069530487, + "rewards/thk_ans_format_reward": 1.0, + "step": 2372, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.20834350585938, + "epoch": 8.016863406408094, + "grad_norm": 7.147523959066198, + "kl": 0.40234375, + "learning_rate": 3.3192567567567566e-07, + "loss": 0.0004, + "reward": 3.740321159362793, + "reward_std": 0.21628601849079132, + "rewards/final_reward": 1.7931842878703108, + "rewards/mask_iou_reward": 0.8965921439351554, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.7819878458976746, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2373, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.6979217529297, + "epoch": 8.020236087689714, + "grad_norm": 8.090296978672066, + "kl": 0.619140625, + "learning_rate": 3.3164414414414417e-07, + "loss": 0.0006, + "reward": 3.5261985063552856, + "reward_std": 0.16563283652067184, + "rewards/final_reward": 1.7265625912685088, + "rewards/mask_iou_reward": 0.8632812956342544, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5470316410064697, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2374, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.89583587646484, + "epoch": 8.023608768971332, + "grad_norm": 6.551030017093635, + "kl": 0.44921875, + "learning_rate": 3.3136261261261263e-07, + "loss": 0.0004, + "reward": 3.4857009649276733, + "reward_std": 0.08584445342421532, + "rewards/final_reward": 1.5250340704634133, + "rewards/mask_iou_reward": 0.7625170352317067, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.485701084136963, + "rewards/thk_ans_format_reward": 1.0, + "step": 2375, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.6041717529297, + "epoch": 8.02698145025295, + "grad_norm": 8.89636088836023, + "kl": 0.4267578125, + "learning_rate": 3.310810810810811e-07, + "loss": 0.0004, + "reward": 3.673058271408081, + "reward_std": 0.07457491382956505, + "rewards/final_reward": 1.7505070487126124, + "rewards/mask_iou_reward": 0.8752535243563062, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.673058271408081, + "rewards/thk_ans_format_reward": 1.0, + "step": 2376, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.4479217529297, + "epoch": 8.03035413153457, + "grad_norm": 9.333641288570725, + "kl": 0.482421875, + "learning_rate": 3.3079954954954955e-07, + "loss": 0.0005, + "reward": 3.279388904571533, + "reward_std": 0.10259271413087845, + "rewards/final_reward": 1.165007475258762, + "rewards/mask_iou_reward": 0.582503737629381, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2793890237808228, + "rewards/thk_ans_format_reward": 1.0, + "step": 2377, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.05208587646484, + "epoch": 8.03372681281619, + "grad_norm": 7.7193007263674795, + "kl": 0.4609375, + "learning_rate": 3.3051801801801796e-07, + "loss": 0.0005, + "reward": 3.6911059617996216, + "reward_std": 0.03571598511189222, + "rewards/final_reward": 1.7758940189951273, + "rewards/mask_iou_reward": 0.8879470094975637, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6911059617996216, + "rewards/thk_ans_format_reward": 1.0, + "step": 2378, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.80209350585938, + "epoch": 8.037099494097808, + "grad_norm": 12.554029559797081, + "kl": 0.41015625, + "learning_rate": 3.302364864864865e-07, + "loss": 0.0004, + "reward": 3.50697922706604, + "reward_std": 0.010762129910290241, + "rewards/final_reward": 1.9482879176670094, + "rewards/mask_iou_reward": 0.9741439588335047, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.506978988647461, + "rewards/thk_ans_format_reward": 1.0, + "step": 2379, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.98958587646484, + "epoch": 8.040472175379426, + "grad_norm": 25.757316247888088, + "kl": 0.3984375, + "learning_rate": 3.2995495495495493e-07, + "loss": 0.0004, + "reward": 3.8017170429229736, + "reward_std": 0.05078030563890934, + "rewards/final_reward": 1.6788383903437554, + "rewards/mask_iou_reward": 0.8394191951718777, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8017171025276184, + "rewards/thk_ans_format_reward": 1.0, + "step": 2380, + "think_completion_length": 10.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.65626525878906, + "epoch": 8.043844856661046, + "grad_norm": 4.711917028914413, + "kl": 0.39453125, + "learning_rate": 3.296734234234234e-07, + "loss": 0.0004, + "reward": 3.4532305002212524, + "reward_std": 0.21785828098654747, + "rewards/final_reward": 1.2614822442511835, + "rewards/mask_iou_reward": 0.6307411221255917, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4636470079421997, + "rewards/thk_ans_format_reward": 1.0, + "step": 2381, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.1041717529297, + "epoch": 8.047217537942664, + "grad_norm": 5.786104794251641, + "kl": 0.43359375, + "learning_rate": 3.2939189189189186e-07, + "loss": 0.0004, + "reward": 3.568354845046997, + "reward_std": 0.23636979144066572, + "rewards/final_reward": 1.6667668988522717, + "rewards/mask_iou_reward": 0.8333834494261358, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5996047258377075, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2382, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.06250762939453, + "epoch": 8.050590219224283, + "grad_norm": 11.667658501236074, + "kl": 0.3955078125, + "learning_rate": 3.291103603603603e-07, + "loss": 0.0004, + "reward": 3.549164295196533, + "reward_std": 0.11421588622033596, + "rewards/final_reward": 1.9014355104384648, + "rewards/mask_iou_reward": 0.9507177552192324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5491642951965332, + "rewards/thk_ans_format_reward": 1.0, + "step": 2383, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.28125762939453, + "epoch": 8.053962900505903, + "grad_norm": 52.26829765639469, + "kl": 0.408203125, + "learning_rate": 3.2882882882882883e-07, + "loss": 0.0004, + "reward": 3.5316416025161743, + "reward_std": 0.07862124592065811, + "rewards/final_reward": 1.7800296532453164, + "rewards/mask_iou_reward": 0.8900148266226582, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5316415429115295, + "rewards/thk_ans_format_reward": 1.0, + "step": 2384, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.6041717529297, + "epoch": 8.057335581787521, + "grad_norm": 19.327738750923807, + "kl": 0.556640625, + "learning_rate": 3.285472972972973e-07, + "loss": 0.0006, + "reward": 3.734640121459961, + "reward_std": 0.11318856105208397, + "rewards/final_reward": 1.855851258239336, + "rewards/mask_iou_reward": 0.927925629119668, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7346400022506714, + "rewards/thk_ans_format_reward": 1.0, + "step": 2385, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.3229217529297, + "epoch": 8.06070826306914, + "grad_norm": 6.5165529355463345, + "kl": 0.4169921875, + "learning_rate": 3.2826576576576575e-07, + "loss": 0.0004, + "reward": 3.505844473838806, + "reward_std": 0.13680755905807018, + "rewards/final_reward": 1.2078693786602015, + "rewards/mask_iou_reward": 0.6039346893301007, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5266777276992798, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2386, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.1041717529297, + "epoch": 8.064080944350758, + "grad_norm": 5.597707115359675, + "kl": 0.544921875, + "learning_rate": 3.279842342342342e-07, + "loss": 0.0006, + "reward": 3.4991390705108643, + "reward_std": 0.07332871481776237, + "rewards/final_reward": 0.6863406134033639, + "rewards/mask_iou_reward": 0.34317030670168197, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4991388320922852, + "rewards/thk_ans_format_reward": 1.0, + "step": 2387, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.08333587646484, + "epoch": 8.067453625632378, + "grad_norm": 30.552181119910045, + "kl": 0.447265625, + "learning_rate": 3.2770270270270267e-07, + "loss": 0.0005, + "reward": 3.708919405937195, + "reward_std": 0.027788237668573856, + "rewards/final_reward": 1.5298693900152363, + "rewards/mask_iou_reward": 0.7649346950076181, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7089195251464844, + "rewards/thk_ans_format_reward": 1.0, + "step": 2388, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.3229217529297, + "epoch": 8.070826306913997, + "grad_norm": 5.886357948414976, + "kl": 0.509765625, + "learning_rate": 3.2742117117117113e-07, + "loss": 0.0005, + "reward": 3.342664361000061, + "reward_std": 0.10686694085597992, + "rewards/final_reward": 1.2874507479483295, + "rewards/mask_iou_reward": 0.6437253739741647, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3426641821861267, + "rewards/thk_ans_format_reward": 1.0, + "step": 2389, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.89584350585938, + "epoch": 8.074198988195615, + "grad_norm": 5.208029616834268, + "kl": 0.4287109375, + "learning_rate": 3.2713963963963965e-07, + "loss": 0.0004, + "reward": 3.3730231523513794, + "reward_std": 0.2724648416042328, + "rewards/final_reward": 1.0485035138599763, + "rewards/mask_iou_reward": 0.5242517569299882, + "rewards/sam_format_reward": 0.9375, + "rewards/sam_reward_func_ultra": 1.4980231523513794, + "rewards/thk_ans_format_reward": 0.9375, + "step": 2390, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.6041717529297, + "epoch": 8.077571669477235, + "grad_norm": 10.381308765291164, + "kl": 0.486328125, + "learning_rate": 3.268581081081081e-07, + "loss": 0.0005, + "reward": 3.585293173789978, + "reward_std": 0.08048686385154724, + "rewards/final_reward": 1.7610035561330326, + "rewards/mask_iou_reward": 0.8805017780665163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5852932333946228, + "rewards/thk_ans_format_reward": 1.0, + "step": 2391, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.1979217529297, + "epoch": 8.080944350758854, + "grad_norm": 4.410245364445696, + "kl": 0.400390625, + "learning_rate": 3.2657657657657657e-07, + "loss": 0.0004, + "reward": 3.611588478088379, + "reward_std": 0.06441009044647217, + "rewards/final_reward": 1.56279629364386, + "rewards/mask_iou_reward": 0.78139814682193, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6115882992744446, + "rewards/thk_ans_format_reward": 1.0, + "step": 2392, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.75, + "epoch": 8.084317032040472, + "grad_norm": 8.748181047939324, + "kl": 0.396484375, + "learning_rate": 3.2629504504504503e-07, + "loss": 0.0004, + "reward": 3.7184702157974243, + "reward_std": 0.012965178117156029, + "rewards/final_reward": 1.8514894083113558, + "rewards/mask_iou_reward": 0.9257447041556779, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7184701561927795, + "rewards/thk_ans_format_reward": 1.0, + "step": 2393, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.375, + "epoch": 8.08768971332209, + "grad_norm": 30.253714510471372, + "kl": 0.46484375, + "learning_rate": 3.260135135135135e-07, + "loss": 0.0005, + "reward": 3.337326765060425, + "reward_std": 0.1405288316309452, + "rewards/final_reward": 1.8327282117382686, + "rewards/mask_iou_reward": 0.9163641058691343, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3373266458511353, + "rewards/thk_ans_format_reward": 1.0, + "step": 2394, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.4375, + "epoch": 8.09106239460371, + "grad_norm": 5.770519777707653, + "kl": 0.3984375, + "learning_rate": 3.25731981981982e-07, + "loss": 0.0004, + "reward": 3.6448590755462646, + "reward_std": 0.19882620126008987, + "rewards/final_reward": 1.5674474068642676, + "rewards/mask_iou_reward": 0.7837237034321338, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.665692389011383, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2395, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.45833587646484, + "epoch": 8.094435075885329, + "grad_norm": 7.792328264252321, + "kl": 0.7607421875, + "learning_rate": 3.2545045045045046e-07, + "loss": 0.0008, + "reward": 3.6415023803710938, + "reward_std": 0.03474126663058996, + "rewards/final_reward": 1.1202452769772973, + "rewards/mask_iou_reward": 0.5601226384886486, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6415023803710938, + "rewards/thk_ans_format_reward": 1.0, + "step": 2396, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.1354217529297, + "epoch": 8.097807757166947, + "grad_norm": 11.930417082724471, + "kl": 0.390625, + "learning_rate": 3.251689189189189e-07, + "loss": 0.0004, + "reward": 3.4700154066085815, + "reward_std": 0.07594737969338894, + "rewards/final_reward": 0.6212718691660815, + "rewards/mask_iou_reward": 0.31063593458304073, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4700152277946472, + "rewards/thk_ans_format_reward": 1.0, + "step": 2397, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.65625762939453, + "epoch": 8.101180438448566, + "grad_norm": 18.963475126752737, + "kl": 0.4609375, + "learning_rate": 3.2488738738738733e-07, + "loss": 0.0005, + "reward": 3.397141933441162, + "reward_std": 0.15669412538409233, + "rewards/final_reward": 1.7398657181327521, + "rewards/mask_iou_reward": 0.8699328590663761, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3971418142318726, + "rewards/thk_ans_format_reward": 1.0, + "step": 2398, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.36459350585938, + "epoch": 8.104553119730186, + "grad_norm": 15.023091097131797, + "kl": 0.3828125, + "learning_rate": 3.246058558558558e-07, + "loss": 0.0004, + "reward": 3.2796510457992554, + "reward_std": 0.11465698108077049, + "rewards/final_reward": 1.149373076233073, + "rewards/mask_iou_reward": 0.5746865381165365, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2796511054039001, + "rewards/thk_ans_format_reward": 1.0, + "step": 2399, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.21875762939453, + "epoch": 8.107925801011804, + "grad_norm": 129.00479724577653, + "kl": 0.44140625, + "learning_rate": 3.243243243243243e-07, + "loss": 0.0005, + "reward": 3.5124967098236084, + "reward_std": 0.09804843366146088, + "rewards/final_reward": 1.8780440430976026, + "rewards/mask_iou_reward": 0.9390220215488013, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5124965906143188, + "rewards/thk_ans_format_reward": 1.0, + "step": 2400, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.34375762939453, + "epoch": 8.111298482293423, + "grad_norm": 5.992840860030975, + "kl": 0.408203125, + "learning_rate": 3.2404279279279277e-07, + "loss": 0.0004, + "reward": 3.42751145362854, + "reward_std": 0.09829858504235744, + "rewards/final_reward": 1.6106350603914932, + "rewards/mask_iou_reward": 0.8053175301957466, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4275111556053162, + "rewards/thk_ans_format_reward": 1.0, + "step": 2401, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.7916717529297, + "epoch": 8.114671163575043, + "grad_norm": 8.068307793533288, + "kl": 0.4169921875, + "learning_rate": 3.2376126126126123e-07, + "loss": 0.0004, + "reward": 3.227345824241638, + "reward_std": 0.08449060097336769, + "rewards/final_reward": 1.410210368073459, + "rewards/mask_iou_reward": 0.7051051840367295, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2273458242416382, + "rewards/thk_ans_format_reward": 1.0, + "step": 2402, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.7291717529297, + "epoch": 8.118043844856661, + "grad_norm": 10.835281412196219, + "kl": 0.458984375, + "learning_rate": 3.234797297297297e-07, + "loss": 0.0005, + "reward": 3.6753695011138916, + "reward_std": 0.0677886251360178, + "rewards/final_reward": 1.5689224450025796, + "rewards/mask_iou_reward": 0.7844612225012898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.675369381904602, + "rewards/thk_ans_format_reward": 1.0, + "step": 2403, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.7916717529297, + "epoch": 8.12141652613828, + "grad_norm": 7.868671257529703, + "kl": 0.4091796875, + "learning_rate": 3.2319819819819815e-07, + "loss": 0.0004, + "reward": 3.660378336906433, + "reward_std": 0.12548162788152695, + "rewards/final_reward": 1.8662698704350658, + "rewards/mask_iou_reward": 0.9331349352175329, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6603783965110779, + "rewards/thk_ans_format_reward": 1.0, + "step": 2404, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.93750762939453, + "epoch": 8.124789207419898, + "grad_norm": 8.370326706025851, + "kl": 0.6669921875, + "learning_rate": 3.2291666666666666e-07, + "loss": 0.0007, + "reward": 3.3133339881896973, + "reward_std": 0.14033591002225876, + "rewards/final_reward": 1.7516517047199014, + "rewards/mask_iou_reward": 0.8758258523599507, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.313333809375763, + "rewards/thk_ans_format_reward": 1.0, + "step": 2405, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.87500762939453, + "epoch": 8.128161888701518, + "grad_norm": 8.330567857586583, + "kl": 0.423828125, + "learning_rate": 3.226351351351351e-07, + "loss": 0.0004, + "reward": 3.424866199493408, + "reward_std": 0.12922486569732428, + "rewards/final_reward": 1.0733441257197325, + "rewards/mask_iou_reward": 0.5366720628598662, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4456994533538818, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2406, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.3125, + "epoch": 8.131534569983137, + "grad_norm": 10.523488546634377, + "kl": 0.560546875, + "learning_rate": 3.223536036036036e-07, + "loss": 0.0006, + "reward": 3.84055757522583, + "reward_std": 0.08618904370814562, + "rewards/final_reward": 1.758160011130204, + "rewards/mask_iou_reward": 0.879080005565102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8405576944351196, + "rewards/thk_ans_format_reward": 1.0, + "step": 2407, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5, + "epoch": 8.134907251264755, + "grad_norm": 10.934981146929287, + "kl": 0.466796875, + "learning_rate": 3.2207207207207205e-07, + "loss": 0.0005, + "reward": 3.602039098739624, + "reward_std": 0.046292152255773544, + "rewards/final_reward": 1.8189440665563734, + "rewards/mask_iou_reward": 0.9094720332781867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6020390391349792, + "rewards/thk_ans_format_reward": 1.0, + "step": 2408, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.9479217529297, + "epoch": 8.138279932546375, + "grad_norm": 6.908980900575378, + "kl": 0.4326171875, + "learning_rate": 3.217905405405405e-07, + "loss": 0.0004, + "reward": 3.737973690032959, + "reward_std": 0.02162565803155303, + "rewards/final_reward": 1.873947012729171, + "rewards/mask_iou_reward": 0.9369735063645855, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.737973690032959, + "rewards/thk_ans_format_reward": 1.0, + "step": 2409, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.9791717529297, + "epoch": 8.141652613827993, + "grad_norm": 17.50123600151369, + "kl": 0.447265625, + "learning_rate": 3.21509009009009e-07, + "loss": 0.0004, + "reward": 3.526793599128723, + "reward_std": 0.04617397487163544, + "rewards/final_reward": 1.5573754967659243, + "rewards/mask_iou_reward": 0.7786877483829622, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5267934203147888, + "rewards/thk_ans_format_reward": 1.0, + "step": 2410, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.98958587646484, + "epoch": 8.145025295109612, + "grad_norm": 10.237101465585567, + "kl": 0.4921875, + "learning_rate": 3.212274774774775e-07, + "loss": 0.0005, + "reward": 3.466389775276184, + "reward_std": 0.10824509710073471, + "rewards/final_reward": 1.0882982205477378, + "rewards/mask_iou_reward": 0.5441491102738689, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.466389775276184, + "rewards/thk_ans_format_reward": 1.0, + "step": 2411, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.30208587646484, + "epoch": 8.14839797639123, + "grad_norm": 17.59083784655385, + "kl": 0.4736328125, + "learning_rate": 3.2094594594594594e-07, + "loss": 0.0005, + "reward": 3.24951171875, + "reward_std": 0.05265136994421482, + "rewards/final_reward": 0.7422969614193567, + "rewards/mask_iou_reward": 0.3711484807096784, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2495114207267761, + "rewards/thk_ans_format_reward": 1.0, + "step": 2412, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.1041717529297, + "epoch": 8.15177065767285, + "grad_norm": 9.760394222550351, + "kl": 0.4599609375, + "learning_rate": 3.206644144144144e-07, + "loss": 0.0005, + "reward": 3.385643482208252, + "reward_std": 0.09001307748258114, + "rewards/final_reward": 0.6627619569797258, + "rewards/mask_iou_reward": 0.3313809784898629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3856431245803833, + "rewards/thk_ans_format_reward": 1.0, + "step": 2413, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.21875381469727, + "epoch": 8.155143338954469, + "grad_norm": 31.29595285296675, + "kl": 0.4208984375, + "learning_rate": 3.2038288288288286e-07, + "loss": 0.0004, + "reward": 3.4562931060791016, + "reward_std": 0.05336186848580837, + "rewards/final_reward": 1.1441042030131243, + "rewards/mask_iou_reward": 0.5720521015065622, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.456292986869812, + "rewards/thk_ans_format_reward": 1.0, + "step": 2414, + "think_completion_length": 10.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.09375, + "epoch": 8.158516020236087, + "grad_norm": 9.501872238266467, + "kl": 0.87109375, + "learning_rate": 3.201013513513514e-07, + "loss": 0.0009, + "reward": 3.521483302116394, + "reward_std": 0.08115924685262144, + "rewards/final_reward": 1.6029761419337214, + "rewards/mask_iou_reward": 0.8014880709668607, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5214833617210388, + "rewards/thk_ans_format_reward": 1.0, + "step": 2415, + "think_completion_length": 10.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.34375, + "epoch": 8.161888701517707, + "grad_norm": 32.72477260944038, + "kl": 0.486328125, + "learning_rate": 3.1981981981981984e-07, + "loss": 0.0005, + "reward": 3.627326726913452, + "reward_std": 0.046977970749139786, + "rewards/final_reward": 1.6322781141514124, + "rewards/mask_iou_reward": 0.8161390570757062, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6273266077041626, + "rewards/thk_ans_format_reward": 1.0, + "step": 2416, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.55209350585938, + "epoch": 8.165261382799326, + "grad_norm": 7.131920182603108, + "kl": 0.4736328125, + "learning_rate": 3.195382882882883e-07, + "loss": 0.0005, + "reward": 3.7091704607009888, + "reward_std": 0.06176206795498729, + "rewards/final_reward": 1.550195763691969, + "rewards/mask_iou_reward": 0.7750978818459845, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7091703414916992, + "rewards/thk_ans_format_reward": 1.0, + "step": 2417, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.02083587646484, + "epoch": 8.168634064080944, + "grad_norm": 7.346436112306174, + "kl": 0.5078125, + "learning_rate": 3.192567567567567e-07, + "loss": 0.0005, + "reward": 3.6605117321014404, + "reward_std": 0.04182407818734646, + "rewards/final_reward": 1.4259560613112674, + "rewards/mask_iou_reward": 0.7129780306556337, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6605116724967957, + "rewards/thk_ans_format_reward": 1.0, + "step": 2418, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.48958587646484, + "epoch": 8.172006745362562, + "grad_norm": 9.91741035767607, + "kl": 0.44921875, + "learning_rate": 3.1897522522522517e-07, + "loss": 0.0005, + "reward": 3.6389087438583374, + "reward_std": 0.10996793489903212, + "rewards/final_reward": 1.8648980222098381, + "rewards/mask_iou_reward": 0.9324490111049191, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6389088034629822, + "rewards/thk_ans_format_reward": 1.0, + "step": 2419, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.7916717529297, + "epoch": 8.175379426644183, + "grad_norm": 8.771784119719042, + "kl": 0.478515625, + "learning_rate": 3.186936936936937e-07, + "loss": 0.0005, + "reward": 3.5804519653320312, + "reward_std": 0.019869420444592834, + "rewards/final_reward": 1.7407468174759197, + "rewards/mask_iou_reward": 0.8703734087379599, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5804519653320312, + "rewards/thk_ans_format_reward": 1.0, + "step": 2420, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.58333587646484, + "epoch": 8.178752107925801, + "grad_norm": 14.461206633706372, + "kl": 0.544921875, + "learning_rate": 3.1841216216216214e-07, + "loss": 0.0006, + "reward": 3.332810401916504, + "reward_std": 0.05701042152941227, + "rewards/final_reward": 1.5631696036833542, + "rewards/mask_iou_reward": 0.7815848018416771, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3328104615211487, + "rewards/thk_ans_format_reward": 1.0, + "step": 2421, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9166717529297, + "epoch": 8.18212478920742, + "grad_norm": 10.900392482313276, + "kl": 0.5048828125, + "learning_rate": 3.181306306306306e-07, + "loss": 0.0005, + "reward": 3.417921781539917, + "reward_std": 0.1880015730857849, + "rewards/final_reward": 1.6576152588454947, + "rewards/mask_iou_reward": 0.8288076294227473, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.428338348865509, + "rewards/thk_ans_format_reward": 1.0, + "step": 2422, + "think_completion_length": 10.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.58333587646484, + "epoch": 8.18549747048904, + "grad_norm": 8.069657839824432, + "kl": 0.482421875, + "learning_rate": 3.1784909909909906e-07, + "loss": 0.0005, + "reward": 3.43167781829834, + "reward_std": 0.09442893601953983, + "rewards/final_reward": 1.5638686557156498, + "rewards/mask_iou_reward": 0.7819343278578249, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4316779375076294, + "rewards/thk_ans_format_reward": 1.0, + "step": 2423, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.0104217529297, + "epoch": 8.188870151770658, + "grad_norm": 13.19781049934047, + "kl": 0.45703125, + "learning_rate": 3.175675675675675e-07, + "loss": 0.0005, + "reward": 3.432652235031128, + "reward_std": 0.10366004332900047, + "rewards/final_reward": 1.4790426118829503, + "rewards/mask_iou_reward": 0.7395213059414751, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.432652235031128, + "rewards/thk_ans_format_reward": 1.0, + "step": 2424, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.55208587646484, + "epoch": 8.192242833052276, + "grad_norm": 8.387087408769968, + "kl": 0.48046875, + "learning_rate": 3.1728603603603604e-07, + "loss": 0.0005, + "reward": 3.5953248739242554, + "reward_std": 0.04133354127407074, + "rewards/final_reward": 1.5849948737706572, + "rewards/mask_iou_reward": 0.7924974368853286, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5953248739242554, + "rewards/thk_ans_format_reward": 1.0, + "step": 2425, + "think_completion_length": 9.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.1666717529297, + "epoch": 8.195615514333895, + "grad_norm": 5.757225928522244, + "kl": 0.4755859375, + "learning_rate": 3.170045045045045e-07, + "loss": 0.0005, + "reward": 3.656041979789734, + "reward_std": 0.04135182220488787, + "rewards/final_reward": 1.9078518225754462, + "rewards/mask_iou_reward": 0.9539259112877231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6560419797897339, + "rewards/thk_ans_format_reward": 1.0, + "step": 2426, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.40625762939453, + "epoch": 8.198988195615515, + "grad_norm": 11.15654342910603, + "kl": 0.4365234375, + "learning_rate": 3.1672297297297296e-07, + "loss": 0.0004, + "reward": 3.546691417694092, + "reward_std": 0.13070277497172356, + "rewards/final_reward": 0.9240818825057648, + "rewards/mask_iou_reward": 0.4620409412528824, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5466914772987366, + "rewards/thk_ans_format_reward": 1.0, + "step": 2427, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.75000381469727, + "epoch": 8.202360876897133, + "grad_norm": 8.195096236049537, + "kl": 0.466796875, + "learning_rate": 3.164414414414414e-07, + "loss": 0.0005, + "reward": 3.33832848072052, + "reward_std": 0.09735456854104996, + "rewards/final_reward": 1.2954900338208892, + "rewards/mask_iou_reward": 0.6477450169104446, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.33832848072052, + "rewards/thk_ans_format_reward": 1.0, + "step": 2428, + "think_completion_length": 9.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.80208969116211, + "epoch": 8.205733558178752, + "grad_norm": 10.897348964830934, + "kl": 0.5087890625, + "learning_rate": 3.161599099099099e-07, + "loss": 0.0005, + "reward": 3.5560476779937744, + "reward_std": 0.1025335043668747, + "rewards/final_reward": 1.9635037618405802, + "rewards/mask_iou_reward": 0.9817518809202901, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5560476183891296, + "rewards/thk_ans_format_reward": 1.0, + "step": 2429, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.57292938232422, + "epoch": 8.209106239460372, + "grad_norm": 8.79365056371932, + "kl": 0.4287109375, + "learning_rate": 3.158783783783784e-07, + "loss": 0.0004, + "reward": 3.04141902923584, + "reward_std": 0.10161345452070236, + "rewards/final_reward": 0.734848567434719, + "rewards/mask_iou_reward": 0.3674242837173595, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.041418969631195, + "rewards/thk_ans_format_reward": 1.0, + "step": 2430, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.9791717529297, + "epoch": 8.21247892074199, + "grad_norm": 8.018953305589752, + "kl": 0.4169921875, + "learning_rate": 3.1559684684684685e-07, + "loss": 0.0004, + "reward": 3.687578797340393, + "reward_std": 0.07668573036789894, + "rewards/final_reward": 1.703764574546751, + "rewards/mask_iou_reward": 0.8518822872733754, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6875787377357483, + "rewards/thk_ans_format_reward": 1.0, + "step": 2431, + "think_completion_length": 10.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.64583587646484, + "epoch": 8.215851602023609, + "grad_norm": 23.61435432038782, + "kl": 0.517578125, + "learning_rate": 3.153153153153153e-07, + "loss": 0.0005, + "reward": 3.406832456588745, + "reward_std": 0.11250332370400429, + "rewards/final_reward": 1.213014148599159, + "rewards/mask_iou_reward": 0.6065070742995795, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4068323373794556, + "rewards/thk_ans_format_reward": 1.0, + "step": 2432, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0416717529297, + "epoch": 8.219224283305227, + "grad_norm": 9.13877086411204, + "kl": 0.59375, + "learning_rate": 3.150337837837838e-07, + "loss": 0.0006, + "reward": 3.6608787775039673, + "reward_std": 0.06742198672145605, + "rewards/final_reward": 1.7380011253940841, + "rewards/mask_iou_reward": 0.8690005626970421, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.660878837108612, + "rewards/thk_ans_format_reward": 1.0, + "step": 2433, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.71875, + "epoch": 8.222596964586847, + "grad_norm": 14.396642672138428, + "kl": 0.439453125, + "learning_rate": 3.1475225225225223e-07, + "loss": 0.0004, + "reward": 3.429533004760742, + "reward_std": 0.12928189616650343, + "rewards/final_reward": 1.663311666723789, + "rewards/mask_iou_reward": 0.8316558333618945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4295329451560974, + "rewards/thk_ans_format_reward": 1.0, + "step": 2434, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.70833587646484, + "epoch": 8.225969645868465, + "grad_norm": 12.580987281110211, + "kl": 0.5361328125, + "learning_rate": 3.1447072072072075e-07, + "loss": 0.0005, + "reward": 3.5138970613479614, + "reward_std": 0.09798325225710869, + "rewards/final_reward": 0.9252272387501513, + "rewards/mask_iou_reward": 0.46261361937507567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5138969421386719, + "rewards/thk_ans_format_reward": 1.0, + "step": 2435, + "think_completion_length": 9.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.11458587646484, + "epoch": 8.229342327150084, + "grad_norm": 7.862194953393018, + "kl": 0.556640625, + "learning_rate": 3.141891891891892e-07, + "loss": 0.0006, + "reward": 3.8974932432174683, + "reward_std": 0.008425467647612095, + "rewards/final_reward": 1.904266355805077, + "rewards/mask_iou_reward": 0.9521331779025385, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8974932432174683, + "rewards/thk_ans_format_reward": 1.0, + "step": 2436, + "think_completion_length": 11.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.8854217529297, + "epoch": 8.232715008431704, + "grad_norm": 7.968289754618249, + "kl": 0.5341796875, + "learning_rate": 3.1390765765765767e-07, + "loss": 0.0005, + "reward": 3.715288996696472, + "reward_std": 0.08045927435159683, + "rewards/final_reward": 1.7203986965591502, + "rewards/mask_iou_reward": 0.8601993482795751, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7152891755104065, + "rewards/thk_ans_format_reward": 1.0, + "step": 2437, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.4791717529297, + "epoch": 8.236087689713322, + "grad_norm": 8.114872001225699, + "kl": 0.3974609375, + "learning_rate": 3.136261261261261e-07, + "loss": 0.0004, + "reward": 3.6249054670333862, + "reward_std": 0.07169766910374165, + "rewards/final_reward": 1.5403707879258648, + "rewards/mask_iou_reward": 0.7701853939629324, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.635322093963623, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2438, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.5416717529297, + "epoch": 8.23946037099494, + "grad_norm": 9.312476476695707, + "kl": 0.45703125, + "learning_rate": 3.1334459459459454e-07, + "loss": 0.0005, + "reward": 3.6166598796844482, + "reward_std": 0.07187589257955551, + "rewards/final_reward": 1.833042004311349, + "rewards/mask_iou_reward": 0.9165210021556744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6166600584983826, + "rewards/thk_ans_format_reward": 1.0, + "step": 2439, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.25000762939453, + "epoch": 8.24283305227656, + "grad_norm": 12.932719567705236, + "kl": 0.474609375, + "learning_rate": 3.1306306306306305e-07, + "loss": 0.0005, + "reward": 3.3091615438461304, + "reward_std": 0.06278246641159058, + "rewards/final_reward": 1.414298059959736, + "rewards/mask_iou_reward": 0.707149029979868, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3091613054275513, + "rewards/thk_ans_format_reward": 1.0, + "step": 2440, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.39584350585938, + "epoch": 8.24620573355818, + "grad_norm": 6.166095258366224, + "kl": 0.4091796875, + "learning_rate": 3.127815315315315e-07, + "loss": 0.0004, + "reward": 3.5161445140838623, + "reward_std": 0.12845508754253387, + "rewards/final_reward": 1.7061118329359797, + "rewards/mask_iou_reward": 0.8530559164679898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5161446928977966, + "rewards/thk_ans_format_reward": 1.0, + "step": 2441, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.00000762939453, + "epoch": 8.249578414839798, + "grad_norm": 10.115525504943898, + "kl": 0.3896484375, + "learning_rate": 3.1249999999999997e-07, + "loss": 0.0004, + "reward": 3.7777761220932007, + "reward_std": 0.013954056892544031, + "rewards/final_reward": 1.759472750093344, + "rewards/mask_iou_reward": 0.879736375046672, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7777761816978455, + "rewards/thk_ans_format_reward": 1.0, + "step": 2442, + "think_completion_length": 11.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.4791717529297, + "epoch": 8.252951096121416, + "grad_norm": 11.21434176611459, + "kl": 0.634765625, + "learning_rate": 3.1221846846846843e-07, + "loss": 0.0007, + "reward": 3.4758187532424927, + "reward_std": 0.10847053304314613, + "rewards/final_reward": 1.401818192682101, + "rewards/mask_iou_reward": 0.7009090963410505, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4758188128471375, + "rewards/thk_ans_format_reward": 1.0, + "step": 2443, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.40625762939453, + "epoch": 8.256323777403036, + "grad_norm": 7.883589360397389, + "kl": 0.5, + "learning_rate": 3.119369369369369e-07, + "loss": 0.0005, + "reward": 3.708293914794922, + "reward_std": 0.01661589415743947, + "rewards/final_reward": 1.8312880607156954, + "rewards/mask_iou_reward": 0.9156440303578477, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.708293855190277, + "rewards/thk_ans_format_reward": 1.0, + "step": 2444, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.02083587646484, + "epoch": 8.259696458684655, + "grad_norm": 12.16595034169912, + "kl": 0.451171875, + "learning_rate": 3.116554054054054e-07, + "loss": 0.0005, + "reward": 3.5598052740097046, + "reward_std": 0.045145684853196144, + "rewards/final_reward": 1.7660008482587344, + "rewards/mask_iou_reward": 0.8830004241293672, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5598055124282837, + "rewards/thk_ans_format_reward": 1.0, + "step": 2445, + "think_completion_length": 12.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.89583587646484, + "epoch": 8.263069139966273, + "grad_norm": 56.30875917052071, + "kl": 0.541015625, + "learning_rate": 3.1137387387387387e-07, + "loss": 0.0006, + "reward": 3.6041558980941772, + "reward_std": 0.05847676005214453, + "rewards/final_reward": 1.8159008746661849, + "rewards/mask_iou_reward": 0.9079504373330924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6041561365127563, + "rewards/thk_ans_format_reward": 1.0, + "step": 2446, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.14584350585938, + "epoch": 8.266441821247891, + "grad_norm": 12.722989721471567, + "kl": 0.478515625, + "learning_rate": 3.1109234234234233e-07, + "loss": 0.0005, + "reward": 3.610850214958191, + "reward_std": 0.030726881697773933, + "rewards/final_reward": 1.6853639776575466, + "rewards/mask_iou_reward": 0.8426819888287733, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6108500361442566, + "rewards/thk_ans_format_reward": 1.0, + "step": 2447, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5208396911621, + "epoch": 8.269814502529512, + "grad_norm": 8.340943305441208, + "kl": 0.486328125, + "learning_rate": 3.108108108108108e-07, + "loss": 0.0005, + "reward": 3.596595287322998, + "reward_std": 0.028427790850400925, + "rewards/final_reward": 1.315319206376798, + "rewards/mask_iou_reward": 0.657659603188399, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5965954065322876, + "rewards/thk_ans_format_reward": 1.0, + "step": 2448, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.7916717529297, + "epoch": 8.27318718381113, + "grad_norm": 13.227968762729217, + "kl": 0.44921875, + "learning_rate": 3.1052927927927925e-07, + "loss": 0.0005, + "reward": 3.6097664833068848, + "reward_std": 0.13787231594324112, + "rewards/final_reward": 1.9214265601311156, + "rewards/mask_iou_reward": 0.9607132800655578, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6097664833068848, + "rewards/thk_ans_format_reward": 1.0, + "step": 2449, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.96875762939453, + "epoch": 8.276559865092748, + "grad_norm": 10.939981851399066, + "kl": 0.4560546875, + "learning_rate": 3.1024774774774776e-07, + "loss": 0.0005, + "reward": 3.5717475414276123, + "reward_std": 0.06208985298871994, + "rewards/final_reward": 1.2314015001254215, + "rewards/mask_iou_reward": 0.6157007500627107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5717473030090332, + "rewards/thk_ans_format_reward": 1.0, + "step": 2450, + "think_completion_length": 9.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.92708587646484, + "epoch": 8.279932546374368, + "grad_norm": 17.351268699684027, + "kl": 0.48046875, + "learning_rate": 3.099662162162162e-07, + "loss": 0.0005, + "reward": 3.4347504377365112, + "reward_std": 0.043286630883812904, + "rewards/final_reward": 1.8738782380664545, + "rewards/mask_iou_reward": 0.9369391190332272, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4347504377365112, + "rewards/thk_ans_format_reward": 1.0, + "step": 2451, + "think_completion_length": 10.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.5729217529297, + "epoch": 8.283305227655987, + "grad_norm": 7.358723536584804, + "kl": 0.46875, + "learning_rate": 3.096846846846847e-07, + "loss": 0.0005, + "reward": 3.572574019432068, + "reward_std": 0.05550253111869097, + "rewards/final_reward": 1.8950182497118977, + "rewards/mask_iou_reward": 0.9475091248559488, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5725743770599365, + "rewards/thk_ans_format_reward": 1.0, + "step": 2452, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.08334350585938, + "epoch": 8.286677908937605, + "grad_norm": 12.804066129927278, + "kl": 0.580078125, + "learning_rate": 3.0940315315315315e-07, + "loss": 0.0006, + "reward": 3.285163164138794, + "reward_std": 0.08269466087222099, + "rewards/final_reward": 0.24234840012493677, + "rewards/mask_iou_reward": 0.12117420006246839, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2851630449295044, + "rewards/thk_ans_format_reward": 1.0, + "step": 2453, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.64583587646484, + "epoch": 8.290050590219224, + "grad_norm": 7.576007898495416, + "kl": 0.650390625, + "learning_rate": 3.091216216216216e-07, + "loss": 0.0007, + "reward": 3.386742115020752, + "reward_std": 0.043751709163188934, + "rewards/final_reward": 0.6861222812405611, + "rewards/mask_iou_reward": 0.34306114062028054, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3867419958114624, + "rewards/thk_ans_format_reward": 1.0, + "step": 2454, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.48959350585938, + "epoch": 8.293423271500844, + "grad_norm": 28.406237180014656, + "kl": 0.4541015625, + "learning_rate": 3.088400900900901e-07, + "loss": 0.0005, + "reward": 3.4539101123809814, + "reward_std": 0.07719557732343674, + "rewards/final_reward": 1.4805113924994693, + "rewards/mask_iou_reward": 0.7402556962497346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4539099335670471, + "rewards/thk_ans_format_reward": 1.0, + "step": 2455, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.11458587646484, + "epoch": 8.296795952782462, + "grad_norm": 12.882943111223456, + "kl": 0.533203125, + "learning_rate": 3.085585585585586e-07, + "loss": 0.0005, + "reward": 3.6638059616088867, + "reward_std": 0.0629742294549942, + "rewards/final_reward": 1.3295112124943975, + "rewards/mask_iou_reward": 0.6647556062471988, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6638058423995972, + "rewards/thk_ans_format_reward": 1.0, + "step": 2456, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.14584350585938, + "epoch": 8.30016863406408, + "grad_norm": 35.69679144791488, + "kl": 0.4951171875, + "learning_rate": 3.08277027027027e-07, + "loss": 0.0005, + "reward": 3.436911702156067, + "reward_std": 0.14027688652276993, + "rewards/final_reward": 1.6058508945621877, + "rewards/mask_iou_reward": 0.8029254472810938, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4369115829467773, + "rewards/thk_ans_format_reward": 1.0, + "step": 2457, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.7708396911621, + "epoch": 8.3035413153457, + "grad_norm": 18.234730844982323, + "kl": 0.5302734375, + "learning_rate": 3.0799549549549545e-07, + "loss": 0.0005, + "reward": 3.6933977603912354, + "reward_std": 0.04440005775541067, + "rewards/final_reward": 1.8964776895218742, + "rewards/mask_iou_reward": 0.9482388447609371, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6933976411819458, + "rewards/thk_ans_format_reward": 1.0, + "step": 2458, + "think_completion_length": 10.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.28125762939453, + "epoch": 8.306913996627319, + "grad_norm": 19.83675276244956, + "kl": 0.443359375, + "learning_rate": 3.077139639639639e-07, + "loss": 0.0004, + "reward": 3.2433114051818848, + "reward_std": 0.07939034514129162, + "rewards/final_reward": 1.090434110862306, + "rewards/mask_iou_reward": 0.545217055431153, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2433112859725952, + "rewards/thk_ans_format_reward": 1.0, + "step": 2459, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.6979217529297, + "epoch": 8.310286677908937, + "grad_norm": 7.085741787515577, + "kl": 0.51171875, + "learning_rate": 3.074324324324324e-07, + "loss": 0.0005, + "reward": 3.4524821043014526, + "reward_std": 0.05678035132586956, + "rewards/final_reward": 1.646411504121072, + "rewards/mask_iou_reward": 0.823205752060536, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4524821639060974, + "rewards/thk_ans_format_reward": 1.0, + "step": 2460, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.3645896911621, + "epoch": 8.313659359190556, + "grad_norm": 15.709015841909409, + "kl": 0.62109375, + "learning_rate": 3.071509009009009e-07, + "loss": 0.0006, + "reward": 3.4738714694976807, + "reward_std": 0.07798239542171359, + "rewards/final_reward": 1.6209935966105546, + "rewards/mask_iou_reward": 0.8104967983052773, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4738715291023254, + "rewards/thk_ans_format_reward": 1.0, + "step": 2461, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.7604217529297, + "epoch": 8.317032040472176, + "grad_norm": 18.33658559895609, + "kl": 0.44140625, + "learning_rate": 3.0686936936936934e-07, + "loss": 0.0005, + "reward": 3.4468486309051514, + "reward_std": 0.017853936180472374, + "rewards/final_reward": 1.77218697188322, + "rewards/mask_iou_reward": 0.88609348594161, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.446848750114441, + "rewards/thk_ans_format_reward": 1.0, + "step": 2462, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.53126525878906, + "epoch": 8.320404721753794, + "grad_norm": 14.760904578386418, + "kl": 0.4501953125, + "learning_rate": 3.065878378378378e-07, + "loss": 0.0005, + "reward": 3.0838990211486816, + "reward_std": 0.08175505138933659, + "rewards/final_reward": 0.858719382503991, + "rewards/mask_iou_reward": 0.4293596912519955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0838988423347473, + "rewards/thk_ans_format_reward": 1.0, + "step": 2463, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.77083587646484, + "epoch": 8.323777403035413, + "grad_norm": 14.392272707383267, + "kl": 0.71484375, + "learning_rate": 3.0630630630630627e-07, + "loss": 0.0007, + "reward": 3.385775327682495, + "reward_std": 0.17238017916679382, + "rewards/final_reward": 1.1066655612204888, + "rewards/mask_iou_reward": 0.5533327806102444, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3857755064964294, + "rewards/thk_ans_format_reward": 1.0, + "step": 2464, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.15625762939453, + "epoch": 8.327150084317031, + "grad_norm": 20.668715407406307, + "kl": 0.466796875, + "learning_rate": 3.060247747747748e-07, + "loss": 0.0005, + "reward": 3.758315682411194, + "reward_std": 0.06302101723849773, + "rewards/final_reward": 1.625147822078063, + "rewards/mask_iou_reward": 0.8125739110390315, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.758315622806549, + "rewards/thk_ans_format_reward": 1.0, + "step": 2465, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.6875, + "epoch": 8.330522765598651, + "grad_norm": 10.070725120432538, + "kl": 0.583984375, + "learning_rate": 3.0574324324324324e-07, + "loss": 0.0006, + "reward": 3.7680346965789795, + "reward_std": 0.05466078221797943, + "rewards/final_reward": 1.620782261636709, + "rewards/mask_iou_reward": 0.8103911308183545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.76803457736969, + "rewards/thk_ans_format_reward": 1.0, + "step": 2466, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.14583587646484, + "epoch": 8.33389544688027, + "grad_norm": 10.935838833962107, + "kl": 0.49609375, + "learning_rate": 3.054617117117117e-07, + "loss": 0.0005, + "reward": 3.594951868057251, + "reward_std": 0.08344347029924393, + "rewards/final_reward": 1.5041478962655797, + "rewards/mask_iou_reward": 0.7520739481327898, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.59495210647583, + "rewards/thk_ans_format_reward": 1.0, + "step": 2467, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.57291793823242, + "epoch": 8.337268128161888, + "grad_norm": 7.509251515010535, + "kl": 0.576171875, + "learning_rate": 3.0518018018018016e-07, + "loss": 0.0006, + "reward": 3.664355993270874, + "reward_std": 0.08624411281198263, + "rewards/final_reward": 1.2802445932812174, + "rewards/mask_iou_reward": 0.6401222966406087, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6643559336662292, + "rewards/thk_ans_format_reward": 1.0, + "step": 2468, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.40625762939453, + "epoch": 8.340640809443508, + "grad_norm": 11.699693204834624, + "kl": 0.458984375, + "learning_rate": 3.048986486486486e-07, + "loss": 0.0005, + "reward": 3.586129307746887, + "reward_std": 0.13098665326833725, + "rewards/final_reward": 1.595433860783781, + "rewards/mask_iou_reward": 0.7977169303918905, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5861293077468872, + "rewards/thk_ans_format_reward": 1.0, + "step": 2469, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.03125762939453, + "epoch": 8.344013490725127, + "grad_norm": 12.73540793684107, + "kl": 0.6484375, + "learning_rate": 3.0461711711711714e-07, + "loss": 0.0007, + "reward": 3.8514713048934937, + "reward_std": 0.08601294551044703, + "rewards/final_reward": 1.8853168454128082, + "rewards/mask_iou_reward": 0.9426584227064041, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8514713644981384, + "rewards/thk_ans_format_reward": 1.0, + "step": 2470, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.83333587646484, + "epoch": 8.347386172006745, + "grad_norm": 18.984630505291502, + "kl": 0.57421875, + "learning_rate": 3.043355855855856e-07, + "loss": 0.0007, + "reward": 3.7675371170043945, + "reward_std": 0.08456644229590893, + "rewards/final_reward": 1.90508309737051, + "rewards/mask_iou_reward": 0.952541548685255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7675367593765259, + "rewards/thk_ans_format_reward": 1.0, + "step": 2471, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.67708587646484, + "epoch": 8.350758853288363, + "grad_norm": 36.358076957750235, + "kl": 0.4814453125, + "learning_rate": 3.0405405405405406e-07, + "loss": 0.0004, + "reward": 3.574353575706482, + "reward_std": 0.04178227297961712, + "rewards/final_reward": 1.6225584169950786, + "rewards/mask_iou_reward": 0.8112792084975393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5743535161018372, + "rewards/thk_ans_format_reward": 1.0, + "step": 2472, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.14583587646484, + "epoch": 8.354131534569984, + "grad_norm": 9.26915726526041, + "kl": 0.552734375, + "learning_rate": 3.037725225225225e-07, + "loss": 0.0006, + "reward": 3.664590835571289, + "reward_std": 0.08789392560720444, + "rewards/final_reward": 1.8925300259644269, + "rewards/mask_iou_reward": 0.9462650129822134, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6645907759666443, + "rewards/thk_ans_format_reward": 1.0, + "step": 2473, + "think_completion_length": 9.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.56250762939453, + "epoch": 8.357504215851602, + "grad_norm": 6.854815896384399, + "kl": 0.50390625, + "learning_rate": 3.03490990990991e-07, + "loss": 0.0005, + "reward": 3.483082890510559, + "reward_std": 0.04504427965730429, + "rewards/final_reward": 1.5886423660196636, + "rewards/mask_iou_reward": 0.7943211830098318, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.483082890510559, + "rewards/thk_ans_format_reward": 1.0, + "step": 2474, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.67708587646484, + "epoch": 8.36087689713322, + "grad_norm": 52.066697310453165, + "kl": 0.5283203125, + "learning_rate": 3.032094594594595e-07, + "loss": 0.0005, + "reward": 3.7876784801483154, + "reward_std": 0.0402345466427505, + "rewards/final_reward": 1.9005078347031392, + "rewards/mask_iou_reward": 0.9502539173515696, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7876786589622498, + "rewards/thk_ans_format_reward": 1.0, + "step": 2475, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.80209350585938, + "epoch": 8.36424957841484, + "grad_norm": 15.253898465155263, + "kl": 0.4267578125, + "learning_rate": 3.0292792792792795e-07, + "loss": 0.0004, + "reward": 3.3623496294021606, + "reward_std": 0.06954523921012878, + "rewards/final_reward": 1.6360745575927615, + "rewards/mask_iou_reward": 0.8180372787963808, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3623495697975159, + "rewards/thk_ans_format_reward": 1.0, + "step": 2476, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.7291717529297, + "epoch": 8.367622259696459, + "grad_norm": 7.604992373949504, + "kl": 0.419921875, + "learning_rate": 3.0264639639639636e-07, + "loss": 0.0005, + "reward": 3.484631061553955, + "reward_std": 0.05333420401439071, + "rewards/final_reward": 0.7899648588250253, + "rewards/mask_iou_reward": 0.39498242941251266, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4846312403678894, + "rewards/thk_ans_format_reward": 1.0, + "step": 2477, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.48959350585938, + "epoch": 8.370994940978077, + "grad_norm": 16.13659881531086, + "kl": 0.3974609375, + "learning_rate": 3.023648648648648e-07, + "loss": 0.0004, + "reward": 3.7472829818725586, + "reward_std": 0.052220143377780914, + "rewards/final_reward": 1.8180687068441148, + "rewards/mask_iou_reward": 0.9090343534220574, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.747282862663269, + "rewards/thk_ans_format_reward": 1.0, + "step": 2478, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.05208587646484, + "epoch": 8.374367622259696, + "grad_norm": 24.763365338808626, + "kl": 0.49609375, + "learning_rate": 3.020833333333333e-07, + "loss": 0.0006, + "reward": 3.612781286239624, + "reward_std": 0.0973962377756834, + "rewards/final_reward": 1.796999700367992, + "rewards/mask_iou_reward": 0.898499850183996, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6127811670303345, + "rewards/thk_ans_format_reward": 1.0, + "step": 2479, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.0416717529297, + "epoch": 8.377740303541316, + "grad_norm": 38.40838261997966, + "kl": 0.482421875, + "learning_rate": 3.018018018018018e-07, + "loss": 0.0005, + "reward": 3.380762219429016, + "reward_std": 0.2005249634385109, + "rewards/final_reward": 1.290711448853035, + "rewards/mask_iou_reward": 0.6453557244265175, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3911789059638977, + "rewards/thk_ans_format_reward": 1.0, + "step": 2480, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.29166793823242, + "epoch": 8.381112984822934, + "grad_norm": 11.097298558244487, + "kl": 0.5908203125, + "learning_rate": 3.0152027027027026e-07, + "loss": 0.0006, + "reward": 3.3576884269714355, + "reward_std": 0.041434711776673794, + "rewards/final_reward": 0.9189116504533033, + "rewards/mask_iou_reward": 0.4594558252266516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.357688307762146, + "rewards/thk_ans_format_reward": 1.0, + "step": 2481, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.95834350585938, + "epoch": 8.384485666104553, + "grad_norm": 6.519165982877114, + "kl": 0.40234375, + "learning_rate": 3.012387387387387e-07, + "loss": 0.0004, + "reward": 3.258583188056946, + "reward_std": 0.11860659997910261, + "rewards/final_reward": 0.9140629804971686, + "rewards/mask_iou_reward": 0.4570314902485843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2585831880569458, + "rewards/thk_ans_format_reward": 1.0, + "step": 2482, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.3854217529297, + "epoch": 8.387858347386173, + "grad_norm": 8.464867970457298, + "kl": 0.45703125, + "learning_rate": 3.009572072072072e-07, + "loss": 0.0005, + "reward": 3.522014021873474, + "reward_std": 0.036655642092227936, + "rewards/final_reward": 1.9023617745948194, + "rewards/mask_iou_reward": 0.9511808872974097, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5220139622688293, + "rewards/thk_ans_format_reward": 1.0, + "step": 2483, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.43750762939453, + "epoch": 8.391231028667791, + "grad_norm": 12.517351363030858, + "kl": 0.4521484375, + "learning_rate": 3.0067567567567564e-07, + "loss": 0.0005, + "reward": 2.9502170085906982, + "reward_std": 0.3760811146348715, + "rewards/final_reward": 1.805158868698367, + "rewards/mask_iou_reward": 0.9025794343491835, + "rewards/sam_format_reward": 0.9166666865348816, + "rewards/sam_reward_func_ultra": 1.1168835163116455, + "rewards/thk_ans_format_reward": 0.9166666865348816, + "step": 2484, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.61458587646484, + "epoch": 8.39460370994941, + "grad_norm": 9.591095522414026, + "kl": 0.6611328125, + "learning_rate": 3.0039414414414415e-07, + "loss": 0.0007, + "reward": 3.5364558696746826, + "reward_std": 0.009379489347338676, + "rewards/final_reward": 1.732489508652729, + "rewards/mask_iou_reward": 0.8662447543263645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.536455750465393, + "rewards/thk_ans_format_reward": 1.0, + "step": 2485, + "think_completion_length": 9.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.59375762939453, + "epoch": 8.397976391231028, + "grad_norm": 19.63539543049306, + "kl": 0.361328125, + "learning_rate": 3.001126126126126e-07, + "loss": 0.0004, + "reward": 3.6687779426574707, + "reward_std": 0.13566308468580246, + "rewards/final_reward": 1.8564654166287857, + "rewards/mask_iou_reward": 0.9282327083143929, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6687778234481812, + "rewards/thk_ans_format_reward": 1.0, + "step": 2486, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.6979217529297, + "epoch": 8.401349072512648, + "grad_norm": 21.022706427785067, + "kl": 0.4287109375, + "learning_rate": 2.9983108108108107e-07, + "loss": 0.0005, + "reward": 3.546531081199646, + "reward_std": 0.04343246482312679, + "rewards/final_reward": 0.910726898082419, + "rewards/mask_iou_reward": 0.4553634490412095, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5465310215950012, + "rewards/thk_ans_format_reward": 1.0, + "step": 2487, + "think_completion_length": 9.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.3125, + "epoch": 8.404721753794266, + "grad_norm": 86.12108620293544, + "kl": 0.564453125, + "learning_rate": 2.9954954954954953e-07, + "loss": 0.0006, + "reward": 3.336834669113159, + "reward_std": 0.04387115687131882, + "rewards/final_reward": 1.6743125925985707, + "rewards/mask_iou_reward": 0.8371562962992853, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3368346691131592, + "rewards/thk_ans_format_reward": 1.0, + "step": 2488, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.70834350585938, + "epoch": 8.408094435075885, + "grad_norm": 5.607179483122207, + "kl": 0.4375, + "learning_rate": 2.99268018018018e-07, + "loss": 0.0005, + "reward": 3.646553158760071, + "reward_std": 0.036387352272868156, + "rewards/final_reward": 1.8311148539790383, + "rewards/mask_iou_reward": 0.9155574269895191, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6465531587600708, + "rewards/thk_ans_format_reward": 1.0, + "step": 2489, + "think_completion_length": 10.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.4479217529297, + "epoch": 8.411467116357505, + "grad_norm": 17.164425299435482, + "kl": 0.57421875, + "learning_rate": 2.989864864864865e-07, + "loss": 0.0006, + "reward": 3.4151976108551025, + "reward_std": 0.05222295597195625, + "rewards/final_reward": 1.0455022682978328, + "rewards/mask_iou_reward": 0.5227511341489164, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.415197730064392, + "rewards/thk_ans_format_reward": 1.0, + "step": 2490, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.40625762939453, + "epoch": 8.414839797639123, + "grad_norm": 9.427976224812596, + "kl": 0.4296875, + "learning_rate": 2.9870495495495497e-07, + "loss": 0.0004, + "reward": 3.4724154472351074, + "reward_std": 0.02874742913991213, + "rewards/final_reward": 1.2115602872674582, + "rewards/mask_iou_reward": 0.6057801436337291, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.472415566444397, + "rewards/thk_ans_format_reward": 1.0, + "step": 2491, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.8125, + "epoch": 8.418212478920742, + "grad_norm": 9.332438234253871, + "kl": 0.40625, + "learning_rate": 2.9842342342342343e-07, + "loss": 0.0004, + "reward": 3.3823347091674805, + "reward_std": 0.08106222376227379, + "rewards/final_reward": 1.2735577478572697, + "rewards/mask_iou_reward": 0.6367788739286349, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3823343515396118, + "rewards/thk_ans_format_reward": 1.0, + "step": 2492, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.58333587646484, + "epoch": 8.42158516020236, + "grad_norm": 5.7462511768993645, + "kl": 0.431640625, + "learning_rate": 2.981418918918919e-07, + "loss": 0.0004, + "reward": 3.6817235946655273, + "reward_std": 0.07646342925727367, + "rewards/final_reward": 1.6200802844913902, + "rewards/mask_iou_reward": 0.8100401422456951, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6817233562469482, + "rewards/thk_ans_format_reward": 1.0, + "step": 2493, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.8854217529297, + "epoch": 8.42495784148398, + "grad_norm": 18.36974505848094, + "kl": 0.49609375, + "learning_rate": 2.9786036036036035e-07, + "loss": 0.0005, + "reward": 3.3335955142974854, + "reward_std": 0.0239328695461154, + "rewards/final_reward": 0.4093485643823379, + "rewards/mask_iou_reward": 0.20467428219116895, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3335955739021301, + "rewards/thk_ans_format_reward": 1.0, + "step": 2494, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.98958587646484, + "epoch": 8.428330522765599, + "grad_norm": 14.168116351084729, + "kl": 0.4853515625, + "learning_rate": 2.9757882882882886e-07, + "loss": 0.0005, + "reward": 3.652899146080017, + "reward_std": 0.06641834788024426, + "rewards/final_reward": 1.578465960049993, + "rewards/mask_iou_reward": 0.7892329800249965, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.652899146080017, + "rewards/thk_ans_format_reward": 1.0, + "step": 2495, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.3229217529297, + "epoch": 8.431703204047217, + "grad_norm": 9.093427773961716, + "kl": 0.3779296875, + "learning_rate": 2.972972972972973e-07, + "loss": 0.0004, + "reward": 3.5432502031326294, + "reward_std": 0.06956898421049118, + "rewards/final_reward": 0.9596189913403599, + "rewards/mask_iou_reward": 0.47980949567017994, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5432501435279846, + "rewards/thk_ans_format_reward": 1.0, + "step": 2496, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.4895896911621, + "epoch": 8.435075885328837, + "grad_norm": 8.366250293662942, + "kl": 0.44921875, + "learning_rate": 2.9701576576576573e-07, + "loss": 0.0005, + "reward": 3.49937105178833, + "reward_std": 0.036435868591070175, + "rewards/final_reward": 1.5941931373008824, + "rewards/mask_iou_reward": 0.7970965686504412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.499370813369751, + "rewards/thk_ans_format_reward": 1.0, + "step": 2497, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.8541717529297, + "epoch": 8.438448566610456, + "grad_norm": 100.97623268945327, + "kl": 0.50390625, + "learning_rate": 2.967342342342342e-07, + "loss": 0.0005, + "reward": 3.4538137912750244, + "reward_std": 0.13186132721602917, + "rewards/final_reward": 1.4109515061725602, + "rewards/mask_iou_reward": 0.7054757530862801, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4538137316703796, + "rewards/thk_ans_format_reward": 1.0, + "step": 2498, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.8125, + "epoch": 8.441821247892074, + "grad_norm": 9.866319728457622, + "kl": 0.44921875, + "learning_rate": 2.9645270270270265e-07, + "loss": 0.0004, + "reward": 3.690119504928589, + "reward_std": 0.06055077165365219, + "rewards/final_reward": 1.778394397487585, + "rewards/mask_iou_reward": 0.8891971987437925, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.690119445323944, + "rewards/thk_ans_format_reward": 1.0, + "step": 2499, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.7604217529297, + "epoch": 8.445193929173692, + "grad_norm": 14.757207246280633, + "kl": 0.421875, + "learning_rate": 2.961711711711711e-07, + "loss": 0.0004, + "reward": 3.51715350151062, + "reward_std": 0.1072116307914257, + "rewards/final_reward": 1.4594061525948856, + "rewards/mask_iou_reward": 0.7297030762974428, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5171534419059753, + "rewards/thk_ans_format_reward": 1.0, + "step": 2500, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.4479217529297, + "epoch": 8.448566610455313, + "grad_norm": 10.616897251939372, + "kl": 3.1396484375, + "learning_rate": 2.9588963963963963e-07, + "loss": 0.0031, + "reward": 3.537464737892151, + "reward_std": 0.08383292891085148, + "rewards/final_reward": 1.5480614030024484, + "rewards/mask_iou_reward": 0.7740307015012242, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.537464678287506, + "rewards/thk_ans_format_reward": 1.0, + "step": 2501, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.98958969116211, + "epoch": 8.451939291736931, + "grad_norm": 8.944695462699952, + "kl": 0.533203125, + "learning_rate": 2.956081081081081e-07, + "loss": 0.0005, + "reward": 3.5444079637527466, + "reward_std": 0.08628739230334759, + "rewards/final_reward": 1.2522563293073279, + "rewards/mask_iou_reward": 0.6261281646536639, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5444077253341675, + "rewards/thk_ans_format_reward": 1.0, + "step": 2502, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.7916717529297, + "epoch": 8.45531197301855, + "grad_norm": 18.813662646472718, + "kl": 0.5625, + "learning_rate": 2.9532657657657655e-07, + "loss": 0.0006, + "reward": 3.4374442100524902, + "reward_std": 0.10433689411729574, + "rewards/final_reward": 1.61969842826039, + "rewards/mask_iou_reward": 0.809849214130195, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4374441504478455, + "rewards/thk_ans_format_reward": 1.0, + "step": 2503, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.52083587646484, + "epoch": 8.45868465430017, + "grad_norm": 14.633252688172275, + "kl": 0.4501953125, + "learning_rate": 2.95045045045045e-07, + "loss": 0.0005, + "reward": 3.606281876564026, + "reward_std": 0.04830903559923172, + "rewards/final_reward": 1.6475285395482937, + "rewards/mask_iou_reward": 0.8237642697741469, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6062817573547363, + "rewards/thk_ans_format_reward": 1.0, + "step": 2504, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.0104217529297, + "epoch": 8.462057335581788, + "grad_norm": 24.03846680399155, + "kl": 0.52734375, + "learning_rate": 2.9476351351351347e-07, + "loss": 0.0005, + "reward": 3.4330817461013794, + "reward_std": 0.22954870760440826, + "rewards/final_reward": 1.5019694669903112, + "rewards/mask_iou_reward": 0.7509847334951556, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.453914999961853, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2505, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.20833587646484, + "epoch": 8.465430016863406, + "grad_norm": 13.402525176205542, + "kl": 0.47265625, + "learning_rate": 2.94481981981982e-07, + "loss": 0.0005, + "reward": 3.840874671936035, + "reward_std": 0.068333032540977, + "rewards/final_reward": 1.80296798970013, + "rewards/mask_iou_reward": 0.901483994850065, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8408745527267456, + "rewards/thk_ans_format_reward": 1.0, + "step": 2506, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.43750762939453, + "epoch": 8.468802698145025, + "grad_norm": 9.997875936893529, + "kl": 0.455078125, + "learning_rate": 2.9420045045045045e-07, + "loss": 0.0005, + "reward": 3.3141136169433594, + "reward_std": 0.06572789885103703, + "rewards/final_reward": 1.3835898973357565, + "rewards/mask_iou_reward": 0.6917949486678783, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3141136169433594, + "rewards/thk_ans_format_reward": 1.0, + "step": 2507, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.34375762939453, + "epoch": 8.472175379426645, + "grad_norm": 14.379412047558572, + "kl": 0.455078125, + "learning_rate": 2.939189189189189e-07, + "loss": 0.0005, + "reward": 3.2447589635849, + "reward_std": 0.06407011300325394, + "rewards/final_reward": 0.7249371697291964, + "rewards/mask_iou_reward": 0.3624685848645982, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2447589635849, + "rewards/thk_ans_format_reward": 1.0, + "step": 2508, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.9166717529297, + "epoch": 8.475548060708263, + "grad_norm": 10.626344525000441, + "kl": 0.447265625, + "learning_rate": 2.9363738738738737e-07, + "loss": 0.0004, + "reward": 3.5443687438964844, + "reward_std": 0.011294094379991293, + "rewards/final_reward": 0.8410213537308809, + "rewards/mask_iou_reward": 0.42051067686544047, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.544368863105774, + "rewards/thk_ans_format_reward": 1.0, + "step": 2509, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.56250762939453, + "epoch": 8.478920741989882, + "grad_norm": 141.9077108977809, + "kl": 0.5576171875, + "learning_rate": 2.9335585585585583e-07, + "loss": 0.0006, + "reward": 3.2660834789276123, + "reward_std": 0.027983209118247032, + "rewards/final_reward": 1.5384706716644456, + "rewards/mask_iou_reward": 0.7692353358322228, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2660833597183228, + "rewards/thk_ans_format_reward": 1.0, + "step": 2510, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.2604217529297, + "epoch": 8.4822934232715, + "grad_norm": 7.13545711592528, + "kl": 0.39453125, + "learning_rate": 2.9307432432432434e-07, + "loss": 0.0004, + "reward": 3.448287844657898, + "reward_std": 0.07316517271101475, + "rewards/final_reward": 1.509025737794106, + "rewards/mask_iou_reward": 0.754512868897053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.448287844657898, + "rewards/thk_ans_format_reward": 1.0, + "step": 2511, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.8854217529297, + "epoch": 8.48566610455312, + "grad_norm": 14.17306525874049, + "kl": 0.4873046875, + "learning_rate": 2.927927927927928e-07, + "loss": 0.0005, + "reward": 3.374244809150696, + "reward_std": 0.042572012171149254, + "rewards/final_reward": 1.4661105339791107, + "rewards/mask_iou_reward": 0.7330552669895554, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3742445707321167, + "rewards/thk_ans_format_reward": 1.0, + "step": 2512, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.59375381469727, + "epoch": 8.489038785834738, + "grad_norm": 8.979587153252908, + "kl": 0.49609375, + "learning_rate": 2.9251126126126126e-07, + "loss": 0.0005, + "reward": 3.571820020675659, + "reward_std": 0.06202232651412487, + "rewards/final_reward": 1.7060769163035236, + "rewards/mask_iou_reward": 0.8530384581517618, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5718199014663696, + "rewards/thk_ans_format_reward": 1.0, + "step": 2513, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.0208396911621, + "epoch": 8.492411467116357, + "grad_norm": 17.016517968673256, + "kl": 0.521484375, + "learning_rate": 2.922297297297297e-07, + "loss": 0.0005, + "reward": 3.4522387981414795, + "reward_std": 0.08139899373054504, + "rewards/final_reward": 1.6619377635910786, + "rewards/mask_iou_reward": 0.8309688817955393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4522387981414795, + "rewards/thk_ans_format_reward": 1.0, + "step": 2514, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.5416717529297, + "epoch": 8.495784148397977, + "grad_norm": 7.961331611236981, + "kl": 0.59765625, + "learning_rate": 2.919481981981982e-07, + "loss": 0.0006, + "reward": 3.4150949716567993, + "reward_std": 0.0757363960146904, + "rewards/final_reward": 1.639288865133204, + "rewards/mask_iou_reward": 0.819644432566602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4150949716567993, + "rewards/thk_ans_format_reward": 1.0, + "step": 2515, + "think_completion_length": 10.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.15625762939453, + "epoch": 8.499156829679595, + "grad_norm": 10.052498835832177, + "kl": 0.4814453125, + "learning_rate": 2.916666666666667e-07, + "loss": 0.0005, + "reward": 3.7526577711105347, + "reward_std": 0.04326079413294792, + "rewards/final_reward": 1.6635693312937825, + "rewards/mask_iou_reward": 0.8317846656468912, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7526578307151794, + "rewards/thk_ans_format_reward": 1.0, + "step": 2516, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.25000762939453, + "epoch": 8.502529510961214, + "grad_norm": 8.869345441898515, + "kl": 0.6494140625, + "learning_rate": 2.913851351351351e-07, + "loss": 0.0006, + "reward": 3.7935723066329956, + "reward_std": 0.08686716388911009, + "rewards/final_reward": 1.8359953194848295, + "rewards/mask_iou_reward": 0.9179976597424148, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7935723066329956, + "rewards/thk_ans_format_reward": 1.0, + "step": 2517, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.9479217529297, + "epoch": 8.505902192242832, + "grad_norm": 9.171440126815991, + "kl": 0.3994140625, + "learning_rate": 2.9110360360360357e-07, + "loss": 0.0004, + "reward": 3.4219143390655518, + "reward_std": 0.1695508360862732, + "rewards/final_reward": 1.7729887482617612, + "rewards/mask_iou_reward": 0.8864943741308806, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4531643986701965, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2518, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.90625, + "epoch": 8.509274873524452, + "grad_norm": 22.261539602959235, + "kl": 0.4921875, + "learning_rate": 2.9082207207207203e-07, + "loss": 0.0005, + "reward": 3.3779337406158447, + "reward_std": 0.1589372158050537, + "rewards/final_reward": 1.3534446510088611, + "rewards/mask_iou_reward": 0.6767223255044306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3779336214065552, + "rewards/thk_ans_format_reward": 1.0, + "step": 2519, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.17708587646484, + "epoch": 8.51264755480607, + "grad_norm": 22.363042010028014, + "kl": 0.470703125, + "learning_rate": 2.905405405405405e-07, + "loss": 0.0005, + "reward": 3.648926615715027, + "reward_std": 0.022691112011671066, + "rewards/final_reward": 1.4853223530097646, + "rewards/mask_iou_reward": 0.7426611765048823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6489266157150269, + "rewards/thk_ans_format_reward": 1.0, + "step": 2520, + "think_completion_length": 9.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.4479217529297, + "epoch": 8.516020236087689, + "grad_norm": 139.0103838529845, + "kl": 0.447265625, + "learning_rate": 2.90259009009009e-07, + "loss": 0.0004, + "reward": 3.5983364582061768, + "reward_std": 0.1036510244011879, + "rewards/final_reward": 1.9440492694703408, + "rewards/mask_iou_reward": 0.9720246347351704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5983361601829529, + "rewards/thk_ans_format_reward": 1.0, + "step": 2521, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.05208587646484, + "epoch": 8.51939291736931, + "grad_norm": 6.94058567280915, + "kl": 0.4609375, + "learning_rate": 2.8997747747747746e-07, + "loss": 0.0005, + "reward": 3.624302864074707, + "reward_std": 0.08075489476323128, + "rewards/final_reward": 1.7019580353050605, + "rewards/mask_iou_reward": 0.8509790176525303, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6243028044700623, + "rewards/thk_ans_format_reward": 1.0, + "step": 2522, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.17708587646484, + "epoch": 8.522765598650928, + "grad_norm": 10.212955282258918, + "kl": 0.5166015625, + "learning_rate": 2.896959459459459e-07, + "loss": 0.0005, + "reward": 3.6895445585250854, + "reward_std": 0.03810789994895458, + "rewards/final_reward": 1.8911533097852695, + "rewards/mask_iou_reward": 0.9455766548926348, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.689544677734375, + "rewards/thk_ans_format_reward": 1.0, + "step": 2523, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.84375762939453, + "epoch": 8.526138279932546, + "grad_norm": 11.602663710945789, + "kl": 0.4384765625, + "learning_rate": 2.894144144144144e-07, + "loss": 0.0004, + "reward": 3.2343530654907227, + "reward_std": 0.14465375151485205, + "rewards/final_reward": 1.6413925943182874, + "rewards/mask_iou_reward": 0.8206962971591437, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2551867961883545, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2524, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.40625762939453, + "epoch": 8.529510961214164, + "grad_norm": 161.6466819493176, + "kl": 0.5830078125, + "learning_rate": 2.8913288288288284e-07, + "loss": 0.0006, + "reward": 3.6265861988067627, + "reward_std": 0.10466808825731277, + "rewards/final_reward": 1.3112621176439139, + "rewards/mask_iou_reward": 0.6556310588219569, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6265860795974731, + "rewards/thk_ans_format_reward": 1.0, + "step": 2525, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.86458587646484, + "epoch": 8.532883642495785, + "grad_norm": 7.11750737081999, + "kl": 0.4521484375, + "learning_rate": 2.8885135135135136e-07, + "loss": 0.0005, + "reward": 3.681373715400696, + "reward_std": 0.049964262172579765, + "rewards/final_reward": 1.7365305951596461, + "rewards/mask_iou_reward": 0.8682652975798231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6813737154006958, + "rewards/thk_ans_format_reward": 1.0, + "step": 2526, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.37500762939453, + "epoch": 8.536256323777403, + "grad_norm": 7.352675052372248, + "kl": 0.6171875, + "learning_rate": 2.885698198198198e-07, + "loss": 0.0006, + "reward": 3.810825228691101, + "reward_std": 0.049029380083084106, + "rewards/final_reward": 1.8610386605291138, + "rewards/mask_iou_reward": 0.9305193302645569, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8108253479003906, + "rewards/thk_ans_format_reward": 1.0, + "step": 2527, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.25000762939453, + "epoch": 8.539629005059021, + "grad_norm": 10.908191542281262, + "kl": 0.443359375, + "learning_rate": 2.882882882882883e-07, + "loss": 0.0004, + "reward": 3.490597367286682, + "reward_std": 0.07156710140407085, + "rewards/final_reward": 0.9468506256295272, + "rewards/mask_iou_reward": 0.4734253128147636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4905971884727478, + "rewards/thk_ans_format_reward": 1.0, + "step": 2528, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.59375762939453, + "epoch": 8.543001686340641, + "grad_norm": 11.045030765018707, + "kl": 0.4306640625, + "learning_rate": 2.8800675675675674e-07, + "loss": 0.0004, + "reward": 3.5810824632644653, + "reward_std": 0.07197471894323826, + "rewards/final_reward": 1.613765532250191, + "rewards/mask_iou_reward": 0.8068827661250955, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5810824632644653, + "rewards/thk_ans_format_reward": 1.0, + "step": 2529, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.0416717529297, + "epoch": 8.54637436762226, + "grad_norm": 14.456941600396206, + "kl": 0.458984375, + "learning_rate": 2.877252252252252e-07, + "loss": 0.0005, + "reward": 3.5833380222320557, + "reward_std": 0.06957448460161686, + "rewards/final_reward": 1.878485730364916, + "rewards/mask_iou_reward": 0.939242865182458, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5833380222320557, + "rewards/thk_ans_format_reward": 1.0, + "step": 2530, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.53125, + "epoch": 8.549747048903878, + "grad_norm": 12.664397632704164, + "kl": 0.4677734375, + "learning_rate": 2.874436936936937e-07, + "loss": 0.0005, + "reward": 3.670201539993286, + "reward_std": 0.023576030042022467, + "rewards/final_reward": 1.4009314958362795, + "rewards/mask_iou_reward": 0.7004657479181398, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6702014803886414, + "rewards/thk_ans_format_reward": 1.0, + "step": 2531, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.9791717529297, + "epoch": 8.553119730185497, + "grad_norm": 18.977288893600278, + "kl": 0.4638671875, + "learning_rate": 2.871621621621622e-07, + "loss": 0.0005, + "reward": 3.491969347000122, + "reward_std": 0.059009552001953125, + "rewards/final_reward": 1.1136705530124207, + "rewards/mask_iou_reward": 0.5568352765062103, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4919692277908325, + "rewards/thk_ans_format_reward": 1.0, + "step": 2532, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.7291717529297, + "epoch": 8.556492411467117, + "grad_norm": 9.423306246424612, + "kl": 0.498046875, + "learning_rate": 2.8688063063063063e-07, + "loss": 0.0005, + "reward": 3.7101043462753296, + "reward_std": 0.015714637003839016, + "rewards/final_reward": 1.8516040221880408, + "rewards/mask_iou_reward": 0.9258020110940204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7101043462753296, + "rewards/thk_ans_format_reward": 1.0, + "step": 2533, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.06250762939453, + "epoch": 8.559865092748735, + "grad_norm": 11.2280568629412, + "kl": 0.4375, + "learning_rate": 2.865990990990991e-07, + "loss": 0.0004, + "reward": 3.4110137224197388, + "reward_std": 0.11291562020778656, + "rewards/final_reward": 1.6769507866752837, + "rewards/mask_iou_reward": 0.8384753933376419, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4110137224197388, + "rewards/thk_ans_format_reward": 1.0, + "step": 2534, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.6666717529297, + "epoch": 8.563237774030354, + "grad_norm": 13.206039611412244, + "kl": 0.46875, + "learning_rate": 2.8631756756756756e-07, + "loss": 0.0005, + "reward": 3.5476059913635254, + "reward_std": 0.077615050598979, + "rewards/final_reward": 1.3061237702838464, + "rewards/mask_iou_reward": 0.6530618851419232, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5476059317588806, + "rewards/thk_ans_format_reward": 1.0, + "step": 2535, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.3854217529297, + "epoch": 8.566610455311974, + "grad_norm": 12.925283052005339, + "kl": 0.3955078125, + "learning_rate": 2.8603603603603607e-07, + "loss": 0.0004, + "reward": 3.677824378013611, + "reward_std": 0.04426476452499628, + "rewards/final_reward": 1.8621604391032591, + "rewards/mask_iou_reward": 0.9310802195516296, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.677824318408966, + "rewards/thk_ans_format_reward": 1.0, + "step": 2536, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.96875762939453, + "epoch": 8.569983136593592, + "grad_norm": 26.031777808217022, + "kl": 0.3955078125, + "learning_rate": 2.857545045045045e-07, + "loss": 0.0004, + "reward": 3.6277259588241577, + "reward_std": 0.07600187882781029, + "rewards/final_reward": 1.5448248992903784, + "rewards/mask_iou_reward": 0.7724124496451892, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.627725899219513, + "rewards/thk_ans_format_reward": 1.0, + "step": 2537, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.6354217529297, + "epoch": 8.57335581787521, + "grad_norm": 18.72130767765532, + "kl": 0.419921875, + "learning_rate": 2.8547297297297294e-07, + "loss": 0.0005, + "reward": 3.756251096725464, + "reward_std": 0.05335315503180027, + "rewards/final_reward": 1.4595373271868004, + "rewards/mask_iou_reward": 0.7297686635934002, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.75625079870224, + "rewards/thk_ans_format_reward": 1.0, + "step": 2538, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.375, + "epoch": 8.576728499156829, + "grad_norm": 88.11159314067041, + "kl": 0.6083984375, + "learning_rate": 2.851914414414414e-07, + "loss": 0.0006, + "reward": 3.179754376411438, + "reward_std": 0.13116220384836197, + "rewards/final_reward": 0.8943421914303193, + "rewards/mask_iou_reward": 0.44717109571515967, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1797543168067932, + "rewards/thk_ans_format_reward": 1.0, + "step": 2539, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.17708587646484, + "epoch": 8.580101180438449, + "grad_norm": 5.732811017543621, + "kl": 0.375, + "learning_rate": 2.8490990990990986e-07, + "loss": 0.0004, + "reward": 3.658676028251648, + "reward_std": 0.11672421544790268, + "rewards/final_reward": 1.626043902889117, + "rewards/mask_iou_reward": 0.8130219514445585, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6690927743911743, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2540, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 118.67708969116211, + "epoch": 8.583473861720067, + "grad_norm": 66.56071625946628, + "kl": 0.7890625, + "learning_rate": 2.8462837837837837e-07, + "loss": 0.0008, + "reward": 3.4864169359207153, + "reward_std": 0.1309407837688923, + "rewards/final_reward": 1.6494711168698228, + "rewards/mask_iou_reward": 0.8247355584349114, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4864166378974915, + "rewards/thk_ans_format_reward": 1.0, + "step": 2541, + "think_completion_length": 10.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.3541717529297, + "epoch": 8.586846543001686, + "grad_norm": 8.568815174969663, + "kl": 0.470703125, + "learning_rate": 2.8434684684684683e-07, + "loss": 0.0005, + "reward": 3.5493093729019165, + "reward_std": 0.08573894761502743, + "rewards/final_reward": 1.477603366021547, + "rewards/mask_iou_reward": 0.7388016830107735, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5493090152740479, + "rewards/thk_ans_format_reward": 1.0, + "step": 2542, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.17709350585938, + "epoch": 8.590219224283306, + "grad_norm": 7.298574233572978, + "kl": 0.4609375, + "learning_rate": 2.840653153153153e-07, + "loss": 0.0005, + "reward": 3.6566131114959717, + "reward_std": 0.11266613006591797, + "rewards/final_reward": 1.5176401166274118, + "rewards/mask_iou_reward": 0.7588200583137059, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6566131711006165, + "rewards/thk_ans_format_reward": 1.0, + "step": 2543, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.14583587646484, + "epoch": 8.593591905564924, + "grad_norm": 26.55636292637615, + "kl": 0.5673828125, + "learning_rate": 2.8378378378378376e-07, + "loss": 0.0006, + "reward": 3.783558487892151, + "reward_std": 0.042043750174343586, + "rewards/final_reward": 1.9065222306385974, + "rewards/mask_iou_reward": 0.9532611153192987, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7835585474967957, + "rewards/thk_ans_format_reward": 1.0, + "step": 2544, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.65625762939453, + "epoch": 8.596964586846543, + "grad_norm": 12.844963607007417, + "kl": 0.77734375, + "learning_rate": 2.835022522522522e-07, + "loss": 0.0008, + "reward": 3.539679527282715, + "reward_std": 0.04935073805972934, + "rewards/final_reward": 1.8597483490103142, + "rewards/mask_iou_reward": 0.9298741745051571, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5396792888641357, + "rewards/thk_ans_format_reward": 1.0, + "step": 2545, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.8854217529297, + "epoch": 8.600337268128161, + "grad_norm": 45.98019538397138, + "kl": 0.5439453125, + "learning_rate": 2.8322072072072073e-07, + "loss": 0.0005, + "reward": 3.5690609216690063, + "reward_std": 0.04167831316590309, + "rewards/final_reward": 1.0290717329363943, + "rewards/mask_iou_reward": 0.5145358664681972, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5690608620643616, + "rewards/thk_ans_format_reward": 1.0, + "step": 2546, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.42708587646484, + "epoch": 8.603709949409781, + "grad_norm": 6.232417525749554, + "kl": 0.4443359375, + "learning_rate": 2.829391891891892e-07, + "loss": 0.0005, + "reward": 3.1595572233200073, + "reward_std": 0.10324277426116168, + "rewards/final_reward": 1.074434420208405, + "rewards/mask_iou_reward": 0.5372172101042025, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.180390626192093, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2547, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.84375, + "epoch": 8.6070826306914, + "grad_norm": 42.12515218147006, + "kl": 0.455078125, + "learning_rate": 2.8265765765765765e-07, + "loss": 0.0005, + "reward": 3.5446321964263916, + "reward_std": 0.08529717102646828, + "rewards/final_reward": 1.8174893080345567, + "rewards/mask_iou_reward": 0.9087446540172783, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5446322560310364, + "rewards/thk_ans_format_reward": 1.0, + "step": 2548, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.12500762939453, + "epoch": 8.610455311973018, + "grad_norm": 9.854312839397398, + "kl": 0.408203125, + "learning_rate": 2.823761261261261e-07, + "loss": 0.0004, + "reward": 3.767038345336914, + "reward_std": 0.06215832382440567, + "rewards/final_reward": 1.904067015100479, + "rewards/mask_iou_reward": 0.9520335075502395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.767038345336914, + "rewards/thk_ans_format_reward": 1.0, + "step": 2549, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.38541793823242, + "epoch": 8.613827993254638, + "grad_norm": 6.976886758251197, + "kl": 0.568359375, + "learning_rate": 2.8209459459459457e-07, + "loss": 0.0006, + "reward": 3.729358434677124, + "reward_std": 0.08664998784661293, + "rewards/final_reward": 1.9594386198347054, + "rewards/mask_iou_reward": 0.9797193099173527, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7293584942817688, + "rewards/thk_ans_format_reward": 1.0, + "step": 2550, + "think_completion_length": 12.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.83333587646484, + "epoch": 8.617200674536257, + "grad_norm": 6.5393567285906995, + "kl": 0.43359375, + "learning_rate": 2.818130630630631e-07, + "loss": 0.0005, + "reward": 3.4602993726730347, + "reward_std": 0.168155737221241, + "rewards/final_reward": 1.905844650653056, + "rewards/mask_iou_reward": 0.952922325326528, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.470715880393982, + "rewards/thk_ans_format_reward": 1.0, + "step": 2551, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.73959350585938, + "epoch": 8.620573355817875, + "grad_norm": 11.85388614199403, + "kl": 0.455078125, + "learning_rate": 2.8153153153153155e-07, + "loss": 0.0005, + "reward": 3.556620478630066, + "reward_std": 0.061766088008880615, + "rewards/final_reward": 1.7875575724226098, + "rewards/mask_iou_reward": 0.8937787862113049, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5566202998161316, + "rewards/thk_ans_format_reward": 1.0, + "step": 2552, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.0104217529297, + "epoch": 8.623946037099493, + "grad_norm": 10.547197211481848, + "kl": 0.4208984375, + "learning_rate": 2.8125e-07, + "loss": 0.0004, + "reward": 3.4441052675247192, + "reward_std": 0.08604156225919724, + "rewards/final_reward": 1.3584484672443913, + "rewards/mask_iou_reward": 0.6792242336221956, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4441050291061401, + "rewards/thk_ans_format_reward": 1.0, + "step": 2553, + "think_completion_length": 9.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.59375762939453, + "epoch": 8.627318718381114, + "grad_norm": 18.734391005705923, + "kl": 0.5078125, + "learning_rate": 2.8096846846846847e-07, + "loss": 0.0005, + "reward": 3.513301134109497, + "reward_std": 0.07055116072297096, + "rewards/final_reward": 1.6922842308381596, + "rewards/mask_iou_reward": 0.8461421154190798, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5133010149002075, + "rewards/thk_ans_format_reward": 1.0, + "step": 2554, + "think_completion_length": 11.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.9791717529297, + "epoch": 8.630691399662732, + "grad_norm": 26.468379779375635, + "kl": 0.62109375, + "learning_rate": 2.8068693693693693e-07, + "loss": 0.0006, + "reward": 3.3101227283477783, + "reward_std": 0.2818397730588913, + "rewards/final_reward": 1.654746762295793, + "rewards/mask_iou_reward": 0.8273733811478965, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3309558629989624, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2555, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.8229217529297, + "epoch": 8.63406408094435, + "grad_norm": 5.290031724177615, + "kl": 0.5205078125, + "learning_rate": 2.804054054054054e-07, + "loss": 0.0005, + "reward": 3.3882901668548584, + "reward_std": 0.04036908410489559, + "rewards/final_reward": 1.7354911968642042, + "rewards/mask_iou_reward": 0.8677455984321021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.388290286064148, + "rewards/thk_ans_format_reward": 1.0, + "step": 2556, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.7916717529297, + "epoch": 8.63743676222597, + "grad_norm": 15.860539572750685, + "kl": 0.4921875, + "learning_rate": 2.8012387387387385e-07, + "loss": 0.0005, + "reward": 3.6285096406936646, + "reward_std": 0.09304303559474647, + "rewards/final_reward": 1.8183068185832045, + "rewards/mask_iou_reward": 0.9091534092916023, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6285096406936646, + "rewards/thk_ans_format_reward": 1.0, + "step": 2557, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.1041717529297, + "epoch": 8.640809443507589, + "grad_norm": 16.90640893768875, + "kl": 0.435546875, + "learning_rate": 2.798423423423423e-07, + "loss": 0.0005, + "reward": 3.600364923477173, + "reward_std": 0.056407464668154716, + "rewards/final_reward": 1.4578039481564466, + "rewards/mask_iou_reward": 0.7289019740782233, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6003649234771729, + "rewards/thk_ans_format_reward": 1.0, + "step": 2558, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.34375762939453, + "epoch": 8.644182124789207, + "grad_norm": 11.810250822733689, + "kl": 0.458984375, + "learning_rate": 2.7956081081081077e-07, + "loss": 0.0005, + "reward": 3.52646803855896, + "reward_std": 0.05152006074786186, + "rewards/final_reward": 0.9467531404162066, + "rewards/mask_iou_reward": 0.4733765702081033, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5264679193496704, + "rewards/thk_ans_format_reward": 1.0, + "step": 2559, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.28125762939453, + "epoch": 8.647554806070826, + "grad_norm": 5.154818837939803, + "kl": 0.4912109375, + "learning_rate": 2.7927927927927923e-07, + "loss": 0.0006, + "reward": 3.6589834690093994, + "reward_std": 0.04206683021038771, + "rewards/final_reward": 1.6065023994965042, + "rewards/mask_iou_reward": 0.8032511997482521, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6589834094047546, + "rewards/thk_ans_format_reward": 1.0, + "step": 2560, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.4479217529297, + "epoch": 8.650927487352446, + "grad_norm": 16.38009193888725, + "kl": 0.587890625, + "learning_rate": 2.7899774774774775e-07, + "loss": 0.0006, + "reward": 3.7777464389801025, + "reward_std": 0.029515139758586884, + "rewards/final_reward": 1.896249769299833, + "rewards/mask_iou_reward": 0.9481248846499165, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.777746319770813, + "rewards/thk_ans_format_reward": 1.0, + "step": 2561, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.3020896911621, + "epoch": 8.654300168634064, + "grad_norm": 9.35187219045533, + "kl": 0.482421875, + "learning_rate": 2.787162162162162e-07, + "loss": 0.0005, + "reward": 3.4067482948303223, + "reward_std": 0.11706292629241943, + "rewards/final_reward": 1.073244567798334, + "rewards/mask_iou_reward": 0.536622283899167, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4067482352256775, + "rewards/thk_ans_format_reward": 1.0, + "step": 2562, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.65625762939453, + "epoch": 8.657672849915683, + "grad_norm": 6.688008058376227, + "kl": 0.453125, + "learning_rate": 2.7843468468468467e-07, + "loss": 0.0005, + "reward": 3.7152053117752075, + "reward_std": 0.023756575770676136, + "rewards/final_reward": 1.8820338318907854, + "rewards/mask_iou_reward": 0.9410169159453927, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7152053117752075, + "rewards/thk_ans_format_reward": 1.0, + "step": 2563, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.6979217529297, + "epoch": 8.661045531197303, + "grad_norm": 34.29633067802778, + "kl": 0.427734375, + "learning_rate": 2.7815315315315313e-07, + "loss": 0.0006, + "reward": 3.733788013458252, + "reward_std": 0.02422420820221305, + "rewards/final_reward": 1.9275858243600428, + "rewards/mask_iou_reward": 0.9637929121800214, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.733788013458252, + "rewards/thk_ans_format_reward": 1.0, + "step": 2564, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.1354217529297, + "epoch": 8.664418212478921, + "grad_norm": 12.887212200092963, + "kl": 0.4501953125, + "learning_rate": 2.778716216216216e-07, + "loss": 0.0005, + "reward": 3.491172194480896, + "reward_std": 0.07623483892530203, + "rewards/final_reward": 1.2990106052827048, + "rewards/mask_iou_reward": 0.6495053026413524, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4911721348762512, + "rewards/thk_ans_format_reward": 1.0, + "step": 2565, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.6979217529297, + "epoch": 8.66779089376054, + "grad_norm": 12.075436890130483, + "kl": 0.4765625, + "learning_rate": 2.775900900900901e-07, + "loss": 0.0005, + "reward": 3.7700542211532593, + "reward_std": 0.09697945602238178, + "rewards/final_reward": 1.9443128383259172, + "rewards/mask_iou_reward": 0.9721564191629586, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7700544595718384, + "rewards/thk_ans_format_reward": 1.0, + "step": 2566, + "think_completion_length": 10.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.09375, + "epoch": 8.671163575042158, + "grad_norm": 7.363763311581606, + "kl": 0.43359375, + "learning_rate": 2.7730855855855856e-07, + "loss": 0.0004, + "reward": 3.4510533809661865, + "reward_std": 0.042235566303133965, + "rewards/final_reward": 1.744057022524851, + "rewards/mask_iou_reward": 0.8720285112624255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.451053500175476, + "rewards/thk_ans_format_reward": 1.0, + "step": 2567, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.34375762939453, + "epoch": 8.674536256323778, + "grad_norm": 13.697326391954496, + "kl": 0.583984375, + "learning_rate": 2.77027027027027e-07, + "loss": 0.0006, + "reward": 3.4283299446105957, + "reward_std": 0.040075878612697124, + "rewards/final_reward": 1.8370620019922566, + "rewards/mask_iou_reward": 0.9185310009961283, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.42833012342453, + "rewards/thk_ans_format_reward": 1.0, + "step": 2568, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.9479217529297, + "epoch": 8.677908937605396, + "grad_norm": 11.546432521196792, + "kl": 0.455078125, + "learning_rate": 2.767454954954955e-07, + "loss": 0.0005, + "reward": 3.5929055213928223, + "reward_std": 0.059562329202890396, + "rewards/final_reward": 1.768698927073372, + "rewards/mask_iou_reward": 0.884349463536686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5929054021835327, + "rewards/thk_ans_format_reward": 1.0, + "step": 2569, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.3125, + "epoch": 8.681281618887015, + "grad_norm": 10.9044199056127, + "kl": 0.390625, + "learning_rate": 2.7646396396396394e-07, + "loss": 0.0004, + "reward": 3.2602450847625732, + "reward_std": 0.03169908095151186, + "rewards/final_reward": 1.8138352665157995, + "rewards/mask_iou_reward": 0.9069176332578998, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2602448463439941, + "rewards/thk_ans_format_reward": 1.0, + "step": 2570, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.8541717529297, + "epoch": 8.684654300168635, + "grad_norm": 54.539015249753554, + "kl": 0.5869140625, + "learning_rate": 2.7618243243243246e-07, + "loss": 0.0006, + "reward": 3.5645973682403564, + "reward_std": 0.11071610450744629, + "rewards/final_reward": 0.8719106954026625, + "rewards/mask_iou_reward": 0.43595534770133126, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5645974278450012, + "rewards/thk_ans_format_reward": 1.0, + "step": 2571, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.81250762939453, + "epoch": 8.688026981450253, + "grad_norm": 11.927673383869944, + "kl": 0.408203125, + "learning_rate": 2.759009009009009e-07, + "loss": 0.0006, + "reward": 3.6039984226226807, + "reward_std": 0.10941345617175102, + "rewards/final_reward": 0.9543782866716485, + "rewards/mask_iou_reward": 0.47718914333582424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6039982438087463, + "rewards/thk_ans_format_reward": 1.0, + "step": 2572, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.05208587646484, + "epoch": 8.691399662731872, + "grad_norm": 8.034050467478677, + "kl": 0.595703125, + "learning_rate": 2.756193693693694e-07, + "loss": 0.0006, + "reward": 3.7109304666519165, + "reward_std": 0.033792685717344284, + "rewards/final_reward": 1.7174564503363023, + "rewards/mask_iou_reward": 0.8587282251681512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7109304070472717, + "rewards/thk_ans_format_reward": 1.0, + "step": 2573, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.0104217529297, + "epoch": 8.69477234401349, + "grad_norm": 21.422356986248246, + "kl": 0.623046875, + "learning_rate": 2.7533783783783784e-07, + "loss": 0.0006, + "reward": 3.3546335697174072, + "reward_std": 0.07038544863462448, + "rewards/final_reward": 0.7904122383978938, + "rewards/mask_iou_reward": 0.3952061191989469, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.354633629322052, + "rewards/thk_ans_format_reward": 1.0, + "step": 2574, + "think_completion_length": 9.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.8541717529297, + "epoch": 8.69814502529511, + "grad_norm": 9.696842487528931, + "kl": 0.5712890625, + "learning_rate": 2.7505630630630625e-07, + "loss": 0.0006, + "reward": 3.74173903465271, + "reward_std": 0.04419276397675276, + "rewards/final_reward": 1.784767222990809, + "rewards/mask_iou_reward": 0.8923836114954045, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7417391538619995, + "rewards/thk_ans_format_reward": 1.0, + "step": 2575, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.3229217529297, + "epoch": 8.701517706576729, + "grad_norm": 9.956113940997724, + "kl": 0.421875, + "learning_rate": 2.7477477477477476e-07, + "loss": 0.0004, + "reward": 3.691778302192688, + "reward_std": 0.03693321347236633, + "rewards/final_reward": 1.8514881707197457, + "rewards/mask_iou_reward": 0.9257440853598728, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6917780637741089, + "rewards/thk_ans_format_reward": 1.0, + "step": 2576, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.92709350585938, + "epoch": 8.704890387858347, + "grad_norm": 19.513661619610637, + "kl": 0.47265625, + "learning_rate": 2.744932432432432e-07, + "loss": 0.0005, + "reward": 3.6737890243530273, + "reward_std": 0.03834380768239498, + "rewards/final_reward": 1.7957259894180986, + "rewards/mask_iou_reward": 0.8978629947090493, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6737890243530273, + "rewards/thk_ans_format_reward": 1.0, + "step": 2577, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.6666717529297, + "epoch": 8.708263069139967, + "grad_norm": 13.19141830442237, + "kl": 0.484375, + "learning_rate": 2.742117117117117e-07, + "loss": 0.0005, + "reward": 3.340681791305542, + "reward_std": 0.12751448899507523, + "rewards/final_reward": 1.8338263788167821, + "rewards/mask_iou_reward": 0.9169131894083911, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3406816124916077, + "rewards/thk_ans_format_reward": 1.0, + "step": 2578, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.4166717529297, + "epoch": 8.711635750421586, + "grad_norm": 16.411047595909793, + "kl": 0.4189453125, + "learning_rate": 2.7393018018018014e-07, + "loss": 0.0004, + "reward": 3.5782195329666138, + "reward_std": 0.08094577863812447, + "rewards/final_reward": 1.4291149300643984, + "rewards/mask_iou_reward": 0.7145574650321992, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5782194137573242, + "rewards/thk_ans_format_reward": 1.0, + "step": 2579, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.5625, + "epoch": 8.715008431703204, + "grad_norm": 16.802202255267115, + "kl": 0.583984375, + "learning_rate": 2.736486486486486e-07, + "loss": 0.0006, + "reward": 3.513985753059387, + "reward_std": 0.13777944818139076, + "rewards/final_reward": 1.2669673172997142, + "rewards/mask_iou_reward": 0.6334836586498571, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5139857530593872, + "rewards/thk_ans_format_reward": 1.0, + "step": 2580, + "think_completion_length": 10.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.81250762939453, + "epoch": 8.718381112984822, + "grad_norm": 8.00176037412226, + "kl": 0.4365234375, + "learning_rate": 2.733671171171171e-07, + "loss": 0.0005, + "reward": 3.552944779396057, + "reward_std": 0.09154192451387644, + "rewards/final_reward": 1.4492299638504018, + "rewards/mask_iou_reward": 0.7246149819252009, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5529447793960571, + "rewards/thk_ans_format_reward": 1.0, + "step": 2581, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.6666717529297, + "epoch": 8.721753794266442, + "grad_norm": 10.310261963697636, + "kl": 0.42578125, + "learning_rate": 2.730855855855856e-07, + "loss": 0.0004, + "reward": 3.181455135345459, + "reward_std": 0.08033962082117796, + "rewards/final_reward": 1.543839269099942, + "rewards/mask_iou_reward": 0.771919634549971, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1814552545547485, + "rewards/thk_ans_format_reward": 1.0, + "step": 2582, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.28125, + "epoch": 8.72512647554806, + "grad_norm": 18.977015116224848, + "kl": 0.421875, + "learning_rate": 2.7280405405405404e-07, + "loss": 0.0005, + "reward": 3.833088994026184, + "reward_std": 0.0317679438740015, + "rewards/final_reward": 1.8962071656871986, + "rewards/mask_iou_reward": 0.9481035828435993, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.833088994026184, + "rewards/thk_ans_format_reward": 1.0, + "step": 2583, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.1979217529297, + "epoch": 8.72849915682968, + "grad_norm": 6.04162016708347, + "kl": 0.4169921875, + "learning_rate": 2.725225225225225e-07, + "loss": 0.0004, + "reward": 3.3477823734283447, + "reward_std": 0.08656807988882065, + "rewards/final_reward": 1.2145620779352024, + "rewards/mask_iou_reward": 0.6072810389676012, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3477822542190552, + "rewards/thk_ans_format_reward": 1.0, + "step": 2584, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.5937614440918, + "epoch": 8.7318718381113, + "grad_norm": 59.301908875851915, + "kl": 0.447265625, + "learning_rate": 2.7224099099099096e-07, + "loss": 0.0004, + "reward": 3.1889774799346924, + "reward_std": 0.09784254245460033, + "rewards/final_reward": 1.2470772716566363, + "rewards/mask_iou_reward": 0.6235386358283181, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1889773607254028, + "rewards/thk_ans_format_reward": 1.0, + "step": 2585, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.39583587646484, + "epoch": 8.735244519392918, + "grad_norm": 11.564177217628647, + "kl": 0.427734375, + "learning_rate": 2.719594594594595e-07, + "loss": 0.0004, + "reward": 3.7254035472869873, + "reward_std": 0.08181975595653057, + "rewards/final_reward": 1.7824145137487353, + "rewards/mask_iou_reward": 0.8912072568743676, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7254034876823425, + "rewards/thk_ans_format_reward": 1.0, + "step": 2586, + "think_completion_length": 9.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.1666717529297, + "epoch": 8.738617200674536, + "grad_norm": 9.045044514348094, + "kl": 0.5498046875, + "learning_rate": 2.7167792792792793e-07, + "loss": 0.0006, + "reward": 3.5254725217819214, + "reward_std": 0.04110686667263508, + "rewards/final_reward": 1.1279837635335666, + "rewards/mask_iou_reward": 0.5639918817667833, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.525472342967987, + "rewards/thk_ans_format_reward": 1.0, + "step": 2587, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.6875, + "epoch": 8.741989881956155, + "grad_norm": 11.478877926068497, + "kl": 0.71875, + "learning_rate": 2.713963963963964e-07, + "loss": 0.0007, + "reward": 3.7656898498535156, + "reward_std": 0.08955633267760277, + "rewards/final_reward": 1.6858383612881407, + "rewards/mask_iou_reward": 0.8429191806440703, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7656898498535156, + "rewards/thk_ans_format_reward": 1.0, + "step": 2588, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.18750762939453, + "epoch": 8.745362563237775, + "grad_norm": 16.235873553011174, + "kl": 0.4306640625, + "learning_rate": 2.7111486486486486e-07, + "loss": 0.0004, + "reward": 3.3747421503067017, + "reward_std": 0.027278369292616844, + "rewards/final_reward": 1.9198226083096936, + "rewards/mask_iou_reward": 0.9599113041548468, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3747420907020569, + "rewards/thk_ans_format_reward": 1.0, + "step": 2589, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 128.64583587646484, + "epoch": 8.748735244519393, + "grad_norm": 10.47292339165554, + "kl": 0.677734375, + "learning_rate": 2.708333333333333e-07, + "loss": 0.0007, + "reward": 3.50990092754364, + "reward_std": 0.06432923208922148, + "rewards/final_reward": 0.9000077052200236, + "rewards/mask_iou_reward": 0.4500038526100118, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5099010467529297, + "rewards/thk_ans_format_reward": 1.0, + "step": 2590, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.9791717529297, + "epoch": 8.752107925801011, + "grad_norm": 17.664010232266858, + "kl": 0.408203125, + "learning_rate": 2.7055180180180183e-07, + "loss": 0.0004, + "reward": 3.2131507396698, + "reward_std": 0.12909862026572227, + "rewards/final_reward": 1.7976802139126753, + "rewards/mask_iou_reward": 0.8988401069563376, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2131509184837341, + "rewards/thk_ans_format_reward": 1.0, + "step": 2591, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.11459350585938, + "epoch": 8.75548060708263, + "grad_norm": 9.558168924239965, + "kl": 0.548828125, + "learning_rate": 2.702702702702703e-07, + "loss": 0.0006, + "reward": 3.4076467752456665, + "reward_std": 0.03746108431369066, + "rewards/final_reward": 1.3606280399295545, + "rewards/mask_iou_reward": 0.6803140199647773, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4076467156410217, + "rewards/thk_ans_format_reward": 1.0, + "step": 2592, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.14584350585938, + "epoch": 8.75885328836425, + "grad_norm": 11.049869173104032, + "kl": 0.435546875, + "learning_rate": 2.6998873873873875e-07, + "loss": 0.0004, + "reward": 3.631982922554016, + "reward_std": 0.048909788485616446, + "rewards/final_reward": 1.874138768786148, + "rewards/mask_iou_reward": 0.937069384393074, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6319828629493713, + "rewards/thk_ans_format_reward": 1.0, + "step": 2593, + "think_completion_length": 10.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.05208587646484, + "epoch": 8.762225969645868, + "grad_norm": 15.549417475269248, + "kl": 0.494140625, + "learning_rate": 2.697072072072072e-07, + "loss": 0.0005, + "reward": 3.5601353645324707, + "reward_std": 0.048820956610143185, + "rewards/final_reward": 1.745202148046324, + "rewards/mask_iou_reward": 0.872601074023162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5601353645324707, + "rewards/thk_ans_format_reward": 1.0, + "step": 2594, + "think_completion_length": 9.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.0104217529297, + "epoch": 8.765598650927487, + "grad_norm": 9.690853129284346, + "kl": 0.4345703125, + "learning_rate": 2.694256756756756e-07, + "loss": 0.0004, + "reward": 3.626412868499756, + "reward_std": 0.020583651028573513, + "rewards/final_reward": 1.554970225171132, + "rewards/mask_iou_reward": 0.777485112585566, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6264130473136902, + "rewards/thk_ans_format_reward": 1.0, + "step": 2595, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.79166793823242, + "epoch": 8.768971332209107, + "grad_norm": 9.419266648136379, + "kl": 0.494140625, + "learning_rate": 2.6914414414414413e-07, + "loss": 0.0006, + "reward": 3.422475814819336, + "reward_std": 0.03806304559111595, + "rewards/final_reward": 1.8769513964133184, + "rewards/mask_iou_reward": 0.9384756982066592, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4224757552146912, + "rewards/thk_ans_format_reward": 1.0, + "step": 2596, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.4270896911621, + "epoch": 8.772344013490725, + "grad_norm": 6.032356877197085, + "kl": 0.5927734375, + "learning_rate": 2.688626126126126e-07, + "loss": 0.0006, + "reward": 3.665559768676758, + "reward_std": 0.031542010605335236, + "rewards/final_reward": 1.7459385507383132, + "rewards/mask_iou_reward": 0.8729692753691566, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6655599474906921, + "rewards/thk_ans_format_reward": 1.0, + "step": 2597, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.8541717529297, + "epoch": 8.775716694772344, + "grad_norm": 12.511204960457997, + "kl": 0.4970703125, + "learning_rate": 2.6858108108108105e-07, + "loss": 0.0005, + "reward": 3.4443455934524536, + "reward_std": 0.13716903142631054, + "rewards/final_reward": 1.4805604503427627, + "rewards/mask_iou_reward": 0.7402802251713814, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4547622203826904, + "rewards/thk_ans_format_reward": 1.0, + "step": 2598, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.1875, + "epoch": 8.779089376053962, + "grad_norm": 18.14749880774283, + "kl": 0.5205078125, + "learning_rate": 2.682995495495495e-07, + "loss": 0.0005, + "reward": 3.4348336458206177, + "reward_std": 0.04534833878278732, + "rewards/final_reward": 1.6291405471232325, + "rewards/mask_iou_reward": 0.8145702735616163, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4348336458206177, + "rewards/thk_ans_format_reward": 1.0, + "step": 2599, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.5104217529297, + "epoch": 8.782462057335582, + "grad_norm": 20.271029674111745, + "kl": 0.564453125, + "learning_rate": 2.68018018018018e-07, + "loss": 0.0006, + "reward": 3.7123841047286987, + "reward_std": 0.052057093009352684, + "rewards/final_reward": 1.9473939414920798, + "rewards/mask_iou_reward": 0.9736969707460399, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7123839259147644, + "rewards/thk_ans_format_reward": 1.0, + "step": 2600, + "think_completion_length": 12.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.6979217529297, + "epoch": 8.7858347386172, + "grad_norm": 19.986727941177254, + "kl": 0.5400390625, + "learning_rate": 2.677364864864865e-07, + "loss": 0.0005, + "reward": 3.5862083435058594, + "reward_std": 0.07752817496657372, + "rewards/final_reward": 1.8850161595394952, + "rewards/mask_iou_reward": 0.9425080797697476, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5862082242965698, + "rewards/thk_ans_format_reward": 1.0, + "step": 2601, + "think_completion_length": 10.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.38541793823242, + "epoch": 8.789207419898819, + "grad_norm": 9.11727486811287, + "kl": 0.5859375, + "learning_rate": 2.6745495495495495e-07, + "loss": 0.0006, + "reward": 3.5917123556137085, + "reward_std": 0.028876617550849915, + "rewards/final_reward": 1.0631040108756813, + "rewards/mask_iou_reward": 0.5315520054378406, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.591712474822998, + "rewards/thk_ans_format_reward": 1.0, + "step": 2602, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.75000762939453, + "epoch": 8.79258010118044, + "grad_norm": 11.357046550930207, + "kl": 0.4443359375, + "learning_rate": 2.671734234234234e-07, + "loss": 0.0005, + "reward": 3.6925617456436157, + "reward_std": 0.11417952738702297, + "rewards/final_reward": 1.717848772073134, + "rewards/mask_iou_reward": 0.858924386036567, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6925618648529053, + "rewards/thk_ans_format_reward": 1.0, + "step": 2603, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.2916717529297, + "epoch": 8.795952782462058, + "grad_norm": 10.95869088214582, + "kl": 0.4033203125, + "learning_rate": 2.6689189189189187e-07, + "loss": 0.0004, + "reward": 3.3376840353012085, + "reward_std": 0.1011296734213829, + "rewards/final_reward": 1.6609974239055063, + "rewards/mask_iou_reward": 0.8304987119527532, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3376837968826294, + "rewards/thk_ans_format_reward": 1.0, + "step": 2604, + "think_completion_length": 9.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.6041717529297, + "epoch": 8.799325463743676, + "grad_norm": 23.19978010706122, + "kl": 0.4560546875, + "learning_rate": 2.6661036036036033e-07, + "loss": 0.0005, + "reward": 3.5850476026535034, + "reward_std": 0.040439434349536896, + "rewards/final_reward": 0.8097829479056705, + "rewards/mask_iou_reward": 0.40489147395283526, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5850474834442139, + "rewards/thk_ans_format_reward": 1.0, + "step": 2605, + "think_completion_length": 10.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.08333587646484, + "epoch": 8.802698145025294, + "grad_norm": 41.379014866284514, + "kl": 0.4853515625, + "learning_rate": 2.6632882882882885e-07, + "loss": 0.0005, + "reward": 3.5459182262420654, + "reward_std": 0.05569390393793583, + "rewards/final_reward": 1.0564543807372129, + "rewards/mask_iou_reward": 0.5282271903686064, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5459181070327759, + "rewards/thk_ans_format_reward": 1.0, + "step": 2606, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.6666717529297, + "epoch": 8.806070826306915, + "grad_norm": 8.74549334092815, + "kl": 0.4736328125, + "learning_rate": 2.660472972972973e-07, + "loss": 0.0005, + "reward": 3.487770438194275, + "reward_std": 0.12233811803162098, + "rewards/final_reward": 1.2958860855271568, + "rewards/mask_iou_reward": 0.6479430427635784, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4981870651245117, + "rewards/thk_ans_format_reward": 1.0, + "step": 2607, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.0104217529297, + "epoch": 8.809443507588533, + "grad_norm": 13.26405079753535, + "kl": 0.572265625, + "learning_rate": 2.6576576576576577e-07, + "loss": 0.0006, + "reward": 3.5517358779907227, + "reward_std": 0.04905109805986285, + "rewards/final_reward": 1.2092090290589885, + "rewards/mask_iou_reward": 0.6046045145294943, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5517358183860779, + "rewards/thk_ans_format_reward": 1.0, + "step": 2608, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.78125762939453, + "epoch": 8.812816188870151, + "grad_norm": 20.005619354745726, + "kl": 0.646484375, + "learning_rate": 2.6548423423423423e-07, + "loss": 0.0007, + "reward": 3.5325050354003906, + "reward_std": 0.025053212884813547, + "rewards/final_reward": 1.002610821867861, + "rewards/mask_iou_reward": 0.5013054109339306, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.532505214214325, + "rewards/thk_ans_format_reward": 1.0, + "step": 2609, + "think_completion_length": 9.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.28125762939453, + "epoch": 8.816188870151771, + "grad_norm": 14.765386223784136, + "kl": 0.5078125, + "learning_rate": 2.652027027027027e-07, + "loss": 0.0005, + "reward": 3.8309184312820435, + "reward_std": 0.014153166441246867, + "rewards/final_reward": 1.9386175055923904, + "rewards/mask_iou_reward": 0.9693087527961952, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.830918550491333, + "rewards/thk_ans_format_reward": 1.0, + "step": 2610, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.5104217529297, + "epoch": 8.81956155143339, + "grad_norm": 9.850666141328157, + "kl": 0.58984375, + "learning_rate": 2.6492117117117115e-07, + "loss": 0.0006, + "reward": 3.3679628372192383, + "reward_std": 0.07900388538837433, + "rewards/final_reward": 1.8492453060512983, + "rewards/mask_iou_reward": 0.9246226530256492, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.367962896823883, + "rewards/thk_ans_format_reward": 1.0, + "step": 2611, + "think_completion_length": 10.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.0729217529297, + "epoch": 8.822934232715008, + "grad_norm": 11.286693578515079, + "kl": 0.53125, + "learning_rate": 2.6463963963963966e-07, + "loss": 0.0005, + "reward": 3.205089807510376, + "reward_std": 0.10610627755522728, + "rewards/final_reward": 0.5328642452260304, + "rewards/mask_iou_reward": 0.2664321226130152, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.205089807510376, + "rewards/thk_ans_format_reward": 1.0, + "step": 2612, + "think_completion_length": 10.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.92708587646484, + "epoch": 8.826306913996627, + "grad_norm": 62.562830341723284, + "kl": 0.4521484375, + "learning_rate": 2.643581081081081e-07, + "loss": 0.0005, + "reward": 3.6393673419952393, + "reward_std": 0.08628643676638603, + "rewards/final_reward": 1.7409598350531619, + "rewards/mask_iou_reward": 0.8704799175265809, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6393672823905945, + "rewards/thk_ans_format_reward": 1.0, + "step": 2613, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.30208587646484, + "epoch": 8.829679595278247, + "grad_norm": 9.169418403340412, + "kl": 0.4541015625, + "learning_rate": 2.640765765765766e-07, + "loss": 0.0004, + "reward": 3.658125400543213, + "reward_std": 0.06685709208250046, + "rewards/final_reward": 1.5693793803364242, + "rewards/mask_iou_reward": 0.7846896901682121, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6581252813339233, + "rewards/thk_ans_format_reward": 1.0, + "step": 2614, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.4375, + "epoch": 8.833052276559865, + "grad_norm": 9.424040489571068, + "kl": 0.46484375, + "learning_rate": 2.63795045045045e-07, + "loss": 0.0005, + "reward": 3.3040276765823364, + "reward_std": 0.051268843933939934, + "rewards/final_reward": 1.3129399364091325, + "rewards/mask_iou_reward": 0.6564699682045663, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3040276169776917, + "rewards/thk_ans_format_reward": 1.0, + "step": 2615, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.2604217529297, + "epoch": 8.836424957841484, + "grad_norm": 11.799474042076646, + "kl": 0.453125, + "learning_rate": 2.6351351351351345e-07, + "loss": 0.0005, + "reward": 3.416573643684387, + "reward_std": 0.07485876977443695, + "rewards/final_reward": 1.8482420419840249, + "rewards/mask_iou_reward": 0.9241210209920124, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4165735244750977, + "rewards/thk_ans_format_reward": 1.0, + "step": 2616, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.64583587646484, + "epoch": 8.839797639123104, + "grad_norm": 7.646928271668927, + "kl": 0.5078125, + "learning_rate": 2.6323198198198197e-07, + "loss": 0.0005, + "reward": 3.5402209758758545, + "reward_std": 0.05744621530175209, + "rewards/final_reward": 1.6883144009827764, + "rewards/mask_iou_reward": 0.8441572004913882, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5402206778526306, + "rewards/thk_ans_format_reward": 1.0, + "step": 2617, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.36458587646484, + "epoch": 8.843170320404722, + "grad_norm": 10.752846546517851, + "kl": 0.458984375, + "learning_rate": 2.6295045045045043e-07, + "loss": 0.0005, + "reward": 3.4455671310424805, + "reward_std": 0.07160164043307304, + "rewards/final_reward": 1.8307324658180666, + "rewards/mask_iou_reward": 0.9153662329090333, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4455673694610596, + "rewards/thk_ans_format_reward": 1.0, + "step": 2618, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.92708587646484, + "epoch": 8.84654300168634, + "grad_norm": 23.347348826498127, + "kl": 0.5146484375, + "learning_rate": 2.626689189189189e-07, + "loss": 0.0005, + "reward": 3.562696099281311, + "reward_std": 0.031371730379760265, + "rewards/final_reward": 1.3731017182197023, + "rewards/mask_iou_reward": 0.6865508591098511, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5626959800720215, + "rewards/thk_ans_format_reward": 1.0, + "step": 2619, + "think_completion_length": 10.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.78125762939453, + "epoch": 8.849915682967959, + "grad_norm": 12.860734234006927, + "kl": 0.4619140625, + "learning_rate": 2.6238738738738735e-07, + "loss": 0.0005, + "reward": 3.329040288925171, + "reward_std": 0.17197439819574356, + "rewards/final_reward": 1.7695295108402456, + "rewards/mask_iou_reward": 0.8847647554201228, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.370707094669342, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2620, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.98958587646484, + "epoch": 8.853288364249579, + "grad_norm": 15.301909395105529, + "kl": 0.5302734375, + "learning_rate": 2.621058558558558e-07, + "loss": 0.0006, + "reward": 3.573215365409851, + "reward_std": 0.04483833443373442, + "rewards/final_reward": 1.4895611136624436, + "rewards/mask_iou_reward": 0.7447805568312218, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.573215365409851, + "rewards/thk_ans_format_reward": 1.0, + "step": 2621, + "think_completion_length": 9.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.46875, + "epoch": 8.856661045531197, + "grad_norm": 50.53942012992708, + "kl": 0.439453125, + "learning_rate": 2.618243243243243e-07, + "loss": 0.0005, + "reward": 3.6168044805526733, + "reward_std": 0.12859731912612915, + "rewards/final_reward": 1.6656976467758042, + "rewards/mask_iou_reward": 0.8328488233879021, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6168044209480286, + "rewards/thk_ans_format_reward": 1.0, + "step": 2622, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.21875762939453, + "epoch": 8.860033726812816, + "grad_norm": 11.774345122272566, + "kl": 0.412109375, + "learning_rate": 2.615427927927928e-07, + "loss": 0.0004, + "reward": 3.6533669233322144, + "reward_std": 0.12147049978375435, + "rewards/final_reward": 1.5555595096657289, + "rewards/mask_iou_reward": 0.7777797548328644, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6637837290763855, + "rewards/thk_ans_format_reward": 1.0, + "step": 2623, + "think_completion_length": 9.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.34376525878906, + "epoch": 8.863406408094434, + "grad_norm": 6.525971220926675, + "kl": 0.421875, + "learning_rate": 2.6126126126126124e-07, + "loss": 0.0004, + "reward": 3.5324405431747437, + "reward_std": 0.020154454745352268, + "rewards/final_reward": 1.9465013247374108, + "rewards/mask_iou_reward": 0.9732506623687054, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5324404835700989, + "rewards/thk_ans_format_reward": 1.0, + "step": 2624, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.87501525878906, + "epoch": 8.866779089376054, + "grad_norm": 81.03369462301879, + "kl": 0.443359375, + "learning_rate": 2.609797297297297e-07, + "loss": 0.0004, + "reward": 3.6387441158294678, + "reward_std": 0.03701675124466419, + "rewards/final_reward": 1.4862304581123538, + "rewards/mask_iou_reward": 0.7431152290561769, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6387439966201782, + "rewards/thk_ans_format_reward": 1.0, + "step": 2625, + "think_completion_length": 9.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.125, + "epoch": 8.870151770657673, + "grad_norm": 30.87060923711784, + "kl": 0.4853515625, + "learning_rate": 2.6069819819819817e-07, + "loss": 0.0005, + "reward": 3.176121711730957, + "reward_std": 0.1424817405641079, + "rewards/final_reward": 1.209586082100811, + "rewards/mask_iou_reward": 0.6047930410504055, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1761216521263123, + "rewards/thk_ans_format_reward": 1.0, + "step": 2626, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.1875, + "epoch": 8.873524451939291, + "grad_norm": 14.645248060791376, + "kl": 0.48046875, + "learning_rate": 2.604166666666667e-07, + "loss": 0.0005, + "reward": 3.415264129638672, + "reward_std": 0.1616402491927147, + "rewards/final_reward": 1.1429228559742883, + "rewards/mask_iou_reward": 0.5714614279871442, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4152640104293823, + "rewards/thk_ans_format_reward": 1.0, + "step": 2627, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.03125762939453, + "epoch": 8.876897133220911, + "grad_norm": 12.615776582831762, + "kl": 0.3984375, + "learning_rate": 2.6013513513513514e-07, + "loss": 0.0004, + "reward": 3.315216302871704, + "reward_std": 0.08645356260240078, + "rewards/final_reward": 1.722401614164057, + "rewards/mask_iou_reward": 0.8612008070820285, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3152162730693817, + "rewards/thk_ans_format_reward": 1.0, + "step": 2628, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.0104217529297, + "epoch": 8.88026981450253, + "grad_norm": 21.727976468632153, + "kl": 0.408203125, + "learning_rate": 2.598536036036036e-07, + "loss": 0.0004, + "reward": 3.459446907043457, + "reward_std": 0.14448082819581032, + "rewards/final_reward": 1.3135695645172558, + "rewards/mask_iou_reward": 0.6567847822586279, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4594467282295227, + "rewards/thk_ans_format_reward": 1.0, + "step": 2629, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.09375762939453, + "epoch": 8.883642495784148, + "grad_norm": 7.832814400376301, + "kl": 0.4609375, + "learning_rate": 2.5957207207207206e-07, + "loss": 0.0005, + "reward": 3.6723581552505493, + "reward_std": 0.11408434621989727, + "rewards/final_reward": 1.5644726703356446, + "rewards/mask_iou_reward": 0.7822363351678223, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6723580956459045, + "rewards/thk_ans_format_reward": 1.0, + "step": 2630, + "think_completion_length": 11.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.05208587646484, + "epoch": 8.887015177065766, + "grad_norm": 32.03645850756957, + "kl": 0.4375, + "learning_rate": 2.592905405405405e-07, + "loss": 0.0004, + "reward": 3.4289125204086304, + "reward_std": 0.037827394902706146, + "rewards/final_reward": 1.3664140757626264, + "rewards/mask_iou_reward": 0.6832070378813132, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4289124608039856, + "rewards/thk_ans_format_reward": 1.0, + "step": 2631, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.02083587646484, + "epoch": 8.890387858347387, + "grad_norm": 17.042048133003938, + "kl": 0.4990234375, + "learning_rate": 2.5900900900900904e-07, + "loss": 0.0005, + "reward": 3.583601951599121, + "reward_std": 0.04651731997728348, + "rewards/final_reward": 1.789611237700948, + "rewards/mask_iou_reward": 0.894805618850474, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5836020708084106, + "rewards/thk_ans_format_reward": 1.0, + "step": 2632, + "think_completion_length": 12.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.8125, + "epoch": 8.893760539629005, + "grad_norm": 39.833372050013864, + "kl": 0.505859375, + "learning_rate": 2.587274774774775e-07, + "loss": 0.0005, + "reward": 3.650436520576477, + "reward_std": 0.03200624976307154, + "rewards/final_reward": 1.7966636620129028, + "rewards/mask_iou_reward": 0.8983318310064514, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6504364609718323, + "rewards/thk_ans_format_reward": 1.0, + "step": 2633, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.7083435058594, + "epoch": 8.897133220910623, + "grad_norm": 29.622911554214088, + "kl": 0.3349609375, + "learning_rate": 2.5844594594594596e-07, + "loss": 0.0003, + "reward": 3.3017475605010986, + "reward_std": 0.18274864368140697, + "rewards/final_reward": 0.8420971193307066, + "rewards/mask_iou_reward": 0.4210485596653533, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3225809335708618, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2634, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.46875762939453, + "epoch": 8.900505902192243, + "grad_norm": 8.892742647552673, + "kl": 0.4501953125, + "learning_rate": 2.5816441441441436e-07, + "loss": 0.0005, + "reward": 3.360956907272339, + "reward_std": 0.03940213192254305, + "rewards/final_reward": 0.9547359003275036, + "rewards/mask_iou_reward": 0.4773679501637518, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3609567880630493, + "rewards/thk_ans_format_reward": 1.0, + "step": 2635, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.80208587646484, + "epoch": 8.903878583473862, + "grad_norm": 171.80196189316987, + "kl": 0.46484375, + "learning_rate": 2.578828828828828e-07, + "loss": 0.0005, + "reward": 3.7178579568862915, + "reward_std": 0.04392486624419689, + "rewards/final_reward": 1.557018958881521, + "rewards/mask_iou_reward": 0.7785094794407605, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.717857837677002, + "rewards/thk_ans_format_reward": 1.0, + "step": 2636, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.4166717529297, + "epoch": 8.90725126475548, + "grad_norm": 39.6809223643521, + "kl": 0.50390625, + "learning_rate": 2.5760135135135134e-07, + "loss": 0.0005, + "reward": 3.664846181869507, + "reward_std": 0.08971784822642803, + "rewards/final_reward": 1.628289576339221, + "rewards/mask_iou_reward": 0.8141447881696104, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6648462414741516, + "rewards/thk_ans_format_reward": 1.0, + "step": 2637, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.98959350585938, + "epoch": 8.910623946037099, + "grad_norm": 14.746750635196374, + "kl": 0.40234375, + "learning_rate": 2.573198198198198e-07, + "loss": 0.0004, + "reward": 3.678009271621704, + "reward_std": 0.04709428362548351, + "rewards/final_reward": 1.849764863148052, + "rewards/mask_iou_reward": 0.924882431574026, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6780093908309937, + "rewards/thk_ans_format_reward": 1.0, + "step": 2638, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.875, + "epoch": 8.913996627318719, + "grad_norm": 51.01927196117225, + "kl": 0.4228515625, + "learning_rate": 2.5703828828828826e-07, + "loss": 0.0004, + "reward": 3.4034098386764526, + "reward_std": 0.07623914256691933, + "rewards/final_reward": 0.8302799876257294, + "rewards/mask_iou_reward": 0.4151399938128647, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4034096598625183, + "rewards/thk_ans_format_reward": 1.0, + "step": 2639, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.39583587646484, + "epoch": 8.917369308600337, + "grad_norm": 12.773784866652818, + "kl": 0.431640625, + "learning_rate": 2.567567567567567e-07, + "loss": 0.0004, + "reward": 3.4415433406829834, + "reward_std": 0.11167657189071178, + "rewards/final_reward": 1.6203662508946173, + "rewards/mask_iou_reward": 0.8101831254473086, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4415434002876282, + "rewards/thk_ans_format_reward": 1.0, + "step": 2640, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.375, + "epoch": 8.920741989881956, + "grad_norm": 8.17540569049189, + "kl": 0.458984375, + "learning_rate": 2.564752252252252e-07, + "loss": 0.0005, + "reward": 3.6591590642929077, + "reward_std": 0.08187056519091129, + "rewards/final_reward": 1.334430933250397, + "rewards/mask_iou_reward": 0.6672154666251985, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6591590642929077, + "rewards/thk_ans_format_reward": 1.0, + "step": 2641, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.28125381469727, + "epoch": 8.924114671163576, + "grad_norm": 11.630567244110326, + "kl": 0.560546875, + "learning_rate": 2.561936936936937e-07, + "loss": 0.0006, + "reward": 3.5551464557647705, + "reward_std": 0.1242928933352232, + "rewards/final_reward": 1.1889093049377917, + "rewards/mask_iou_reward": 0.5944546524688958, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5551465153694153, + "rewards/thk_ans_format_reward": 1.0, + "step": 2642, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.92708587646484, + "epoch": 8.927487352445194, + "grad_norm": 8.779738218183537, + "kl": 0.4775390625, + "learning_rate": 2.5591216216216216e-07, + "loss": 0.0005, + "reward": 3.487833023071289, + "reward_std": 0.08055975451134145, + "rewards/final_reward": 1.1506063369972375, + "rewards/mask_iou_reward": 0.5753031684986187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4878326654434204, + "rewards/thk_ans_format_reward": 1.0, + "step": 2643, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.96875, + "epoch": 8.930860033726812, + "grad_norm": 6.416920565803001, + "kl": 0.4951171875, + "learning_rate": 2.556306306306306e-07, + "loss": 0.0005, + "reward": 3.6678950786590576, + "reward_std": 0.034986887127161026, + "rewards/final_reward": 1.799880776864869, + "rewards/mask_iou_reward": 0.8999403884324345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6678951978683472, + "rewards/thk_ans_format_reward": 1.0, + "step": 2644, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.52083587646484, + "epoch": 8.93423271500843, + "grad_norm": 8.99742168656809, + "kl": 0.501953125, + "learning_rate": 2.553490990990991e-07, + "loss": 0.0005, + "reward": 3.372067928314209, + "reward_std": 0.06782113015651703, + "rewards/final_reward": 1.8697549752479614, + "rewards/mask_iou_reward": 0.9348774876239807, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3720678091049194, + "rewards/thk_ans_format_reward": 1.0, + "step": 2645, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.1145935058594, + "epoch": 8.937605396290051, + "grad_norm": 13.757136143314941, + "kl": 0.380859375, + "learning_rate": 2.5506756756756754e-07, + "loss": 0.0004, + "reward": 3.5446051359176636, + "reward_std": 0.1319795325398445, + "rewards/final_reward": 1.3958622800108376, + "rewards/mask_iou_reward": 0.6979311400054188, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5654385089874268, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2646, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.8854217529297, + "epoch": 8.94097807757167, + "grad_norm": 9.816895266921808, + "kl": 0.4501953125, + "learning_rate": 2.5478603603603605e-07, + "loss": 0.0005, + "reward": 3.4566650390625, + "reward_std": 0.08659421931952238, + "rewards/final_reward": 1.2735341116940226, + "rewards/mask_iou_reward": 0.6367670558470113, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4566651582717896, + "rewards/thk_ans_format_reward": 1.0, + "step": 2647, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.6666717529297, + "epoch": 8.944350758853288, + "grad_norm": 15.86401428521161, + "kl": 0.4638671875, + "learning_rate": 2.545045045045045e-07, + "loss": 0.0005, + "reward": 3.6637405157089233, + "reward_std": 0.04727690666913986, + "rewards/final_reward": 1.897336213430683, + "rewards/mask_iou_reward": 0.9486681067153415, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6637406945228577, + "rewards/thk_ans_format_reward": 1.0, + "step": 2648, + "think_completion_length": 11.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.55208587646484, + "epoch": 8.947723440134908, + "grad_norm": 19.73551194433044, + "kl": 0.3935546875, + "learning_rate": 2.5422297297297297e-07, + "loss": 0.0004, + "reward": 3.512809634208679, + "reward_std": 0.06839705258607864, + "rewards/final_reward": 1.4805996022986838, + "rewards/mask_iou_reward": 0.7402998011493419, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5128096342086792, + "rewards/thk_ans_format_reward": 1.0, + "step": 2649, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.02084350585938, + "epoch": 8.951096121416526, + "grad_norm": 28.12710538047436, + "kl": 0.4130859375, + "learning_rate": 2.5394144144144143e-07, + "loss": 0.0004, + "reward": 3.279626727104187, + "reward_std": 0.05651633441448212, + "rewards/final_reward": 1.340548502695853, + "rewards/mask_iou_reward": 0.6702742513479265, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2796268165111542, + "rewards/thk_ans_format_reward": 1.0, + "step": 2650, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.40625762939453, + "epoch": 8.954468802698145, + "grad_norm": 12.8049783585612, + "kl": 0.48828125, + "learning_rate": 2.536599099099099e-07, + "loss": 0.0005, + "reward": 3.8202362060546875, + "reward_std": 0.01339608570560813, + "rewards/final_reward": 1.8661134566366775, + "rewards/mask_iou_reward": 0.9330567283183387, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8202361464500427, + "rewards/thk_ans_format_reward": 1.0, + "step": 2651, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.72916793823242, + "epoch": 8.957841483979763, + "grad_norm": 7.0945906774410625, + "kl": 0.4033203125, + "learning_rate": 2.533783783783784e-07, + "loss": 0.0004, + "reward": 3.421821355819702, + "reward_std": 0.039436303079128265, + "rewards/final_reward": 1.819093730521853, + "rewards/mask_iou_reward": 0.9095468652609265, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4218213558197021, + "rewards/thk_ans_format_reward": 1.0, + "step": 2652, + "think_completion_length": 10.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.3229217529297, + "epoch": 8.961214165261383, + "grad_norm": 27.860175075426934, + "kl": 0.521484375, + "learning_rate": 2.5309684684684687e-07, + "loss": 0.0005, + "reward": 3.437433958053589, + "reward_std": 0.12433646619319916, + "rewards/final_reward": 1.7747834075878353, + "rewards/mask_iou_reward": 0.8873917037939176, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4374338388442993, + "rewards/thk_ans_format_reward": 1.0, + "step": 2653, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.01041793823242, + "epoch": 8.964586846543002, + "grad_norm": 6.465993260335525, + "kl": 0.4853515625, + "learning_rate": 2.5281531531531533e-07, + "loss": 0.0005, + "reward": 3.613997220993042, + "reward_std": 0.024749555392190814, + "rewards/final_reward": 1.8598908330075345, + "rewards/mask_iou_reward": 0.9299454165037673, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6139971613883972, + "rewards/thk_ans_format_reward": 1.0, + "step": 2654, + "think_completion_length": 10.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.5, + "epoch": 8.96795952782462, + "grad_norm": 169.99969242460082, + "kl": 0.384765625, + "learning_rate": 2.5253378378378374e-07, + "loss": 0.0004, + "reward": 3.538175344467163, + "reward_std": 0.07858727127313614, + "rewards/final_reward": 1.645226200115244, + "rewards/mask_iou_reward": 0.822613100057622, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5381752848625183, + "rewards/thk_ans_format_reward": 1.0, + "step": 2655, + "think_completion_length": 10.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.46875, + "epoch": 8.97133220910624, + "grad_norm": 22.292604120261398, + "kl": 0.58984375, + "learning_rate": 2.522522522522522e-07, + "loss": 0.0006, + "reward": 3.0568559169769287, + "reward_std": 0.1006831880658865, + "rewards/final_reward": 0.5641649094732629, + "rewards/mask_iou_reward": 0.28208245473663146, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0568559765815735, + "rewards/thk_ans_format_reward": 1.0, + "step": 2656, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.17708587646484, + "epoch": 8.974704890387859, + "grad_norm": 6.934613809314335, + "kl": 0.478515625, + "learning_rate": 2.519707207207207e-07, + "loss": 0.0005, + "reward": 3.728635311126709, + "reward_std": 0.02214963547885418, + "rewards/final_reward": 1.6594442697257372, + "rewards/mask_iou_reward": 0.8297221348628686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7286354899406433, + "rewards/thk_ans_format_reward": 1.0, + "step": 2657, + "think_completion_length": 10.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.92708587646484, + "epoch": 8.978077571669477, + "grad_norm": 10.436256111361867, + "kl": 0.509765625, + "learning_rate": 2.5168918918918917e-07, + "loss": 0.0005, + "reward": 3.538660407066345, + "reward_std": 0.029477974399924278, + "rewards/final_reward": 1.1480477415799157, + "rewards/mask_iou_reward": 0.5740238707899579, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5386605262756348, + "rewards/thk_ans_format_reward": 1.0, + "step": 2658, + "think_completion_length": 9.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.14583587646484, + "epoch": 8.981450252951095, + "grad_norm": 13.813894213545238, + "kl": 0.4111328125, + "learning_rate": 2.5140765765765763e-07, + "loss": 0.0004, + "reward": 3.539559245109558, + "reward_std": 0.03265107958577573, + "rewards/final_reward": 1.871948467193768, + "rewards/mask_iou_reward": 0.935974233596884, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.539559245109558, + "rewards/thk_ans_format_reward": 1.0, + "step": 2659, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.12500381469727, + "epoch": 8.984822934232715, + "grad_norm": 14.094723839060045, + "kl": 0.505859375, + "learning_rate": 2.511261261261261e-07, + "loss": 0.0005, + "reward": 3.61898672580719, + "reward_std": 0.029271011240780354, + "rewards/final_reward": 1.828193769950737, + "rewards/mask_iou_reward": 0.9140968849753685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6189867854118347, + "rewards/thk_ans_format_reward": 1.0, + "step": 2660, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.1666717529297, + "epoch": 8.988195615514334, + "grad_norm": 14.438567754631755, + "kl": 0.4326171875, + "learning_rate": 2.5084459459459455e-07, + "loss": 0.0005, + "reward": 3.5245808362960815, + "reward_std": 0.03417748771607876, + "rewards/final_reward": 1.5107258270611736, + "rewards/mask_iou_reward": 0.7553629135305868, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5245808959007263, + "rewards/thk_ans_format_reward": 1.0, + "step": 2661, + "think_completion_length": 10.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.33333587646484, + "epoch": 8.991568296795952, + "grad_norm": 11.380236907302288, + "kl": 0.6640625, + "learning_rate": 2.5056306306306307e-07, + "loss": 0.0007, + "reward": 3.6742966175079346, + "reward_std": 0.034709298983216286, + "rewards/final_reward": 1.6202422930984484, + "rewards/mask_iou_reward": 0.8101211465492242, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.674296498298645, + "rewards/thk_ans_format_reward": 1.0, + "step": 2662, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.36458587646484, + "epoch": 8.994940978077572, + "grad_norm": 45.308936467943454, + "kl": 0.4951171875, + "learning_rate": 2.5028153153153153e-07, + "loss": 0.0005, + "reward": 3.7181190252304077, + "reward_std": 0.016878115944564342, + "rewards/final_reward": 1.4298181162040127, + "rewards/mask_iou_reward": 0.7149090581020063, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.718118965625763, + "rewards/thk_ans_format_reward": 1.0, + "step": 2663, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.2631607055664, + "epoch": 8.99831365935919, + "grad_norm": 9.649917036875612, + "kl": 0.4326171875, + "learning_rate": 2.5e-07, + "loss": 0.0004, + "reward": 3.3768553733825684, + "reward_std": 0.05902155674993992, + "rewards/final_reward": 1.4501204854085197, + "rewards/mask_iou_reward": 0.7250602427042598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.376855492591858, + "rewards/thk_ans_format_reward": 1.0, + "step": 2664, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.55208587646484, + "epoch": 9.003372681281618, + "grad_norm": 15.152185059708602, + "kl": 0.51171875, + "learning_rate": 2.4971846846846845e-07, + "loss": 0.0005, + "reward": 3.7161524295806885, + "reward_std": 0.052857328206300735, + "rewards/final_reward": 1.3831503284858946, + "rewards/mask_iou_reward": 0.6915751642429473, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.716152310371399, + "rewards/thk_ans_format_reward": 1.0, + "step": 2665, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.61458587646484, + "epoch": 9.006745362563239, + "grad_norm": 71.87257427370919, + "kl": 0.5576171875, + "learning_rate": 2.494369369369369e-07, + "loss": 0.0006, + "reward": 3.7588504552841187, + "reward_std": 0.03195140324532986, + "rewards/final_reward": 1.950061648307034, + "rewards/mask_iou_reward": 0.975030824153517, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7588502168655396, + "rewards/thk_ans_format_reward": 1.0, + "step": 2666, + "think_completion_length": 9.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.125, + "epoch": 9.010118043844857, + "grad_norm": 25.326411709571683, + "kl": 0.603515625, + "learning_rate": 2.4915540540540537e-07, + "loss": 0.0006, + "reward": 3.7052032947540283, + "reward_std": 0.045940013602375984, + "rewards/final_reward": 1.6143038150577902, + "rewards/mask_iou_reward": 0.8071519075288951, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7052034139633179, + "rewards/thk_ans_format_reward": 1.0, + "step": 2667, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.20834350585938, + "epoch": 9.013490725126475, + "grad_norm": 7.283528282342515, + "kl": 0.4326171875, + "learning_rate": 2.488738738738739e-07, + "loss": 0.0005, + "reward": 3.5239779949188232, + "reward_std": 0.06010612426325679, + "rewards/final_reward": 1.7654436370875735, + "rewards/mask_iou_reward": 0.8827218185437867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5239779353141785, + "rewards/thk_ans_format_reward": 1.0, + "step": 2668, + "think_completion_length": 9.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.67708587646484, + "epoch": 9.016863406408094, + "grad_norm": 16.417099390110156, + "kl": 0.4033203125, + "learning_rate": 2.4859234234234234e-07, + "loss": 0.0004, + "reward": 3.594439744949341, + "reward_std": 0.0543990321457386, + "rewards/final_reward": 1.7274174146954215, + "rewards/mask_iou_reward": 0.8637087073477108, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5944397449493408, + "rewards/thk_ans_format_reward": 1.0, + "step": 2669, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.43750762939453, + "epoch": 9.020236087689714, + "grad_norm": 8.903204844746314, + "kl": 0.5078125, + "learning_rate": 2.483108108108108e-07, + "loss": 0.0005, + "reward": 3.4996542930603027, + "reward_std": 0.06193845346570015, + "rewards/final_reward": 1.6113397853745253, + "rewards/mask_iou_reward": 0.8056698926872626, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4996538162231445, + "rewards/thk_ans_format_reward": 1.0, + "step": 2670, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.9166717529297, + "epoch": 9.023608768971332, + "grad_norm": 9.246171153011565, + "kl": 0.513671875, + "learning_rate": 2.4802927927927927e-07, + "loss": 0.0006, + "reward": 3.5686081647872925, + "reward_std": 0.12947729974985123, + "rewards/final_reward": 1.722881013585082, + "rewards/mask_iou_reward": 0.861440506792541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5686081647872925, + "rewards/thk_ans_format_reward": 1.0, + "step": 2671, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.6666717529297, + "epoch": 9.02698145025295, + "grad_norm": 20.437319408883287, + "kl": 0.4892578125, + "learning_rate": 2.4774774774774773e-07, + "loss": 0.0005, + "reward": 3.53087317943573, + "reward_std": 0.06894206255674362, + "rewards/final_reward": 1.8161740667128283, + "rewards/mask_iou_reward": 0.9080870333564142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5308731198310852, + "rewards/thk_ans_format_reward": 1.0, + "step": 2672, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.3854217529297, + "epoch": 9.03035413153457, + "grad_norm": 15.565045926865892, + "kl": 0.5185546875, + "learning_rate": 2.4746621621621624e-07, + "loss": 0.0005, + "reward": 3.667048692703247, + "reward_std": 0.07611064240336418, + "rewards/final_reward": 1.3353405029124257, + "rewards/mask_iou_reward": 0.6676702514562128, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6670488119125366, + "rewards/thk_ans_format_reward": 1.0, + "step": 2673, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.4791717529297, + "epoch": 9.03372681281619, + "grad_norm": 7.936379232416905, + "kl": 0.4296875, + "learning_rate": 2.4718468468468465e-07, + "loss": 0.0004, + "reward": 3.2925609350204468, + "reward_std": 0.036365545354783535, + "rewards/final_reward": 1.525952263897908, + "rewards/mask_iou_reward": 0.762976131948954, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2925609350204468, + "rewards/thk_ans_format_reward": 1.0, + "step": 2674, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.09375762939453, + "epoch": 9.037099494097808, + "grad_norm": 11.974096155908322, + "kl": 0.4921875, + "learning_rate": 2.469031531531531e-07, + "loss": 0.0005, + "reward": 3.6752796173095703, + "reward_std": 0.009360826574265957, + "rewards/final_reward": 1.400129051099373, + "rewards/mask_iou_reward": 0.7000645255496865, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6752796173095703, + "rewards/thk_ans_format_reward": 1.0, + "step": 2675, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.42709350585938, + "epoch": 9.040472175379426, + "grad_norm": 7.324533013799514, + "kl": 0.58984375, + "learning_rate": 2.466216216216216e-07, + "loss": 0.0006, + "reward": 3.5461130142211914, + "reward_std": 0.10013717226684093, + "rewards/final_reward": 1.9402012319072974, + "rewards/mask_iou_reward": 0.9701006159536487, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5461129546165466, + "rewards/thk_ans_format_reward": 1.0, + "step": 2676, + "think_completion_length": 11.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.53125762939453, + "epoch": 9.043844856661046, + "grad_norm": 19.556279213220137, + "kl": 0.4609375, + "learning_rate": 2.463400900900901e-07, + "loss": 0.0005, + "reward": 3.4152863025665283, + "reward_std": 0.1509530497714877, + "rewards/final_reward": 1.1074901545950122, + "rewards/mask_iou_reward": 0.5537450772975061, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.415286362171173, + "rewards/thk_ans_format_reward": 1.0, + "step": 2677, + "think_completion_length": 10.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.3125, + "epoch": 9.047217537942664, + "grad_norm": 9.370784542991062, + "kl": 0.5458984375, + "learning_rate": 2.4605855855855854e-07, + "loss": 0.0006, + "reward": 3.72617244720459, + "reward_std": 0.04560376284644008, + "rewards/final_reward": 1.8367343242937455, + "rewards/mask_iou_reward": 0.9183671621468728, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.726172387599945, + "rewards/thk_ans_format_reward": 1.0, + "step": 2678, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.33333587646484, + "epoch": 9.050590219224283, + "grad_norm": 20.410965872762333, + "kl": 0.564453125, + "learning_rate": 2.45777027027027e-07, + "loss": 0.0006, + "reward": 3.73172664642334, + "reward_std": 0.05516933067701757, + "rewards/final_reward": 1.829631024851735, + "rewards/mask_iou_reward": 0.9148155124258674, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7317265272140503, + "rewards/thk_ans_format_reward": 1.0, + "step": 2679, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.8541717529297, + "epoch": 9.053962900505903, + "grad_norm": 10.325320257959437, + "kl": 0.431640625, + "learning_rate": 2.4549549549549547e-07, + "loss": 0.0004, + "reward": 3.636067748069763, + "reward_std": 0.08627158403396606, + "rewards/final_reward": 1.8044409887507031, + "rewards/mask_iou_reward": 0.9022204943753516, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6360676288604736, + "rewards/thk_ans_format_reward": 1.0, + "step": 2680, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.1041717529297, + "epoch": 9.057335581787521, + "grad_norm": 10.811211779463441, + "kl": 0.494140625, + "learning_rate": 2.45213963963964e-07, + "loss": 0.0005, + "reward": 3.682226538658142, + "reward_std": 0.13119321130216122, + "rewards/final_reward": 1.6477921250399699, + "rewards/mask_iou_reward": 0.8238960625199849, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6822264790534973, + "rewards/thk_ans_format_reward": 1.0, + "step": 2681, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.5416717529297, + "epoch": 9.06070826306914, + "grad_norm": 7.052167029313177, + "kl": 0.5390625, + "learning_rate": 2.4493243243243244e-07, + "loss": 0.0005, + "reward": 3.679692506790161, + "reward_std": 0.04925611428916454, + "rewards/final_reward": 1.473480284634789, + "rewards/mask_iou_reward": 0.7367401423173945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6796923875808716, + "rewards/thk_ans_format_reward": 1.0, + "step": 2682, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.0729217529297, + "epoch": 9.064080944350758, + "grad_norm": 7.20948865847875, + "kl": 0.53515625, + "learning_rate": 2.446509009009009e-07, + "loss": 0.0006, + "reward": 3.765621781349182, + "reward_std": 0.11311442777514458, + "rewards/final_reward": 1.5743881457581685, + "rewards/mask_iou_reward": 0.7871940728790843, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7656217813491821, + "rewards/thk_ans_format_reward": 1.0, + "step": 2683, + "think_completion_length": 10.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.04166793823242, + "epoch": 9.067453625632378, + "grad_norm": 9.033882101019975, + "kl": 0.45703125, + "learning_rate": 2.4436936936936936e-07, + "loss": 0.0005, + "reward": 3.5185396671295166, + "reward_std": 0.03499617241322994, + "rewards/final_reward": 1.715690100780282, + "rewards/mask_iou_reward": 0.857845050390141, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5185397267341614, + "rewards/thk_ans_format_reward": 1.0, + "step": 2684, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.9479217529297, + "epoch": 9.070826306913997, + "grad_norm": 42.283447515692345, + "kl": 0.4150390625, + "learning_rate": 2.440878378378378e-07, + "loss": 0.0004, + "reward": 3.500747799873352, + "reward_std": 0.07099348679184914, + "rewards/final_reward": 1.4050785705392468, + "rewards/mask_iou_reward": 0.7025392852696234, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.500747799873352, + "rewards/thk_ans_format_reward": 1.0, + "step": 2685, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.71875762939453, + "epoch": 9.074198988195615, + "grad_norm": 64.2349849457948, + "kl": 0.4560546875, + "learning_rate": 2.438063063063063e-07, + "loss": 0.0005, + "reward": 3.7512688636779785, + "reward_std": 0.02270980691537261, + "rewards/final_reward": 1.7508754565417008, + "rewards/mask_iou_reward": 0.8754377282708504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.751268982887268, + "rewards/thk_ans_format_reward": 1.0, + "step": 2686, + "think_completion_length": 10.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.1041717529297, + "epoch": 9.077571669477235, + "grad_norm": 8.261737297277202, + "kl": 0.4921875, + "learning_rate": 2.4352477477477474e-07, + "loss": 0.0005, + "reward": 3.7740758657455444, + "reward_std": 0.02103525586426258, + "rewards/final_reward": 1.7791438643069668, + "rewards/mask_iou_reward": 0.8895719321534834, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7740757465362549, + "rewards/thk_ans_format_reward": 1.0, + "step": 2687, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.21875, + "epoch": 9.080944350758854, + "grad_norm": 12.090658556796027, + "kl": 0.44921875, + "learning_rate": 2.4324324324324326e-07, + "loss": 0.0005, + "reward": 3.363537311553955, + "reward_std": 0.01269416231662035, + "rewards/final_reward": 1.3129628747279014, + "rewards/mask_iou_reward": 0.6564814373639507, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3635371327400208, + "rewards/thk_ans_format_reward": 1.0, + "step": 2688, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.82291793823242, + "epoch": 9.084317032040472, + "grad_norm": 30.84198680259513, + "kl": 0.8203125, + "learning_rate": 2.429617117117117e-07, + "loss": 0.0008, + "reward": 3.442896008491516, + "reward_std": 0.030627870932221413, + "rewards/final_reward": 1.6634693374207838, + "rewards/mask_iou_reward": 0.8317346687103919, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4428958892822266, + "rewards/thk_ans_format_reward": 1.0, + "step": 2689, + "think_completion_length": 10.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.45833587646484, + "epoch": 9.08768971332209, + "grad_norm": 11.753779857493912, + "kl": 0.4951171875, + "learning_rate": 2.426801801801802e-07, + "loss": 0.0005, + "reward": 3.4357932806015015, + "reward_std": 0.3079192712903023, + "rewards/final_reward": 1.3933123160743839, + "rewards/mask_iou_reward": 0.6966561580371919, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.4670435190200806, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2690, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.38541793823242, + "epoch": 9.09106239460371, + "grad_norm": 28.493010119573313, + "kl": 0.4873046875, + "learning_rate": 2.4239864864864864e-07, + "loss": 0.0005, + "reward": 3.7012124061584473, + "reward_std": 0.047339873388409615, + "rewards/final_reward": 1.6889713608322674, + "rewards/mask_iou_reward": 0.8444856804161337, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7012121677398682, + "rewards/thk_ans_format_reward": 1.0, + "step": 2691, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.875, + "epoch": 9.094435075885329, + "grad_norm": 21.79511338790455, + "kl": 0.8408203125, + "learning_rate": 2.421171171171171e-07, + "loss": 0.0009, + "reward": 3.6198463439941406, + "reward_std": 0.12590062618255615, + "rewards/final_reward": 1.6057972566895269, + "rewards/mask_iou_reward": 0.8028986283447634, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.619846224784851, + "rewards/thk_ans_format_reward": 1.0, + "step": 2692, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.83333587646484, + "epoch": 9.097807757166947, + "grad_norm": 37.82955624535845, + "kl": 0.412109375, + "learning_rate": 2.4183558558558556e-07, + "loss": 0.0004, + "reward": 3.8154425621032715, + "reward_std": 0.03506853384897113, + "rewards/final_reward": 1.8068316281833143, + "rewards/mask_iou_reward": 0.9034158140916572, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8154423236846924, + "rewards/thk_ans_format_reward": 1.0, + "step": 2693, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.59375762939453, + "epoch": 9.101180438448566, + "grad_norm": 16.82302390912551, + "kl": 0.3876953125, + "learning_rate": 2.41554054054054e-07, + "loss": 0.0004, + "reward": 3.6368253231048584, + "reward_std": 0.06186164543032646, + "rewards/final_reward": 1.6361911395760615, + "rewards/mask_iou_reward": 0.8180955697880308, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6368253231048584, + "rewards/thk_ans_format_reward": 1.0, + "step": 2694, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.37500762939453, + "epoch": 9.104553119730186, + "grad_norm": 15.457808180787062, + "kl": 0.490234375, + "learning_rate": 2.412725225225225e-07, + "loss": 0.0005, + "reward": 3.486648440361023, + "reward_std": 0.03985132835805416, + "rewards/final_reward": 1.8332984350297488, + "rewards/mask_iou_reward": 0.9166492175148744, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4866483211517334, + "rewards/thk_ans_format_reward": 1.0, + "step": 2695, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.48958587646484, + "epoch": 9.107925801011804, + "grad_norm": 21.822536342612135, + "kl": 0.6455078125, + "learning_rate": 2.40990990990991e-07, + "loss": 0.0006, + "reward": 3.6761653423309326, + "reward_std": 0.08975771814584732, + "rewards/final_reward": 1.7592890844121998, + "rewards/mask_iou_reward": 0.8796445422060999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6761654019355774, + "rewards/thk_ans_format_reward": 1.0, + "step": 2696, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.09375762939453, + "epoch": 9.111298482293423, + "grad_norm": 337.1635694325612, + "kl": 0.3984375, + "learning_rate": 2.4070945945945946e-07, + "loss": 0.0004, + "reward": 3.569099545478821, + "reward_std": 0.08558660000562668, + "rewards/final_reward": 1.368445699274106, + "rewards/mask_iou_reward": 0.684222849637053, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5690995454788208, + "rewards/thk_ans_format_reward": 1.0, + "step": 2697, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.36458587646484, + "epoch": 9.114671163575043, + "grad_norm": 86.0876614096406, + "kl": 0.4296875, + "learning_rate": 2.404279279279279e-07, + "loss": 0.0005, + "reward": 3.6935207843780518, + "reward_std": 0.016915190033614635, + "rewards/final_reward": 1.8676412256394792, + "rewards/mask_iou_reward": 0.9338206128197396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6935204863548279, + "rewards/thk_ans_format_reward": 1.0, + "step": 2698, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.23959350585938, + "epoch": 9.118043844856661, + "grad_norm": 6.3744183320808085, + "kl": 0.662109375, + "learning_rate": 2.401463963963964e-07, + "loss": 0.0007, + "reward": 3.4611141681671143, + "reward_std": 0.052461449056863785, + "rewards/final_reward": 0.9666298071768408, + "rewards/mask_iou_reward": 0.4833149035884204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4611140489578247, + "rewards/thk_ans_format_reward": 1.0, + "step": 2699, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.3854217529297, + "epoch": 9.12141652613828, + "grad_norm": 15.110711837565969, + "kl": 0.498046875, + "learning_rate": 2.3986486486486484e-07, + "loss": 0.0005, + "reward": 3.5770293474197388, + "reward_std": 0.06012692954391241, + "rewards/final_reward": 1.2116396444247322, + "rewards/mask_iou_reward": 0.6058198222123661, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5770291090011597, + "rewards/thk_ans_format_reward": 1.0, + "step": 2700, + "think_completion_length": 9.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.28126525878906, + "epoch": 9.124789207419898, + "grad_norm": 12.57204091524218, + "kl": 0.396484375, + "learning_rate": 2.3958333333333335e-07, + "loss": 0.0004, + "reward": 3.559865355491638, + "reward_std": 0.21300777792930603, + "rewards/final_reward": 1.4534144906746593, + "rewards/mask_iou_reward": 0.7267072453373297, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.6015319228172302, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2701, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.19791793823242, + "epoch": 9.128161888701518, + "grad_norm": 15.569747051910305, + "kl": 0.5263671875, + "learning_rate": 2.393018018018018e-07, + "loss": 0.0005, + "reward": 3.6546788215637207, + "reward_std": 0.02495476300828159, + "rewards/final_reward": 1.3387666501724387, + "rewards/mask_iou_reward": 0.6693833250862193, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6546787023544312, + "rewards/thk_ans_format_reward": 1.0, + "step": 2702, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.12500762939453, + "epoch": 9.131534569983137, + "grad_norm": 9.758601611244647, + "kl": 0.4072265625, + "learning_rate": 2.3902027027027027e-07, + "loss": 0.0003, + "reward": 3.5538190603256226, + "reward_std": 0.03838097210973501, + "rewards/final_reward": 1.7076908719734, + "rewards/mask_iou_reward": 0.8538454359867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5538190007209778, + "rewards/thk_ans_format_reward": 1.0, + "step": 2703, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.8854217529297, + "epoch": 9.134907251264755, + "grad_norm": 12.736825560495458, + "kl": 0.470703125, + "learning_rate": 2.3873873873873873e-07, + "loss": 0.0005, + "reward": 3.607846260070801, + "reward_std": 0.016801190562546253, + "rewards/final_reward": 1.2867632234165858, + "rewards/mask_iou_reward": 0.6433816117082929, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6078462600708008, + "rewards/thk_ans_format_reward": 1.0, + "step": 2704, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.15625762939453, + "epoch": 9.138279932546375, + "grad_norm": 21.990337483598392, + "kl": 0.4541015625, + "learning_rate": 2.384572072072072e-07, + "loss": 0.0005, + "reward": 3.675834536552429, + "reward_std": 0.04092971049249172, + "rewards/final_reward": 1.7989294183946072, + "rewards/mask_iou_reward": 0.8994647091973036, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.675834596157074, + "rewards/thk_ans_format_reward": 1.0, + "step": 2705, + "think_completion_length": 7.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.65625, + "epoch": 9.141652613827993, + "grad_norm": 10.407018219762056, + "kl": 0.5693359375, + "learning_rate": 2.3817567567567568e-07, + "loss": 0.0006, + "reward": 3.3383116722106934, + "reward_std": 0.05316999740898609, + "rewards/final_reward": 1.4855893540566543, + "rewards/mask_iou_reward": 0.7427946770283271, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3383113145828247, + "rewards/thk_ans_format_reward": 1.0, + "step": 2706, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.1666717529297, + "epoch": 9.145025295109612, + "grad_norm": 8.801464639082292, + "kl": 0.46875, + "learning_rate": 2.3789414414414414e-07, + "loss": 0.0005, + "reward": 3.641343593597412, + "reward_std": 0.05162101425230503, + "rewards/final_reward": 1.8753772703918348, + "rewards/mask_iou_reward": 0.9376886351959174, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6413435339927673, + "rewards/thk_ans_format_reward": 1.0, + "step": 2707, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.90625762939453, + "epoch": 9.14839797639123, + "grad_norm": 9.035870813243875, + "kl": 0.4296875, + "learning_rate": 2.376126126126126e-07, + "loss": 0.0005, + "reward": 3.6754767894744873, + "reward_std": 0.030587462708353996, + "rewards/final_reward": 1.6339339723553117, + "rewards/mask_iou_reward": 0.8169669861776558, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6754766702651978, + "rewards/thk_ans_format_reward": 1.0, + "step": 2708, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.1666717529297, + "epoch": 9.15177065767285, + "grad_norm": 13.765975270879892, + "kl": 0.421875, + "learning_rate": 2.3733108108108106e-07, + "loss": 0.0004, + "reward": 3.6789125204086304, + "reward_std": 0.12016797810792923, + "rewards/final_reward": 1.8026906448987035, + "rewards/mask_iou_reward": 0.9013453224493517, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6789124011993408, + "rewards/thk_ans_format_reward": 1.0, + "step": 2709, + "think_completion_length": 6.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.6354217529297, + "epoch": 9.155143338954469, + "grad_norm": 21.10436404174611, + "kl": 0.4052734375, + "learning_rate": 2.3704954954954952e-07, + "loss": 0.0004, + "reward": 3.5580179691314697, + "reward_std": 0.047667574137449265, + "rewards/final_reward": 1.312607811481132, + "rewards/mask_iou_reward": 0.656303905740566, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5580180287361145, + "rewards/thk_ans_format_reward": 1.0, + "step": 2710, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.4166717529297, + "epoch": 9.158516020236087, + "grad_norm": 33.08955034272978, + "kl": 0.4453125, + "learning_rate": 2.36768018018018e-07, + "loss": 0.0004, + "reward": 3.8308794498443604, + "reward_std": 0.021719856187701225, + "rewards/final_reward": 1.7712656060535998, + "rewards/mask_iou_reward": 0.8856328030267999, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.830879271030426, + "rewards/thk_ans_format_reward": 1.0, + "step": 2711, + "think_completion_length": 7.166666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.89583587646484, + "epoch": 9.161888701517707, + "grad_norm": 16.238099343860256, + "kl": 0.4501953125, + "learning_rate": 2.3648648648648647e-07, + "loss": 0.0005, + "reward": 3.6413317918777466, + "reward_std": 0.15415740525349975, + "rewards/final_reward": 1.3824449875031644, + "rewards/mask_iou_reward": 0.6912224937515822, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6413318514823914, + "rewards/thk_ans_format_reward": 1.0, + "step": 2712, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.89583587646484, + "epoch": 9.165261382799326, + "grad_norm": 26.11702177124519, + "kl": 0.4384765625, + "learning_rate": 2.3620495495495493e-07, + "loss": 0.0004, + "reward": 3.1481897830963135, + "reward_std": 0.10512983053922653, + "rewards/final_reward": 1.535971815659202, + "rewards/mask_iou_reward": 0.767985907829601, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1481897830963135, + "rewards/thk_ans_format_reward": 1.0, + "step": 2713, + "think_completion_length": 11.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.0729217529297, + "epoch": 9.168634064080944, + "grad_norm": 10.88067258384534, + "kl": 0.3583984375, + "learning_rate": 2.3592342342342342e-07, + "loss": 0.0004, + "reward": 3.620645046234131, + "reward_std": 0.17188102006912231, + "rewards/final_reward": 1.874649497316282, + "rewards/mask_iou_reward": 0.937324748658141, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6414784789085388, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2714, + "think_completion_length": 7.458333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.30208587646484, + "epoch": 9.172006745362562, + "grad_norm": 13.439799881812803, + "kl": 0.509765625, + "learning_rate": 2.3564189189189188e-07, + "loss": 0.0005, + "reward": 3.6212422847747803, + "reward_std": 0.03326452663168311, + "rewards/final_reward": 1.6828357434864154, + "rewards/mask_iou_reward": 0.8414178717432077, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6212422847747803, + "rewards/thk_ans_format_reward": 1.0, + "step": 2715, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.4166717529297, + "epoch": 9.175379426644183, + "grad_norm": 38.14283135240575, + "kl": 1.384765625, + "learning_rate": 2.3536036036036037e-07, + "loss": 0.0015, + "reward": 3.5311455726623535, + "reward_std": 0.08386744372546673, + "rewards/final_reward": 1.1419197146860056, + "rewards/mask_iou_reward": 0.5709598573430028, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5311453342437744, + "rewards/thk_ans_format_reward": 1.0, + "step": 2716, + "think_completion_length": 10.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.98959350585938, + "epoch": 9.178752107925801, + "grad_norm": 7.467749101139426, + "kl": 0.4287109375, + "learning_rate": 2.3507882882882883e-07, + "loss": 0.0004, + "reward": 3.3673723936080933, + "reward_std": 0.03370837680995464, + "rewards/final_reward": 1.1762894471475038, + "rewards/mask_iou_reward": 0.5881447235737519, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3673723638057709, + "rewards/thk_ans_format_reward": 1.0, + "step": 2717, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.9791717529297, + "epoch": 9.18212478920742, + "grad_norm": 21.04843141904536, + "kl": 0.400390625, + "learning_rate": 2.347972972972973e-07, + "loss": 0.0004, + "reward": 3.6892040967941284, + "reward_std": 0.07573777623474598, + "rewards/final_reward": 1.8205007861710938, + "rewards/mask_iou_reward": 0.9102503930855469, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.689204216003418, + "rewards/thk_ans_format_reward": 1.0, + "step": 2718, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.125, + "epoch": 9.18549747048904, + "grad_norm": 17.602023385378942, + "kl": 0.55078125, + "learning_rate": 2.3451576576576575e-07, + "loss": 0.0006, + "reward": 3.3853163719177246, + "reward_std": 0.05290138069540262, + "rewards/final_reward": 1.1025753701321557, + "rewards/mask_iou_reward": 0.5512876850660778, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3853165805339813, + "rewards/thk_ans_format_reward": 1.0, + "step": 2719, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.625, + "epoch": 9.188870151770658, + "grad_norm": 10.662517906773052, + "kl": 0.5, + "learning_rate": 2.342342342342342e-07, + "loss": 0.0005, + "reward": 3.4125794172286987, + "reward_std": 0.07688865810632706, + "rewards/final_reward": 1.5585264025322876, + "rewards/mask_iou_reward": 0.7792632012661438, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4125792384147644, + "rewards/thk_ans_format_reward": 1.0, + "step": 2720, + "think_completion_length": 11.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.7604217529297, + "epoch": 9.192242833052276, + "grad_norm": 12.407990721986765, + "kl": 0.4931640625, + "learning_rate": 2.339527027027027e-07, + "loss": 0.0005, + "reward": 3.6630319356918335, + "reward_std": 0.07096875412389636, + "rewards/final_reward": 1.2780035781827974, + "rewards/mask_iou_reward": 0.6390017890913987, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6630319356918335, + "rewards/thk_ans_format_reward": 1.0, + "step": 2721, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.33333587646484, + "epoch": 9.195615514333895, + "grad_norm": 8.666970300646721, + "kl": 0.47265625, + "learning_rate": 2.3367117117117116e-07, + "loss": 0.0005, + "reward": 3.353834390640259, + "reward_std": 0.07080530747771263, + "rewards/final_reward": 1.5095522708688724, + "rewards/mask_iou_reward": 0.7547761354344362, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3538345694541931, + "rewards/thk_ans_format_reward": 1.0, + "step": 2722, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.01042175292969, + "epoch": 9.198988195615515, + "grad_norm": 13.157124933134252, + "kl": 0.51953125, + "learning_rate": 2.3338963963963962e-07, + "loss": 0.0005, + "reward": 3.3819308280944824, + "reward_std": 0.08063776372000575, + "rewards/final_reward": 1.5869294039563557, + "rewards/mask_iou_reward": 0.7934647019781779, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3819307088851929, + "rewards/thk_ans_format_reward": 1.0, + "step": 2723, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.48958587646484, + "epoch": 9.202360876897133, + "grad_norm": 13.680351131666008, + "kl": 0.478515625, + "learning_rate": 2.331081081081081e-07, + "loss": 0.0005, + "reward": 3.3788503408432007, + "reward_std": 0.06524365022778511, + "rewards/final_reward": 1.132651409256614, + "rewards/mask_iou_reward": 0.566325704628307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3788503110408783, + "rewards/thk_ans_format_reward": 1.0, + "step": 2724, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.92708587646484, + "epoch": 9.205733558178752, + "grad_norm": 113.04888297972742, + "kl": 1.263671875, + "learning_rate": 2.3282657657657657e-07, + "loss": 0.0013, + "reward": 3.409723162651062, + "reward_std": 0.2719406746327877, + "rewards/final_reward": 1.6586036546563951, + "rewards/mask_iou_reward": 0.8293018273281976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4097230434417725, + "rewards/thk_ans_format_reward": 1.0, + "step": 2725, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.46875762939453, + "epoch": 9.209106239460372, + "grad_norm": 6.657850170510586, + "kl": 0.74609375, + "learning_rate": 2.3254504504504505e-07, + "loss": 0.0008, + "reward": 3.5601396560668945, + "reward_std": 0.05424804985523224, + "rewards/final_reward": 1.0, + "rewards/mask_iou_reward": 0.5, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5601398944854736, + "rewards/thk_ans_format_reward": 1.0, + "step": 2726, + "think_completion_length": 9.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.0, + "epoch": 9.21247892074199, + "grad_norm": 13.024736310820327, + "kl": 0.4609375, + "learning_rate": 2.3226351351351351e-07, + "loss": 0.0004, + "reward": 3.525766372680664, + "reward_std": 0.078833919018507, + "rewards/final_reward": 1.0751320455824074, + "rewards/mask_iou_reward": 0.5375660227912037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5257662534713745, + "rewards/thk_ans_format_reward": 1.0, + "step": 2727, + "think_completion_length": 10.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.8229217529297, + "epoch": 9.215851602023609, + "grad_norm": 10.84058202337327, + "kl": 0.5947265625, + "learning_rate": 2.3198198198198195e-07, + "loss": 0.0006, + "reward": 3.80234432220459, + "reward_std": 0.049020628444850445, + "rewards/final_reward": 1.6783597664770171, + "rewards/mask_iou_reward": 0.8391798832385086, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8023445010185242, + "rewards/thk_ans_format_reward": 1.0, + "step": 2728, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.8125, + "epoch": 9.219224283305227, + "grad_norm": 15.706268381170005, + "kl": 0.4677734375, + "learning_rate": 2.3170045045045044e-07, + "loss": 0.0005, + "reward": 3.2954295873641968, + "reward_std": 0.10786097124218941, + "rewards/final_reward": 1.4206121526546707, + "rewards/mask_iou_reward": 0.7103060763273353, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.295429527759552, + "rewards/thk_ans_format_reward": 1.0, + "step": 2729, + "think_completion_length": 10.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.1875114440918, + "epoch": 9.222596964586847, + "grad_norm": 16.073526537458733, + "kl": 0.564453125, + "learning_rate": 2.314189189189189e-07, + "loss": 0.0006, + "reward": 3.780893921852112, + "reward_std": 0.08602484688162804, + "rewards/final_reward": 1.6251333088983986, + "rewards/mask_iou_reward": 0.8125666544491993, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7808939814567566, + "rewards/thk_ans_format_reward": 1.0, + "step": 2730, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.45833587646484, + "epoch": 9.225969645868465, + "grad_norm": 8.961890235000222, + "kl": 0.583984375, + "learning_rate": 2.3113738738738738e-07, + "loss": 0.0006, + "reward": 3.4740850925445557, + "reward_std": 0.06211280822753906, + "rewards/final_reward": 1.6200055105987259, + "rewards/mask_iou_reward": 0.8100027552993629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4740851521492004, + "rewards/thk_ans_format_reward": 1.0, + "step": 2731, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.07291793823242, + "epoch": 9.229342327150084, + "grad_norm": 11.245455267084438, + "kl": 0.50390625, + "learning_rate": 2.3085585585585584e-07, + "loss": 0.0005, + "reward": 3.4567354917526245, + "reward_std": 0.02357149589806795, + "rewards/final_reward": 1.7550627231620988, + "rewards/mask_iou_reward": 0.8775313615810494, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.456735610961914, + "rewards/thk_ans_format_reward": 1.0, + "step": 2732, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.6666717529297, + "epoch": 9.232715008431704, + "grad_norm": 47.63244965917098, + "kl": 0.412109375, + "learning_rate": 2.305743243243243e-07, + "loss": 0.0004, + "reward": 3.6387423276901245, + "reward_std": 0.0591295319609344, + "rewards/final_reward": 1.6464311243950158, + "rewards/mask_iou_reward": 0.8232155621975079, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6387420296669006, + "rewards/thk_ans_format_reward": 1.0, + "step": 2733, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.59375, + "epoch": 9.236087689713322, + "grad_norm": 9.854372652070786, + "kl": 0.4228515625, + "learning_rate": 2.302927927927928e-07, + "loss": 0.0004, + "reward": 3.661292791366577, + "reward_std": 0.06245612911880016, + "rewards/final_reward": 1.718917708254288, + "rewards/mask_iou_reward": 0.859458854127144, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6612926721572876, + "rewards/thk_ans_format_reward": 1.0, + "step": 2734, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.3125, + "epoch": 9.23946037099494, + "grad_norm": 9.775384666157215, + "kl": 0.49609375, + "learning_rate": 2.3001126126126125e-07, + "loss": 0.0005, + "reward": 3.6032204627990723, + "reward_std": 0.07593077456112951, + "rewards/final_reward": 1.0478658552790578, + "rewards/mask_iou_reward": 0.5239329276395289, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6032204627990723, + "rewards/thk_ans_format_reward": 1.0, + "step": 2735, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.12500762939453, + "epoch": 9.24283305227656, + "grad_norm": 9.93721364648544, + "kl": 0.3740234375, + "learning_rate": 2.2972972972972974e-07, + "loss": 0.0004, + "reward": 3.5557074546813965, + "reward_std": 0.05211344361305237, + "rewards/final_reward": 1.4102751917452139, + "rewards/mask_iou_reward": 0.7051375958726069, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.555707573890686, + "rewards/thk_ans_format_reward": 1.0, + "step": 2736, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.85416793823242, + "epoch": 9.24620573355818, + "grad_norm": 16.21290070522786, + "kl": 0.669921875, + "learning_rate": 2.294481981981982e-07, + "loss": 0.0007, + "reward": 3.4519814252853394, + "reward_std": 0.07387526426464319, + "rewards/final_reward": 1.8517275535386748, + "rewards/mask_iou_reward": 0.9258637767693374, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4519813656806946, + "rewards/thk_ans_format_reward": 1.0, + "step": 2737, + "think_completion_length": 9.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.8229217529297, + "epoch": 9.249578414839798, + "grad_norm": 17.709552106732847, + "kl": 0.5380859375, + "learning_rate": 2.2916666666666663e-07, + "loss": 0.0006, + "reward": 3.5484334230422974, + "reward_std": 0.02219875669106841, + "rewards/final_reward": 1.6846610932227493, + "rewards/mask_iou_reward": 0.8423305466113746, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5484334230422974, + "rewards/thk_ans_format_reward": 1.0, + "step": 2738, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.62500762939453, + "epoch": 9.252951096121416, + "grad_norm": 51.63817221274276, + "kl": 0.4375, + "learning_rate": 2.2888513513513512e-07, + "loss": 0.0004, + "reward": 3.5409849882125854, + "reward_std": 0.050369157921522856, + "rewards/final_reward": 1.7093165381275603, + "rewards/mask_iou_reward": 0.8546582690637802, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5409849286079407, + "rewards/thk_ans_format_reward": 1.0, + "step": 2739, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.48959350585938, + "epoch": 9.256323777403036, + "grad_norm": 44.18713123663508, + "kl": 0.451171875, + "learning_rate": 2.2860360360360358e-07, + "loss": 0.0005, + "reward": 3.589063048362732, + "reward_std": 0.07845005393028259, + "rewards/final_reward": 1.8372715492254943, + "rewards/mask_iou_reward": 0.9186357746127471, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5890628695487976, + "rewards/thk_ans_format_reward": 1.0, + "step": 2740, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.25, + "epoch": 9.259696458684655, + "grad_norm": 11.948237225096927, + "kl": 0.5185546875, + "learning_rate": 2.2832207207207207e-07, + "loss": 0.0005, + "reward": 3.4584085941314697, + "reward_std": 0.042058190330863, + "rewards/final_reward": 1.5690370451956515, + "rewards/mask_iou_reward": 0.7845185225978257, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4584084153175354, + "rewards/thk_ans_format_reward": 1.0, + "step": 2741, + "think_completion_length": 11.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.45833587646484, + "epoch": 9.263069139966273, + "grad_norm": 7.820089401290567, + "kl": 0.49609375, + "learning_rate": 2.2804054054054053e-07, + "loss": 0.0005, + "reward": 3.644398331642151, + "reward_std": 0.02735021524131298, + "rewards/final_reward": 1.2831713487682677, + "rewards/mask_iou_reward": 0.6415856743841338, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6443983912467957, + "rewards/thk_ans_format_reward": 1.0, + "step": 2742, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.0729217529297, + "epoch": 9.266441821247891, + "grad_norm": 11.92523700648404, + "kl": 0.44921875, + "learning_rate": 2.27759009009009e-07, + "loss": 0.0004, + "reward": 3.547249913215637, + "reward_std": 0.08835725113749504, + "rewards/final_reward": 0.8956250313878287, + "rewards/mask_iou_reward": 0.44781251569391434, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5472498536109924, + "rewards/thk_ans_format_reward": 1.0, + "step": 2743, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.80208587646484, + "epoch": 9.269814502529512, + "grad_norm": 17.071450563721925, + "kl": 0.5263671875, + "learning_rate": 2.2747747747747748e-07, + "loss": 0.0005, + "reward": 3.250998616218567, + "reward_std": 0.10396800190210342, + "rewards/final_reward": 1.5332373470798437, + "rewards/mask_iou_reward": 0.7666186735399219, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.250998616218567, + "rewards/thk_ans_format_reward": 1.0, + "step": 2744, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.5104217529297, + "epoch": 9.27318718381113, + "grad_norm": 17.36090561387625, + "kl": 0.4482421875, + "learning_rate": 2.2719594594594594e-07, + "loss": 0.0005, + "reward": 3.337352991104126, + "reward_std": 0.14889592677354813, + "rewards/final_reward": 1.15980387890831, + "rewards/mask_iou_reward": 0.579901939454155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3373527526855469, + "rewards/thk_ans_format_reward": 1.0, + "step": 2745, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.02083587646484, + "epoch": 9.276559865092748, + "grad_norm": 76.89275415294323, + "kl": 0.45703125, + "learning_rate": 2.2691441441441443e-07, + "loss": 0.0005, + "reward": 3.601949453353882, + "reward_std": 0.18171671777963638, + "rewards/final_reward": 1.4770694682232217, + "rewards/mask_iou_reward": 0.7385347341116109, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6019494533538818, + "rewards/thk_ans_format_reward": 1.0, + "step": 2746, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.33333587646484, + "epoch": 9.279932546374368, + "grad_norm": 7.111772882118619, + "kl": 0.4208984375, + "learning_rate": 2.2663288288288289e-07, + "loss": 0.0005, + "reward": 3.4016687870025635, + "reward_std": 0.014272671192884445, + "rewards/final_reward": 1.836040758464693, + "rewards/mask_iou_reward": 0.9180203792323465, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4016689360141754, + "rewards/thk_ans_format_reward": 1.0, + "step": 2747, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.61458587646484, + "epoch": 9.283305227655987, + "grad_norm": 16.122805060508597, + "kl": 0.45703125, + "learning_rate": 2.2635135135135132e-07, + "loss": 0.0005, + "reward": 3.6596243381500244, + "reward_std": 0.11477926932275295, + "rewards/final_reward": 1.8422299682834904, + "rewards/mask_iou_reward": 0.9211149841417452, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6596242785453796, + "rewards/thk_ans_format_reward": 1.0, + "step": 2748, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.90625762939453, + "epoch": 9.286677908937605, + "grad_norm": 11.169039142915206, + "kl": 0.5107421875, + "learning_rate": 2.260698198198198e-07, + "loss": 0.0005, + "reward": 3.5329428911209106, + "reward_std": 0.05546556948684156, + "rewards/final_reward": 1.790726196815894, + "rewards/mask_iou_reward": 0.895363098407947, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5329428911209106, + "rewards/thk_ans_format_reward": 1.0, + "step": 2749, + "think_completion_length": 11.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.0416717529297, + "epoch": 9.290050590219224, + "grad_norm": 12.772723235514924, + "kl": 1.099609375, + "learning_rate": 2.2578828828828827e-07, + "loss": 0.0011, + "reward": 3.4059635400772095, + "reward_std": 0.1593819446861744, + "rewards/final_reward": 1.0051670460292021, + "rewards/mask_iou_reward": 0.5025835230146011, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.426796793937683, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2750, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.76042938232422, + "epoch": 9.293423271500844, + "grad_norm": 7.808991961017135, + "kl": 0.423828125, + "learning_rate": 2.2550675675675673e-07, + "loss": 0.0004, + "reward": 3.221948981285095, + "reward_std": 0.03692587744444609, + "rewards/final_reward": 1.4581314656441513, + "rewards/mask_iou_reward": 0.7290657328220757, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2219487726688385, + "rewards/thk_ans_format_reward": 1.0, + "step": 2751, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.23959350585938, + "epoch": 9.296795952782462, + "grad_norm": 15.10335540799134, + "kl": 0.4306640625, + "learning_rate": 2.2522522522522522e-07, + "loss": 0.0004, + "reward": 3.3186607360839844, + "reward_std": 0.06642237678170204, + "rewards/final_reward": 0.3598344031115008, + "rewards/mask_iou_reward": 0.1799172015557504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3186606764793396, + "rewards/thk_ans_format_reward": 1.0, + "step": 2752, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.6666717529297, + "epoch": 9.30016863406408, + "grad_norm": 38.570647147712435, + "kl": 0.4482421875, + "learning_rate": 2.2494369369369368e-07, + "loss": 0.0005, + "reward": 3.66131055355072, + "reward_std": 0.03528841398656368, + "rewards/final_reward": 1.517077571028143, + "rewards/mask_iou_reward": 0.7585387855140715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6613103747367859, + "rewards/thk_ans_format_reward": 1.0, + "step": 2753, + "think_completion_length": 12.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.54166793823242, + "epoch": 9.3035413153457, + "grad_norm": 24.013114970469317, + "kl": 0.478515625, + "learning_rate": 2.2466216216216216e-07, + "loss": 0.0005, + "reward": 3.2862175703048706, + "reward_std": 0.11558353528380394, + "rewards/final_reward": 0.9459682073556034, + "rewards/mask_iou_reward": 0.4729841036778017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2862175107002258, + "rewards/thk_ans_format_reward": 1.0, + "step": 2754, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.78125762939453, + "epoch": 9.306913996627319, + "grad_norm": 7.600629988537983, + "kl": 0.5322265625, + "learning_rate": 2.2438063063063062e-07, + "loss": 0.0005, + "reward": 3.2242766618728638, + "reward_std": 0.03124239854514599, + "rewards/final_reward": 1.2069206340078744, + "rewards/mask_iou_reward": 0.6034603170039372, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.224276602268219, + "rewards/thk_ans_format_reward": 1.0, + "step": 2755, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.72916793823242, + "epoch": 9.310286677908937, + "grad_norm": 14.613051056094477, + "kl": 0.50390625, + "learning_rate": 2.2409909909909909e-07, + "loss": 0.0005, + "reward": 3.601539373397827, + "reward_std": 0.0633353553712368, + "rewards/final_reward": 1.6655733024676027, + "rewards/mask_iou_reward": 0.8327866512338014, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.601539433002472, + "rewards/thk_ans_format_reward": 1.0, + "step": 2756, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.5104217529297, + "epoch": 9.313659359190556, + "grad_norm": 11.469524143937166, + "kl": 0.408203125, + "learning_rate": 2.2381756756756757e-07, + "loss": 0.0004, + "reward": 3.648510694503784, + "reward_std": 0.024022470228374004, + "rewards/final_reward": 1.8617895600621224, + "rewards/mask_iou_reward": 0.9308947800310612, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6485106945037842, + "rewards/thk_ans_format_reward": 1.0, + "step": 2757, + "think_completion_length": 9.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.59375, + "epoch": 9.317032040472176, + "grad_norm": 12.709447015602692, + "kl": 0.474609375, + "learning_rate": 2.23536036036036e-07, + "loss": 0.0005, + "reward": 3.4784998893737793, + "reward_std": 0.1642679050564766, + "rewards/final_reward": 1.806289599758458, + "rewards/mask_iou_reward": 0.903144799879229, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4784998893737793, + "rewards/thk_ans_format_reward": 1.0, + "step": 2758, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.6979217529297, + "epoch": 9.320404721753794, + "grad_norm": 12.901017739216863, + "kl": 0.4248046875, + "learning_rate": 2.232545045045045e-07, + "loss": 0.0004, + "reward": 3.3456382751464844, + "reward_std": 0.17587218433618546, + "rewards/final_reward": 1.1098193639560703, + "rewards/mask_iou_reward": 0.5549096819780351, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3456381559371948, + "rewards/thk_ans_format_reward": 1.0, + "step": 2759, + "think_completion_length": 12.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.09375381469727, + "epoch": 9.323777403035413, + "grad_norm": 9.370012619189097, + "kl": 0.4833984375, + "learning_rate": 2.2297297297297295e-07, + "loss": 0.0005, + "reward": 3.3864450454711914, + "reward_std": 0.07419527135789394, + "rewards/final_reward": 1.264501399628911, + "rewards/mask_iou_reward": 0.6322506998144555, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3864449858665466, + "rewards/thk_ans_format_reward": 1.0, + "step": 2760, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.3020935058594, + "epoch": 9.327150084317031, + "grad_norm": 13.10499968912817, + "kl": 0.4248046875, + "learning_rate": 2.2269144144144141e-07, + "loss": 0.0004, + "reward": 3.5883008241653442, + "reward_std": 0.2940108925104141, + "rewards/final_reward": 1.4497035366533044, + "rewards/mask_iou_reward": 0.7248517683266522, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.671634018421173, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 2761, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.1875, + "epoch": 9.330522765598651, + "grad_norm": 34.535691879703776, + "kl": 0.41015625, + "learning_rate": 2.224099099099099e-07, + "loss": 0.0004, + "reward": 3.440687894821167, + "reward_std": 0.026611979119479656, + "rewards/final_reward": 1.379231231386878, + "rewards/mask_iou_reward": 0.689615615693439, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4406880140304565, + "rewards/thk_ans_format_reward": 1.0, + "step": 2762, + "think_completion_length": 10.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.7916717529297, + "epoch": 9.33389544688027, + "grad_norm": 10.049491198654804, + "kl": 0.4208984375, + "learning_rate": 2.2212837837837836e-07, + "loss": 0.0004, + "reward": 3.5684189796447754, + "reward_std": 0.08044159226119518, + "rewards/final_reward": 1.3640238839079284, + "rewards/mask_iou_reward": 0.6820119419539642, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.568418800830841, + "rewards/thk_ans_format_reward": 1.0, + "step": 2763, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.70833587646484, + "epoch": 9.337268128161888, + "grad_norm": 45.7431214634519, + "kl": 0.7724609375, + "learning_rate": 2.2184684684684685e-07, + "loss": 0.0008, + "reward": 3.304787039756775, + "reward_std": 0.056283093988895416, + "rewards/final_reward": 1.3249543151857697, + "rewards/mask_iou_reward": 0.6624771575928848, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3047870993614197, + "rewards/thk_ans_format_reward": 1.0, + "step": 2764, + "think_completion_length": 10.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.2083396911621, + "epoch": 9.340640809443508, + "grad_norm": 16.427846702225523, + "kl": 0.453125, + "learning_rate": 2.215653153153153e-07, + "loss": 0.0005, + "reward": 3.6365526914596558, + "reward_std": 0.04116538679227233, + "rewards/final_reward": 1.459079772140508, + "rewards/mask_iou_reward": 0.729539886070254, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6365526914596558, + "rewards/thk_ans_format_reward": 1.0, + "step": 2765, + "think_completion_length": 10.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.75000762939453, + "epoch": 9.344013490725127, + "grad_norm": 30.64628568079336, + "kl": 0.619140625, + "learning_rate": 2.2128378378378377e-07, + "loss": 0.0007, + "reward": 3.6885385513305664, + "reward_std": 0.03211810206994414, + "rewards/final_reward": 1.9013723725850795, + "rewards/mask_iou_reward": 0.9506861862925398, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6885384321212769, + "rewards/thk_ans_format_reward": 1.0, + "step": 2766, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.13542938232422, + "epoch": 9.347386172006745, + "grad_norm": 17.744072847836915, + "kl": 0.3955078125, + "learning_rate": 2.2100225225225226e-07, + "loss": 0.0004, + "reward": 3.4503098726272583, + "reward_std": 0.08233339712023735, + "rewards/final_reward": 1.1525913602996831, + "rewards/mask_iou_reward": 0.5762956801498416, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4503095149993896, + "rewards/thk_ans_format_reward": 1.0, + "step": 2767, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.9375, + "epoch": 9.350758853288363, + "grad_norm": 10.819239423457798, + "kl": 0.3896484375, + "learning_rate": 2.207207207207207e-07, + "loss": 0.0004, + "reward": 3.7137391567230225, + "reward_std": 0.07989808917045593, + "rewards/final_reward": 1.4908883016966383, + "rewards/mask_iou_reward": 0.7454441508483192, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7137390971183777, + "rewards/thk_ans_format_reward": 1.0, + "step": 2768, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.3229293823242, + "epoch": 9.354131534569984, + "grad_norm": 9.131795084672026, + "kl": 0.798828125, + "learning_rate": 2.2043918918918918e-07, + "loss": 0.0008, + "reward": 3.4504722356796265, + "reward_std": 0.025778494775295258, + "rewards/final_reward": 1.4927454592099887, + "rewards/mask_iou_reward": 0.7463727296049943, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4504724144935608, + "rewards/thk_ans_format_reward": 1.0, + "step": 2769, + "think_completion_length": 9.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.6041717529297, + "epoch": 9.357504215851602, + "grad_norm": 10.494561667187618, + "kl": 0.4384765625, + "learning_rate": 2.2015765765765764e-07, + "loss": 0.0004, + "reward": 3.1785553693771362, + "reward_std": 0.13898254744708538, + "rewards/final_reward": 1.6997683199459952, + "rewards/mask_iou_reward": 0.8498841599729976, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1785553693771362, + "rewards/thk_ans_format_reward": 1.0, + "step": 2770, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.375, + "epoch": 9.36087689713322, + "grad_norm": 18.847084911989935, + "kl": 0.96875, + "learning_rate": 2.198761261261261e-07, + "loss": 0.001, + "reward": 3.3483951091766357, + "reward_std": 0.0744620319455862, + "rewards/final_reward": 1.822842056260184, + "rewards/mask_iou_reward": 0.911421028130092, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3483951687812805, + "rewards/thk_ans_format_reward": 1.0, + "step": 2771, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.40625, + "epoch": 9.36424957841484, + "grad_norm": 13.94947088934637, + "kl": 0.431640625, + "learning_rate": 2.195945945945946e-07, + "loss": 0.0004, + "reward": 3.551551580429077, + "reward_std": 0.10540284961462021, + "rewards/final_reward": 1.421445187031141, + "rewards/mask_iou_reward": 0.7107225935155705, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5515515208244324, + "rewards/thk_ans_format_reward": 1.0, + "step": 2772, + "think_completion_length": 10.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.05208587646484, + "epoch": 9.367622259696459, + "grad_norm": 13.877847201874827, + "kl": 0.4296875, + "learning_rate": 2.1931306306306305e-07, + "loss": 0.0004, + "reward": 3.6659048795700073, + "reward_std": 0.030204295529983938, + "rewards/final_reward": 1.4251490346438225, + "rewards/mask_iou_reward": 0.7125745173219112, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6659048795700073, + "rewards/thk_ans_format_reward": 1.0, + "step": 2773, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.1979217529297, + "epoch": 9.370994940978077, + "grad_norm": 14.637147856784619, + "kl": 0.5185546875, + "learning_rate": 2.1903153153153154e-07, + "loss": 0.0006, + "reward": 3.6994014978408813, + "reward_std": 0.028351569548249245, + "rewards/final_reward": 1.422695676590725, + "rewards/mask_iou_reward": 0.7113478382953625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.699401617050171, + "rewards/thk_ans_format_reward": 1.0, + "step": 2774, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.4166717529297, + "epoch": 9.374367622259696, + "grad_norm": 9.120025231728167, + "kl": 0.634765625, + "learning_rate": 2.1875e-07, + "loss": 0.0006, + "reward": 3.551692247390747, + "reward_std": 0.04547809809446335, + "rewards/final_reward": 1.16152331109033, + "rewards/mask_iou_reward": 0.580761655545165, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.551692008972168, + "rewards/thk_ans_format_reward": 1.0, + "step": 2775, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.25, + "epoch": 9.377740303541316, + "grad_norm": 9.89069404329003, + "kl": 0.4990234375, + "learning_rate": 2.1846846846846846e-07, + "loss": 0.0005, + "reward": 3.7603938579559326, + "reward_std": 0.013669957872480154, + "rewards/final_reward": 1.9294720336901374, + "rewards/mask_iou_reward": 0.9647360168450687, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7603938579559326, + "rewards/thk_ans_format_reward": 1.0, + "step": 2776, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.2604217529297, + "epoch": 9.381112984822934, + "grad_norm": 8.331803620841779, + "kl": 0.4443359375, + "learning_rate": 2.1818693693693694e-07, + "loss": 0.0004, + "reward": 3.568936824798584, + "reward_std": 0.21196496207267046, + "rewards/final_reward": 1.3753243020169517, + "rewards/mask_iou_reward": 0.6876621510084758, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.6106035709381104, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2777, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.27083587646484, + "epoch": 9.384485666104553, + "grad_norm": 10.35051680882014, + "kl": 0.455078125, + "learning_rate": 2.1790540540540538e-07, + "loss": 0.0005, + "reward": 3.6697943210601807, + "reward_std": 0.011216352228075266, + "rewards/final_reward": 1.5587942518005602, + "rewards/mask_iou_reward": 0.7793971259002801, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6697942018508911, + "rewards/thk_ans_format_reward": 1.0, + "step": 2778, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.28125762939453, + "epoch": 9.387858347386173, + "grad_norm": 15.491498623038067, + "kl": 0.4501953125, + "learning_rate": 2.1762387387387387e-07, + "loss": 0.0005, + "reward": 3.496561288833618, + "reward_std": 0.05198364332318306, + "rewards/final_reward": 1.6413194591458895, + "rewards/mask_iou_reward": 0.8206597295729448, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4965611696243286, + "rewards/thk_ans_format_reward": 1.0, + "step": 2779, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.39584350585938, + "epoch": 9.391231028667791, + "grad_norm": 13.157649856445605, + "kl": 0.4296875, + "learning_rate": 2.1734234234234233e-07, + "loss": 0.0004, + "reward": 3.51545250415802, + "reward_std": 0.05822751484811306, + "rewards/final_reward": 1.3395250521672077, + "rewards/mask_iou_reward": 0.6697625260836039, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5154521465301514, + "rewards/thk_ans_format_reward": 1.0, + "step": 2780, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.11459350585938, + "epoch": 9.39460370994941, + "grad_norm": 10.604900553990017, + "kl": 0.4404296875, + "learning_rate": 2.170608108108108e-07, + "loss": 0.0005, + "reward": 3.6119425296783447, + "reward_std": 0.06541701033711433, + "rewards/final_reward": 1.712878393204974, + "rewards/mask_iou_reward": 0.856439196602487, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6119424104690552, + "rewards/thk_ans_format_reward": 1.0, + "step": 2781, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.34375, + "epoch": 9.397976391231028, + "grad_norm": 10.330735606148046, + "kl": 0.486328125, + "learning_rate": 2.1677927927927927e-07, + "loss": 0.0005, + "reward": 3.646028161048889, + "reward_std": 0.1866953857243061, + "rewards/final_reward": 1.9406015070581604, + "rewards/mask_iou_reward": 0.9703007535290802, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6668614745140076, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2782, + "think_completion_length": 10.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.0729217529297, + "epoch": 9.401349072512648, + "grad_norm": 82.46974246026812, + "kl": 0.572265625, + "learning_rate": 2.1649774774774774e-07, + "loss": 0.0006, + "reward": 3.3745267391204834, + "reward_std": 0.10908135771751404, + "rewards/final_reward": 1.268176157405094, + "rewards/mask_iou_reward": 0.634088078702547, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3849433660507202, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2783, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.03125, + "epoch": 9.404721753794266, + "grad_norm": 30.945234479281133, + "kl": 0.375, + "learning_rate": 2.1621621621621622e-07, + "loss": 0.0004, + "reward": 3.5293290615081787, + "reward_std": 0.0759376734495163, + "rewards/final_reward": 1.6069022349327682, + "rewards/mask_iou_reward": 0.8034511174663841, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.529329001903534, + "rewards/thk_ans_format_reward": 1.0, + "step": 2784, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.02083587646484, + "epoch": 9.408094435075885, + "grad_norm": 22.46595324259033, + "kl": 0.501953125, + "learning_rate": 2.1593468468468468e-07, + "loss": 0.0005, + "reward": 3.8743661642074585, + "reward_std": 0.05244017764925957, + "rewards/final_reward": 1.8538225025823447, + "rewards/mask_iou_reward": 0.9269112512911724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8743661642074585, + "rewards/thk_ans_format_reward": 1.0, + "step": 2785, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.25, + "epoch": 9.411467116357505, + "grad_norm": 87.8717800260948, + "kl": 0.4267578125, + "learning_rate": 2.1565315315315314e-07, + "loss": 0.0004, + "reward": 3.36142361164093, + "reward_std": 0.08803194761276245, + "rewards/final_reward": 1.5342241975445043, + "rewards/mask_iou_reward": 0.7671120987722522, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3614235520362854, + "rewards/thk_ans_format_reward": 1.0, + "step": 2786, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.875, + "epoch": 9.414839797639123, + "grad_norm": 69.32235015398565, + "kl": 0.5048828125, + "learning_rate": 2.1537162162162163e-07, + "loss": 0.0005, + "reward": 3.302255868911743, + "reward_std": 0.07581621408462524, + "rewards/final_reward": 1.2662602907992375, + "rewards/mask_iou_reward": 0.6331301453996188, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3022557497024536, + "rewards/thk_ans_format_reward": 1.0, + "step": 2787, + "think_completion_length": 10.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.50000762939453, + "epoch": 9.418212478920742, + "grad_norm": 22.671197228263072, + "kl": 0.482421875, + "learning_rate": 2.1509009009009006e-07, + "loss": 0.0005, + "reward": 3.691166639328003, + "reward_std": 0.029686040244996548, + "rewards/final_reward": 1.8484224625982473, + "rewards/mask_iou_reward": 0.9242112312991236, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6911665201187134, + "rewards/thk_ans_format_reward": 1.0, + "step": 2788, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.30208587646484, + "epoch": 9.42158516020236, + "grad_norm": 20.21814340136336, + "kl": 0.42578125, + "learning_rate": 2.1480855855855855e-07, + "loss": 0.0004, + "reward": 3.7652477025985718, + "reward_std": 0.060860181925818324, + "rewards/final_reward": 1.9556798290556365, + "rewards/mask_iou_reward": 0.9778399145278183, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7652477025985718, + "rewards/thk_ans_format_reward": 1.0, + "step": 2789, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.4479217529297, + "epoch": 9.42495784148398, + "grad_norm": 72.0763048743931, + "kl": 0.525390625, + "learning_rate": 2.14527027027027e-07, + "loss": 0.0005, + "reward": 3.4164334535598755, + "reward_std": 0.1100648082792759, + "rewards/final_reward": 1.1242582255843374, + "rewards/mask_iou_reward": 0.5621291127921687, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.416433334350586, + "rewards/thk_ans_format_reward": 1.0, + "step": 2790, + "think_completion_length": 9.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.77084350585938, + "epoch": 9.428330522765599, + "grad_norm": 13.376003044819086, + "kl": 0.498046875, + "learning_rate": 2.1424549549549547e-07, + "loss": 0.0005, + "reward": 3.608410954475403, + "reward_std": 0.12914259731769562, + "rewards/final_reward": 1.7236305317824567, + "rewards/mask_iou_reward": 0.8618152658912284, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6084111332893372, + "rewards/thk_ans_format_reward": 1.0, + "step": 2791, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.4479217529297, + "epoch": 9.431703204047217, + "grad_norm": 7.346062847254678, + "kl": 0.5341796875, + "learning_rate": 2.1396396396396396e-07, + "loss": 0.0007, + "reward": 3.62905216217041, + "reward_std": 0.049922844395041466, + "rewards/final_reward": 1.8607454231881027, + "rewards/mask_iou_reward": 0.9303727115940513, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6290519833564758, + "rewards/thk_ans_format_reward": 1.0, + "step": 2792, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.5208396911621, + "epoch": 9.435075885328837, + "grad_norm": 17.302456866640764, + "kl": 0.47265625, + "learning_rate": 2.1368243243243242e-07, + "loss": 0.0005, + "reward": 3.407977819442749, + "reward_std": 0.23191232979297638, + "rewards/final_reward": 1.5567038813895737, + "rewards/mask_iou_reward": 0.7783519406947869, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4079777002334595, + "rewards/thk_ans_format_reward": 1.0, + "step": 2793, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.0104217529297, + "epoch": 9.438448566610456, + "grad_norm": 23.319498306812275, + "kl": 0.4443359375, + "learning_rate": 2.134009009009009e-07, + "loss": 0.0004, + "reward": 3.6632840633392334, + "reward_std": 0.04057050496339798, + "rewards/final_reward": 1.3894538766016005, + "rewards/mask_iou_reward": 0.6947269383008002, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6632840633392334, + "rewards/thk_ans_format_reward": 1.0, + "step": 2794, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.3854217529297, + "epoch": 9.441821247892074, + "grad_norm": 34.32798971403974, + "kl": 0.6640625, + "learning_rate": 2.1311936936936937e-07, + "loss": 0.0007, + "reward": 3.824118494987488, + "reward_std": 0.09926417097449303, + "rewards/final_reward": 1.7717287731157554, + "rewards/mask_iou_reward": 0.8858643865578777, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8241184949874878, + "rewards/thk_ans_format_reward": 1.0, + "step": 2795, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.2916717529297, + "epoch": 9.445193929173692, + "grad_norm": 12.333120877458516, + "kl": 0.4375, + "learning_rate": 2.1283783783783783e-07, + "loss": 0.0005, + "reward": 3.581265926361084, + "reward_std": 0.08018996939063072, + "rewards/final_reward": 1.5700802326638046, + "rewards/mask_iou_reward": 0.7850401163319023, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5812659859657288, + "rewards/thk_ans_format_reward": 1.0, + "step": 2796, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.71875762939453, + "epoch": 9.448566610455313, + "grad_norm": 7.065641300682204, + "kl": 0.4169921875, + "learning_rate": 2.1255630630630632e-07, + "loss": 0.0005, + "reward": 3.70440411567688, + "reward_std": 0.04049869813024998, + "rewards/final_reward": 1.2983284435416518, + "rewards/mask_iou_reward": 0.6491642217708259, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7044039964675903, + "rewards/thk_ans_format_reward": 1.0, + "step": 2797, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.28125, + "epoch": 9.451939291736931, + "grad_norm": 13.797851066630797, + "kl": 0.4619140625, + "learning_rate": 2.1227477477477475e-07, + "loss": 0.0005, + "reward": 3.374970555305481, + "reward_std": 0.0377837847918272, + "rewards/final_reward": 1.3635178205191425, + "rewards/mask_iou_reward": 0.6817589102595712, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.374970555305481, + "rewards/thk_ans_format_reward": 1.0, + "step": 2798, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.58333587646484, + "epoch": 9.45531197301855, + "grad_norm": 7.334236819259594, + "kl": 0.4619140625, + "learning_rate": 2.1199324324324324e-07, + "loss": 0.0005, + "reward": 3.2692062854766846, + "reward_std": 0.084585752338171, + "rewards/final_reward": 1.1545577914849081, + "rewards/mask_iou_reward": 0.5772788957424541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2692063450813293, + "rewards/thk_ans_format_reward": 1.0, + "step": 2799, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.36459350585938, + "epoch": 9.45868465430017, + "grad_norm": 23.920565501545674, + "kl": 0.525390625, + "learning_rate": 2.117117117117117e-07, + "loss": 0.0005, + "reward": 3.405628800392151, + "reward_std": 0.1159110739827156, + "rewards/final_reward": 1.7630714442007784, + "rewards/mask_iou_reward": 0.8815357221003892, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4056288003921509, + "rewards/thk_ans_format_reward": 1.0, + "step": 2800, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.03126525878906, + "epoch": 9.462057335581788, + "grad_norm": 7.875361715270865, + "kl": 0.5126953125, + "learning_rate": 2.1143018018018016e-07, + "loss": 0.0005, + "reward": 3.4100340604782104, + "reward_std": 0.08821703493595123, + "rewards/final_reward": 1.370184435113929, + "rewards/mask_iou_reward": 0.6850922175569645, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4100341200828552, + "rewards/thk_ans_format_reward": 1.0, + "step": 2801, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.67708587646484, + "epoch": 9.465430016863406, + "grad_norm": 19.085261433748908, + "kl": 0.5966796875, + "learning_rate": 2.1114864864864865e-07, + "loss": 0.0006, + "reward": 3.590638756752014, + "reward_std": 0.023143062833696604, + "rewards/final_reward": 1.7881143745703603, + "rewards/mask_iou_reward": 0.8940571872851801, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5906386971473694, + "rewards/thk_ans_format_reward": 1.0, + "step": 2802, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.93750381469727, + "epoch": 9.468802698145025, + "grad_norm": 13.034191044866834, + "kl": 0.423828125, + "learning_rate": 2.108671171171171e-07, + "loss": 0.0004, + "reward": 3.4855542182922363, + "reward_std": 0.11600197479128838, + "rewards/final_reward": 1.338612547361113, + "rewards/mask_iou_reward": 0.6693062736805565, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4855543375015259, + "rewards/thk_ans_format_reward": 1.0, + "step": 2803, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.84375, + "epoch": 9.472175379426645, + "grad_norm": 16.930757784970062, + "kl": 0.470703125, + "learning_rate": 2.1058558558558557e-07, + "loss": 0.0005, + "reward": 3.5270018577575684, + "reward_std": 0.05098412372171879, + "rewards/final_reward": 1.841535809184157, + "rewards/mask_iou_reward": 0.9207679045920785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.527001678943634, + "rewards/thk_ans_format_reward": 1.0, + "step": 2804, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 141.59375, + "epoch": 9.475548060708263, + "grad_norm": 8.068372827679651, + "kl": 0.591796875, + "learning_rate": 2.1030405405405406e-07, + "loss": 0.0006, + "reward": 3.5839741230010986, + "reward_std": 0.021892188116908073, + "rewards/final_reward": 1.6267602993609627, + "rewards/mask_iou_reward": 0.8133801496804813, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.583974003791809, + "rewards/thk_ans_format_reward": 1.0, + "step": 2805, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.20833587646484, + "epoch": 9.478920741989882, + "grad_norm": 8.689718251800521, + "kl": 0.4111328125, + "learning_rate": 2.1002252252252252e-07, + "loss": 0.0004, + "reward": 3.6123223304748535, + "reward_std": 0.030155442655086517, + "rewards/final_reward": 1.8596118805407436, + "rewards/mask_iou_reward": 0.9298059402703718, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6123226284980774, + "rewards/thk_ans_format_reward": 1.0, + "step": 2806, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.31250762939453, + "epoch": 9.4822934232715, + "grad_norm": 8.01553579629774, + "kl": 0.447265625, + "learning_rate": 2.09740990990991e-07, + "loss": 0.0005, + "reward": 3.1493096351623535, + "reward_std": 0.18032991886138916, + "rewards/final_reward": 1.1444689445247709, + "rewards/mask_iou_reward": 0.5722344722623854, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1493096947669983, + "rewards/thk_ans_format_reward": 1.0, + "step": 2807, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 97.47916793823242, + "epoch": 9.48566610455312, + "grad_norm": 6.552411445556528, + "kl": 0.5888671875, + "learning_rate": 2.0945945945945944e-07, + "loss": 0.0006, + "reward": 3.522453546524048, + "reward_std": 0.012052702717483044, + "rewards/final_reward": 1.1182514968418769, + "rewards/mask_iou_reward": 0.5591257484209384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5224534273147583, + "rewards/thk_ans_format_reward": 1.0, + "step": 2808, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.71875762939453, + "epoch": 9.489038785834738, + "grad_norm": 8.233794743221564, + "kl": 0.4638671875, + "learning_rate": 2.091779279279279e-07, + "loss": 0.0005, + "reward": 3.684836983680725, + "reward_std": 0.10470253601670265, + "rewards/final_reward": 1.3776622207917923, + "rewards/mask_iou_reward": 0.6888311103958962, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6848370432853699, + "rewards/thk_ans_format_reward": 1.0, + "step": 2809, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.4479217529297, + "epoch": 9.492411467116357, + "grad_norm": 23.72330348754612, + "kl": 0.458984375, + "learning_rate": 2.0889639639639638e-07, + "loss": 0.0005, + "reward": 3.4966185092926025, + "reward_std": 0.057080830447375774, + "rewards/final_reward": 1.0069303965521077, + "rewards/mask_iou_reward": 0.5034651982760538, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.496618628501892, + "rewards/thk_ans_format_reward": 1.0, + "step": 2810, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.43750762939453, + "epoch": 9.495784148397977, + "grad_norm": 15.26687033533583, + "kl": 0.591796875, + "learning_rate": 2.0861486486486485e-07, + "loss": 0.0006, + "reward": 3.5150842666625977, + "reward_std": 0.1310267341323197, + "rewards/final_reward": 1.6641343179725157, + "rewards/mask_iou_reward": 0.8320671589862578, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5359174609184265, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2811, + "think_completion_length": 10.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.14583587646484, + "epoch": 9.499156829679595, + "grad_norm": 8.620305372098384, + "kl": 0.44140625, + "learning_rate": 2.0833333333333333e-07, + "loss": 0.0005, + "reward": 3.5341700315475464, + "reward_std": 0.049245577305555344, + "rewards/final_reward": 1.544416112337022, + "rewards/mask_iou_reward": 0.772208056168511, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.534170150756836, + "rewards/thk_ans_format_reward": 1.0, + "step": 2812, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.86458587646484, + "epoch": 9.502529510961214, + "grad_norm": 6.127603181830918, + "kl": 0.5126953125, + "learning_rate": 2.080518018018018e-07, + "loss": 0.0005, + "reward": 3.625041961669922, + "reward_std": 0.041364286094903946, + "rewards/final_reward": 1.5758643227687703, + "rewards/mask_iou_reward": 0.7879321613843852, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6250417828559875, + "rewards/thk_ans_format_reward": 1.0, + "step": 2813, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.66667938232422, + "epoch": 9.505902192242832, + "grad_norm": 14.304108783799528, + "kl": 0.5126953125, + "learning_rate": 2.0777027027027025e-07, + "loss": 0.0005, + "reward": 3.7484500408172607, + "reward_std": 0.051718422677367926, + "rewards/final_reward": 1.8292283794214241, + "rewards/mask_iou_reward": 0.9146141897107121, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.748449981212616, + "rewards/thk_ans_format_reward": 1.0, + "step": 2814, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.4479217529297, + "epoch": 9.509274873524452, + "grad_norm": 9.773661468760594, + "kl": 0.3955078125, + "learning_rate": 2.0748873873873874e-07, + "loss": 0.0004, + "reward": 3.4886139631271362, + "reward_std": 0.045159148052334785, + "rewards/final_reward": 1.7602771452812136, + "rewards/mask_iou_reward": 0.8801385726406068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4886139631271362, + "rewards/thk_ans_format_reward": 1.0, + "step": 2815, + "think_completion_length": 7.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.98958587646484, + "epoch": 9.51264755480607, + "grad_norm": 16.820619286046863, + "kl": 0.4951171875, + "learning_rate": 2.072072072072072e-07, + "loss": 0.0006, + "reward": 3.7381627559661865, + "reward_std": 0.11359736323356628, + "rewards/final_reward": 1.5969598305950474, + "rewards/mask_iou_reward": 0.7984799152975237, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.738162636756897, + "rewards/thk_ans_format_reward": 1.0, + "step": 2816, + "think_completion_length": 7.083333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.42709350585938, + "epoch": 9.516020236087689, + "grad_norm": 25.73561044687546, + "kl": 0.587890625, + "learning_rate": 2.069256756756757e-07, + "loss": 0.0006, + "reward": 3.377044200897217, + "reward_std": 0.0334283453412354, + "rewards/final_reward": 1.0100619557690784, + "rewards/mask_iou_reward": 0.5050309778845392, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3770442605018616, + "rewards/thk_ans_format_reward": 1.0, + "step": 2817, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.8541717529297, + "epoch": 9.51939291736931, + "grad_norm": 24.29405298701604, + "kl": 0.4287109375, + "learning_rate": 2.0664414414414412e-07, + "loss": 0.0005, + "reward": 3.7982823848724365, + "reward_std": 0.2190344613045454, + "rewards/final_reward": 1.7568705996030087, + "rewards/mask_iou_reward": 0.8784352998015044, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.8399492502212524, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2818, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.05208587646484, + "epoch": 9.522765598650928, + "grad_norm": 53.99049145280786, + "kl": 0.470703125, + "learning_rate": 2.0636261261261258e-07, + "loss": 0.0005, + "reward": 3.6919091939926147, + "reward_std": 0.023173667024821043, + "rewards/final_reward": 1.84371278506089, + "rewards/mask_iou_reward": 0.921856392530445, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6919094324111938, + "rewards/thk_ans_format_reward": 1.0, + "step": 2819, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.1354217529297, + "epoch": 9.526138279932546, + "grad_norm": 16.178033795092553, + "kl": 0.431640625, + "learning_rate": 2.0608108108108107e-07, + "loss": 0.0004, + "reward": 3.308905839920044, + "reward_std": 0.08162033371627331, + "rewards/final_reward": 1.4741759558412735, + "rewards/mask_iou_reward": 0.7370879779206367, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.308905839920044, + "rewards/thk_ans_format_reward": 1.0, + "step": 2820, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.23959350585938, + "epoch": 9.529510961214164, + "grad_norm": 8.992478435504813, + "kl": 0.3759765625, + "learning_rate": 2.0579954954954953e-07, + "loss": 0.0004, + "reward": 3.4609246253967285, + "reward_std": 0.02671785280108452, + "rewards/final_reward": 0.9214808888032776, + "rewards/mask_iou_reward": 0.4607404444016388, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4609246253967285, + "rewards/thk_ans_format_reward": 1.0, + "step": 2821, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.65625762939453, + "epoch": 9.532883642495785, + "grad_norm": 44.005912402987626, + "kl": 0.4619140625, + "learning_rate": 2.0551801801801802e-07, + "loss": 0.0005, + "reward": 3.5948944091796875, + "reward_std": 0.012664198991842568, + "rewards/final_reward": 1.7858823723189787, + "rewards/mask_iou_reward": 0.8929411861594894, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.594894289970398, + "rewards/thk_ans_format_reward": 1.0, + "step": 2822, + "think_completion_length": 6.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.7916793823242, + "epoch": 9.536256323777403, + "grad_norm": 50.977032139791866, + "kl": 0.5048828125, + "learning_rate": 2.0523648648648648e-07, + "loss": 0.0005, + "reward": 3.54253351688385, + "reward_std": 0.6379697918891907, + "rewards/final_reward": 1.8271490412164877, + "rewards/mask_iou_reward": 0.9135745206082438, + "rewards/sam_format_reward": 0.9375000298023224, + "rewards/sam_reward_func_ultra": 1.6675333976745605, + "rewards/thk_ans_format_reward": 0.9375000298023224, + "step": 2823, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.56250762939453, + "epoch": 9.539629005059021, + "grad_norm": 11.056068929360753, + "kl": 0.41796875, + "learning_rate": 2.0495495495495494e-07, + "loss": 0.0004, + "reward": 3.621427297592163, + "reward_std": 0.030739820562303066, + "rewards/final_reward": 1.3949359143596283, + "rewards/mask_iou_reward": 0.6974679571798141, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6214274168014526, + "rewards/thk_ans_format_reward": 1.0, + "step": 2824, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.46875762939453, + "epoch": 9.543001686340641, + "grad_norm": 19.432115157877316, + "kl": 0.4228515625, + "learning_rate": 2.0467342342342343e-07, + "loss": 0.0004, + "reward": 3.6139813661575317, + "reward_std": 0.07440846040844917, + "rewards/final_reward": 1.8043686765361797, + "rewards/mask_iou_reward": 0.9021843382680899, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6139812469482422, + "rewards/thk_ans_format_reward": 1.0, + "step": 2825, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.39583587646484, + "epoch": 9.54637436762226, + "grad_norm": 30.093602692816805, + "kl": 0.4404296875, + "learning_rate": 2.043918918918919e-07, + "loss": 0.0005, + "reward": 3.518091917037964, + "reward_std": 0.22877466678619385, + "rewards/final_reward": 1.5167956450892555, + "rewards/mask_iou_reward": 0.7583978225446277, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5389251112937927, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2826, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.0729217529297, + "epoch": 9.549747048903878, + "grad_norm": 8.053954032812966, + "kl": 0.3759765625, + "learning_rate": 2.0411036036036035e-07, + "loss": 0.0004, + "reward": 3.590845227241516, + "reward_std": 0.020092520862817764, + "rewards/final_reward": 1.6205377049667196, + "rewards/mask_iou_reward": 0.8102688524833598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5908452272415161, + "rewards/thk_ans_format_reward": 1.0, + "step": 2827, + "think_completion_length": 9.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.43750762939453, + "epoch": 9.553119730185497, + "grad_norm": 6.808401299666237, + "kl": 0.48828125, + "learning_rate": 2.038288288288288e-07, + "loss": 0.0005, + "reward": 3.514745831489563, + "reward_std": 0.057424647733569145, + "rewards/final_reward": 0.5211337043794396, + "rewards/mask_iou_reward": 0.2605668521897198, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5147458910942078, + "rewards/thk_ans_format_reward": 1.0, + "step": 2828, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.30208587646484, + "epoch": 9.556492411467117, + "grad_norm": 12.857766432793152, + "kl": 0.4296875, + "learning_rate": 2.0354729729729727e-07, + "loss": 0.0005, + "reward": 3.6339670419692993, + "reward_std": 0.162049344740808, + "rewards/final_reward": 1.6951763072555914, + "rewards/mask_iou_reward": 0.8475881536277957, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6548004150390625, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2829, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.33333587646484, + "epoch": 9.559865092748735, + "grad_norm": 8.916261260749389, + "kl": 0.3984375, + "learning_rate": 2.0326576576576576e-07, + "loss": 0.0005, + "reward": 3.8546788692474365, + "reward_std": 0.035455340053886175, + "rewards/final_reward": 1.8457946564614258, + "rewards/mask_iou_reward": 0.9228973282307129, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8546786308288574, + "rewards/thk_ans_format_reward": 1.0, + "step": 2830, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.01042938232422, + "epoch": 9.563237774030354, + "grad_norm": 89.02326475916136, + "kl": 0.439453125, + "learning_rate": 2.0298423423423422e-07, + "loss": 0.0004, + "reward": 3.3965485095977783, + "reward_std": 0.07727420050650835, + "rewards/final_reward": 1.6133975362772532, + "rewards/mask_iou_reward": 0.8066987681386266, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3965486288070679, + "rewards/thk_ans_format_reward": 1.0, + "step": 2831, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.48958587646484, + "epoch": 9.566610455311974, + "grad_norm": 21.9967390475209, + "kl": 0.4501953125, + "learning_rate": 2.027027027027027e-07, + "loss": 0.0005, + "reward": 3.4815778732299805, + "reward_std": 0.0476123932749033, + "rewards/final_reward": 1.590671112034137, + "rewards/mask_iou_reward": 0.7953355560170685, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4815776944160461, + "rewards/thk_ans_format_reward": 1.0, + "step": 2832, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.56250762939453, + "epoch": 9.569983136593592, + "grad_norm": 10.440515270065333, + "kl": 0.60546875, + "learning_rate": 2.0242117117117117e-07, + "loss": 0.0006, + "reward": 3.641559362411499, + "reward_std": 0.08363806898705661, + "rewards/final_reward": 1.3991671911885808, + "rewards/mask_iou_reward": 0.6995835955942904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.641559362411499, + "rewards/thk_ans_format_reward": 1.0, + "step": 2833, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.10417938232422, + "epoch": 9.57335581787521, + "grad_norm": 8.531738541716415, + "kl": 0.5087890625, + "learning_rate": 2.0213963963963963e-07, + "loss": 0.0005, + "reward": 3.0582525730133057, + "reward_std": 0.03069372847676277, + "rewards/final_reward": 1.8473170777391852, + "rewards/mask_iou_reward": 0.9236585388695926, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0582523941993713, + "rewards/thk_ans_format_reward": 1.0, + "step": 2834, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.4479217529297, + "epoch": 9.576728499156829, + "grad_norm": 8.21043348175991, + "kl": 0.703125, + "learning_rate": 2.0185810810810811e-07, + "loss": 0.0007, + "reward": 3.729646682739258, + "reward_std": 0.06171860918402672, + "rewards/final_reward": 1.3267377948958567, + "rewards/mask_iou_reward": 0.6633688974479284, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7296466827392578, + "rewards/thk_ans_format_reward": 1.0, + "step": 2835, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.2708435058594, + "epoch": 9.580101180438449, + "grad_norm": 10.642499161128272, + "kl": 0.427734375, + "learning_rate": 2.0157657657657657e-07, + "loss": 0.0004, + "reward": 3.362258553504944, + "reward_std": 0.16495228372514248, + "rewards/final_reward": 1.7112586410634867, + "rewards/mask_iou_reward": 0.8556293205317433, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3830917477607727, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2836, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.20833587646484, + "epoch": 9.583473861720067, + "grad_norm": 21.840827938504045, + "kl": 0.513671875, + "learning_rate": 2.0129504504504503e-07, + "loss": 0.0005, + "reward": 3.5530951023101807, + "reward_std": 0.05677308700978756, + "rewards/final_reward": 1.543489324604837, + "rewards/mask_iou_reward": 0.7717446623024184, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5530951023101807, + "rewards/thk_ans_format_reward": 1.0, + "step": 2837, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.30208587646484, + "epoch": 9.586846543001686, + "grad_norm": 9.55288922944407, + "kl": 0.462890625, + "learning_rate": 2.010135135135135e-07, + "loss": 0.0005, + "reward": 3.3496075868606567, + "reward_std": 0.05496904905885458, + "rewards/final_reward": 0.8730091994708987, + "rewards/mask_iou_reward": 0.43650459973544936, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3496074676513672, + "rewards/thk_ans_format_reward": 1.0, + "step": 2838, + "think_completion_length": 11.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.7916717529297, + "epoch": 9.590219224283306, + "grad_norm": 18.349280040949843, + "kl": 0.412109375, + "learning_rate": 2.0073198198198196e-07, + "loss": 0.0005, + "reward": 3.684578537940979, + "reward_std": 0.17463991791009903, + "rewards/final_reward": 1.3727932052574463, + "rewards/mask_iou_reward": 0.6863966026287232, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.7054117321968079, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2839, + "think_completion_length": 10.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.36458587646484, + "epoch": 9.593591905564924, + "grad_norm": 13.069291248829193, + "kl": 0.525390625, + "learning_rate": 2.0045045045045044e-07, + "loss": 0.0005, + "reward": 3.7611004114151, + "reward_std": 0.10698830150067806, + "rewards/final_reward": 1.5477931511291438, + "rewards/mask_iou_reward": 0.7738965755645719, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7611003518104553, + "rewards/thk_ans_format_reward": 1.0, + "step": 2840, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.17708587646484, + "epoch": 9.596964586846543, + "grad_norm": 11.268109832702788, + "kl": 0.42578125, + "learning_rate": 2.001689189189189e-07, + "loss": 0.0004, + "reward": 3.5092090368270874, + "reward_std": 0.08594032749533653, + "rewards/final_reward": 1.531189337710352, + "rewards/mask_iou_reward": 0.765594668855176, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.509209156036377, + "rewards/thk_ans_format_reward": 1.0, + "step": 2841, + "think_completion_length": 10.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.30209350585938, + "epoch": 9.600337268128161, + "grad_norm": 7.4274243871513805, + "kl": 0.4140625, + "learning_rate": 1.998873873873874e-07, + "loss": 0.0005, + "reward": 3.5415148735046387, + "reward_std": 0.046683117747306824, + "rewards/final_reward": 1.481277049260404, + "rewards/mask_iou_reward": 0.740638524630202, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5415149927139282, + "rewards/thk_ans_format_reward": 1.0, + "step": 2842, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.87500762939453, + "epoch": 9.603709949409781, + "grad_norm": 55.56419412559832, + "kl": 0.4609375, + "learning_rate": 1.9960585585585585e-07, + "loss": 0.0005, + "reward": 3.7100802659988403, + "reward_std": 0.023584270384162664, + "rewards/final_reward": 1.7788794468594284, + "rewards/mask_iou_reward": 0.8894397234297142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7100803852081299, + "rewards/thk_ans_format_reward": 1.0, + "step": 2843, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.81250762939453, + "epoch": 9.6070826306914, + "grad_norm": 52.155557130505954, + "kl": 0.4541015625, + "learning_rate": 1.993243243243243e-07, + "loss": 0.0005, + "reward": 3.713410258293152, + "reward_std": 0.026679479517042637, + "rewards/final_reward": 1.7559925178717823, + "rewards/mask_iou_reward": 0.8779962589358912, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7134102582931519, + "rewards/thk_ans_format_reward": 1.0, + "step": 2844, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.9583396911621, + "epoch": 9.610455311973018, + "grad_norm": 13.12615115215815, + "kl": 0.46875, + "learning_rate": 1.990427927927928e-07, + "loss": 0.0005, + "reward": 3.2496907711029053, + "reward_std": 0.22381212748587132, + "rewards/final_reward": 1.5477438783253565, + "rewards/mask_iou_reward": 0.7738719391626783, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.291357159614563, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2845, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.58334350585938, + "epoch": 9.613827993254638, + "grad_norm": 12.077068644217759, + "kl": 0.4794921875, + "learning_rate": 1.9876126126126126e-07, + "loss": 0.0005, + "reward": 3.672512650489807, + "reward_std": 0.04940144717693329, + "rewards/final_reward": 1.7971136101756204, + "rewards/mask_iou_reward": 0.8985568050878102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.672512710094452, + "rewards/thk_ans_format_reward": 1.0, + "step": 2846, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.86459350585938, + "epoch": 9.617200674536257, + "grad_norm": 11.875814882323452, + "kl": 0.4873046875, + "learning_rate": 1.9847972972972972e-07, + "loss": 0.0005, + "reward": 3.702423334121704, + "reward_std": 0.0695484783500433, + "rewards/final_reward": 1.546965553843143, + "rewards/mask_iou_reward": 0.7734827769215715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7024234533309937, + "rewards/thk_ans_format_reward": 1.0, + "step": 2847, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.41667938232422, + "epoch": 9.620573355817875, + "grad_norm": 12.888691948386171, + "kl": 0.427734375, + "learning_rate": 1.9819819819819818e-07, + "loss": 0.0004, + "reward": 3.688524842262268, + "reward_std": 0.06056614965200424, + "rewards/final_reward": 1.271105252712622, + "rewards/mask_iou_reward": 0.635552626356311, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.688524842262268, + "rewards/thk_ans_format_reward": 1.0, + "step": 2848, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.73958587646484, + "epoch": 9.623946037099493, + "grad_norm": 9.132160917417677, + "kl": 0.4716796875, + "learning_rate": 1.9791666666666664e-07, + "loss": 0.0005, + "reward": 3.5782171487808228, + "reward_std": 0.056133901700377464, + "rewards/final_reward": 1.1767378869727714, + "rewards/mask_iou_reward": 0.5883689434863857, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.578217089176178, + "rewards/thk_ans_format_reward": 1.0, + "step": 2849, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.5104217529297, + "epoch": 9.627318718381114, + "grad_norm": 34.526748684702255, + "kl": 0.4560546875, + "learning_rate": 1.9763513513513513e-07, + "loss": 0.0005, + "reward": 3.475613832473755, + "reward_std": 0.07276524603366852, + "rewards/final_reward": 1.910589242117398, + "rewards/mask_iou_reward": 0.955294621058699, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4756136536598206, + "rewards/thk_ans_format_reward": 1.0, + "step": 2850, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.1041717529297, + "epoch": 9.630691399662732, + "grad_norm": 17.432998355345177, + "kl": 0.43359375, + "learning_rate": 1.973536036036036e-07, + "loss": 0.0004, + "reward": 3.3042174577713013, + "reward_std": 0.11458679661154747, + "rewards/final_reward": 0.8685805073799024, + "rewards/mask_iou_reward": 0.4342902536899512, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3042174577713013, + "rewards/thk_ans_format_reward": 1.0, + "step": 2851, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.40625762939453, + "epoch": 9.63406408094435, + "grad_norm": 16.700568944178123, + "kl": 0.3974609375, + "learning_rate": 1.9707207207207208e-07, + "loss": 0.0004, + "reward": 3.670613646507263, + "reward_std": 0.06477308459579945, + "rewards/final_reward": 1.623943956731143, + "rewards/mask_iou_reward": 0.8119719783655714, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6706136465072632, + "rewards/thk_ans_format_reward": 1.0, + "step": 2852, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.875, + "epoch": 9.63743676222597, + "grad_norm": 7.654529656910214, + "kl": 0.365234375, + "learning_rate": 1.9679054054054054e-07, + "loss": 0.0004, + "reward": 3.3408288955688477, + "reward_std": 0.19411547109484673, + "rewards/final_reward": 1.2078860906808835, + "rewards/mask_iou_reward": 0.6039430453404417, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.3616620898246765, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2853, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.83333587646484, + "epoch": 9.640809443507589, + "grad_norm": 9.992895153761978, + "kl": 0.4052734375, + "learning_rate": 1.96509009009009e-07, + "loss": 0.0004, + "reward": 3.546285390853882, + "reward_std": 0.07918055914342403, + "rewards/final_reward": 1.3308878379968134, + "rewards/mask_iou_reward": 0.6654439189984067, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5462854504585266, + "rewards/thk_ans_format_reward": 1.0, + "step": 2854, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.2291717529297, + "epoch": 9.644182124789207, + "grad_norm": 22.997248674417847, + "kl": 0.533203125, + "learning_rate": 1.9622747747747749e-07, + "loss": 0.0005, + "reward": 3.7459217309951782, + "reward_std": 0.029171346686780453, + "rewards/final_reward": 1.7417187497924966, + "rewards/mask_iou_reward": 0.8708593748962483, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7459213733673096, + "rewards/thk_ans_format_reward": 1.0, + "step": 2855, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.6354217529297, + "epoch": 9.647554806070826, + "grad_norm": 35.438827198570095, + "kl": 0.3837890625, + "learning_rate": 1.9594594594594595e-07, + "loss": 0.0004, + "reward": 3.6280492544174194, + "reward_std": 0.028105991892516613, + "rewards/final_reward": 1.9573374531014816, + "rewards/mask_iou_reward": 0.9786687265507408, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6280492544174194, + "rewards/thk_ans_format_reward": 1.0, + "step": 2856, + "think_completion_length": 9.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.42708587646484, + "epoch": 9.650927487352446, + "grad_norm": 25.10256945414261, + "kl": 0.4345703125, + "learning_rate": 1.956644144144144e-07, + "loss": 0.0004, + "reward": 3.5365242958068848, + "reward_std": 0.05269638076424599, + "rewards/final_reward": 1.9359144661557814, + "rewards/mask_iou_reward": 0.9679572330778907, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5365243554115295, + "rewards/thk_ans_format_reward": 1.0, + "step": 2857, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.3541717529297, + "epoch": 9.654300168634064, + "grad_norm": 11.130338599723647, + "kl": 0.5693359375, + "learning_rate": 1.9538288288288287e-07, + "loss": 0.0006, + "reward": 3.5365134477615356, + "reward_std": 0.01926427148282528, + "rewards/final_reward": 0.9418421923795386, + "rewards/mask_iou_reward": 0.4709210961897693, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5365132093429565, + "rewards/thk_ans_format_reward": 1.0, + "step": 2858, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.69792938232422, + "epoch": 9.657672849915683, + "grad_norm": 41.97861008305546, + "kl": 0.5126953125, + "learning_rate": 1.9510135135135133e-07, + "loss": 0.0005, + "reward": 3.6388747692108154, + "reward_std": 0.08557260315865278, + "rewards/final_reward": 1.4884645818165254, + "rewards/mask_iou_reward": 0.7442322909082627, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6388747692108154, + "rewards/thk_ans_format_reward": 1.0, + "step": 2859, + "think_completion_length": 9.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8229217529297, + "epoch": 9.661045531197303, + "grad_norm": 9.662696333050443, + "kl": 0.66015625, + "learning_rate": 1.9481981981981982e-07, + "loss": 0.0006, + "reward": 3.6072356700897217, + "reward_std": 0.057722508907318115, + "rewards/final_reward": 1.7999765534732362, + "rewards/mask_iou_reward": 0.8999882767366181, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.607235610485077, + "rewards/thk_ans_format_reward": 1.0, + "step": 2860, + "think_completion_length": 10.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.0625, + "epoch": 9.664418212478921, + "grad_norm": 11.981101585335209, + "kl": 0.478515625, + "learning_rate": 1.9453828828828828e-07, + "loss": 0.0005, + "reward": 3.332722783088684, + "reward_std": 0.09129737317562103, + "rewards/final_reward": 0.5991891207919265, + "rewards/mask_iou_reward": 0.29959456039596327, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3327226638793945, + "rewards/thk_ans_format_reward": 1.0, + "step": 2861, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.3229217529297, + "epoch": 9.66779089376054, + "grad_norm": 25.87603734036553, + "kl": 0.4580078125, + "learning_rate": 1.9425675675675674e-07, + "loss": 0.0005, + "reward": 3.6100000143051147, + "reward_std": 0.07859287038445473, + "rewards/final_reward": 1.7322674504974922, + "rewards/mask_iou_reward": 0.8661337252487461, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6100000143051147, + "rewards/thk_ans_format_reward": 1.0, + "step": 2862, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.7604217529297, + "epoch": 9.671163575042158, + "grad_norm": 7.350777072868899, + "kl": 0.5625, + "learning_rate": 1.9397522522522522e-07, + "loss": 0.0006, + "reward": 3.685990810394287, + "reward_std": 0.03065543156117201, + "rewards/final_reward": 1.5288908551946414, + "rewards/mask_iou_reward": 0.7644454275973207, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6859906911849976, + "rewards/thk_ans_format_reward": 1.0, + "step": 2863, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.2395896911621, + "epoch": 9.674536256323778, + "grad_norm": 14.105106198347636, + "kl": 0.611328125, + "learning_rate": 1.9369369369369368e-07, + "loss": 0.0006, + "reward": 3.8575843572616577, + "reward_std": 0.021873501129448414, + "rewards/final_reward": 1.7784202895456058, + "rewards/mask_iou_reward": 0.8892101447728029, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8575843572616577, + "rewards/thk_ans_format_reward": 1.0, + "step": 2864, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.75000762939453, + "epoch": 9.677908937605396, + "grad_norm": 9.671511209060515, + "kl": 0.47265625, + "learning_rate": 1.9341216216216217e-07, + "loss": 0.0004, + "reward": 3.6749985218048096, + "reward_std": 0.062141310423612595, + "rewards/final_reward": 1.9276742195950716, + "rewards/mask_iou_reward": 0.9638371097975358, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6749984622001648, + "rewards/thk_ans_format_reward": 1.0, + "step": 2865, + "think_completion_length": 10.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.23959350585938, + "epoch": 9.681281618887015, + "grad_norm": 14.355187256720177, + "kl": 0.443359375, + "learning_rate": 1.9313063063063063e-07, + "loss": 0.0004, + "reward": 3.312918186187744, + "reward_std": 0.07452259492129087, + "rewards/final_reward": 1.587132035400053, + "rewards/mask_iou_reward": 0.7935660177000265, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3129181861877441, + "rewards/thk_ans_format_reward": 1.0, + "step": 2866, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.8229217529297, + "epoch": 9.684654300168635, + "grad_norm": 21.42101266336042, + "kl": 0.4296875, + "learning_rate": 1.9284909909909907e-07, + "loss": 0.0004, + "reward": 3.5092891454696655, + "reward_std": 0.0639540646225214, + "rewards/final_reward": 1.7018374552469127, + "rewards/mask_iou_reward": 0.8509187276234563, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5092891454696655, + "rewards/thk_ans_format_reward": 1.0, + "step": 2867, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.625, + "epoch": 9.688026981450253, + "grad_norm": 9.391503997617093, + "kl": 0.513671875, + "learning_rate": 1.9256756756756755e-07, + "loss": 0.0006, + "reward": 3.828369140625, + "reward_std": 0.05252628936432302, + "rewards/final_reward": 1.9016076696050992, + "rewards/mask_iou_reward": 0.9508038348025496, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.828369140625, + "rewards/thk_ans_format_reward": 1.0, + "step": 2868, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.89583587646484, + "epoch": 9.691399662731872, + "grad_norm": 9.51695567109779, + "kl": 0.576171875, + "learning_rate": 1.9228603603603601e-07, + "loss": 0.0006, + "reward": 3.6162601709365845, + "reward_std": 0.14732644706964493, + "rewards/final_reward": 1.7979722921278647, + "rewards/mask_iou_reward": 0.8989861460639323, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6162601113319397, + "rewards/thk_ans_format_reward": 1.0, + "step": 2869, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.69792938232422, + "epoch": 9.69477234401349, + "grad_norm": 13.885753410600678, + "kl": 0.4423828125, + "learning_rate": 1.920045045045045e-07, + "loss": 0.0005, + "reward": 3.571479082107544, + "reward_std": 0.037937651155516505, + "rewards/final_reward": 1.94194025944619, + "rewards/mask_iou_reward": 0.970970129723095, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5714789032936096, + "rewards/thk_ans_format_reward": 1.0, + "step": 2870, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.64584350585938, + "epoch": 9.69814502529511, + "grad_norm": 9.751534245853538, + "kl": 0.4345703125, + "learning_rate": 1.9172297297297296e-07, + "loss": 0.0005, + "reward": 3.57540225982666, + "reward_std": 0.08446568250656128, + "rewards/final_reward": 1.8323340722062373, + "rewards/mask_iou_reward": 0.9161670361031187, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5754019618034363, + "rewards/thk_ans_format_reward": 1.0, + "step": 2871, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.43751525878906, + "epoch": 9.701517706576729, + "grad_norm": 25.327767980045827, + "kl": 0.427734375, + "learning_rate": 1.9144144144144142e-07, + "loss": 0.0004, + "reward": 3.1767791509628296, + "reward_std": 0.16962359100580215, + "rewards/final_reward": 1.242573685966323, + "rewards/mask_iou_reward": 0.6212868429831615, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1767792701721191, + "rewards/thk_ans_format_reward": 1.0, + "step": 2872, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.0729217529297, + "epoch": 9.704890387858347, + "grad_norm": 9.41024703721106, + "kl": 0.435546875, + "learning_rate": 1.911599099099099e-07, + "loss": 0.0005, + "reward": 3.851110577583313, + "reward_std": 0.03305862098932266, + "rewards/final_reward": 1.9446028234279495, + "rewards/mask_iou_reward": 0.9723014117139748, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8511103391647339, + "rewards/thk_ans_format_reward": 1.0, + "step": 2873, + "think_completion_length": 9.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.83333587646484, + "epoch": 9.708263069139967, + "grad_norm": 25.116639383545913, + "kl": 0.595703125, + "learning_rate": 1.9087837837837837e-07, + "loss": 0.0007, + "reward": 3.62956440448761, + "reward_std": 0.04598809592425823, + "rewards/final_reward": 1.8718766468069932, + "rewards/mask_iou_reward": 0.9359383234034966, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6295644640922546, + "rewards/thk_ans_format_reward": 1.0, + "step": 2874, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.02083587646484, + "epoch": 9.711635750421586, + "grad_norm": 8.665427467336604, + "kl": 0.6279296875, + "learning_rate": 1.9059684684684686e-07, + "loss": 0.0006, + "reward": 3.5676958560943604, + "reward_std": 0.0411482872441411, + "rewards/final_reward": 1.9172117945198663, + "rewards/mask_iou_reward": 0.9586058972599332, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5676957964897156, + "rewards/thk_ans_format_reward": 1.0, + "step": 2875, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.20833587646484, + "epoch": 9.715008431703204, + "grad_norm": 8.656622653908304, + "kl": 0.474609375, + "learning_rate": 1.9031531531531532e-07, + "loss": 0.0005, + "reward": 3.753931999206543, + "reward_std": 0.015061838086694479, + "rewards/final_reward": 1.7427191758720917, + "rewards/mask_iou_reward": 0.8713595879360458, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.753931999206543, + "rewards/thk_ans_format_reward": 1.0, + "step": 2876, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.7604217529297, + "epoch": 9.718381112984822, + "grad_norm": 18.87539927772862, + "kl": 0.451171875, + "learning_rate": 1.9003378378378375e-07, + "loss": 0.0007, + "reward": 3.5150372982025146, + "reward_std": 0.033334359526634216, + "rewards/final_reward": 1.7786438688846506, + "rewards/mask_iou_reward": 0.8893219344423253, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5150372982025146, + "rewards/thk_ans_format_reward": 1.0, + "step": 2877, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.80209350585938, + "epoch": 9.721753794266442, + "grad_norm": 11.569865957597344, + "kl": 0.423828125, + "learning_rate": 1.8975225225225224e-07, + "loss": 0.0004, + "reward": 3.5578192472457886, + "reward_std": 0.025620101019740105, + "rewards/final_reward": 1.266151811013885, + "rewards/mask_iou_reward": 0.6330759055069425, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5578192472457886, + "rewards/thk_ans_format_reward": 1.0, + "step": 2878, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.55209350585938, + "epoch": 9.72512647554806, + "grad_norm": 12.922203046169475, + "kl": 0.4345703125, + "learning_rate": 1.894707207207207e-07, + "loss": 0.0004, + "reward": 3.7492141723632812, + "reward_std": 0.03508290648460388, + "rewards/final_reward": 1.8595868449740705, + "rewards/mask_iou_reward": 0.9297934224870352, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7492140531539917, + "rewards/thk_ans_format_reward": 1.0, + "step": 2879, + "think_completion_length": 10.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.35417938232422, + "epoch": 9.72849915682968, + "grad_norm": 18.81680975098738, + "kl": 0.4453125, + "learning_rate": 1.891891891891892e-07, + "loss": 0.0004, + "reward": 3.4972580671310425, + "reward_std": 0.03990233689546585, + "rewards/final_reward": 1.6644943242104895, + "rewards/mask_iou_reward": 0.8322471621052447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4972580671310425, + "rewards/thk_ans_format_reward": 1.0, + "step": 2880, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.30208587646484, + "epoch": 9.7318718381113, + "grad_norm": 15.639340031886851, + "kl": 0.529296875, + "learning_rate": 1.8890765765765765e-07, + "loss": 0.0005, + "reward": 3.573089361190796, + "reward_std": 0.023514626547694206, + "rewards/final_reward": 1.8379977343968616, + "rewards/mask_iou_reward": 0.9189988671984308, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5730892419815063, + "rewards/thk_ans_format_reward": 1.0, + "step": 2881, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.45834350585938, + "epoch": 9.735244519392918, + "grad_norm": 55.216362046610456, + "kl": 0.46875, + "learning_rate": 1.886261261261261e-07, + "loss": 0.0005, + "reward": 3.632011890411377, + "reward_std": 0.04148270934820175, + "rewards/final_reward": 1.554612728702804, + "rewards/mask_iou_reward": 0.777306364351402, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6320120096206665, + "rewards/thk_ans_format_reward": 1.0, + "step": 2882, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.61458587646484, + "epoch": 9.738617200674536, + "grad_norm": 8.776607905670236, + "kl": 0.50390625, + "learning_rate": 1.883445945945946e-07, + "loss": 0.0006, + "reward": 3.7969205379486084, + "reward_std": 0.03961823880672455, + "rewards/final_reward": 1.7294312710622033, + "rewards/mask_iou_reward": 0.8647156355311016, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7969204783439636, + "rewards/thk_ans_format_reward": 1.0, + "step": 2883, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.6041717529297, + "epoch": 9.741989881956155, + "grad_norm": 28.578509588903785, + "kl": 0.4267578125, + "learning_rate": 1.8806306306306306e-07, + "loss": 0.0004, + "reward": 3.512947916984558, + "reward_std": 0.12944239377975464, + "rewards/final_reward": 1.3326931978912158, + "rewards/mask_iou_reward": 0.6663465989456079, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.512947916984558, + "rewards/thk_ans_format_reward": 1.0, + "step": 2884, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.50000762939453, + "epoch": 9.745362563237775, + "grad_norm": 7.72589885581016, + "kl": 0.4501953125, + "learning_rate": 1.8778153153153154e-07, + "loss": 0.0004, + "reward": 3.344777226448059, + "reward_std": 0.04612966813147068, + "rewards/final_reward": 1.068333717229333, + "rewards/mask_iou_reward": 0.5341668586146665, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3447771072387695, + "rewards/thk_ans_format_reward": 1.0, + "step": 2885, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.02083587646484, + "epoch": 9.748735244519393, + "grad_norm": 25.71317419883287, + "kl": 0.595703125, + "learning_rate": 1.875e-07, + "loss": 0.0006, + "reward": 3.7086650133132935, + "reward_std": 0.08883501403033733, + "rewards/final_reward": 1.6490717348582409, + "rewards/mask_iou_reward": 0.8245358674291204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7086647152900696, + "rewards/thk_ans_format_reward": 1.0, + "step": 2886, + "think_completion_length": 9.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.40625381469727, + "epoch": 9.752107925801011, + "grad_norm": 17.403028726088174, + "kl": 0.55078125, + "learning_rate": 1.8721846846846844e-07, + "loss": 0.0006, + "reward": 3.697953701019287, + "reward_std": 0.0342103186994791, + "rewards/final_reward": 1.7548838832095166, + "rewards/mask_iou_reward": 0.8774419416047583, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6979536414146423, + "rewards/thk_ans_format_reward": 1.0, + "step": 2887, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.89584350585938, + "epoch": 9.75548060708263, + "grad_norm": 13.441340996428666, + "kl": 0.513671875, + "learning_rate": 1.8693693693693693e-07, + "loss": 0.0005, + "reward": 3.58347225189209, + "reward_std": 0.03695745766162872, + "rewards/final_reward": 1.780572817810726, + "rewards/mask_iou_reward": 0.890286408905363, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5834720730781555, + "rewards/thk_ans_format_reward": 1.0, + "step": 2888, + "think_completion_length": 9.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.28125381469727, + "epoch": 9.75885328836425, + "grad_norm": 9.806673786228945, + "kl": 0.57421875, + "learning_rate": 1.8665540540540539e-07, + "loss": 0.0006, + "reward": 3.58203661441803, + "reward_std": 0.12312077358365059, + "rewards/final_reward": 0.9904166333088189, + "rewards/mask_iou_reward": 0.49520831665440945, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5820364356040955, + "rewards/thk_ans_format_reward": 1.0, + "step": 2889, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.4166717529297, + "epoch": 9.762225969645868, + "grad_norm": 8.634095090279637, + "kl": 0.4501953125, + "learning_rate": 1.8637387387387387e-07, + "loss": 0.0005, + "reward": 3.577873945236206, + "reward_std": 0.0438346890732646, + "rewards/final_reward": 1.4734708922293716, + "rewards/mask_iou_reward": 0.7367354461146858, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5778738856315613, + "rewards/thk_ans_format_reward": 1.0, + "step": 2890, + "think_completion_length": 10.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.78126525878906, + "epoch": 9.765598650927487, + "grad_norm": 21.725535355634502, + "kl": 0.509765625, + "learning_rate": 1.8609234234234233e-07, + "loss": 0.0005, + "reward": 3.6148117780685425, + "reward_std": 0.06591962184756994, + "rewards/final_reward": 1.7778298342514938, + "rewards/mask_iou_reward": 0.8889149171257469, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6148120760917664, + "rewards/thk_ans_format_reward": 1.0, + "step": 2891, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.20833587646484, + "epoch": 9.768971332209107, + "grad_norm": 17.3936906485533, + "kl": 0.5234375, + "learning_rate": 1.858108108108108e-07, + "loss": 0.0005, + "reward": 3.6338586807250977, + "reward_std": 0.09483909234404564, + "rewards/final_reward": 1.4055704259070454, + "rewards/mask_iou_reward": 0.7027852129535227, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6338586807250977, + "rewards/thk_ans_format_reward": 1.0, + "step": 2892, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.25000762939453, + "epoch": 9.772344013490725, + "grad_norm": 53.145981829869896, + "kl": 0.560546875, + "learning_rate": 1.8552927927927928e-07, + "loss": 0.0006, + "reward": 3.6119606494903564, + "reward_std": 0.09925832878798246, + "rewards/final_reward": 1.2578269223932477, + "rewards/mask_iou_reward": 0.6289134611966238, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.611960530281067, + "rewards/thk_ans_format_reward": 1.0, + "step": 2893, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.75000762939453, + "epoch": 9.775716694772344, + "grad_norm": 22.358863679413318, + "kl": 0.6171875, + "learning_rate": 1.8524774774774774e-07, + "loss": 0.0006, + "reward": 3.8267993927001953, + "reward_std": 0.020386284217238426, + "rewards/final_reward": 1.9695081912671557, + "rewards/mask_iou_reward": 0.9847540956335779, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8267992734909058, + "rewards/thk_ans_format_reward": 1.0, + "step": 2894, + "think_completion_length": 12.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.75000762939453, + "epoch": 9.779089376053962, + "grad_norm": 10.042252567866315, + "kl": 0.431640625, + "learning_rate": 1.8496621621621623e-07, + "loss": 0.0005, + "reward": 3.6202865839004517, + "reward_std": 0.02719450183212757, + "rewards/final_reward": 1.4129175307550255, + "rewards/mask_iou_reward": 0.7064587653775127, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6202865242958069, + "rewards/thk_ans_format_reward": 1.0, + "step": 2895, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.4791717529297, + "epoch": 9.782462057335582, + "grad_norm": 8.299297282864869, + "kl": 0.66015625, + "learning_rate": 1.8468468468468466e-07, + "loss": 0.0007, + "reward": 3.6827521324157715, + "reward_std": 0.04479054640978575, + "rewards/final_reward": 1.7186386513465273, + "rewards/mask_iou_reward": 0.8593193256732636, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6827520728111267, + "rewards/thk_ans_format_reward": 1.0, + "step": 2896, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.45833587646484, + "epoch": 9.7858347386172, + "grad_norm": 11.577478643284216, + "kl": 0.623046875, + "learning_rate": 1.8440315315315313e-07, + "loss": 0.0006, + "reward": 3.799902319908142, + "reward_std": 0.042380135506391525, + "rewards/final_reward": 1.901752685896824, + "rewards/mask_iou_reward": 0.950876342948412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7999022603034973, + "rewards/thk_ans_format_reward": 1.0, + "step": 2897, + "think_completion_length": 6.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.625, + "epoch": 9.789207419898819, + "grad_norm": 19.355628986540445, + "kl": 0.5244140625, + "learning_rate": 1.841216216216216e-07, + "loss": 0.0005, + "reward": 3.282261848449707, + "reward_std": 0.09237036108970642, + "rewards/final_reward": 1.6548112822568366, + "rewards/mask_iou_reward": 0.8274056411284183, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.282261848449707, + "rewards/thk_ans_format_reward": 1.0, + "step": 2898, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.14583587646484, + "epoch": 9.79258010118044, + "grad_norm": 22.404219362540285, + "kl": 0.4814453125, + "learning_rate": 1.8384009009009007e-07, + "loss": 0.0005, + "reward": 3.1129021644592285, + "reward_std": 0.04305828921496868, + "rewards/final_reward": 0.548002698119196, + "rewards/mask_iou_reward": 0.274001349059598, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1129021644592285, + "rewards/thk_ans_format_reward": 1.0, + "step": 2899, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.8854217529297, + "epoch": 9.795952782462058, + "grad_norm": 12.759625247955837, + "kl": 0.5048828125, + "learning_rate": 1.8355855855855856e-07, + "loss": 0.0005, + "reward": 3.7565758228302, + "reward_std": 0.08746011555194855, + "rewards/final_reward": 1.6317371237065799, + "rewards/mask_iou_reward": 0.8158685618532899, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7565756440162659, + "rewards/thk_ans_format_reward": 1.0, + "step": 2900, + "think_completion_length": 10.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.1875, + "epoch": 9.799325463743676, + "grad_norm": 22.181261915354742, + "kl": 0.44140625, + "learning_rate": 1.8327702702702702e-07, + "loss": 0.0004, + "reward": 3.4462321996688843, + "reward_std": 0.045475758612155914, + "rewards/final_reward": 1.6241891651138398, + "rewards/mask_iou_reward": 0.8120945825569199, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4462321400642395, + "rewards/thk_ans_format_reward": 1.0, + "step": 2901, + "think_completion_length": 7.041666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.90625, + "epoch": 9.802698145025294, + "grad_norm": 19.883643169696228, + "kl": 0.4521484375, + "learning_rate": 1.8299549549549548e-07, + "loss": 0.0005, + "reward": 3.6311463117599487, + "reward_std": 0.02114281803369522, + "rewards/final_reward": 1.7492453606934104, + "rewards/mask_iou_reward": 0.8746226803467052, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6311463713645935, + "rewards/thk_ans_format_reward": 1.0, + "step": 2902, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.5, + "epoch": 9.806070826306915, + "grad_norm": 12.52322065751209, + "kl": 0.458984375, + "learning_rate": 1.8271396396396397e-07, + "loss": 0.0005, + "reward": 3.7113521099090576, + "reward_std": 0.12051836773753166, + "rewards/final_reward": 1.8746252562926338, + "rewards/mask_iou_reward": 0.9373126281463169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7113521099090576, + "rewards/thk_ans_format_reward": 1.0, + "step": 2903, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.96875762939453, + "epoch": 9.809443507588533, + "grad_norm": 27.936075958457913, + "kl": 0.494140625, + "learning_rate": 1.8243243243243243e-07, + "loss": 0.0005, + "reward": 3.5085314512252808, + "reward_std": 0.03908315673470497, + "rewards/final_reward": 1.9092109378444695, + "rewards/mask_iou_reward": 0.9546054689222347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5085315108299255, + "rewards/thk_ans_format_reward": 1.0, + "step": 2904, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.68750762939453, + "epoch": 9.812816188870151, + "grad_norm": 20.574499633313156, + "kl": 0.623046875, + "learning_rate": 1.8215090090090092e-07, + "loss": 0.0008, + "reward": 3.7213168144226074, + "reward_std": 0.09398959390819073, + "rewards/final_reward": 1.4996546278202438, + "rewards/mask_iou_reward": 0.7498273139101219, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7213165760040283, + "rewards/thk_ans_format_reward": 1.0, + "step": 2905, + "think_completion_length": 11.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.25000762939453, + "epoch": 9.816188870151771, + "grad_norm": 14.292492911019973, + "kl": 0.5546875, + "learning_rate": 1.8186936936936935e-07, + "loss": 0.0005, + "reward": 3.7173796892166138, + "reward_std": 0.06229967065155506, + "rewards/final_reward": 1.7996702310086092, + "rewards/mask_iou_reward": 0.8998351155043046, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7173798084259033, + "rewards/thk_ans_format_reward": 1.0, + "step": 2906, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.4375, + "epoch": 9.81956155143339, + "grad_norm": 8.865503077281648, + "kl": 0.4423828125, + "learning_rate": 1.815878378378378e-07, + "loss": 0.0005, + "reward": 3.446324944496155, + "reward_std": 0.04598255269229412, + "rewards/final_reward": 1.935048211500431, + "rewards/mask_iou_reward": 0.9675241057502155, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.44632488489151, + "rewards/thk_ans_format_reward": 1.0, + "step": 2907, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.92709350585938, + "epoch": 9.822934232715008, + "grad_norm": 11.956988910613449, + "kl": 0.470703125, + "learning_rate": 1.813063063063063e-07, + "loss": 0.0005, + "reward": 3.64016330242157, + "reward_std": 0.026767144328914583, + "rewards/final_reward": 1.8412667576827553, + "rewards/mask_iou_reward": 0.9206333788413776, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.640163242816925, + "rewards/thk_ans_format_reward": 1.0, + "step": 2908, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.36458587646484, + "epoch": 9.826306913996627, + "grad_norm": 35.14376581025715, + "kl": 0.630859375, + "learning_rate": 1.8102477477477476e-07, + "loss": 0.0006, + "reward": 3.6144161224365234, + "reward_std": 0.1693898644298315, + "rewards/final_reward": 1.640366187628075, + "rewards/mask_iou_reward": 0.8201830938140375, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6248327493667603, + "rewards/thk_ans_format_reward": 1.0, + "step": 2909, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.56251525878906, + "epoch": 9.829679595278247, + "grad_norm": 10.483673875517827, + "kl": 0.5126953125, + "learning_rate": 1.8074324324324325e-07, + "loss": 0.0005, + "reward": 3.6567115783691406, + "reward_std": 0.03177372459322214, + "rewards/final_reward": 1.540711419191794, + "rewards/mask_iou_reward": 0.770355709595897, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.656711459159851, + "rewards/thk_ans_format_reward": 1.0, + "step": 2910, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.1979217529297, + "epoch": 9.833052276559865, + "grad_norm": 12.243313598862233, + "kl": 0.482421875, + "learning_rate": 1.804617117117117e-07, + "loss": 0.0005, + "reward": 3.477978467941284, + "reward_std": 0.054888444021344185, + "rewards/final_reward": 1.393397765441062, + "rewards/mask_iou_reward": 0.696698882720531, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4779785871505737, + "rewards/thk_ans_format_reward": 1.0, + "step": 2911, + "think_completion_length": 10.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.83333587646484, + "epoch": 9.836424957841484, + "grad_norm": 32.98155743117736, + "kl": 0.44921875, + "learning_rate": 1.8018018018018017e-07, + "loss": 0.0004, + "reward": 3.407157063484192, + "reward_std": 0.038790177553892136, + "rewards/final_reward": 1.738950939104507, + "rewards/mask_iou_reward": 0.8694754695522535, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4071571826934814, + "rewards/thk_ans_format_reward": 1.0, + "step": 2912, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.625, + "epoch": 9.839797639123104, + "grad_norm": 40.04921195295062, + "kl": 0.408203125, + "learning_rate": 1.7989864864864865e-07, + "loss": 0.0004, + "reward": 3.4153835773468018, + "reward_std": 0.022413354832679033, + "rewards/final_reward": 1.0282049835428524, + "rewards/mask_iou_reward": 0.5141024917714262, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4153834581375122, + "rewards/thk_ans_format_reward": 1.0, + "step": 2913, + "think_completion_length": 10.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.42708587646484, + "epoch": 9.843170320404722, + "grad_norm": 10.017276834975831, + "kl": 0.599609375, + "learning_rate": 1.7961711711711712e-07, + "loss": 0.0007, + "reward": 3.7833425998687744, + "reward_std": 0.03610672801733017, + "rewards/final_reward": 1.9598054072300441, + "rewards/mask_iou_reward": 0.9799027036150221, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7833424806594849, + "rewards/thk_ans_format_reward": 1.0, + "step": 2914, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.95833587646484, + "epoch": 9.84654300168634, + "grad_norm": 46.39224566897612, + "kl": 0.564453125, + "learning_rate": 1.7933558558558558e-07, + "loss": 0.0006, + "reward": 3.693509101867676, + "reward_std": 0.023674975149333477, + "rewards/final_reward": 1.8687781753558061, + "rewards/mask_iou_reward": 0.9343890876779031, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6935092210769653, + "rewards/thk_ans_format_reward": 1.0, + "step": 2915, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.20833587646484, + "epoch": 9.849915682967959, + "grad_norm": 33.12544783333367, + "kl": 0.4609375, + "learning_rate": 1.7905405405405404e-07, + "loss": 0.0005, + "reward": 3.379317879676819, + "reward_std": 0.07394935376942158, + "rewards/final_reward": 0.4523041336614774, + "rewards/mask_iou_reward": 0.2261520668307387, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.379317820072174, + "rewards/thk_ans_format_reward": 1.0, + "step": 2916, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 133.8541717529297, + "epoch": 9.853288364249579, + "grad_norm": 5.957307232631729, + "kl": 0.50390625, + "learning_rate": 1.787725225225225e-07, + "loss": 0.0005, + "reward": 3.7431968450546265, + "reward_std": 0.021535064559429884, + "rewards/final_reward": 1.7233288962292024, + "rewards/mask_iou_reward": 0.8616644481146012, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7431967854499817, + "rewards/thk_ans_format_reward": 1.0, + "step": 2917, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.33333587646484, + "epoch": 9.856661045531197, + "grad_norm": 8.247865879330005, + "kl": 0.548828125, + "learning_rate": 1.7849099099099098e-07, + "loss": 0.0006, + "reward": 3.375833749771118, + "reward_std": 0.03770332410931587, + "rewards/final_reward": 1.6320474866468297, + "rewards/mask_iou_reward": 0.8160237433234149, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.375833809375763, + "rewards/thk_ans_format_reward": 1.0, + "step": 2918, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.96875762939453, + "epoch": 9.860033726812816, + "grad_norm": 8.409200834188237, + "kl": 0.4375, + "learning_rate": 1.7820945945945945e-07, + "loss": 0.0004, + "reward": 3.7715072631835938, + "reward_std": 0.02342725871130824, + "rewards/final_reward": 1.5038488635778524, + "rewards/mask_iou_reward": 0.7519244317889262, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7715071439743042, + "rewards/thk_ans_format_reward": 1.0, + "step": 2919, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.28125762939453, + "epoch": 9.863406408094434, + "grad_norm": 27.130241868912144, + "kl": 0.5166015625, + "learning_rate": 1.779279279279279e-07, + "loss": 0.0005, + "reward": 3.368473529815674, + "reward_std": 0.07650148123502731, + "rewards/final_reward": 1.604985981721935, + "rewards/mask_iou_reward": 0.8024929908609675, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.368473470211029, + "rewards/thk_ans_format_reward": 1.0, + "step": 2920, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.73959350585938, + "epoch": 9.866779089376054, + "grad_norm": 12.110615569131403, + "kl": 0.984375, + "learning_rate": 1.776463963963964e-07, + "loss": 0.001, + "reward": 3.820041060447693, + "reward_std": 0.02060036826878786, + "rewards/final_reward": 1.7444984203992941, + "rewards/mask_iou_reward": 0.8722492101996471, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8200412392616272, + "rewards/thk_ans_format_reward": 1.0, + "step": 2921, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.6875, + "epoch": 9.870151770657673, + "grad_norm": 10.618169954242731, + "kl": 0.484375, + "learning_rate": 1.7736486486486485e-07, + "loss": 0.0005, + "reward": 3.7311623096466064, + "reward_std": 0.0554316071793437, + "rewards/final_reward": 1.8880639926998695, + "rewards/mask_iou_reward": 0.9440319963499347, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.731162190437317, + "rewards/thk_ans_format_reward": 1.0, + "step": 2922, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.43750762939453, + "epoch": 9.873524451939291, + "grad_norm": 10.306036874313062, + "kl": 0.45703125, + "learning_rate": 1.7708333333333334e-07, + "loss": 0.0005, + "reward": 3.725971221923828, + "reward_std": 0.049082960933446884, + "rewards/final_reward": 1.800602803022311, + "rewards/mask_iou_reward": 0.9003014015111555, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7259709239006042, + "rewards/thk_ans_format_reward": 1.0, + "step": 2923, + "think_completion_length": 6.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.9479217529297, + "epoch": 9.876897133220911, + "grad_norm": 11.546465496663332, + "kl": 0.4404296875, + "learning_rate": 1.768018018018018e-07, + "loss": 0.0005, + "reward": 3.609371066093445, + "reward_std": 0.03989543952047825, + "rewards/final_reward": 1.694492264856553, + "rewards/mask_iou_reward": 0.8472461324282765, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6093710660934448, + "rewards/thk_ans_format_reward": 1.0, + "step": 2924, + "think_completion_length": 8.958333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.8229217529297, + "epoch": 9.88026981450253, + "grad_norm": 29.196481852946373, + "kl": 0.4521484375, + "learning_rate": 1.7652027027027026e-07, + "loss": 0.0005, + "reward": 3.4165241718292236, + "reward_std": 0.04025060310959816, + "rewards/final_reward": 1.485056108184279, + "rewards/mask_iou_reward": 0.7425280540921395, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4165241718292236, + "rewards/thk_ans_format_reward": 1.0, + "step": 2925, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.37500762939453, + "epoch": 9.883642495784148, + "grad_norm": 11.587141178290794, + "kl": 0.4248046875, + "learning_rate": 1.7623873873873872e-07, + "loss": 0.0004, + "reward": 3.633195996284485, + "reward_std": 0.05130454897880554, + "rewards/final_reward": 1.6711122317909908, + "rewards/mask_iou_reward": 0.8355561158954954, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6331957578659058, + "rewards/thk_ans_format_reward": 1.0, + "step": 2926, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.08333587646484, + "epoch": 9.887015177065766, + "grad_norm": 13.206269452632558, + "kl": 0.5595703125, + "learning_rate": 1.7595720720720718e-07, + "loss": 0.0006, + "reward": 3.688322424888611, + "reward_std": 0.03490753611549735, + "rewards/final_reward": 1.8829596973409148, + "rewards/mask_iou_reward": 0.9414798486704574, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6883226037025452, + "rewards/thk_ans_format_reward": 1.0, + "step": 2927, + "think_completion_length": 11.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.20833587646484, + "epoch": 9.890387858347387, + "grad_norm": 15.933006268489812, + "kl": 0.509765625, + "learning_rate": 1.7567567567567567e-07, + "loss": 0.0006, + "reward": 3.5859495401382446, + "reward_std": 0.09474263805896044, + "rewards/final_reward": 1.5611196697849357, + "rewards/mask_iou_reward": 0.7805598348924678, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5859493613243103, + "rewards/thk_ans_format_reward": 1.0, + "step": 2928, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.3541717529297, + "epoch": 9.893760539629005, + "grad_norm": 20.903893582274613, + "kl": 0.498046875, + "learning_rate": 1.7539414414414413e-07, + "loss": 0.0005, + "reward": 3.428972840309143, + "reward_std": 0.10967486724257469, + "rewards/final_reward": 1.555474882501772, + "rewards/mask_iou_reward": 0.777737441250886, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4289729595184326, + "rewards/thk_ans_format_reward": 1.0, + "step": 2929, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.93750762939453, + "epoch": 9.897133220910623, + "grad_norm": 17.340904195009937, + "kl": 0.421875, + "learning_rate": 1.751126126126126e-07, + "loss": 0.0005, + "reward": 3.2104990482330322, + "reward_std": 0.03169256402179599, + "rewards/final_reward": 0.706509706649572, + "rewards/mask_iou_reward": 0.353254853324786, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2104989886283875, + "rewards/thk_ans_format_reward": 1.0, + "step": 2930, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.9166717529297, + "epoch": 9.900505902192243, + "grad_norm": 9.984017440437876, + "kl": 0.533203125, + "learning_rate": 1.7483108108108108e-07, + "loss": 0.0005, + "reward": 3.6238770484924316, + "reward_std": 0.04825960611924529, + "rewards/final_reward": 1.497253118569068, + "rewards/mask_iou_reward": 0.748626559284534, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.623877227306366, + "rewards/thk_ans_format_reward": 1.0, + "step": 2931, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.27083587646484, + "epoch": 9.903878583473862, + "grad_norm": 6.951641531264174, + "kl": 0.5078125, + "learning_rate": 1.7454954954954954e-07, + "loss": 0.0005, + "reward": 3.6145602464675903, + "reward_std": 0.09818883240222931, + "rewards/final_reward": 1.7906074611252016, + "rewards/mask_iou_reward": 0.8953037305626008, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6145601868629456, + "rewards/thk_ans_format_reward": 1.0, + "step": 2932, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 148.37500762939453, + "epoch": 9.90725126475548, + "grad_norm": 76.02270592629036, + "kl": 0.5341796875, + "learning_rate": 1.7426801801801803e-07, + "loss": 0.0005, + "reward": 3.306624174118042, + "reward_std": 0.14644039422273636, + "rewards/final_reward": 0.8189659901344464, + "rewards/mask_iou_reward": 0.4094829950672232, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3066240549087524, + "rewards/thk_ans_format_reward": 1.0, + "step": 2933, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.55208587646484, + "epoch": 9.910623946037099, + "grad_norm": 8.56338879773896, + "kl": 0.4306640625, + "learning_rate": 1.739864864864865e-07, + "loss": 0.0004, + "reward": 3.6486175060272217, + "reward_std": 0.026789831928908825, + "rewards/final_reward": 1.8038686547675227, + "rewards/mask_iou_reward": 0.9019343273837613, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6486175060272217, + "rewards/thk_ans_format_reward": 1.0, + "step": 2934, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.59375762939453, + "epoch": 9.913996627318719, + "grad_norm": 11.418503877009623, + "kl": 0.400390625, + "learning_rate": 1.7370495495495495e-07, + "loss": 0.0004, + "reward": 3.7190651893615723, + "reward_std": 0.13147340714931488, + "rewards/final_reward": 1.2865869158245113, + "rewards/mask_iou_reward": 0.6432934579122557, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7190650701522827, + "rewards/thk_ans_format_reward": 1.0, + "step": 2935, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.71875, + "epoch": 9.917369308600337, + "grad_norm": 9.971262476586999, + "kl": 0.45703125, + "learning_rate": 1.734234234234234e-07, + "loss": 0.0004, + "reward": 3.8314409255981445, + "reward_std": 0.03323422558605671, + "rewards/final_reward": 1.8260853344315668, + "rewards/mask_iou_reward": 0.9130426672157834, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8314408659934998, + "rewards/thk_ans_format_reward": 1.0, + "step": 2936, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.18750381469727, + "epoch": 9.920741989881956, + "grad_norm": 36.2502426585584, + "kl": 0.498046875, + "learning_rate": 1.7314189189189187e-07, + "loss": 0.0005, + "reward": 3.436210036277771, + "reward_std": 0.152422234416008, + "rewards/final_reward": 1.5981619230412196, + "rewards/mask_iou_reward": 0.7990809615206098, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4466266632080078, + "rewards/thk_ans_format_reward": 1.0, + "step": 2937, + "think_completion_length": 8.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.5729217529297, + "epoch": 9.924114671163576, + "grad_norm": 11.851306195065153, + "kl": 0.498046875, + "learning_rate": 1.7286036036036036e-07, + "loss": 0.0005, + "reward": 3.2320234775543213, + "reward_std": 0.08792375959455967, + "rewards/final_reward": 0.45280834392977154, + "rewards/mask_iou_reward": 0.22640417196488577, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2320235967636108, + "rewards/thk_ans_format_reward": 1.0, + "step": 2938, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.84375762939453, + "epoch": 9.927487352445194, + "grad_norm": 15.982465512064199, + "kl": 0.453125, + "learning_rate": 1.7257882882882882e-07, + "loss": 0.0005, + "reward": 3.6887292861938477, + "reward_std": 0.017841395922005177, + "rewards/final_reward": 1.4235046997267258, + "rewards/mask_iou_reward": 0.7117523498633629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6887291073799133, + "rewards/thk_ans_format_reward": 1.0, + "step": 2939, + "think_completion_length": 10.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.0104217529297, + "epoch": 9.930860033726812, + "grad_norm": 14.291848001764228, + "kl": 0.498046875, + "learning_rate": 1.7229729729729728e-07, + "loss": 0.0005, + "reward": 3.6745972633361816, + "reward_std": 0.06178950145840645, + "rewards/final_reward": 1.9635444240660527, + "rewards/mask_iou_reward": 0.9817722120330263, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6745970845222473, + "rewards/thk_ans_format_reward": 1.0, + "step": 2940, + "think_completion_length": 9.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.2604217529297, + "epoch": 9.93423271500843, + "grad_norm": 7.356592155605947, + "kl": 0.453125, + "learning_rate": 1.7201576576576577e-07, + "loss": 0.0005, + "reward": 3.3794697523117065, + "reward_std": 0.09687015041708946, + "rewards/final_reward": 1.695792669363521, + "rewards/mask_iou_reward": 0.8478963346817605, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3794699907302856, + "rewards/thk_ans_format_reward": 1.0, + "step": 2941, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.0416717529297, + "epoch": 9.937605396290051, + "grad_norm": 11.633500137625838, + "kl": 0.3662109375, + "learning_rate": 1.7173423423423423e-07, + "loss": 0.0004, + "reward": 3.1571407318115234, + "reward_std": 0.03151166997849941, + "rewards/final_reward": 0.9415822823880424, + "rewards/mask_iou_reward": 0.4707911411940212, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.157140702009201, + "rewards/thk_ans_format_reward": 1.0, + "step": 2942, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.42708587646484, + "epoch": 9.94097807757167, + "grad_norm": 16.193175894939944, + "kl": 0.61328125, + "learning_rate": 1.714527027027027e-07, + "loss": 0.0006, + "reward": 3.3838253021240234, + "reward_std": 0.1029960885643959, + "rewards/final_reward": 1.0003497415716365, + "rewards/mask_iou_reward": 0.5001748707858182, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3838250637054443, + "rewards/thk_ans_format_reward": 1.0, + "step": 2943, + "think_completion_length": 11.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.8958396911621, + "epoch": 9.944350758853288, + "grad_norm": 10.13797614381608, + "kl": 0.505859375, + "learning_rate": 1.7117117117117117e-07, + "loss": 0.0005, + "reward": 3.6523609161376953, + "reward_std": 0.044180636294186115, + "rewards/final_reward": 1.8727462066479181, + "rewards/mask_iou_reward": 0.9363731033239591, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.652361273765564, + "rewards/thk_ans_format_reward": 1.0, + "step": 2944, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.67709350585938, + "epoch": 9.947723440134908, + "grad_norm": 11.285243604780446, + "kl": 0.51171875, + "learning_rate": 1.7088963963963963e-07, + "loss": 0.0005, + "reward": 3.1645601987838745, + "reward_std": 0.09775098785758018, + "rewards/final_reward": 1.036894984210309, + "rewards/mask_iou_reward": 0.5184474921051545, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.164560079574585, + "rewards/thk_ans_format_reward": 1.0, + "step": 2945, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.43750762939453, + "epoch": 9.951096121416526, + "grad_norm": 10.555231941128286, + "kl": 0.435546875, + "learning_rate": 1.706081081081081e-07, + "loss": 0.0004, + "reward": 3.799447178840637, + "reward_std": 0.022393792401999235, + "rewards/final_reward": 1.7339816399490353, + "rewards/mask_iou_reward": 0.8669908199745177, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7994474172592163, + "rewards/thk_ans_format_reward": 1.0, + "step": 2946, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.68750762939453, + "epoch": 9.954468802698145, + "grad_norm": 8.845005544933374, + "kl": 0.4521484375, + "learning_rate": 1.7032657657657656e-07, + "loss": 0.0005, + "reward": 3.6235469579696655, + "reward_std": 0.055221643298864365, + "rewards/final_reward": 1.281981455757465, + "rewards/mask_iou_reward": 0.6409907278787325, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6235471963882446, + "rewards/thk_ans_format_reward": 1.0, + "step": 2947, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 153.89584350585938, + "epoch": 9.957841483979763, + "grad_norm": 11.47169436446601, + "kl": 0.5224609375, + "learning_rate": 1.7004504504504504e-07, + "loss": 0.0005, + "reward": 3.5237936973571777, + "reward_std": 0.07607119157910347, + "rewards/final_reward": 1.8342230228232848, + "rewards/mask_iou_reward": 0.9171115114116424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5237935781478882, + "rewards/thk_ans_format_reward": 1.0, + "step": 2948, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.1875, + "epoch": 9.961214165261383, + "grad_norm": 10.261729370598339, + "kl": 0.43359375, + "learning_rate": 1.697635135135135e-07, + "loss": 0.0004, + "reward": 3.6379356384277344, + "reward_std": 0.03940104506909847, + "rewards/final_reward": 1.620287095778117, + "rewards/mask_iou_reward": 0.8101435478890585, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.637935757637024, + "rewards/thk_ans_format_reward": 1.0, + "step": 2949, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.5416717529297, + "epoch": 9.964586846543002, + "grad_norm": 17.48974843978608, + "kl": 0.97265625, + "learning_rate": 1.6948198198198196e-07, + "loss": 0.001, + "reward": 3.606302499771118, + "reward_std": 0.04735631635412574, + "rewards/final_reward": 1.796730760126513, + "rewards/mask_iou_reward": 0.8983653800632565, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6063026189804077, + "rewards/thk_ans_format_reward": 1.0, + "step": 2950, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.3854217529297, + "epoch": 9.96795952782462, + "grad_norm": 10.82864820514153, + "kl": 0.4345703125, + "learning_rate": 1.6920045045045045e-07, + "loss": 0.0004, + "reward": 3.7164015769958496, + "reward_std": 0.11439871042966843, + "rewards/final_reward": 1.6110125597812006, + "rewards/mask_iou_reward": 0.8055062798906003, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.71640145778656, + "rewards/thk_ans_format_reward": 1.0, + "step": 2951, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.20833587646484, + "epoch": 9.97133220910624, + "grad_norm": 110.90492820482815, + "kl": 0.470703125, + "learning_rate": 1.689189189189189e-07, + "loss": 0.0005, + "reward": 3.732366681098938, + "reward_std": 0.0235447958111763, + "rewards/final_reward": 1.9120111381690408, + "rewards/mask_iou_reward": 0.9560055690845204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7323666214942932, + "rewards/thk_ans_format_reward": 1.0, + "step": 2952, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.6458435058594, + "epoch": 9.974704890387859, + "grad_norm": 9.008389810735736, + "kl": 1.67578125, + "learning_rate": 1.686373873873874e-07, + "loss": 0.0017, + "reward": 3.518873453140259, + "reward_std": 0.03557255119085312, + "rewards/final_reward": 1.5394322548166186, + "rewards/mask_iou_reward": 0.7697161274083093, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.518873393535614, + "rewards/thk_ans_format_reward": 1.0, + "step": 2953, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.45834350585938, + "epoch": 9.978077571669477, + "grad_norm": 14.853078093798148, + "kl": 0.4912109375, + "learning_rate": 1.6835585585585586e-07, + "loss": 0.0005, + "reward": 3.7388641834259033, + "reward_std": 0.048480624333024025, + "rewards/final_reward": 1.8573203732843733, + "rewards/mask_iou_reward": 0.9286601866421866, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7388643026351929, + "rewards/thk_ans_format_reward": 1.0, + "step": 2954, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.39583587646484, + "epoch": 9.981450252951095, + "grad_norm": 22.784244760850683, + "kl": 0.857421875, + "learning_rate": 1.680743243243243e-07, + "loss": 0.0009, + "reward": 3.455596923828125, + "reward_std": 0.05335235968232155, + "rewards/final_reward": 1.6826988393083862, + "rewards/mask_iou_reward": 0.8413494196541931, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4555969834327698, + "rewards/thk_ans_format_reward": 1.0, + "step": 2955, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.03125, + "epoch": 9.984822934232715, + "grad_norm": 22.288546418231373, + "kl": 0.4384765625, + "learning_rate": 1.6779279279279278e-07, + "loss": 0.0004, + "reward": 3.4251439571380615, + "reward_std": 0.028437476605176926, + "rewards/final_reward": 1.1720492580818076, + "rewards/mask_iou_reward": 0.5860246290409038, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4251437187194824, + "rewards/thk_ans_format_reward": 1.0, + "step": 2956, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.03125, + "epoch": 9.988195615514334, + "grad_norm": 16.51769126088518, + "kl": 0.60546875, + "learning_rate": 1.6751126126126124e-07, + "loss": 0.0006, + "reward": 3.3870999813079834, + "reward_std": 0.05538425035774708, + "rewards/final_reward": 1.507213691171632, + "rewards/mask_iou_reward": 0.753606845585816, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3870996832847595, + "rewards/thk_ans_format_reward": 1.0, + "step": 2957, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.875, + "epoch": 9.991568296795952, + "grad_norm": 7.731847744255573, + "kl": 0.517578125, + "learning_rate": 1.6722972972972973e-07, + "loss": 0.0005, + "reward": 3.5086575746536255, + "reward_std": 0.038238752633333206, + "rewards/final_reward": 1.5398872895969526, + "rewards/mask_iou_reward": 0.7699436447984763, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5086576342582703, + "rewards/thk_ans_format_reward": 1.0, + "step": 2958, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.33333587646484, + "epoch": 9.994940978077572, + "grad_norm": 31.29226900190095, + "kl": 0.40234375, + "learning_rate": 1.669481981981982e-07, + "loss": 0.0004, + "reward": 3.445936441421509, + "reward_std": 0.21446402929723263, + "rewards/final_reward": 1.4543522509606437, + "rewards/mask_iou_reward": 0.7271761254803218, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.4771862030029297, + "rewards/thk_ans_format_reward": 1.0, + "step": 2959, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.78948211669922, + "epoch": 9.99831365935919, + "grad_norm": 13.639265248226923, + "kl": 0.5166015625, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0005, + "reward": 3.4749454259872437, + "reward_std": 0.021927848923951387, + "rewards/final_reward": 1.6298833637258023, + "rewards/mask_iou_reward": 0.8149416818629012, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4749454855918884, + "rewards/thk_ans_format_reward": 1.0, + "step": 2960, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.18750762939453, + "epoch": 10.003372681281618, + "grad_norm": 11.074702040879929, + "kl": 0.517578125, + "learning_rate": 1.6638513513513514e-07, + "loss": 0.0005, + "reward": 3.554487109184265, + "reward_std": 0.057201748713850975, + "rewards/final_reward": 1.7276102054288849, + "rewards/mask_iou_reward": 0.8638051027144424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.55448716878891, + "rewards/thk_ans_format_reward": 1.0, + "step": 2961, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 136.3958396911621, + "epoch": 10.006745362563239, + "grad_norm": 17.21456270645261, + "kl": 0.5576171875, + "learning_rate": 1.661036036036036e-07, + "loss": 0.0005, + "reward": 3.784704804420471, + "reward_std": 0.0637154346331954, + "rewards/final_reward": 1.7564609223970806, + "rewards/mask_iou_reward": 0.8782304611985403, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.784704566001892, + "rewards/thk_ans_format_reward": 1.0, + "step": 2962, + "think_completion_length": 9.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.1354217529297, + "epoch": 10.010118043844857, + "grad_norm": 10.966578836490807, + "kl": 0.4501953125, + "learning_rate": 1.6582207207207209e-07, + "loss": 0.0005, + "reward": 3.706210970878601, + "reward_std": 0.034264068119227886, + "rewards/final_reward": 1.6498285462680125, + "rewards/mask_iou_reward": 0.8249142731340062, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.706210970878601, + "rewards/thk_ans_format_reward": 1.0, + "step": 2963, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.33333587646484, + "epoch": 10.013490725126475, + "grad_norm": 14.586749604154246, + "kl": 0.400390625, + "learning_rate": 1.6554054054054055e-07, + "loss": 0.0005, + "reward": 3.7034146785736084, + "reward_std": 0.15973122231662273, + "rewards/final_reward": 1.8500299913430904, + "rewards/mask_iou_reward": 0.9250149956715452, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.7242479920387268, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2964, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.98958587646484, + "epoch": 10.016863406408094, + "grad_norm": 16.797745597040446, + "kl": 0.5888671875, + "learning_rate": 1.6525900900900898e-07, + "loss": 0.0006, + "reward": 3.473345994949341, + "reward_std": 0.02174593461677432, + "rewards/final_reward": 1.1856811101943596, + "rewards/mask_iou_reward": 0.5928405550971798, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4733460545539856, + "rewards/thk_ans_format_reward": 1.0, + "step": 2965, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.33334350585938, + "epoch": 10.020236087689714, + "grad_norm": 18.3615049832474, + "kl": 0.443359375, + "learning_rate": 1.6497747747747747e-07, + "loss": 0.0004, + "reward": 3.4337416887283325, + "reward_std": 0.05834187474101782, + "rewards/final_reward": 1.1960408761287606, + "rewards/mask_iou_reward": 0.5980204380643803, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4337416887283325, + "rewards/thk_ans_format_reward": 1.0, + "step": 2966, + "think_completion_length": 9.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.86458587646484, + "epoch": 10.023608768971332, + "grad_norm": 17.822461269329146, + "kl": 0.68359375, + "learning_rate": 1.6469594594594593e-07, + "loss": 0.0007, + "reward": 3.599363684654236, + "reward_std": 0.01909334654919803, + "rewards/final_reward": 1.160729292363, + "rewards/mask_iou_reward": 0.5803646461815, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5993636846542358, + "rewards/thk_ans_format_reward": 1.0, + "step": 2967, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.39583587646484, + "epoch": 10.02698145025295, + "grad_norm": 6.870113350227844, + "kl": 0.638671875, + "learning_rate": 1.6441441441441442e-07, + "loss": 0.0007, + "reward": 3.8057702779769897, + "reward_std": 0.014783780090510845, + "rewards/final_reward": 1.8299943991686436, + "rewards/mask_iou_reward": 0.9149971995843218, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.805770218372345, + "rewards/thk_ans_format_reward": 1.0, + "step": 2968, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.67708587646484, + "epoch": 10.03035413153457, + "grad_norm": 15.001810746884246, + "kl": 0.423828125, + "learning_rate": 1.6413288288288288e-07, + "loss": 0.0004, + "reward": 3.303891658782959, + "reward_std": 0.12750976346433163, + "rewards/final_reward": 1.1526862911700408, + "rewards/mask_iou_reward": 0.5763431455850204, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.303891658782959, + "rewards/thk_ans_format_reward": 1.0, + "step": 2969, + "think_completion_length": 9.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.38543701171875, + "epoch": 10.03372681281619, + "grad_norm": 7.303224734033114, + "kl": 0.3857421875, + "learning_rate": 1.6385135135135134e-07, + "loss": 0.0004, + "reward": 3.4886913299560547, + "reward_std": 0.1877976879477501, + "rewards/final_reward": 1.896774348900549, + "rewards/mask_iou_reward": 0.9483871744502745, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5303580164909363, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2970, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.88542938232422, + "epoch": 10.037099494097808, + "grad_norm": 8.140977371174358, + "kl": 0.4140625, + "learning_rate": 1.6356981981981982e-07, + "loss": 0.0004, + "reward": 3.4109930992126465, + "reward_std": 0.14596521109342575, + "rewards/final_reward": 1.4359857698723864, + "rewards/mask_iou_reward": 0.7179928849361932, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4318262338638306, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2971, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.1250114440918, + "epoch": 10.040472175379426, + "grad_norm": 6.908211064573147, + "kl": 0.4951171875, + "learning_rate": 1.6328828828828828e-07, + "loss": 0.0005, + "reward": 3.5527108907699585, + "reward_std": 0.0504262950271368, + "rewards/final_reward": 1.452446585897007, + "rewards/mask_iou_reward": 0.7262232929485035, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5527108311653137, + "rewards/thk_ans_format_reward": 1.0, + "step": 2972, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.09375762939453, + "epoch": 10.043844856661046, + "grad_norm": 17.422370241399438, + "kl": 0.5751953125, + "learning_rate": 1.6300675675675674e-07, + "loss": 0.0006, + "reward": 3.5222601890563965, + "reward_std": 0.0661549512296915, + "rewards/final_reward": 1.777932428648302, + "rewards/mask_iou_reward": 0.888966214324151, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5222598910331726, + "rewards/thk_ans_format_reward": 1.0, + "step": 2973, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.7291717529297, + "epoch": 10.047217537942664, + "grad_norm": 14.254621038601908, + "kl": 0.419921875, + "learning_rate": 1.6272522522522523e-07, + "loss": 0.0004, + "reward": 3.6620055437088013, + "reward_std": 0.08311491832137108, + "rewards/final_reward": 1.4122443571993455, + "rewards/mask_iou_reward": 0.7061221785996727, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6620052456855774, + "rewards/thk_ans_format_reward": 1.0, + "step": 2974, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.56250762939453, + "epoch": 10.050590219224283, + "grad_norm": 16.099298514297175, + "kl": 0.4072265625, + "learning_rate": 1.6244369369369367e-07, + "loss": 0.0004, + "reward": 3.5003554821014404, + "reward_std": 0.03283052425831556, + "rewards/final_reward": 1.4058345564704786, + "rewards/mask_iou_reward": 0.7029172782352393, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.500355303287506, + "rewards/thk_ans_format_reward": 1.0, + "step": 2975, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 147.55208587646484, + "epoch": 10.053962900505903, + "grad_norm": 13.845801977509861, + "kl": 0.61328125, + "learning_rate": 1.6216216216216215e-07, + "loss": 0.0006, + "reward": 3.2513362169265747, + "reward_std": 0.11728479154407978, + "rewards/final_reward": 0.5600943317194658, + "rewards/mask_iou_reward": 0.2800471658597329, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.251335859298706, + "rewards/thk_ans_format_reward": 1.0, + "step": 2976, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 135.75, + "epoch": 10.057335581787521, + "grad_norm": 9.462435762142102, + "kl": 0.494140625, + "learning_rate": 1.6188063063063061e-07, + "loss": 0.0005, + "reward": 3.6380168199539185, + "reward_std": 0.014404607936739922, + "rewards/final_reward": 1.2512799839296525, + "rewards/mask_iou_reward": 0.6256399919648262, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6380165815353394, + "rewards/thk_ans_format_reward": 1.0, + "step": 2977, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.4791717529297, + "epoch": 10.06070826306914, + "grad_norm": 32.37701361935778, + "kl": 0.642578125, + "learning_rate": 1.6159909909909907e-07, + "loss": 0.0007, + "reward": 3.2058398723602295, + "reward_std": 0.07977872295305133, + "rewards/final_reward": 0.7515505715066793, + "rewards/mask_iou_reward": 0.3757752857533396, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2058398127555847, + "rewards/thk_ans_format_reward": 1.0, + "step": 2978, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.9791717529297, + "epoch": 10.064080944350758, + "grad_norm": 101.75506967696498, + "kl": 0.509765625, + "learning_rate": 1.6131756756756756e-07, + "loss": 0.0005, + "reward": 3.602941393852234, + "reward_std": 0.10243973135948181, + "rewards/final_reward": 1.7877489590511702, + "rewards/mask_iou_reward": 0.8938744795255851, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.602941632270813, + "rewards/thk_ans_format_reward": 1.0, + "step": 2979, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.62500762939453, + "epoch": 10.067453625632378, + "grad_norm": 11.296191395336175, + "kl": 0.427734375, + "learning_rate": 1.6103603603603602e-07, + "loss": 0.0005, + "reward": 3.6917929649353027, + "reward_std": 0.09627137146890163, + "rewards/final_reward": 1.882786193694157, + "rewards/mask_iou_reward": 0.9413930968470785, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.691792905330658, + "rewards/thk_ans_format_reward": 1.0, + "step": 2980, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.7604217529297, + "epoch": 10.070826306913997, + "grad_norm": 9.002201625690828, + "kl": 0.4228515625, + "learning_rate": 1.607545045045045e-07, + "loss": 0.0004, + "reward": 3.6802202463150024, + "reward_std": 0.05004505813121796, + "rewards/final_reward": 1.4352498487281913, + "rewards/mask_iou_reward": 0.7176249243640956, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6802199482917786, + "rewards/thk_ans_format_reward": 1.0, + "step": 2981, + "think_completion_length": 10.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.65625762939453, + "epoch": 10.074198988195615, + "grad_norm": 7.67063156372946, + "kl": 0.4111328125, + "learning_rate": 1.6047297297297297e-07, + "loss": 0.0006, + "reward": 3.744957447052002, + "reward_std": 0.021554138511419296, + "rewards/final_reward": 1.5525201778527098, + "rewards/mask_iou_reward": 0.7762600889263549, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.744957447052002, + "rewards/thk_ans_format_reward": 1.0, + "step": 2982, + "think_completion_length": 6.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.5416717529297, + "epoch": 10.077571669477235, + "grad_norm": 20.886407622595403, + "kl": 0.4521484375, + "learning_rate": 1.6019144144144143e-07, + "loss": 0.0005, + "reward": 3.314300298690796, + "reward_std": 0.13691934198141098, + "rewards/final_reward": 1.1969343026669517, + "rewards/mask_iou_reward": 0.5984671513334758, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3143000602722168, + "rewards/thk_ans_format_reward": 1.0, + "step": 2983, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.09375762939453, + "epoch": 10.080944350758854, + "grad_norm": 16.196788087359803, + "kl": 0.607421875, + "learning_rate": 1.5990990990990992e-07, + "loss": 0.0006, + "reward": 3.558018445968628, + "reward_std": 0.027575062587857246, + "rewards/final_reward": 1.7583389896541204, + "rewards/mask_iou_reward": 0.8791694948270602, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5580185651779175, + "rewards/thk_ans_format_reward": 1.0, + "step": 2984, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.7812614440918, + "epoch": 10.084317032040472, + "grad_norm": 10.59394553028089, + "kl": 0.435546875, + "learning_rate": 1.5962837837837835e-07, + "loss": 0.0004, + "reward": 3.549351930618286, + "reward_std": 0.057122400030493736, + "rewards/final_reward": 1.7864545330082315, + "rewards/mask_iou_reward": 0.8932272665041158, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5493518710136414, + "rewards/thk_ans_format_reward": 1.0, + "step": 2985, + "think_completion_length": 9.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.0104217529297, + "epoch": 10.08768971332209, + "grad_norm": 7.993621357063377, + "kl": 0.58203125, + "learning_rate": 1.5934684684684684e-07, + "loss": 0.0006, + "reward": 3.766826868057251, + "reward_std": 0.034018273930996656, + "rewards/final_reward": 1.9693942374379159, + "rewards/mask_iou_reward": 0.9846971187189579, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7668269276618958, + "rewards/thk_ans_format_reward": 1.0, + "step": 2986, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.40626525878906, + "epoch": 10.09106239460371, + "grad_norm": 7.8579908910789, + "kl": 0.474609375, + "learning_rate": 1.590653153153153e-07, + "loss": 0.0006, + "reward": 3.545803666114807, + "reward_std": 0.16862626932561398, + "rewards/final_reward": 1.7307227523857664, + "rewards/mask_iou_reward": 0.8653613761928832, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5666367411613464, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 2987, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.4166717529297, + "epoch": 10.094435075885329, + "grad_norm": 14.888977164594754, + "kl": 0.392578125, + "learning_rate": 1.5878378378378376e-07, + "loss": 0.0004, + "reward": 3.4926047325134277, + "reward_std": 0.1254997132346034, + "rewards/final_reward": 1.6567365575695758, + "rewards/mask_iou_reward": 0.8283682787847879, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.534271240234375, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 2988, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.0520896911621, + "epoch": 10.097807757166947, + "grad_norm": 178.17919910050486, + "kl": 0.509765625, + "learning_rate": 1.5850225225225225e-07, + "loss": 0.0005, + "reward": 3.5052605867385864, + "reward_std": 0.07205817103385925, + "rewards/final_reward": 1.3663548258724365, + "rewards/mask_iou_reward": 0.6831774129362183, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5052603483200073, + "rewards/thk_ans_format_reward": 1.0, + "step": 2989, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.7604217529297, + "epoch": 10.101180438448566, + "grad_norm": 9.404698816959156, + "kl": 0.515625, + "learning_rate": 1.582207207207207e-07, + "loss": 0.0005, + "reward": 3.4773871898651123, + "reward_std": 0.04026375897228718, + "rewards/final_reward": 1.5072943995416463, + "rewards/mask_iou_reward": 0.7536471997708232, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4773871302604675, + "rewards/thk_ans_format_reward": 1.0, + "step": 2990, + "think_completion_length": 6.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.90625, + "epoch": 10.104553119730186, + "grad_norm": 17.635410219594927, + "kl": 0.564453125, + "learning_rate": 1.579391891891892e-07, + "loss": 0.0006, + "reward": 3.370635747909546, + "reward_std": 0.08598719723522663, + "rewards/final_reward": 0.9948692949078439, + "rewards/mask_iou_reward": 0.49743464745392196, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3706358075141907, + "rewards/thk_ans_format_reward": 1.0, + "step": 2991, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.53125762939453, + "epoch": 10.107925801011804, + "grad_norm": 8.498665489249783, + "kl": 0.416015625, + "learning_rate": 1.5765765765765766e-07, + "loss": 0.0004, + "reward": 3.434711217880249, + "reward_std": 0.053263518027961254, + "rewards/final_reward": 1.2998215907954818, + "rewards/mask_iou_reward": 0.6499107953977409, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.434711217880249, + "rewards/thk_ans_format_reward": 1.0, + "step": 2992, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.14583587646484, + "epoch": 10.111298482293423, + "grad_norm": 11.28021036658622, + "kl": 0.4296875, + "learning_rate": 1.5737612612612612e-07, + "loss": 0.0004, + "reward": 3.796360492706299, + "reward_std": 0.03567369282245636, + "rewards/final_reward": 1.616571069132243, + "rewards/mask_iou_reward": 0.8082855345661215, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7963602542877197, + "rewards/thk_ans_format_reward": 1.0, + "step": 2993, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.89583587646484, + "epoch": 10.114671163575043, + "grad_norm": 12.73797474710979, + "kl": 0.5234375, + "learning_rate": 1.570945945945946e-07, + "loss": 0.0005, + "reward": 3.5808873176574707, + "reward_std": 0.07881723530590534, + "rewards/final_reward": 0.8961576623113501, + "rewards/mask_iou_reward": 0.44807883115567504, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5808873772621155, + "rewards/thk_ans_format_reward": 1.0, + "step": 2994, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.98958587646484, + "epoch": 10.118043844856661, + "grad_norm": 9.283367900830749, + "kl": 0.4599609375, + "learning_rate": 1.5681306306306304e-07, + "loss": 0.0005, + "reward": 3.6719605922698975, + "reward_std": 0.039013393223285675, + "rewards/final_reward": 1.5600923645952776, + "rewards/mask_iou_reward": 0.7800461822976388, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6719606518745422, + "rewards/thk_ans_format_reward": 1.0, + "step": 2995, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.6666717529297, + "epoch": 10.12141652613828, + "grad_norm": 18.461367215836585, + "kl": 0.474609375, + "learning_rate": 1.5653153153153153e-07, + "loss": 0.0005, + "reward": 3.6122279167175293, + "reward_std": 0.05605051852762699, + "rewards/final_reward": 1.843798595685787, + "rewards/mask_iou_reward": 0.9218992978428935, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6122277975082397, + "rewards/thk_ans_format_reward": 1.0, + "step": 2996, + "think_completion_length": 9.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.61458587646484, + "epoch": 10.124789207419898, + "grad_norm": 13.510683556899686, + "kl": 0.4072265625, + "learning_rate": 1.5624999999999999e-07, + "loss": 0.0004, + "reward": 3.553600788116455, + "reward_std": 0.14882715791463852, + "rewards/final_reward": 1.5099988524276768, + "rewards/mask_iou_reward": 0.7549994262138384, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.553600788116455, + "rewards/thk_ans_format_reward": 1.0, + "step": 2997, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.4479217529297, + "epoch": 10.128161888701518, + "grad_norm": 7.768656628795247, + "kl": 0.416015625, + "learning_rate": 1.5596846846846845e-07, + "loss": 0.0004, + "reward": 3.483713388442993, + "reward_std": 0.061230381950736046, + "rewards/final_reward": 1.5944935107421987, + "rewards/mask_iou_reward": 0.7972467553710993, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4837132096290588, + "rewards/thk_ans_format_reward": 1.0, + "step": 2998, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.2604217529297, + "epoch": 10.131534569983137, + "grad_norm": 14.741656546568667, + "kl": 0.4453125, + "learning_rate": 1.5568693693693693e-07, + "loss": 0.0005, + "reward": 3.785600185394287, + "reward_std": 0.042826587334275246, + "rewards/final_reward": 1.6696943980484984, + "rewards/mask_iou_reward": 0.8348471990242492, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7856003642082214, + "rewards/thk_ans_format_reward": 1.0, + "step": 2999, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.2604217529297, + "epoch": 10.134907251264755, + "grad_norm": 11.287609971331385, + "kl": 0.4287109375, + "learning_rate": 1.554054054054054e-07, + "loss": 0.0004, + "reward": 3.588198661804199, + "reward_std": 0.009367643389850855, + "rewards/final_reward": 1.8614195241644378, + "rewards/mask_iou_reward": 0.9307097620822189, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.588198959827423, + "rewards/thk_ans_format_reward": 1.0, + "step": 3000, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.0416717529297, + "epoch": 10.138279932546375, + "grad_norm": 7.047347437674352, + "kl": 0.4853515625, + "learning_rate": 1.5512387387387388e-07, + "loss": 0.0005, + "reward": 3.660353422164917, + "reward_std": 0.06875004037283361, + "rewards/final_reward": 1.5753046420376315, + "rewards/mask_iou_reward": 0.7876523210188158, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.660353183746338, + "rewards/thk_ans_format_reward": 1.0, + "step": 3001, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.80208587646484, + "epoch": 10.141652613827993, + "grad_norm": 7.245173276493627, + "kl": 0.48828125, + "learning_rate": 1.5484234234234234e-07, + "loss": 0.0005, + "reward": 3.671100616455078, + "reward_std": 0.06982170045375824, + "rewards/final_reward": 1.6626594398675416, + "rewards/mask_iou_reward": 0.8313297199337708, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6711003184318542, + "rewards/thk_ans_format_reward": 1.0, + "step": 3002, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.93750762939453, + "epoch": 10.145025295109612, + "grad_norm": 12.782446082021838, + "kl": 0.4755859375, + "learning_rate": 1.545608108108108e-07, + "loss": 0.0005, + "reward": 3.6925946474075317, + "reward_std": 0.07178288232535124, + "rewards/final_reward": 1.834792823104551, + "rewards/mask_iou_reward": 0.9173964115522755, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6925946474075317, + "rewards/thk_ans_format_reward": 1.0, + "step": 3003, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.70833587646484, + "epoch": 10.14839797639123, + "grad_norm": 8.797747104877757, + "kl": 0.4453125, + "learning_rate": 1.542792792792793e-07, + "loss": 0.0004, + "reward": 3.409364938735962, + "reward_std": 0.044542797608301044, + "rewards/final_reward": 0.7013575445373311, + "rewards/mask_iou_reward": 0.35067877226866556, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4093648195266724, + "rewards/thk_ans_format_reward": 1.0, + "step": 3004, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.2604217529297, + "epoch": 10.15177065767285, + "grad_norm": 16.59870925962645, + "kl": 0.533203125, + "learning_rate": 1.5399774774774772e-07, + "loss": 0.0005, + "reward": 3.5411399602890015, + "reward_std": 0.027776396833360195, + "rewards/final_reward": 1.9392752603293033, + "rewards/mask_iou_reward": 0.9696376301646517, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5411400198936462, + "rewards/thk_ans_format_reward": 1.0, + "step": 3005, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.21875, + "epoch": 10.155143338954469, + "grad_norm": 13.806504308080143, + "kl": 0.849609375, + "learning_rate": 1.537162162162162e-07, + "loss": 0.0008, + "reward": 3.625385046005249, + "reward_std": 0.11380976252257824, + "rewards/final_reward": 1.1961639545209342, + "rewards/mask_iou_reward": 0.5980819772604671, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6253849864006042, + "rewards/thk_ans_format_reward": 1.0, + "step": 3006, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.11458587646484, + "epoch": 10.158516020236087, + "grad_norm": 46.019880167512845, + "kl": 0.43359375, + "learning_rate": 1.5343468468468467e-07, + "loss": 0.0004, + "reward": 3.5516542196273804, + "reward_std": 0.09062372241169214, + "rewards/final_reward": 1.5300205463825334, + "rewards/mask_iou_reward": 0.7650102731912667, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5516541600227356, + "rewards/thk_ans_format_reward": 1.0, + "step": 3007, + "think_completion_length": 10.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.8541717529297, + "epoch": 10.161888701517707, + "grad_norm": 210.24459797416944, + "kl": 0.4365234375, + "learning_rate": 1.5315315315315313e-07, + "loss": 0.0004, + "reward": 3.5649502277374268, + "reward_std": 0.07269263081252575, + "rewards/final_reward": 1.65368951076421, + "rewards/mask_iou_reward": 0.826844755382105, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.564950168132782, + "rewards/thk_ans_format_reward": 1.0, + "step": 3008, + "think_completion_length": 7.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.1979217529297, + "epoch": 10.165261382799326, + "grad_norm": 7.431795164683937, + "kl": 0.4365234375, + "learning_rate": 1.5287162162162162e-07, + "loss": 0.0004, + "reward": 3.682637095451355, + "reward_std": 0.04478604253381491, + "rewards/final_reward": 1.4957267679750599, + "rewards/mask_iou_reward": 0.7478633839875299, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6826370358467102, + "rewards/thk_ans_format_reward": 1.0, + "step": 3009, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.09375762939453, + "epoch": 10.168634064080944, + "grad_norm": 43.951720601436165, + "kl": 0.5146484375, + "learning_rate": 1.5259009009009008e-07, + "loss": 0.0005, + "reward": 3.2419824600219727, + "reward_std": 0.09567523375153542, + "rewards/final_reward": 0.9779962089804097, + "rewards/mask_iou_reward": 0.48899810449020487, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2419825792312622, + "rewards/thk_ans_format_reward": 1.0, + "step": 3010, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.45833587646484, + "epoch": 10.172006745362562, + "grad_norm": 21.70274754464807, + "kl": 0.51953125, + "learning_rate": 1.5230855855855857e-07, + "loss": 0.0005, + "reward": 3.8406925201416016, + "reward_std": 0.01246106019243598, + "rewards/final_reward": 1.8198003630804593, + "rewards/mask_iou_reward": 0.9099001815402297, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.840692400932312, + "rewards/thk_ans_format_reward": 1.0, + "step": 3011, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.3854217529297, + "epoch": 10.175379426644183, + "grad_norm": 7.894902107360492, + "kl": 0.4482421875, + "learning_rate": 1.5202702702702703e-07, + "loss": 0.0005, + "reward": 3.2574455738067627, + "reward_std": 0.18548056297004223, + "rewards/final_reward": 1.2295448944124066, + "rewards/mask_iou_reward": 0.6147724472062033, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2782787680625916, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3012, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.6875, + "epoch": 10.178752107925801, + "grad_norm": 10.25010181038996, + "kl": 0.494140625, + "learning_rate": 1.517454954954955e-07, + "loss": 0.0006, + "reward": 3.657282590866089, + "reward_std": 0.06546662375330925, + "rewards/final_reward": 1.2590024112232103, + "rewards/mask_iou_reward": 0.6295012056116052, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.657282531261444, + "rewards/thk_ans_format_reward": 1.0, + "step": 3013, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.58333587646484, + "epoch": 10.18212478920742, + "grad_norm": 19.823039707702044, + "kl": 0.4228515625, + "learning_rate": 1.5146396396396398e-07, + "loss": 0.0005, + "reward": 3.6551074981689453, + "reward_std": 0.08102907240390778, + "rewards/final_reward": 1.8555531520368094, + "rewards/mask_iou_reward": 0.9277765760184047, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6551074385643005, + "rewards/thk_ans_format_reward": 1.0, + "step": 3014, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.84375762939453, + "epoch": 10.18549747048904, + "grad_norm": 11.426374871385816, + "kl": 0.50390625, + "learning_rate": 1.511824324324324e-07, + "loss": 0.0005, + "reward": 3.4826070070266724, + "reward_std": 0.0319385826587677, + "rewards/final_reward": 1.738464134583586, + "rewards/mask_iou_reward": 0.869232067291793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4826070666313171, + "rewards/thk_ans_format_reward": 1.0, + "step": 3015, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.4375, + "epoch": 10.188870151770658, + "grad_norm": 20.16353455629868, + "kl": 0.38671875, + "learning_rate": 1.509009009009009e-07, + "loss": 0.0004, + "reward": 3.5612874031066895, + "reward_std": 0.09555951785296202, + "rewards/final_reward": 1.8304799898021402, + "rewards/mask_iou_reward": 0.9152399949010701, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5612870454788208, + "rewards/thk_ans_format_reward": 1.0, + "step": 3016, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.9166717529297, + "epoch": 10.192242833052276, + "grad_norm": 28.516210393580096, + "kl": 0.47265625, + "learning_rate": 1.5061936936936936e-07, + "loss": 0.0005, + "reward": 3.6769092082977295, + "reward_std": 0.05226367339491844, + "rewards/final_reward": 1.5773681028693793, + "rewards/mask_iou_reward": 0.7886840514346897, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6769092679023743, + "rewards/thk_ans_format_reward": 1.0, + "step": 3017, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.47917938232422, + "epoch": 10.195615514333895, + "grad_norm": 67.81277800194884, + "kl": 0.4306640625, + "learning_rate": 1.5033783783783782e-07, + "loss": 0.0005, + "reward": 3.811589479446411, + "reward_std": 0.010986678651534021, + "rewards/final_reward": 1.8891695953724157, + "rewards/mask_iou_reward": 0.9445847976862078, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.811589539051056, + "rewards/thk_ans_format_reward": 1.0, + "step": 3018, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.03125762939453, + "epoch": 10.198988195615515, + "grad_norm": 8.865567503921177, + "kl": 0.765625, + "learning_rate": 1.500563063063063e-07, + "loss": 0.0008, + "reward": 3.419390916824341, + "reward_std": 0.026064681820571423, + "rewards/final_reward": 1.5951678983941053, + "rewards/mask_iou_reward": 0.7975839491970527, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4193909168243408, + "rewards/thk_ans_format_reward": 1.0, + "step": 3019, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.40625, + "epoch": 10.202360876897133, + "grad_norm": 14.18369248827003, + "kl": 0.3935546875, + "learning_rate": 1.4977477477477477e-07, + "loss": 0.0004, + "reward": 3.5527989864349365, + "reward_std": 0.03511756705120206, + "rewards/final_reward": 0.9831977676554682, + "rewards/mask_iou_reward": 0.4915988838277341, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5527989268302917, + "rewards/thk_ans_format_reward": 1.0, + "step": 3020, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.02083587646484, + "epoch": 10.205733558178752, + "grad_norm": 9.654467905221043, + "kl": 0.443359375, + "learning_rate": 1.4949324324324325e-07, + "loss": 0.0005, + "reward": 3.4940335750579834, + "reward_std": 0.09076207876205444, + "rewards/final_reward": 1.1036258644813064, + "rewards/mask_iou_reward": 0.5518129322406532, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4940335154533386, + "rewards/thk_ans_format_reward": 1.0, + "step": 3021, + "think_completion_length": 8.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.00000381469727, + "epoch": 10.209106239460372, + "grad_norm": 15.997006681536114, + "kl": 0.515625, + "learning_rate": 1.4921171171171171e-07, + "loss": 0.0005, + "reward": 3.1683356761932373, + "reward_std": 0.02166743017733097, + "rewards/final_reward": 1.7799907009308857, + "rewards/mask_iou_reward": 0.8899953504654429, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1683356165885925, + "rewards/thk_ans_format_reward": 1.0, + "step": 3022, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.4791717529297, + "epoch": 10.21247892074199, + "grad_norm": 10.312279577192898, + "kl": 0.4375, + "learning_rate": 1.4893018018018018e-07, + "loss": 0.0005, + "reward": 3.736538052558899, + "reward_std": 0.024791957112029195, + "rewards/final_reward": 1.8820995461035657, + "rewards/mask_iou_reward": 0.9410497730517828, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7365379929542542, + "rewards/thk_ans_format_reward": 1.0, + "step": 3023, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.25, + "epoch": 10.215851602023609, + "grad_norm": 10.412018438289754, + "kl": 0.4609375, + "learning_rate": 1.4864864864864866e-07, + "loss": 0.0005, + "reward": 3.5170916318893433, + "reward_std": 0.3517511487007141, + "rewards/final_reward": 1.2948631458501156, + "rewards/mask_iou_reward": 0.6474315729250578, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5483417510986328, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 3024, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.48958587646484, + "epoch": 10.219224283305227, + "grad_norm": 115.1120642139752, + "kl": 1.458984375, + "learning_rate": 1.483671171171171e-07, + "loss": 0.0015, + "reward": 3.6020342111587524, + "reward_std": 0.035571375861763954, + "rewards/final_reward": 1.3863584342528519, + "rewards/mask_iou_reward": 0.6931792171264259, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6020342707633972, + "rewards/thk_ans_format_reward": 1.0, + "step": 3025, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.36458587646484, + "epoch": 10.222596964586847, + "grad_norm": 14.95920819273015, + "kl": 0.421875, + "learning_rate": 1.4808558558558556e-07, + "loss": 0.0004, + "reward": 3.603550434112549, + "reward_std": 0.057018641382455826, + "rewards/final_reward": 1.2303422133345536, + "rewards/mask_iou_reward": 0.6151711066672768, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6035504341125488, + "rewards/thk_ans_format_reward": 1.0, + "step": 3026, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.41667938232422, + "epoch": 10.225969645868465, + "grad_norm": 14.760725731411785, + "kl": 0.4404296875, + "learning_rate": 1.4780405405405404e-07, + "loss": 0.0005, + "reward": 3.7295358180999756, + "reward_std": 0.037195175886154175, + "rewards/final_reward": 1.8347441322552864, + "rewards/mask_iou_reward": 0.9173720661276432, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7295357584953308, + "rewards/thk_ans_format_reward": 1.0, + "step": 3027, + "think_completion_length": 7.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.5729217529297, + "epoch": 10.229342327150084, + "grad_norm": 6.346323784178766, + "kl": 0.421875, + "learning_rate": 1.475225225225225e-07, + "loss": 0.0004, + "reward": 3.4006311893463135, + "reward_std": 0.10144811868667603, + "rewards/final_reward": 1.8824980037471182, + "rewards/mask_iou_reward": 0.9412490018735591, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.400631308555603, + "rewards/thk_ans_format_reward": 1.0, + "step": 3028, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.78126525878906, + "epoch": 10.232715008431704, + "grad_norm": 8.824476487222075, + "kl": 0.4111328125, + "learning_rate": 1.47240990990991e-07, + "loss": 0.0004, + "reward": 3.7033588886260986, + "reward_std": 0.06088071269914508, + "rewards/final_reward": 1.5659035082472332, + "rewards/mask_iou_reward": 0.7829517541236166, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.703358769416809, + "rewards/thk_ans_format_reward": 1.0, + "step": 3029, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.12501525878906, + "epoch": 10.236087689713322, + "grad_norm": 28.21267807554526, + "kl": 0.505859375, + "learning_rate": 1.4695945945945945e-07, + "loss": 0.0005, + "reward": 3.7472550868988037, + "reward_std": 0.04748426750302315, + "rewards/final_reward": 1.8960582219169015, + "rewards/mask_iou_reward": 0.9480291109584508, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7472549080848694, + "rewards/thk_ans_format_reward": 1.0, + "step": 3030, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.87501525878906, + "epoch": 10.23946037099494, + "grad_norm": 10.76581095317364, + "kl": 0.490234375, + "learning_rate": 1.4667792792792791e-07, + "loss": 0.0005, + "reward": 3.740869402885437, + "reward_std": 0.22409842908382416, + "rewards/final_reward": 1.678593825559922, + "rewards/mask_iou_reward": 0.839296912779961, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.7617026567459106, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3031, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.1979217529297, + "epoch": 10.24283305227656, + "grad_norm": 16.11696078361264, + "kl": 0.525390625, + "learning_rate": 1.463963963963964e-07, + "loss": 0.0005, + "reward": 3.488149046897888, + "reward_std": 0.017415442038327456, + "rewards/final_reward": 1.104710881954887, + "rewards/mask_iou_reward": 0.5523554409774435, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4881488680839539, + "rewards/thk_ans_format_reward": 1.0, + "step": 3032, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.1041717529297, + "epoch": 10.24620573355818, + "grad_norm": 11.79782727085532, + "kl": 0.6591796875, + "learning_rate": 1.4611486486486486e-07, + "loss": 0.0007, + "reward": 3.353591799736023, + "reward_std": 0.05060257390141487, + "rewards/final_reward": 1.440081338540764, + "rewards/mask_iou_reward": 0.720040669270382, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.353591799736023, + "rewards/thk_ans_format_reward": 1.0, + "step": 3033, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.42708587646484, + "epoch": 10.249578414839798, + "grad_norm": 34.85221516597678, + "kl": 9.57421875, + "learning_rate": 1.4583333333333335e-07, + "loss": 0.0096, + "reward": 3.6186007261276245, + "reward_std": 0.06899136863648891, + "rewards/final_reward": 1.5653077673713776, + "rewards/mask_iou_reward": 0.7826538836856888, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6186007261276245, + "rewards/thk_ans_format_reward": 1.0, + "step": 3034, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.8958435058594, + "epoch": 10.252951096121416, + "grad_norm": 45.66579665600695, + "kl": 0.4111328125, + "learning_rate": 1.4555180180180178e-07, + "loss": 0.0004, + "reward": 3.5985703468322754, + "reward_std": 0.24263647571206093, + "rewards/final_reward": 1.8425728631798113, + "rewards/mask_iou_reward": 0.9212864315899056, + "rewards/sam_format_reward": 0.9583333432674408, + "rewards/sam_reward_func_ultra": 1.681903898715973, + "rewards/thk_ans_format_reward": 0.9583333432674408, + "step": 3035, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.02083587646484, + "epoch": 10.256323777403036, + "grad_norm": 7.415837993379524, + "kl": 0.4443359375, + "learning_rate": 1.4527027027027024e-07, + "loss": 0.0004, + "reward": 3.685833692550659, + "reward_std": 0.03945900313556194, + "rewards/final_reward": 1.7368624821102698, + "rewards/mask_iou_reward": 0.8684312410551349, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6858336925506592, + "rewards/thk_ans_format_reward": 1.0, + "step": 3036, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.93751525878906, + "epoch": 10.259696458684655, + "grad_norm": 30.27222983383401, + "kl": 0.5341796875, + "learning_rate": 1.4498873873873873e-07, + "loss": 0.0005, + "reward": 3.559749484062195, + "reward_std": 0.08500716462731361, + "rewards/final_reward": 1.6927849118778742, + "rewards/mask_iou_reward": 0.8463924559389371, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.559749722480774, + "rewards/thk_ans_format_reward": 1.0, + "step": 3037, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.7916793823242, + "epoch": 10.263069139966273, + "grad_norm": 5.858934619023522, + "kl": 0.701171875, + "learning_rate": 1.447072072072072e-07, + "loss": 0.0007, + "reward": 3.5214637517929077, + "reward_std": 0.06163910590112209, + "rewards/final_reward": 1.7095983083281925, + "rewards/mask_iou_reward": 0.8547991541640962, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5214638113975525, + "rewards/thk_ans_format_reward": 1.0, + "step": 3038, + "think_completion_length": 8.291666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.00000762939453, + "epoch": 10.266441821247891, + "grad_norm": 13.128488544669443, + "kl": 0.4150390625, + "learning_rate": 1.4442567567567568e-07, + "loss": 0.0004, + "reward": 3.762415289878845, + "reward_std": 0.098273616284132, + "rewards/final_reward": 1.842667643260338, + "rewards/mask_iou_reward": 0.921333821630169, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7624154090881348, + "rewards/thk_ans_format_reward": 1.0, + "step": 3039, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.5833435058594, + "epoch": 10.269814502529512, + "grad_norm": 10.121474630049313, + "kl": 0.376953125, + "learning_rate": 1.4414414414414414e-07, + "loss": 0.0004, + "reward": 3.6652374267578125, + "reward_std": 0.03870641253888607, + "rewards/final_reward": 1.2309389639257047, + "rewards/mask_iou_reward": 0.6154694819628523, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6652374267578125, + "rewards/thk_ans_format_reward": 1.0, + "step": 3040, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.5729217529297, + "epoch": 10.27318718381113, + "grad_norm": 10.438654044764187, + "kl": 0.416015625, + "learning_rate": 1.438626126126126e-07, + "loss": 0.0004, + "reward": 3.6193418502807617, + "reward_std": 0.019348585978150368, + "rewards/final_reward": 1.921007736284884, + "rewards/mask_iou_reward": 0.960503868142442, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6193416714668274, + "rewards/thk_ans_format_reward": 1.0, + "step": 3041, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.88542938232422, + "epoch": 10.276559865092748, + "grad_norm": 9.229400078665387, + "kl": 0.4716796875, + "learning_rate": 1.435810810810811e-07, + "loss": 0.0005, + "reward": 3.881032109260559, + "reward_std": 0.015071406960487366, + "rewards/final_reward": 1.8651157299403511, + "rewards/mask_iou_reward": 0.9325578649701756, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8810319304466248, + "rewards/thk_ans_format_reward": 1.0, + "step": 3042, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.72916793823242, + "epoch": 10.279932546374368, + "grad_norm": 20.614520400940542, + "kl": 0.73046875, + "learning_rate": 1.4329954954954955e-07, + "loss": 0.0007, + "reward": 3.632157564163208, + "reward_std": 0.046529789455235004, + "rewards/final_reward": 1.5362756707926812, + "rewards/mask_iou_reward": 0.7681378353963406, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6321576237678528, + "rewards/thk_ans_format_reward": 1.0, + "step": 3043, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.1041717529297, + "epoch": 10.283305227655987, + "grad_norm": 36.98989263520269, + "kl": 0.4287109375, + "learning_rate": 1.4301801801801803e-07, + "loss": 0.0004, + "reward": 3.5096585750579834, + "reward_std": 0.1018081046640873, + "rewards/final_reward": 1.855999401330896, + "rewards/mask_iou_reward": 0.927999700665448, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5096585750579834, + "rewards/thk_ans_format_reward": 1.0, + "step": 3044, + "think_completion_length": 9.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.46875762939453, + "epoch": 10.286677908937605, + "grad_norm": 10.131238284617314, + "kl": 0.45703125, + "learning_rate": 1.4273648648648647e-07, + "loss": 0.0005, + "reward": 3.261106252670288, + "reward_std": 0.050902172923088074, + "rewards/final_reward": 1.1564071824493518, + "rewards/mask_iou_reward": 0.5782035912246759, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2611061930656433, + "rewards/thk_ans_format_reward": 1.0, + "step": 3045, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 173.3229217529297, + "epoch": 10.290050590219224, + "grad_norm": 13.902812917861246, + "kl": 0.56640625, + "learning_rate": 1.4245495495495493e-07, + "loss": 0.0006, + "reward": 3.7212787866592407, + "reward_std": 0.07197993621230125, + "rewards/final_reward": 1.8479431196911142, + "rewards/mask_iou_reward": 0.9239715598455571, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7212787866592407, + "rewards/thk_ans_format_reward": 1.0, + "step": 3046, + "think_completion_length": 13.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.71875, + "epoch": 10.293423271500844, + "grad_norm": 14.873126860968421, + "kl": 0.4013671875, + "learning_rate": 1.4217342342342342e-07, + "loss": 0.0004, + "reward": 3.8307950496673584, + "reward_std": 0.01546748448163271, + "rewards/final_reward": 1.8101105255828047, + "rewards/mask_iou_reward": 0.9050552627914024, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8307951092720032, + "rewards/thk_ans_format_reward": 1.0, + "step": 3047, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.7916717529297, + "epoch": 10.296795952782462, + "grad_norm": 20.70107362318775, + "kl": 0.431640625, + "learning_rate": 1.4189189189189188e-07, + "loss": 0.0004, + "reward": 3.799539804458618, + "reward_std": 0.04087031399831176, + "rewards/final_reward": 1.880673807035448, + "rewards/mask_iou_reward": 0.940336903517724, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.799539864063263, + "rewards/thk_ans_format_reward": 1.0, + "step": 3048, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0104217529297, + "epoch": 10.30016863406408, + "grad_norm": 20.29669481716131, + "kl": 0.4267578125, + "learning_rate": 1.4161036036036036e-07, + "loss": 0.0004, + "reward": 3.5044933557510376, + "reward_std": 0.03597615472972393, + "rewards/final_reward": 1.813689756326931, + "rewards/mask_iou_reward": 0.9068448781634655, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5044932961463928, + "rewards/thk_ans_format_reward": 1.0, + "step": 3049, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.45834350585938, + "epoch": 10.3035413153457, + "grad_norm": 6.06329398522273, + "kl": 0.578125, + "learning_rate": 1.4132882882882883e-07, + "loss": 0.0006, + "reward": 3.7332570552825928, + "reward_std": 0.058670297265052795, + "rewards/final_reward": 1.9510471461607337, + "rewards/mask_iou_reward": 0.9755235730803669, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7332569360733032, + "rewards/thk_ans_format_reward": 1.0, + "step": 3050, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.8541717529297, + "epoch": 10.306913996627319, + "grad_norm": 10.59607026615506, + "kl": 0.611328125, + "learning_rate": 1.4104729729729729e-07, + "loss": 0.0006, + "reward": 3.7314302921295166, + "reward_std": 0.02940399432554841, + "rewards/final_reward": 1.8294265843891298, + "rewards/mask_iou_reward": 0.9147132921945649, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7314303517341614, + "rewards/thk_ans_format_reward": 1.0, + "step": 3051, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.97917938232422, + "epoch": 10.310286677908937, + "grad_norm": 7.163739321310357, + "kl": 0.4052734375, + "learning_rate": 1.4076576576576577e-07, + "loss": 0.0004, + "reward": 3.285438656806946, + "reward_std": 0.0969182513654232, + "rewards/final_reward": 1.2372289659372262, + "rewards/mask_iou_reward": 0.6186144829686131, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2854386568069458, + "rewards/thk_ans_format_reward": 1.0, + "step": 3052, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.7291717529297, + "epoch": 10.313659359190556, + "grad_norm": 19.472550548175086, + "kl": 0.5625, + "learning_rate": 1.4048423423423423e-07, + "loss": 0.0006, + "reward": 3.4311933517456055, + "reward_std": 0.12055841088294983, + "rewards/final_reward": 1.6633807218495948, + "rewards/mask_iou_reward": 0.8316903609247974, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4311934113502502, + "rewards/thk_ans_format_reward": 1.0, + "step": 3053, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.79167938232422, + "epoch": 10.317032040472176, + "grad_norm": 51.19403605734523, + "kl": 0.4560546875, + "learning_rate": 1.402027027027027e-07, + "loss": 0.0005, + "reward": 3.355466604232788, + "reward_std": 0.02344503626227379, + "rewards/final_reward": 1.869085082618021, + "rewards/mask_iou_reward": 0.9345425413090105, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3554664850234985, + "rewards/thk_ans_format_reward": 1.0, + "step": 3054, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.4166717529297, + "epoch": 10.320404721753794, + "grad_norm": 12.938744913511657, + "kl": 0.484375, + "learning_rate": 1.3992117117117116e-07, + "loss": 0.0005, + "reward": 3.5117716789245605, + "reward_std": 0.015957183204591274, + "rewards/final_reward": 0.5727209760785165, + "rewards/mask_iou_reward": 0.28636048803925823, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5117716193199158, + "rewards/thk_ans_format_reward": 1.0, + "step": 3055, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.70833587646484, + "epoch": 10.323777403035413, + "grad_norm": 11.810036001437068, + "kl": 0.501953125, + "learning_rate": 1.3963963963963962e-07, + "loss": 0.0005, + "reward": 3.472335457801819, + "reward_std": 0.04017342161387205, + "rewards/final_reward": 1.7242630804990866, + "rewards/mask_iou_reward": 0.8621315402495433, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4723355174064636, + "rewards/thk_ans_format_reward": 1.0, + "step": 3056, + "think_completion_length": 10.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.6979217529297, + "epoch": 10.327150084317031, + "grad_norm": 28.163535898988556, + "kl": 0.466796875, + "learning_rate": 1.393581081081081e-07, + "loss": 0.0005, + "reward": 3.6833776235580444, + "reward_std": 0.056855410104617476, + "rewards/final_reward": 1.755651434260051, + "rewards/mask_iou_reward": 0.8778257171300256, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6833775639533997, + "rewards/thk_ans_format_reward": 1.0, + "step": 3057, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.8229217529297, + "epoch": 10.330522765598651, + "grad_norm": 11.691020876666066, + "kl": 0.8681640625, + "learning_rate": 1.3907657657657656e-07, + "loss": 0.0009, + "reward": 3.5328752994537354, + "reward_std": 0.02728255931288004, + "rewards/final_reward": 1.775632264751895, + "rewards/mask_iou_reward": 0.8878161323759475, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5328753590583801, + "rewards/thk_ans_format_reward": 1.0, + "step": 3058, + "think_completion_length": 10.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.78125, + "epoch": 10.33389544688027, + "grad_norm": 5.488915685669901, + "kl": 0.74609375, + "learning_rate": 1.3879504504504505e-07, + "loss": 0.0007, + "reward": 3.5203044414520264, + "reward_std": 0.03370634466409683, + "rewards/final_reward": 1.976008900677117, + "rewards/mask_iou_reward": 0.9880044503385585, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.520304560661316, + "rewards/thk_ans_format_reward": 1.0, + "step": 3059, + "think_completion_length": 11.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.02083587646484, + "epoch": 10.337268128161888, + "grad_norm": 14.039207196930784, + "kl": 0.609375, + "learning_rate": 1.385135135135135e-07, + "loss": 0.0006, + "reward": 3.555195689201355, + "reward_std": 0.12395616993308067, + "rewards/final_reward": 1.818213667140335, + "rewards/mask_iou_reward": 0.9091068335701675, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5551958084106445, + "rewards/thk_ans_format_reward": 1.0, + "step": 3060, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.59375762939453, + "epoch": 10.340640809443508, + "grad_norm": 11.364492284892538, + "kl": 0.4423828125, + "learning_rate": 1.3823198198198197e-07, + "loss": 0.0004, + "reward": 3.6343125104904175, + "reward_std": 0.049136221408843994, + "rewards/final_reward": 1.7915940520924123, + "rewards/mask_iou_reward": 0.8957970260462061, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.634312391281128, + "rewards/thk_ans_format_reward": 1.0, + "step": 3061, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.9166717529297, + "epoch": 10.344013490725127, + "grad_norm": 7.810797873979519, + "kl": 0.396484375, + "learning_rate": 1.3795045045045046e-07, + "loss": 0.0005, + "reward": 3.295444369316101, + "reward_std": 0.08025751262903214, + "rewards/final_reward": 1.177657295365734, + "rewards/mask_iou_reward": 0.588828647682867, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2954444289207458, + "rewards/thk_ans_format_reward": 1.0, + "step": 3062, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.68750762939453, + "epoch": 10.347386172006745, + "grad_norm": 14.246952677980223, + "kl": 0.53515625, + "learning_rate": 1.3766891891891892e-07, + "loss": 0.0005, + "reward": 3.6882766485214233, + "reward_std": 0.087204210460186, + "rewards/final_reward": 1.7750797740027533, + "rewards/mask_iou_reward": 0.8875398870013766, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6882765889167786, + "rewards/thk_ans_format_reward": 1.0, + "step": 3063, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.50000762939453, + "epoch": 10.350758853288363, + "grad_norm": 44.254324086618205, + "kl": 0.3994140625, + "learning_rate": 1.3738738738738738e-07, + "loss": 0.0004, + "reward": 3.6730443239212036, + "reward_std": 0.037104617804288864, + "rewards/final_reward": 1.861958143782565, + "rewards/mask_iou_reward": 0.9309790718912825, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6730441451072693, + "rewards/thk_ans_format_reward": 1.0, + "step": 3064, + "think_completion_length": 9.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.78126525878906, + "epoch": 10.354131534569984, + "grad_norm": 18.577352776682563, + "kl": 0.5625, + "learning_rate": 1.3710585585585584e-07, + "loss": 0.0006, + "reward": 3.7525107860565186, + "reward_std": 0.03455898258835077, + "rewards/final_reward": 1.7476212503404103, + "rewards/mask_iou_reward": 0.8738106251702051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7525106072425842, + "rewards/thk_ans_format_reward": 1.0, + "step": 3065, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.30208587646484, + "epoch": 10.357504215851602, + "grad_norm": 7.837123501383387, + "kl": 0.4873046875, + "learning_rate": 1.368243243243243e-07, + "loss": 0.0005, + "reward": 3.599125623703003, + "reward_std": 0.03124680370092392, + "rewards/final_reward": 1.267670592041667, + "rewards/mask_iou_reward": 0.6338352960208335, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5991255640983582, + "rewards/thk_ans_format_reward": 1.0, + "step": 3066, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.2604217529297, + "epoch": 10.36087689713322, + "grad_norm": 12.620081651732837, + "kl": 0.5390625, + "learning_rate": 1.365427927927928e-07, + "loss": 0.0006, + "reward": 3.6787497997283936, + "reward_std": 0.059250480961054564, + "rewards/final_reward": 1.6039802302769672, + "rewards/mask_iou_reward": 0.8019901151384836, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.678749680519104, + "rewards/thk_ans_format_reward": 1.0, + "step": 3067, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.84375, + "epoch": 10.36424957841484, + "grad_norm": 10.729647212051022, + "kl": 0.3984375, + "learning_rate": 1.3626126126126125e-07, + "loss": 0.0004, + "reward": 3.7765551805496216, + "reward_std": 0.0969116073101759, + "rewards/final_reward": 1.8530384947731258, + "rewards/mask_iou_reward": 0.9265192473865629, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7765551805496216, + "rewards/thk_ans_format_reward": 1.0, + "step": 3068, + "think_completion_length": 8.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.67708587646484, + "epoch": 10.367622259696459, + "grad_norm": 17.869347979918643, + "kl": 0.6171875, + "learning_rate": 1.3597972972972974e-07, + "loss": 0.0006, + "reward": 3.801684260368347, + "reward_std": 0.03484427556395531, + "rewards/final_reward": 1.7825355703083081, + "rewards/mask_iou_reward": 0.8912677851541541, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8016842603683472, + "rewards/thk_ans_format_reward": 1.0, + "step": 3069, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.02084350585938, + "epoch": 10.370994940978077, + "grad_norm": 9.546367466466828, + "kl": 0.443359375, + "learning_rate": 1.356981981981982e-07, + "loss": 0.0005, + "reward": 3.4482284784317017, + "reward_std": 0.1556754820048809, + "rewards/final_reward": 1.4857851477246928, + "rewards/mask_iou_reward": 0.7428925738623464, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4482285380363464, + "rewards/thk_ans_format_reward": 1.0, + "step": 3070, + "think_completion_length": 8.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.125, + "epoch": 10.374367622259696, + "grad_norm": 24.998556434514093, + "kl": 0.4150390625, + "learning_rate": 1.3541666666666666e-07, + "loss": 0.0004, + "reward": 3.7703176736831665, + "reward_std": 0.019884683191776276, + "rewards/final_reward": 1.695647588416222, + "rewards/mask_iou_reward": 0.847823794208111, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.770317792892456, + "rewards/thk_ans_format_reward": 1.0, + "step": 3071, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.15625, + "epoch": 10.377740303541316, + "grad_norm": 10.177827488944438, + "kl": 0.46875, + "learning_rate": 1.3513513513513515e-07, + "loss": 0.0005, + "reward": 3.3514381647109985, + "reward_std": 0.1550460159778595, + "rewards/final_reward": 1.6597564033637382, + "rewards/mask_iou_reward": 0.8298782016818691, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3514381647109985, + "rewards/thk_ans_format_reward": 1.0, + "step": 3072, + "think_completion_length": 9.666666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.32292938232422, + "epoch": 10.381112984822934, + "grad_norm": 8.509582773501355, + "kl": 0.447265625, + "learning_rate": 1.348536036036036e-07, + "loss": 0.0005, + "reward": 3.4990919828414917, + "reward_std": 0.07601478323340416, + "rewards/final_reward": 1.8712923387097589, + "rewards/mask_iou_reward": 0.9356461693548794, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4990919828414917, + "rewards/thk_ans_format_reward": 1.0, + "step": 3073, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.7708435058594, + "epoch": 10.384485666104553, + "grad_norm": 5.405843324927532, + "kl": 0.36328125, + "learning_rate": 1.3457207207207207e-07, + "loss": 0.0004, + "reward": 3.723816156387329, + "reward_std": 0.04175476357340813, + "rewards/final_reward": 1.6051074212975824, + "rewards/mask_iou_reward": 0.8025537106487912, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7238160967826843, + "rewards/thk_ans_format_reward": 1.0, + "step": 3074, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.0729217529297, + "epoch": 10.387858347386173, + "grad_norm": 5.879979396345877, + "kl": 0.49609375, + "learning_rate": 1.3429054054054053e-07, + "loss": 0.0005, + "reward": 3.746949791908264, + "reward_std": 0.0503030139952898, + "rewards/final_reward": 1.7305502081549002, + "rewards/mask_iou_reward": 0.8652751040774501, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.746949553489685, + "rewards/thk_ans_format_reward": 1.0, + "step": 3075, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.6354217529297, + "epoch": 10.391231028667791, + "grad_norm": 26.78246561208562, + "kl": 0.3935546875, + "learning_rate": 1.34009009009009e-07, + "loss": 0.0005, + "reward": 3.5735831260681152, + "reward_std": 0.04332230752333999, + "rewards/final_reward": 1.248773449844379, + "rewards/mask_iou_reward": 0.6243867249221895, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5735830664634705, + "rewards/thk_ans_format_reward": 1.0, + "step": 3076, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.43750762939453, + "epoch": 10.39460370994941, + "grad_norm": 10.001276634062897, + "kl": 0.4296875, + "learning_rate": 1.3372747747747748e-07, + "loss": 0.0004, + "reward": 3.7090967893600464, + "reward_std": 0.03287575836293399, + "rewards/final_reward": 1.9403098107212258, + "rewards/mask_iou_reward": 0.9701549053606129, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7090969681739807, + "rewards/thk_ans_format_reward": 1.0, + "step": 3077, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.25000762939453, + "epoch": 10.397976391231028, + "grad_norm": 18.211080017805312, + "kl": 0.49609375, + "learning_rate": 1.3344594594594594e-07, + "loss": 0.0005, + "reward": 3.458000421524048, + "reward_std": 0.05489188525825739, + "rewards/final_reward": 1.5067666567106284, + "rewards/mask_iou_reward": 0.7533833283553142, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4580003023147583, + "rewards/thk_ans_format_reward": 1.0, + "step": 3078, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.8541717529297, + "epoch": 10.401349072512648, + "grad_norm": 14.014103313482549, + "kl": 0.4716796875, + "learning_rate": 1.3316441441441442e-07, + "loss": 0.0005, + "reward": 3.5899453163146973, + "reward_std": 0.056886350736021996, + "rewards/final_reward": 1.6582750067291117, + "rewards/mask_iou_reward": 0.8291375033645558, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5899450778961182, + "rewards/thk_ans_format_reward": 1.0, + "step": 3079, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.2604217529297, + "epoch": 10.404721753794266, + "grad_norm": 11.468827152465424, + "kl": 0.681640625, + "learning_rate": 1.3288288288288288e-07, + "loss": 0.0007, + "reward": 3.7649770975112915, + "reward_std": 0.07069988921284676, + "rewards/final_reward": 1.8338776917010735, + "rewards/mask_iou_reward": 0.9169388458505368, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7649770379066467, + "rewards/thk_ans_format_reward": 1.0, + "step": 3080, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.6458396911621, + "epoch": 10.408094435075885, + "grad_norm": 11.085719211351236, + "kl": 0.4833984375, + "learning_rate": 1.3260135135135134e-07, + "loss": 0.0005, + "reward": 3.5876041650772095, + "reward_std": 0.19347361475229263, + "rewards/final_reward": 1.6700500913049008, + "rewards/mask_iou_reward": 0.8350250456524504, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5980209112167358, + "rewards/thk_ans_format_reward": 1.0, + "step": 3081, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.11458587646484, + "epoch": 10.411467116357505, + "grad_norm": 38.16946217127505, + "kl": 0.4169921875, + "learning_rate": 1.3231981981981983e-07, + "loss": 0.0004, + "reward": 3.5083699226379395, + "reward_std": 0.045381806790828705, + "rewards/final_reward": 1.7874092424230517, + "rewards/mask_iou_reward": 0.8937046212115258, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5083699822425842, + "rewards/thk_ans_format_reward": 1.0, + "step": 3082, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.33334350585938, + "epoch": 10.414839797639123, + "grad_norm": 10.40371028482117, + "kl": 0.564453125, + "learning_rate": 1.320382882882883e-07, + "loss": 0.0006, + "reward": 3.4012542963027954, + "reward_std": 0.13047372177243233, + "rewards/final_reward": 1.448548234038224, + "rewards/mask_iou_reward": 0.724274117019112, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.4220874905586243, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3083, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.75, + "epoch": 10.418212478920742, + "grad_norm": 12.777325204116849, + "kl": 0.517578125, + "learning_rate": 1.3175675675675673e-07, + "loss": 0.0005, + "reward": 3.7277798652648926, + "reward_std": 0.05290667526423931, + "rewards/final_reward": 1.7855710697263598, + "rewards/mask_iou_reward": 0.8927855348631799, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.727779746055603, + "rewards/thk_ans_format_reward": 1.0, + "step": 3084, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.33334350585938, + "epoch": 10.42158516020236, + "grad_norm": 27.100025984014987, + "kl": 0.4765625, + "learning_rate": 1.3147522522522521e-07, + "loss": 0.0005, + "reward": 3.613362193107605, + "reward_std": 0.05754461046308279, + "rewards/final_reward": 1.8879014683702529, + "rewards/mask_iou_reward": 0.9439507341851264, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6133623123168945, + "rewards/thk_ans_format_reward": 1.0, + "step": 3085, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.6875, + "epoch": 10.42495784148398, + "grad_norm": 10.037161786953995, + "kl": 0.505859375, + "learning_rate": 1.3119369369369367e-07, + "loss": 0.0005, + "reward": 3.4091343879699707, + "reward_std": 0.09373841434717178, + "rewards/final_reward": 1.3407313291318463, + "rewards/mask_iou_reward": 0.6703656645659232, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4091344475746155, + "rewards/thk_ans_format_reward": 1.0, + "step": 3086, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.6041717529297, + "epoch": 10.428330522765599, + "grad_norm": 12.974808226706712, + "kl": 0.4169921875, + "learning_rate": 1.3091216216216216e-07, + "loss": 0.0004, + "reward": 3.6260504722595215, + "reward_std": 0.042093608528375626, + "rewards/final_reward": 1.736496195289474, + "rewards/mask_iou_reward": 0.868248097644737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6260504722595215, + "rewards/thk_ans_format_reward": 1.0, + "step": 3087, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.95833587646484, + "epoch": 10.431703204047217, + "grad_norm": 11.04435290865121, + "kl": 0.42578125, + "learning_rate": 1.3063063063063062e-07, + "loss": 0.0005, + "reward": 3.579022169113159, + "reward_std": 0.05499119684100151, + "rewards/final_reward": 1.669786111592614, + "rewards/mask_iou_reward": 0.834893055796307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5790221095085144, + "rewards/thk_ans_format_reward": 1.0, + "step": 3088, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.1458396911621, + "epoch": 10.435075885328837, + "grad_norm": 11.159996455611521, + "kl": 0.59765625, + "learning_rate": 1.3034909909909908e-07, + "loss": 0.0006, + "reward": 3.652455687522888, + "reward_std": 0.02265701163560152, + "rewards/final_reward": 1.8173958249268871, + "rewards/mask_iou_reward": 0.9086979124634436, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6524556279182434, + "rewards/thk_ans_format_reward": 1.0, + "step": 3089, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 175.67708587646484, + "epoch": 10.438448566610456, + "grad_norm": 8.753344860212746, + "kl": 0.5224609375, + "learning_rate": 1.3006756756756757e-07, + "loss": 0.0005, + "reward": 3.7730822563171387, + "reward_std": 0.030583191197365522, + "rewards/final_reward": 1.6578078450498204, + "rewards/mask_iou_reward": 0.8289039225249102, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7730821371078491, + "rewards/thk_ans_format_reward": 1.0, + "step": 3090, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.59376525878906, + "epoch": 10.441821247892074, + "grad_norm": 12.035103836623776, + "kl": 0.4619140625, + "learning_rate": 1.2978603603603603e-07, + "loss": 0.0007, + "reward": 3.6348942518234253, + "reward_std": 0.057315885089337826, + "rewards/final_reward": 1.4426078250224328, + "rewards/mask_iou_reward": 0.7213039125112164, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6348941922187805, + "rewards/thk_ans_format_reward": 1.0, + "step": 3091, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.6354217529297, + "epoch": 10.445193929173692, + "grad_norm": 21.070982930474848, + "kl": 0.61328125, + "learning_rate": 1.2950450450450452e-07, + "loss": 0.0006, + "reward": 3.4380534887313843, + "reward_std": 0.04184722830541432, + "rewards/final_reward": 1.6863444507629848, + "rewards/mask_iou_reward": 0.8431722253814924, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.43805330991745, + "rewards/thk_ans_format_reward": 1.0, + "step": 3092, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.65625, + "epoch": 10.448566610455313, + "grad_norm": 18.121357835908366, + "kl": 0.66015625, + "learning_rate": 1.2922297297297298e-07, + "loss": 0.0007, + "reward": 3.4254337549209595, + "reward_std": 0.05125655606389046, + "rewards/final_reward": 1.0593367921198502, + "rewards/mask_iou_reward": 0.5296683960599251, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4254336953163147, + "rewards/thk_ans_format_reward": 1.0, + "step": 3093, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.55208587646484, + "epoch": 10.451939291736931, + "grad_norm": 6.635819206062496, + "kl": 0.52734375, + "learning_rate": 1.289414414414414e-07, + "loss": 0.0005, + "reward": 3.7229537963867188, + "reward_std": 0.030687447171658278, + "rewards/final_reward": 1.7149159311178637, + "rewards/mask_iou_reward": 0.8574579655589318, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7229537963867188, + "rewards/thk_ans_format_reward": 1.0, + "step": 3094, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 156.67708587646484, + "epoch": 10.45531197301855, + "grad_norm": 36.00135551494365, + "kl": 0.46875, + "learning_rate": 1.286599099099099e-07, + "loss": 0.0005, + "reward": 3.5245972871780396, + "reward_std": 0.048061273992061615, + "rewards/final_reward": 1.2067712422147152, + "rewards/mask_iou_reward": 0.6033856211073576, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.52459716796875, + "rewards/thk_ans_format_reward": 1.0, + "step": 3095, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.8125, + "epoch": 10.45868465430017, + "grad_norm": 6.8862624024723935, + "kl": 0.5234375, + "learning_rate": 1.2837837837837836e-07, + "loss": 0.0005, + "reward": 3.550014853477478, + "reward_std": 0.036302256397902966, + "rewards/final_reward": 0.8265123227011708, + "rewards/mask_iou_reward": 0.4132561613505854, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5500149130821228, + "rewards/thk_ans_format_reward": 1.0, + "step": 3096, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.46875762939453, + "epoch": 10.462057335581788, + "grad_norm": 35.856713973040975, + "kl": 0.501953125, + "learning_rate": 1.2809684684684685e-07, + "loss": 0.0005, + "reward": 3.73908531665802, + "reward_std": 0.0826911348849535, + "rewards/final_reward": 1.7057263593186214, + "rewards/mask_iou_reward": 0.8528631796593107, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7390851378440857, + "rewards/thk_ans_format_reward": 1.0, + "step": 3097, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.90626525878906, + "epoch": 10.465430016863406, + "grad_norm": 7.584187280860042, + "kl": 0.4541015625, + "learning_rate": 1.278153153153153e-07, + "loss": 0.0005, + "reward": 3.591671943664551, + "reward_std": 0.13332638936117291, + "rewards/final_reward": 1.9272358695258596, + "rewards/mask_iou_reward": 0.9636179347629298, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6125050783157349, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3098, + "think_completion_length": 8.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.86459350585938, + "epoch": 10.468802698145025, + "grad_norm": 13.897238219395529, + "kl": 0.4208984375, + "learning_rate": 1.2753378378378377e-07, + "loss": 0.0004, + "reward": 3.6895748376846313, + "reward_std": 0.024271592497825623, + "rewards/final_reward": 1.8354064804276868, + "rewards/mask_iou_reward": 0.9177032402138434, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6895748972892761, + "rewards/thk_ans_format_reward": 1.0, + "step": 3099, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.40626525878906, + "epoch": 10.472175379426645, + "grad_norm": 47.21938702989025, + "kl": 0.3671875, + "learning_rate": 1.2725225225225226e-07, + "loss": 0.0003, + "reward": 3.7118654251098633, + "reward_std": 0.031610630452632904, + "rewards/final_reward": 1.547019004124051, + "rewards/mask_iou_reward": 0.7735095020620255, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7118653059005737, + "rewards/thk_ans_format_reward": 1.0, + "step": 3100, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.85417175292969, + "epoch": 10.475548060708263, + "grad_norm": 11.341914701976625, + "kl": 0.4951171875, + "learning_rate": 1.2697072072072072e-07, + "loss": 0.0005, + "reward": 3.369482636451721, + "reward_std": 0.024935521185398102, + "rewards/final_reward": 1.9687640793498886, + "rewards/mask_iou_reward": 0.9843820396749443, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.369482398033142, + "rewards/thk_ans_format_reward": 1.0, + "step": 3101, + "think_completion_length": 7.583333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.89583587646484, + "epoch": 10.478920741989882, + "grad_norm": 8.101147783530548, + "kl": 0.3388671875, + "learning_rate": 1.266891891891892e-07, + "loss": 0.0004, + "reward": 3.565877318382263, + "reward_std": 0.029876575339585543, + "rewards/final_reward": 1.7556893977145163, + "rewards/mask_iou_reward": 0.8778446988572581, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5658771991729736, + "rewards/thk_ans_format_reward": 1.0, + "step": 3102, + "think_completion_length": 9.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.30208587646484, + "epoch": 10.4822934232715, + "grad_norm": 12.216823149622934, + "kl": 0.4443359375, + "learning_rate": 1.2640765765765766e-07, + "loss": 0.0005, + "reward": 3.601265549659729, + "reward_std": 0.07581897638738155, + "rewards/final_reward": 1.9240982076614976, + "rewards/mask_iou_reward": 0.9620491038307488, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6012657284736633, + "rewards/thk_ans_format_reward": 1.0, + "step": 3103, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.9791717529297, + "epoch": 10.48566610455312, + "grad_norm": 10.304927264889335, + "kl": 0.4453125, + "learning_rate": 1.261261261261261e-07, + "loss": 0.0005, + "reward": 3.7941430807113647, + "reward_std": 0.11498380824923515, + "rewards/final_reward": 1.7165435770248911, + "rewards/mask_iou_reward": 0.8582717885124456, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.8045597076416016, + "rewards/thk_ans_format_reward": 1.0, + "step": 3104, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.86459350585938, + "epoch": 10.489038785834738, + "grad_norm": 8.780323726364982, + "kl": 0.5126953125, + "learning_rate": 1.2584459459459459e-07, + "loss": 0.0005, + "reward": 3.632111072540283, + "reward_std": 0.09831107221543789, + "rewards/final_reward": 1.5393308961893373, + "rewards/mask_iou_reward": 0.7696654480946686, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6321112513542175, + "rewards/thk_ans_format_reward": 1.0, + "step": 3105, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.00001525878906, + "epoch": 10.492411467116357, + "grad_norm": 8.055355884111894, + "kl": 0.3359375, + "learning_rate": 1.2556306306306305e-07, + "loss": 0.0003, + "reward": 3.6069531440734863, + "reward_std": 0.03117097169160843, + "rewards/final_reward": 1.7884215969975852, + "rewards/mask_iou_reward": 0.8942107984987926, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6069526672363281, + "rewards/thk_ans_format_reward": 1.0, + "step": 3106, + "think_completion_length": 7.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.92709350585938, + "epoch": 10.495784148397977, + "grad_norm": 10.36770739560577, + "kl": 0.4267578125, + "learning_rate": 1.2528153153153153e-07, + "loss": 0.0004, + "reward": 3.5954439640045166, + "reward_std": 0.015956447925418615, + "rewards/final_reward": 1.8034319018309988, + "rewards/mask_iou_reward": 0.9017159509154994, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5954439043998718, + "rewards/thk_ans_format_reward": 1.0, + "step": 3107, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.40625762939453, + "epoch": 10.499156829679595, + "grad_norm": 18.19336016512648, + "kl": 0.4248046875, + "learning_rate": 1.25e-07, + "loss": 0.0005, + "reward": 3.6256226301193237, + "reward_std": 0.046171706169843674, + "rewards/final_reward": 1.875848951219222, + "rewards/mask_iou_reward": 0.937924475609611, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6256226301193237, + "rewards/thk_ans_format_reward": 1.0, + "step": 3108, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.55209350585938, + "epoch": 10.502529510961214, + "grad_norm": 9.518348775740566, + "kl": 0.4599609375, + "learning_rate": 1.2471846846846846e-07, + "loss": 0.0004, + "reward": 3.7291589975357056, + "reward_std": 0.021761665120720863, + "rewards/final_reward": 1.7267013531189652, + "rewards/mask_iou_reward": 0.8633506765594826, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7291589379310608, + "rewards/thk_ans_format_reward": 1.0, + "step": 3109, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.87500762939453, + "epoch": 10.505902192242832, + "grad_norm": 6.22035865216061, + "kl": 0.5078125, + "learning_rate": 1.2443693693693694e-07, + "loss": 0.0005, + "reward": 3.3314541578292847, + "reward_std": 0.13443849235773087, + "rewards/final_reward": 1.6590253812713955, + "rewards/mask_iou_reward": 0.8295126906356978, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3314539790153503, + "rewards/thk_ans_format_reward": 1.0, + "step": 3110, + "think_completion_length": 9.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.56251525878906, + "epoch": 10.509274873524452, + "grad_norm": 6.872467141781061, + "kl": 0.46875, + "learning_rate": 1.241554054054054e-07, + "loss": 0.0005, + "reward": 3.4900680780410767, + "reward_std": 0.038352854549884796, + "rewards/final_reward": 1.909817570806116, + "rewards/mask_iou_reward": 0.954908785403058, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4900680184364319, + "rewards/thk_ans_format_reward": 1.0, + "step": 3111, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.56251525878906, + "epoch": 10.51264755480607, + "grad_norm": 10.280674378090223, + "kl": 0.4111328125, + "learning_rate": 1.2387387387387386e-07, + "loss": 0.0004, + "reward": 3.6083072423934937, + "reward_std": 0.04970341920852661, + "rewards/final_reward": 1.6760755903575955, + "rewards/mask_iou_reward": 0.8380377951787977, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.608307123184204, + "rewards/thk_ans_format_reward": 1.0, + "step": 3112, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.1354217529297, + "epoch": 10.516020236087689, + "grad_norm": 23.28563797167097, + "kl": 0.431640625, + "learning_rate": 1.2359234234234232e-07, + "loss": 0.0005, + "reward": 3.634291648864746, + "reward_std": 0.02619549073278904, + "rewards/final_reward": 1.748686872196122, + "rewards/mask_iou_reward": 0.874343436098061, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6342918276786804, + "rewards/thk_ans_format_reward": 1.0, + "step": 3113, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.7916717529297, + "epoch": 10.51939291736931, + "grad_norm": 10.582982961174917, + "kl": 0.42578125, + "learning_rate": 1.233108108108108e-07, + "loss": 0.0004, + "reward": 3.5773465633392334, + "reward_std": 0.08526142570190132, + "rewards/final_reward": 1.6190797931129668, + "rewards/mask_iou_reward": 0.8095398965564834, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5773464441299438, + "rewards/thk_ans_format_reward": 1.0, + "step": 3114, + "think_completion_length": 8.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.47917938232422, + "epoch": 10.522765598650928, + "grad_norm": 11.808159257005988, + "kl": 0.4267578125, + "learning_rate": 1.2302927927927927e-07, + "loss": 0.0004, + "reward": 3.7226184606552124, + "reward_std": 0.024431753903627396, + "rewards/final_reward": 1.778847208369364, + "rewards/mask_iou_reward": 0.889423604184682, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7226186394691467, + "rewards/thk_ans_format_reward": 1.0, + "step": 3115, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.65625, + "epoch": 10.526138279932546, + "grad_norm": 50.688113567662526, + "kl": 0.634765625, + "learning_rate": 1.2274774774774773e-07, + "loss": 0.0006, + "reward": 3.64953076839447, + "reward_std": 0.04483196325600147, + "rewards/final_reward": 1.8453000319146462, + "rewards/mask_iou_reward": 0.9226500159573231, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6495305895805359, + "rewards/thk_ans_format_reward": 1.0, + "step": 3116, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.28125762939453, + "epoch": 10.529510961214164, + "grad_norm": 8.422268951168364, + "kl": 0.44921875, + "learning_rate": 1.2246621621621622e-07, + "loss": 0.0005, + "reward": 3.286513924598694, + "reward_std": 0.055416141636669636, + "rewards/final_reward": 0.2872778971173338, + "rewards/mask_iou_reward": 0.1436389485586669, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2865139245986938, + "rewards/thk_ans_format_reward": 1.0, + "step": 3117, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.50000762939453, + "epoch": 10.532883642495785, + "grad_norm": 10.599310623729721, + "kl": 0.5302734375, + "learning_rate": 1.2218468468468468e-07, + "loss": 0.0006, + "reward": 3.5024014711380005, + "reward_std": 0.028407627250999212, + "rewards/final_reward": 1.8209042963764035, + "rewards/mask_iou_reward": 0.9104521481882017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5024012327194214, + "rewards/thk_ans_format_reward": 1.0, + "step": 3118, + "think_completion_length": 8.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.3854217529297, + "epoch": 10.536256323777403, + "grad_norm": 34.94987635831581, + "kl": 0.47265625, + "learning_rate": 1.2190315315315314e-07, + "loss": 0.0005, + "reward": 3.6050848960876465, + "reward_std": 0.1102399006485939, + "rewards/final_reward": 1.8166443892282316, + "rewards/mask_iou_reward": 0.9083221946141158, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.605084776878357, + "rewards/thk_ans_format_reward": 1.0, + "step": 3119, + "think_completion_length": 7.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.70833587646484, + "epoch": 10.539629005059021, + "grad_norm": 8.738350824887798, + "kl": 0.439453125, + "learning_rate": 1.2162162162162163e-07, + "loss": 0.0005, + "reward": 3.62838351726532, + "reward_std": 0.025765706785023212, + "rewards/final_reward": 1.4891728952793306, + "rewards/mask_iou_reward": 0.7445864476396653, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6283833384513855, + "rewards/thk_ans_format_reward": 1.0, + "step": 3120, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.3854217529297, + "epoch": 10.543001686340641, + "grad_norm": 15.598197022156395, + "kl": 0.37109375, + "learning_rate": 1.213400900900901e-07, + "loss": 0.0004, + "reward": 3.7006378173828125, + "reward_std": 0.040458193980157375, + "rewards/final_reward": 1.4425537032359959, + "rewards/mask_iou_reward": 0.7212768516179979, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.700637698173523, + "rewards/thk_ans_format_reward": 1.0, + "step": 3121, + "think_completion_length": 8.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.27083587646484, + "epoch": 10.54637436762226, + "grad_norm": 13.870187339867892, + "kl": 0.599609375, + "learning_rate": 1.2105855855855855e-07, + "loss": 0.0006, + "reward": 3.5342659950256348, + "reward_std": 0.02030755707528442, + "rewards/final_reward": 1.8798160791428677, + "rewards/mask_iou_reward": 0.9399080395714339, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5342658758163452, + "rewards/thk_ans_format_reward": 1.0, + "step": 3122, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.4479217529297, + "epoch": 10.549747048903878, + "grad_norm": 21.325236325585628, + "kl": 0.4404296875, + "learning_rate": 1.20777027027027e-07, + "loss": 0.0005, + "reward": 3.6597487926483154, + "reward_std": 0.07287406735122204, + "rewards/final_reward": 1.4427094025292027, + "rewards/mask_iou_reward": 0.7213547012646013, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6597489714622498, + "rewards/thk_ans_format_reward": 1.0, + "step": 3123, + "think_completion_length": 9.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.30209350585938, + "epoch": 10.553119730185497, + "grad_norm": 75.3481537657436, + "kl": 1.54296875, + "learning_rate": 1.204954954954955e-07, + "loss": 0.0016, + "reward": 3.491469383239746, + "reward_std": 0.054703426314517856, + "rewards/final_reward": 1.8018433956168733, + "rewards/mask_iou_reward": 0.9009216978084367, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5123026967048645, + "rewards/thk_ans_format_reward": 1.0, + "step": 3124, + "think_completion_length": 6.833333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.1041717529297, + "epoch": 10.556492411467117, + "grad_norm": 10.292393267531867, + "kl": 0.47265625, + "learning_rate": 1.2021396396396396e-07, + "loss": 0.0005, + "reward": 3.3591725826263428, + "reward_std": 0.023047425784170628, + "rewards/final_reward": 1.831021463665678, + "rewards/mask_iou_reward": 0.915510731832839, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3591725826263428, + "rewards/thk_ans_format_reward": 1.0, + "step": 3125, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.2916717529297, + "epoch": 10.559865092748735, + "grad_norm": 39.62698803266399, + "kl": 0.3984375, + "learning_rate": 1.1993243243243242e-07, + "loss": 0.0004, + "reward": 3.537477135658264, + "reward_std": 0.06482739746570587, + "rewards/final_reward": 1.289090077875543, + "rewards/mask_iou_reward": 0.6445450389377715, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5374772548675537, + "rewards/thk_ans_format_reward": 1.0, + "step": 3126, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.15626525878906, + "epoch": 10.563237774030354, + "grad_norm": 9.282324146659736, + "kl": 0.3828125, + "learning_rate": 1.196509009009009e-07, + "loss": 0.0004, + "reward": 3.5415862798690796, + "reward_std": 0.09624642692506313, + "rewards/final_reward": 1.480154818709382, + "rewards/mask_iou_reward": 0.740077409354691, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5415863990783691, + "rewards/thk_ans_format_reward": 1.0, + "step": 3127, + "think_completion_length": 7.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.15625762939453, + "epoch": 10.566610455311974, + "grad_norm": 9.707291647146798, + "kl": 0.4462890625, + "learning_rate": 1.1936936936936937e-07, + "loss": 0.0005, + "reward": 3.565924644470215, + "reward_std": 0.030608173459768295, + "rewards/final_reward": 1.7547236881395065, + "rewards/mask_iou_reward": 0.8773618440697533, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5659246444702148, + "rewards/thk_ans_format_reward": 1.0, + "step": 3128, + "think_completion_length": 8.708333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.6979217529297, + "epoch": 10.569983136593592, + "grad_norm": 18.0298815257855, + "kl": 0.5205078125, + "learning_rate": 1.1908783783783784e-07, + "loss": 0.0005, + "reward": 3.5914461612701416, + "reward_std": 0.02858224418014288, + "rewards/final_reward": 1.4344778908627398, + "rewards/mask_iou_reward": 0.7172389454313699, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5914462804794312, + "rewards/thk_ans_format_reward": 1.0, + "step": 3129, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.4166717529297, + "epoch": 10.57335581787521, + "grad_norm": 6.43292906131864, + "kl": 0.4423828125, + "learning_rate": 1.188063063063063e-07, + "loss": 0.0005, + "reward": 3.5320615768432617, + "reward_std": 0.0571563933044672, + "rewards/final_reward": 1.7172090882626136, + "rewards/mask_iou_reward": 0.8586045441313068, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5320618152618408, + "rewards/thk_ans_format_reward": 1.0, + "step": 3130, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.62500762939453, + "epoch": 10.576728499156829, + "grad_norm": 18.8093852353866, + "kl": 0.50390625, + "learning_rate": 1.1852477477477476e-07, + "loss": 0.0005, + "reward": 3.671657681465149, + "reward_std": 0.030323058366775513, + "rewards/final_reward": 1.4895206263686176, + "rewards/mask_iou_reward": 0.7447603131843088, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6716576218605042, + "rewards/thk_ans_format_reward": 1.0, + "step": 3131, + "think_completion_length": 7.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.36459350585938, + "epoch": 10.580101180438449, + "grad_norm": 16.060535556302863, + "kl": 0.4130859375, + "learning_rate": 1.1824324324324324e-07, + "loss": 0.0004, + "reward": 3.5097408294677734, + "reward_std": 0.0780508778989315, + "rewards/final_reward": 1.3697000814567954, + "rewards/mask_iou_reward": 0.6848500407283977, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5097407102584839, + "rewards/thk_ans_format_reward": 1.0, + "step": 3132, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.9791717529297, + "epoch": 10.583473861720067, + "grad_norm": 22.222040156683512, + "kl": 0.619140625, + "learning_rate": 1.1796171171171171e-07, + "loss": 0.0006, + "reward": 3.2384536266326904, + "reward_std": 0.21653933450579643, + "rewards/final_reward": 1.4502180401317724, + "rewards/mask_iou_reward": 0.7251090200658862, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2384536862373352, + "rewards/thk_ans_format_reward": 1.0, + "step": 3133, + "think_completion_length": 7.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.875, + "epoch": 10.586846543001686, + "grad_norm": 12.796673567815091, + "kl": 0.3876953125, + "learning_rate": 1.1768018018018018e-07, + "loss": 0.0004, + "reward": 3.5177581310272217, + "reward_std": 0.04781328607350588, + "rewards/final_reward": 1.688266574289527, + "rewards/mask_iou_reward": 0.8441332871447635, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.517758071422577, + "rewards/thk_ans_format_reward": 1.0, + "step": 3134, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.9791717529297, + "epoch": 10.590219224283306, + "grad_norm": 37.131481922445516, + "kl": 0.630859375, + "learning_rate": 1.1739864864864864e-07, + "loss": 0.0006, + "reward": 3.4619154930114746, + "reward_std": 0.03177159791812301, + "rewards/final_reward": 1.81192326704147, + "rewards/mask_iou_reward": 0.905961633520735, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4619154930114746, + "rewards/thk_ans_format_reward": 1.0, + "step": 3135, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.83334350585938, + "epoch": 10.593591905564924, + "grad_norm": 9.559126232506571, + "kl": 0.52734375, + "learning_rate": 1.171171171171171e-07, + "loss": 0.0005, + "reward": 3.460838556289673, + "reward_std": 0.030008337926119566, + "rewards/final_reward": 0.8600520619291397, + "rewards/mask_iou_reward": 0.43002603096456987, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4608385562896729, + "rewards/thk_ans_format_reward": 1.0, + "step": 3136, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.61458587646484, + "epoch": 10.596964586846543, + "grad_norm": 8.868938047375126, + "kl": 0.4970703125, + "learning_rate": 1.1683558558558558e-07, + "loss": 0.0005, + "reward": 3.5875900983810425, + "reward_std": 0.06950164213776588, + "rewards/final_reward": 0.9352956134800028, + "rewards/mask_iou_reward": 0.4676478067400014, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5875900983810425, + "rewards/thk_ans_format_reward": 1.0, + "step": 3137, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.34375762939453, + "epoch": 10.600337268128161, + "grad_norm": 8.631692924608684, + "kl": 0.4736328125, + "learning_rate": 1.1655405405405405e-07, + "loss": 0.0005, + "reward": 3.4805089235305786, + "reward_std": 0.03367413394153118, + "rewards/final_reward": 1.632930184323556, + "rewards/mask_iou_reward": 0.816465092161778, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4805086851119995, + "rewards/thk_ans_format_reward": 1.0, + "step": 3138, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.97917938232422, + "epoch": 10.603709949409781, + "grad_norm": 6.632640004769775, + "kl": 0.4287109375, + "learning_rate": 1.1627252252252253e-07, + "loss": 0.0004, + "reward": 3.681812882423401, + "reward_std": 0.1951524093747139, + "rewards/final_reward": 1.7462302154525413, + "rewards/mask_iou_reward": 0.8731151077262707, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.7026461362838745, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3139, + "think_completion_length": 9.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.40625762939453, + "epoch": 10.6070826306914, + "grad_norm": 13.731958002321711, + "kl": 0.4052734375, + "learning_rate": 1.1599099099099097e-07, + "loss": 0.0004, + "reward": 3.4694260358810425, + "reward_std": 0.05269887298345566, + "rewards/final_reward": 1.4713394746301809, + "rewards/mask_iou_reward": 0.7356697373150904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4694259762763977, + "rewards/thk_ans_format_reward": 1.0, + "step": 3140, + "think_completion_length": 8.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.3958396911621, + "epoch": 10.610455311973018, + "grad_norm": 10.443905821748844, + "kl": 0.546875, + "learning_rate": 1.1570945945945945e-07, + "loss": 0.0005, + "reward": 3.3069279193878174, + "reward_std": 0.3599608391523361, + "rewards/final_reward": 1.0115353345672449, + "rewards/mask_iou_reward": 0.5057676672836224, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.3381779789924622, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3141, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.55209350585938, + "epoch": 10.613827993254638, + "grad_norm": 21.849604022714967, + "kl": 0.3603515625, + "learning_rate": 1.1542792792792792e-07, + "loss": 0.0004, + "reward": 3.516273021697998, + "reward_std": 0.15535355359315872, + "rewards/final_reward": 1.7520223331517997, + "rewards/mask_iou_reward": 0.8760111665758998, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.537106454372406, + "rewards/thk_ans_format_reward": 1.0, + "step": 3142, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.87501525878906, + "epoch": 10.617200674536257, + "grad_norm": 7.274680848470261, + "kl": 0.5224609375, + "learning_rate": 1.151463963963964e-07, + "loss": 0.0005, + "reward": 3.563242197036743, + "reward_std": 0.04035542719066143, + "rewards/final_reward": 1.5929854370163006, + "rewards/mask_iou_reward": 0.7964927185081503, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5632421374320984, + "rewards/thk_ans_format_reward": 1.0, + "step": 3143, + "think_completion_length": 9.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.5520935058594, + "epoch": 10.620573355817875, + "grad_norm": 8.76195986955573, + "kl": 0.4267578125, + "learning_rate": 1.1486486486486487e-07, + "loss": 0.0006, + "reward": 3.703770875930786, + "reward_std": 0.03697956167161465, + "rewards/final_reward": 1.3946098700720109, + "rewards/mask_iou_reward": 0.6973049350360054, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7037709951400757, + "rewards/thk_ans_format_reward": 1.0, + "step": 3144, + "think_completion_length": 6.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.2916717529297, + "epoch": 10.623946037099493, + "grad_norm": 9.26128589112025, + "kl": 0.55078125, + "learning_rate": 1.1458333333333332e-07, + "loss": 0.0006, + "reward": 3.501655697822571, + "reward_std": 0.09267310053110123, + "rewards/final_reward": 1.7073892721831088, + "rewards/mask_iou_reward": 0.8536946360915544, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5016555190086365, + "rewards/thk_ans_format_reward": 1.0, + "step": 3145, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.3958396911621, + "epoch": 10.627318718381114, + "grad_norm": 20.42773432434273, + "kl": 0.560546875, + "learning_rate": 1.1430180180180179e-07, + "loss": 0.0006, + "reward": 3.547227621078491, + "reward_std": 0.09544646553695202, + "rewards/final_reward": 1.2804396471081816, + "rewards/mask_iou_reward": 0.6402198235540908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5472275614738464, + "rewards/thk_ans_format_reward": 1.0, + "step": 3146, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.40626525878906, + "epoch": 10.630691399662732, + "grad_norm": 9.38414902098105, + "kl": 0.51171875, + "learning_rate": 1.1402027027027026e-07, + "loss": 0.0005, + "reward": 3.6407686471939087, + "reward_std": 0.13431363552808762, + "rewards/final_reward": 1.5476860661383145, + "rewards/mask_iou_reward": 0.7738430330691572, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6616019010543823, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3147, + "think_completion_length": 7.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.2916717529297, + "epoch": 10.63406408094435, + "grad_norm": 16.44221325891791, + "kl": 0.3955078125, + "learning_rate": 1.1373873873873874e-07, + "loss": 0.0004, + "reward": 3.2191805839538574, + "reward_std": 0.10380381904542446, + "rewards/final_reward": 1.538585252347514, + "rewards/mask_iou_reward": 0.769292626173757, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.219180703163147, + "rewards/thk_ans_format_reward": 1.0, + "step": 3148, + "think_completion_length": 9.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.3125, + "epoch": 10.63743676222597, + "grad_norm": 15.05638678051703, + "kl": 0.48828125, + "learning_rate": 1.1345720720720721e-07, + "loss": 0.0005, + "reward": 3.862180471420288, + "reward_std": 0.011594453826546669, + "rewards/final_reward": 1.9617679351859052, + "rewards/mask_iou_reward": 0.9808839675929526, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8621803522109985, + "rewards/thk_ans_format_reward": 1.0, + "step": 3149, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.8958396911621, + "epoch": 10.640809443507589, + "grad_norm": 22.34933047326768, + "kl": 0.4013671875, + "learning_rate": 1.1317567567567566e-07, + "loss": 0.0004, + "reward": 3.4726303815841675, + "reward_std": 0.023849932476878166, + "rewards/final_reward": 1.2154385044698301, + "rewards/mask_iou_reward": 0.6077192522349151, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4726303815841675, + "rewards/thk_ans_format_reward": 1.0, + "step": 3150, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.9375, + "epoch": 10.644182124789207, + "grad_norm": 5.441981994440659, + "kl": 0.4072265625, + "learning_rate": 1.1289414414414413e-07, + "loss": 0.0004, + "reward": 3.5731762647628784, + "reward_std": 0.16858915518969297, + "rewards/final_reward": 1.7494747987237824, + "rewards/mask_iou_reward": 0.8747373993618912, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5940097570419312, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3151, + "think_completion_length": 10.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.0104217529297, + "epoch": 10.647554806070826, + "grad_norm": 8.798895286882274, + "kl": 0.654296875, + "learning_rate": 1.1261261261261261e-07, + "loss": 0.0006, + "reward": 3.7292591333389282, + "reward_std": 0.09326490294188261, + "rewards/final_reward": 1.831878290939525, + "rewards/mask_iou_reward": 0.9159391454697625, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7292590141296387, + "rewards/thk_ans_format_reward": 1.0, + "step": 3152, + "think_completion_length": 9.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 180.6979217529297, + "epoch": 10.650927487352446, + "grad_norm": 12.936342160811696, + "kl": 0.482421875, + "learning_rate": 1.1233108108108108e-07, + "loss": 0.0005, + "reward": 3.400019407272339, + "reward_std": 0.02918088808655739, + "rewards/final_reward": 1.4276408117049701, + "rewards/mask_iou_reward": 0.7138204058524851, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.400019347667694, + "rewards/thk_ans_format_reward": 1.0, + "step": 3153, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.23958587646484, + "epoch": 10.654300168634064, + "grad_norm": 45.72291023072974, + "kl": 0.49609375, + "learning_rate": 1.1204954954954954e-07, + "loss": 0.0005, + "reward": 3.5731505155563354, + "reward_std": 0.04934038035571575, + "rewards/final_reward": 1.5378148013277484, + "rewards/mask_iou_reward": 0.7689074006638742, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5731505155563354, + "rewards/thk_ans_format_reward": 1.0, + "step": 3154, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.5416717529297, + "epoch": 10.657672849915683, + "grad_norm": 13.974923581463589, + "kl": 0.455078125, + "learning_rate": 1.11768018018018e-07, + "loss": 0.0005, + "reward": 3.72958767414093, + "reward_std": 0.13264624774456024, + "rewards/final_reward": 1.822597720406296, + "rewards/mask_iou_reward": 0.911298860203148, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.7504209280014038, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3155, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.3125, + "epoch": 10.661045531197303, + "grad_norm": 20.442499355497326, + "kl": 0.4052734375, + "learning_rate": 1.1148648648648648e-07, + "loss": 0.0004, + "reward": 3.708630681037903, + "reward_std": 0.06668695248663425, + "rewards/final_reward": 1.805878203669689, + "rewards/mask_iou_reward": 0.9029391018348445, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7086305618286133, + "rewards/thk_ans_format_reward": 1.0, + "step": 3156, + "think_completion_length": 8.583333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.9166717529297, + "epoch": 10.664418212478921, + "grad_norm": 15.684658816394188, + "kl": 0.671875, + "learning_rate": 1.1120495495495495e-07, + "loss": 0.0007, + "reward": 3.311765432357788, + "reward_std": 0.09275224432349205, + "rewards/final_reward": 1.5564693854632141, + "rewards/mask_iou_reward": 0.7782346927316071, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.322182059288025, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3157, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 174.17709350585938, + "epoch": 10.66779089376054, + "grad_norm": 24.875937326115956, + "kl": 0.4521484375, + "learning_rate": 1.1092342342342342e-07, + "loss": 0.0005, + "reward": 3.5257372856140137, + "reward_std": 0.021635837852954865, + "rewards/final_reward": 1.0622315912320346, + "rewards/mask_iou_reward": 0.5311157956160173, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5257372856140137, + "rewards/thk_ans_format_reward": 1.0, + "step": 3158, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.43750762939453, + "epoch": 10.671163575042158, + "grad_norm": 48.47509905804193, + "kl": 0.466796875, + "learning_rate": 1.1064189189189189e-07, + "loss": 0.0005, + "reward": 3.70083487033844, + "reward_std": 0.017503976356238127, + "rewards/final_reward": 1.8391874891535551, + "rewards/mask_iou_reward": 0.9195937445767776, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7008347511291504, + "rewards/thk_ans_format_reward": 1.0, + "step": 3159, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.6041717529297, + "epoch": 10.674536256323778, + "grad_norm": 9.61768093844924, + "kl": 0.458984375, + "learning_rate": 1.1036036036036035e-07, + "loss": 0.0005, + "reward": 3.6513434648513794, + "reward_std": 0.03646405367180705, + "rewards/final_reward": 1.7044724619966742, + "rewards/mask_iou_reward": 0.8522362309983371, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.651343584060669, + "rewards/thk_ans_format_reward": 1.0, + "step": 3160, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.1041717529297, + "epoch": 10.677908937605396, + "grad_norm": 19.17738559099291, + "kl": 0.6015625, + "learning_rate": 1.1007882882882882e-07, + "loss": 0.0006, + "reward": 3.6847745180130005, + "reward_std": 0.03435724973678589, + "rewards/final_reward": 1.9092129905471467, + "rewards/mask_iou_reward": 0.9546064952735733, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6847743391990662, + "rewards/thk_ans_format_reward": 1.0, + "step": 3161, + "think_completion_length": 8.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.25, + "epoch": 10.681281618887015, + "grad_norm": 10.967767931797544, + "kl": 0.505859375, + "learning_rate": 1.097972972972973e-07, + "loss": 0.0005, + "reward": 3.380388021469116, + "reward_std": 0.025232050102204084, + "rewards/final_reward": 1.4506534061370655, + "rewards/mask_iou_reward": 0.7253267030685328, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3803880214691162, + "rewards/thk_ans_format_reward": 1.0, + "step": 3162, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.28125762939453, + "epoch": 10.684654300168635, + "grad_norm": 7.730518808963188, + "kl": 0.40234375, + "learning_rate": 1.0951576576576577e-07, + "loss": 0.0004, + "reward": 3.6407421827316284, + "reward_std": 0.06835777265951037, + "rewards/final_reward": 1.7174296933986817, + "rewards/mask_iou_reward": 0.8587148466993408, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.640742301940918, + "rewards/thk_ans_format_reward": 1.0, + "step": 3163, + "think_completion_length": 7.0 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.61459350585938, + "epoch": 10.688026981450253, + "grad_norm": 15.784759543118883, + "kl": 0.646484375, + "learning_rate": 1.0923423423423423e-07, + "loss": 0.0007, + "reward": 3.557579278945923, + "reward_std": 0.04200829612091184, + "rewards/final_reward": 1.5002643919718692, + "rewards/mask_iou_reward": 0.7501321959859346, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.557579219341278, + "rewards/thk_ans_format_reward": 1.0, + "step": 3164, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.72917938232422, + "epoch": 10.691399662731872, + "grad_norm": 12.029126664888803, + "kl": 0.4169921875, + "learning_rate": 1.0895270270270269e-07, + "loss": 0.0004, + "reward": 3.4169652462005615, + "reward_std": 0.07824656739830971, + "rewards/final_reward": 1.0356403614267473, + "rewards/mask_iou_reward": 0.5178201807133737, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4169652462005615, + "rewards/thk_ans_format_reward": 1.0, + "step": 3165, + "think_completion_length": 9.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.6666717529297, + "epoch": 10.69477234401349, + "grad_norm": 39.3048031368233, + "kl": 0.4921875, + "learning_rate": 1.0867117117117116e-07, + "loss": 0.0005, + "reward": 3.767988443374634, + "reward_std": 0.04218120127916336, + "rewards/final_reward": 1.7434309960952667, + "rewards/mask_iou_reward": 0.8717154980476334, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7679883241653442, + "rewards/thk_ans_format_reward": 1.0, + "step": 3166, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.5104217529297, + "epoch": 10.69814502529511, + "grad_norm": 17.339184068767512, + "kl": 0.4638671875, + "learning_rate": 1.0838963963963964e-07, + "loss": 0.0005, + "reward": 3.464288115501404, + "reward_std": 0.03953620605170727, + "rewards/final_reward": 1.3581066537637718, + "rewards/mask_iou_reward": 0.6790533268818859, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4642879366874695, + "rewards/thk_ans_format_reward": 1.0, + "step": 3167, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.2291717529297, + "epoch": 10.701517706576729, + "grad_norm": 12.181885012295425, + "kl": 0.427734375, + "learning_rate": 1.0810810810810811e-07, + "loss": 0.0004, + "reward": 3.3465646505355835, + "reward_std": 0.03928683325648308, + "rewards/final_reward": 1.4744314939278178, + "rewards/mask_iou_reward": 0.7372157469639089, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3465644121170044, + "rewards/thk_ans_format_reward": 1.0, + "step": 3168, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.61458587646484, + "epoch": 10.704890387858347, + "grad_norm": 9.592572877067397, + "kl": 0.416015625, + "learning_rate": 1.0782657657657657e-07, + "loss": 0.0004, + "reward": 3.708613395690918, + "reward_std": 0.07097115181386471, + "rewards/final_reward": 1.5417674942840909, + "rewards/mask_iou_reward": 0.7708837471420454, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7086135149002075, + "rewards/thk_ans_format_reward": 1.0, + "step": 3169, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.4479217529297, + "epoch": 10.708263069139967, + "grad_norm": 37.75207061018355, + "kl": 0.5849609375, + "learning_rate": 1.0754504504504503e-07, + "loss": 0.0006, + "reward": 3.46931254863739, + "reward_std": 0.08026197552680969, + "rewards/final_reward": 1.5688330174525815, + "rewards/mask_iou_reward": 0.7844165087262908, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4693125486373901, + "rewards/thk_ans_format_reward": 1.0, + "step": 3170, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.3333435058594, + "epoch": 10.711635750421586, + "grad_norm": 10.453179838498924, + "kl": 0.419921875, + "learning_rate": 1.072635135135135e-07, + "loss": 0.0004, + "reward": 3.502511143684387, + "reward_std": 0.03253740817308426, + "rewards/final_reward": 1.1187443726913366, + "rewards/mask_iou_reward": 0.5593721863456683, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5025108456611633, + "rewards/thk_ans_format_reward": 1.0, + "step": 3171, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.51042938232422, + "epoch": 10.715008431703204, + "grad_norm": 19.590390528265072, + "kl": 0.412109375, + "learning_rate": 1.0698198198198198e-07, + "loss": 0.0004, + "reward": 3.530317544937134, + "reward_std": 0.06995473802089691, + "rewards/final_reward": 1.922661907681869, + "rewards/mask_iou_reward": 0.9613309538409345, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.530317783355713, + "rewards/thk_ans_format_reward": 1.0, + "step": 3172, + "think_completion_length": 9.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.93750762939453, + "epoch": 10.718381112984822, + "grad_norm": 7.326455739047058, + "kl": 0.494140625, + "learning_rate": 1.0670045045045045e-07, + "loss": 0.0005, + "reward": 3.727341413497925, + "reward_std": 0.05604278179816902, + "rewards/final_reward": 1.7987221130595397, + "rewards/mask_iou_reward": 0.8993610565297698, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.72734135389328, + "rewards/thk_ans_format_reward": 1.0, + "step": 3173, + "think_completion_length": 8.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.1770935058594, + "epoch": 10.721753794266442, + "grad_norm": 9.937929320228857, + "kl": 0.802734375, + "learning_rate": 1.0641891891891891e-07, + "loss": 0.0008, + "reward": 3.434737205505371, + "reward_std": 0.12638374976813793, + "rewards/final_reward": 1.7686646256373062, + "rewards/mask_iou_reward": 0.8843323128186531, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4347370266914368, + "rewards/thk_ans_format_reward": 1.0, + "step": 3174, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.39584350585938, + "epoch": 10.72512647554806, + "grad_norm": 8.874852737333692, + "kl": 0.3779296875, + "learning_rate": 1.0613738738738738e-07, + "loss": 0.0004, + "reward": 3.7087361812591553, + "reward_std": 0.018423269502818584, + "rewards/final_reward": 1.6229624286472206, + "rewards/mask_iou_reward": 0.8114812143236103, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7087361812591553, + "rewards/thk_ans_format_reward": 1.0, + "step": 3175, + "think_completion_length": 8.583333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.03125, + "epoch": 10.72849915682968, + "grad_norm": 21.65028430383011, + "kl": 0.4404296875, + "learning_rate": 1.0585585585585585e-07, + "loss": 0.0006, + "reward": 3.6942501068115234, + "reward_std": 0.03157848212867975, + "rewards/final_reward": 1.9245843890419088, + "rewards/mask_iou_reward": 0.9622921945209544, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6942499279975891, + "rewards/thk_ans_format_reward": 1.0, + "step": 3176, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.34375762939453, + "epoch": 10.7318718381113, + "grad_norm": 19.97482282089851, + "kl": 0.53125, + "learning_rate": 1.0557432432432432e-07, + "loss": 0.0006, + "reward": 3.7493603229522705, + "reward_std": 0.030578995821997523, + "rewards/final_reward": 1.8926272902165633, + "rewards/mask_iou_reward": 0.9463136451082816, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7493602633476257, + "rewards/thk_ans_format_reward": 1.0, + "step": 3177, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.27083587646484, + "epoch": 10.735244519392918, + "grad_norm": 7.7444598309142565, + "kl": 0.5029296875, + "learning_rate": 1.0529279279279278e-07, + "loss": 0.0005, + "reward": 3.5413291454315186, + "reward_std": 0.029289917089045048, + "rewards/final_reward": 0.9951512429044618, + "rewards/mask_iou_reward": 0.4975756214522309, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5413289666175842, + "rewards/thk_ans_format_reward": 1.0, + "step": 3178, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.7395935058594, + "epoch": 10.738617200674536, + "grad_norm": 6.478442215378928, + "kl": 0.3828125, + "learning_rate": 1.0501126126126126e-07, + "loss": 0.0004, + "reward": 3.5706183910369873, + "reward_std": 0.05361626110970974, + "rewards/final_reward": 1.269704174686019, + "rewards/mask_iou_reward": 0.6348520873430095, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5706182718276978, + "rewards/thk_ans_format_reward": 1.0, + "step": 3179, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.4479217529297, + "epoch": 10.741989881956155, + "grad_norm": 16.661016023497513, + "kl": 0.4423828125, + "learning_rate": 1.0472972972972972e-07, + "loss": 0.0005, + "reward": 3.549037218093872, + "reward_std": 0.13535232981666923, + "rewards/final_reward": 1.5239766400728418, + "rewards/mask_iou_reward": 0.7619883200364209, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5698702931404114, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3180, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.70834350585938, + "epoch": 10.745362563237775, + "grad_norm": 18.01798808332277, + "kl": 0.423828125, + "learning_rate": 1.0444819819819819e-07, + "loss": 0.0004, + "reward": 3.6769832372665405, + "reward_std": 0.030254771932959557, + "rewards/final_reward": 1.7641558989696924, + "rewards/mask_iou_reward": 0.8820779494848462, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6769834756851196, + "rewards/thk_ans_format_reward": 1.0, + "step": 3181, + "think_completion_length": 8.541666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.0520935058594, + "epoch": 10.748735244519393, + "grad_norm": 9.840387283943757, + "kl": 0.3984375, + "learning_rate": 1.0416666666666667e-07, + "loss": 0.0004, + "reward": 3.1989909410476685, + "reward_std": 0.2475245175883174, + "rewards/final_reward": 0.6654427980785749, + "rewards/mask_iou_reward": 0.33272139903928744, + "rewards/sam_format_reward": 0.96875, + "rewards/sam_reward_func_ultra": 1.2614907622337341, + "rewards/thk_ans_format_reward": 0.96875, + "step": 3182, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.875, + "epoch": 10.752107925801011, + "grad_norm": 10.436477239154478, + "kl": 0.416015625, + "learning_rate": 1.0388513513513513e-07, + "loss": 0.0004, + "reward": 3.53597092628479, + "reward_std": 0.055690947920084, + "rewards/final_reward": 1.8742546162403326, + "rewards/mask_iou_reward": 0.9371273081201663, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.53597092628479, + "rewards/thk_ans_format_reward": 1.0, + "step": 3183, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.3854217529297, + "epoch": 10.75548060708263, + "grad_norm": 12.405938651548457, + "kl": 0.4375, + "learning_rate": 1.036036036036036e-07, + "loss": 0.0005, + "reward": 3.2055702209472656, + "reward_std": 0.20975297689437866, + "rewards/final_reward": 1.6398424581666908, + "rewards/mask_iou_reward": 0.8199212290833454, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.226403534412384, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3184, + "think_completion_length": 7.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.37501525878906, + "epoch": 10.75885328836425, + "grad_norm": 11.512690924025932, + "kl": 0.4453125, + "learning_rate": 1.0332207207207206e-07, + "loss": 0.0005, + "reward": 3.782678008079529, + "reward_std": 0.040368370711803436, + "rewards/final_reward": 1.7962749476994122, + "rewards/mask_iou_reward": 0.8981374738497061, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7826780080795288, + "rewards/thk_ans_format_reward": 1.0, + "step": 3185, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.61459350585938, + "epoch": 10.762225969645868, + "grad_norm": 9.533661368134151, + "kl": 0.6845703125, + "learning_rate": 1.0304054054054054e-07, + "loss": 0.0007, + "reward": 3.674705743789673, + "reward_std": 0.06669612042605877, + "rewards/final_reward": 1.4661596776198316, + "rewards/mask_iou_reward": 0.7330798388099158, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6747060418128967, + "rewards/thk_ans_format_reward": 1.0, + "step": 3186, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 183.8854217529297, + "epoch": 10.765598650927487, + "grad_norm": 21.614612511269403, + "kl": 0.673828125, + "learning_rate": 1.0275900900900901e-07, + "loss": 0.0007, + "reward": 3.7622073888778687, + "reward_std": 0.008181184297427535, + "rewards/final_reward": 1.815139171588172, + "rewards/mask_iou_reward": 0.907569585794086, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7622073292732239, + "rewards/thk_ans_format_reward": 1.0, + "step": 3187, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.0416793823242, + "epoch": 10.768971332209107, + "grad_norm": 12.895675518609572, + "kl": 0.423828125, + "learning_rate": 1.0247747747747747e-07, + "loss": 0.0004, + "reward": 3.4585185050964355, + "reward_std": 0.22953240387141705, + "rewards/final_reward": 1.7792611975583754, + "rewards/mask_iou_reward": 0.8896305987791877, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5001851320266724, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 3188, + "think_completion_length": 7.166666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.3229217529297, + "epoch": 10.772344013490725, + "grad_norm": 10.760076478630971, + "kl": 0.5849609375, + "learning_rate": 1.0219594594594594e-07, + "loss": 0.0006, + "reward": 3.7292808294296265, + "reward_std": 0.03788667544722557, + "rewards/final_reward": 1.426709951626853, + "rewards/mask_iou_reward": 0.7133549758134266, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.729280948638916, + "rewards/thk_ans_format_reward": 1.0, + "step": 3189, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.89584350585938, + "epoch": 10.775716694772344, + "grad_norm": 6.787950901772287, + "kl": 0.4140625, + "learning_rate": 1.019144144144144e-07, + "loss": 0.0004, + "reward": 3.651894688606262, + "reward_std": 0.0521730100736022, + "rewards/final_reward": 1.769749352345033, + "rewards/mask_iou_reward": 0.8848746761725165, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6518943905830383, + "rewards/thk_ans_format_reward": 1.0, + "step": 3190, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.70834350585938, + "epoch": 10.779089376053962, + "grad_norm": 11.302821962201145, + "kl": 0.3984375, + "learning_rate": 1.0163288288288288e-07, + "loss": 0.0004, + "reward": 3.6362640857696533, + "reward_std": 0.030033452436327934, + "rewards/final_reward": 1.6840505229700709, + "rewards/mask_iou_reward": 0.8420252614850354, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.636264145374298, + "rewards/thk_ans_format_reward": 1.0, + "step": 3191, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.5416717529297, + "epoch": 10.782462057335582, + "grad_norm": 15.81904763170142, + "kl": 0.55078125, + "learning_rate": 1.0135135135135135e-07, + "loss": 0.0006, + "reward": 3.6694321632385254, + "reward_std": 0.06063992343842983, + "rewards/final_reward": 1.3317782285626767, + "rewards/mask_iou_reward": 0.6658891142813383, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.669431984424591, + "rewards/thk_ans_format_reward": 1.0, + "step": 3192, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.61458587646484, + "epoch": 10.7858347386172, + "grad_norm": 7.475758488939929, + "kl": 0.3955078125, + "learning_rate": 1.0106981981981981e-07, + "loss": 0.0004, + "reward": 3.729905366897583, + "reward_std": 0.06045639142394066, + "rewards/final_reward": 1.7604984407554976, + "rewards/mask_iou_reward": 0.8802492203777488, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7299054265022278, + "rewards/thk_ans_format_reward": 1.0, + "step": 3193, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.33334350585938, + "epoch": 10.789207419898819, + "grad_norm": 11.706376543671285, + "kl": 0.47265625, + "learning_rate": 1.0078828828828829e-07, + "loss": 0.0005, + "reward": 3.769546627998352, + "reward_std": 0.03965230449102819, + "rewards/final_reward": 1.83340238694567, + "rewards/mask_iou_reward": 0.916701193472835, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7695463299751282, + "rewards/thk_ans_format_reward": 1.0, + "step": 3194, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 146.89583587646484, + "epoch": 10.79258010118044, + "grad_norm": 17.883562068634024, + "kl": 0.5537109375, + "learning_rate": 1.0050675675675675e-07, + "loss": 0.0006, + "reward": 3.3498950004577637, + "reward_std": 0.05078345909714699, + "rewards/final_reward": 1.6377353144867908, + "rewards/mask_iou_reward": 0.8188676572433954, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3498948812484741, + "rewards/thk_ans_format_reward": 1.0, + "step": 3195, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.5416717529297, + "epoch": 10.795952782462058, + "grad_norm": 12.160900753006624, + "kl": 0.45703125, + "learning_rate": 1.0022522522522522e-07, + "loss": 0.0005, + "reward": 3.6239691972732544, + "reward_std": 0.04034661874175072, + "rewards/final_reward": 1.172895157250723, + "rewards/mask_iou_reward": 0.5864475786253615, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6239690780639648, + "rewards/thk_ans_format_reward": 1.0, + "step": 3196, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.98958587646484, + "epoch": 10.799325463743676, + "grad_norm": 22.684056229809475, + "kl": 0.501953125, + "learning_rate": 9.99436936936937e-08, + "loss": 0.0005, + "reward": 3.7545530796051025, + "reward_std": 0.01758536882698536, + "rewards/final_reward": 1.8126713424449878, + "rewards/mask_iou_reward": 0.9063356712224939, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.754552960395813, + "rewards/thk_ans_format_reward": 1.0, + "step": 3197, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.9166717529297, + "epoch": 10.802698145025294, + "grad_norm": 6.930106395906022, + "kl": 0.408203125, + "learning_rate": 9.966216216216216e-08, + "loss": 0.0004, + "reward": 3.507596015930176, + "reward_std": 0.03860746696591377, + "rewards/final_reward": 1.4387048334123578, + "rewards/mask_iou_reward": 0.7193524167061789, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5075958967208862, + "rewards/thk_ans_format_reward": 1.0, + "step": 3198, + "think_completion_length": 8.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.5, + "epoch": 10.806070826306915, + "grad_norm": 34.761760766300156, + "kl": 0.587890625, + "learning_rate": 9.938063063063063e-08, + "loss": 0.0007, + "reward": 3.4861974716186523, + "reward_std": 0.04883183538913727, + "rewards/final_reward": 1.9181291083170278, + "rewards/mask_iou_reward": 0.9590645541585139, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4861974716186523, + "rewards/thk_ans_format_reward": 1.0, + "step": 3199, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.0729217529297, + "epoch": 10.809443507588533, + "grad_norm": 6.418198060708881, + "kl": 0.462890625, + "learning_rate": 9.909909909909909e-08, + "loss": 0.0005, + "reward": 3.2320475578308105, + "reward_std": 0.06442866101861, + "rewards/final_reward": 1.0330291851997446, + "rewards/mask_iou_reward": 0.5165145925998723, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2320473790168762, + "rewards/thk_ans_format_reward": 1.0, + "step": 3200, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.2604217529297, + "epoch": 10.812816188870151, + "grad_norm": 6.785660820670892, + "kl": 0.443359375, + "learning_rate": 9.881756756756756e-08, + "loss": 0.0006, + "reward": 3.643518805503845, + "reward_std": 0.019335764925926924, + "rewards/final_reward": 1.7463516779887074, + "rewards/mask_iou_reward": 0.8731758389943537, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6435189247131348, + "rewards/thk_ans_format_reward": 1.0, + "step": 3201, + "think_completion_length": 7.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5312614440918, + "epoch": 10.816188870151771, + "grad_norm": 33.51514881034473, + "kl": 0.435546875, + "learning_rate": 9.853603603603604e-08, + "loss": 0.0004, + "reward": 3.4961583614349365, + "reward_std": 0.024693522602319717, + "rewards/final_reward": 0.7044322796740056, + "rewards/mask_iou_reward": 0.3522161398370028, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4961583018302917, + "rewards/thk_ans_format_reward": 1.0, + "step": 3202, + "think_completion_length": 9.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.06250762939453, + "epoch": 10.81956155143339, + "grad_norm": 14.447403026749459, + "kl": 0.53125, + "learning_rate": 9.82545045045045e-08, + "loss": 0.0005, + "reward": 3.3684170246124268, + "reward_std": 0.035486179403960705, + "rewards/final_reward": 1.9557854709075924, + "rewards/mask_iou_reward": 0.9778927354537962, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3684170246124268, + "rewards/thk_ans_format_reward": 1.0, + "step": 3203, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.4479217529297, + "epoch": 10.822934232715008, + "grad_norm": 11.104553870128184, + "kl": 0.44921875, + "learning_rate": 9.797297297297297e-08, + "loss": 0.0005, + "reward": 3.383490204811096, + "reward_std": 0.04366124048829079, + "rewards/final_reward": 1.634455664503657, + "rewards/mask_iou_reward": 0.8172278322518285, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3834902048110962, + "rewards/thk_ans_format_reward": 1.0, + "step": 3204, + "think_completion_length": 8.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.3229217529297, + "epoch": 10.826306913996627, + "grad_norm": 15.877695572152604, + "kl": 0.423828125, + "learning_rate": 9.769144144144143e-08, + "loss": 0.0006, + "reward": 3.6447088718414307, + "reward_std": 0.0836187805980444, + "rewards/final_reward": 1.851138693242559, + "rewards/mask_iou_reward": 0.9255693466212795, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6447087526321411, + "rewards/thk_ans_format_reward": 1.0, + "step": 3205, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.89583587646484, + "epoch": 10.829679595278247, + "grad_norm": 39.2943027631783, + "kl": 0.4541015625, + "learning_rate": 9.740990990990991e-08, + "loss": 0.0005, + "reward": 3.7319670915603638, + "reward_std": 0.07651904597878456, + "rewards/final_reward": 1.3600613865001712, + "rewards/mask_iou_reward": 0.6800306932500856, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7319666743278503, + "rewards/thk_ans_format_reward": 1.0, + "step": 3206, + "think_completion_length": 8.541666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.67708587646484, + "epoch": 10.833052276559865, + "grad_norm": 11.942067417307154, + "kl": 0.4384765625, + "learning_rate": 9.712837837837837e-08, + "loss": 0.0004, + "reward": 3.27489697933197, + "reward_std": 0.05321320705115795, + "rewards/final_reward": 1.5223945434338848, + "rewards/mask_iou_reward": 0.7611972717169424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2748970985412598, + "rewards/thk_ans_format_reward": 1.0, + "step": 3207, + "think_completion_length": 7.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.5729293823242, + "epoch": 10.836424957841484, + "grad_norm": 9.369178782769083, + "kl": 0.4482421875, + "learning_rate": 9.684684684684684e-08, + "loss": 0.0005, + "reward": 3.5060927867889404, + "reward_std": 0.09241212159395218, + "rewards/final_reward": 1.0254573640507907, + "rewards/mask_iou_reward": 0.5127286820253953, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.50609290599823, + "rewards/thk_ans_format_reward": 1.0, + "step": 3208, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.81250762939453, + "epoch": 10.839797639123104, + "grad_norm": 6.76604175667331, + "kl": 0.3876953125, + "learning_rate": 9.656531531531532e-08, + "loss": 0.0004, + "reward": 3.7473455667495728, + "reward_std": 0.021959856152534485, + "rewards/final_reward": 1.7867562628767018, + "rewards/mask_iou_reward": 0.8933781314383509, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7473454475402832, + "rewards/thk_ans_format_reward": 1.0, + "step": 3209, + "think_completion_length": 10.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.3541717529297, + "epoch": 10.843170320404722, + "grad_norm": 7.365789573300768, + "kl": 0.3984375, + "learning_rate": 9.628378378378378e-08, + "loss": 0.0004, + "reward": 3.7011457681655884, + "reward_std": 0.03818492125719786, + "rewards/final_reward": 1.1432885171328353, + "rewards/mask_iou_reward": 0.5716442585664177, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7011457085609436, + "rewards/thk_ans_format_reward": 1.0, + "step": 3210, + "think_completion_length": 7.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.85417938232422, + "epoch": 10.84654300168634, + "grad_norm": 13.904530784966804, + "kl": 0.4072265625, + "learning_rate": 9.600225225225225e-08, + "loss": 0.0004, + "reward": 3.4692060947418213, + "reward_std": 0.018061704467982054, + "rewards/final_reward": 0.9882531894496005, + "rewards/mask_iou_reward": 0.49412659472480025, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4692060947418213, + "rewards/thk_ans_format_reward": 1.0, + "step": 3211, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.5416717529297, + "epoch": 10.849915682967959, + "grad_norm": 70.01853729733192, + "kl": 0.412109375, + "learning_rate": 9.572072072072071e-08, + "loss": 0.0004, + "reward": 3.4471957683563232, + "reward_std": 0.09547746926546097, + "rewards/final_reward": 1.5687760234479442, + "rewards/mask_iou_reward": 0.7843880117239721, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4471957683563232, + "rewards/thk_ans_format_reward": 1.0, + "step": 3212, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.45834350585938, + "epoch": 10.853288364249579, + "grad_norm": 38.89937225809673, + "kl": 0.419921875, + "learning_rate": 9.543918918918919e-08, + "loss": 0.0004, + "reward": 3.6407090425491333, + "reward_std": 0.12232569698244333, + "rewards/final_reward": 1.54928817900061, + "rewards/mask_iou_reward": 0.774644089500305, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6511258482933044, + "rewards/thk_ans_format_reward": 1.0, + "step": 3213, + "think_completion_length": 8.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.7604217529297, + "epoch": 10.856661045531197, + "grad_norm": 13.272328594438857, + "kl": 0.560546875, + "learning_rate": 9.515765765765766e-08, + "loss": 0.0006, + "reward": 3.628928542137146, + "reward_std": 0.013613590504974127, + "rewards/final_reward": 1.6047447176640486, + "rewards/mask_iou_reward": 0.8023723588320243, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6289284229278564, + "rewards/thk_ans_format_reward": 1.0, + "step": 3214, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 155.6979217529297, + "epoch": 10.860033726812816, + "grad_norm": 10.236418470637137, + "kl": 0.4541015625, + "learning_rate": 9.487612612612612e-08, + "loss": 0.0005, + "reward": 3.5637048482894897, + "reward_std": 0.03039349429309368, + "rewards/final_reward": 1.8540802816023476, + "rewards/mask_iou_reward": 0.9270401408011738, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5637049674987793, + "rewards/thk_ans_format_reward": 1.0, + "step": 3215, + "think_completion_length": 9.083333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.23958587646484, + "epoch": 10.863406408094434, + "grad_norm": 28.92028211701786, + "kl": 0.427734375, + "learning_rate": 9.45945945945946e-08, + "loss": 0.0004, + "reward": 3.6250351667404175, + "reward_std": 0.07762327417731285, + "rewards/final_reward": 1.8546084172688548, + "rewards/mask_iou_reward": 0.9273042086344274, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6250353455543518, + "rewards/thk_ans_format_reward": 1.0, + "step": 3216, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.31250762939453, + "epoch": 10.866779089376054, + "grad_norm": 8.611578128472187, + "kl": 0.4150390625, + "learning_rate": 9.431306306306305e-08, + "loss": 0.0004, + "reward": 3.6757365465164185, + "reward_std": 0.05385137163102627, + "rewards/final_reward": 1.6966801983814146, + "rewards/mask_iou_reward": 0.8483400991907073, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.675736427307129, + "rewards/thk_ans_format_reward": 1.0, + "step": 3217, + "think_completion_length": 9.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.90626525878906, + "epoch": 10.870151770657673, + "grad_norm": 22.430200048234568, + "kl": 0.41796875, + "learning_rate": 9.403153153153153e-08, + "loss": 0.0004, + "reward": 3.767126202583313, + "reward_std": 0.021601593121886253, + "rewards/final_reward": 1.73027651916644, + "rewards/mask_iou_reward": 0.86513825958322, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7671259045600891, + "rewards/thk_ans_format_reward": 1.0, + "step": 3218, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.3229217529297, + "epoch": 10.873524451939291, + "grad_norm": 11.13731742848097, + "kl": 0.4287109375, + "learning_rate": 9.375e-08, + "loss": 0.0004, + "reward": 3.4959752559661865, + "reward_std": 0.08527533710002899, + "rewards/final_reward": 1.5524004259183637, + "rewards/mask_iou_reward": 0.7762002129591818, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4959751963615417, + "rewards/thk_ans_format_reward": 1.0, + "step": 3219, + "think_completion_length": 6.958333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.56250762939453, + "epoch": 10.876897133220911, + "grad_norm": 16.294777386618534, + "kl": 0.5146484375, + "learning_rate": 9.346846846846846e-08, + "loss": 0.0005, + "reward": 3.5129069089889526, + "reward_std": 0.16785257682204247, + "rewards/final_reward": 1.2444732205540479, + "rewards/mask_iou_reward": 0.6222366102770239, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.5545734763145447, + "rewards/thk_ans_format_reward": 0.9791666865348816, + "step": 3220, + "think_completion_length": 9.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.46875762939453, + "epoch": 10.88026981450253, + "grad_norm": 15.48641745142567, + "kl": 0.3671875, + "learning_rate": 9.318693693693694e-08, + "loss": 0.0004, + "reward": 3.4492322206497192, + "reward_std": 0.009585548657923937, + "rewards/final_reward": 1.4472096435305972, + "rewards/mask_iou_reward": 0.7236048217652986, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4492321610450745, + "rewards/thk_ans_format_reward": 1.0, + "step": 3221, + "think_completion_length": 7.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.0729293823242, + "epoch": 10.883642495784148, + "grad_norm": 12.46701648794007, + "kl": 0.431640625, + "learning_rate": 9.29054054054054e-08, + "loss": 0.0004, + "reward": 3.465346574783325, + "reward_std": 0.09596022218465805, + "rewards/final_reward": 1.506819034084586, + "rewards/mask_iou_reward": 0.753409517042293, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.46534663438797, + "rewards/thk_ans_format_reward": 1.0, + "step": 3222, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.61458587646484, + "epoch": 10.887015177065766, + "grad_norm": 9.487352623402131, + "kl": 0.357421875, + "learning_rate": 9.262387387387387e-08, + "loss": 0.0004, + "reward": 3.685336470603943, + "reward_std": 0.048621442168951035, + "rewards/final_reward": 1.708998655660046, + "rewards/mask_iou_reward": 0.854499327830023, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6853366494178772, + "rewards/thk_ans_format_reward": 1.0, + "step": 3223, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.15625, + "epoch": 10.890387858347387, + "grad_norm": 29.683667017829382, + "kl": 0.4052734375, + "learning_rate": 9.234234234234233e-08, + "loss": 0.0004, + "reward": 3.2169747352600098, + "reward_std": 0.06005913391709328, + "rewards/final_reward": 1.1622017621002811, + "rewards/mask_iou_reward": 0.5811008810501406, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.216974675655365, + "rewards/thk_ans_format_reward": 1.0, + "step": 3224, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.8854217529297, + "epoch": 10.893760539629005, + "grad_norm": 9.668247469756977, + "kl": 0.4580078125, + "learning_rate": 9.20608108108108e-08, + "loss": 0.0005, + "reward": 3.6324193477630615, + "reward_std": 0.039188480004668236, + "rewards/final_reward": 1.5502014532756223, + "rewards/mask_iou_reward": 0.7751007266378112, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6324193477630615, + "rewards/thk_ans_format_reward": 1.0, + "step": 3225, + "think_completion_length": 8.708333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0729217529297, + "epoch": 10.897133220910623, + "grad_norm": 15.305182881846388, + "kl": 0.88671875, + "learning_rate": 9.177927927927928e-08, + "loss": 0.0009, + "reward": 3.5677725076675415, + "reward_std": 0.042133665177971125, + "rewards/final_reward": 1.9255321917009562, + "rewards/mask_iou_reward": 0.9627660958504781, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5677724480628967, + "rewards/thk_ans_format_reward": 1.0, + "step": 3226, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.2604293823242, + "epoch": 10.900505902192243, + "grad_norm": 8.05123745901817, + "kl": 0.43359375, + "learning_rate": 9.149774774774774e-08, + "loss": 0.0004, + "reward": 3.1885026693344116, + "reward_std": 0.04743565432727337, + "rewards/final_reward": 0.5120157429678219, + "rewards/mask_iou_reward": 0.25600787148391096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.1885027289390564, + "rewards/thk_ans_format_reward": 1.0, + "step": 3227, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.875, + "epoch": 10.903878583473862, + "grad_norm": 19.24892607651357, + "kl": 0.5078125, + "learning_rate": 9.121621621621621e-08, + "loss": 0.0005, + "reward": 3.751962900161743, + "reward_std": 0.01671873265877366, + "rewards/final_reward": 1.7371263028198993, + "rewards/mask_iou_reward": 0.8685631514099497, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.751962959766388, + "rewards/thk_ans_format_reward": 1.0, + "step": 3228, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.55209350585938, + "epoch": 10.90725126475548, + "grad_norm": 9.79467029988201, + "kl": 0.486328125, + "learning_rate": 9.093468468468468e-08, + "loss": 0.0005, + "reward": 3.695330023765564, + "reward_std": 0.07344387657940388, + "rewards/final_reward": 1.7112524455751048, + "rewards/mask_iou_reward": 0.8556262227875524, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.695330023765564, + "rewards/thk_ans_format_reward": 1.0, + "step": 3229, + "think_completion_length": 7.666666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.6145935058594, + "epoch": 10.910623946037099, + "grad_norm": 13.314238932481985, + "kl": 0.4599609375, + "learning_rate": 9.065315315315315e-08, + "loss": 0.0005, + "reward": 3.8808305263519287, + "reward_std": 0.013991189189255238, + "rewards/final_reward": 1.9198786636199257, + "rewards/mask_iou_reward": 0.9599393318099628, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8808305263519287, + "rewards/thk_ans_format_reward": 1.0, + "step": 3230, + "think_completion_length": 8.416666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.09376525878906, + "epoch": 10.913996627318719, + "grad_norm": 7.402782423189984, + "kl": 0.462890625, + "learning_rate": 9.037162162162162e-08, + "loss": 0.0005, + "reward": 2.9830663204193115, + "reward_std": 0.13797349110245705, + "rewards/final_reward": 0.8877441586400034, + "rewards/mask_iou_reward": 0.4438720793200017, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 0.98306605219841, + "rewards/thk_ans_format_reward": 1.0, + "step": 3231, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.4166717529297, + "epoch": 10.917369308600337, + "grad_norm": 8.39626969225335, + "kl": 0.49609375, + "learning_rate": 9.009009009009008e-08, + "loss": 0.0005, + "reward": 3.6167478561401367, + "reward_std": 0.026893689297139645, + "rewards/final_reward": 1.6967432154165967, + "rewards/mask_iou_reward": 0.8483716077082983, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6167477369308472, + "rewards/thk_ans_format_reward": 1.0, + "step": 3232, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.6666717529297, + "epoch": 10.920741989881956, + "grad_norm": 11.167540183524496, + "kl": 0.443359375, + "learning_rate": 8.980855855855856e-08, + "loss": 0.0004, + "reward": 3.337292790412903, + "reward_std": 0.04990806803107262, + "rewards/final_reward": 1.5319708460289454, + "rewards/mask_iou_reward": 0.7659854230144727, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3372925519943237, + "rewards/thk_ans_format_reward": 1.0, + "step": 3233, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 163.3229217529297, + "epoch": 10.924114671163576, + "grad_norm": 9.234633088741644, + "kl": 0.4169921875, + "learning_rate": 8.952702702702702e-08, + "loss": 0.0004, + "reward": 3.564087986946106, + "reward_std": 0.05904023256152868, + "rewards/final_reward": 1.6774216256025858, + "rewards/mask_iou_reward": 0.8387108128012929, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5640875697135925, + "rewards/thk_ans_format_reward": 1.0, + "step": 3234, + "think_completion_length": 7.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.93750762939453, + "epoch": 10.927487352445194, + "grad_norm": 22.966644031125828, + "kl": 0.4521484375, + "learning_rate": 8.924549549549549e-08, + "loss": 0.0005, + "reward": 3.5395623445510864, + "reward_std": 0.012699170969426632, + "rewards/final_reward": 1.8294896969313919, + "rewards/mask_iou_reward": 0.9147448484656959, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5395622253417969, + "rewards/thk_ans_format_reward": 1.0, + "step": 3235, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.5729217529297, + "epoch": 10.930860033726812, + "grad_norm": 10.42284799278542, + "kl": 0.369140625, + "learning_rate": 8.896396396396395e-08, + "loss": 0.0004, + "reward": 3.5979576110839844, + "reward_std": 0.008746222592890263, + "rewards/final_reward": 1.8873318631194849, + "rewards/mask_iou_reward": 0.9436659315597424, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.59795743227005, + "rewards/thk_ans_format_reward": 1.0, + "step": 3236, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.9479217529297, + "epoch": 10.93423271500843, + "grad_norm": 8.054077747626835, + "kl": 0.4912109375, + "learning_rate": 8.868243243243243e-08, + "loss": 0.0005, + "reward": 3.634762167930603, + "reward_std": 0.05479666404426098, + "rewards/final_reward": 1.427444284547413, + "rewards/mask_iou_reward": 0.7137221422737065, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6347622275352478, + "rewards/thk_ans_format_reward": 1.0, + "step": 3237, + "think_completion_length": 7.791666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.46875762939453, + "epoch": 10.937605396290051, + "grad_norm": 11.689568398021407, + "kl": 0.4052734375, + "learning_rate": 8.84009009009009e-08, + "loss": 0.0004, + "reward": 3.4657106399536133, + "reward_std": 0.06903432868421078, + "rewards/final_reward": 1.5770540530338935, + "rewards/mask_iou_reward": 0.7885270265169467, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4657106399536133, + "rewards/thk_ans_format_reward": 1.0, + "step": 3238, + "think_completion_length": 6.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.21875762939453, + "epoch": 10.94097807757167, + "grad_norm": 7.9066782689942805, + "kl": 0.3916015625, + "learning_rate": 8.811936936936936e-08, + "loss": 0.0004, + "reward": 3.693625569343567, + "reward_std": 0.02968922909349203, + "rewards/final_reward": 1.8096445212847971, + "rewards/mask_iou_reward": 0.9048222606423986, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6936253905296326, + "rewards/thk_ans_format_reward": 1.0, + "step": 3239, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.3229217529297, + "epoch": 10.944350758853288, + "grad_norm": 5.704007915875586, + "kl": 0.4609375, + "learning_rate": 8.783783783783784e-08, + "loss": 0.0005, + "reward": 3.5016400814056396, + "reward_std": 0.01643510302528739, + "rewards/final_reward": 1.6616097302980894, + "rewards/mask_iou_reward": 0.8308048651490447, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5016400814056396, + "rewards/thk_ans_format_reward": 1.0, + "step": 3240, + "think_completion_length": 8.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 171.6354217529297, + "epoch": 10.947723440134908, + "grad_norm": 10.243122206364419, + "kl": 0.484375, + "learning_rate": 8.75563063063063e-08, + "loss": 0.0005, + "reward": 3.5590054988861084, + "reward_std": 0.15231713093817234, + "rewards/final_reward": 1.5732570087525417, + "rewards/mask_iou_reward": 0.7866285043762709, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5590054392814636, + "rewards/thk_ans_format_reward": 1.0, + "step": 3241, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 162.8229217529297, + "epoch": 10.951096121416526, + "grad_norm": 19.269768147583175, + "kl": 0.4482421875, + "learning_rate": 8.727477477477477e-08, + "loss": 0.0005, + "reward": 3.5184816122055054, + "reward_std": 0.03059364575892687, + "rewards/final_reward": 1.3453870798808751, + "rewards/mask_iou_reward": 0.6726935399404376, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.518481731414795, + "rewards/thk_ans_format_reward": 1.0, + "step": 3242, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.37500762939453, + "epoch": 10.954468802698145, + "grad_norm": 10.50987455275937, + "kl": 0.4052734375, + "learning_rate": 8.699324324324324e-08, + "loss": 0.0004, + "reward": 3.3991010189056396, + "reward_std": 0.04655653005465865, + "rewards/final_reward": 1.1448598582338887, + "rewards/mask_iou_reward": 0.5724299291169443, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3991011381149292, + "rewards/thk_ans_format_reward": 1.0, + "step": 3243, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.83333587646484, + "epoch": 10.957841483979763, + "grad_norm": 45.08296764599884, + "kl": 0.3984375, + "learning_rate": 8.67117117117117e-08, + "loss": 0.0005, + "reward": 3.770294427871704, + "reward_std": 0.01194569538347423, + "rewards/final_reward": 1.8231030989413408, + "rewards/mask_iou_reward": 0.9115515494706704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7702943086624146, + "rewards/thk_ans_format_reward": 1.0, + "step": 3244, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.125, + "epoch": 10.961214165261383, + "grad_norm": 8.5253164523504, + "kl": 0.376953125, + "learning_rate": 8.643018018018018e-08, + "loss": 0.0004, + "reward": 3.5307878255844116, + "reward_std": 0.09277350455522537, + "rewards/final_reward": 1.573530974288559, + "rewards/mask_iou_reward": 0.7867654871442795, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5307878851890564, + "rewards/thk_ans_format_reward": 1.0, + "step": 3245, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.82292938232422, + "epoch": 10.964586846543002, + "grad_norm": 11.467629355388848, + "kl": 0.453125, + "learning_rate": 8.614864864864864e-08, + "loss": 0.0005, + "reward": 3.6759188175201416, + "reward_std": 0.02997300773859024, + "rewards/final_reward": 1.6248022016955932, + "rewards/mask_iou_reward": 0.8124011008477966, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6759187579154968, + "rewards/thk_ans_format_reward": 1.0, + "step": 3246, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.6666717529297, + "epoch": 10.96795952782462, + "grad_norm": 13.907193326709237, + "kl": 1.087890625, + "learning_rate": 8.586711711711711e-08, + "loss": 0.0011, + "reward": 3.539788007736206, + "reward_std": 0.21456366777420044, + "rewards/final_reward": 1.7829225695129152, + "rewards/mask_iou_reward": 0.8914612847564576, + "rewards/sam_format_reward": 0.9791666865348816, + "rewards/sam_reward_func_ultra": 1.560621440410614, + "rewards/thk_ans_format_reward": 1.0, + "step": 3247, + "think_completion_length": 9.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.06250762939453, + "epoch": 10.97133220910624, + "grad_norm": 16.606124292610147, + "kl": 0.3974609375, + "learning_rate": 8.558558558558559e-08, + "loss": 0.0004, + "reward": 3.6476298570632935, + "reward_std": 0.05834665335714817, + "rewards/final_reward": 1.5603483165007168, + "rewards/mask_iou_reward": 0.7801741582503584, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6476297974586487, + "rewards/thk_ans_format_reward": 1.0, + "step": 3248, + "think_completion_length": 7.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.35417938232422, + "epoch": 10.974704890387859, + "grad_norm": 11.609692382916647, + "kl": 0.40625, + "learning_rate": 8.530405405405405e-08, + "loss": 0.0004, + "reward": 3.69338595867157, + "reward_std": 0.027596603147685528, + "rewards/final_reward": 1.9251977772616073, + "rewards/mask_iou_reward": 0.9625988886308037, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6933859586715698, + "rewards/thk_ans_format_reward": 1.0, + "step": 3249, + "think_completion_length": 7.916666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.3645935058594, + "epoch": 10.978077571669477, + "grad_norm": 21.16551017125066, + "kl": 0.3671875, + "learning_rate": 8.502252252252252e-08, + "loss": 0.0004, + "reward": 3.619927167892456, + "reward_std": 0.03946511447429657, + "rewards/final_reward": 1.2778173729229194, + "rewards/mask_iou_reward": 0.6389086864614597, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6199271082878113, + "rewards/thk_ans_format_reward": 1.0, + "step": 3250, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.28125, + "epoch": 10.981450252951095, + "grad_norm": 6.630093534026958, + "kl": 0.498046875, + "learning_rate": 8.474099099099098e-08, + "loss": 0.0005, + "reward": 3.6429604291915894, + "reward_std": 0.06296277791261673, + "rewards/final_reward": 1.7986877345795302, + "rewards/mask_iou_reward": 0.8993438672897651, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.642960548400879, + "rewards/thk_ans_format_reward": 1.0, + "step": 3251, + "think_completion_length": 8.458333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.8333435058594, + "epoch": 10.984822934232715, + "grad_norm": 11.65935010806574, + "kl": 0.66015625, + "learning_rate": 8.445945945945946e-08, + "loss": 0.0007, + "reward": 3.341140866279602, + "reward_std": 0.11203420907258987, + "rewards/final_reward": 0.977063955266588, + "rewards/mask_iou_reward": 0.488531977633294, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3411406874656677, + "rewards/thk_ans_format_reward": 1.0, + "step": 3252, + "think_completion_length": 7.208333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.45834350585938, + "epoch": 10.988195615514334, + "grad_norm": 10.670823671803728, + "kl": 0.4453125, + "learning_rate": 8.417792792792793e-08, + "loss": 0.0004, + "reward": 3.72513210773468, + "reward_std": 0.07078396715223789, + "rewards/final_reward": 1.7892148373927257, + "rewards/mask_iou_reward": 0.8946074186963628, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.725132167339325, + "rewards/thk_ans_format_reward": 1.0, + "step": 3253, + "think_completion_length": 8.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.52083587646484, + "epoch": 10.991568296795952, + "grad_norm": 6.109386082964309, + "kl": 0.46875, + "learning_rate": 8.389639639639639e-08, + "loss": 0.0005, + "reward": 3.374544382095337, + "reward_std": 0.054962567053735256, + "rewards/final_reward": 1.930871790359452, + "rewards/mask_iou_reward": 0.965435895179726, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3745442032814026, + "rewards/thk_ans_format_reward": 1.0, + "step": 3254, + "think_completion_length": 9.416666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.81250762939453, + "epoch": 10.994940978077572, + "grad_norm": 10.063378755116721, + "kl": 0.5302734375, + "learning_rate": 8.361486486486486e-08, + "loss": 0.0005, + "reward": 3.614203453063965, + "reward_std": 0.06491642817854881, + "rewards/final_reward": 1.5505027246245475, + "rewards/mask_iou_reward": 0.7752513623122738, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6142034530639648, + "rewards/thk_ans_format_reward": 1.0, + "step": 3255, + "think_completion_length": 9.333333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.28947257995605, + "epoch": 10.99831365935919, + "grad_norm": 8.135456862235158, + "kl": 0.546875, + "learning_rate": 8.333333333333333e-08, + "loss": 0.0006, + "reward": 3.6233062744140625, + "reward_std": 0.04351498291362077, + "rewards/final_reward": 1.9861655112622487, + "rewards/mask_iou_reward": 0.9930827556311244, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6233062744140625, + "rewards/thk_ans_format_reward": 1.0, + "step": 3256, + "think_completion_length": 9.916666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.67709350585938, + "epoch": 11.003372681281618, + "grad_norm": 11.557014185942567, + "kl": 0.560546875, + "learning_rate": 8.30518018018018e-08, + "loss": 0.0006, + "reward": 3.7738869190216064, + "reward_std": 0.05592325050383806, + "rewards/final_reward": 1.7459996031354077, + "rewards/mask_iou_reward": 0.8729998015677038, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7738869786262512, + "rewards/thk_ans_format_reward": 1.0, + "step": 3257, + "think_completion_length": 7.291666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.7083435058594, + "epoch": 11.006745362563239, + "grad_norm": 7.8662512856552, + "kl": 0.3935546875, + "learning_rate": 8.277027027027027e-08, + "loss": 0.0004, + "reward": 3.5249884128570557, + "reward_std": 0.03571249917149544, + "rewards/final_reward": 1.5830277205064258, + "rewards/mask_iou_reward": 0.7915138602532129, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5249884724617004, + "rewards/thk_ans_format_reward": 1.0, + "step": 3258, + "think_completion_length": 7.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.02083587646484, + "epoch": 11.010118043844857, + "grad_norm": 18.362126856228976, + "kl": 0.71484375, + "learning_rate": 8.248873873873873e-08, + "loss": 0.0007, + "reward": 3.383101224899292, + "reward_std": 0.0536690279841423, + "rewards/final_reward": 1.5824458142464324, + "rewards/mask_iou_reward": 0.7912229071232162, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3831011056900024, + "rewards/thk_ans_format_reward": 1.0, + "step": 3259, + "think_completion_length": 8.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.6979217529297, + "epoch": 11.013490725126475, + "grad_norm": 7.679861782392985, + "kl": 0.3681640625, + "learning_rate": 8.220720720720721e-08, + "loss": 0.0004, + "reward": 3.794712543487549, + "reward_std": 0.045380293391644955, + "rewards/final_reward": 1.7493822493573408, + "rewards/mask_iou_reward": 0.8746911246786704, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7947124242782593, + "rewards/thk_ans_format_reward": 1.0, + "step": 3260, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.08333587646484, + "epoch": 11.016863406408094, + "grad_norm": 5.712072509389033, + "kl": 0.615234375, + "learning_rate": 8.192567567567567e-08, + "loss": 0.0006, + "reward": 3.6091835498809814, + "reward_std": 0.12349607236683369, + "rewards/final_reward": 1.7861388123293191, + "rewards/mask_iou_reward": 0.8930694061646596, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6196001768112183, + "rewards/thk_ans_format_reward": 1.0, + "step": 3261, + "think_completion_length": 8.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.67708587646484, + "epoch": 11.020236087689714, + "grad_norm": 9.664881769651082, + "kl": 0.5009765625, + "learning_rate": 8.164414414414414e-08, + "loss": 0.0005, + "reward": 3.8026552200317383, + "reward_std": 0.012866603909060359, + "rewards/final_reward": 1.479182740000682, + "rewards/mask_iou_reward": 0.739591370000341, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.8026551604270935, + "rewards/thk_ans_format_reward": 1.0, + "step": 3262, + "think_completion_length": 10.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.2395935058594, + "epoch": 11.023608768971332, + "grad_norm": 14.431790656745983, + "kl": 0.38671875, + "learning_rate": 8.136261261261262e-08, + "loss": 0.0004, + "reward": 3.31676983833313, + "reward_std": 0.3980730175971985, + "rewards/final_reward": 1.1245725328677882, + "rewards/mask_iou_reward": 0.5622862664338941, + "rewards/sam_format_reward": 0.9270833432674408, + "rewards/sam_reward_func_ultra": 1.4626030325889587, + "rewards/thk_ans_format_reward": 0.9270833432674408, + "step": 3263, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.92708587646484, + "epoch": 11.02698145025295, + "grad_norm": 7.97145047005376, + "kl": 0.45703125, + "learning_rate": 8.108108108108108e-08, + "loss": 0.0005, + "reward": 3.676851272583008, + "reward_std": 0.02242056792601943, + "rewards/final_reward": 1.827183155940764, + "rewards/mask_iou_reward": 0.913591577970382, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6768511533737183, + "rewards/thk_ans_format_reward": 1.0, + "step": 3264, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.1041717529297, + "epoch": 11.03035413153457, + "grad_norm": 15.361060325537107, + "kl": 0.546875, + "learning_rate": 8.079954954954954e-08, + "loss": 0.0005, + "reward": 3.680310845375061, + "reward_std": 0.046101706102490425, + "rewards/final_reward": 1.5770611709529159, + "rewards/mask_iou_reward": 0.7885305854764579, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.680310845375061, + "rewards/thk_ans_format_reward": 1.0, + "step": 3265, + "think_completion_length": 10.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.39583587646484, + "epoch": 11.03372681281619, + "grad_norm": 7.5963750674538515, + "kl": 0.4228515625, + "learning_rate": 8.051801801801801e-08, + "loss": 0.0004, + "reward": 3.7898802757263184, + "reward_std": 0.031077871099114418, + "rewards/final_reward": 1.8079273683003763, + "rewards/mask_iou_reward": 0.9039636841501881, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.789880096912384, + "rewards/thk_ans_format_reward": 1.0, + "step": 3266, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.71875, + "epoch": 11.037099494097808, + "grad_norm": 50.02483132654234, + "kl": 0.505859375, + "learning_rate": 8.023648648648649e-08, + "loss": 0.0005, + "reward": 3.673597812652588, + "reward_std": 0.03365712705999613, + "rewards/final_reward": 1.5130002342504598, + "rewards/mask_iou_reward": 0.7565001171252299, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6735975742340088, + "rewards/thk_ans_format_reward": 1.0, + "step": 3267, + "think_completion_length": 7.791666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.8333435058594, + "epoch": 11.040472175379426, + "grad_norm": 7.948491490121137, + "kl": 0.3974609375, + "learning_rate": 7.995495495495496e-08, + "loss": 0.0004, + "reward": 3.448967218399048, + "reward_std": 0.11523477360606194, + "rewards/final_reward": 1.6464742945714734, + "rewards/mask_iou_reward": 0.8232371472857367, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4489670991897583, + "rewards/thk_ans_format_reward": 1.0, + "step": 3268, + "think_completion_length": 9.208333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.12500762939453, + "epoch": 11.043844856661046, + "grad_norm": 30.844776065741474, + "kl": 0.4765625, + "learning_rate": 7.967342342342342e-08, + "loss": 0.0005, + "reward": 3.5432692766189575, + "reward_std": 0.07160472683608532, + "rewards/final_reward": 1.6734057054448193, + "rewards/mask_iou_reward": 0.8367028527224096, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5432690382003784, + "rewards/thk_ans_format_reward": 1.0, + "step": 3269, + "think_completion_length": 8.458333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.05208587646484, + "epoch": 11.047217537942664, + "grad_norm": 13.632646077084331, + "kl": 0.7734375, + "learning_rate": 7.939189189189188e-08, + "loss": 0.0008, + "reward": 3.6073343753814697, + "reward_std": 0.07992910593748093, + "rewards/final_reward": 1.7809643798983177, + "rewards/mask_iou_reward": 0.8904821899491588, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.607334315776825, + "rewards/thk_ans_format_reward": 1.0, + "step": 3270, + "think_completion_length": 8.125 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.91666793823242, + "epoch": 11.050590219224283, + "grad_norm": 12.258278428753142, + "kl": 0.796875, + "learning_rate": 7.911036036036035e-08, + "loss": 0.0008, + "reward": 3.5184515714645386, + "reward_std": 0.047073401510715485, + "rewards/final_reward": 1.6241069163450614, + "rewards/mask_iou_reward": 0.8120534581725307, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5184515714645386, + "rewards/thk_ans_format_reward": 1.0, + "step": 3271, + "think_completion_length": 8.5 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.8645935058594, + "epoch": 11.053962900505903, + "grad_norm": 8.700745630019178, + "kl": 0.380859375, + "learning_rate": 7.882882882882883e-08, + "loss": 0.0004, + "reward": 3.5366759300231934, + "reward_std": 0.08967574685811996, + "rewards/final_reward": 0.9761992976696359, + "rewards/mask_iou_reward": 0.48809964883481793, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.536676049232483, + "rewards/thk_ans_format_reward": 1.0, + "step": 3272, + "think_completion_length": 9.166666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.2604293823242, + "epoch": 11.057335581787521, + "grad_norm": 7.473400543514334, + "kl": 0.51953125, + "learning_rate": 7.85472972972973e-08, + "loss": 0.0005, + "reward": 3.5971490144729614, + "reward_std": 0.069289181381464, + "rewards/final_reward": 1.7362259460742102, + "rewards/mask_iou_reward": 0.8681129730371051, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5971489548683167, + "rewards/thk_ans_format_reward": 1.0, + "step": 3273, + "think_completion_length": 7.708333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.84375762939453, + "epoch": 11.06070826306914, + "grad_norm": 6.5430731406634886, + "kl": 0.4736328125, + "learning_rate": 7.826576576576576e-08, + "loss": 0.0005, + "reward": 3.5351314544677734, + "reward_std": 0.05387053173035383, + "rewards/final_reward": 1.3752052736546478, + "rewards/mask_iou_reward": 0.6876026368273239, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.535131573677063, + "rewards/thk_ans_format_reward": 1.0, + "step": 3274, + "think_completion_length": 9.166666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.93751525878906, + "epoch": 11.064080944350758, + "grad_norm": 59.65870522669481, + "kl": 0.517578125, + "learning_rate": 7.798423423423422e-08, + "loss": 0.0005, + "reward": 3.680536389350891, + "reward_std": 0.06550450623035431, + "rewards/final_reward": 1.6033873661332025, + "rewards/mask_iou_reward": 0.8016936830666013, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6805359721183777, + "rewards/thk_ans_format_reward": 1.0, + "step": 3275, + "think_completion_length": 9.041666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.95834350585938, + "epoch": 11.067453625632378, + "grad_norm": 10.050849213572555, + "kl": 0.4326171875, + "learning_rate": 7.77027027027027e-08, + "loss": 0.0004, + "reward": 3.075374126434326, + "reward_std": 0.06078692898154259, + "rewards/final_reward": 1.414640110385101, + "rewards/mask_iou_reward": 0.7073200551925505, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.0753739774227142, + "rewards/thk_ans_format_reward": 1.0, + "step": 3276, + "think_completion_length": 8.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.78125762939453, + "epoch": 11.070826306913997, + "grad_norm": 11.056552105553564, + "kl": 0.392578125, + "learning_rate": 7.742117117117117e-08, + "loss": 0.0004, + "reward": 3.5583964586257935, + "reward_std": 0.06431005522608757, + "rewards/final_reward": 1.4149002911814856, + "rewards/mask_iou_reward": 0.7074501455907428, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5583963990211487, + "rewards/thk_ans_format_reward": 1.0, + "step": 3277, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.53125762939453, + "epoch": 11.074198988195615, + "grad_norm": 8.529640320104521, + "kl": 0.4326171875, + "learning_rate": 7.713963963963965e-08, + "loss": 0.0004, + "reward": 3.541915774345398, + "reward_std": 0.037605963414534926, + "rewards/final_reward": 1.51284828093512, + "rewards/mask_iou_reward": 0.75642414046756, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5419156551361084, + "rewards/thk_ans_format_reward": 1.0, + "step": 3278, + "think_completion_length": 8.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.33333587646484, + "epoch": 11.077571669477235, + "grad_norm": 27.685500769109286, + "kl": 0.505859375, + "learning_rate": 7.68581081081081e-08, + "loss": 0.0005, + "reward": 3.642976999282837, + "reward_std": 0.029503632336854935, + "rewards/final_reward": 1.4340308792184384, + "rewards/mask_iou_reward": 0.7170154396092192, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6429771184921265, + "rewards/thk_ans_format_reward": 1.0, + "step": 3279, + "think_completion_length": 8.791666666666668 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.34375762939453, + "epoch": 11.080944350758854, + "grad_norm": 6.816602663960299, + "kl": 0.3935546875, + "learning_rate": 7.657657657657657e-08, + "loss": 0.0004, + "reward": 3.582181930541992, + "reward_std": 0.05259714089334011, + "rewards/final_reward": 1.7713748625896824, + "rewards/mask_iou_reward": 0.8856874312948412, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5821818113327026, + "rewards/thk_ans_format_reward": 1.0, + "step": 3280, + "think_completion_length": 7.833333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.15626525878906, + "epoch": 11.084317032040472, + "grad_norm": 8.972118747155667, + "kl": 0.6015625, + "learning_rate": 7.629504504504504e-08, + "loss": 0.0006, + "reward": 3.2960952520370483, + "reward_std": 0.03791832132264972, + "rewards/final_reward": 1.9528624409568343, + "rewards/mask_iou_reward": 0.9764312204784171, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.2960952520370483, + "rewards/thk_ans_format_reward": 1.0, + "step": 3281, + "think_completion_length": 7.416666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.06251525878906, + "epoch": 11.08768971332209, + "grad_norm": 9.61726929093108, + "kl": 0.55078125, + "learning_rate": 7.601351351351351e-08, + "loss": 0.0006, + "reward": 3.6938302516937256, + "reward_std": 0.028953220695257187, + "rewards/final_reward": 1.4376917790190762, + "rewards/mask_iou_reward": 0.7188458895095381, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6938302516937256, + "rewards/thk_ans_format_reward": 1.0, + "step": 3282, + "think_completion_length": 7.666666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.84375762939453, + "epoch": 11.09106239460371, + "grad_norm": 6.908669230016509, + "kl": 0.392578125, + "learning_rate": 7.573198198198199e-08, + "loss": 0.0004, + "reward": 3.4922462701797485, + "reward_std": 0.07157362625002861, + "rewards/final_reward": 1.9423061070144874, + "rewards/mask_iou_reward": 0.9711530535072437, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4922462105751038, + "rewards/thk_ans_format_reward": 1.0, + "step": 3283, + "think_completion_length": 7.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.03125762939453, + "epoch": 11.094435075885329, + "grad_norm": 69.26979033185252, + "kl": 0.474609375, + "learning_rate": 7.545045045045045e-08, + "loss": 0.0005, + "reward": 3.3932985067367554, + "reward_std": 0.014708156697452068, + "rewards/final_reward": 1.5756231422508065, + "rewards/mask_iou_reward": 0.7878115711254032, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.3932985067367554, + "rewards/thk_ans_format_reward": 1.0, + "step": 3284, + "think_completion_length": 8.333333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.30208587646484, + "epoch": 11.097807757166947, + "grad_norm": 15.734960613548125, + "kl": 0.4365234375, + "learning_rate": 7.516891891891891e-08, + "loss": 0.0005, + "reward": 3.662890672683716, + "reward_std": 0.012950449250638485, + "rewards/final_reward": 1.5119907518770903, + "rewards/mask_iou_reward": 0.7559953759385452, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6628906726837158, + "rewards/thk_ans_format_reward": 1.0, + "step": 3285, + "think_completion_length": 7.625 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.5416717529297, + "epoch": 11.101180438448566, + "grad_norm": 9.099008238943991, + "kl": 0.3857421875, + "learning_rate": 7.488738738738738e-08, + "loss": 0.0004, + "reward": 3.594428062438965, + "reward_std": 0.06632101535797119, + "rewards/final_reward": 1.7235100982074392, + "rewards/mask_iou_reward": 0.8617550491037196, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5944279432296753, + "rewards/thk_ans_format_reward": 1.0, + "step": 3286, + "think_completion_length": 9.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.48958587646484, + "epoch": 11.104553119730186, + "grad_norm": 7.084162835272017, + "kl": 0.59375, + "learning_rate": 7.460585585585586e-08, + "loss": 0.0006, + "reward": 3.198632597923279, + "reward_std": 0.3177741765975952, + "rewards/final_reward": 1.4057838586232614, + "rewards/mask_iou_reward": 0.7028919293116307, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.2194659411907196, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3287, + "think_completion_length": 8.75 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.7604217529297, + "epoch": 11.107925801011804, + "grad_norm": 17.248231136349567, + "kl": 0.4658203125, + "learning_rate": 7.432432432432433e-08, + "loss": 0.0005, + "reward": 3.525164246559143, + "reward_std": 0.020461719017475843, + "rewards/final_reward": 1.2088079626138248, + "rewards/mask_iou_reward": 0.6044039813069124, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5251641869544983, + "rewards/thk_ans_format_reward": 1.0, + "step": 3288, + "think_completion_length": 8.041666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.1666717529297, + "epoch": 11.111298482293423, + "grad_norm": 7.706524760091841, + "kl": 0.408203125, + "learning_rate": 7.404279279279278e-08, + "loss": 0.0004, + "reward": 3.6560596227645874, + "reward_std": 0.06422694679349661, + "rewards/final_reward": 1.8228721948733961, + "rewards/mask_iou_reward": 0.9114360974366981, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6560596227645874, + "rewards/thk_ans_format_reward": 1.0, + "step": 3289, + "think_completion_length": 8.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.94792938232422, + "epoch": 11.114671163575043, + "grad_norm": 5.03298899344414, + "kl": 0.3583984375, + "learning_rate": 7.376126126126125e-08, + "loss": 0.0003, + "reward": 3.7929335832595825, + "reward_std": 0.031620634719729424, + "rewards/final_reward": 1.9703793094418138, + "rewards/mask_iou_reward": 0.9851896547209069, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.7929337620735168, + "rewards/thk_ans_format_reward": 1.0, + "step": 3290, + "think_completion_length": 8.291666666666666 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.90625762939453, + "epoch": 11.118043844856661, + "grad_norm": 16.76234833828291, + "kl": 0.779296875, + "learning_rate": 7.347972972972973e-08, + "loss": 0.0008, + "reward": 3.5754377841949463, + "reward_std": 0.22860531508922577, + "rewards/final_reward": 1.5805011760570107, + "rewards/mask_iou_reward": 0.7902505880285053, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.5962709784507751, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3291, + "think_completion_length": 7.916666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.40625762939453, + "epoch": 11.12141652613828, + "grad_norm": 8.834695585041631, + "kl": 0.4853515625, + "learning_rate": 7.31981981981982e-08, + "loss": 0.0005, + "reward": 3.5796273946762085, + "reward_std": 0.08974376507103443, + "rewards/final_reward": 1.8597737950833935, + "rewards/mask_iou_reward": 0.9298868975416967, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5796274542808533, + "rewards/thk_ans_format_reward": 1.0, + "step": 3292, + "think_completion_length": 9.083333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 158.33333587646484, + "epoch": 11.124789207419898, + "grad_norm": 17.70647786369016, + "kl": 0.6015625, + "learning_rate": 7.291666666666667e-08, + "loss": 0.0006, + "reward": 3.784427046775818, + "reward_std": 0.040433993737678975, + "rewards/final_reward": 1.9819054048092806, + "rewards/mask_iou_reward": 0.9909527024046403, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.784426987171173, + "rewards/thk_ans_format_reward": 1.0, + "step": 3293, + "think_completion_length": 9.833333333333332 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.9479293823242, + "epoch": 11.128161888701518, + "grad_norm": 7.106594817415373, + "kl": 0.4462890625, + "learning_rate": 7.263513513513512e-08, + "loss": 0.0004, + "reward": 3.288295030593872, + "reward_std": 0.24038275331258774, + "rewards/final_reward": 1.5676557711252865, + "rewards/mask_iou_reward": 0.7838278855626433, + "rewards/sam_format_reward": 0.9479166865348816, + "rewards/sam_reward_func_ultra": 1.3924616575241089, + "rewards/thk_ans_format_reward": 0.9479166865348816, + "step": 3294, + "think_completion_length": 7.875 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.77084350585938, + "epoch": 11.131534569983137, + "grad_norm": 10.746438770092233, + "kl": 0.427734375, + "learning_rate": 7.23536036036036e-08, + "loss": 0.0005, + "reward": 3.652415633201599, + "reward_std": 0.2226637750864029, + "rewards/final_reward": 1.8436673039719813, + "rewards/mask_iou_reward": 0.9218336519859907, + "rewards/sam_format_reward": 0.9895833432674408, + "rewards/sam_reward_func_ultra": 1.6732491850852966, + "rewards/thk_ans_format_reward": 0.9895833432674408, + "step": 3295, + "think_completion_length": 7.541666666666667 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.75, + "epoch": 11.134907251264755, + "grad_norm": 15.824855680054148, + "kl": 0.3994140625, + "learning_rate": 7.207207207207207e-08, + "loss": 0.0004, + "reward": 3.43550705909729, + "reward_std": 0.08496489748358727, + "rewards/final_reward": 1.5848493301955808, + "rewards/mask_iou_reward": 0.7924246650977904, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.4355069994926453, + "rewards/thk_ans_format_reward": 1.0, + "step": 3296, + "think_completion_length": 8.25 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.6145935058594, + "epoch": 11.138279932546375, + "grad_norm": 7.5556161914264806, + "kl": 0.384765625, + "learning_rate": 7.179054054054054e-08, + "loss": 0.0003, + "reward": 3.502914309501648, + "reward_std": 0.07663140445947647, + "rewards/final_reward": 1.8889766537903219, + "rewards/mask_iou_reward": 0.9444883268951609, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.5029143691062927, + "rewards/thk_ans_format_reward": 1.0, + "step": 3297, + "think_completion_length": 7.208333333333334 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.23959350585938, + "epoch": 11.141652613827993, + "grad_norm": 7.059335996561547, + "kl": 0.4306640625, + "learning_rate": 7.150900900900902e-08, + "loss": 0.0004, + "reward": 3.641920804977417, + "reward_std": 0.04107053391635418, + "rewards/final_reward": 1.7453527965394877, + "rewards/mask_iou_reward": 0.8726763982697439, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6419206857681274, + "rewards/thk_ans_format_reward": 1.0, + "step": 3298, + "think_completion_length": 7.958333333333333 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.83333587646484, + "epoch": 11.145025295109612, + "grad_norm": 12.348962116367375, + "kl": 0.40625, + "learning_rate": 7.122747747747746e-08, + "loss": 0.0004, + "reward": 3.6770007610321045, + "reward_std": 0.02028821548447013, + "rewards/final_reward": 1.6679152904367895, + "rewards/mask_iou_reward": 0.8339576452183948, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6770007610321045, + "rewards/thk_ans_format_reward": 1.0, + "step": 3299, + "think_completion_length": 7.375 + }, + { + "clip_ratio": 0.0, + "completion_length": 177.86459350585938, + "epoch": 11.14839797639123, + "grad_norm": 6.210183344320372, + "kl": 0.4912109375, + "learning_rate": 7.094594594594594e-08, + "loss": 0.0005, + "reward": 3.6773531436920166, + "reward_std": 0.029986443929374218, + "rewards/final_reward": 1.5234544784468844, + "rewards/mask_iou_reward": 0.7617272392234422, + "rewards/sam_format_reward": 1.0, + "rewards/sam_reward_func_ultra": 1.6773530840873718, + "rewards/thk_ans_format_reward": 1.0, + "step": 3300, + "think_completion_length": 7.958333333333334 + } + ], + "logging_steps": 1.0, + "max_steps": 3552, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 12, + "trial_name": null, + "trial_params": null +}