{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008933941322485758, "eval_steps": 500, "global_step": 817, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 659.982177734375, "completions/mean_terminated_length": 647.4774780273438, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.093505669826898e-05, "grad_norm": 0.14601701498031616, "learning_rate": 0.0, "loss": -0.0084, "num_tokens": 91530.0, "reward": 0.06428571790456772, "reward_std": 0.12413345277309418, "rewards/accuracy_reward/mean": 0.0357142873108387, "rewards/accuracy_reward/std": 0.18641093373298645, "rewards/format_reward/mean": 0.2857142984867096, "rewards/format_reward/std": 0.453784316778183, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 2.187011339653796e-05, "grad_norm": 0.1459706425666809, "learning_rate": 1.2195121951219512e-08, "loss": -0.0084, "step": 2 }, { "clip_ratio/high_max": 0.00046226743143051863, "clip_ratio/high_mean": 0.00022063626965973526, "clip_ratio/low_mean": 0.00019331149815116078, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041394776781089604, "epoch": 3.280517009480694e-05, "grad_norm": 0.14210286736488342, "learning_rate": 2.4390243902439023e-08, "loss": -0.0084, "step": 3 }, { "clip_ratio/high_max": 0.0005377307534217834, "clip_ratio/high_mean": 0.00015713994798716158, "clip_ratio/low_mean": 0.00025477990857325494, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004119198420085013, "epoch": 4.374022679307592e-05, "grad_norm": 0.14298461377620697, "learning_rate": 3.658536585365853e-08, "loss": -0.0084, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 630.0178833007812, "completions/mean_terminated_length": 604.236328125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 5.46752834913449e-05, "grad_norm": 0.027561135590076447, "learning_rate": 4.878048780487805e-08, "loss": -0.0003, "num_tokens": 178248.0, "reward": 0.02321428805589676, "reward_std": 0.041517265141010284, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.2321428507566452, "rewards/format_reward/std": 0.4240972101688385, "step": 5 }, { "clip_ratio/high_max": 0.000349406007444486, "clip_ratio/high_mean": 0.00016255068476311862, "clip_ratio/low_mean": 0.00029700156301259995, "clip_ratio/low_min": 0.0001171920812339522, "clip_ratio/region_mean": 0.0004595522186718881, "epoch": 6.561034018961388e-05, "grad_norm": 0.027283119037747383, "learning_rate": 6.097560975609756e-08, "loss": -0.0003, "step": 6 }, { "clip_ratio/high_max": 0.00035157622187398374, "clip_ratio/high_mean": 0.00012556638102978468, "clip_ratio/low_mean": 0.00015416464884765446, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002797310589812696, "epoch": 7.654539688788286e-05, "grad_norm": 0.027099918574094772, "learning_rate": 7.317073170731706e-08, "loss": -0.0003, "step": 7 }, { "clip_ratio/high_max": 0.0002343841624679044, "clip_ratio/high_mean": 0.00010000879410654306, "clip_ratio/low_mean": 0.00015365133003797382, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025366010959260166, "epoch": 8.748045358615184e-05, "grad_norm": 0.02751326560974121, "learning_rate": 8.536585365853659e-08, "loss": -0.0003, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 752.9017944335938, "completions/mean_terminated_length": 741.2342529296875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 9.841551028442082e-05, "grad_norm": 0.024599654600024223, "learning_rate": 9.75609756097561e-08, "loss": 0.0033, "num_tokens": 283685.0, "reward": 0.02142857387661934, "reward_std": 0.032084476202726364, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.2142857164144516, "rewards/format_reward/std": 0.41217005252838135, "step": 9 }, { "clip_ratio/high_max": 0.0004192872147541493, "clip_ratio/high_mean": 0.00014125580491963774, "clip_ratio/low_mean": 0.00030112647800706327, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004423822683747858, "epoch": 0.0001093505669826898, "grad_norm": 0.024231158196926117, "learning_rate": 1.097560975609756e-07, "loss": 0.0033, "step": 10 }, { "clip_ratio/high_max": 0.0002795248001348227, "clip_ratio/high_mean": 7.599347736686468e-05, "clip_ratio/low_mean": 0.0002099502453347668, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028594370814971626, "epoch": 0.00012028562368095878, "grad_norm": 0.02471867948770523, "learning_rate": 1.219512195121951e-07, "loss": 0.0033, "step": 11 }, { "clip_ratio/high_max": 0.0002795248001348227, "clip_ratio/high_mean": 0.00011941684351768345, "clip_ratio/low_mean": 0.00030397603404708207, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00042339289211668074, "epoch": 0.00013122068037922776, "grad_norm": 0.024062857031822205, "learning_rate": 1.3414634146341465e-07, "loss": 0.0033, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1954.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 640.5892944335938, "completions/mean_terminated_length": 640.5892944335938, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.00014215573707749676, "grad_norm": 0.11785831302404404, "learning_rate": 1.4634146341463413e-07, "loss": 0.0038, "num_tokens": 370047.0, "reward": 0.10625001043081284, "reward_std": 0.23345845937728882, "rewards/accuracy_reward/mean": 0.0803571417927742, "rewards/accuracy_reward/std": 0.27306708693504333, "rewards/format_reward/mean": 0.2589285671710968, "rewards/format_reward/std": 0.44001504778862, "step": 13 }, { "clip_ratio/high_max": 0.00015511090168729424, "clip_ratio/high_mean": 4.426019950187765e-05, "clip_ratio/low_mean": 0.0002925886947195977, "clip_ratio/low_min": 8.0762394645717e-05, "clip_ratio/region_mean": 0.00033684889785945415, "epoch": 0.00015309079377576573, "grad_norm": 0.11697400361299515, "learning_rate": 1.5853658536585366e-07, "loss": 0.0038, "step": 14 }, { "clip_ratio/high_max": 0.00022184426779858768, "clip_ratio/high_mean": 6.538822344737127e-05, "clip_ratio/low_mean": 0.0002739164629019797, "clip_ratio/low_min": 0.0001478961785323918, "clip_ratio/region_mean": 0.0003393047081772238, "epoch": 0.00016402585047403472, "grad_norm": 0.11562111228704453, "learning_rate": 1.7073170731707317e-07, "loss": 0.0038, "step": 15 }, { "clip_ratio/high_max": 0.000161524789291434, "clip_ratio/high_mean": 5.4185329645406455e-05, "clip_ratio/low_mean": 0.00034718820825219154, "clip_ratio/low_min": 6.987631786614656e-05, "clip_ratio/region_mean": 0.0004013734869658947, "epoch": 0.00017496090717230369, "grad_norm": 0.11474598944187164, "learning_rate": 1.8292682926829268e-07, "loss": 0.0038, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 963.0982666015625, "completions/mean_terminated_length": 943.3726806640625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.00018589596387057268, "grad_norm": 0.018137287348508835, "learning_rate": 1.951219512195122e-07, "loss": -0.002, "num_tokens": 498774.0, "reward": 0.02410714700818062, "reward_std": 0.041704438626766205, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.2410714328289032, "rewards/format_reward/std": 0.42965590953826904, "step": 17 }, { "clip_ratio/high_max": 0.0005076786619611084, "clip_ratio/high_mean": 0.00017283321358263493, "clip_ratio/low_mean": 0.00034910827525891364, "clip_ratio/low_min": 0.00020699648302979767, "clip_ratio/region_mean": 0.000521941517945379, "epoch": 0.00019683102056884165, "grad_norm": 0.01788054220378399, "learning_rate": 2.073170731707317e-07, "loss": -0.002, "step": 18 }, { "clip_ratio/high_max": 0.00031507405219599605, "clip_ratio/high_mean": 0.00018010234634857625, "clip_ratio/low_mean": 0.00027040819986723363, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004505105607677251, "epoch": 0.00020776607726711064, "grad_norm": 0.01767919957637787, "learning_rate": 2.195121951219512e-07, "loss": -0.002, "step": 19 }, { "clip_ratio/high_max": 0.0005076786619611084, "clip_ratio/high_mean": 0.0001818694727262482, "clip_ratio/low_mean": 0.00033357544452883303, "clip_ratio/low_min": 9.0021152573172e-05, "clip_ratio/region_mean": 0.000515444902703166, "epoch": 0.0002187011339653796, "grad_norm": 0.017548447474837303, "learning_rate": 2.3170731707317074e-07, "loss": -0.002, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1215.8660888671875, "completions/mean_terminated_length": 1160.3905029296875, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.0002296361906636486, "grad_norm": 0.03614993020892143, "learning_rate": 2.439024390243902e-07, "loss": -0.0022, "num_tokens": 654971.0, "reward": 0.0312500074505806, "reward_std": 0.07786982506513596, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.2232142835855484, "rewards/format_reward/std": 0.41827234625816345, "step": 21 }, { "clip_ratio/high_max": 0.0002501000417396426, "clip_ratio/high_mean": 0.00016270557534880936, "clip_ratio/low_mean": 0.00031285686418414116, "clip_ratio/low_min": 0.00017319015751127154, "clip_ratio/region_mean": 0.0004755624395329505, "epoch": 0.00024057124736191757, "grad_norm": 0.03528996556997299, "learning_rate": 2.5609756097560976e-07, "loss": -0.0022, "step": 22 }, { "clip_ratio/high_max": 0.0002943947329185903, "clip_ratio/high_mean": 0.00018352743063587695, "clip_ratio/low_mean": 0.00030860278639011085, "clip_ratio/low_min": 0.00010004001524066553, "clip_ratio/region_mean": 0.000492130231577903, "epoch": 0.00025150630406018656, "grad_norm": 0.0354945994913578, "learning_rate": 2.682926829268293e-07, "loss": -0.0022, "step": 23 }, { "clip_ratio/high_max": 0.00030970710213296115, "clip_ratio/high_mean": 0.00014271253894548863, "clip_ratio/low_mean": 0.00030832603806629777, "clip_ratio/low_min": 7.849909889046103e-05, "clip_ratio/region_mean": 0.00045103859156370163, "epoch": 0.00026244136075845553, "grad_norm": 0.03578762337565422, "learning_rate": 2.8048780487804877e-07, "loss": -0.0022, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 644.2232666015625, "completions/mean_terminated_length": 644.2232666015625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.0002733764174567245, "grad_norm": 0.12037378549575806, "learning_rate": 2.9268292682926825e-07, "loss": -0.0024, "num_tokens": 743392.0, "reward": 0.11696429550647736, "reward_std": 0.20798708498477936, "rewards/accuracy_reward/mean": 0.0803571417927742, "rewards/accuracy_reward/std": 0.27306708693504333, "rewards/format_reward/mean": 0.3660714328289032, "rewards/format_reward/std": 0.483894407749176, "step": 25 }, { "clip_ratio/high_max": 0.0008694572024978697, "clip_ratio/high_mean": 0.0003273177135270089, "clip_ratio/low_mean": 0.00013087269326206297, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00045819039223715663, "epoch": 0.0002843114741549935, "grad_norm": 0.11732704192399979, "learning_rate": 3.048780487804878e-07, "loss": -0.0024, "step": 26 }, { "clip_ratio/high_max": 0.000766577257309109, "clip_ratio/high_mean": 0.00025628754519857466, "clip_ratio/low_mean": 0.00017492448387201875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00043121198541484773, "epoch": 0.0002952465308532625, "grad_norm": 0.11551497876644135, "learning_rate": 3.170731707317073e-07, "loss": -0.0025, "step": 27 }, { "clip_ratio/high_max": 0.0007452490390278399, "clip_ratio/high_mean": 0.0002468357270117849, "clip_ratio/low_mean": 0.0003685637202579528, "clip_ratio/low_min": 0.00022426553186960518, "clip_ratio/region_mean": 0.0006153994472697377, "epoch": 0.00030618158755153145, "grad_norm": 0.11520084738731384, "learning_rate": 3.292682926829268e-07, "loss": -0.0025, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 378.6071472167969, "completions/mean_terminated_length": 378.6071472167969, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.0003171166442498004, "grad_norm": 0.0914326086640358, "learning_rate": 3.4146341463414634e-07, "loss": -0.0036, "num_tokens": 802848.0, "reward": 0.02946428954601288, "reward_std": 0.0805157795548439, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.2053571492433548, "rewards/format_reward/std": 0.4057779312133789, "step": 29 }, { "clip_ratio/high_max": 0.0005429864395409822, "clip_ratio/high_mean": 0.00019312214863020927, "clip_ratio/low_mean": 0.00023086834698915482, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004239904519636184, "epoch": 0.00032805170094806944, "grad_norm": 0.08846415579319, "learning_rate": 3.536585365853658e-07, "loss": -0.0036, "step": 30 }, { "clip_ratio/high_max": 0.0004729623324237764, "clip_ratio/high_mean": 0.00014943396672606468, "clip_ratio/low_mean": 0.00023221100855153054, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038164496072568, "epoch": 0.0003389867576463384, "grad_norm": 0.089188352227211, "learning_rate": 3.6585365853658536e-07, "loss": -0.0037, "step": 31 }, { "clip_ratio/high_max": 0.000723981880582869, "clip_ratio/high_mean": 0.00030318801873363554, "clip_ratio/low_mean": 0.00035330280661582947, "clip_ratio/low_min": 0.00018099547014571726, "clip_ratio/region_mean": 0.0006564908544532955, "epoch": 0.00034992181434460737, "grad_norm": 0.08975915610790253, "learning_rate": 3.7804878048780484e-07, "loss": -0.0037, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 513.0803833007812, "completions/mean_terminated_length": 513.0803833007812, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.00036085687104287634, "grad_norm": 0.0554211363196373, "learning_rate": 3.902439024390244e-07, "loss": 0.0028, "num_tokens": 883049.0, "reward": 0.04107143357396126, "reward_std": 0.08232957124710083, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.3214285671710968, "rewards/format_reward/std": 0.46912387013435364, "step": 33 }, { "clip_ratio/high_max": 0.000348918343661353, "clip_ratio/high_mean": 0.0001688492193352431, "clip_ratio/low_mean": 0.000307459820760414, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004763090400956571, "epoch": 0.00037179192774114536, "grad_norm": 0.053999803960323334, "learning_rate": 4.024390243902439e-07, "loss": 0.0028, "step": 34 }, { "clip_ratio/high_max": 0.0003273322363384068, "clip_ratio/high_mean": 0.00016411014075856656, "clip_ratio/low_mean": 0.00034245516872033477, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005065652658231556, "epoch": 0.0003827269844394143, "grad_norm": 0.054283756762742996, "learning_rate": 4.146341463414634e-07, "loss": 0.0028, "step": 35 }, { "clip_ratio/high_max": 0.0007637752569280565, "clip_ratio/high_mean": 0.00035004745586775243, "clip_ratio/low_mean": 0.0003699049411807209, "clip_ratio/low_min": 0.0001091107478714548, "clip_ratio/region_mean": 0.0007199523970484734, "epoch": 0.0003936620411376833, "grad_norm": 0.051339082419872284, "learning_rate": 4.268292682926829e-07, "loss": 0.0027, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1000.90185546875, "completions/mean_terminated_length": 909.4077758789062, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.00040459709783595226, "grad_norm": 0.022105030715465546, "learning_rate": 4.390243902439024e-07, "loss": -0.001, "num_tokens": 1022030.0, "reward": 0.02589286118745804, "reward_std": 0.04297788068652153, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.2589285671710968, "rewards/format_reward/std": 0.44001504778862, "step": 37 }, { "clip_ratio/high_max": 0.00019742362201213837, "clip_ratio/high_mean": 0.00012670627620536834, "clip_ratio/low_mean": 0.00026663325843401253, "clip_ratio/low_min": 4.935590550303459e-05, "clip_ratio/region_mean": 0.0003933394909836352, "epoch": 0.0004155321545342213, "grad_norm": 0.02214411087334156, "learning_rate": 4.5121951219512194e-07, "loss": -0.001, "step": 38 }, { "clip_ratio/high_max": 0.00034549133852124214, "clip_ratio/high_mean": 0.00022293270740192384, "clip_ratio/low_mean": 0.0003277330833952874, "clip_ratio/low_min": 0.00020605810277629644, "clip_ratio/region_mean": 0.000550665776245296, "epoch": 0.00042646721123249025, "grad_norm": 0.02236776426434517, "learning_rate": 4.634146341463415e-07, "loss": -0.001, "step": 39 }, { "clip_ratio/high_max": 0.0005168444477021694, "clip_ratio/high_mean": 0.0001956779306055978, "clip_ratio/low_mean": 0.00032807938987389207, "clip_ratio/low_min": 0.00012055454863002524, "clip_ratio/region_mean": 0.0005237573059275746, "epoch": 0.0004374022679307592, "grad_norm": 0.0221861582249403, "learning_rate": 4.756097560975609e-07, "loss": -0.001, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 718.482177734375, "completions/mean_terminated_length": 602.3106689453125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.0004483373246290282, "grad_norm": 0.029572656378149986, "learning_rate": 4.878048780487804e-07, "loss": 0.0018, "num_tokens": 1118852.0, "reward": 0.02500000223517418, "reward_std": 0.0356564037501812, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4349588453769684, "step": 41 }, { "clip_ratio/high_max": 0.000404432590585202, "clip_ratio/high_mean": 0.000183849100722, "clip_ratio/low_mean": 0.0002603862958494574, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00044423542567528784, "epoch": 0.0004592723813272972, "grad_norm": 0.029234349727630615, "learning_rate": 5e-07, "loss": 0.0018, "step": 42 }, { "clip_ratio/high_max": 0.0008897516527213156, "clip_ratio/high_mean": 0.0002369636349612847, "clip_ratio/low_mean": 0.00015576167788822204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003927252837456763, "epoch": 0.00047020743802556617, "grad_norm": 0.02913379669189453, "learning_rate": 5.121951219512195e-07, "loss": 0.0018, "step": 43 }, { "clip_ratio/high_max": 0.0007279786514118314, "clip_ratio/high_mean": 0.0002515536907594651, "clip_ratio/low_mean": 0.0003790940681938082, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006306477589532733, "epoch": 0.00048114249472383514, "grad_norm": 0.02893688529729843, "learning_rate": 5.24390243902439e-07, "loss": 0.0018, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 603.4910888671875, "completions/mean_terminated_length": 590.4774780273438, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.0004920775514221041, "grad_norm": 0.1906323879957199, "learning_rate": 5.365853658536586e-07, "loss": 0.0069, "num_tokens": 1201283.0, "reward": 0.10446429997682571, "reward_std": 0.18228116631507874, "rewards/accuracy_reward/mean": 0.0535714291036129, "rewards/accuracy_reward/std": 0.2261820137500763, "rewards/format_reward/mean": 0.5089285969734192, "rewards/format_reward/std": 0.5021671056747437, "step": 45 }, { "clip_ratio/high_max": 0.0009161104680970311, "clip_ratio/high_mean": 0.00043651257874444127, "clip_ratio/low_mean": 0.00041805635555647314, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008545689634047449, "epoch": 0.0005030126081203731, "grad_norm": 0.17394037544727325, "learning_rate": 5.487804878048781e-07, "loss": 0.0068, "step": 46 }, { "clip_ratio/high_max": 0.0012410026974976063, "clip_ratio/high_mean": 0.0006795044173486531, "clip_ratio/low_mean": 0.0008594868704676628, "clip_ratio/low_min": 0.0001329256920143962, "clip_ratio/region_mean": 0.0015389913460239768, "epoch": 0.000513947664818642, "grad_norm": 0.15573686361312866, "learning_rate": 5.609756097560975e-07, "loss": 0.0066, "step": 47 }, { "clip_ratio/high_max": 0.003971208818256855, "clip_ratio/high_mean": 0.0012710903538390994, "clip_ratio/low_mean": 0.0015962637262418866, "clip_ratio/low_min": 0.00043744532740674913, "clip_ratio/region_mean": 0.0028673538472503424, "epoch": 0.0005248827215169111, "grad_norm": 0.12995466589927673, "learning_rate": 5.73170731707317e-07, "loss": 0.0063, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 1090.25, "completions/mean_terminated_length": 996.3529663085938, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.0005358177782151801, "grad_norm": 0.02657626010477543, "learning_rate": 5.853658536585365e-07, "loss": 0.0009, "num_tokens": 1342375.0, "reward": 0.03392857313156128, "reward_std": 0.04750056937336922, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.3392857015132904, "rewards/format_reward/std": 0.4755948483943939, "step": 49 }, { "clip_ratio/high_max": 0.00047873612493276596, "clip_ratio/high_mean": 0.00025426450883969665, "clip_ratio/low_mean": 0.00024083582684397697, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004951003356836736, "epoch": 0.000546752834913449, "grad_norm": 0.02579951286315918, "learning_rate": 5.97560975609756e-07, "loss": 0.0009, "step": 50 }, { "clip_ratio/high_max": 0.000684462720528245, "clip_ratio/high_mean": 0.00032317452132701874, "clip_ratio/low_mean": 0.0005159030552022159, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008390775765292346, "epoch": 0.000557687891611718, "grad_norm": 0.02537481300532818, "learning_rate": 6.097560975609756e-07, "loss": 0.0009, "step": 51 }, { "clip_ratio/high_max": 0.0015970796812325716, "clip_ratio/high_mean": 0.0005515736993402243, "clip_ratio/low_mean": 0.0006376489764079452, "clip_ratio/low_min": 0.00011234061821596697, "clip_ratio/region_mean": 0.0011892226757481694, "epoch": 0.000568622948309987, "grad_norm": 0.024309715256094933, "learning_rate": 6.219512195121951e-07, "loss": 0.0009, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 595.0, "completions/mean_terminated_length": 581.909912109375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.0005795580050082559, "grad_norm": 0.08723586052656174, "learning_rate": 6.341463414634146e-07, "loss": 0.002, "num_tokens": 1426655.0, "reward": 0.08839286118745804, "reward_std": 0.16274110972881317, "rewards/accuracy_reward/mean": 0.0446428582072258, "rewards/accuracy_reward/std": 0.2074466198682785, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.4983079433441162, "step": 53 }, { "clip_ratio/high_max": 0.002442333847284317, "clip_ratio/high_mean": 0.000537730404175818, "clip_ratio/low_mean": 0.0002887838345486671, "clip_ratio/low_min": 0.00014705881767440587, "clip_ratio/region_mean": 0.0008265142678283155, "epoch": 0.000590493061706525, "grad_norm": 0.08400396257638931, "learning_rate": 6.463414634146342e-07, "loss": 0.0019, "step": 54 }, { "clip_ratio/high_max": 0.0016642891569063067, "clip_ratio/high_mean": 0.0006832025246694684, "clip_ratio/low_mean": 0.000578437524382025, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012616401072591543, "epoch": 0.0006014281184047939, "grad_norm": 0.07554998248815536, "learning_rate": 6.585365853658536e-07, "loss": 0.0019, "step": 55 }, { "clip_ratio/high_max": 0.0027137042488902807, "clip_ratio/high_mean": 0.0010817094007506967, "clip_ratio/low_mean": 0.0013298114063218236, "clip_ratio/low_min": 0.00018796992662828416, "clip_ratio/region_mean": 0.0024115208070725203, "epoch": 0.0006123631751030629, "grad_norm": 0.06974621117115021, "learning_rate": 6.707317073170731e-07, "loss": 0.0018, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 527.8214721679688, "completions/mean_terminated_length": 527.8214721679688, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.0006232982318013319, "grad_norm": 0.12193766236305237, "learning_rate": 6.829268292682927e-07, "loss": 0.0069, "num_tokens": 1501563.0, "reward": 0.12767858803272247, "reward_std": 0.1476120501756668, "rewards/accuracy_reward/mean": 0.0714285746216774, "rewards/accuracy_reward/std": 0.25869685411453247, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4983079433441162, "step": 57 }, { "clip_ratio/high_max": 0.0016117215855047107, "clip_ratio/high_mean": 0.0006512849358841777, "clip_ratio/low_mean": 0.0002499984693713486, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009012834052555263, "epoch": 0.0006342332884996008, "grad_norm": 0.1120833307504654, "learning_rate": 6.951219512195121e-07, "loss": 0.0068, "step": 58 }, { "clip_ratio/high_max": 0.0019047618843615055, "clip_ratio/high_mean": 0.0009913879912346601, "clip_ratio/low_mean": 0.0007051273132674396, "clip_ratio/low_min": 0.0001945146796060726, "clip_ratio/region_mean": 0.0016965152462944388, "epoch": 0.0006451683451978699, "grad_norm": 0.10706006735563278, "learning_rate": 7.073170731707316e-07, "loss": 0.0067, "step": 59 }, { "clip_ratio/high_max": 0.003076923079788685, "clip_ratio/high_mean": 0.0018556894501671195, "clip_ratio/low_mean": 0.0015209207776933908, "clip_ratio/low_min": 0.0003890293592121452, "clip_ratio/region_mean": 0.003376610577106476, "epoch": 0.0006561034018961389, "grad_norm": 0.09404683113098145, "learning_rate": 7.195121951219512e-07, "loss": 0.0065, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 643.2589721679688, "completions/mean_terminated_length": 535.2019653320312, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.0006670384585944078, "grad_norm": 0.3058190643787384, "learning_rate": 7.317073170731707e-07, "loss": -0.0037, "num_tokens": 1588812.0, "reward": 0.2321428656578064, "reward_std": 0.19061145186424255, "rewards/accuracy_reward/mean": 0.1785714328289032, "rewards/accuracy_reward/std": 0.38471436500549316, "rewards/format_reward/mean": 0.5357142686843872, "rewards/format_reward/std": 0.5009642839431763, "step": 61 }, { "clip_ratio/high_max": 0.003916448913514614, "clip_ratio/high_mean": 0.001071483944542706, "clip_ratio/low_mean": 0.0009142861817963421, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019857699517160654, "epoch": 0.0006779735152926768, "grad_norm": 0.24885834753513336, "learning_rate": 7.439024390243903e-07, "loss": -0.0041, "step": 62 }, { "clip_ratio/high_max": 0.016100957989692688, "clip_ratio/high_mean": 0.003327990649268031, "clip_ratio/low_mean": 0.0027384681161493063, "clip_ratio/low_min": 0.00017032874166034162, "clip_ratio/region_mean": 0.006066458765417337, "epoch": 0.0006889085719909457, "grad_norm": 0.19699124991893768, "learning_rate": 7.560975609756097e-07, "loss": -0.0046, "step": 63 }, { "clip_ratio/high_max": 0.0352480411529541, "clip_ratio/high_mean": 0.00686678197234869, "clip_ratio/low_mean": 0.004863728769123554, "clip_ratio/low_min": 0.0002146076294593513, "clip_ratio/region_mean": 0.011730511672794819, "epoch": 0.0006998436286892147, "grad_norm": 0.17983171343803406, "learning_rate": 7.682926829268292e-07, "loss": -0.0049, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 826.6250610351562, "completions/mean_terminated_length": 804.4181518554688, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.0007107786853874838, "grad_norm": 0.09521128982305527, "learning_rate": 7.804878048780488e-07, "loss": -0.0011, "num_tokens": 1697102.0, "reward": 0.08928573876619339, "reward_std": 0.1450389176607132, "rewards/accuracy_reward/mean": 0.0357142873108387, "rewards/accuracy_reward/std": 0.18641093373298645, "rewards/format_reward/mean": 0.5357142686843872, "rewards/format_reward/std": 0.5009642839431763, "step": 65 }, { "clip_ratio/high_max": 0.0009934871923178434, "clip_ratio/high_mean": 0.00037774149677716196, "clip_ratio/low_mean": 0.0003757358936127275, "clip_ratio/low_min": 7.512019510613754e-05, "clip_ratio/region_mean": 0.0007534773903898895, "epoch": 0.0007217137420857527, "grad_norm": 0.08346717059612274, "learning_rate": 7.926829268292683e-07, "loss": -0.0011, "step": 66 }, { "clip_ratio/high_max": 0.0015454244567081332, "clip_ratio/high_mean": 0.0005376662011258304, "clip_ratio/low_mean": 0.0007496183970943093, "clip_ratio/low_min": 0.0001502403902122751, "clip_ratio/region_mean": 0.0012872845400124788, "epoch": 0.0007326487987840217, "grad_norm": 0.07449610531330109, "learning_rate": 8.048780487804878e-07, "loss": -0.0012, "step": 67 }, { "clip_ratio/high_max": 0.0032012362498790026, "clip_ratio/high_mean": 0.0010080985957756639, "clip_ratio/low_mean": 0.0012095103738829494, "clip_ratio/low_min": 0.0003856834373436868, "clip_ratio/region_mean": 0.002217608969658613, "epoch": 0.0007435838554822907, "grad_norm": 0.06452438980340958, "learning_rate": 8.170731707317072e-07, "loss": -0.0014, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 413.1071472167969, "completions/mean_terminated_length": 398.3783874511719, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0007545189121805596, "grad_norm": 0.06578363478183746, "learning_rate": 8.292682926829268e-07, "loss": -0.0012, "num_tokens": 1757762.0, "reward": 0.07857143878936768, "reward_std": 0.07827309519052505, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.6964285969734192, "rewards/format_reward/std": 0.4618662893772125, "step": 69 }, { "clip_ratio/high_max": 0.0005354035529308021, "clip_ratio/high_mean": 0.0002603907778393477, "clip_ratio/low_mean": 0.00021863861184101552, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00047902940423227847, "epoch": 0.0007654539688788287, "grad_norm": 0.06916521489620209, "learning_rate": 8.414634146341463e-07, "loss": -0.0012, "step": 70 }, { "clip_ratio/high_max": 0.0008459214586764574, "clip_ratio/high_mean": 0.0005174838588573039, "clip_ratio/low_mean": 0.0005742484354414046, "clip_ratio/low_min": 0.00015607928798999637, "clip_ratio/region_mean": 0.0010917324107140303, "epoch": 0.0007763890255770976, "grad_norm": 0.06596191972494125, "learning_rate": 8.536585365853657e-07, "loss": -0.0013, "step": 71 }, { "clip_ratio/high_max": 0.0025339096318930387, "clip_ratio/high_mean": 0.0013838078593835235, "clip_ratio/low_mean": 0.0010228607570752501, "clip_ratio/low_min": 0.00017310022667516023, "clip_ratio/region_mean": 0.0024066686164587736, "epoch": 0.0007873240822753666, "grad_norm": 0.0469934307038784, "learning_rate": 8.658536585365853e-07, "loss": -0.0014, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 908.3750610351562, "completions/mean_terminated_length": 898.108154296875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.0007982591389736356, "grad_norm": 0.0217976663261652, "learning_rate": 8.780487804878048e-07, "loss": -0.002, "num_tokens": 1876972.0, "reward": 0.05892857909202576, "reward_std": 0.0462380088865757, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.5892857313156128, "rewards/format_reward/std": 0.4941745698451996, "step": 73 }, { "clip_ratio/high_max": 0.0006133088027127087, "clip_ratio/high_mean": 0.0003540238249115646, "clip_ratio/low_mean": 0.00021767507132608443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005716988816857338, "epoch": 0.0008091941956719045, "grad_norm": 0.021611612290143967, "learning_rate": 8.902439024390244e-07, "loss": -0.002, "step": 74 }, { "clip_ratio/high_max": 0.0013024224899709225, "clip_ratio/high_mean": 0.0006695449119433761, "clip_ratio/low_mean": 0.00038694983231835067, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010564947733655572, "epoch": 0.0008201292523701735, "grad_norm": 0.02105650119483471, "learning_rate": 9.024390243902439e-07, "loss": -0.002, "step": 75 }, { "clip_ratio/high_max": 0.002604844979941845, "clip_ratio/high_mean": 0.0010139356600120664, "clip_ratio/low_mean": 0.0004636357189156115, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014775712043046951, "epoch": 0.0008310643090684426, "grad_norm": 0.02021688222885132, "learning_rate": 9.146341463414634e-07, "loss": -0.002, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 516.5714721679688, "completions/mean_terminated_length": 488.7272644042969, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0008419993657667115, "grad_norm": 0.2136283814907074, "learning_rate": 9.26829268292683e-07, "loss": 0.0079, "num_tokens": 1950396.0, "reward": 0.2589285969734192, "reward_std": 0.2298194319009781, "rewards/accuracy_reward/mean": 0.1964285671710968, "rewards/accuracy_reward/std": 0.3990819752216339, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.4862987697124481, "step": 77 }, { "clip_ratio/high_max": 0.0023084995336830616, "clip_ratio/high_mean": 0.0010965176625177264, "clip_ratio/low_mean": 0.0008275257423520088, "clip_ratio/low_min": 7.656381785636768e-05, "clip_ratio/region_mean": 0.0019240434048697352, "epoch": 0.0008529344224649805, "grad_norm": 0.17822469770908356, "learning_rate": 9.390243902439024e-07, "loss": 0.0075, "step": 78 }, { "clip_ratio/high_max": 0.01154249720275402, "clip_ratio/high_mean": 0.0040679750964045525, "clip_ratio/low_mean": 0.0036565426271408796, "clip_ratio/low_min": 0.00032478076172992587, "clip_ratio/region_mean": 0.007724517025053501, "epoch": 0.0008638694791632494, "grad_norm": 0.14549744129180908, "learning_rate": 9.512195121951218e-07, "loss": 0.0069, "step": 79 }, { "clip_ratio/high_max": 0.01720881462097168, "clip_ratio/high_mean": 0.006936359219253063, "clip_ratio/low_mean": 0.007120309863239527, "clip_ratio/low_min": 0.0003789108886849135, "clip_ratio/region_mean": 0.014056667685508728, "epoch": 0.0008748045358615184, "grad_norm": 0.13901934027671814, "learning_rate": 9.634146341463414e-07, "loss": 0.0065, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 637.5803833007812, "completions/mean_terminated_length": 557.7453002929688, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.0008857395925597874, "grad_norm": 0.2482970654964447, "learning_rate": 9.756097560975609e-07, "loss": 0.0017, "num_tokens": 2039417.0, "reward": 0.25267860293388367, "reward_std": 0.2906312346458435, "rewards/accuracy_reward/mean": 0.1964285671710968, "rewards/accuracy_reward/std": 0.3990819752216339, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4983079433441162, "step": 81 }, { "clip_ratio/high_max": 0.0019218450179323554, "clip_ratio/high_mean": 0.0008665407076478004, "clip_ratio/low_mean": 0.0014180873986333609, "clip_ratio/low_min": 0.0002848461735993624, "clip_ratio/region_mean": 0.0022846278734505177, "epoch": 0.0008966746492580564, "grad_norm": 0.18263693153858185, "learning_rate": 9.878048780487804e-07, "loss": 0.0011, "step": 82 }, { "clip_ratio/high_max": 0.011531069874763489, "clip_ratio/high_mean": 0.0035708413925021887, "clip_ratio/low_mean": 0.0035906818229705095, "clip_ratio/low_min": 0.0006909547955729067, "clip_ratio/region_mean": 0.007161523215472698, "epoch": 0.0009076097059563254, "grad_norm": 0.15612059831619263, "learning_rate": 1e-06, "loss": 0.0006, "step": 83 }, { "clip_ratio/high_max": 0.017616912722587585, "clip_ratio/high_mean": 0.005587514955550432, "clip_ratio/low_mean": 0.006551002152264118, "clip_ratio/low_min": 0.0011934672947973013, "clip_ratio/region_mean": 0.012138517573475838, "epoch": 0.0009185447626545944, "grad_norm": 0.13437975943088531, "learning_rate": 1e-06, "loss": 0.0002, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 633.7678833007812, "completions/mean_terminated_length": 567.6822509765625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.0009294798193528633, "grad_norm": 0.133569598197937, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 2126051.0, "reward": 0.31250008940696716, "reward_std": 0.07976776361465454, "rewards/accuracy_reward/mean": 0.2410714328289032, "rewards/accuracy_reward/std": 0.42965593934059143, "rewards/format_reward/mean": 0.7142857313156128, "rewards/format_reward/std": 0.4537842869758606, "step": 85 }, { "clip_ratio/high_max": 0.0008865247946232557, "clip_ratio/high_mean": 0.00044517754577100277, "clip_ratio/low_mean": 0.0007196103106252849, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011647877981886268, "epoch": 0.0009404148760511323, "grad_norm": 0.12368395179510117, "learning_rate": 1e-06, "loss": 0.0123, "step": 86 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.0015926456544548273, "clip_ratio/low_mean": 0.0016206876607611775, "clip_ratio/low_min": 0.00013460761692840606, "clip_ratio/region_mean": 0.0032133334316313267, "epoch": 0.0009513499327494014, "grad_norm": 0.07820602506399155, "learning_rate": 1e-06, "loss": 0.0121, "step": 87 }, { "clip_ratio/high_max": 0.012411347590386868, "clip_ratio/high_mean": 0.003177961567416787, "clip_ratio/low_mean": 0.0030520872678607702, "clip_ratio/low_min": 0.0002692152338568121, "clip_ratio/region_mean": 0.00623004836961627, "epoch": 0.0009622849894476703, "grad_norm": 0.06198735162615776, "learning_rate": 1e-06, "loss": 0.0119, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 639.9732666015625, "completions/mean_terminated_length": 639.9732666015625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.0009732200461459393, "grad_norm": 0.13909491896629333, "learning_rate": 1e-06, "loss": -0.0265, "num_tokens": 2216796.0, "reward": 0.12142857909202576, "reward_std": 0.1898127645254135, "rewards/accuracy_reward/mean": 0.0535714291036129, "rewards/accuracy_reward/std": 0.2261820137500763, "rewards/format_reward/mean": 0.6785714030265808, "rewards/format_reward/std": 0.46912387013435364, "step": 89 }, { "clip_ratio/high_max": 0.0012893243692815304, "clip_ratio/high_mean": 0.0006003719754517078, "clip_ratio/low_mean": 0.0004279267159290612, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001028298633173108, "epoch": 0.0009841551028442082, "grad_norm": 0.11283573508262634, "learning_rate": 1e-06, "loss": -0.0267, "step": 90 }, { "clip_ratio/high_max": 0.0033522434532642365, "clip_ratio/high_mean": 0.0014228541404008865, "clip_ratio/low_mean": 0.0016091503202915192, "clip_ratio/low_min": 0.0005516227101907134, "clip_ratio/region_mean": 0.0030320044606924057, "epoch": 0.0009950901595424772, "grad_norm": 0.09381996840238571, "learning_rate": 1e-06, "loss": -0.027, "step": 91 }, { "clip_ratio/high_max": 0.006317689549177885, "clip_ratio/high_mean": 0.002791736973449588, "clip_ratio/low_mean": 0.0032940569799393415, "clip_ratio/low_min": 0.0010113082826137543, "clip_ratio/region_mean": 0.006085793953388929, "epoch": 0.0010060252162407462, "grad_norm": 0.0848713293671608, "learning_rate": 1e-06, "loss": -0.0273, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 603.4553833007812, "completions/mean_terminated_length": 603.4553833007812, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.0010169602729390153, "grad_norm": 0.11820001900196075, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 2305775.0, "reward": 0.20892859995365143, "reward_std": 0.15180592238903046, "rewards/accuracy_reward/mean": 0.1428571492433548, "rewards/accuracy_reward/std": 0.3514998257160187, "rewards/format_reward/mean": 0.6607142686843872, "rewards/format_reward/std": 0.4755948781967163, "step": 93 }, { "clip_ratio/high_max": 0.0008391608134843409, "clip_ratio/high_mean": 0.00060434261104092, "clip_ratio/low_mean": 0.0003632372245192528, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009675797773525119, "epoch": 0.001027895329637284, "grad_norm": 0.0917087122797966, "learning_rate": 1e-06, "loss": 0.0035, "step": 94 }, { "clip_ratio/high_max": 0.0029719448648393154, "clip_ratio/high_mean": 0.0014942000852897763, "clip_ratio/low_mean": 0.0010373879922553897, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002531588077545166, "epoch": 0.001038830386335553, "grad_norm": 0.0799674242734909, "learning_rate": 1e-06, "loss": 0.0033, "step": 95 }, { "clip_ratio/high_max": 0.005495064426213503, "clip_ratio/high_mean": 0.0022591736633330584, "clip_ratio/low_mean": 0.001941935857757926, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004201109055429697, "epoch": 0.0010497654430338221, "grad_norm": 0.07114733755588531, "learning_rate": 1e-06, "loss": 0.0031, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1255.8125, "completions/mean_terminated_length": 1142.642822265625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.0010607004997320911, "grad_norm": 0.10637569427490234, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 2467818.0, "reward": 0.24732144176959991, "reward_std": 0.134047269821167, "rewards/accuracy_reward/mean": 0.2053571492433548, "rewards/accuracy_reward/std": 0.4057779312133789, "rewards/format_reward/mean": 0.4196428656578064, "rewards/format_reward/std": 0.49571847915649414, "step": 97 }, { "clip_ratio/high_max": 0.001143619534559548, "clip_ratio/high_mean": 0.00044968040310777724, "clip_ratio/low_mean": 0.0005705053918063641, "clip_ratio/low_min": 0.00020730758842546493, "clip_ratio/region_mean": 0.0010201857658103108, "epoch": 0.0010716355564303602, "grad_norm": 0.07828596234321594, "learning_rate": 1e-06, "loss": 0.0053, "step": 98 }, { "clip_ratio/high_max": 0.003621462034061551, "clip_ratio/high_mean": 0.0011392736341804266, "clip_ratio/low_mean": 0.0010652748169377446, "clip_ratio/low_min": 0.0002791402512229979, "clip_ratio/region_mean": 0.0022045483347028494, "epoch": 0.0010825706131286292, "grad_norm": 0.06696613132953644, "learning_rate": 1e-06, "loss": 0.0052, "step": 99 }, { "clip_ratio/high_max": 0.007242924068123102, "clip_ratio/high_mean": 0.001881648669950664, "clip_ratio/low_mean": 0.0016347207129001617, "clip_ratio/low_min": 0.0002392630703980103, "clip_ratio/region_mean": 0.003516369266435504, "epoch": 0.001093505669826898, "grad_norm": 0.054511819034814835, "learning_rate": 1e-06, "loss": 0.005, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 855.7232666015625, "completions/mean_terminated_length": 844.9819946289062, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.001104440726525167, "grad_norm": 0.023753512650728226, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 2580543.0, "reward": 0.07500001043081284, "reward_std": 0.041345614939928055, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4349588453769684, "step": 101 }, { "clip_ratio/high_max": 0.0006910252850502729, "clip_ratio/high_mean": 0.0004304506292100996, "clip_ratio/low_mean": 0.00018152353004552424, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006119741010479629, "epoch": 0.001115375783223436, "grad_norm": 0.023200614377856255, "learning_rate": 1e-06, "loss": 0.0015, "step": 102 }, { "clip_ratio/high_max": 0.0013935340102761984, "clip_ratio/high_mean": 0.0008892741752788424, "clip_ratio/low_mean": 0.00039294740417972207, "clip_ratio/low_min": 0.0001394991995766759, "clip_ratio/region_mean": 0.0012822215212509036, "epoch": 0.001126310839921705, "grad_norm": 0.021049363538622856, "learning_rate": 1e-06, "loss": 0.0015, "step": 103 }, { "clip_ratio/high_max": 0.001950947567820549, "clip_ratio/high_mean": 0.0012700182851403952, "clip_ratio/low_mean": 0.0007963463431224227, "clip_ratio/low_min": 0.00034551264252513647, "clip_ratio/region_mean": 0.0020663647446781397, "epoch": 0.001137245896619974, "grad_norm": 0.019882960245013237, "learning_rate": 1e-06, "loss": 0.0015, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1004.3839721679688, "completions/mean_terminated_length": 890.7227783203125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.0011481809533182429, "grad_norm": 0.053473908454179764, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 2713082.0, "reward": 0.08124999701976776, "reward_std": 0.09330786019563675, "rewards/accuracy_reward/mean": 0.01785714365541935, "rewards/accuracy_reward/std": 0.1330273300409317, "rewards/format_reward/mean": 0.6339285969734192, "rewards/format_reward/std": 0.483894407749176, "step": 105 }, { "clip_ratio/high_max": 0.0009361834963783622, "clip_ratio/high_mean": 0.0003966704534832388, "clip_ratio/low_mean": 0.00037176930345594883, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007684397278353572, "epoch": 0.0011591160100165119, "grad_norm": 0.04168293997645378, "learning_rate": 1e-06, "loss": 0.0052, "step": 106 }, { "clip_ratio/high_max": 0.0016188714653253555, "clip_ratio/high_mean": 0.000767249264754355, "clip_ratio/low_mean": 0.000624868378508836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001392117701470852, "epoch": 0.001170051066714781, "grad_norm": 0.03727998584508896, "learning_rate": 1e-06, "loss": 0.0052, "step": 107 }, { "clip_ratio/high_max": 0.0024283071979880333, "clip_ratio/high_mean": 0.0011697775917127728, "clip_ratio/low_mean": 0.001000600284896791, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002170377876609564, "epoch": 0.00118098612341305, "grad_norm": 0.0324622318148613, "learning_rate": 1e-06, "loss": 0.0051, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 458.982177734375, "completions/mean_terminated_length": 458.982177734375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.001191921180111319, "grad_norm": 0.027233324944972992, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 2786300.0, "reward": 0.08303572982549667, "reward_std": 0.030857985839247704, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.8303571343421936, "rewards/format_reward/std": 0.37700554728507996, "step": 109 }, { "clip_ratio/high_max": 0.0011216350831091404, "clip_ratio/high_mean": 0.0006159297772683203, "clip_ratio/low_mean": 0.00013789345393888652, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007538231438957155, "epoch": 0.0012028562368095878, "grad_norm": 0.027108456939458847, "learning_rate": 1e-06, "loss": 0.0009, "step": 110 }, { "clip_ratio/high_max": 0.0024665980599820614, "clip_ratio/high_mean": 0.0007811164832673967, "clip_ratio/low_mean": 0.00045345458784140646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012345711002126336, "epoch": 0.0012137912935078568, "grad_norm": 0.024394135922193527, "learning_rate": 1e-06, "loss": 0.0008, "step": 111 }, { "clip_ratio/high_max": 0.004110997077077627, "clip_ratio/high_mean": 0.0015342350816354156, "clip_ratio/low_mean": 0.0009296265197917819, "clip_ratio/low_min": 0.0004689111956395209, "clip_ratio/region_mean": 0.002463861834257841, "epoch": 0.0012247263502061258, "grad_norm": 0.023147467523813248, "learning_rate": 1e-06, "loss": 0.0008, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 291.33038330078125, "completions/mean_terminated_length": 275.5045166015625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0012356614069043948, "grad_norm": 0.06344415992498398, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 2835449.0, "reward": 0.10625001788139343, "reward_std": 0.05276830121874809, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 113 }, { "clip_ratio/high_max": 0.0007813517586328089, "clip_ratio/high_mean": 0.000190377511898987, "clip_ratio/low_mean": 0.0002739811025094241, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004643586289603263, "epoch": 0.0012465964636026638, "grad_norm": 0.04217861220240593, "learning_rate": 1e-06, "loss": 0.008, "step": 114 }, { "clip_ratio/high_max": 0.0016929287230595946, "clip_ratio/high_mean": 0.0004529635771177709, "clip_ratio/low_mean": 0.0011529650073498487, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016059285262599587, "epoch": 0.0012575315203009329, "grad_norm": 0.03459494560956955, "learning_rate": 1e-06, "loss": 0.0079, "step": 115 }, { "clip_ratio/high_max": 0.0024742805399000645, "clip_ratio/high_mean": 0.000618190155364573, "clip_ratio/low_mean": 0.0024859921541064978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031041824258863926, "epoch": 0.0012684665769992017, "grad_norm": 0.028240878134965897, "learning_rate": 1e-06, "loss": 0.0078, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 473.7232360839844, "completions/mean_terminated_length": 473.7232360839844, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.0012794016336974707, "grad_norm": 0.23190933465957642, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 2904214.0, "reward": 0.32678574323654175, "reward_std": 0.18174830079078674, "rewards/accuracy_reward/mean": 0.2321428507566452, "rewards/accuracy_reward/std": 0.4240972101688385, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261820137500763, "step": 117 }, { "clip_ratio/high_max": 0.012622720561921597, "clip_ratio/high_mean": 0.002667257096618414, "clip_ratio/low_mean": 0.0016445304499939084, "clip_ratio/low_min": 0.0002468526363372803, "clip_ratio/region_mean": 0.0043117874301970005, "epoch": 0.0012903366903957397, "grad_norm": 0.16302308440208435, "learning_rate": 1e-06, "loss": -0.0048, "step": 118 }, { "clip_ratio/high_max": 0.03786816447973251, "clip_ratio/high_mean": 0.00882615614682436, "clip_ratio/low_mean": 0.0045376853086054325, "clip_ratio/low_min": 0.0006171315908432007, "clip_ratio/region_mean": 0.01336384005844593, "epoch": 0.0013012717470940087, "grad_norm": 0.12206914275884628, "learning_rate": 1e-06, "loss": -0.0054, "step": 119 }, { "clip_ratio/high_max": 0.05797101557254791, "clip_ratio/high_mean": 0.013341264799237251, "clip_ratio/low_mean": 0.006899774074554443, "clip_ratio/low_min": 0.000987410545349121, "clip_ratio/region_mean": 0.020241038873791695, "epoch": 0.0013122068037922778, "grad_norm": 0.09893647581338882, "learning_rate": 1e-06, "loss": -0.0058, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1182.7232666015625, "completions/mean_terminated_length": 1027.88427734375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.0013231418604905466, "grad_norm": 0.02082119882106781, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 3060591.0, "reward": 0.06339286267757416, "reward_std": 0.043387822806835175, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.6339285969734192, "rewards/format_reward/std": 0.483894407749176, "step": 121 }, { "clip_ratio/high_max": 0.001145475427620113, "clip_ratio/high_mean": 0.0005733342259190977, "clip_ratio/low_mean": 0.00017528988246340305, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007486240938305855, "epoch": 0.0013340769171888156, "grad_norm": 0.020690133795142174, "learning_rate": 1e-06, "loss": 0.0029, "step": 122 }, { "clip_ratio/high_max": 0.002004581969231367, "clip_ratio/high_mean": 0.0008856711792759597, "clip_ratio/low_mean": 0.0003247045970056206, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012103758053854108, "epoch": 0.0013450119738870846, "grad_norm": 0.019811440259218216, "learning_rate": 1e-06, "loss": 0.0029, "step": 123 }, { "clip_ratio/high_max": 0.004009163938462734, "clip_ratio/high_mean": 0.0013574919430539012, "clip_ratio/low_mean": 0.000379743316443637, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001737235113978386, "epoch": 0.0013559470305853536, "grad_norm": 0.019280998036265373, "learning_rate": 1e-06, "loss": 0.0029, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1476.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 477.7232360839844, "completions/mean_terminated_length": 477.7232360839844, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.0013668820872836226, "grad_norm": 0.1581992655992508, "learning_rate": 1e-06, "loss": -0.0149, "num_tokens": 3132912.0, "reward": 0.47232145071029663, "reward_std": 0.25426214933395386, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48852667212486267, "rewards/format_reward/mean": 0.8839285969734192, "rewards/format_reward/std": 0.3217501640319824, "step": 125 }, { "clip_ratio/high_max": 0.002819548826664686, "clip_ratio/high_mean": 0.0010755109833553433, "clip_ratio/low_mean": 0.0006568928947672248, "clip_ratio/low_min": 0.00011963153519900516, "clip_ratio/region_mean": 0.00173240399453789, "epoch": 0.0013778171439818914, "grad_norm": 0.1280120313167572, "learning_rate": 1e-06, "loss": -0.0153, "step": 126 }, { "clip_ratio/high_max": 0.007205513771623373, "clip_ratio/high_mean": 0.0034155980683863163, "clip_ratio/low_mean": 0.0023422951344400644, "clip_ratio/low_min": 0.00011963153519900516, "clip_ratio/region_mean": 0.005757893435657024, "epoch": 0.0013887522006801605, "grad_norm": 0.09794305264949799, "learning_rate": 1e-06, "loss": -0.0158, "step": 127 }, { "clip_ratio/high_max": 0.012844611890614033, "clip_ratio/high_mean": 0.006252243183553219, "clip_ratio/low_mean": 0.004035287536680698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010287531651556492, "epoch": 0.0013996872573784295, "grad_norm": 0.07884787768125534, "learning_rate": 1e-06, "loss": -0.0162, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 904.08935546875, "completions/mean_terminated_length": 850.635498046875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.0014106223140766985, "grad_norm": 0.06924200803041458, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 3255170.0, "reward": 0.09821430593729019, "reward_std": 0.0910198763012886, "rewards/accuracy_reward/mean": 0.01785714365541935, "rewards/accuracy_reward/std": 0.1330273300409317, "rewards/format_reward/mean": 0.8035714030265808, "rewards/format_reward/std": 0.3990819454193115, "step": 129 }, { "clip_ratio/high_max": 0.0018024513265118003, "clip_ratio/high_mean": 0.0008109541377052665, "clip_ratio/low_mean": 0.00038576911902055144, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011967233149334788, "epoch": 0.0014215573707749675, "grad_norm": 0.04875153675675392, "learning_rate": 1e-06, "loss": -0.0014, "step": 130 }, { "clip_ratio/high_max": 0.0013208294985815883, "clip_ratio/high_mean": 0.0011240259045735002, "clip_ratio/low_mean": 0.0011755600571632385, "clip_ratio/low_min": 0.00018024512974079698, "clip_ratio/region_mean": 0.0022995860781520605, "epoch": 0.0014324924274732365, "grad_norm": 0.040166694670915604, "learning_rate": 1e-06, "loss": -0.0015, "step": 131 }, { "clip_ratio/high_max": 0.002278480911627412, "clip_ratio/high_mean": 0.0014757791068404913, "clip_ratio/low_mean": 0.002575651975348592, "clip_ratio/low_min": 0.00018024512974079698, "clip_ratio/region_mean": 0.004051431082189083, "epoch": 0.0014434274841715054, "grad_norm": 0.0343865267932415, "learning_rate": 1e-06, "loss": -0.0015, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1163.232177734375, "completions/mean_terminated_length": 1047.050537109375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.0014543625408697744, "grad_norm": 0.142621710896492, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 3406228.0, "reward": 0.27321431040763855, "reward_std": 0.3195189833641052, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3920665979385376, "rewards/format_reward/mean": 0.8571428656578064, "rewards/format_reward/std": 0.3514997959136963, "step": 133 }, { "clip_ratio/high_max": 0.000951798225287348, "clip_ratio/high_mean": 0.0005443213740363717, "clip_ratio/low_mean": 0.0007858923054300249, "clip_ratio/low_min": 0.00011430313315941021, "clip_ratio/region_mean": 0.0013302137376740575, "epoch": 0.0014652975975680434, "grad_norm": 0.1258755624294281, "learning_rate": 1e-06, "loss": 0.0077, "step": 134 }, { "clip_ratio/high_max": 0.004487048834562302, "clip_ratio/high_mean": 0.0018837058451026678, "clip_ratio/low_mean": 0.0024830519687384367, "clip_ratio/low_min": 0.00045721253263764083, "clip_ratio/region_mean": 0.0043667578138411045, "epoch": 0.0014762326542663124, "grad_norm": 0.10603295266628265, "learning_rate": 1e-06, "loss": 0.0072, "step": 135 }, { "clip_ratio/high_max": 0.008022299036383629, "clip_ratio/high_mean": 0.0037462038453668356, "clip_ratio/low_mean": 0.004885740578174591, "clip_ratio/low_min": 0.0006411169888451695, "clip_ratio/region_mean": 0.008631943725049496, "epoch": 0.0014871677109645814, "grad_norm": 0.09647484123706818, "learning_rate": 1e-06, "loss": 0.0067, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 745.4553833007812, "completions/mean_terminated_length": 709.60546875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.0014981027676628502, "grad_norm": 0.05984002724289894, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 3513183.0, "reward": 0.08928573876619339, "reward_std": 0.07310117781162262, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.8035714030265808, "rewards/format_reward/std": 0.3990819454193115, "step": 137 }, { "clip_ratio/high_max": 0.0012608220567926764, "clip_ratio/high_mean": 0.0006319649401120842, "clip_ratio/low_mean": 0.00045674372813664377, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010887086391448975, "epoch": 0.0015090378243611193, "grad_norm": 0.04797409474849701, "learning_rate": 1e-06, "loss": 0.0021, "step": 138 }, { "clip_ratio/high_max": 0.0024375892244279385, "clip_ratio/high_mean": 0.0011787747498601675, "clip_ratio/low_mean": 0.0007476304890587926, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00192640523891896, "epoch": 0.0015199728810593883, "grad_norm": 0.03870755434036255, "learning_rate": 1e-06, "loss": 0.002, "step": 139 }, { "clip_ratio/high_max": 0.003950575832277536, "clip_ratio/high_mean": 0.001632058760151267, "clip_ratio/low_mean": 0.0015648575499653816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031969163101166487, "epoch": 0.0015309079377576573, "grad_norm": 0.03060651198029518, "learning_rate": 1e-06, "loss": 0.0019, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 1041.08935546875, "completions/mean_terminated_length": 1022.7817993164062, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.0015418429944559263, "grad_norm": 0.05656857043504715, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 3654649.0, "reward": 0.09017858654260635, "reward_std": 0.07182445377111435, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3920665979385376, "step": 141 }, { "clip_ratio/high_max": 0.0009250693838112056, "clip_ratio/high_mean": 0.0006185629172250628, "clip_ratio/low_mean": 0.0002760044299066067, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008945673471316695, "epoch": 0.0015527780511541951, "grad_norm": 0.0437757670879364, "learning_rate": 1e-06, "loss": -0.0029, "step": 142 }, { "clip_ratio/high_max": 0.0013029315741732717, "clip_ratio/high_mean": 0.0007653120555914938, "clip_ratio/low_mean": 0.0004719887219835073, "clip_ratio/low_min": 5.116659667692147e-05, "clip_ratio/region_mean": 0.0012373009230941534, "epoch": 0.0015637131078524642, "grad_norm": 0.04216944798827171, "learning_rate": 1e-06, "loss": -0.003, "step": 143 }, { "clip_ratio/high_max": 0.0028664495330303907, "clip_ratio/high_mean": 0.0011535061057657003, "clip_ratio/low_mean": 0.0006789136677980423, "clip_ratio/low_min": 0.00015349980094470084, "clip_ratio/region_mean": 0.0018324197735637426, "epoch": 0.0015746481645507332, "grad_norm": 0.0333990640938282, "learning_rate": 1e-06, "loss": -0.003, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 460.20538330078125, "completions/mean_terminated_length": 460.20538330078125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.0015855832212490022, "grad_norm": 0.0918477326631546, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 3726156.0, "reward": 0.107142873108387, "reward_std": 0.05380219221115112, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 145 }, { "clip_ratio/high_max": 0.002310030395165086, "clip_ratio/high_mean": 0.0005134900566190481, "clip_ratio/low_mean": 0.0005553321097977459, "clip_ratio/low_min": 9.45894789765589e-05, "clip_ratio/region_mean": 0.0010688221082091331, "epoch": 0.0015965182779472712, "grad_norm": 0.06242881342768669, "learning_rate": 1e-06, "loss": -0.0019, "step": 146 }, { "clip_ratio/high_max": 0.0053495438769459724, "clip_ratio/high_mean": 0.0009654107270762324, "clip_ratio/low_mean": 0.001455288496799767, "clip_ratio/low_min": 0.0003437607374507934, "clip_ratio/region_mean": 0.0024206992238759995, "epoch": 0.0016074533346455402, "grad_norm": 0.048312462866306305, "learning_rate": 1e-06, "loss": -0.002, "step": 147 }, { "clip_ratio/high_max": 0.008510638028383255, "clip_ratio/high_mean": 0.0017265069764107466, "clip_ratio/low_mean": 0.0031380162108689547, "clip_ratio/low_min": 0.0005156411207281053, "clip_ratio/region_mean": 0.004864522721618414, "epoch": 0.001618388391343809, "grad_norm": 0.03508325293660164, "learning_rate": 1e-06, "loss": -0.0021, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1233.125, "completions/mean_terminated_length": 1210.6971435546875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.001629323448042078, "grad_norm": 0.025515329092741013, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 3882830.0, "reward": 0.08750001341104507, "reward_std": 0.0769662857055664, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.7857142686843872, "rewards/format_reward/std": 0.41217002272605896, "step": 149 }, { "clip_ratio/high_max": 0.0009478264837525785, "clip_ratio/high_mean": 0.0004188166349194944, "clip_ratio/low_mean": 0.00020255900744814426, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006213756278157234, "epoch": 0.001640258504740347, "grad_norm": 0.02533772960305214, "learning_rate": 1e-06, "loss": 0.0028, "step": 150 }, { "clip_ratio/high_max": 0.0007754943799227476, "clip_ratio/high_mean": 0.0004965187981724739, "clip_ratio/low_mean": 0.0003570202097762376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000853539037052542, "epoch": 0.001651193561438616, "grad_norm": 0.023106753826141357, "learning_rate": 1e-06, "loss": 0.0027, "step": 151 }, { "clip_ratio/high_max": 0.0012308955192565918, "clip_ratio/high_mean": 0.0008705434156581759, "clip_ratio/low_mean": 0.0007697716355323792, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001640315051190555, "epoch": 0.0016621286181368851, "grad_norm": 0.02115386165678501, "learning_rate": 1e-06, "loss": 0.0027, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 611.6785888671875, "completions/mean_terminated_length": 611.6785888671875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.001673063674835154, "grad_norm": 0.10400090366601944, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 3975418.0, "reward": 0.12589286267757416, "reward_std": 0.13206826150417328, "rewards/accuracy_reward/mean": 0.0357142873108387, "rewards/accuracy_reward/std": 0.18641093373298645, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417314529419, "step": 153 }, { "clip_ratio/high_max": 0.0010519269853830338, "clip_ratio/high_mean": 0.0004195675428491086, "clip_ratio/low_mean": 0.0005048479652032256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009244154789485037, "epoch": 0.001683998731533423, "grad_norm": 0.09566161036491394, "learning_rate": 1e-06, "loss": -0.0088, "step": 154 }, { "clip_ratio/high_max": 0.005259634926915169, "clip_ratio/high_mean": 0.0013248688774183393, "clip_ratio/low_mean": 0.0018848018953576684, "clip_ratio/low_min": 0.00032102729892358184, "clip_ratio/region_mean": 0.003209670539945364, "epoch": 0.001694933788231692, "grad_norm": 0.07607891410589218, "learning_rate": 1e-06, "loss": -0.0092, "step": 155 }, { "clip_ratio/high_max": 0.010232380591332912, "clip_ratio/high_mean": 0.002212588908150792, "clip_ratio/low_mean": 0.004140789620578289, "clip_ratio/low_min": 0.0008828250574879348, "clip_ratio/region_mean": 0.0063533782958984375, "epoch": 0.001705868844929961, "grad_norm": 0.0682518258690834, "learning_rate": 1e-06, "loss": -0.0094, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 450.83929443359375, "completions/mean_terminated_length": 450.83929443359375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.00171680390162823, "grad_norm": 0.027389002963900566, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 4041648.0, "reward": 0.09285715222358704, "reward_std": 0.02571207843720913, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.25869685411453247, "step": 157 }, { "clip_ratio/high_max": 0.0008704735664650798, "clip_ratio/high_mean": 0.0005300022894516587, "clip_ratio/low_mean": 0.0001220887352246791, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006520910537801683, "epoch": 0.0017277389583264988, "grad_norm": 0.02549685165286064, "learning_rate": 1e-06, "loss": 0.0017, "step": 158 }, { "clip_ratio/high_max": 0.0017337700119242072, "clip_ratio/high_mean": 0.0011629096698015928, "clip_ratio/low_mean": 0.0003547764499671757, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015176861779764295, "epoch": 0.0017386740150247678, "grad_norm": 0.024154644459486008, "learning_rate": 1e-06, "loss": 0.0017, "step": 159 }, { "clip_ratio/high_max": 0.00604142714291811, "clip_ratio/high_mean": 0.0022426473442465067, "clip_ratio/low_mean": 0.001221979153342545, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00346462638117373, "epoch": 0.0017496090717230369, "grad_norm": 0.021228190511465073, "learning_rate": 1e-06, "loss": 0.0016, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 732.7053833007812, "completions/mean_terminated_length": 708.7908935546875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.0017605441284213059, "grad_norm": 0.09178578108549118, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 4144375.0, "reward": 0.12232144176959991, "reward_std": 0.08751466125249863, "rewards/accuracy_reward/mean": 0.02678571455180645, "rewards/accuracy_reward/std": 0.1621822714805603, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 161 }, { "clip_ratio/high_max": 0.0013963694218546152, "clip_ratio/high_mean": 0.0005027515580877662, "clip_ratio/low_mean": 0.000593507313169539, "clip_ratio/low_min": 4.97561959491577e-05, "clip_ratio/region_mean": 0.0010962588712573051, "epoch": 0.001771479185119575, "grad_norm": 0.052314016968011856, "learning_rate": 1e-06, "loss": 0.0032, "step": 162 }, { "clip_ratio/high_max": 0.002593257464468479, "clip_ratio/high_mean": 0.0008977393736131489, "clip_ratio/low_mean": 0.0016939230263233185, "clip_ratio/low_min": 0.0002985371684189886, "clip_ratio/region_mean": 0.0025916623417288065, "epoch": 0.001782414241817844, "grad_norm": 0.039197176694869995, "learning_rate": 1e-06, "loss": 0.0031, "step": 163 }, { "clip_ratio/high_max": 0.003329536411911249, "clip_ratio/high_mean": 0.001077432418242097, "clip_ratio/low_mean": 0.003707718802616, "clip_ratio/low_min": 0.0006965867360122502, "clip_ratio/region_mean": 0.004785151686519384, "epoch": 0.0017933492985161127, "grad_norm": 0.0326027050614357, "learning_rate": 1e-06, "loss": 0.003, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 478.419677734375, "completions/mean_terminated_length": 478.419677734375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.0018042843552143817, "grad_norm": 0.38635629415512085, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 4220974.0, "reward": 0.30625003576278687, "reward_std": 0.2627086639404297, "rewards/accuracy_reward/mean": 0.2142857164144516, "rewards/accuracy_reward/std": 0.41217005252838135, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.27306708693504333, "step": 165 }, { "clip_ratio/high_max": 0.011960855685174465, "clip_ratio/high_mean": 0.0032820510677993298, "clip_ratio/low_mean": 0.004140849690884352, "clip_ratio/low_min": 0.00011150136560900137, "clip_ratio/region_mean": 0.007422901224344969, "epoch": 0.0018152194119126508, "grad_norm": 0.2953922748565674, "learning_rate": 1e-06, "loss": 0.0114, "step": 166 }, { "clip_ratio/high_max": 0.02392171137034893, "clip_ratio/high_mean": 0.008143913932144642, "clip_ratio/low_mean": 0.010672117583453655, "clip_ratio/low_min": 0.000390254775993526, "clip_ratio/region_mean": 0.018816033378243446, "epoch": 0.0018261544686109198, "grad_norm": 0.2262006253004074, "learning_rate": 1e-06, "loss": 0.0099, "step": 167 }, { "clip_ratio/high_max": 0.02790866233408451, "clip_ratio/high_mean": 0.011872656643390656, "clip_ratio/low_mean": 0.01964360475540161, "clip_ratio/low_min": 0.000557506806217134, "clip_ratio/region_mean": 0.03151626139879227, "epoch": 0.0018370895253091888, "grad_norm": 0.2433360069990158, "learning_rate": 1e-06, "loss": 0.0089, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 548.3928833007812, "completions/mean_terminated_length": 548.3928833007812, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.0018480245820074576, "grad_norm": 0.015911461785435677, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 4299950.0, "reward": 0.3410714864730835, "reward_std": 0.023383798077702522, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4349588453769684, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.28643733263015747, "step": 169 }, { "clip_ratio/high_max": 0.001899335184134543, "clip_ratio/high_mean": 0.0007947622216306627, "clip_ratio/low_mean": 0.00023074833734426647, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010255106026306748, "epoch": 0.0018589596387057266, "grad_norm": 0.019726956263184547, "learning_rate": 1e-06, "loss": 0.0005, "step": 170 }, { "clip_ratio/high_max": 0.00255804811604321, "clip_ratio/high_mean": 0.0013285139575600624, "clip_ratio/low_mean": 0.0007224426954053342, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002050956478342414, "epoch": 0.0018698946954039957, "grad_norm": 0.014790475368499756, "learning_rate": 1e-06, "loss": 0.0005, "step": 171 }, { "clip_ratio/high_max": 0.0035419126506894827, "clip_ratio/high_mean": 0.0018953380640596151, "clip_ratio/low_mean": 0.0013183584669604897, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032136966474354267, "epoch": 0.0018808297521022647, "grad_norm": 0.01381615363061428, "learning_rate": 1e-06, "loss": 0.0005, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 742.9285888671875, "completions/mean_terminated_length": 719.199951171875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.0018917648088005337, "grad_norm": 0.2072668969631195, "learning_rate": 1e-06, "loss": -0.0117, "num_tokens": 4397606.0, "reward": 0.267857164144516, "reward_std": 0.3763384222984314, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3920665979385376, "rewards/format_reward/mean": 0.8035714030265808, "rewards/format_reward/std": 0.3990819454193115, "step": 173 }, { "clip_ratio/high_max": 0.005411673802882433, "clip_ratio/high_mean": 0.0015740820672363043, "clip_ratio/low_mean": 0.002410009503364563, "clip_ratio/low_min": 0.0012469692155718803, "clip_ratio/region_mean": 0.003984091337770224, "epoch": 0.0019026998654988027, "grad_norm": 0.15432241559028625, "learning_rate": 1e-06, "loss": -0.0125, "step": 174 }, { "clip_ratio/high_max": 0.01249838899821043, "clip_ratio/high_mean": 0.004020059015601873, "clip_ratio/low_mean": 0.006312613841146231, "clip_ratio/low_min": 0.0032374365255236626, "clip_ratio/region_mean": 0.010332672856748104, "epoch": 0.0019136349221970715, "grad_norm": 0.13368429243564606, "learning_rate": 1e-06, "loss": -0.0133, "step": 175 }, { "clip_ratio/high_max": 0.019327405840158463, "clip_ratio/high_mean": 0.006715203635394573, "clip_ratio/low_mean": 0.010306647047400475, "clip_ratio/low_min": 0.004635420627892017, "clip_ratio/region_mean": 0.017021849751472473, "epoch": 0.0019245699788953405, "grad_norm": 0.11235758662223816, "learning_rate": 1e-06, "loss": -0.014, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 744.8928833007812, "completions/mean_terminated_length": 696.629638671875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.0019355050355936096, "grad_norm": 0.1375121772289276, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 4497274.0, "reward": 0.38660719990730286, "reward_std": 0.2237224280834198, "rewards/accuracy_reward/mean": 0.2946428656578064, "rewards/accuracy_reward/std": 0.45793095231056213, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.27306708693504333, "step": 177 }, { "clip_ratio/high_max": 0.0012787723680958152, "clip_ratio/high_mean": 0.0006159843760542572, "clip_ratio/low_mean": 0.0009435554966330528, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001559539814479649, "epoch": 0.0019464400922918786, "grad_norm": 0.08930055797100067, "learning_rate": 1e-06, "loss": 0.0018, "step": 178 }, { "clip_ratio/high_max": 0.0027832104824483395, "clip_ratio/high_mean": 0.0014293711865320802, "clip_ratio/low_mean": 0.003551855683326721, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004981227219104767, "epoch": 0.0019573751489901474, "grad_norm": 0.07015280425548553, "learning_rate": 1e-06, "loss": 0.0015, "step": 179 }, { "clip_ratio/high_max": 0.004438092466443777, "clip_ratio/high_mean": 0.002208216581493616, "clip_ratio/low_mean": 0.006515674293041229, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008723891340196133, "epoch": 0.0019683102056884164, "grad_norm": 0.060339443385601044, "learning_rate": 1e-06, "loss": 0.0013, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2035.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 510.5982360839844, "completions/mean_terminated_length": 510.5982360839844, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.0019792452623866854, "grad_norm": 0.008939902298152447, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 4573249.0, "reward": 0.0937500149011612, "reward_std": 0.01102396473288536, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 181 }, { "clip_ratio/high_max": 0.0003437016566749662, "clip_ratio/high_mean": 9.644049714552239e-05, "clip_ratio/low_mean": 0.0006105398060753942, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007069804123602808, "epoch": 0.0019901803190849545, "grad_norm": 0.00886242464184761, "learning_rate": 1e-06, "loss": 0.0002, "step": 182 }, { "clip_ratio/high_max": 0.0008019705419428647, "clip_ratio/high_mean": 0.0001776875724317506, "clip_ratio/low_mean": 0.0009504329063929617, "clip_ratio/low_min": 0.0001718508283374831, "clip_ratio/region_mean": 0.001128120464272797, "epoch": 0.0020011153757832235, "grad_norm": 0.00843091867864132, "learning_rate": 1e-06, "loss": 0.0001, "step": 183 }, { "clip_ratio/high_max": 0.0012602393981069326, "clip_ratio/high_mean": 0.000306274916511029, "clip_ratio/low_mean": 0.0015055412659421563, "clip_ratio/low_min": 0.00040098527097143233, "clip_ratio/region_mean": 0.0018118161242455244, "epoch": 0.0020120504324814925, "grad_norm": 0.008018731139600277, "learning_rate": 1e-06, "loss": 0.0001, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1420.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 494.232177734375, "completions/mean_terminated_length": 494.232177734375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.0020229854891797615, "grad_norm": 0.10899297147989273, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 4643807.0, "reward": 0.2767857313156128, "reward_std": 0.11925646662712097, "rewards/accuracy_reward/mean": 0.1785714328289032, "rewards/accuracy_reward/std": 0.38471436500549316, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 185 }, { "clip_ratio/high_max": 0.00202977005392313, "clip_ratio/high_mean": 0.0007515425677411258, "clip_ratio/low_mean": 0.0008265268406830728, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015780694084241986, "epoch": 0.0020339205458780305, "grad_norm": 0.08338364958763123, "learning_rate": 1e-06, "loss": -0.0003, "step": 186 }, { "clip_ratio/high_max": 0.006596752442419529, "clip_ratio/high_mean": 0.0019520351197570562, "clip_ratio/low_mean": 0.002018360188230872, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003970395307987928, "epoch": 0.0020448556025762996, "grad_norm": 0.0605650320649147, "learning_rate": 1e-06, "loss": -0.0006, "step": 187 }, { "clip_ratio/high_max": 0.010233423672616482, "clip_ratio/high_mean": 0.002994147827848792, "clip_ratio/low_mean": 0.003203790867701173, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006197938229888678, "epoch": 0.002055790659274568, "grad_norm": 0.053484976291656494, "learning_rate": 1e-06, "loss": -0.0007, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 677.8660888671875, "completions/mean_terminated_length": 665.5225219726562, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.002066725715972837, "grad_norm": 0.07966841012239456, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 4741596.0, "reward": 0.13214287161827087, "reward_std": 0.12027876079082489, "rewards/accuracy_reward/mean": 0.0446428582072258, "rewards/accuracy_reward/std": 0.2074466347694397, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33220529556274414, "step": 189 }, { "clip_ratio/high_max": 0.001497006043791771, "clip_ratio/high_mean": 0.0009280600934289396, "clip_ratio/low_mean": 0.0006133720162324607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015414322260767221, "epoch": 0.002077660772671106, "grad_norm": 0.06297197192907333, "learning_rate": 1e-06, "loss": 0.0069, "step": 190 }, { "clip_ratio/high_max": 0.0021004725713282824, "clip_ratio/high_mean": 0.0015911329537630081, "clip_ratio/low_mean": 0.0014220633311197162, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030131961684674025, "epoch": 0.002088595829369375, "grad_norm": 0.051875241100788116, "learning_rate": 1e-06, "loss": 0.0068, "step": 191 }, { "clip_ratio/high_max": 0.004063302185386419, "clip_ratio/high_mean": 0.0029825004749000072, "clip_ratio/low_mean": 0.003112561535090208, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006095061544328928, "epoch": 0.0020995308860676442, "grad_norm": 0.04187750071287155, "learning_rate": 1e-06, "loss": 0.0066, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 823.857177734375, "completions/mean_terminated_length": 812.828857421875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.0021104659427659133, "grad_norm": 0.15021169185638428, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 4852012.0, "reward": 0.15625, "reward_std": 0.17802336812019348, "rewards/accuracy_reward/mean": 0.0714285746216774, "rewards/accuracy_reward/std": 0.25869685411453247, "rewards/format_reward/mean": 0.8482142686843872, "rewards/format_reward/std": 0.3604257106781006, "step": 193 }, { "clip_ratio/high_max": 0.0026001541409641504, "clip_ratio/high_mean": 0.0013006635708734393, "clip_ratio/low_mean": 0.0011017571669071913, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024024206213653088, "epoch": 0.0021214009994641823, "grad_norm": 0.10779526829719543, "learning_rate": 1e-06, "loss": 0.0135, "step": 194 }, { "clip_ratio/high_max": 0.006333869881927967, "clip_ratio/high_mean": 0.0025799479335546494, "clip_ratio/low_mean": 0.0029141181148588657, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005494066514074802, "epoch": 0.0021323360561624513, "grad_norm": 0.08414215594530106, "learning_rate": 1e-06, "loss": 0.013, "step": 195 }, { "clip_ratio/high_max": 0.008373591117560863, "clip_ratio/high_mean": 0.004029650706797838, "clip_ratio/low_mean": 0.005614197812974453, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009643848054111004, "epoch": 0.0021432711128607203, "grad_norm": 0.06841538846492767, "learning_rate": 1e-06, "loss": 0.0127, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 577.0535888671875, "completions/mean_terminated_length": 550.30908203125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.0021542061695589893, "grad_norm": 0.16247113049030304, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 4930838.0, "reward": 0.5571429133415222, "reward_std": 0.18242615461349487, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.5015259981155396, "rewards/format_reward/mean": 0.8392857313156128, "rewards/format_reward/std": 0.368917852640152, "step": 197 }, { "clip_ratio/high_max": 0.003708914155140519, "clip_ratio/high_mean": 0.0018843497382476926, "clip_ratio/low_mean": 0.0011520127300173044, "clip_ratio/low_min": 0.00015225335664581507, "clip_ratio/region_mean": 0.0030363628175109625, "epoch": 0.0021651412262572584, "grad_norm": 0.10816186666488647, "learning_rate": 1e-06, "loss": -0.0045, "step": 198 }, { "clip_ratio/high_max": 0.011412829160690308, "clip_ratio/high_mean": 0.005151406396180391, "clip_ratio/low_mean": 0.0027883669827133417, "clip_ratio/low_min": 0.0010657734237611294, "clip_ratio/region_mean": 0.007939773611724377, "epoch": 0.002176076282955527, "grad_norm": 0.08106408268213272, "learning_rate": 1e-06, "loss": -0.005, "step": 199 }, { "clip_ratio/high_max": 0.022825658321380615, "clip_ratio/high_mean": 0.009336665272712708, "clip_ratio/low_mean": 0.005405419040471315, "clip_ratio/low_min": 0.00182704022154212, "clip_ratio/region_mean": 0.014742083847522736, "epoch": 0.002187011339653796, "grad_norm": 0.060441650450229645, "learning_rate": 1e-06, "loss": -0.0052, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 800.1785888671875, "completions/mean_terminated_length": 788.9369506835938, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.002197946396352065, "grad_norm": 0.17398223280906677, "learning_rate": 1e-06, "loss": -0.0176, "num_tokens": 5036782.0, "reward": 0.46964290738105774, "reward_std": 0.2943170964717865, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4862987697124481, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261820137500763, "step": 201 }, { "clip_ratio/high_max": 0.0023767082020640373, "clip_ratio/high_mean": 0.0012728049186989665, "clip_ratio/low_mean": 0.0015603854553774, "clip_ratio/low_min": 0.0005765350069850683, "clip_ratio/region_mean": 0.0028331901412457228, "epoch": 0.002208881453050334, "grad_norm": 0.1174439787864685, "learning_rate": 1e-06, "loss": -0.0181, "step": 202 }, { "clip_ratio/high_max": 0.004753416404128075, "clip_ratio/high_mean": 0.0029471220914274454, "clip_ratio/low_mean": 0.0037910714745521545, "clip_ratio/low_min": 0.001293493784032762, "clip_ratio/region_mean": 0.006738193333148956, "epoch": 0.002219816509748603, "grad_norm": 0.08902607858181, "learning_rate": 1e-06, "loss": -0.0185, "step": 203 }, { "clip_ratio/high_max": 0.008044634945690632, "clip_ratio/high_mean": 0.004854958970099688, "clip_ratio/low_mean": 0.006592132616788149, "clip_ratio/low_min": 0.0024022292345762253, "clip_ratio/region_mean": 0.011447091586887836, "epoch": 0.002230751566446872, "grad_norm": 0.08299358934164047, "learning_rate": 1e-06, "loss": -0.0189, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 844.8392944335938, "completions/mean_terminated_length": 822.963623046875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.002241686623145141, "grad_norm": 0.09725531935691833, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 5152992.0, "reward": 0.16339287161827087, "reward_std": 0.15440881252288818, "rewards/accuracy_reward/mean": 0.0714285746216774, "rewards/accuracy_reward/std": 0.25869685411453247, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.27306708693504333, "step": 205 }, { "clip_ratio/high_max": 0.00098754174541682, "clip_ratio/high_mean": 0.0003187194524798542, "clip_ratio/low_mean": 0.0011871767928823829, "clip_ratio/low_min": 0.0008327165269292891, "clip_ratio/region_mean": 0.0015058963326737285, "epoch": 0.00225262167984341, "grad_norm": 0.07711759209632874, "learning_rate": 1e-06, "loss": -0.0072, "step": 206 }, { "clip_ratio/high_max": 0.0027347311843186617, "clip_ratio/high_mean": 0.0009674456669017673, "clip_ratio/low_mean": 0.0019748785998672247, "clip_ratio/low_min": 0.0005551443318836391, "clip_ratio/region_mean": 0.00294232415035367, "epoch": 0.002263556736541679, "grad_norm": 0.05991889536380768, "learning_rate": 1e-06, "loss": -0.0075, "step": 207 }, { "clip_ratio/high_max": 0.004026131704449654, "clip_ratio/high_mean": 0.0016004100907593966, "clip_ratio/low_mean": 0.004056769888848066, "clip_ratio/low_min": 0.0012953367549926043, "clip_ratio/region_mean": 0.0056571802124381065, "epoch": 0.002274491793239948, "grad_norm": 0.05242515727877617, "learning_rate": 1e-06, "loss": -0.0077, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 461.794677734375, "completions/mean_terminated_length": 461.794677734375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.0022854268499382167, "grad_norm": 0.12984277307987213, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 5219301.0, "reward": 0.12410715967416763, "reward_std": 0.11402992904186249, "rewards/accuracy_reward/mean": 0.02678571455180645, "rewards/accuracy_reward/std": 0.1621822714805603, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 209 }, { "clip_ratio/high_max": 0.0035587188322097063, "clip_ratio/high_mean": 0.0007737679407000542, "clip_ratio/low_mean": 0.0009953055996447802, "clip_ratio/low_min": 0.00022644927958026528, "clip_ratio/region_mean": 0.0017690735403448343, "epoch": 0.0022963619066364857, "grad_norm": 0.08917607367038727, "learning_rate": 1e-06, "loss": -0.0036, "step": 210 }, { "clip_ratio/high_max": 0.007339857518672943, "clip_ratio/high_mean": 0.0016290688654407859, "clip_ratio/low_mean": 0.004214680287986994, "clip_ratio/low_min": 0.00041034058085642755, "clip_ratio/region_mean": 0.005843748338520527, "epoch": 0.0023072969633347548, "grad_norm": 0.06134293973445892, "learning_rate": 1e-06, "loss": -0.0038, "step": 211 }, { "clip_ratio/high_max": 0.010676156729459763, "clip_ratio/high_mean": 0.002503752475604415, "clip_ratio/low_mean": 0.008600995875895023, "clip_ratio/low_min": 0.0020380434580147266, "clip_ratio/region_mean": 0.011104747653007507, "epoch": 0.0023182320200330238, "grad_norm": 0.047313421964645386, "learning_rate": 1e-06, "loss": -0.004, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 803.3482666015625, "completions/mean_terminated_length": 732.896240234375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.002329167076731293, "grad_norm": 0.046943821012973785, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 5326888.0, "reward": 0.09910715371370316, "reward_std": 0.06426939368247986, "rewards/accuracy_reward/mean": 0.008928571827709675, "rewards/accuracy_reward/std": 0.09449111670255661, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417314529419, "step": 213 }, { "clip_ratio/high_max": 0.0010810811072587967, "clip_ratio/high_mean": 0.0004824989882763475, "clip_ratio/low_mean": 0.0005189571529626846, "clip_ratio/low_min": 0.0001487707777414471, "clip_ratio/region_mean": 0.0010014561703428626, "epoch": 0.002340102133429562, "grad_norm": 0.03301604837179184, "learning_rate": 1e-06, "loss": 0.0111, "step": 214 }, { "clip_ratio/high_max": 0.0021621622145175934, "clip_ratio/high_mean": 0.0008779577328823507, "clip_ratio/low_mean": 0.0015161455376073718, "clip_ratio/low_min": 0.0005578904529102147, "clip_ratio/region_mean": 0.0023941032122820616, "epoch": 0.002351037190127831, "grad_norm": 0.025425324216485023, "learning_rate": 1e-06, "loss": 0.011, "step": 215 }, { "clip_ratio/high_max": 0.0021621622145175934, "clip_ratio/high_mean": 0.0010367566719651222, "clip_ratio/low_mean": 0.00294610601849854, "clip_ratio/low_min": 0.00104139547329396, "clip_ratio/region_mean": 0.0039828624576330185, "epoch": 0.0023619722468261, "grad_norm": 0.022809579968452454, "learning_rate": 1e-06, "loss": 0.011, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 723.607177734375, "completions/mean_terminated_length": 687.1559448242188, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.002372907303524369, "grad_norm": 0.013799415901303291, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 5426272.0, "reward": 0.09464286267757416, "reward_std": 0.015465338714420795, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261820137500763, "step": 217 }, { "clip_ratio/high_max": 0.0010771992383524776, "clip_ratio/high_mean": 0.00040522852214053273, "clip_ratio/low_mean": 0.0005624617333523929, "clip_ratio/low_min": 0.00011968881153734401, "clip_ratio/region_mean": 0.0009676902554929256, "epoch": 0.002383842360222638, "grad_norm": 0.012956572696566582, "learning_rate": 1e-06, "loss": 0.0024, "step": 218 }, { "clip_ratio/high_max": 0.001196888042613864, "clip_ratio/high_mean": 0.0006995805306360126, "clip_ratio/low_mean": 0.000676199677400291, "clip_ratio/low_min": 0.0002967359032481909, "clip_ratio/region_mean": 0.0013757802080363035, "epoch": 0.002394777416920907, "grad_norm": 0.011950033716857433, "learning_rate": 1e-06, "loss": 0.0024, "step": 219 }, { "clip_ratio/high_max": 0.0014883721014484763, "clip_ratio/high_mean": 0.0007653041393496096, "clip_ratio/low_mean": 0.001073259161785245, "clip_ratio/low_min": 0.00041891084401868284, "clip_ratio/region_mean": 0.0018385632429271936, "epoch": 0.0024057124736191755, "grad_norm": 0.01114115584641695, "learning_rate": 1e-06, "loss": 0.0024, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 886.6785888671875, "completions/mean_terminated_length": 843.6666870117188, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.0024166475303174445, "grad_norm": 0.15129195153713226, "learning_rate": 1e-06, "loss": -0.0114, "num_tokens": 5540112.0, "reward": 0.35803574323654175, "reward_std": 0.278379887342453, "rewards/accuracy_reward/mean": 0.2678571343421936, "rewards/accuracy_reward/std": 0.44483304023742676, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417314529419, "step": 221 }, { "clip_ratio/high_max": 0.003874813672155142, "clip_ratio/high_mean": 0.0012036676052957773, "clip_ratio/low_mean": 0.0010082644876092672, "clip_ratio/low_min": 0.00030075188260525465, "clip_ratio/region_mean": 0.002211931860074401, "epoch": 0.0024275825870157136, "grad_norm": 0.1080126166343689, "learning_rate": 1e-06, "loss": -0.0118, "step": 222 }, { "clip_ratio/high_max": 0.007948336191475391, "clip_ratio/high_mean": 0.0034911988768726587, "clip_ratio/low_mean": 0.0025931389536708593, "clip_ratio/low_min": 0.0006015037652105093, "clip_ratio/region_mean": 0.006084337830543518, "epoch": 0.0024385176437139826, "grad_norm": 0.08217406272888184, "learning_rate": 1e-06, "loss": -0.0123, "step": 223 }, { "clip_ratio/high_max": 0.011227024719119072, "clip_ratio/high_mean": 0.005833187140524387, "clip_ratio/low_mean": 0.005094918422400951, "clip_ratio/low_min": 0.0012030075304210186, "clip_ratio/region_mean": 0.010928104631602764, "epoch": 0.0024494527004122516, "grad_norm": 0.06790053099393845, "learning_rate": 1e-06, "loss": -0.0127, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1125.27685546875, "completions/mean_terminated_length": 1116.9639892578125, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.0024603877571105206, "grad_norm": 0.16540271043777466, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 5681851.0, "reward": 0.19732145965099335, "reward_std": 0.26466554403305054, "rewards/accuracy_reward/mean": 0.1071428582072258, "rewards/accuracy_reward/std": 0.31068485975265503, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417314529419, "step": 225 }, { "clip_ratio/high_max": 0.0013059376506134868, "clip_ratio/high_mean": 0.0008409543661400676, "clip_ratio/low_mean": 0.0010955993784591556, "clip_ratio/low_min": 0.0006681400118395686, "clip_ratio/region_mean": 0.001936553861014545, "epoch": 0.0024713228138087896, "grad_norm": 0.10858166962862015, "learning_rate": 1e-06, "loss": -0.0042, "step": 226 }, { "clip_ratio/high_max": 0.003134250408038497, "clip_ratio/high_mean": 0.0021434512455016375, "clip_ratio/low_mean": 0.0026210490614175797, "clip_ratio/low_min": 0.00048717117169871926, "clip_ratio/region_mean": 0.004764500539749861, "epoch": 0.0024822578705070587, "grad_norm": 0.08861404657363892, "learning_rate": 1e-06, "loss": -0.0048, "step": 227 }, { "clip_ratio/high_max": 0.006013260222971439, "clip_ratio/high_mean": 0.0036565400660037994, "clip_ratio/low_mean": 0.0059335194528102875, "clip_ratio/low_min": 0.002354660537093878, "clip_ratio/region_mean": 0.009590059518814087, "epoch": 0.0024931929272053277, "grad_norm": 0.07383939623832703, "learning_rate": 1e-06, "loss": -0.0052, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 715.0178833007812, "completions/mean_terminated_length": 715.0178833007812, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0025041279839035967, "grad_norm": 0.13921575248241425, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 5783521.0, "reward": 0.1294642984867096, "reward_std": 0.14730219542980194, "rewards/accuracy_reward/mean": 0.0357142873108387, "rewards/accuracy_reward/std": 0.18641093373298645, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 229 }, { "clip_ratio/high_max": 0.0027406271547079086, "clip_ratio/high_mean": 0.0012007539626210928, "clip_ratio/low_mean": 0.0011092581553384662, "clip_ratio/low_min": 0.00010887316602747887, "clip_ratio/region_mean": 0.002310012001544237, "epoch": 0.0025150630406018657, "grad_norm": 0.07463350147008896, "learning_rate": 1e-06, "loss": 0.0022, "step": 230 }, { "clip_ratio/high_max": 0.0050427536480128765, "clip_ratio/high_mean": 0.00206890725530684, "clip_ratio/low_mean": 0.00421347888186574, "clip_ratio/low_min": 0.0001633097417652607, "clip_ratio/region_mean": 0.006282386370003223, "epoch": 0.0025259980973001343, "grad_norm": 0.0519527904689312, "learning_rate": 1e-06, "loss": 0.002, "step": 231 }, { "clip_ratio/high_max": 0.007783380802720785, "clip_ratio/high_mean": 0.0027791333850473166, "clip_ratio/low_mean": 0.007234591990709305, "clip_ratio/low_min": 0.0004899292252957821, "clip_ratio/region_mean": 0.01001372467726469, "epoch": 0.0025369331539984033, "grad_norm": 0.04056819900870323, "learning_rate": 1e-06, "loss": 0.0019, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 566.1339721679688, "completions/mean_terminated_length": 552.7838134765625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.0025478682106966724, "grad_norm": 0.2973848879337311, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 5861936.0, "reward": 0.5803572535514832, "reward_std": 0.2527322769165039, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5021671056747437, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3106848895549774, "step": 233 }, { "clip_ratio/high_max": 0.007598480209708214, "clip_ratio/high_mean": 0.0026649790816009045, "clip_ratio/low_mean": 0.004012782126665115, "clip_ratio/low_min": 0.0003877471899613738, "clip_ratio/region_mean": 0.006677761673927307, "epoch": 0.0025588032673949414, "grad_norm": 0.21019479632377625, "learning_rate": 1e-06, "loss": 0.022, "step": 234 }, { "clip_ratio/high_max": 0.01759648136794567, "clip_ratio/high_mean": 0.006639711558818817, "clip_ratio/low_mean": 0.011608807370066643, "clip_ratio/low_min": 0.0006647094851359725, "clip_ratio/region_mean": 0.01824851892888546, "epoch": 0.0025697383240932104, "grad_norm": 0.1672622263431549, "learning_rate": 1e-06, "loss": 0.0209, "step": 235 }, { "clip_ratio/high_max": 0.0259948018938303, "clip_ratio/high_mean": 0.010433985851705074, "clip_ratio/low_mean": 0.017942562699317932, "clip_ratio/low_min": 0.001329418970271945, "clip_ratio/region_mean": 0.028376547619700432, "epoch": 0.0025806733807914794, "grad_norm": 0.14666414260864258, "learning_rate": 1e-06, "loss": 0.02, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 566.9375, "completions/mean_terminated_length": 566.9375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.0025916084374897484, "grad_norm": 0.4771106541156769, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 5939993.0, "reward": 0.3526786267757416, "reward_std": 0.33622342348098755, "rewards/accuracy_reward/mean": 0.2589285671710968, "rewards/accuracy_reward/std": 0.44001504778862, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 237 }, { "clip_ratio/high_max": 0.02380952425301075, "clip_ratio/high_mean": 0.005012080539017916, "clip_ratio/low_mean": 0.005525905638933182, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010537986643612385, "epoch": 0.0026025434941880175, "grad_norm": 0.27038612961769104, "learning_rate": 1e-06, "loss": -0.0042, "step": 238 }, { "clip_ratio/high_max": 0.030538301914930344, "clip_ratio/high_mean": 0.008291947655379772, "clip_ratio/low_mean": 0.011678864248096943, "clip_ratio/low_min": 0.00037957867607474327, "clip_ratio/region_mean": 0.019970808178186417, "epoch": 0.0026134785508862865, "grad_norm": 0.2085813730955124, "learning_rate": 1e-06, "loss": -0.0051, "step": 239 }, { "clip_ratio/high_max": 0.03933747485280037, "clip_ratio/high_mean": 0.011688821949064732, "clip_ratio/low_mean": 0.01933138072490692, "clip_ratio/low_min": 0.0008540520211681724, "clip_ratio/region_mean": 0.031020203605294228, "epoch": 0.0026244136075845555, "grad_norm": 0.16731950640678406, "learning_rate": 1e-06, "loss": -0.0059, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 906.6339721679688, "completions/mean_terminated_length": 875.2201538085938, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.002635348664282824, "grad_norm": 0.1134834885597229, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 6058784.0, "reward": 0.12142858654260635, "reward_std": 0.09321768581867218, "rewards/accuracy_reward/mean": 0.02678571455180645, "rewards/accuracy_reward/std": 0.1621822714805603, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261820137500763, "step": 241 }, { "clip_ratio/high_max": 0.0030142099130898714, "clip_ratio/high_mean": 0.0014625288313254714, "clip_ratio/low_mean": 0.0022077676840126514, "clip_ratio/low_min": 4.298302155802958e-05, "clip_ratio/region_mean": 0.003670296398922801, "epoch": 0.002646283720981093, "grad_norm": 0.07632028311491013, "learning_rate": 1e-06, "loss": 0.0082, "step": 242 }, { "clip_ratio/high_max": 0.004413387272506952, "clip_ratio/high_mean": 0.0022119893692433834, "clip_ratio/low_mean": 0.0057065836153924465, "clip_ratio/low_min": 0.0002149151114281267, "clip_ratio/region_mean": 0.00791857298463583, "epoch": 0.002657218777679362, "grad_norm": 0.054457541555166245, "learning_rate": 1e-06, "loss": 0.0081, "step": 243 }, { "clip_ratio/high_max": 0.006620081141591072, "clip_ratio/high_mean": 0.0031734767835587263, "clip_ratio/low_mean": 0.009233498014509678, "clip_ratio/low_min": 0.0005587792838923633, "clip_ratio/region_mean": 0.012406975962221622, "epoch": 0.002668153834377631, "grad_norm": 0.0486837774515152, "learning_rate": 1e-06, "loss": 0.0079, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 529.8928833007812, "completions/mean_terminated_length": 529.8928833007812, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.0026790888910759, "grad_norm": 0.2513697147369385, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 6134428.0, "reward": 0.3678571581840515, "reward_std": 0.3068114221096039, "rewards/accuracy_reward/mean": 0.2857142984867096, "rewards/accuracy_reward/std": 0.453784316778183, "rewards/format_reward/mean": 0.8214285969734192, "rewards/format_reward/std": 0.38471439480781555, "step": 245 }, { "clip_ratio/high_max": 0.005941286217421293, "clip_ratio/high_mean": 0.002256399253383279, "clip_ratio/low_mean": 0.0022819708101451397, "clip_ratio/low_min": 0.00012017786502838135, "clip_ratio/region_mean": 0.004538369830697775, "epoch": 0.002690023947774169, "grad_norm": 0.16129940748214722, "learning_rate": 1e-06, "loss": 0.0026, "step": 246 }, { "clip_ratio/high_max": 0.01607642136514187, "clip_ratio/high_mean": 0.005683543626219034, "clip_ratio/low_mean": 0.006805977784097195, "clip_ratio/low_min": 0.00012017786502838135, "clip_ratio/region_mean": 0.012489521875977516, "epoch": 0.0027009590044724382, "grad_norm": 0.12208259105682373, "learning_rate": 1e-06, "loss": 0.002, "step": 247 }, { "clip_ratio/high_max": 0.02283317781984806, "clip_ratio/high_mean": 0.00805062148720026, "clip_ratio/low_mean": 0.011485996656119823, "clip_ratio/low_min": 0.0004807114601135254, "clip_ratio/region_mean": 0.019536618143320084, "epoch": 0.0027118940611707072, "grad_norm": 0.09665606170892715, "learning_rate": 1e-06, "loss": 0.0015, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 588.0089721679688, "completions/mean_terminated_length": 588.0089721679688, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.0027228291178689763, "grad_norm": 0.2245621681213379, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 6213585.0, "reward": 0.5687500834465027, "reward_std": 0.33204683661460876, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.5015259981155396, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 249 }, { "clip_ratio/high_max": 0.0087819155305624, "clip_ratio/high_mean": 0.0026462089736014605, "clip_ratio/low_mean": 0.0030686098616570234, "clip_ratio/low_min": 0.0005094243679195642, "clip_ratio/region_mean": 0.005714819300919771, "epoch": 0.0027337641745672453, "grad_norm": 0.1514621376991272, "learning_rate": 1e-06, "loss": -0.0005, "step": 250 }, { "clip_ratio/high_max": 0.019027484580874443, "clip_ratio/high_mean": 0.006051361095160246, "clip_ratio/low_mean": 0.00880864355713129, "clip_ratio/low_min": 0.002016128972172737, "clip_ratio/region_mean": 0.014860004186630249, "epoch": 0.0027446992312655143, "grad_norm": 0.10354339331388474, "learning_rate": 1e-06, "loss": -0.0011, "step": 251 }, { "clip_ratio/high_max": 0.025857863947749138, "clip_ratio/high_mean": 0.008918759413063526, "clip_ratio/low_mean": 0.014840983785688877, "clip_ratio/low_min": 0.0029569892212748528, "clip_ratio/region_mean": 0.023759743198752403, "epoch": 0.002755634287963783, "grad_norm": 0.08486434072256088, "learning_rate": 1e-06, "loss": -0.0015, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 921.7232666015625, "completions/mean_terminated_length": 869.0934448242188, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.002766569344662052, "grad_norm": 0.09538955241441727, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 6334878.0, "reward": 0.34196433424949646, "reward_std": 0.11236974596977234, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4349588453769684, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.27306708693504333, "step": 253 }, { "clip_ratio/high_max": 0.002749919192865491, "clip_ratio/high_mean": 0.0015176227316260338, "clip_ratio/low_mean": 0.0006134035065770149, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021310262382030487, "epoch": 0.002777504401360321, "grad_norm": 0.0635828971862793, "learning_rate": 1e-06, "loss": -0.0043, "step": 254 }, { "clip_ratio/high_max": 0.004898715764284134, "clip_ratio/high_mean": 0.0025005489587783813, "clip_ratio/low_mean": 0.0020453983452171087, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004545947071164846, "epoch": 0.00278843945805859, "grad_norm": 0.03769033029675484, "learning_rate": 1e-06, "loss": -0.0045, "step": 255 }, { "clip_ratio/high_max": 0.007926237769424915, "clip_ratio/high_mean": 0.0035813129507005215, "clip_ratio/low_mean": 0.004260806832462549, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007842118851840496, "epoch": 0.002799374514756859, "grad_norm": 0.026403669267892838, "learning_rate": 1e-06, "loss": -0.0045, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1453.0, "completions/max_terminated_length": 1453.0, "completions/mean_length": 572.2232666015625, "completions/mean_terminated_length": 572.2232666015625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.002810309571455128, "grad_norm": 0.24165667593479156, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 6416803.0, "reward": 0.4303572177886963, "reward_std": 0.2141229510307312, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.47854551672935486, "rewards/format_reward/mean": 0.8214285969734192, "rewards/format_reward/std": 0.38471439480781555, "step": 257 }, { "clip_ratio/high_max": 0.007240795064717531, "clip_ratio/high_mean": 0.00308774271979928, "clip_ratio/low_mean": 0.0029762533959001303, "clip_ratio/low_min": 0.0003225806576665491, "clip_ratio/region_mean": 0.0060639954172074795, "epoch": 0.002821244628153397, "grad_norm": 0.14373843371868134, "learning_rate": 1e-06, "loss": 0.0074, "step": 258 }, { "clip_ratio/high_max": 0.01848713681101799, "clip_ratio/high_mean": 0.007508347276598215, "clip_ratio/low_mean": 0.0069971545599401, "clip_ratio/low_min": 0.000516129017341882, "clip_ratio/region_mean": 0.01450550090521574, "epoch": 0.002832179684851666, "grad_norm": 0.12072297930717468, "learning_rate": 1e-06, "loss": 0.0067, "step": 259 }, { "clip_ratio/high_max": 0.02480357326567173, "clip_ratio/high_mean": 0.010040606372058392, "clip_ratio/low_mean": 0.00994468666613102, "clip_ratio/low_min": 0.001677419408224523, "clip_ratio/region_mean": 0.019985292106866837, "epoch": 0.002843114741549935, "grad_norm": 0.09051058441400528, "learning_rate": 1e-06, "loss": 0.0062, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 491.2857360839844, "completions/mean_terminated_length": 462.9818115234375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.002854049798248204, "grad_norm": 0.38378509879112244, "learning_rate": 1e-06, "loss": 0.0932, "num_tokens": 6491875.0, "reward": 0.7776786684989929, "reward_std": 0.37279990315437317, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4655956029891968, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417314529419, "step": 261 }, { "clip_ratio/high_max": 0.009225091896951199, "clip_ratio/high_mean": 0.0038565807044506073, "clip_ratio/low_mean": 0.0049975779838860035, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008854158222675323, "epoch": 0.002864984854946473, "grad_norm": 0.26227521896362305, "learning_rate": 1e-06, "loss": 0.0917, "step": 262 }, { "clip_ratio/high_max": 0.021133847534656525, "clip_ratio/high_mean": 0.008968709036707878, "clip_ratio/low_mean": 0.014030814170837402, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02299952320754528, "epoch": 0.0028759199116447417, "grad_norm": 0.20786207914352417, "learning_rate": 1e-06, "loss": 0.0902, "step": 263 }, { "clip_ratio/high_max": 0.027507547289133072, "clip_ratio/high_mean": 0.014015513472259045, "clip_ratio/low_mean": 0.022627850994467735, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.036643367260694504, "epoch": 0.0028868549683430107, "grad_norm": 0.18969295918941498, "learning_rate": 1e-06, "loss": 0.089, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 988.0982666015625, "completions/mean_terminated_length": 968.8272705078125, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.0028977900250412797, "grad_norm": 0.15900887548923492, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 6625390.0, "reward": 0.2026785910129547, "reward_std": 0.24290505051612854, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33220529556274414, "rewards/format_reward/mean": 0.7767857313156128, "rewards/format_reward/std": 0.41827234625816345, "step": 265 }, { "clip_ratio/high_max": 0.002889129566028714, "clip_ratio/high_mean": 0.0013029256369918585, "clip_ratio/low_mean": 0.0012380414409562945, "clip_ratio/low_min": 0.00012649421114474535, "clip_ratio/region_mean": 0.002540966961532831, "epoch": 0.0029087250817395487, "grad_norm": 0.12886948883533478, "learning_rate": 1e-06, "loss": -0.0108, "step": 266 }, { "clip_ratio/high_max": 0.0092692906036973, "clip_ratio/high_mean": 0.003346310695633292, "clip_ratio/low_mean": 0.0036016462836414576, "clip_ratio/low_min": 0.0004427297390066087, "clip_ratio/region_mean": 0.00694795697927475, "epoch": 0.0029196601384378178, "grad_norm": 0.09821335971355438, "learning_rate": 1e-06, "loss": -0.0114, "step": 267 }, { "clip_ratio/high_max": 0.015288311056792736, "clip_ratio/high_mean": 0.005462431348860264, "clip_ratio/low_mean": 0.0073490082286298275, "clip_ratio/low_min": 0.0013914363225921988, "clip_ratio/region_mean": 0.012811440043151379, "epoch": 0.002930595195136087, "grad_norm": 0.07706645876169205, "learning_rate": 1e-06, "loss": -0.0118, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 744.0714721679688, "completions/mean_terminated_length": 732.3243408203125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.002941530251834356, "grad_norm": 0.11816884577274323, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 6726674.0, "reward": 0.4062500596046448, "reward_std": 0.17408105731010437, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4655956029891968, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 269 }, { "clip_ratio/high_max": 0.002008723560720682, "clip_ratio/high_mean": 0.0011128870537504554, "clip_ratio/low_mean": 0.0013425281504169106, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024554149713367224, "epoch": 0.002952465308532625, "grad_norm": 0.08621466904878616, "learning_rate": 1e-06, "loss": 0.0107, "step": 270 }, { "clip_ratio/high_max": 0.004476584028452635, "clip_ratio/high_mean": 0.002227130113169551, "clip_ratio/low_mean": 0.00378287467174232, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006010004784911871, "epoch": 0.002963400365230894, "grad_norm": 0.06582856923341751, "learning_rate": 1e-06, "loss": 0.0104, "step": 271 }, { "clip_ratio/high_max": 0.006829660385847092, "clip_ratio/high_mean": 0.00372076197527349, "clip_ratio/low_mean": 0.007433601655066013, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011154362931847572, "epoch": 0.002974335421929163, "grad_norm": 0.05151357501745224, "learning_rate": 1e-06, "loss": 0.0102, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 897.2500610351562, "completions/mean_terminated_length": 796.6990356445312, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.002985270478627432, "grad_norm": 0.3304896056652069, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 6844890.0, "reward": 0.44732150435447693, "reward_std": 0.2696543335914612, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.48131096363067627, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417314529419, "step": 273 }, { "clip_ratio/high_max": 0.004130679648369551, "clip_ratio/high_mean": 0.0013325463514775038, "clip_ratio/low_mean": 0.0018896734109148383, "clip_ratio/low_min": 0.0006094257696531713, "clip_ratio/region_mean": 0.003222219878807664, "epoch": 0.0029962055353257005, "grad_norm": 0.14581957459449768, "learning_rate": 1e-06, "loss": 0.0164, "step": 274 }, { "clip_ratio/high_max": 0.010889973491430283, "clip_ratio/high_mean": 0.004137958865612745, "clip_ratio/low_mean": 0.005612813867628574, "clip_ratio/low_min": 0.0020079200621694326, "clip_ratio/region_mean": 0.009750773198902607, "epoch": 0.0030071405920239695, "grad_norm": 0.11051084101200104, "learning_rate": 1e-06, "loss": 0.0158, "step": 275 }, { "clip_ratio/high_max": 0.015396169386804104, "clip_ratio/high_mean": 0.006238116882741451, "clip_ratio/low_mean": 0.009393145330250263, "clip_ratio/low_min": 0.0037023324985057116, "clip_ratio/region_mean": 0.015631262212991714, "epoch": 0.0030180756487222385, "grad_norm": 0.08370156586170197, "learning_rate": 1e-06, "loss": 0.0154, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 610.5625, "completions/mean_terminated_length": 610.5625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.0030290107054205075, "grad_norm": 0.13622477650642395, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 6934749.0, "reward": 0.3633929193019867, "reward_std": 0.16640949249267578, "rewards/accuracy_reward/mean": 0.2678571343421936, "rewards/accuracy_reward/std": 0.44483304023742676, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466198682785, "step": 277 }, { "clip_ratio/high_max": 0.0045982045121490955, "clip_ratio/high_mean": 0.0018848020117729902, "clip_ratio/low_mean": 0.001907173776999116, "clip_ratio/low_min": 0.0002625705674290657, "clip_ratio/region_mean": 0.0037919755559414625, "epoch": 0.0030399457621187766, "grad_norm": 0.07810451835393906, "learning_rate": 1e-06, "loss": 0.0145, "step": 278 }, { "clip_ratio/high_max": 0.011823954060673714, "clip_ratio/high_mean": 0.004027615301311016, "clip_ratio/low_mean": 0.003961509093642235, "clip_ratio/low_min": 0.0005251411348581314, "clip_ratio/region_mean": 0.007989124394953251, "epoch": 0.0030508808188170456, "grad_norm": 0.05489056557416916, "learning_rate": 1e-06, "loss": 0.0143, "step": 279 }, { "clip_ratio/high_max": 0.016422158107161522, "clip_ratio/high_mean": 0.005998618435114622, "clip_ratio/low_mean": 0.006866735406219959, "clip_ratio/low_min": 0.0007877117022871971, "clip_ratio/region_mean": 0.012865354306995869, "epoch": 0.0030618158755153146, "grad_norm": 0.04439927637577057, "learning_rate": 1e-06, "loss": 0.0141, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 806.2678833007812, "completions/mean_terminated_length": 783.6908569335938, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.0030727509322135836, "grad_norm": 0.23866648972034454, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 7041123.0, "reward": 0.2678571939468384, "reward_std": 0.29807010293006897, "rewards/accuracy_reward/mean": 0.1785714328289032, "rewards/accuracy_reward/std": 0.38471439480781555, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3106848895549774, "step": 281 }, { "clip_ratio/high_max": 0.0042639970779418945, "clip_ratio/high_mean": 0.0020146185997873545, "clip_ratio/low_mean": 0.0031496421433985233, "clip_ratio/low_min": 0.0001492871524533257, "clip_ratio/region_mean": 0.0051642609760165215, "epoch": 0.0030836859889118527, "grad_norm": 0.1313341110944748, "learning_rate": 1e-06, "loss": 0.0196, "step": 282 }, { "clip_ratio/high_max": 0.007839100435376167, "clip_ratio/high_mean": 0.0036670886911451817, "clip_ratio/low_mean": 0.007363352458924055, "clip_ratio/low_min": 0.00022393073595594615, "clip_ratio/region_mean": 0.011030441150069237, "epoch": 0.0030946210456101217, "grad_norm": 0.10096918791532516, "learning_rate": 1e-06, "loss": 0.0192, "step": 283 }, { "clip_ratio/high_max": 0.011560693383216858, "clip_ratio/high_mean": 0.0052343918941915035, "clip_ratio/low_mean": 0.011715439148247242, "clip_ratio/low_min": 0.0004478614719118923, "clip_ratio/region_mean": 0.016949830576777458, "epoch": 0.0031055561023083903, "grad_norm": 0.09769563376903534, "learning_rate": 1e-06, "loss": 0.0189, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 501.77679443359375, "completions/mean_terminated_length": 459.22015380859375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.0031164911590066593, "grad_norm": 0.43638771772384644, "learning_rate": 1e-06, "loss": -0.0171, "num_tokens": 7113058.0, "reward": 0.31964290142059326, "reward_std": 0.26764950156211853, "rewards/accuracy_reward/mean": 0.2232142835855484, "rewards/accuracy_reward/std": 0.41827234625816345, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641091883182526, "step": 285 }, { "clip_ratio/high_max": 0.017648817971348763, "clip_ratio/high_mean": 0.005698407534509897, "clip_ratio/low_mean": 0.003849025582894683, "clip_ratio/low_min": 0.0003296848153695464, "clip_ratio/region_mean": 0.009547432884573936, "epoch": 0.0031274262157049283, "grad_norm": 0.23280829191207886, "learning_rate": 1e-06, "loss": -0.0183, "step": 286 }, { "clip_ratio/high_max": 0.02752019092440605, "clip_ratio/high_mean": 0.00994950532913208, "clip_ratio/low_mean": 0.010854714550077915, "clip_ratio/low_min": 0.0011868653818964958, "clip_ratio/region_mean": 0.02080422081053257, "epoch": 0.0031383612724031973, "grad_norm": 0.15659715235233307, "learning_rate": 1e-06, "loss": -0.0192, "step": 287 }, { "clip_ratio/high_max": 0.030212383717298508, "clip_ratio/high_mean": 0.012610738165676594, "clip_ratio/low_mean": 0.019246384501457214, "clip_ratio/low_min": 0.001714361016638577, "clip_ratio/region_mean": 0.031857121735811234, "epoch": 0.0031492963291014663, "grad_norm": 0.11864504963159561, "learning_rate": 1e-06, "loss": -0.0197, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 436.26788330078125, "completions/mean_terminated_length": 436.26788330078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.0031602313857997354, "grad_norm": 0.3862595856189728, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 7180680.0, "reward": 0.266964316368103, "reward_std": 0.2832203507423401, "rewards/accuracy_reward/mean": 0.1696428507566452, "rewards/accuracy_reward/std": 0.37700554728507996, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 289 }, { "clip_ratio/high_max": 0.01319305319339037, "clip_ratio/high_mean": 0.004581835120916367, "clip_ratio/low_mean": 0.006062577944248915, "clip_ratio/low_min": 0.0003207869885955006, "clip_ratio/region_mean": 0.010644412599503994, "epoch": 0.0031711664424980044, "grad_norm": 0.2569689154624939, "learning_rate": 1e-06, "loss": 0.0077, "step": 290 }, { "clip_ratio/high_max": 0.029893118888139725, "clip_ratio/high_mean": 0.00881954375654459, "clip_ratio/low_mean": 0.014989323914051056, "clip_ratio/low_min": 0.0005346449906937778, "clip_ratio/region_mean": 0.02380886860191822, "epoch": 0.0031821014991962734, "grad_norm": 0.19886180758476257, "learning_rate": 1e-06, "loss": 0.0065, "step": 291 }, { "clip_ratio/high_max": 0.04358717426657677, "clip_ratio/high_mean": 0.012233642861247063, "clip_ratio/low_mean": 0.02316555753350258, "clip_ratio/low_min": 0.0013900769408792257, "clip_ratio/region_mean": 0.03539920225739479, "epoch": 0.0031930365558945424, "grad_norm": 0.1712842583656311, "learning_rate": 1e-06, "loss": 0.0056, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1045.544677734375, "completions/mean_terminated_length": 1045.544677734375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.0032039716125928114, "grad_norm": 0.20903965830802917, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 7316485.0, "reward": 0.11607144773006439, "reward_std": 0.1717204451560974, "rewards/accuracy_reward/mean": 0.0446428582072258, "rewards/accuracy_reward/std": 0.2074466347694397, "rewards/format_reward/mean": 0.7142857313156128, "rewards/format_reward/std": 0.4537842869758606, "step": 293 }, { "clip_ratio/high_max": 0.0018976052524521947, "clip_ratio/high_mean": 0.0014065094292163849, "clip_ratio/low_mean": 0.0026197272818535566, "clip_ratio/low_min": 0.00041747314389795065, "clip_ratio/region_mean": 0.004026236943900585, "epoch": 0.0032149066692910805, "grad_norm": 0.08214298635721207, "learning_rate": 1e-06, "loss": 0.0076, "step": 294 }, { "clip_ratio/high_max": 0.0034948522225022316, "clip_ratio/high_mean": 0.0023484504781663418, "clip_ratio/low_mean": 0.00449950760230422, "clip_ratio/low_min": 0.0006072336691431701, "clip_ratio/region_mean": 0.006847958546131849, "epoch": 0.003225841725989349, "grad_norm": 0.062435880303382874, "learning_rate": 1e-06, "loss": 0.0074, "step": 295 }, { "clip_ratio/high_max": 0.0060897283256053925, "clip_ratio/high_mean": 0.0038045577239245176, "clip_ratio/low_mean": 0.008064300753176212, "clip_ratio/low_min": 0.0009108505328185856, "clip_ratio/region_mean": 0.011868857778608799, "epoch": 0.003236776782687618, "grad_norm": 0.047082431614398956, "learning_rate": 1e-06, "loss": 0.0072, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 971.1785888671875, "completions/mean_terminated_length": 865.6078491210938, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.003247711839385887, "grad_norm": 0.2060617059469223, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 7442449.0, "reward": 0.27321431040763855, "reward_std": 0.18400833010673523, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3920665979385376, "rewards/format_reward/mean": 0.8571428656578064, "rewards/format_reward/std": 0.3514997959136963, "step": 297 }, { "clip_ratio/high_max": 0.015676181763410568, "clip_ratio/high_mean": 0.0033958647400140762, "clip_ratio/low_mean": 0.0027403549756854773, "clip_ratio/low_min": 0.0002188375365221873, "clip_ratio/region_mean": 0.006136219948530197, "epoch": 0.003258646896084156, "grad_norm": 0.1077965721487999, "learning_rate": 1e-06, "loss": -0.0023, "step": 298 }, { "clip_ratio/high_max": 0.023397285491228104, "clip_ratio/high_mean": 0.00535667734220624, "clip_ratio/low_mean": 0.004519244190305471, "clip_ratio/low_min": 0.00025852039107121527, "clip_ratio/region_mean": 0.009875921532511711, "epoch": 0.003269581952782425, "grad_norm": 0.08649815618991852, "learning_rate": 1e-06, "loss": -0.0024, "step": 299 }, { "clip_ratio/high_max": 0.028310716152191162, "clip_ratio/high_mean": 0.00676260981708765, "clip_ratio/low_mean": 0.006821298506110907, "clip_ratio/low_min": 0.0005689776153303683, "clip_ratio/region_mean": 0.013583908788859844, "epoch": 0.003280517009480694, "grad_norm": 0.07212438434362411, "learning_rate": 1e-06, "loss": -0.0026, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 803.3482666015625, "completions/mean_terminated_length": 732.896240234375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.003291452066178963, "grad_norm": 0.22173619270324707, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 7549784.0, "reward": 0.5366072058677673, "reward_std": 0.22768345475196838, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49935612082481384, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417016506195, "step": 301 }, { "clip_ratio/high_max": 0.010009442456066608, "clip_ratio/high_mean": 0.00449835741892457, "clip_ratio/low_mean": 0.002231012564152479, "clip_ratio/low_min": 0.00040463986806571484, "clip_ratio/region_mean": 0.006729369517415762, "epoch": 0.003302387122877232, "grad_norm": 0.10460333526134491, "learning_rate": 1e-06, "loss": -0.0007, "step": 302 }, { "clip_ratio/high_max": 0.022096317261457443, "clip_ratio/high_mean": 0.008688386529684067, "clip_ratio/low_mean": 0.0042308298870921135, "clip_ratio/low_min": 0.0005844798288308084, "clip_ratio/region_mean": 0.012919217348098755, "epoch": 0.0033133221795755012, "grad_norm": 0.08764772862195969, "learning_rate": 1e-06, "loss": -0.0009, "step": 303 }, { "clip_ratio/high_max": 0.024740321561694145, "clip_ratio/high_mean": 0.010482413694262505, "clip_ratio/low_mean": 0.005692082457244396, "clip_ratio/low_min": 0.0011239995947107673, "clip_ratio/region_mean": 0.016174497082829475, "epoch": 0.0033242572362737702, "grad_norm": 0.07056335359811783, "learning_rate": 1e-06, "loss": -0.001, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 753.419677734375, "completions/mean_terminated_length": 741.7567749023438, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.0033351922929720393, "grad_norm": 0.15208350121974945, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 7650519.0, "reward": 0.15535716712474823, "reward_std": 0.2024206519126892, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24314938485622406, "rewards/format_reward/mean": 0.9285714030265808, "rewards/format_reward/std": 0.25869685411453247, "step": 305 }, { "clip_ratio/high_max": 0.002639862010255456, "clip_ratio/high_mean": 0.00137334072496742, "clip_ratio/low_mean": 0.00230352277867496, "clip_ratio/low_min": 0.00039506173925474286, "clip_ratio/region_mean": 0.003676863620057702, "epoch": 0.003346127349670308, "grad_norm": 0.09281422197818756, "learning_rate": 1e-06, "loss": 0.0151, "step": 306 }, { "clip_ratio/high_max": 0.004365925677120686, "clip_ratio/high_mean": 0.0024857826065272093, "clip_ratio/low_mean": 0.006271806079894304, "clip_ratio/low_min": 0.001086419797502458, "clip_ratio/region_mean": 0.008757589384913445, "epoch": 0.003357062406368577, "grad_norm": 0.06448769569396973, "learning_rate": 1e-06, "loss": 0.0149, "step": 307 }, { "clip_ratio/high_max": 0.005888922605663538, "clip_ratio/high_mean": 0.003234049305319786, "clip_ratio/low_mean": 0.010413828305900097, "clip_ratio/low_min": 0.0027654320001602173, "clip_ratio/region_mean": 0.013647877611219883, "epoch": 0.003367997463066846, "grad_norm": 0.05444297194480896, "learning_rate": 1e-06, "loss": 0.0146, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1099.4375, "completions/mean_terminated_length": 1026.47119140625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.003378932519765115, "grad_norm": 0.24573777616024017, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 7789728.0, "reward": 0.266964316368103, "reward_std": 0.2906479239463806, "rewards/accuracy_reward/mean": 0.1785714328289032, "rewards/accuracy_reward/std": 0.38471436500549316, "rewards/format_reward/mean": 0.8839285969734192, "rewards/format_reward/std": 0.3217501640319824, "step": 309 }, { "clip_ratio/high_max": 0.014035088010132313, "clip_ratio/high_mean": 0.002690879162400961, "clip_ratio/low_mean": 0.0029840078204870224, "clip_ratio/low_min": 0.0011169948847964406, "clip_ratio/region_mean": 0.005674886982887983, "epoch": 0.003389867576463384, "grad_norm": 0.1541258543729782, "learning_rate": 1e-06, "loss": -0.0082, "step": 310 }, { "clip_ratio/high_max": 0.031578946858644485, "clip_ratio/high_mean": 0.0060642859898507595, "clip_ratio/low_mean": 0.006956311408430338, "clip_ratio/low_min": 0.0023939243983477354, "clip_ratio/region_mean": 0.013020597398281097, "epoch": 0.003400802633161653, "grad_norm": 0.11576902121305466, "learning_rate": 1e-06, "loss": -0.0089, "step": 311 }, { "clip_ratio/high_max": 0.04477192834019661, "clip_ratio/high_mean": 0.008793550543487072, "clip_ratio/low_mean": 0.011074639856815338, "clip_ratio/low_min": 0.003754634642973542, "clip_ratio/region_mean": 0.019868191331624985, "epoch": 0.003411737689859922, "grad_norm": 0.08700046688318253, "learning_rate": 1e-06, "loss": -0.0094, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 654.7678833007812, "completions/mean_terminated_length": 642.2162475585938, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.003422672746558191, "grad_norm": 0.22923102974891663, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 7883726.0, "reward": 0.47321438789367676, "reward_std": 0.27362167835235596, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4862987697124481, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 313 }, { "clip_ratio/high_max": 0.00467904657125473, "clip_ratio/high_mean": 0.0019069899572059512, "clip_ratio/low_mean": 0.0042481934651732445, "clip_ratio/low_min": 0.0013746431795880198, "clip_ratio/region_mean": 0.00615518307313323, "epoch": 0.00343360780325646, "grad_norm": 0.1461719274520874, "learning_rate": 1e-06, "loss": 0.0218, "step": 314 }, { "clip_ratio/high_max": 0.008491807617247105, "clip_ratio/high_mean": 0.0034959283657372, "clip_ratio/low_mean": 0.011023125611245632, "clip_ratio/low_min": 0.003712164470925927, "clip_ratio/region_mean": 0.014519053511321545, "epoch": 0.003444542859954729, "grad_norm": 0.10856746882200241, "learning_rate": 1e-06, "loss": 0.0212, "step": 315 }, { "clip_ratio/high_max": 0.011960292235016823, "clip_ratio/high_mean": 0.004535282496362925, "clip_ratio/low_mean": 0.016677159816026688, "clip_ratio/low_min": 0.003426613286137581, "clip_ratio/region_mean": 0.021212443709373474, "epoch": 0.0034554779166529976, "grad_norm": 0.12202321738004684, "learning_rate": 1e-06, "loss": 0.0208, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 391.8660888671875, "completions/mean_terminated_length": 391.8660888671875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.0034664129733512667, "grad_norm": 0.35915660858154297, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 7944107.0, "reward": 0.8142858743667603, "reward_std": 0.22645136713981628, "rewards/accuracy_reward/mean": 0.7142857313156128, "rewards/accuracy_reward/std": 0.4537842869758606, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.015066724270582199, "clip_ratio/high_mean": 0.003953267820179462, "clip_ratio/low_mean": 0.006657526828348637, "clip_ratio/low_min": 0.0007833921117708087, "clip_ratio/region_mean": 0.010610794648528099, "epoch": 0.0034773480300495357, "grad_norm": 0.2277407944202423, "learning_rate": 1e-06, "loss": 0.0037, "step": 318 }, { "clip_ratio/high_max": 0.028698522597551346, "clip_ratio/high_mean": 0.007551479618996382, "clip_ratio/low_mean": 0.013987978920340538, "clip_ratio/low_min": 0.002546024275943637, "clip_ratio/region_mean": 0.021539459004998207, "epoch": 0.0034882830867478047, "grad_norm": 0.16547347605228424, "learning_rate": 1e-06, "loss": 0.003, "step": 319 }, { "clip_ratio/high_max": 0.033864255994558334, "clip_ratio/high_mean": 0.009223735891282558, "clip_ratio/low_mean": 0.01883832924067974, "clip_ratio/low_min": 0.002935420721769333, "clip_ratio/region_mean": 0.028062064200639725, "epoch": 0.0034992181434460737, "grad_norm": 0.18520542979240417, "learning_rate": 1e-06, "loss": 0.0026, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 695.0357666015625, "completions/mean_terminated_length": 682.8468627929688, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.0035101532001443427, "grad_norm": 0.4054780602455139, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 8039703.0, "reward": 0.5633929371833801, "reward_std": 0.36581680178642273, "rewards/accuracy_reward/mean": 0.4642857015132904, "rewards/accuracy_reward/std": 0.5009642839431763, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 321 }, { "clip_ratio/high_max": 0.010324179194867611, "clip_ratio/high_mean": 0.0037383954040706158, "clip_ratio/low_mean": 0.005423116963356733, "clip_ratio/low_min": 0.0017259978922083974, "clip_ratio/region_mean": 0.009161512367427349, "epoch": 0.0035210882568426118, "grad_norm": 0.2759929299354553, "learning_rate": 1e-06, "loss": 0.0252, "step": 322 }, { "clip_ratio/high_max": 0.022919677197933197, "clip_ratio/high_mean": 0.00846562348306179, "clip_ratio/low_mean": 0.012884951196610928, "clip_ratio/low_min": 0.002358743455260992, "clip_ratio/region_mean": 0.021350575610995293, "epoch": 0.0035320233135408808, "grad_norm": 0.23281559348106384, "learning_rate": 1e-06, "loss": 0.024, "step": 323 }, { "clip_ratio/high_max": 0.029527151957154274, "clip_ratio/high_mean": 0.012027055025100708, "clip_ratio/low_mean": 0.018872082233428955, "clip_ratio/low_min": 0.002787605859339237, "clip_ratio/region_mean": 0.030899137258529663, "epoch": 0.00354295837023915, "grad_norm": 0.17648355662822723, "learning_rate": 1e-06, "loss": 0.023, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 608.1964721679688, "completions/mean_terminated_length": 608.1964721679688, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.003553893426937419, "grad_norm": 0.4258395731449127, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 8123809.0, "reward": 0.7312501072883606, "reward_std": 0.2230919897556305, "rewards/accuracy_reward/mean": 0.6339285969734192, "rewards/accuracy_reward/std": 0.483894407749176, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 325 }, { "clip_ratio/high_max": 0.00993142556399107, "clip_ratio/high_mean": 0.003542464692145586, "clip_ratio/low_mean": 0.0033671760465949774, "clip_ratio/low_min": 5.937889727647416e-05, "clip_ratio/region_mean": 0.006909640971571207, "epoch": 0.003564828483635688, "grad_norm": 0.16457095742225647, "learning_rate": 1e-06, "loss": 0.009, "step": 326 }, { "clip_ratio/high_max": 0.014897138811647892, "clip_ratio/high_mean": 0.006037508603185415, "clip_ratio/low_mean": 0.006827770732343197, "clip_ratio/low_min": 0.0004156522627454251, "clip_ratio/region_mean": 0.012865280732512474, "epoch": 0.0035757635403339564, "grad_norm": 0.12378884851932526, "learning_rate": 1e-06, "loss": 0.0084, "step": 327 }, { "clip_ratio/high_max": 0.02317332662642002, "clip_ratio/high_mean": 0.00939054973423481, "clip_ratio/low_mean": 0.00998037401586771, "clip_ratio/low_min": 0.0007125467527657747, "clip_ratio/region_mean": 0.019370922818779945, "epoch": 0.0035866985970322255, "grad_norm": 0.11571387201547623, "learning_rate": 1e-06, "loss": 0.0081, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 698.794677734375, "completions/mean_terminated_length": 698.794677734375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.0035976336537304945, "grad_norm": 0.1957181692123413, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 8217810.0, "reward": 0.6330358386039734, "reward_std": 0.21837164461612701, "rewards/accuracy_reward/mean": 0.5357142686843872, "rewards/accuracy_reward/std": 0.5009642839431763, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 329 }, { "clip_ratio/high_max": 0.006606691051274538, "clip_ratio/high_mean": 0.0022590849548578262, "clip_ratio/low_mean": 0.0026920048985630274, "clip_ratio/low_min": 0.0006764592253603041, "clip_ratio/region_mean": 0.00495108962059021, "epoch": 0.0036085687104287635, "grad_norm": 0.10477279126644135, "learning_rate": 1e-06, "loss": 0.0004, "step": 330 }, { "clip_ratio/high_max": 0.010120888240635395, "clip_ratio/high_mean": 0.003814263502135873, "clip_ratio/low_mean": 0.006123620551079512, "clip_ratio/low_min": 0.0012562813935801387, "clip_ratio/region_mean": 0.009937883354723454, "epoch": 0.0036195037671270325, "grad_norm": 0.07381904125213623, "learning_rate": 1e-06, "loss": 0.0002, "step": 331 }, { "clip_ratio/high_max": 0.011245431378483772, "clip_ratio/high_mean": 0.004804347176104784, "clip_ratio/low_mean": 0.009619680233299732, "clip_ratio/low_min": 0.0020293777342885733, "clip_ratio/region_mean": 0.014424026943743229, "epoch": 0.0036304388238253015, "grad_norm": 0.06240319088101387, "learning_rate": 1e-06, "loss": -0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 503.0625305175781, "completions/mean_terminated_length": 503.0625305175781, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.0036413738805235706, "grad_norm": 0.3151375651359558, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 8287089.0, "reward": 0.5848214626312256, "reward_std": 0.360604852437973, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5021671056747437, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 333 }, { "clip_ratio/high_max": 0.01691143773496151, "clip_ratio/high_mean": 0.00525979558005929, "clip_ratio/low_mean": 0.0041300226002931595, "clip_ratio/low_min": 0.001518314704298973, "clip_ratio/region_mean": 0.009389817714691162, "epoch": 0.0036523089372218396, "grad_norm": 0.2064860612154007, "learning_rate": 1e-06, "loss": 0.0098, "step": 334 }, { "clip_ratio/high_max": 0.025589674711227417, "clip_ratio/high_mean": 0.008748997934162617, "clip_ratio/low_mean": 0.008261704817414284, "clip_ratio/low_min": 0.0027985074557363987, "clip_ratio/region_mean": 0.017010701820254326, "epoch": 0.0036632439939201086, "grad_norm": 0.1607491821050644, "learning_rate": 1e-06, "loss": 0.0092, "step": 335 }, { "clip_ratio/high_max": 0.031152648851275444, "clip_ratio/high_mean": 0.010734586976468563, "clip_ratio/low_mean": 0.010630601085722446, "clip_ratio/low_min": 0.0031094527803361416, "clip_ratio/region_mean": 0.02136518992483616, "epoch": 0.0036741790506183776, "grad_norm": 0.15955552458763123, "learning_rate": 1e-06, "loss": 0.0089, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 530.8035888671875, "completions/mean_terminated_length": 530.8035888671875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.0036851141073166466, "grad_norm": 0.4057229161262512, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 8362807.0, "reward": 0.3053571879863739, "reward_std": 0.33450374007225037, "rewards/accuracy_reward/mean": 0.2053571492433548, "rewards/accuracy_reward/std": 0.4057779312133789, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.009907997213304043, "clip_ratio/high_mean": 0.002758780959993601, "clip_ratio/low_mean": 0.00509972358122468, "clip_ratio/low_min": 0.0019062142819166183, "clip_ratio/region_mean": 0.00785850454121828, "epoch": 0.0036960491640149152, "grad_norm": 0.24556492269039154, "learning_rate": 1e-06, "loss": 0.0173, "step": 338 }, { "clip_ratio/high_max": 0.023826373741030693, "clip_ratio/high_mean": 0.006594088859856129, "clip_ratio/low_mean": 0.012152090668678284, "clip_ratio/low_min": 0.003335874993354082, "clip_ratio/region_mean": 0.018746180459856987, "epoch": 0.0037069842207131842, "grad_norm": 0.17308728396892548, "learning_rate": 1e-06, "loss": 0.0164, "step": 339 }, { "clip_ratio/high_max": 0.03231894224882126, "clip_ratio/high_mean": 0.008899944834411144, "clip_ratio/low_mean": 0.019190888851881027, "clip_ratio/low_min": 0.0035264964681118727, "clip_ratio/region_mean": 0.028090834617614746, "epoch": 0.0037179192774114533, "grad_norm": 0.14503507316112518, "learning_rate": 1e-06, "loss": 0.0157, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 578.3928833007812, "completions/mean_terminated_length": 578.3928833007812, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.0037288543341097223, "grad_norm": 0.37361910939216614, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 8443995.0, "reward": 0.40446433424949646, "reward_std": 0.3930583894252777, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4655956029891968, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.27306708693504333, "step": 341 }, { "clip_ratio/high_max": 0.006897386163473129, "clip_ratio/high_mean": 0.002977658063173294, "clip_ratio/low_mean": 0.003577468916773796, "clip_ratio/low_min": 0.0011305396910756826, "clip_ratio/region_mean": 0.006555126514285803, "epoch": 0.0037397893908079913, "grad_norm": 0.23642009496688843, "learning_rate": 1e-06, "loss": 0.0165, "step": 342 }, { "clip_ratio/high_max": 0.014641819521784782, "clip_ratio/high_mean": 0.0063910456374287605, "clip_ratio/low_mean": 0.011971807107329369, "clip_ratio/low_min": 0.004672897048294544, "clip_ratio/region_mean": 0.018362851813435555, "epoch": 0.0037507244475062603, "grad_norm": 0.15636758506298065, "learning_rate": 1e-06, "loss": 0.0153, "step": 343 }, { "clip_ratio/high_max": 0.019724104553461075, "clip_ratio/high_mean": 0.008977481164038181, "clip_ratio/low_mean": 0.01999574340879917, "clip_ratio/low_min": 0.007989146746695042, "clip_ratio/region_mean": 0.02897322177886963, "epoch": 0.0037616595042045294, "grad_norm": 0.1360606998205185, "learning_rate": 1e-06, "loss": 0.0145, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 798.9464721679688, "completions/mean_terminated_length": 740.5794067382812, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.0037725945609027984, "grad_norm": 0.1632775068283081, "learning_rate": 1e-06, "loss": 0.0574, "num_tokens": 8547645.0, "reward": 0.30446434020996094, "reward_std": 0.20445600152015686, "rewards/accuracy_reward/mean": 0.2142857164144516, "rewards/accuracy_reward/std": 0.41217005252838135, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417314529419, "step": 345 }, { "clip_ratio/high_max": 0.0033106657210737467, "clip_ratio/high_mean": 0.0013432787964120507, "clip_ratio/low_mean": 0.0016613679472357035, "clip_ratio/low_min": 0.00020801553910132498, "clip_ratio/region_mean": 0.0030046466272324324, "epoch": 0.0037835296176010674, "grad_norm": 0.09988246113061905, "learning_rate": 1e-06, "loss": 0.057, "step": 346 }, { "clip_ratio/high_max": 0.007203226909041405, "clip_ratio/high_mean": 0.0028512105345726013, "clip_ratio/low_mean": 0.004299093037843704, "clip_ratio/low_min": 0.00020801553910132498, "clip_ratio/region_mean": 0.007150303106755018, "epoch": 0.0037944646742993364, "grad_norm": 0.0736912339925766, "learning_rate": 1e-06, "loss": 0.0567, "step": 347 }, { "clip_ratio/high_max": 0.009892432019114494, "clip_ratio/high_mean": 0.004035804886370897, "clip_ratio/low_mean": 0.007103733718395233, "clip_ratio/low_min": 0.0008320621564052999, "clip_ratio/region_mean": 0.011139539070427418, "epoch": 0.0038053997309976054, "grad_norm": 0.06604953855276108, "learning_rate": 1e-06, "loss": 0.0565, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 640.6339721679688, "completions/mean_terminated_length": 640.6339721679688, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.003816334787695874, "grad_norm": 0.1440352499485016, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 8637876.0, "reward": 0.39732152223587036, "reward_std": 0.1524580717086792, "rewards/accuracy_reward/mean": 0.3035714328289032, "rewards/accuracy_reward/std": 0.46186625957489014, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 349 }, { "clip_ratio/high_max": 0.0021663494408130646, "clip_ratio/high_mean": 0.0012485334882512689, "clip_ratio/low_mean": 0.003454709192737937, "clip_ratio/low_min": 0.0002741791831795126, "clip_ratio/region_mean": 0.004703241866081953, "epoch": 0.003827269844394143, "grad_norm": 0.07604376971721649, "learning_rate": 1e-06, "loss": 0.0048, "step": 350 }, { "clip_ratio/high_max": 0.003938816953450441, "clip_ratio/high_mean": 0.002022993052378297, "clip_ratio/low_mean": 0.007657899521291256, "clip_ratio/low_min": 0.00034272397169843316, "clip_ratio/region_mean": 0.00968089234083891, "epoch": 0.003838204901092412, "grad_norm": 0.057267896831035614, "learning_rate": 1e-06, "loss": 0.0046, "step": 351 }, { "clip_ratio/high_max": 0.00564563786610961, "clip_ratio/high_mean": 0.002665670355781913, "clip_ratio/low_mean": 0.011474096216261387, "clip_ratio/low_min": 0.0006854479433968663, "clip_ratio/region_mean": 0.014139766804873943, "epoch": 0.003849139957790681, "grad_norm": 0.04401132091879845, "learning_rate": 1e-06, "loss": 0.0045, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 756.0267944335938, "completions/mean_terminated_length": 744.3873901367188, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.00386007501448895, "grad_norm": 0.30809324979782104, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 8739267.0, "reward": 0.3035714626312256, "reward_std": 0.1517094224691391, "rewards/accuracy_reward/mean": 0.2053571492433548, "rewards/accuracy_reward/std": 0.4057779312133789, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 353 }, { "clip_ratio/high_max": 0.017117958515882492, "clip_ratio/high_mean": 0.0036036537494510412, "clip_ratio/low_mean": 0.002718703355640173, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0063223568722605705, "epoch": 0.003871010071187219, "grad_norm": 0.1349509060382843, "learning_rate": 1e-06, "loss": -0.0002, "step": 354 }, { "clip_ratio/high_max": 0.02707749791443348, "clip_ratio/high_mean": 0.005973368883132935, "clip_ratio/low_mean": 0.005391333717852831, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011364703997969627, "epoch": 0.003881945127885488, "grad_norm": 0.09402363002300262, "learning_rate": 1e-06, "loss": -0.0004, "step": 355 }, { "clip_ratio/high_max": 0.0320572666823864, "clip_ratio/high_mean": 0.0076055703684687614, "clip_ratio/low_mean": 0.007130791898816824, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014736362732946873, "epoch": 0.003892880184583757, "grad_norm": 0.09047137945890427, "learning_rate": 1e-06, "loss": -0.0006, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 996.0714721679688, "completions/mean_terminated_length": 915.1538696289062, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.003903815241282026, "grad_norm": 0.20571313798427582, "learning_rate": 1e-06, "loss": -0.0054, "num_tokens": 8867459.0, "reward": 0.33750003576278687, "reward_std": 0.22338135540485382, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4349588453769684, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33220529556274414, "step": 357 }, { "clip_ratio/high_max": 0.003827910404652357, "clip_ratio/high_mean": 0.0019085257081314921, "clip_ratio/low_mean": 0.001914918189868331, "clip_ratio/low_min": 0.0003644905227702111, "clip_ratio/region_mean": 0.003823444014415145, "epoch": 0.003914750297980295, "grad_norm": 0.12756165862083435, "learning_rate": 1e-06, "loss": -0.006, "step": 358 }, { "clip_ratio/high_max": 0.00987325794994831, "clip_ratio/high_mean": 0.004274412523955107, "clip_ratio/low_mean": 0.004638587590306997, "clip_ratio/low_min": 0.0008909768075682223, "clip_ratio/region_mean": 0.008913001045584679, "epoch": 0.003925685354678564, "grad_norm": 0.09420273452997208, "learning_rate": 1e-06, "loss": -0.0065, "step": 359 }, { "clip_ratio/high_max": 0.015545129776000977, "clip_ratio/high_mean": 0.006446208339184523, "clip_ratio/low_mean": 0.007435681764036417, "clip_ratio/low_min": 0.0010934715392068028, "clip_ratio/region_mean": 0.01388189010322094, "epoch": 0.003936620411376833, "grad_norm": 0.07976268976926804, "learning_rate": 1e-06, "loss": -0.0069, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 745.1964721679688, "completions/mean_terminated_length": 745.1964721679688, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.003947555468075102, "grad_norm": 0.22945274412631989, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 8970633.0, "reward": 0.5008928775787354, "reward_std": 0.34413689374923706, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.4924624562263489, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 361 }, { "clip_ratio/high_max": 0.004260556772351265, "clip_ratio/high_mean": 0.0023939411621540785, "clip_ratio/low_mean": 0.003118568565696478, "clip_ratio/low_min": 0.001026288839057088, "clip_ratio/region_mean": 0.0055125099606812, "epoch": 0.003958490524773371, "grad_norm": 0.14420168101787567, "learning_rate": 1e-06, "loss": 0.017, "step": 362 }, { "clip_ratio/high_max": 0.00914537999778986, "clip_ratio/high_mean": 0.004657930228859186, "clip_ratio/low_mean": 0.00662363413721323, "clip_ratio/low_min": 0.0014210152439773083, "clip_ratio/region_mean": 0.011281564831733704, "epoch": 0.00396942558147164, "grad_norm": 0.1072487160563469, "learning_rate": 1e-06, "loss": 0.0165, "step": 363 }, { "clip_ratio/high_max": 0.012614317238330841, "clip_ratio/high_mean": 0.006063615437597036, "clip_ratio/low_mean": 0.010079247877001762, "clip_ratio/low_min": 0.002605194691568613, "clip_ratio/region_mean": 0.016142861917614937, "epoch": 0.003980360638169909, "grad_norm": 0.0925506055355072, "learning_rate": 1e-06, "loss": 0.0162, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1997.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 561.9732666015625, "completions/mean_terminated_length": 561.9732666015625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.0039912956948681775, "grad_norm": 0.23219901323318481, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 9052810.0, "reward": 0.4303572475910187, "reward_std": 0.18659235537052155, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.47245559096336365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.010797342285513878, "clip_ratio/high_mean": 0.002183460397645831, "clip_ratio/low_mean": 0.004311223980039358, "clip_ratio/low_min": 0.001752848387695849, "clip_ratio/region_mean": 0.00649468507617712, "epoch": 0.004002230751566447, "grad_norm": 0.13747113943099976, "learning_rate": 1e-06, "loss": 0.0016, "step": 366 }, { "clip_ratio/high_max": 0.01934029348194599, "clip_ratio/high_mean": 0.004131282214075327, "clip_ratio/low_mean": 0.011956683360040188, "clip_ratio/low_min": 0.004256917629390955, "clip_ratio/region_mean": 0.016087966039776802, "epoch": 0.0040131658082647155, "grad_norm": 0.10388217121362686, "learning_rate": 1e-06, "loss": 0.0011, "step": 367 }, { "clip_ratio/high_max": 0.023493118584156036, "clip_ratio/high_mean": 0.005141339264810085, "clip_ratio/low_mean": 0.019209247082471848, "clip_ratio/low_min": 0.006886190269142389, "clip_ratio/region_mean": 0.024350585415959358, "epoch": 0.004024100864962985, "grad_norm": 0.07855092734098434, "learning_rate": 1e-06, "loss": 0.0007, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 648.7857666015625, "completions/mean_terminated_length": 648.7857666015625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.004035035921661254, "grad_norm": 0.3764286935329437, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 9143338.0, "reward": 0.48214295506477356, "reward_std": 0.33976519107818604, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48852667212486267, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 369 }, { "clip_ratio/high_max": 0.011190913617610931, "clip_ratio/high_mean": 0.004899663385003805, "clip_ratio/low_mean": 0.004440761636942625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00934042502194643, "epoch": 0.004045970978359523, "grad_norm": 0.20788508653640747, "learning_rate": 1e-06, "loss": 0.0099, "step": 370 }, { "clip_ratio/high_max": 0.020544512197375298, "clip_ratio/high_mean": 0.008569741621613503, "clip_ratio/low_mean": 0.009177908301353455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017747649922966957, "epoch": 0.004056906035057792, "grad_norm": 0.16150394082069397, "learning_rate": 1e-06, "loss": 0.0091, "step": 371 }, { "clip_ratio/high_max": 0.024553198367357254, "clip_ratio/high_mean": 0.011466169729828835, "clip_ratio/low_mean": 0.013304016552865505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024770187214016914, "epoch": 0.004067841091756061, "grad_norm": 0.14075152575969696, "learning_rate": 1e-06, "loss": 0.0086, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1749.0, "completions/max_terminated_length": 1749.0, "completions/mean_length": 792.8303833007812, "completions/mean_terminated_length": 792.8303833007812, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.00407877614845433, "grad_norm": 0.32313576340675354, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 9249075.0, "reward": 0.4125000536441803, "reward_std": 0.29105621576309204, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4655956029891968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.00818464532494545, "clip_ratio/high_mean": 0.0037482099141925573, "clip_ratio/low_mean": 0.004159366246312857, "clip_ratio/low_min": 0.0009690369479358196, "clip_ratio/region_mean": 0.00790757592767477, "epoch": 0.004089711205152599, "grad_norm": 0.19429123401641846, "learning_rate": 1e-06, "loss": 0.0016, "step": 374 }, { "clip_ratio/high_max": 0.022582827135920525, "clip_ratio/high_mean": 0.008606873452663422, "clip_ratio/low_mean": 0.010430485010147095, "clip_ratio/low_min": 0.0013843384804204106, "clip_ratio/region_mean": 0.019037356600165367, "epoch": 0.004100646261850868, "grad_norm": 0.1462700515985489, "learning_rate": 1e-06, "loss": 0.0007, "step": 375 }, { "clip_ratio/high_max": 0.031913455575704575, "clip_ratio/high_mean": 0.012066367082297802, "clip_ratio/low_mean": 0.016883164644241333, "clip_ratio/low_min": 0.0017996401293203235, "clip_ratio/region_mean": 0.02894953265786171, "epoch": 0.004111581318549136, "grad_norm": 0.10822377353906631, "learning_rate": 1e-06, "loss": 0.0001, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 816.8392944335938, "completions/mean_terminated_length": 805.7477416992188, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.004122516375247406, "grad_norm": 0.28729668259620667, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 9356997.0, "reward": 0.3812500536441803, "reward_std": 0.2464848756790161, "rewards/accuracy_reward/mean": 0.2857142984867096, "rewards/accuracy_reward/std": 0.453784316778183, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 377 }, { "clip_ratio/high_max": 0.008606777526438236, "clip_ratio/high_mean": 0.003377232700586319, "clip_ratio/low_mean": 0.003555637551471591, "clip_ratio/low_min": 0.000395078444853425, "clip_ratio/region_mean": 0.006932870484888554, "epoch": 0.004133451431945674, "grad_norm": 0.176166370511055, "learning_rate": 1e-06, "loss": 0.0019, "step": 378 }, { "clip_ratio/high_max": 0.015277030877768993, "clip_ratio/high_mean": 0.006159103009849787, "clip_ratio/low_mean": 0.007386974990367889, "clip_ratio/low_min": 0.0007337171118706465, "clip_ratio/region_mean": 0.013546077534556389, "epoch": 0.004144386488643944, "grad_norm": 0.13360565900802612, "learning_rate": 1e-06, "loss": 0.0012, "step": 379 }, { "clip_ratio/high_max": 0.023023130372166634, "clip_ratio/high_mean": 0.008464730344712734, "clip_ratio/low_mean": 0.011043580248951912, "clip_ratio/low_min": 0.001467434223741293, "clip_ratio/region_mean": 0.01950831152498722, "epoch": 0.004155321545342212, "grad_norm": 0.13168999552726746, "learning_rate": 1e-06, "loss": 0.0008, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1185.2054443359375, "completions/mean_terminated_length": 1144.8878173828125, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.004166256602040482, "grad_norm": 0.16884374618530273, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 9514912.0, "reward": 0.3214286267757416, "reward_std": 0.1638985574245453, "rewards/accuracy_reward/mean": 0.2232142835855484, "rewards/accuracy_reward/std": 0.41827231645584106, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 381 }, { "clip_ratio/high_max": 0.004694835748523474, "clip_ratio/high_mean": 0.00125222303904593, "clip_ratio/low_mean": 0.0018456929828971624, "clip_ratio/low_min": 0.00134036666713655, "clip_ratio/region_mean": 0.003097916254773736, "epoch": 0.00417719165873875, "grad_norm": 0.08357404172420502, "learning_rate": 1e-06, "loss": -0.0056, "step": 382 }, { "clip_ratio/high_max": 0.009117507375776768, "clip_ratio/high_mean": 0.002300156047567725, "clip_ratio/low_mean": 0.0036428479943424463, "clip_ratio/low_min": 0.0015146051300689578, "clip_ratio/region_mean": 0.0059430040419101715, "epoch": 0.00418812671543702, "grad_norm": 0.061668042093515396, "learning_rate": 1e-06, "loss": -0.0058, "step": 383 }, { "clip_ratio/high_max": 0.011975233443081379, "clip_ratio/high_mean": 0.003158664796501398, "clip_ratio/low_mean": 0.005762381013482809, "clip_ratio/low_min": 0.002632527844980359, "clip_ratio/region_mean": 0.008921045809984207, "epoch": 0.0041990617721352885, "grad_norm": 0.050305940210819244, "learning_rate": 1e-06, "loss": -0.006, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 854.5803833007812, "completions/mean_terminated_length": 843.828857421875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.004209996828833558, "grad_norm": 0.19435937702655792, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 9625857.0, "reward": 0.142857164144516, "reward_std": 0.10406188666820526, "rewards/accuracy_reward/mean": 0.0446428582072258, "rewards/accuracy_reward/std": 0.2074466347694397, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 385 }, { "clip_ratio/high_max": 0.008551483042538166, "clip_ratio/high_mean": 0.0016871389234438539, "clip_ratio/low_mean": 0.0033148841466754675, "clip_ratio/low_min": 0.000329142261762172, "clip_ratio/region_mean": 0.005002022720873356, "epoch": 0.0042209318855318265, "grad_norm": 0.10696747899055481, "learning_rate": 1e-06, "loss": 0.0039, "step": 386 }, { "clip_ratio/high_max": 0.012216405011713505, "clip_ratio/high_mean": 0.002561283065006137, "clip_ratio/low_mean": 0.0062170266173779964, "clip_ratio/low_min": 0.000658284523524344, "clip_ratio/region_mean": 0.008778310380876064, "epoch": 0.004231866942230095, "grad_norm": 0.06800665706396103, "learning_rate": 1e-06, "loss": 0.0037, "step": 387 }, { "clip_ratio/high_max": 0.013787085190415382, "clip_ratio/high_mean": 0.00289428373798728, "clip_ratio/low_mean": 0.008356082253158092, "clip_ratio/low_min": 0.0007899414049461484, "clip_ratio/region_mean": 0.011250368319451809, "epoch": 0.0042428019989283645, "grad_norm": 0.0625847578048706, "learning_rate": 1e-06, "loss": 0.0037, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 557.0892944335938, "completions/mean_terminated_length": 557.0892944335938, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.004253737055626633, "grad_norm": 0.3644300699234009, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 9704323.0, "reward": 0.9205358028411865, "reward_std": 0.3205476403236389, "rewards/accuracy_reward/mean": 0.8214285969734192, "rewards/accuracy_reward/std": 0.38471439480781555, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 389 }, { "clip_ratio/high_max": 0.009526518173515797, "clip_ratio/high_mean": 0.004253768362104893, "clip_ratio/low_mean": 0.003988626878708601, "clip_ratio/low_min": 0.0003299241070635617, "clip_ratio/region_mean": 0.008242395706474781, "epoch": 0.004264672112324903, "grad_norm": 0.2058568149805069, "learning_rate": 1e-06, "loss": 0.0217, "step": 390 }, { "clip_ratio/high_max": 0.019161134958267212, "clip_ratio/high_mean": 0.009004028514027596, "clip_ratio/low_mean": 0.008361774496734142, "clip_ratio/low_min": 0.0008293249411508441, "clip_ratio/region_mean": 0.017365803942084312, "epoch": 0.004275607169023171, "grad_norm": 0.14397075772285461, "learning_rate": 1e-06, "loss": 0.0208, "step": 391 }, { "clip_ratio/high_max": 0.030627639964222908, "clip_ratio/high_mean": 0.014489359222352505, "clip_ratio/low_mean": 0.011646748520433903, "clip_ratio/low_min": 0.0020895195193588734, "clip_ratio/region_mean": 0.026136109605431557, "epoch": 0.004286542225721441, "grad_norm": 0.11740674823522568, "learning_rate": 1e-06, "loss": 0.0203, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 880.794677734375, "completions/mean_terminated_length": 859.5726928710938, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.004297477282419709, "grad_norm": 0.3406054377555847, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 9822012.0, "reward": 0.35357150435447693, "reward_std": 0.3527674078941345, "rewards/accuracy_reward/mean": 0.2589285671710968, "rewards/accuracy_reward/std": 0.44001504778862, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261819988489151, "step": 393 }, { "clip_ratio/high_max": 0.0100626852363348, "clip_ratio/high_mean": 0.004006435163319111, "clip_ratio/low_mean": 0.003916099201887846, "clip_ratio/low_min": 4.639295002561994e-05, "clip_ratio/region_mean": 0.007922534830868244, "epoch": 0.004308412339117979, "grad_norm": 0.22178521752357483, "learning_rate": 1e-06, "loss": 0.0147, "step": 394 }, { "clip_ratio/high_max": 0.018970636650919914, "clip_ratio/high_mean": 0.008150049485266209, "clip_ratio/low_mean": 0.010428179986774921, "clip_ratio/low_min": 0.0002319647464901209, "clip_ratio/region_mean": 0.01857822947204113, "epoch": 0.004319347395816247, "grad_norm": 0.16091831028461456, "learning_rate": 1e-06, "loss": 0.0138, "step": 395 }, { "clip_ratio/high_max": 0.024249423295259476, "clip_ratio/high_mean": 0.010466156527400017, "clip_ratio/low_mean": 0.016035685315728188, "clip_ratio/low_min": 0.0003711436002049595, "clip_ratio/region_mean": 0.026501838117837906, "epoch": 0.004330282452514517, "grad_norm": 0.14533838629722595, "learning_rate": 1e-06, "loss": 0.0132, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 970.5982666015625, "completions/mean_terminated_length": 930.6944580078125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.004341217509212785, "grad_norm": 0.21559830009937286, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 9947659.0, "reward": 0.4705357849597931, "reward_std": 0.3305283784866333, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.4862987697124481, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 397 }, { "clip_ratio/high_max": 0.0038722169119864702, "clip_ratio/high_mean": 0.0018116988940164447, "clip_ratio/low_mean": 0.0032754200510680676, "clip_ratio/low_min": 0.0006867457996122539, "clip_ratio/region_mean": 0.005087118595838547, "epoch": 0.004352152565911054, "grad_norm": 0.12829799950122833, "learning_rate": 1e-06, "loss": 0.0098, "step": 398 }, { "clip_ratio/high_max": 0.0074217491783201694, "clip_ratio/high_mean": 0.0038047300186008215, "clip_ratio/low_mean": 0.0077881277538836, "clip_ratio/low_min": 0.0016678111860528588, "clip_ratio/region_mean": 0.011592858470976353, "epoch": 0.004363087622609323, "grad_norm": 0.09898648411035538, "learning_rate": 1e-06, "loss": 0.0092, "step": 399 }, { "clip_ratio/high_max": 0.010648596100509167, "clip_ratio/high_mean": 0.005447011440992355, "clip_ratio/low_mean": 0.01244239043444395, "clip_ratio/low_min": 0.0023545571602880955, "clip_ratio/region_mean": 0.01788940094411373, "epoch": 0.004374022679307592, "grad_norm": 0.08050595223903656, "learning_rate": 1e-06, "loss": 0.0088, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 811.169677734375, "completions/mean_terminated_length": 800.0270385742188, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.004384957736005861, "grad_norm": 0.3616224229335785, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 10057578.0, "reward": 0.508928656578064, "reward_std": 0.3820215165615082, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4941745698451996, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 401 }, { "clip_ratio/high_max": 0.007955857552587986, "clip_ratio/high_mean": 0.004069387447088957, "clip_ratio/low_mean": 0.004056707955896854, "clip_ratio/low_min": 0.002264946000650525, "clip_ratio/region_mean": 0.008126095868647099, "epoch": 0.00439589279270413, "grad_norm": 0.2187660187482834, "learning_rate": 1e-06, "loss": -0.0033, "step": 402 }, { "clip_ratio/high_max": 0.01674761436879635, "clip_ratio/high_mean": 0.009014414623379707, "clip_ratio/low_mean": 0.009614810347557068, "clip_ratio/low_min": 0.004582565277814865, "clip_ratio/region_mean": 0.018629224970936775, "epoch": 0.004406827849402399, "grad_norm": 0.15716494619846344, "learning_rate": 1e-06, "loss": -0.0043, "step": 403 }, { "clip_ratio/high_max": 0.023097651079297066, "clip_ratio/high_mean": 0.011695362627506256, "clip_ratio/low_mean": 0.015055136755108833, "clip_ratio/low_min": 0.007690281607210636, "clip_ratio/region_mean": 0.02675050124526024, "epoch": 0.004417762906100668, "grad_norm": 0.1297202706336975, "learning_rate": 1e-06, "loss": -0.0049, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 842.6875610351562, "completions/mean_terminated_length": 798.0463256835938, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.0044286979627989375, "grad_norm": 0.2668319046497345, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 10172651.0, "reward": 0.43392857909202576, "reward_std": 0.24753466248512268, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.4755948781967163, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261820137500763, "step": 405 }, { "clip_ratio/high_max": 0.01661819778382778, "clip_ratio/high_mean": 0.0049111065454781055, "clip_ratio/low_mean": 0.003447099821642041, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008358205668628216, "epoch": 0.004439633019497206, "grad_norm": 0.1483461558818817, "learning_rate": 1e-06, "loss": 0.0063, "step": 406 }, { "clip_ratio/high_max": 0.027835479006171227, "clip_ratio/high_mean": 0.00882735289633274, "clip_ratio/low_mean": 0.008025792427361012, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016853144392371178, "epoch": 0.004450568076195475, "grad_norm": 0.10468899458646774, "learning_rate": 1e-06, "loss": 0.0057, "step": 407 }, { "clip_ratio/high_max": 0.03323639556765556, "clip_ratio/high_mean": 0.010982009582221508, "clip_ratio/low_mean": 0.012611044570803642, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023593055084347725, "epoch": 0.004461503132893744, "grad_norm": 0.08629827946424484, "learning_rate": 1e-06, "loss": 0.0053, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 842.982177734375, "completions/mean_terminated_length": 809.8164672851562, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.004472438189592013, "grad_norm": 0.34888604283332825, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 10294197.0, "reward": 0.5419643521308899, "reward_std": 0.25207969546318054, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49935612082481384, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 409 }, { "clip_ratio/high_max": 0.012383177876472473, "clip_ratio/high_mean": 0.002808554330840707, "clip_ratio/low_mean": 0.0055130585096776485, "clip_ratio/low_min": 0.0016065363306552172, "clip_ratio/region_mean": 0.008321613073348999, "epoch": 0.004483373246290282, "grad_norm": 0.19308611750602722, "learning_rate": 1e-06, "loss": 0.0033, "step": 410 }, { "clip_ratio/high_max": 0.020093457773327827, "clip_ratio/high_mean": 0.005305344704538584, "clip_ratio/low_mean": 0.009981426410377026, "clip_ratio/low_min": 0.002788951387628913, "clip_ratio/region_mean": 0.015286771580576897, "epoch": 0.004494308302988551, "grad_norm": 0.16588351130485535, "learning_rate": 1e-06, "loss": 0.0026, "step": 411 }, { "clip_ratio/high_max": 0.025934578850865364, "clip_ratio/high_mean": 0.00731480959802866, "clip_ratio/low_mean": 0.013854661956429482, "clip_ratio/low_min": 0.00450522918254137, "clip_ratio/region_mean": 0.021169470623135567, "epoch": 0.00450524335968682, "grad_norm": 0.12492897361516953, "learning_rate": 1e-06, "loss": 0.002, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 439.01788330078125, "completions/mean_terminated_length": 439.01788330078125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.004516178416385089, "grad_norm": 0.3079312741756439, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 10359551.0, "reward": 0.8678572773933411, "reward_std": 0.29201769828796387, "rewards/accuracy_reward/mean": 0.7678571343421936, "rewards/accuracy_reward/std": 0.4240972101688385, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.005604483652859926, "clip_ratio/high_mean": 0.0035014364402741194, "clip_ratio/low_mean": 0.0036602825857698917, "clip_ratio/low_min": 0.0011660447344183922, "clip_ratio/region_mean": 0.007161719258874655, "epoch": 0.004527113473083358, "grad_norm": 0.18132828176021576, "learning_rate": 1e-06, "loss": 0.0327, "step": 414 }, { "clip_ratio/high_max": 0.01281024795025587, "clip_ratio/high_mean": 0.008008226752281189, "clip_ratio/low_mean": 0.008326650597155094, "clip_ratio/low_min": 0.0018656715983524919, "clip_ratio/region_mean": 0.01633487641811371, "epoch": 0.004538048529781627, "grad_norm": 0.12174264341592789, "learning_rate": 1e-06, "loss": 0.0322, "step": 415 }, { "clip_ratio/high_max": 0.01921537145972252, "clip_ratio/high_mean": 0.010853893123567104, "clip_ratio/low_mean": 0.01171119324862957, "clip_ratio/low_min": 0.0023320894688367844, "clip_ratio/region_mean": 0.02256508357822895, "epoch": 0.004548983586479896, "grad_norm": 0.10033378005027771, "learning_rate": 1e-06, "loss": 0.0317, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 702.4553833007812, "completions/mean_terminated_length": 652.620361328125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.004559918643178165, "grad_norm": 0.299993634223938, "learning_rate": 1e-06, "loss": 0.0509, "num_tokens": 10454774.0, "reward": 0.7026787400245667, "reward_std": 0.37448176741600037, "rewards/accuracy_reward/mean": 0.6071428656578064, "rewards/accuracy_reward/std": 0.4905805289745331, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 417 }, { "clip_ratio/high_max": 0.007763586472719908, "clip_ratio/high_mean": 0.003772138385102153, "clip_ratio/low_mean": 0.003428997239097953, "clip_ratio/low_min": 0.001804692205041647, "clip_ratio/region_mean": 0.007201135624200106, "epoch": 0.0045708536998764334, "grad_norm": 0.1786826103925705, "learning_rate": 1e-06, "loss": 0.0499, "step": 418 }, { "clip_ratio/high_max": 0.013447565026581287, "clip_ratio/high_mean": 0.008034558035433292, "clip_ratio/low_mean": 0.008043305017054081, "clip_ratio/low_min": 0.004663268569856882, "clip_ratio/region_mean": 0.016077861189842224, "epoch": 0.004581788756574703, "grad_norm": 0.14025753736495972, "learning_rate": 1e-06, "loss": 0.0492, "step": 419 }, { "clip_ratio/high_max": 0.021364277228713036, "clip_ratio/high_mean": 0.011645530350506306, "clip_ratio/low_mean": 0.011829162947833538, "clip_ratio/low_min": 0.006073093973100185, "clip_ratio/region_mean": 0.023474693298339844, "epoch": 0.0045927238132729715, "grad_norm": 0.10820312798023224, "learning_rate": 1e-06, "loss": 0.0487, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 517.8660888671875, "completions/mean_terminated_length": 517.8660888671875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.004603658869971241, "grad_norm": 0.2779967486858368, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 10530191.0, "reward": 0.9026786684989929, "reward_std": 0.2519865036010742, "rewards/accuracy_reward/mean": 0.8035714030265808, "rewards/accuracy_reward/std": 0.3990819454193115, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 421 }, { "clip_ratio/high_max": 0.010168584063649178, "clip_ratio/high_mean": 0.004468988161534071, "clip_ratio/low_mean": 0.0030440576374530792, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00751304579898715, "epoch": 0.0046145939266695095, "grad_norm": 0.1259162873029709, "learning_rate": 1e-06, "loss": 0.0161, "step": 422 }, { "clip_ratio/high_max": 0.01685844175517559, "clip_ratio/high_mean": 0.00771013367921114, "clip_ratio/low_mean": 0.00529073178768158, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01300086546689272, "epoch": 0.004625528983367779, "grad_norm": 0.0929860919713974, "learning_rate": 1e-06, "loss": 0.0158, "step": 423 }, { "clip_ratio/high_max": 0.019610276445746422, "clip_ratio/high_mean": 0.009916586801409721, "clip_ratio/low_mean": 0.007154133636504412, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017070719972252846, "epoch": 0.0046364640400660476, "grad_norm": 0.09623173624277115, "learning_rate": 1e-06, "loss": 0.0156, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 649.7053833007812, "completions/mean_terminated_length": 649.7053833007812, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.004647399096764317, "grad_norm": 0.18498457968235016, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 10620962.0, "reward": 0.5151786208152771, "reward_std": 0.24639706313610077, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.49571847915649414, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 425 }, { "clip_ratio/high_max": 0.0065146577544510365, "clip_ratio/high_mean": 0.0029351522680372, "clip_ratio/low_mean": 0.0019465460209175944, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004881698172539473, "epoch": 0.004658334153462586, "grad_norm": 0.10022404044866562, "learning_rate": 1e-06, "loss": 0.0054, "step": 426 }, { "clip_ratio/high_max": 0.010553745552897453, "clip_ratio/high_mean": 0.0051738121546804905, "clip_ratio/low_mean": 0.004075776319950819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009249587543308735, "epoch": 0.004669269210160855, "grad_norm": 0.08102551102638245, "learning_rate": 1e-06, "loss": 0.0052, "step": 427 }, { "clip_ratio/high_max": 0.014619418419897556, "clip_ratio/high_mean": 0.006738331168889999, "clip_ratio/low_mean": 0.006340779364109039, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013079111464321613, "epoch": 0.004680204266859124, "grad_norm": 0.06723509728908539, "learning_rate": 1e-06, "loss": 0.005, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 480.8125305175781, "completions/mean_terminated_length": 480.8125305175781, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.004691139323557392, "grad_norm": 0.44922947883605957, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 10690549.0, "reward": 0.4919643700122833, "reward_std": 0.35284894704818726, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.4905804991722107, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 429 }, { "clip_ratio/high_max": 0.009379968047142029, "clip_ratio/high_mean": 0.004745785612612963, "clip_ratio/low_mean": 0.005614419002085924, "clip_ratio/low_min": 0.0014989860355854034, "clip_ratio/region_mean": 0.010360204614698887, "epoch": 0.004702074380255662, "grad_norm": 0.2693074345588684, "learning_rate": 1e-06, "loss": 0.0084, "step": 430 }, { "clip_ratio/high_max": 0.014944355934858322, "clip_ratio/high_mean": 0.008310304023325443, "clip_ratio/low_mean": 0.011558307334780693, "clip_ratio/low_min": 0.0026305406354367733, "clip_ratio/region_mean": 0.01986861042678356, "epoch": 0.00471300943695393, "grad_norm": 0.19789746403694153, "learning_rate": 1e-06, "loss": 0.0075, "step": 431 }, { "clip_ratio/high_max": 0.02384737692773342, "clip_ratio/high_mean": 0.011769689619541168, "clip_ratio/low_mean": 0.018936067819595337, "clip_ratio/low_min": 0.005655662156641483, "clip_ratio/region_mean": 0.030705761164426804, "epoch": 0.0047239444936522, "grad_norm": 0.18328450620174408, "learning_rate": 1e-06, "loss": 0.0068, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 682.0, "completions/mean_terminated_length": 669.6937255859375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.004734879550350468, "grad_norm": 0.21344046294689178, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 10788269.0, "reward": 0.5535714626312256, "reward_std": 0.23540560901165009, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.500241219997406, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 433 }, { "clip_ratio/high_max": 0.004372747149318457, "clip_ratio/high_mean": 0.0022531363647431135, "clip_ratio/low_mean": 0.0023609879426658154, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0046141245402395725, "epoch": 0.004745814607048738, "grad_norm": 0.10622672736644745, "learning_rate": 1e-06, "loss": 0.0133, "step": 434 }, { "clip_ratio/high_max": 0.009868226945400238, "clip_ratio/high_mean": 0.005067841149866581, "clip_ratio/low_mean": 0.0055234478786587715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010591289028525352, "epoch": 0.004756749663747006, "grad_norm": 0.07889661937952042, "learning_rate": 1e-06, "loss": 0.0129, "step": 435 }, { "clip_ratio/high_max": 0.01595461741089821, "clip_ratio/high_mean": 0.00700655672699213, "clip_ratio/low_mean": 0.007970079779624939, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014976635575294495, "epoch": 0.004767684720445276, "grad_norm": 0.06783241778612137, "learning_rate": 1e-06, "loss": 0.0126, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 562.3839721679688, "completions/mean_terminated_length": 562.3839721679688, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.004778619777143544, "grad_norm": 0.38017594814300537, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 10868840.0, "reward": 0.5285715460777283, "reward_std": 0.3182254731655121, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.4970957934856415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.008166748099029064, "clip_ratio/high_mean": 0.003372583771124482, "clip_ratio/low_mean": 0.004712698515504599, "clip_ratio/low_min": 0.0022890777327120304, "clip_ratio/region_mean": 0.008085282519459724, "epoch": 0.004789554833841814, "grad_norm": 0.2275373935699463, "learning_rate": 1e-06, "loss": -0.0047, "step": 438 }, { "clip_ratio/high_max": 0.014627011492848396, "clip_ratio/high_mean": 0.00672884052619338, "clip_ratio/low_mean": 0.013095042668282986, "clip_ratio/low_min": 0.0035126234870404005, "clip_ratio/region_mean": 0.01982388272881508, "epoch": 0.0048004898905400824, "grad_norm": 0.16164880990982056, "learning_rate": 1e-06, "loss": -0.0057, "step": 439 }, { "clip_ratio/high_max": 0.016455387696623802, "clip_ratio/high_mean": 0.008718007244169712, "clip_ratio/low_mean": 0.020343318581581116, "clip_ratio/low_min": 0.005488474387675524, "clip_ratio/region_mean": 0.029061326757073402, "epoch": 0.004811424947238351, "grad_norm": 0.12413939088582993, "learning_rate": 1e-06, "loss": -0.0064, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 900.4464721679688, "completions/mean_terminated_length": 890.108154296875, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.0048223600039366205, "grad_norm": 0.20689430832862854, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 10992538.0, "reward": 0.24642857909202576, "reward_std": 0.2566687762737274, "rewards/accuracy_reward/mean": 0.1607142835855484, "rewards/accuracy_reward/std": 0.368917852640152, "rewards/format_reward/mean": 0.8571428656578064, "rewards/format_reward/std": 0.3514997959136963, "step": 441 }, { "clip_ratio/high_max": 0.0038137822411954403, "clip_ratio/high_mean": 0.002254741033539176, "clip_ratio/low_mean": 0.002430385211482644, "clip_ratio/low_min": 0.0003713330952450633, "clip_ratio/region_mean": 0.00468512624502182, "epoch": 0.004833295060634889, "grad_norm": 0.12637531757354736, "learning_rate": 1e-06, "loss": 0.0311, "step": 442 }, { "clip_ratio/high_max": 0.006312467157840729, "clip_ratio/high_mean": 0.003964635077863932, "clip_ratio/low_mean": 0.0059314328245818615, "clip_ratio/low_min": 0.0014117996906861663, "clip_ratio/region_mean": 0.009896067902445793, "epoch": 0.0048442301173331585, "grad_norm": 0.09446990489959717, "learning_rate": 1e-06, "loss": 0.0306, "step": 443 }, { "clip_ratio/high_max": 0.008745428174734116, "clip_ratio/high_mean": 0.005657802801579237, "clip_ratio/low_mean": 0.009776296094059944, "clip_ratio/low_min": 0.0027492940425872803, "clip_ratio/region_mean": 0.015434099361300468, "epoch": 0.004855165174031427, "grad_norm": 0.08189226686954498, "learning_rate": 1e-06, "loss": 0.0302, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 635.4017944335938, "completions/mean_terminated_length": 635.4017944335938, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.004866100230729697, "grad_norm": 0.3797457814216614, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 11079299.0, "reward": 0.31071433424949646, "reward_std": 0.32246968150138855, "rewards/accuracy_reward/mean": 0.2142857164144516, "rewards/accuracy_reward/std": 0.41217005252838135, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641091883182526, "step": 445 }, { "clip_ratio/high_max": 0.008618036285042763, "clip_ratio/high_mean": 0.0045418995432555676, "clip_ratio/low_mean": 0.005407730583101511, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009949631057679653, "epoch": 0.004877035287427965, "grad_norm": 0.2447299063205719, "learning_rate": 1e-06, "loss": -0.0052, "step": 446 }, { "clip_ratio/high_max": 0.01631271094083786, "clip_ratio/high_mean": 0.008213420398533344, "clip_ratio/low_mean": 0.014461757615208626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022675177082419395, "epoch": 0.004887970344126235, "grad_norm": 0.17408163845539093, "learning_rate": 1e-06, "loss": -0.0063, "step": 447 }, { "clip_ratio/high_max": 0.022058069705963135, "clip_ratio/high_mean": 0.010592096485197544, "clip_ratio/low_mean": 0.022521020844578743, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03311311826109886, "epoch": 0.004898905400824503, "grad_norm": 0.14891204237937927, "learning_rate": 1e-06, "loss": -0.0071, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 498.919677734375, "completions/mean_terminated_length": 498.919677734375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.004909840457522773, "grad_norm": 0.5287459492683411, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 11151950.0, "reward": 0.5267857909202576, "reward_std": 0.4235399067401886, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.4970957934856415, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 449 }, { "clip_ratio/high_max": 0.007528641726821661, "clip_ratio/high_mean": 0.003308182582259178, "clip_ratio/low_mean": 0.004833113867789507, "clip_ratio/low_min": 0.0024529844522476196, "clip_ratio/region_mean": 0.008141296915709972, "epoch": 0.004920775514221041, "grad_norm": 0.27205294370651245, "learning_rate": 1e-06, "loss": 0.028, "step": 450 }, { "clip_ratio/high_max": 0.01407528668642044, "clip_ratio/high_mean": 0.006627238821238279, "clip_ratio/low_mean": 0.011180011555552483, "clip_ratio/low_min": 0.00621931254863739, "clip_ratio/region_mean": 0.0178072489798069, "epoch": 0.00493171057091931, "grad_norm": 0.2544794976711273, "learning_rate": 1e-06, "loss": 0.0274, "step": 451 }, { "clip_ratio/high_max": 0.019312601536512375, "clip_ratio/high_mean": 0.008649109862744808, "clip_ratio/low_mean": 0.01699928566813469, "clip_ratio/low_min": 0.008183306083083153, "clip_ratio/region_mean": 0.025648394599556923, "epoch": 0.004942645627617579, "grad_norm": 0.2188624143600464, "learning_rate": 1e-06, "loss": 0.0269, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 869.0535888671875, "completions/mean_terminated_length": 858.4324340820312, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.004953580684315848, "grad_norm": 0.2866225242614746, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 11268296.0, "reward": 0.2794643044471741, "reward_std": 0.20204564929008484, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.3920665979385376, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.27306708693504333, "step": 453 }, { "clip_ratio/high_max": 0.009387267753481865, "clip_ratio/high_mean": 0.004020216874778271, "clip_ratio/low_mean": 0.0039015260990709066, "clip_ratio/low_min": 0.00024492794182151556, "clip_ratio/region_mean": 0.007921743206679821, "epoch": 0.004964515741014117, "grad_norm": 0.15116499364376068, "learning_rate": 1e-06, "loss": 0.0043, "step": 454 }, { "clip_ratio/high_max": 0.016555726528167725, "clip_ratio/high_mean": 0.006276329513639212, "clip_ratio/low_mean": 0.007582320366054773, "clip_ratio/low_min": 0.00024492794182151556, "clip_ratio/region_mean": 0.01385865081101656, "epoch": 0.004975450797712386, "grad_norm": 0.12391917407512665, "learning_rate": 1e-06, "loss": 0.004, "step": 455 }, { "clip_ratio/high_max": 0.01757979206740856, "clip_ratio/high_mean": 0.007447163108736277, "clip_ratio/low_mean": 0.010946075432002544, "clip_ratio/low_min": 0.00036739191273227334, "clip_ratio/region_mean": 0.01839323900640011, "epoch": 0.004986385854410655, "grad_norm": 0.13370417058467865, "learning_rate": 1e-06, "loss": 0.0038, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 933.3125610351562, "completions/mean_terminated_length": 892.0277709960938, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.004997320911108924, "grad_norm": 0.2651110887527466, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 11389375.0, "reward": 0.35535719990730286, "reward_std": 0.30802884697914124, "rewards/accuracy_reward/mean": 0.2589285671710968, "rewards/accuracy_reward/std": 0.4400150775909424, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641091883182526, "step": 457 }, { "clip_ratio/high_max": 0.005980124697089195, "clip_ratio/high_mean": 0.002423542784526944, "clip_ratio/low_mean": 0.0030224791262298822, "clip_ratio/low_min": 0.00015001499559730291, "clip_ratio/region_mean": 0.005446021910756826, "epoch": 0.005008255967807193, "grad_norm": 0.15757331252098083, "learning_rate": 1e-06, "loss": 0.0213, "step": 458 }, { "clip_ratio/high_max": 0.011608477681875229, "clip_ratio/high_mean": 0.005034833215177059, "clip_ratio/low_mean": 0.007703525014221668, "clip_ratio/low_min": 0.000550055003259331, "clip_ratio/region_mean": 0.012738357298076153, "epoch": 0.005019191024505462, "grad_norm": 0.11790647357702255, "learning_rate": 1e-06, "loss": 0.0206, "step": 459 }, { "clip_ratio/high_max": 0.015302084386348724, "clip_ratio/high_mean": 0.006805465091019869, "clip_ratio/low_mean": 0.012381305918097496, "clip_ratio/low_min": 0.0007000699988566339, "clip_ratio/region_mean": 0.019186770543456078, "epoch": 0.0050301260812037315, "grad_norm": 0.10479208081960678, "learning_rate": 1e-06, "loss": 0.0201, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 453.1607360839844, "completions/mean_terminated_length": 453.1607360839844, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.005041061137902, "grad_norm": 0.5265418887138367, "learning_rate": 1e-06, "loss": 0.0453, "num_tokens": 11454437.0, "reward": 0.5017857551574707, "reward_std": 0.4265156686306, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.49246248602867126, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.008228912949562073, "clip_ratio/high_mean": 0.0048773884773254395, "clip_ratio/low_mean": 0.007236923556774855, "clip_ratio/low_min": 0.0036097359843552113, "clip_ratio/region_mean": 0.012114311568439007, "epoch": 0.005051996194600269, "grad_norm": 0.33364516496658325, "learning_rate": 1e-06, "loss": 0.0437, "step": 462 }, { "clip_ratio/high_max": 0.017579952254891396, "clip_ratio/high_mean": 0.010284372605383396, "clip_ratio/low_mean": 0.015483961440622807, "clip_ratio/low_min": 0.008579201065003872, "clip_ratio/region_mean": 0.025768335908651352, "epoch": 0.005062931251298538, "grad_norm": 0.23317837715148926, "learning_rate": 1e-06, "loss": 0.0423, "step": 463 }, { "clip_ratio/high_max": 0.02487376146018505, "clip_ratio/high_mean": 0.014340688474476337, "clip_ratio/low_mean": 0.02487536333501339, "clip_ratio/low_min": 0.013391923159360886, "clip_ratio/region_mean": 0.039216049015522, "epoch": 0.005073866307996807, "grad_norm": 0.21676428616046906, "learning_rate": 1e-06, "loss": 0.0414, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 559.0803833007812, "completions/mean_terminated_length": 559.0803833007812, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.005084801364695076, "grad_norm": 0.4848935902118683, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 11533686.0, "reward": 0.7419643998146057, "reward_std": 0.36934894323349, "rewards/accuracy_reward/mean": 0.6428571343421936, "rewards/accuracy_reward/std": 0.4813109338283539, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 465 }, { "clip_ratio/high_max": 0.011989795602858067, "clip_ratio/high_mean": 0.00677094841375947, "clip_ratio/low_mean": 0.005067888181656599, "clip_ratio/low_min": 0.0018111735116690397, "clip_ratio/region_mean": 0.011838837526738644, "epoch": 0.005095736421393345, "grad_norm": 0.24487805366516113, "learning_rate": 1e-06, "loss": 0.0051, "step": 466 }, { "clip_ratio/high_max": 0.0258485060185194, "clip_ratio/high_mean": 0.011755562387406826, "clip_ratio/low_mean": 0.00937273446470499, "clip_ratio/low_min": 0.003900989191606641, "clip_ratio/region_mean": 0.021128296852111816, "epoch": 0.005106671478091614, "grad_norm": 0.17825645208358765, "learning_rate": 1e-06, "loss": 0.0042, "step": 467 }, { "clip_ratio/high_max": 0.03641267865896225, "clip_ratio/high_mean": 0.016322042793035507, "clip_ratio/low_mean": 0.013654717244207859, "clip_ratio/low_min": 0.005331911612302065, "clip_ratio/region_mean": 0.029976757243275642, "epoch": 0.005117606534789883, "grad_norm": 0.16653281450271606, "learning_rate": 1e-06, "loss": 0.0037, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 431.9910888671875, "completions/mean_terminated_length": 431.9910888671875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.005128541591488152, "grad_norm": 0.3928585350513458, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 11598757.0, "reward": 0.4392857849597931, "reward_std": 0.3755877912044525, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.4755948781967163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.010237420909106731, "clip_ratio/high_mean": 0.0040892623364925385, "clip_ratio/low_mean": 0.005038138013333082, "clip_ratio/low_min": 0.0027797920629382133, "clip_ratio/region_mean": 0.009127399884164333, "epoch": 0.005139476648186421, "grad_norm": 0.2147270292043686, "learning_rate": 1e-06, "loss": -0.0003, "step": 470 }, { "clip_ratio/high_max": 0.022653017193078995, "clip_ratio/high_mean": 0.008746727369725704, "clip_ratio/low_mean": 0.011892237700521946, "clip_ratio/low_min": 0.008003722876310349, "clip_ratio/region_mean": 0.0206389632076025, "epoch": 0.00515041170488469, "grad_norm": 0.22797589004039764, "learning_rate": 1e-06, "loss": -0.001, "step": 471 }, { "clip_ratio/high_max": 0.0304944459348917, "clip_ratio/high_mean": 0.012384490109980106, "clip_ratio/low_mean": 0.017853735014796257, "clip_ratio/low_min": 0.013536378741264343, "clip_ratio/region_mean": 0.03023822419345379, "epoch": 0.005161346761582959, "grad_norm": 0.11909593641757965, "learning_rate": 1e-06, "loss": -0.0015, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 690.5625610351562, "completions/mean_terminated_length": 678.3333740234375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.005172281818281227, "grad_norm": 0.14080852270126343, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 11695924.0, "reward": 0.5821430087089539, "reward_std": 0.06556632369756699, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5019267797470093, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.008620689623057842, "clip_ratio/high_mean": 0.0019575986079871655, "clip_ratio/low_mean": 0.002082347171381116, "clip_ratio/low_min": 0.0013354701222851872, "clip_ratio/region_mean": 0.004039945546537638, "epoch": 0.005183216874979497, "grad_norm": 0.03831104561686516, "learning_rate": 1e-06, "loss": -0.0018, "step": 474 }, { "clip_ratio/high_max": 0.014268727973103523, "clip_ratio/high_mean": 0.003154095960780978, "clip_ratio/low_mean": 0.0029781816992908716, "clip_ratio/low_min": 0.0020000869408249855, "clip_ratio/region_mean": 0.00613227766007185, "epoch": 0.0051941519316777655, "grad_norm": 0.03890424594283104, "learning_rate": 1e-06, "loss": -0.0018, "step": 475 }, { "clip_ratio/high_max": 0.014268727973103523, "clip_ratio/high_mean": 0.003306721104308963, "clip_ratio/low_mean": 0.004091798793524504, "clip_ratio/low_min": 0.0027392494957894087, "clip_ratio/region_mean": 0.007398519665002823, "epoch": 0.005205086988376035, "grad_norm": 0.03684166818857193, "learning_rate": 1e-06, "loss": -0.0018, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 884.15185546875, "completions/mean_terminated_length": 884.15185546875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.0052160220450743035, "grad_norm": 0.16878579556941986, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 11812869.0, "reward": 0.1696428805589676, "reward_std": 0.181509330868721, "rewards/accuracy_reward/mean": 0.0714285746216774, "rewards/accuracy_reward/std": 0.25869685411453247, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 477 }, { "clip_ratio/high_max": 0.005633802618831396, "clip_ratio/high_mean": 0.002050726907327771, "clip_ratio/low_mean": 0.00281369686126709, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004864424001425505, "epoch": 0.005226957101772573, "grad_norm": 0.09884060174226761, "learning_rate": 1e-06, "loss": -0.0024, "step": 478 }, { "clip_ratio/high_max": 0.00809859111905098, "clip_ratio/high_mean": 0.00287391128949821, "clip_ratio/low_mean": 0.0064293197356164455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009303232654929161, "epoch": 0.0052378921584708416, "grad_norm": 0.07106197625398636, "learning_rate": 1e-06, "loss": -0.0027, "step": 479 }, { "clip_ratio/high_max": 0.009771126322448254, "clip_ratio/high_mean": 0.003433321136981249, "clip_ratio/low_mean": 0.010854027234017849, "clip_ratio/low_min": 0.0002581977751106024, "clip_ratio/region_mean": 0.01428734790533781, "epoch": 0.005248827215169111, "grad_norm": 0.057069312781095505, "learning_rate": 1e-06, "loss": -0.0029, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 596.1160888671875, "completions/mean_terminated_length": 596.1160888671875, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.00525976227186738, "grad_norm": 0.2975551187992096, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 11898226.0, "reward": 0.5285714864730835, "reward_std": 0.33549025654792786, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.4970957934856415, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.006796363741159439, "clip_ratio/high_mean": 0.0030402890406548977, "clip_ratio/low_mean": 0.004070701543241739, "clip_ratio/low_min": 0.0007110352744348347, "clip_ratio/region_mean": 0.007110990583896637, "epoch": 0.005270697328565648, "grad_norm": 0.18412119150161743, "learning_rate": 1e-06, "loss": -0.005, "step": 482 }, { "clip_ratio/high_max": 0.015716591849923134, "clip_ratio/high_mean": 0.006512940861284733, "clip_ratio/low_mean": 0.0097059216350317, "clip_ratio/low_min": 0.0027019339613616467, "clip_ratio/region_mean": 0.01621886156499386, "epoch": 0.005281632385263918, "grad_norm": 0.13145232200622559, "learning_rate": 1e-06, "loss": -0.0057, "step": 483 }, { "clip_ratio/high_max": 0.02123863808810711, "clip_ratio/high_mean": 0.008885049261152744, "clip_ratio/low_mean": 0.015043639577925205, "clip_ratio/low_min": 0.003542821854352951, "clip_ratio/region_mean": 0.02392868883907795, "epoch": 0.005292567441962186, "grad_norm": 0.10863419622182846, "learning_rate": 1e-06, "loss": -0.0062, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 950.52685546875, "completions/mean_terminated_length": 950.52685546875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.005303502498660456, "grad_norm": 0.2776433527469635, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 12026273.0, "reward": 0.24642862379550934, "reward_std": 0.2498413324356079, "rewards/accuracy_reward/mean": 0.1517857164144516, "rewards/accuracy_reward/std": 0.3604257106781006, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261820137500763, "step": 485 }, { "clip_ratio/high_max": 0.008128980174660683, "clip_ratio/high_mean": 0.0028587805572897196, "clip_ratio/low_mean": 0.0030891639180481434, "clip_ratio/low_min": 0.00015549681847915053, "clip_ratio/region_mean": 0.005947944242507219, "epoch": 0.005314437555358724, "grad_norm": 0.2521331310272217, "learning_rate": 1e-06, "loss": 0.0065, "step": 486 }, { "clip_ratio/high_max": 0.012057987041771412, "clip_ratio/high_mean": 0.00437897490337491, "clip_ratio/low_mean": 0.0059349676594138145, "clip_ratio/low_min": 0.0007774840341880918, "clip_ratio/region_mean": 0.010313943028450012, "epoch": 0.005325372612056994, "grad_norm": 0.1277829259634018, "learning_rate": 1e-06, "loss": 0.0062, "step": 487 }, { "clip_ratio/high_max": 0.015716027468442917, "clip_ratio/high_mean": 0.005662315990775824, "clip_ratio/low_mean": 0.009233078919351101, "clip_ratio/low_min": 0.0015031358925625682, "clip_ratio/region_mean": 0.014895395375788212, "epoch": 0.005336307668755262, "grad_norm": 0.1150631234049797, "learning_rate": 1e-06, "loss": 0.0059, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 544.0803833007812, "completions/mean_terminated_length": 544.0803833007812, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.005347242725453532, "grad_norm": 0.22184687852859497, "learning_rate": 1e-06, "loss": 0.0305, "num_tokens": 12100958.0, "reward": 0.21250003576278687, "reward_std": 0.1768874228000641, "rewards/accuracy_reward/mean": 0.1160714253783226, "rewards/accuracy_reward/std": 0.3217501640319824, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641091883182526, "step": 489 }, { "clip_ratio/high_max": 0.004530427511781454, "clip_ratio/high_mean": 0.0018791136099025607, "clip_ratio/low_mean": 0.003571560839191079, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005450674332678318, "epoch": 0.0053581777821518, "grad_norm": 0.11291354149580002, "learning_rate": 1e-06, "loss": 0.03, "step": 490 }, { "clip_ratio/high_max": 0.00636708689853549, "clip_ratio/high_mean": 0.002685852814465761, "clip_ratio/low_mean": 0.007701757829636335, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010387609712779522, "epoch": 0.00536911283885007, "grad_norm": 0.08533273637294769, "learning_rate": 1e-06, "loss": 0.0296, "step": 491 }, { "clip_ratio/high_max": 0.006244643125683069, "clip_ratio/high_mean": 0.002944458043202758, "clip_ratio/low_mean": 0.01185530424118042, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014799763448536396, "epoch": 0.005380047895548338, "grad_norm": 0.07246936857700348, "learning_rate": 1e-06, "loss": 0.0294, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 835.0267944335938, "completions/mean_terminated_length": 824.09912109375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.005390982952246607, "grad_norm": 0.25818705558776855, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 12217077.0, "reward": 0.3651786148548126, "reward_std": 0.30938220024108887, "rewards/accuracy_reward/mean": 0.2678571343421936, "rewards/accuracy_reward/std": 0.44483304023742676, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 493 }, { "clip_ratio/high_max": 0.007165755145251751, "clip_ratio/high_mean": 0.0034693903289735317, "clip_ratio/low_mean": 0.0035194477532058954, "clip_ratio/low_min": 0.00010422638297313824, "clip_ratio/region_mean": 0.006988838315010071, "epoch": 0.0054019180089448764, "grad_norm": 0.17806975543498993, "learning_rate": 1e-06, "loss": 0.0158, "step": 494 }, { "clip_ratio/high_max": 0.012634358368813992, "clip_ratio/high_mean": 0.005728443618863821, "clip_ratio/low_mean": 0.007739504333585501, "clip_ratio/low_min": 0.00015633956354577094, "clip_ratio/region_mean": 0.013467947952449322, "epoch": 0.005412853065643145, "grad_norm": 0.1852974146604538, "learning_rate": 1e-06, "loss": 0.0153, "step": 495 }, { "clip_ratio/high_max": 0.016405807808041573, "clip_ratio/high_mean": 0.007890501990914345, "clip_ratio/low_mean": 0.011560708284378052, "clip_ratio/low_min": 0.00046901870518922806, "clip_ratio/region_mean": 0.019451210275292397, "epoch": 0.0054237881223414145, "grad_norm": 0.12401580810546875, "learning_rate": 1e-06, "loss": 0.015, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 656.875, "completions/mean_terminated_length": 505.3663330078125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.005434723179039683, "grad_norm": 0.2583461105823517, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 12308847.0, "reward": 0.45982152223587036, "reward_std": 0.14213958382606506, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.483894407749176, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 497 }, { "clip_ratio/high_max": 0.009834453463554382, "clip_ratio/high_mean": 0.0031334541272372007, "clip_ratio/low_mean": 0.004276713356375694, "clip_ratio/low_min": 4.232983337715268e-05, "clip_ratio/region_mean": 0.007410167716443539, "epoch": 0.0054456582357379525, "grad_norm": 0.12310406565666199, "learning_rate": 1e-06, "loss": 0.0134, "step": 498 }, { "clip_ratio/high_max": 0.01311260461807251, "clip_ratio/high_mean": 0.00462943222373724, "clip_ratio/low_mean": 0.007556230761110783, "clip_ratio/low_min": 8.465966675430536e-05, "clip_ratio/region_mean": 0.012185662984848022, "epoch": 0.005456593292436221, "grad_norm": 0.10313910990953445, "learning_rate": 1e-06, "loss": 0.0131, "step": 499 }, { "clip_ratio/high_max": 0.016062941402196884, "clip_ratio/high_mean": 0.00588183430954814, "clip_ratio/low_mean": 0.009703909046947956, "clip_ratio/low_min": 0.00033863866701722145, "clip_ratio/region_mean": 0.015585741959512234, "epoch": 0.0054675283491344906, "grad_norm": 0.11317723244428635, "learning_rate": 1e-06, "loss": 0.0129, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 1025.419677734375, "completions/mean_terminated_length": 1025.419677734375, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.005478463405832759, "grad_norm": 0.31516894698143005, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 12445982.0, "reward": 0.35714295506477356, "reward_std": 0.40086135268211365, "rewards/accuracy_reward/mean": 0.2589285671710968, "rewards/accuracy_reward/std": 0.44001504778862, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 501 }, { "clip_ratio/high_max": 0.009021513164043427, "clip_ratio/high_mean": 0.002953395713120699, "clip_ratio/low_mean": 0.003959414083510637, "clip_ratio/low_min": 0.0013866547960788012, "clip_ratio/region_mean": 0.0069128102622926235, "epoch": 0.005489398462531029, "grad_norm": 0.20548972487449646, "learning_rate": 1e-06, "loss": 0.0061, "step": 502 }, { "clip_ratio/high_max": 0.01584547758102417, "clip_ratio/high_mean": 0.005720508750528097, "clip_ratio/low_mean": 0.010420888662338257, "clip_ratio/low_min": 0.002924655331298709, "clip_ratio/region_mean": 0.01614139787852764, "epoch": 0.005500333519229297, "grad_norm": 0.15880629420280457, "learning_rate": 1e-06, "loss": 0.0051, "step": 503 }, { "clip_ratio/high_max": 0.020356234163045883, "clip_ratio/high_mean": 0.007931982167065144, "clip_ratio/low_mean": 0.016180356964468956, "clip_ratio/low_min": 0.005106540862470865, "clip_ratio/region_mean": 0.024112340062856674, "epoch": 0.005511268575927566, "grad_norm": 0.12541724741458893, "learning_rate": 1e-06, "loss": 0.0043, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 586.0178833007812, "completions/mean_terminated_length": 586.0178833007812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.005522203632625835, "grad_norm": 0.3796822428703308, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 12529732.0, "reward": 0.5821430087089539, "reward_std": 0.300202339887619, "rewards/accuracy_reward/mean": 0.4821428656578064, "rewards/accuracy_reward/std": 0.5019267797470093, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0054083289578557014, "clip_ratio/high_mean": 0.002507934346795082, "clip_ratio/low_mean": 0.004047846887260675, "clip_ratio/low_min": 0.001327140023931861, "clip_ratio/region_mean": 0.00655578076839447, "epoch": 0.005533138689324104, "grad_norm": 0.19733446836471558, "learning_rate": 1e-06, "loss": -0.0073, "step": 506 }, { "clip_ratio/high_max": 0.013737155124545097, "clip_ratio/high_mean": 0.005071605555713177, "clip_ratio/low_mean": 0.00956796295940876, "clip_ratio/low_min": 0.004644989967346191, "clip_ratio/region_mean": 0.014639569446444511, "epoch": 0.005544073746022373, "grad_norm": 0.14907674491405487, "learning_rate": 1e-06, "loss": -0.0079, "step": 507 }, { "clip_ratio/high_max": 0.019686317071318626, "clip_ratio/high_mean": 0.006605467293411493, "clip_ratio/low_mean": 0.013346799649298191, "clip_ratio/low_min": 0.007286353502422571, "clip_ratio/region_mean": 0.019952265545725822, "epoch": 0.005555008802720642, "grad_norm": 0.13930639624595642, "learning_rate": 1e-06, "loss": -0.0083, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 653.0357666015625, "completions/mean_terminated_length": 653.0357666015625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.005565943859418911, "grad_norm": 0.2554740905761719, "learning_rate": 1e-06, "loss": -0.0113, "num_tokens": 12620484.0, "reward": 0.1446428745985031, "reward_std": 0.1443089246749878, "rewards/accuracy_reward/mean": 0.0446428582072258, "rewards/accuracy_reward/std": 0.2074466347694397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.006388318724930286, "clip_ratio/high_mean": 0.001609299099072814, "clip_ratio/low_mean": 0.005494693759828806, "clip_ratio/low_min": 0.0015450522769242525, "clip_ratio/region_mean": 0.007103992160409689, "epoch": 0.00557687891611718, "grad_norm": 0.08056098222732544, "learning_rate": 1e-06, "loss": -0.0117, "step": 510 }, { "clip_ratio/high_max": 0.007300935219973326, "clip_ratio/high_mean": 0.00211539794690907, "clip_ratio/low_mean": 0.009172008372843266, "clip_ratio/low_min": 0.003581712255254388, "clip_ratio/region_mean": 0.011287406086921692, "epoch": 0.005587813972815449, "grad_norm": 0.056747499853372574, "learning_rate": 1e-06, "loss": -0.0118, "step": 511 }, { "clip_ratio/high_max": 0.007985398173332214, "clip_ratio/high_mean": 0.002374386414885521, "clip_ratio/low_mean": 0.011971071362495422, "clip_ratio/low_min": 0.003581712255254388, "clip_ratio/region_mean": 0.014345458708703518, "epoch": 0.005598749029513718, "grad_norm": 0.04204167425632477, "learning_rate": 1e-06, "loss": -0.0118, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 638.1428833007812, "completions/mean_terminated_length": 612.5090942382812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.005609684086211987, "grad_norm": 0.3764965534210205, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 12711556.0, "reward": 0.35803574323654175, "reward_std": 0.21942807734012604, "rewards/accuracy_reward/mean": 0.2589285671710968, "rewards/accuracy_reward/std": 0.44001504778862, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 513 }, { "clip_ratio/high_max": 0.014567035250365734, "clip_ratio/high_mean": 0.005270153284072876, "clip_ratio/low_mean": 0.0041887653060257435, "clip_ratio/low_min": 8.747758693061769e-05, "clip_ratio/region_mean": 0.009458919055759907, "epoch": 0.005620619142910256, "grad_norm": 0.21607331931591034, "learning_rate": 1e-06, "loss": -0.0006, "step": 514 }, { "clip_ratio/high_max": 0.026436472311615944, "clip_ratio/high_mean": 0.008598108775913715, "clip_ratio/low_mean": 0.008480672724545002, "clip_ratio/low_min": 0.0003061715397052467, "clip_ratio/region_mean": 0.017078781500458717, "epoch": 0.005631554199608525, "grad_norm": 0.1634996384382248, "learning_rate": 1e-06, "loss": -0.0014, "step": 515 }, { "clip_ratio/high_max": 0.03398974984884262, "clip_ratio/high_mean": 0.011294850148260593, "clip_ratio/low_mean": 0.013714409433305264, "clip_ratio/low_min": 0.00048112671356648207, "clip_ratio/region_mean": 0.025009257718920708, "epoch": 0.005642489256306794, "grad_norm": 0.1283883899450302, "learning_rate": 1e-06, "loss": -0.0018, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 565.4910888671875, "completions/mean_terminated_length": 565.4910888671875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.005653424313005063, "grad_norm": 0.5500590205192566, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 12793987.0, "reward": 0.3669643700122833, "reward_std": 0.3848356008529663, "rewards/accuracy_reward/mean": 0.2678571343421936, "rewards/accuracy_reward/std": 0.44483304023742676, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 517 }, { "clip_ratio/high_max": 0.014703425578773022, "clip_ratio/high_mean": 0.005474448204040527, "clip_ratio/low_mean": 0.006305765826255083, "clip_ratio/low_min": 0.0022727272007614374, "clip_ratio/region_mean": 0.011780215427279472, "epoch": 0.005664359369703332, "grad_norm": 0.3416762948036194, "learning_rate": 1e-06, "loss": -0.0047, "step": 518 }, { "clip_ratio/high_max": 0.026798561215400696, "clip_ratio/high_mean": 0.010520289652049541, "clip_ratio/low_mean": 0.01437010895460844, "clip_ratio/low_min": 0.005085015203803778, "clip_ratio/region_mean": 0.024890396744012833, "epoch": 0.005675294426401601, "grad_norm": 0.23459996283054352, "learning_rate": 1e-06, "loss": -0.006, "step": 519 }, { "clip_ratio/high_max": 0.03866906464099884, "clip_ratio/high_mean": 0.014111851342022419, "clip_ratio/low_mean": 0.021950827911496162, "clip_ratio/low_min": 0.008257575333118439, "clip_ratio/region_mean": 0.036062683910131454, "epoch": 0.00568622948309987, "grad_norm": 0.18985004723072052, "learning_rate": 1e-06, "loss": -0.0071, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 636.8214721679688, "completions/mean_terminated_length": 636.8214721679688, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.005697164539798139, "grad_norm": 0.38513264060020447, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 12880571.0, "reward": 0.43839290738105774, "reward_std": 0.38152584433555603, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.4755948781967163, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 521 }, { "clip_ratio/high_max": 0.006962435320019722, "clip_ratio/high_mean": 0.0032361086923629045, "clip_ratio/low_mean": 0.005177970975637436, "clip_ratio/low_min": 0.0025906735099852085, "clip_ratio/region_mean": 0.008414079435169697, "epoch": 0.005708099596496408, "grad_norm": 0.2151951789855957, "learning_rate": 1e-06, "loss": 0.004, "step": 522 }, { "clip_ratio/high_max": 0.015058290213346481, "clip_ratio/high_mean": 0.007323872298002243, "clip_ratio/low_mean": 0.012696812860667706, "clip_ratio/low_min": 0.005181347019970417, "clip_ratio/region_mean": 0.020020684227347374, "epoch": 0.005719034653194677, "grad_norm": 0.15684561431407928, "learning_rate": 1e-06, "loss": 0.0032, "step": 523 }, { "clip_ratio/high_max": 0.02056347206234932, "clip_ratio/high_mean": 0.009959644638001919, "clip_ratio/low_mean": 0.018327560275793076, "clip_ratio/low_min": 0.0077720205299556255, "clip_ratio/region_mean": 0.02828720211982727, "epoch": 0.005729969709892946, "grad_norm": 0.13466887176036835, "learning_rate": 1e-06, "loss": 0.0027, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 651.9375, "completions/mean_terminated_length": 639.3603515625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.005740904766591215, "grad_norm": 0.39543724060058594, "learning_rate": 1e-06, "loss": -0.013, "num_tokens": 12969100.0, "reward": 0.4830358028411865, "reward_std": 0.23922109603881836, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48852667212486267, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 525 }, { "clip_ratio/high_max": 0.016605166718363762, "clip_ratio/high_mean": 0.0038470395375043154, "clip_ratio/low_mean": 0.006046221125870943, "clip_ratio/low_min": 0.001238416531123221, "clip_ratio/region_mean": 0.009893259964883327, "epoch": 0.005751839823289483, "grad_norm": 0.22944913804531097, "learning_rate": 1e-06, "loss": -0.0136, "step": 526 }, { "clip_ratio/high_max": 0.023575235158205032, "clip_ratio/high_mean": 0.005811909679323435, "clip_ratio/low_mean": 0.010885953903198242, "clip_ratio/low_min": 0.0023709451779723167, "clip_ratio/region_mean": 0.01669786497950554, "epoch": 0.005762774879987753, "grad_norm": 0.2336900532245636, "learning_rate": 1e-06, "loss": -0.0141, "step": 527 }, { "clip_ratio/high_max": 0.025420254096388817, "clip_ratio/high_mean": 0.006819969974458218, "clip_ratio/low_mean": 0.014796075411140919, "clip_ratio/low_min": 0.004410143475979567, "clip_ratio/region_mean": 0.021616045385599136, "epoch": 0.005773709936686021, "grad_norm": 0.1361958384513855, "learning_rate": 1e-06, "loss": -0.0144, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 620.5089721679688, "completions/mean_terminated_length": 620.5089721679688, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.005784644993384291, "grad_norm": 0.36565887928009033, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 13056433.0, "reward": 0.4366072118282318, "reward_std": 0.17199577391147614, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.4755948781967163, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 529 }, { "clip_ratio/high_max": 0.009706689044833183, "clip_ratio/high_mean": 0.0026620293501764536, "clip_ratio/low_mean": 0.006622757762670517, "clip_ratio/low_min": 0.00026448030257597566, "clip_ratio/region_mean": 0.009284786880016327, "epoch": 0.0057955800500825595, "grad_norm": 0.22273671627044678, "learning_rate": 1e-06, "loss": 0.0079, "step": 530 }, { "clip_ratio/high_max": 0.011394809000194073, "clip_ratio/high_mean": 0.004233020823448896, "clip_ratio/low_mean": 0.011737152934074402, "clip_ratio/low_min": 0.000793440907727927, "clip_ratio/region_mean": 0.015970174223184586, "epoch": 0.005806515106780829, "grad_norm": 0.12211643159389496, "learning_rate": 1e-06, "loss": 0.0077, "step": 531 }, { "clip_ratio/high_max": 0.014138003811240196, "clip_ratio/high_mean": 0.005125329829752445, "clip_ratio/low_mean": 0.016670716926455498, "clip_ratio/low_min": 0.0005289606051519513, "clip_ratio/region_mean": 0.02179604582488537, "epoch": 0.0058174501634790975, "grad_norm": 0.11796583235263824, "learning_rate": 1e-06, "loss": 0.0075, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 517.5625, "completions/mean_terminated_length": 517.5625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.005828385220177367, "grad_norm": 0.37172913551330566, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 13128848.0, "reward": 0.27767860889434814, "reward_std": 0.119735486805439, "rewards/accuracy_reward/mean": 0.1785714328289032, "rewards/accuracy_reward/std": 0.38471439480781555, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 533 }, { "clip_ratio/high_max": 0.014101695269346237, "clip_ratio/high_mean": 0.004439380951225758, "clip_ratio/low_mean": 0.004408924840390682, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008848304860293865, "epoch": 0.0058393202768756355, "grad_norm": 0.18692101538181305, "learning_rate": 1e-06, "loss": 0.0047, "step": 534 }, { "clip_ratio/high_max": 0.02833898365497589, "clip_ratio/high_mean": 0.007986525073647499, "clip_ratio/low_mean": 0.007983866147696972, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015970392152667046, "epoch": 0.005850255333573905, "grad_norm": 0.13162776827812195, "learning_rate": 1e-06, "loss": 0.0041, "step": 535 }, { "clip_ratio/high_max": 0.036745764315128326, "clip_ratio/high_mean": 0.010496946983039379, "clip_ratio/low_mean": 0.01015507709234953, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020652025938034058, "epoch": 0.005861190390272174, "grad_norm": 0.10337212681770325, "learning_rate": 1e-06, "loss": 0.0037, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 830.5178833007812, "completions/mean_terminated_length": 761.603759765625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.005872125446970442, "grad_norm": 0.3383430242538452, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 13242222.0, "reward": 0.22321432828903198, "reward_std": 0.26055994629859924, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33220529556274414, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 537 }, { "clip_ratio/high_max": 0.00832736399024725, "clip_ratio/high_mean": 0.002975282957777381, "clip_ratio/low_mean": 0.004707090090960264, "clip_ratio/low_min": 0.0014439354417845607, "clip_ratio/region_mean": 0.007682372350245714, "epoch": 0.005883060503668712, "grad_norm": 0.2142666131258011, "learning_rate": 1e-06, "loss": -0.0057, "step": 538 }, { "clip_ratio/high_max": 0.017729226499795914, "clip_ratio/high_mean": 0.005975401494652033, "clip_ratio/low_mean": 0.012942035682499409, "clip_ratio/low_min": 0.002987452782690525, "clip_ratio/region_mean": 0.01891743764281273, "epoch": 0.00589399556036698, "grad_norm": 0.14318078756332397, "learning_rate": 1e-06, "loss": -0.0067, "step": 539 }, { "clip_ratio/high_max": 0.026952005922794342, "clip_ratio/high_mean": 0.008110282942652702, "clip_ratio/low_mean": 0.021230267360806465, "clip_ratio/low_min": 0.005028878804296255, "clip_ratio/region_mean": 0.029340548440814018, "epoch": 0.00590493061706525, "grad_norm": 0.1282573640346527, "learning_rate": 1e-06, "loss": -0.0073, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1676.0, "completions/max_terminated_length": 1676.0, "completions/mean_length": 614.3214721679688, "completions/mean_terminated_length": 614.3214721679688, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.005915865673763518, "grad_norm": 0.4413740038871765, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 13328134.0, "reward": 0.2678571939468384, "reward_std": 0.302952378988266, "rewards/accuracy_reward/mean": 0.1785714328289032, "rewards/accuracy_reward/std": 0.38471439480781555, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3106848895549774, "step": 541 }, { "clip_ratio/high_max": 0.009156310930848122, "clip_ratio/high_mean": 0.0043806042522192, "clip_ratio/low_mean": 0.00821332074701786, "clip_ratio/low_min": 0.00042496356763876975, "clip_ratio/region_mean": 0.012593925930559635, "epoch": 0.005926800730461788, "grad_norm": 0.3317408561706543, "learning_rate": 1e-06, "loss": -0.0011, "step": 542 }, { "clip_ratio/high_max": 0.012426422908902168, "clip_ratio/high_mean": 0.006080091930925846, "clip_ratio/low_mean": 0.013754297979176044, "clip_ratio/low_min": 0.00030354541377164423, "clip_ratio/region_mean": 0.01983438991010189, "epoch": 0.005937735787160056, "grad_norm": 0.19401122629642487, "learning_rate": 1e-06, "loss": -0.0016, "step": 543 }, { "clip_ratio/high_max": 0.016132548451423645, "clip_ratio/high_mean": 0.00806559156626463, "clip_ratio/low_mean": 0.017884671688079834, "clip_ratio/low_min": 0.0011534725781530142, "clip_ratio/region_mean": 0.025950266048312187, "epoch": 0.005948670843858326, "grad_norm": 0.19979247450828552, "learning_rate": 1e-06, "loss": -0.0021, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 717.794677734375, "completions/mean_terminated_length": 717.794677734375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.005959605900556594, "grad_norm": 0.2603328824043274, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 13425859.0, "reward": 0.35089290142059326, "reward_std": 0.20604102313518524, "rewards/accuracy_reward/mean": 0.2589285671710968, "rewards/accuracy_reward/std": 0.4400150775909424, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.27306708693504333, "step": 545 }, { "clip_ratio/high_max": 0.007340823765844107, "clip_ratio/high_mean": 0.003485785098746419, "clip_ratio/low_mean": 0.0035077917855232954, "clip_ratio/low_min": 0.00011319259647279978, "clip_ratio/region_mean": 0.006993576884269714, "epoch": 0.005970540957254864, "grad_norm": 0.12479767203330994, "learning_rate": 1e-06, "loss": -0.0005, "step": 546 }, { "clip_ratio/high_max": 0.01393258385360241, "clip_ratio/high_mean": 0.005820327438414097, "clip_ratio/low_mean": 0.006731193512678146, "clip_ratio/low_min": 0.00033957778941839933, "clip_ratio/region_mean": 0.012551521882414818, "epoch": 0.005981476013953132, "grad_norm": 0.07999002188444138, "learning_rate": 1e-06, "loss": -0.0008, "step": 547 }, { "clip_ratio/high_max": 0.020074905827641487, "clip_ratio/high_mean": 0.007880987599492073, "clip_ratio/low_mean": 0.010332835838198662, "clip_ratio/low_min": 0.0006225592806003988, "clip_ratio/region_mean": 0.018213823437690735, "epoch": 0.005992411070651401, "grad_norm": 0.07563266903162003, "learning_rate": 1e-06, "loss": -0.001, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 644.5267944335938, "completions/mean_terminated_length": 644.5267944335938, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.00600334612734967, "grad_norm": 0.33417174220085144, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 13515210.0, "reward": 0.42500004172325134, "reward_std": 0.32547369599342346, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.4755948483943939, "rewards/format_reward/mean": 0.8571428656578064, "rewards/format_reward/std": 0.3514997959136963, "step": 549 }, { "clip_ratio/high_max": 0.006873428355902433, "clip_ratio/high_mean": 0.002750448416918516, "clip_ratio/low_mean": 0.0051689473912119865, "clip_ratio/low_min": 0.002300690161064267, "clip_ratio/region_mean": 0.007919395342469215, "epoch": 0.006014281184047939, "grad_norm": 0.17088699340820312, "learning_rate": 1e-06, "loss": 0.0329, "step": 550 }, { "clip_ratio/high_max": 0.015423302538692951, "clip_ratio/high_mean": 0.005265102256089449, "clip_ratio/low_mean": 0.011462089605629444, "clip_ratio/low_min": 0.004692832939326763, "clip_ratio/region_mean": 0.01672719046473503, "epoch": 0.0060252162407462085, "grad_norm": 0.11298159509897232, "learning_rate": 1e-06, "loss": 0.0323, "step": 551 }, { "clip_ratio/high_max": 0.02145850844681263, "clip_ratio/high_mean": 0.006891316268593073, "clip_ratio/low_mean": 0.01794988475739956, "clip_ratio/low_min": 0.010452218353748322, "clip_ratio/region_mean": 0.024841202422976494, "epoch": 0.006036151297444477, "grad_norm": 0.0922405868768692, "learning_rate": 1e-06, "loss": 0.0319, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 792.9375610351562, "completions/mean_terminated_length": 669.8921508789062, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.0060470863541427465, "grad_norm": 0.3363291919231415, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 13621771.0, "reward": 0.6133929491043091, "reward_std": 0.23530003428459167, "rewards/accuracy_reward/mean": 0.5267857313156128, "rewards/accuracy_reward/std": 0.5015259385108948, "rewards/format_reward/mean": 0.8660714030265808, "rewards/format_reward/std": 0.3421062231063843, "step": 553 }, { "clip_ratio/high_max": 0.006901840679347515, "clip_ratio/high_mean": 0.003219356993213296, "clip_ratio/low_mean": 0.00345010869204998, "clip_ratio/low_min": 0.0016586737474426627, "clip_ratio/region_mean": 0.00666946591809392, "epoch": 0.006058021410841015, "grad_norm": 0.16413035988807678, "learning_rate": 1e-06, "loss": -0.0096, "step": 554 }, { "clip_ratio/high_max": 0.012781186029314995, "clip_ratio/high_mean": 0.006273639388382435, "clip_ratio/low_mean": 0.007442645728588104, "clip_ratio/low_min": 0.002488010562956333, "clip_ratio/region_mean": 0.013716285116970539, "epoch": 0.0060689564675392845, "grad_norm": 0.12281201034784317, "learning_rate": 1e-06, "loss": -0.01, "step": 555 }, { "clip_ratio/high_max": 0.01648773066699505, "clip_ratio/high_mean": 0.008119535632431507, "clip_ratio/low_mean": 0.010359399951994419, "clip_ratio/low_min": 0.0028846501372754574, "clip_ratio/region_mean": 0.018478933721780777, "epoch": 0.006079891524237553, "grad_norm": 0.10092619806528091, "learning_rate": 1e-06, "loss": -0.0102, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 530.857177734375, "completions/mean_terminated_length": 530.857177734375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.006090826580935822, "grad_norm": 0.39483824372291565, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 13695059.0, "reward": 0.4294643700122833, "reward_std": 0.17432238161563873, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.47245559096336365, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 557 }, { "clip_ratio/high_max": 0.00801559817045927, "clip_ratio/high_mean": 0.002640588441863656, "clip_ratio/low_mean": 0.007608667481690645, "clip_ratio/low_min": 0.0008345503592863679, "clip_ratio/region_mean": 0.010249256156384945, "epoch": 0.006101761637634091, "grad_norm": 0.172408789396286, "learning_rate": 1e-06, "loss": 0.0057, "step": 558 }, { "clip_ratio/high_max": 0.009965337812900543, "clip_ratio/high_mean": 0.00407788110896945, "clip_ratio/low_mean": 0.013714678585529327, "clip_ratio/low_min": 0.001507483539171517, "clip_ratio/region_mean": 0.017792562022805214, "epoch": 0.00611269669433236, "grad_norm": 0.10948218405246735, "learning_rate": 1e-06, "loss": 0.0052, "step": 559 }, { "clip_ratio/high_max": 0.01161688007414341, "clip_ratio/high_mean": 0.004637538455426693, "clip_ratio/low_mean": 0.01841801404953003, "clip_ratio/low_min": 0.0028534510638564825, "clip_ratio/region_mean": 0.023055549710989, "epoch": 0.006123631751030629, "grad_norm": 0.09467586874961853, "learning_rate": 1e-06, "loss": 0.005, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 761.4107666015625, "completions/mean_terminated_length": 761.4107666015625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.006134566807728898, "grad_norm": 0.303372859954834, "learning_rate": 1e-06, "loss": 0.0388, "num_tokens": 13797081.0, "reward": 0.40178579092025757, "reward_std": 0.2661088705062866, "rewards/accuracy_reward/mean": 0.3035714328289032, "rewards/accuracy_reward/std": 0.46186625957489014, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 561 }, { "clip_ratio/high_max": 0.007885842584073544, "clip_ratio/high_mean": 0.00276687229052186, "clip_ratio/low_mean": 0.004856078419834375, "clip_ratio/low_min": 0.001528532593511045, "clip_ratio/region_mean": 0.007622950244694948, "epoch": 0.006145501864427167, "grad_norm": 0.17705944180488586, "learning_rate": 1e-06, "loss": 0.038, "step": 562 }, { "clip_ratio/high_max": 0.015521341934800148, "clip_ratio/high_mean": 0.0050161839462816715, "clip_ratio/low_mean": 0.010469188913702965, "clip_ratio/low_min": 0.002758328104391694, "clip_ratio/region_mean": 0.015485371463000774, "epoch": 0.006156436921125436, "grad_norm": 0.1367868036031723, "learning_rate": 1e-06, "loss": 0.0373, "step": 563 }, { "clip_ratio/high_max": 0.01915133371949196, "clip_ratio/high_mean": 0.006316826678812504, "clip_ratio/low_mean": 0.015454218722879887, "clip_ratio/low_min": 0.0031419836450368166, "clip_ratio/region_mean": 0.02177104353904724, "epoch": 0.006167371977823705, "grad_norm": 0.10915742069482803, "learning_rate": 1e-06, "loss": 0.0368, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.0, "completions/max_terminated_length": 1345.0, "completions/mean_length": 608.4732666015625, "completions/mean_terminated_length": 608.4732666015625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.006178307034521974, "grad_norm": 0.3089020550251007, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 13881750.0, "reward": 0.2964286208152771, "reward_std": 0.1574852168560028, "rewards/accuracy_reward/mean": 0.1964285671710968, "rewards/accuracy_reward/std": 0.3990819752216339, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.008297378197312355, "clip_ratio/high_mean": 0.0022017331793904305, "clip_ratio/low_mean": 0.005317286588251591, "clip_ratio/low_min": 0.0011157317785546184, "clip_ratio/region_mean": 0.007519019301980734, "epoch": 0.006189242091220243, "grad_norm": 0.13535264134407043, "learning_rate": 1e-06, "loss": 0.0083, "step": 566 }, { "clip_ratio/high_max": 0.01576501876115799, "clip_ratio/high_mean": 0.004217927809804678, "clip_ratio/low_mean": 0.009583241306245327, "clip_ratio/low_min": 0.0013185921125113964, "clip_ratio/region_mean": 0.013801167719066143, "epoch": 0.006200177147918512, "grad_norm": 0.09356275200843811, "learning_rate": 1e-06, "loss": 0.008, "step": 567 }, { "clip_ratio/high_max": 0.022071026265621185, "clip_ratio/high_mean": 0.005682411603629589, "clip_ratio/low_mean": 0.012921721674501896, "clip_ratio/low_min": 0.0014200223376974463, "clip_ratio/region_mean": 0.018604131415486336, "epoch": 0.0062111122046167805, "grad_norm": 0.12365998327732086, "learning_rate": 1e-06, "loss": 0.0078, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 1028.732177734375, "completions/mean_terminated_length": 981.102783203125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.00622204726131505, "grad_norm": 0.24703601002693176, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 14013572.0, "reward": 0.23750002682209015, "reward_std": 0.2814064919948578, "rewards/accuracy_reward/mean": 0.1428571492433548, "rewards/accuracy_reward/std": 0.3514998257160187, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261820137500763, "step": 569 }, { "clip_ratio/high_max": 0.007426873780786991, "clip_ratio/high_mean": 0.0027375665958970785, "clip_ratio/low_mean": 0.004238673951476812, "clip_ratio/low_min": 0.00011429224832681939, "clip_ratio/region_mean": 0.006976240314543247, "epoch": 0.0062329823180133186, "grad_norm": 0.146487757563591, "learning_rate": 1e-06, "loss": 0.0056, "step": 570 }, { "clip_ratio/high_max": 0.011540219187736511, "clip_ratio/high_mean": 0.003926779609173536, "clip_ratio/low_mean": 0.009626020677387714, "clip_ratio/low_min": 0.00017143836885225028, "clip_ratio/region_mean": 0.013552799820899963, "epoch": 0.006243917374711588, "grad_norm": 0.11464876681566238, "learning_rate": 1e-06, "loss": 0.005, "step": 571 }, { "clip_ratio/high_max": 0.013368372805416584, "clip_ratio/high_mean": 0.004599213600158691, "clip_ratio/low_mean": 0.014891296625137329, "clip_ratio/low_min": 0.0005143150920048356, "clip_ratio/region_mean": 0.01949051022529602, "epoch": 0.006254852431409857, "grad_norm": 0.09995519369840622, "learning_rate": 1e-06, "loss": 0.0047, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 902.5714721679688, "completions/mean_terminated_length": 871.0458374023438, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.006265787488108126, "grad_norm": 0.017342621460556984, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 14134120.0, "reward": 0.34196433424949646, "reward_std": 0.016225649043917656, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4349588453769684, "rewards/format_reward/mean": 0.9196428656578064, "rewards/format_reward/std": 0.27306708693504333, "step": 573 }, { "clip_ratio/high_max": 0.003071081591770053, "clip_ratio/high_mean": 0.001394810271449387, "clip_ratio/low_mean": 0.0009470749064348638, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00234188511967659, "epoch": 0.006276722544806395, "grad_norm": 0.015070304274559021, "learning_rate": 1e-06, "loss": -0.0014, "step": 574 }, { "clip_ratio/high_max": 0.0036852979101240635, "clip_ratio/high_mean": 0.0017461201641708612, "clip_ratio/low_mean": 0.0015159818576648831, "clip_ratio/low_min": 0.00022335138055495918, "clip_ratio/region_mean": 0.003262102138251066, "epoch": 0.006287657601504664, "grad_norm": 0.012885691598057747, "learning_rate": 1e-06, "loss": -0.0014, "step": 575 }, { "clip_ratio/high_max": 0.004355351906269789, "clip_ratio/high_mean": 0.002148828934878111, "clip_ratio/low_mean": 0.002094811527058482, "clip_ratio/low_min": 0.00022335138055495918, "clip_ratio/region_mean": 0.004243640694767237, "epoch": 0.006298592658202933, "grad_norm": 0.011236459948122501, "learning_rate": 1e-06, "loss": -0.0014, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 843.0357666015625, "completions/mean_terminated_length": 832.18017578125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.006309527714901202, "grad_norm": 0.3779085874557495, "learning_rate": 1e-06, "loss": -0.0119, "num_tokens": 14246852.0, "reward": 0.329464316368103, "reward_std": 0.29899537563323975, "rewards/accuracy_reward/mean": 0.2321428507566452, "rewards/accuracy_reward/std": 0.4240971803665161, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 577 }, { "clip_ratio/high_max": 0.021071644499897957, "clip_ratio/high_mean": 0.005266234744340181, "clip_ratio/low_mean": 0.004421980120241642, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00968821533024311, "epoch": 0.006320462771599471, "grad_norm": 0.2259678691625595, "learning_rate": 1e-06, "loss": -0.013, "step": 578 }, { "clip_ratio/high_max": 0.033313266932964325, "clip_ratio/high_mean": 0.009074115194380283, "clip_ratio/low_mean": 0.00956934504210949, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018643459305167198, "epoch": 0.006331397828297739, "grad_norm": 0.1589677482843399, "learning_rate": 1e-06, "loss": -0.0137, "step": 579 }, { "clip_ratio/high_max": 0.0397351011633873, "clip_ratio/high_mean": 0.011253027245402336, "clip_ratio/low_mean": 0.014576733112335205, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02582976035773754, "epoch": 0.006342332884996009, "grad_norm": 0.13397908210754395, "learning_rate": 1e-06, "loss": -0.0141, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 490.71429443359375, "completions/mean_terminated_length": 490.71429443359375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.006353267941694277, "grad_norm": 0.5235233306884766, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 14320012.0, "reward": 0.4830358028411865, "reward_std": 0.33753934502601624, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48852667212486267, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 581 }, { "clip_ratio/high_max": 0.00842481479048729, "clip_ratio/high_mean": 0.005162827670574188, "clip_ratio/low_mean": 0.007260608021169901, "clip_ratio/low_min": 0.0015058729331940413, "clip_ratio/region_mean": 0.012423434294760227, "epoch": 0.006364202998392547, "grad_norm": 0.33995750546455383, "learning_rate": 1e-06, "loss": 0.0147, "step": 582 }, { "clip_ratio/high_max": 0.018434962257742882, "clip_ratio/high_mean": 0.010722504928708076, "clip_ratio/low_mean": 0.014621355570852757, "clip_ratio/low_min": 0.0026101795956492424, "clip_ratio/region_mean": 0.025343861430883408, "epoch": 0.006375138055090815, "grad_norm": 0.24851417541503906, "learning_rate": 1e-06, "loss": 0.0137, "step": 583 }, { "clip_ratio/high_max": 0.02230243571102619, "clip_ratio/high_mean": 0.01349236723035574, "clip_ratio/low_mean": 0.020065443590283394, "clip_ratio/low_min": 0.002509788144379854, "clip_ratio/region_mean": 0.03355780988931656, "epoch": 0.006386073111789085, "grad_norm": 0.26387274265289307, "learning_rate": 1e-06, "loss": 0.0133, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 650.8839721679688, "completions/mean_terminated_length": 650.8839721679688, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.0063970081684873534, "grad_norm": 0.17803536355495453, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 14408563.0, "reward": 0.3660714328289032, "reward_std": 0.1354372799396515, "rewards/accuracy_reward/mean": 0.2678571343421936, "rewards/accuracy_reward/std": 0.44483304023742676, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 585 }, { "clip_ratio/high_max": 0.004818902350962162, "clip_ratio/high_mean": 0.003301844000816345, "clip_ratio/low_mean": 0.0014063202543184161, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004708164371550083, "epoch": 0.006407943225185623, "grad_norm": 0.05491362884640694, "learning_rate": 1e-06, "loss": -0.007, "step": 586 }, { "clip_ratio/high_max": 0.006217938847839832, "clip_ratio/high_mean": 0.004148630890995264, "clip_ratio/low_mean": 0.0027588156517595053, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006907446775585413, "epoch": 0.0064188782818838915, "grad_norm": 0.04065268859267235, "learning_rate": 1e-06, "loss": -0.007, "step": 587 }, { "clip_ratio/high_max": 0.008705114014446735, "clip_ratio/high_mean": 0.005647982005029917, "clip_ratio/low_mean": 0.004520478192716837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010168458335101604, "epoch": 0.006429813338582161, "grad_norm": 0.03386669233441353, "learning_rate": 1e-06, "loss": -0.0071, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1691.0, "completions/mean_length": 640.607177734375, "completions/mean_terminated_length": 615.0181884765625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.0064407483952804295, "grad_norm": 0.2788974344730377, "learning_rate": 1e-06, "loss": 0.0103, "num_tokens": 14493051.0, "reward": 0.4901786744594574, "reward_std": 0.19407814741134644, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.4905804991722107, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 589 }, { "clip_ratio/high_max": 0.007749713025987148, "clip_ratio/high_mean": 0.001954990904778242, "clip_ratio/low_mean": 0.003955877386033535, "clip_ratio/low_min": 0.0009090909152291715, "clip_ratio/region_mean": 0.005910868290811777, "epoch": 0.006451683451978698, "grad_norm": 0.14411614835262299, "learning_rate": 1e-06, "loss": 0.0099, "step": 590 }, { "clip_ratio/high_max": 0.01205510925501585, "clip_ratio/high_mean": 0.00304572656750679, "clip_ratio/low_mean": 0.007112984079867601, "clip_ratio/low_min": 0.001818181830458343, "clip_ratio/region_mean": 0.010158711113035679, "epoch": 0.006462618508676968, "grad_norm": 0.15235473215579987, "learning_rate": 1e-06, "loss": 0.0097, "step": 591 }, { "clip_ratio/high_max": 0.013203214854001999, "clip_ratio/high_mean": 0.0036587752401828766, "clip_ratio/low_mean": 0.00997158419340849, "clip_ratio/low_min": 0.0009090909152291715, "clip_ratio/region_mean": 0.01363036036491394, "epoch": 0.006473553565375236, "grad_norm": 0.11725945770740509, "learning_rate": 1e-06, "loss": 0.0096, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 765.3482666015625, "completions/mean_terminated_length": 753.7927856445312, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.006484488622073506, "grad_norm": 0.20867061614990234, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 14597082.0, "reward": 0.13482144474983215, "reward_std": 0.12639759480953217, "rewards/accuracy_reward/mean": 0.0357142873108387, "rewards/accuracy_reward/std": 0.18641093373298645, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 593 }, { "clip_ratio/high_max": 0.0030090271029621363, "clip_ratio/high_mean": 0.0009950101375579834, "clip_ratio/low_mean": 0.0061875744722783566, "clip_ratio/low_min": 0.0007552870083600283, "clip_ratio/region_mean": 0.00718258460983634, "epoch": 0.006495423678771774, "grad_norm": 0.07878584414720535, "learning_rate": 1e-06, "loss": 0.0081, "step": 594 }, { "clip_ratio/high_max": 0.004262788221240044, "clip_ratio/high_mean": 0.0014683343470096588, "clip_ratio/low_mean": 0.010896573774516582, "clip_ratio/low_min": 0.001085725030861795, "clip_ratio/region_mean": 0.012364907190203667, "epoch": 0.006506358735470044, "grad_norm": 0.061055950820446014, "learning_rate": 1e-06, "loss": 0.0079, "step": 595 }, { "clip_ratio/high_max": 0.004513540770858526, "clip_ratio/high_mean": 0.00166746333707124, "clip_ratio/low_mean": 0.01559179276227951, "clip_ratio/low_min": 0.0017466011922806501, "clip_ratio/region_mean": 0.017259256914258003, "epoch": 0.006517293792168312, "grad_norm": 0.0464322492480278, "learning_rate": 1e-06, "loss": 0.0078, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 827.1607666015625, "completions/mean_terminated_length": 804.963623046875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.006528228848866582, "grad_norm": 0.43270716071128845, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 14709632.0, "reward": 0.34017863869667053, "reward_std": 0.29568350315093994, "rewards/accuracy_reward/mean": 0.2410714328289032, "rewards/accuracy_reward/std": 0.42965590953826904, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 597 }, { "clip_ratio/high_max": 0.010503441095352173, "clip_ratio/high_mean": 0.003566670697182417, "clip_ratio/low_mean": 0.006743399892002344, "clip_ratio/low_min": 0.0018527095671743155, "clip_ratio/region_mean": 0.010310070589184761, "epoch": 0.00653916390556485, "grad_norm": 0.2306758314371109, "learning_rate": 1e-06, "loss": -0.0056, "step": 598 }, { "clip_ratio/high_max": 0.014668598771095276, "clip_ratio/high_mean": 0.006013710517436266, "clip_ratio/low_mean": 0.011791030876338482, "clip_ratio/low_min": 0.0021071643568575382, "clip_ratio/region_mean": 0.017804741859436035, "epoch": 0.00655009896226312, "grad_norm": 0.19701622426509857, "learning_rate": 1e-06, "loss": -0.006, "step": 599 }, { "clip_ratio/high_max": 0.018471568822860718, "clip_ratio/high_mean": 0.007293575443327427, "clip_ratio/low_mean": 0.016952181234955788, "clip_ratio/low_min": 0.0026590407360345125, "clip_ratio/region_mean": 0.02424575574696064, "epoch": 0.006561034018961388, "grad_norm": 0.14441682398319244, "learning_rate": 1e-06, "loss": -0.0064, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 606.357177734375, "completions/mean_terminated_length": 606.357177734375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.006571969075659657, "grad_norm": 0.35325759649276733, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 14793476.0, "reward": 0.3321429193019867, "reward_std": 0.26966434717178345, "rewards/accuracy_reward/mean": 0.2321428507566452, "rewards/accuracy_reward/std": 0.4240971803665161, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.005964955780655146, "clip_ratio/high_mean": 0.002677193144336343, "clip_ratio/low_mean": 0.005003637634217739, "clip_ratio/low_min": 0.0020564848091453314, "clip_ratio/region_mean": 0.007680830545723438, "epoch": 0.006582904132357926, "grad_norm": 0.1568213850259781, "learning_rate": 1e-06, "loss": -0.008, "step": 602 }, { "clip_ratio/high_max": 0.00842956081032753, "clip_ratio/high_mean": 0.004468908999115229, "clip_ratio/low_mean": 0.012330198660492897, "clip_ratio/low_min": 0.00402729632332921, "clip_ratio/region_mean": 0.016799109056591988, "epoch": 0.006593839189056195, "grad_norm": 0.1325366050004959, "learning_rate": 1e-06, "loss": -0.0084, "step": 603 }, { "clip_ratio/high_max": 0.009468821808695793, "clip_ratio/high_mean": 0.0056105912663042545, "clip_ratio/low_mean": 0.018204864114522934, "clip_ratio/low_min": 0.005415409803390503, "clip_ratio/region_mean": 0.0238154549151659, "epoch": 0.006604774245754464, "grad_norm": 0.0996580719947815, "learning_rate": 1e-06, "loss": -0.0087, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1378.0, "completions/max_terminated_length": 1378.0, "completions/mean_length": 688.0089721679688, "completions/mean_terminated_length": 688.0089721679688, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.006615709302452733, "grad_norm": 0.3802414536476135, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 14889909.0, "reward": 0.5017857551574707, "reward_std": 0.23479939997196198, "rewards/accuracy_reward/mean": 0.4017857015132904, "rewards/accuracy_reward/std": 0.4924624562263489, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.00988852884620428, "clip_ratio/high_mean": 0.002590281656011939, "clip_ratio/low_mean": 0.005536864977329969, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008127146400511265, "epoch": 0.0066266443591510025, "grad_norm": 0.189643993973732, "learning_rate": 1e-06, "loss": 0.0163, "step": 606 }, { "clip_ratio/high_max": 0.019417475908994675, "clip_ratio/high_mean": 0.005578275304287672, "clip_ratio/low_mean": 0.009761680848896503, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015339955687522888, "epoch": 0.006637579415849271, "grad_norm": 0.15638317167758942, "learning_rate": 1e-06, "loss": 0.0158, "step": 607 }, { "clip_ratio/high_max": 0.01977705769240856, "clip_ratio/high_mean": 0.006028846371918917, "clip_ratio/low_mean": 0.013621577061712742, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019650425761938095, "epoch": 0.0066485144725475405, "grad_norm": 0.11698593944311142, "learning_rate": 1e-06, "loss": 0.0156, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 906.7589721679688, "completions/mean_terminated_length": 818.97119140625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.006659449529245809, "grad_norm": 0.35085660219192505, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 15009890.0, "reward": 0.21517860889434814, "reward_std": 0.2504523992538452, "rewards/accuracy_reward/mean": 0.1160714253783226, "rewards/accuracy_reward/std": 0.3217501640319824, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 609 }, { "clip_ratio/high_max": 0.00794507097452879, "clip_ratio/high_mean": 0.0023065502755343914, "clip_ratio/low_mean": 0.005897578317672014, "clip_ratio/low_min": 0.0012100957101210952, "clip_ratio/region_mean": 0.008204127661883831, "epoch": 0.0066703845859440785, "grad_norm": 0.16935260593891144, "learning_rate": 1e-06, "loss": 0.0008, "step": 610 }, { "clip_ratio/high_max": 0.01373222190886736, "clip_ratio/high_mean": 0.003903226461261511, "clip_ratio/low_mean": 0.012742312625050545, "clip_ratio/low_min": 0.0025279633700847626, "clip_ratio/region_mean": 0.016645537689328194, "epoch": 0.006681319642642347, "grad_norm": 0.118543341755867, "learning_rate": 1e-06, "loss": 0.0003, "step": 611 }, { "clip_ratio/high_max": 0.01520353090018034, "clip_ratio/high_mean": 0.004498370923101902, "clip_ratio/low_mean": 0.018690183758735657, "clip_ratio/low_min": 0.0027659328188747168, "clip_ratio/region_mean": 0.023188555613160133, "epoch": 0.006692254699340616, "grad_norm": 0.11313570290803909, "learning_rate": 1e-06, "loss": 0.0001, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 693.5982666015625, "completions/mean_terminated_length": 681.3964233398438, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.006703189756038885, "grad_norm": 0.48322218656539917, "learning_rate": 1e-06, "loss": -0.0038, "num_tokens": 15112745.0, "reward": 0.5464286208152771, "reward_std": 0.34783026576042175, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49935612082481384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.010066346265375614, "clip_ratio/high_mean": 0.004771905951201916, "clip_ratio/low_mean": 0.006059177685528994, "clip_ratio/low_min": 0.002014504512771964, "clip_ratio/region_mean": 0.010831083171069622, "epoch": 0.006714124812737154, "grad_norm": 0.3095279633998871, "learning_rate": 1e-06, "loss": -0.0054, "step": 614 }, { "clip_ratio/high_max": 0.026424158364534378, "clip_ratio/high_mean": 0.01104914303869009, "clip_ratio/low_mean": 0.013838933780789375, "clip_ratio/low_min": 0.003130217082798481, "clip_ratio/region_mean": 0.02488807775080204, "epoch": 0.006725059869435423, "grad_norm": 0.2320675551891327, "learning_rate": 1e-06, "loss": -0.0068, "step": 615 }, { "clip_ratio/high_max": 0.0404941663146019, "clip_ratio/high_mean": 0.016614457592368126, "clip_ratio/low_mean": 0.020552443340420723, "clip_ratio/low_min": 0.003912771120667458, "clip_ratio/region_mean": 0.03716690093278885, "epoch": 0.006735994926133692, "grad_norm": 0.19065313041210175, "learning_rate": 1e-06, "loss": -0.0078, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 645.7767944335938, "completions/mean_terminated_length": 645.7767944335938, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.006746929982831961, "grad_norm": 0.45413094758987427, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 15203832.0, "reward": 0.6169643402099609, "reward_std": 0.316789835691452, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5019267797470093, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 617 }, { "clip_ratio/high_max": 0.016561342403292656, "clip_ratio/high_mean": 0.0062814438715577126, "clip_ratio/low_mean": 0.006737909745424986, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013019353151321411, "epoch": 0.00675786503953023, "grad_norm": 0.3171757161617279, "learning_rate": 1e-06, "loss": -0.0036, "step": 618 }, { "clip_ratio/high_max": 0.02200915291905403, "clip_ratio/high_mean": 0.008671342395246029, "clip_ratio/low_mean": 0.009673578664660454, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018344921991229057, "epoch": 0.006768800096228499, "grad_norm": 0.2181480973958969, "learning_rate": 1e-06, "loss": -0.0043, "step": 619 }, { "clip_ratio/high_max": 0.025277838110923767, "clip_ratio/high_mean": 0.011124247685074806, "clip_ratio/low_mean": 0.013002638705074787, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024126887321472168, "epoch": 0.006779735152926768, "grad_norm": 0.3802279829978943, "learning_rate": 1e-06, "loss": -0.0041, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1144.821533203125, "completions/mean_terminated_length": 1128.4000244140625, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.006790670209625037, "grad_norm": 0.21554993093013763, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 15352940.0, "reward": 0.1803571581840515, "reward_std": 0.21727575361728668, "rewards/accuracy_reward/mean": 0.0803571417927742, "rewards/accuracy_reward/std": 0.27306708693504333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 621 }, { "clip_ratio/high_max": 0.008554826490581036, "clip_ratio/high_mean": 0.0018268569838255644, "clip_ratio/low_mean": 0.0045053670182824135, "clip_ratio/low_min": 0.0023680177982896566, "clip_ratio/region_mean": 0.006332223769277334, "epoch": 0.006801605266323306, "grad_norm": 0.1284254491329193, "learning_rate": 1e-06, "loss": 0.0084, "step": 622 }, { "clip_ratio/high_max": 0.016248788684606552, "clip_ratio/high_mean": 0.0032845104578882456, "clip_ratio/low_mean": 0.009153245016932487, "clip_ratio/low_min": 0.0028555509634315968, "clip_ratio/region_mean": 0.012437754310667515, "epoch": 0.0068125403230215745, "grad_norm": 0.09056536108255386, "learning_rate": 1e-06, "loss": 0.0079, "step": 623 }, { "clip_ratio/high_max": 0.02066071145236492, "clip_ratio/high_mean": 0.004146409686654806, "clip_ratio/low_mean": 0.013749786652624607, "clip_ratio/low_min": 0.0029251985251903534, "clip_ratio/region_mean": 0.017896195873618126, "epoch": 0.006823475379719844, "grad_norm": 0.06958520412445068, "learning_rate": 1e-06, "loss": 0.0076, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 499.0535888671875, "completions/mean_terminated_length": 499.0535888671875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.0068344104364181125, "grad_norm": 0.6036021113395691, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 15428854.0, "reward": 0.3669643700122833, "reward_std": 0.3704379200935364, "rewards/accuracy_reward/mean": 0.2678571343421936, "rewards/accuracy_reward/std": 0.44483304023742676, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 625 }, { "clip_ratio/high_max": 0.011527377180755138, "clip_ratio/high_mean": 0.0050359866581857204, "clip_ratio/low_mean": 0.008024957962334156, "clip_ratio/low_min": 0.0033335802145302296, "clip_ratio/region_mean": 0.01306094415485859, "epoch": 0.006845345493116382, "grad_norm": 0.33414342999458313, "learning_rate": 1e-06, "loss": -0.0015, "step": 626 }, { "clip_ratio/high_max": 0.020172910764813423, "clip_ratio/high_mean": 0.009848830290138721, "clip_ratio/low_mean": 0.016085688024759293, "clip_ratio/low_min": 0.005895647220313549, "clip_ratio/region_mean": 0.02593451738357544, "epoch": 0.006856280549814651, "grad_norm": 0.2562123239040375, "learning_rate": 1e-06, "loss": -0.0029, "step": 627 }, { "clip_ratio/high_max": 0.02593659982085228, "clip_ratio/high_mean": 0.013086305931210518, "clip_ratio/low_mean": 0.025582818314433098, "clip_ratio/low_min": 0.009007678367197514, "clip_ratio/region_mean": 0.03866912052035332, "epoch": 0.00686721560651292, "grad_norm": 0.21375328302383423, "learning_rate": 1e-06, "loss": -0.0034, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 747.8035888671875, "completions/mean_terminated_length": 747.8035888671875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.006878150663211189, "grad_norm": 0.341010183095932, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 15535960.0, "reward": 0.19553573429584503, "reward_std": 0.2501002252101898, "rewards/accuracy_reward/mean": 0.0982142835855484, "rewards/accuracy_reward/std": 0.2989417314529419, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 629 }, { "clip_ratio/high_max": 0.0074490029364824295, "clip_ratio/high_mean": 0.002404567552730441, "clip_ratio/low_mean": 0.0045464662835001945, "clip_ratio/low_min": 0.0019588638097047806, "clip_ratio/region_mean": 0.006951034069061279, "epoch": 0.006889085719909458, "grad_norm": 0.17796331644058228, "learning_rate": 1e-06, "loss": 0.0021, "step": 630 }, { "clip_ratio/high_max": 0.01237680483609438, "clip_ratio/high_mean": 0.004156279377639294, "clip_ratio/low_mean": 0.010558193549513817, "clip_ratio/low_min": 0.004306219983845949, "clip_ratio/region_mean": 0.014714471995830536, "epoch": 0.006900020776607727, "grad_norm": 0.1083877831697464, "learning_rate": 1e-06, "loss": 0.0015, "step": 631 }, { "clip_ratio/high_max": 0.014898005872964859, "clip_ratio/high_mean": 0.005232270807027817, "clip_ratio/low_mean": 0.016149401664733887, "clip_ratio/low_min": 0.005809979513287544, "clip_ratio/region_mean": 0.021381672471761703, "epoch": 0.006910955833305995, "grad_norm": 0.10034214705228806, "learning_rate": 1e-06, "loss": 0.0012, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 785.8750610351562, "completions/mean_terminated_length": 785.8750610351562, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.006921890890004265, "grad_norm": 0.31201064586639404, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 15640722.0, "reward": 0.49196434020996094, "reward_std": 0.1974678933620453, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.4905804991722107, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 633 }, { "clip_ratio/high_max": 0.010064300149679184, "clip_ratio/high_mean": 0.0040674335323274136, "clip_ratio/low_mean": 0.0041650026105344296, "clip_ratio/low_min": 0.002320050960406661, "clip_ratio/region_mean": 0.008232436142861843, "epoch": 0.006932825946702533, "grad_norm": 0.15830041468143463, "learning_rate": 1e-06, "loss": 0.0038, "step": 634 }, { "clip_ratio/high_max": 0.017193177714943886, "clip_ratio/high_mean": 0.007439407054334879, "clip_ratio/low_mean": 0.00730860885232687, "clip_ratio/low_min": 0.0031900701578706503, "clip_ratio/region_mean": 0.014748014509677887, "epoch": 0.006943761003400803, "grad_norm": 0.10300728678703308, "learning_rate": 1e-06, "loss": 0.0033, "step": 635 }, { "clip_ratio/high_max": 0.02245623804628849, "clip_ratio/high_mean": 0.009209086187183857, "clip_ratio/low_mean": 0.009224029257893562, "clip_ratio/low_min": 0.003886085469275713, "clip_ratio/region_mean": 0.018433114513754845, "epoch": 0.006954696060099071, "grad_norm": 0.08730489015579224, "learning_rate": 1e-06, "loss": 0.003, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 608.0357666015625, "completions/mean_terminated_length": 595.0630493164062, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.006965631116797341, "grad_norm": 0.5654830932617188, "learning_rate": 1e-06, "loss": -0.009, "num_tokens": 15726714.0, "reward": 0.428571492433548, "reward_std": 0.3599766492843628, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.47245559096336365, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 637 }, { "clip_ratio/high_max": 0.019947199150919914, "clip_ratio/high_mean": 0.006198248825967312, "clip_ratio/low_mean": 0.005920854862779379, "clip_ratio/low_min": 0.0001431434357073158, "clip_ratio/region_mean": 0.012119104154407978, "epoch": 0.006976566173495609, "grad_norm": 0.3013886511325836, "learning_rate": 1e-06, "loss": -0.0103, "step": 638 }, { "clip_ratio/high_max": 0.03989439830183983, "clip_ratio/high_mean": 0.011739159002900124, "clip_ratio/low_mean": 0.012982592917978764, "clip_ratio/low_min": 0.0002385723782936111, "clip_ratio/region_mean": 0.024721750989556313, "epoch": 0.006987501230193879, "grad_norm": 0.23689238727092743, "learning_rate": 1e-06, "loss": -0.0113, "step": 639 }, { "clip_ratio/high_max": 0.04928131401538849, "clip_ratio/high_mean": 0.014134238474071026, "clip_ratio/low_mean": 0.017647650092840195, "clip_ratio/low_min": 0.00042943027801811695, "clip_ratio/region_mean": 0.031781889498233795, "epoch": 0.006998436286892147, "grad_norm": 0.18401430547237396, "learning_rate": 1e-06, "loss": -0.0119, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 613.2053833007812, "completions/mean_terminated_length": 613.2053833007812, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.007009371343590417, "grad_norm": 0.45419442653656006, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 15810289.0, "reward": 0.2937500476837158, "reward_std": 0.34055057168006897, "rewards/accuracy_reward/mean": 0.1964285671710968, "rewards/accuracy_reward/std": 0.3990819454193115, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 641 }, { "clip_ratio/high_max": 0.012638874351978302, "clip_ratio/high_mean": 0.004679047502577305, "clip_ratio/low_mean": 0.005324759986251593, "clip_ratio/low_min": 9.060432785190642e-05, "clip_ratio/region_mean": 0.01000380702316761, "epoch": 0.0070203064002886855, "grad_norm": 0.24774736166000366, "learning_rate": 1e-06, "loss": 0.0069, "step": 642 }, { "clip_ratio/high_max": 0.02171032503247261, "clip_ratio/high_mean": 0.008004782721400261, "clip_ratio/low_mean": 0.012392906472086906, "clip_ratio/low_min": 0.0009060433367267251, "clip_ratio/region_mean": 0.020397689193487167, "epoch": 0.007031241456986954, "grad_norm": 0.16058577597141266, "learning_rate": 1e-06, "loss": 0.006, "step": 643 }, { "clip_ratio/high_max": 0.027010498568415642, "clip_ratio/high_mean": 0.010136467404663563, "clip_ratio/low_mean": 0.019523104652762413, "clip_ratio/low_min": 0.0014496692456305027, "clip_ratio/region_mean": 0.02965957298874855, "epoch": 0.0070421765136852235, "grad_norm": 0.17767739295959473, "learning_rate": 1e-06, "loss": 0.0055, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 795.3750610351562, "completions/mean_terminated_length": 795.3750610351562, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.007053111570383492, "grad_norm": 0.5906127691268921, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 15916927.0, "reward": 0.4303572177886963, "reward_std": 0.37761881947517395, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.47245559096336365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.007519843988120556, "clip_ratio/high_mean": 0.0037140233907848597, "clip_ratio/low_mean": 0.0054629710502922535, "clip_ratio/low_min": 0.002668699948117137, "clip_ratio/region_mean": 0.009176994673907757, "epoch": 0.0070640466270817616, "grad_norm": 0.3091946840286255, "learning_rate": 1e-06, "loss": 0.0243, "step": 646 }, { "clip_ratio/high_max": 0.016710763797163963, "clip_ratio/high_mean": 0.007404944859445095, "clip_ratio/low_mean": 0.01163096260279417, "clip_ratio/low_min": 0.003594575449824333, "clip_ratio/region_mean": 0.019035909324884415, "epoch": 0.00707498168378003, "grad_norm": 0.23058761656284332, "learning_rate": 1e-06, "loss": 0.0231, "step": 647 }, { "clip_ratio/high_max": 0.021306224167346954, "clip_ratio/high_mean": 0.010595515370368958, "clip_ratio/low_mean": 0.0190921388566494, "clip_ratio/low_min": 0.0038668918423354626, "clip_ratio/region_mean": 0.029687656089663506, "epoch": 0.0070859167404783, "grad_norm": 0.20200426876544952, "learning_rate": 1e-06, "loss": 0.0222, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 365.45538330078125, "completions/mean_terminated_length": 365.45538330078125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.007096851797176568, "grad_norm": 0.47127753496170044, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 15978130.0, "reward": 0.8857144117355347, "reward_std": 0.20409803092479706, "rewards/accuracy_reward/mean": 0.7857142686843872, "rewards/accuracy_reward/std": 0.41217005252838135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.017301905900239944, "clip_ratio/high_mean": 0.00575000885874033, "clip_ratio/low_mean": 0.006054234225302935, "clip_ratio/low_min": 0.0011327593820169568, "clip_ratio/region_mean": 0.011804242618381977, "epoch": 0.007107786853874838, "grad_norm": 0.2628447115421295, "learning_rate": 1e-06, "loss": 0.0043, "step": 650 }, { "clip_ratio/high_max": 0.027958877384662628, "clip_ratio/high_mean": 0.010220013558864594, "clip_ratio/low_mean": 0.01195527333766222, "clip_ratio/low_min": 0.0011327593820169568, "clip_ratio/region_mean": 0.02217528596520424, "epoch": 0.007118721910573106, "grad_norm": 0.1707860827445984, "learning_rate": 1e-06, "loss": 0.0035, "step": 651 }, { "clip_ratio/high_max": 0.034729186445474625, "clip_ratio/high_mean": 0.013197250664234161, "clip_ratio/low_mean": 0.018201099708676338, "clip_ratio/low_min": 0.0021767523139715195, "clip_ratio/region_mean": 0.03139834851026535, "epoch": 0.007129656967271376, "grad_norm": 0.11975650489330292, "learning_rate": 1e-06, "loss": 0.0029, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1695.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 655.1875, "completions/mean_terminated_length": 655.1875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.007140592023969644, "grad_norm": 0.40098950266838074, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 16071727.0, "reward": 0.30446434020996094, "reward_std": 0.28995972871780396, "rewards/accuracy_reward/mean": 0.2053571492433548, "rewards/accuracy_reward/std": 0.4057779312133789, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 653 }, { "clip_ratio/high_max": 0.009977826848626137, "clip_ratio/high_mean": 0.003538876073434949, "clip_ratio/low_mean": 0.006183979567140341, "clip_ratio/low_min": 0.003252326278015971, "clip_ratio/region_mean": 0.009722856804728508, "epoch": 0.007151527080667913, "grad_norm": 0.22535920143127441, "learning_rate": 1e-06, "loss": 0.0113, "step": 654 }, { "clip_ratio/high_max": 0.016013797372579575, "clip_ratio/high_mean": 0.00601024879142642, "clip_ratio/low_mean": 0.012947693467140198, "clip_ratio/low_min": 0.0044377525337040424, "clip_ratio/region_mean": 0.018957942724227905, "epoch": 0.007162462137366182, "grad_norm": 0.1917138397693634, "learning_rate": 1e-06, "loss": 0.0106, "step": 655 }, { "clip_ratio/high_max": 0.019899990409612656, "clip_ratio/high_mean": 0.007337216753512621, "clip_ratio/low_mean": 0.02025649882853031, "clip_ratio/low_min": 0.0065773832611739635, "clip_ratio/region_mean": 0.027593715116381645, "epoch": 0.007173397194064451, "grad_norm": 0.13930760324001312, "learning_rate": 1e-06, "loss": 0.0101, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1026.6607666015625, "completions/mean_terminated_length": 978.9345703125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.00718433225076272, "grad_norm": 0.26161545515060425, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 16205109.0, "reward": 0.46250003576278687, "reward_std": 0.1876526176929474, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.483894407749176, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641091883182526, "step": 657 }, { "clip_ratio/high_max": 0.006421543192118406, "clip_ratio/high_mean": 0.0036614276468753815, "clip_ratio/low_mean": 0.003279526950791478, "clip_ratio/low_min": 0.00028657400980591774, "clip_ratio/region_mean": 0.006940954364836216, "epoch": 0.007195267307460989, "grad_norm": 0.14405670762062073, "learning_rate": 1e-06, "loss": 0.0199, "step": 658 }, { "clip_ratio/high_max": 0.012014499865472317, "clip_ratio/high_mean": 0.0057818167842924595, "clip_ratio/low_mean": 0.007028718013316393, "clip_ratio/low_min": 0.00038209868944250047, "clip_ratio/region_mean": 0.012810533866286278, "epoch": 0.007206202364159258, "grad_norm": 0.11610378324985504, "learning_rate": 1e-06, "loss": 0.0194, "step": 659 }, { "clip_ratio/high_max": 0.01450025849044323, "clip_ratio/high_mean": 0.007528324145823717, "clip_ratio/low_mean": 0.01043582521378994, "clip_ratio/low_min": 0.0007069783168844879, "clip_ratio/region_mean": 0.01796414889395237, "epoch": 0.007217137420857527, "grad_norm": 0.09415360540151596, "learning_rate": 1e-06, "loss": 0.019, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 882.5982666015625, "completions/mean_terminated_length": 850.5228881835938, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0072280724775557964, "grad_norm": 0.45990392565727234, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 16319080.0, "reward": 0.4366072118282318, "reward_std": 0.3145366609096527, "rewards/accuracy_reward/mean": 0.3392857015132904, "rewards/accuracy_reward/std": 0.4755948483943939, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 661 }, { "clip_ratio/high_max": 0.01595214381814003, "clip_ratio/high_mean": 0.006011414807289839, "clip_ratio/low_mean": 0.00541565241292119, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011427066288888454, "epoch": 0.007239007534254065, "grad_norm": 0.2712430953979492, "learning_rate": 1e-06, "loss": 0.015, "step": 662 }, { "clip_ratio/high_max": 0.03472914546728134, "clip_ratio/high_mean": 0.0113167529925704, "clip_ratio/low_mean": 0.013294677250087261, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02461143024265766, "epoch": 0.0072499425909523345, "grad_norm": 0.17952311038970947, "learning_rate": 1e-06, "loss": 0.0138, "step": 663 }, { "clip_ratio/high_max": 0.05284147709608078, "clip_ratio/high_mean": 0.016102809458971024, "clip_ratio/low_mean": 0.02239823155105114, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03850104287266731, "epoch": 0.007260877647650603, "grad_norm": 0.142509326338768, "learning_rate": 1e-06, "loss": 0.0129, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1043.044677734375, "completions/mean_terminated_length": 976.0476684570312, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.007271812704348872, "grad_norm": 0.33306053280830383, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 16460737.0, "reward": 0.18839289247989655, "reward_std": 0.14371222257614136, "rewards/accuracy_reward/mean": 0.0982142835855484, "rewards/accuracy_reward/std": 0.2989417314529419, "rewards/format_reward/mean": 0.9017857313156128, "rewards/format_reward/std": 0.2989417314529419, "step": 665 }, { "clip_ratio/high_max": 0.012140199542045593, "clip_ratio/high_mean": 0.003531795460730791, "clip_ratio/low_mean": 0.006084228400141001, "clip_ratio/low_min": 0.0003428347408771515, "clip_ratio/region_mean": 0.009616022929549217, "epoch": 0.007282747761047141, "grad_norm": 0.21901169419288635, "learning_rate": 1e-06, "loss": -0.001, "step": 666 }, { "clip_ratio/high_max": 0.01566477306187153, "clip_ratio/high_mean": 0.004693079274147749, "clip_ratio/low_mean": 0.00811065174639225, "clip_ratio/low_min": 0.0006726080318912864, "clip_ratio/region_mean": 0.012803731486201286, "epoch": 0.00729368281774541, "grad_norm": 0.11431267857551575, "learning_rate": 1e-06, "loss": -0.0012, "step": 667 }, { "clip_ratio/high_max": 0.016448011621832848, "clip_ratio/high_mean": 0.005512411240488291, "clip_ratio/low_mean": 0.010119305923581123, "clip_ratio/low_min": 0.0012244097888469696, "clip_ratio/region_mean": 0.015631716698408127, "epoch": 0.007304617874443679, "grad_norm": 0.14458125829696655, "learning_rate": 1e-06, "loss": -0.0013, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 744.5535888671875, "completions/mean_terminated_length": 708.6788940429688, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.007315552931141948, "grad_norm": 0.45728799700737, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 16562103.0, "reward": 0.428571492433548, "reward_std": 0.23902472853660583, "rewards/accuracy_reward/mean": 0.3303571343421936, "rewards/accuracy_reward/std": 0.47245559096336365, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 669 }, { "clip_ratio/high_max": 0.011227545328438282, "clip_ratio/high_mean": 0.0045152283273637295, "clip_ratio/low_mean": 0.00621606782078743, "clip_ratio/low_min": 0.0019045805092900991, "clip_ratio/region_mean": 0.010731296613812447, "epoch": 0.007326487987840217, "grad_norm": 0.2760436236858368, "learning_rate": 1e-06, "loss": 0.0184, "step": 670 }, { "clip_ratio/high_max": 0.020408162847161293, "clip_ratio/high_mean": 0.008419261313974857, "clip_ratio/low_mean": 0.012203298509120941, "clip_ratio/low_min": 0.0019521950744092464, "clip_ratio/region_mean": 0.020622562617063522, "epoch": 0.007337423044538486, "grad_norm": 0.179888054728508, "learning_rate": 1e-06, "loss": 0.0174, "step": 671 }, { "clip_ratio/high_max": 0.027578599750995636, "clip_ratio/high_mean": 0.011375784873962402, "clip_ratio/low_mean": 0.016330478712916374, "clip_ratio/low_min": 0.002761641750112176, "clip_ratio/region_mean": 0.027706265449523926, "epoch": 0.007348358101236755, "grad_norm": 0.14181581139564514, "learning_rate": 1e-06, "loss": 0.0168, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1178.3482666015625, "completions/mean_terminated_length": 1111.451904296875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.007359293157935024, "grad_norm": 0.3006715178489685, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 16713874.0, "reward": 0.30178579688072205, "reward_std": 0.25091397762298584, "rewards/accuracy_reward/mean": 0.2053571492433548, "rewards/accuracy_reward/std": 0.4057779014110565, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641093373298645, "step": 673 }, { "clip_ratio/high_max": 0.00754766957834363, "clip_ratio/high_mean": 0.003153149038553238, "clip_ratio/low_mean": 0.00558734405785799, "clip_ratio/low_min": 0.00039282909710891545, "clip_ratio/region_mean": 0.008740493096411228, "epoch": 0.007370228214633293, "grad_norm": 0.20564334094524384, "learning_rate": 1e-06, "loss": 0.0033, "step": 674 }, { "clip_ratio/high_max": 0.013207264244556427, "clip_ratio/high_mean": 0.005147075746208429, "clip_ratio/low_mean": 0.010048894211649895, "clip_ratio/low_min": 0.00039282909710891545, "clip_ratio/region_mean": 0.015195970423519611, "epoch": 0.007381163271331562, "grad_norm": 0.1821756809949875, "learning_rate": 1e-06, "loss": 0.0026, "step": 675 }, { "clip_ratio/high_max": 0.021211666986346245, "clip_ratio/high_mean": 0.007193165365606546, "clip_ratio/low_mean": 0.014462673105299473, "clip_ratio/low_min": 0.001071352045983076, "clip_ratio/region_mean": 0.021655838936567307, "epoch": 0.0073920983280298305, "grad_norm": 0.15294645726680756, "learning_rate": 1e-06, "loss": 0.0021, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 784.1785888671875, "completions/mean_terminated_length": 749.3944702148438, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.0074030333847281, "grad_norm": 0.23769938945770264, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 16817830.0, "reward": 0.525892972946167, "reward_std": 0.17401854693889618, "rewards/accuracy_reward/mean": 0.4285714328289032, "rewards/accuracy_reward/std": 0.4970957934856415, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 677 }, { "clip_ratio/high_max": 0.010887496173381805, "clip_ratio/high_mean": 0.0033487759064882994, "clip_ratio/low_mean": 0.0036795916967093945, "clip_ratio/low_min": 0.00028150889556854963, "clip_ratio/region_mean": 0.0070283678360283375, "epoch": 0.0074139684414263685, "grad_norm": 0.12476712465286255, "learning_rate": 1e-06, "loss": 0.0026, "step": 678 }, { "clip_ratio/high_max": 0.015616408549249172, "clip_ratio/high_mean": 0.004813049454241991, "clip_ratio/low_mean": 0.006417558528482914, "clip_ratio/low_min": 0.0005228022346273065, "clip_ratio/region_mean": 0.011230608448386192, "epoch": 0.007424903498124638, "grad_norm": 0.09099764376878738, "learning_rate": 1e-06, "loss": 0.0024, "step": 679 }, { "clip_ratio/high_max": 0.019135598093271255, "clip_ratio/high_mean": 0.005951531697064638, "clip_ratio/low_mean": 0.009720084257423878, "clip_ratio/low_min": 0.0007640955736860633, "clip_ratio/region_mean": 0.015671616420149803, "epoch": 0.0074358385548229065, "grad_norm": 0.0762719139456749, "learning_rate": 1e-06, "loss": 0.0023, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 823.357177734375, "completions/mean_terminated_length": 778.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.007446773611521176, "grad_norm": 0.254210889339447, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 16930290.0, "reward": 0.5544643402099609, "reward_std": 0.1513407677412033, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.500241219997406, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 681 }, { "clip_ratio/high_max": 0.0112462705001235, "clip_ratio/high_mean": 0.0027983547188341618, "clip_ratio/low_mean": 0.003788066329434514, "clip_ratio/low_min": 0.002520292066037655, "clip_ratio/region_mean": 0.006586420815438032, "epoch": 0.007457708668219445, "grad_norm": 0.1255311816930771, "learning_rate": 1e-06, "loss": -0.0111, "step": 682 }, { "clip_ratio/high_max": 0.026050034910440445, "clip_ratio/high_mean": 0.006073222495615482, "clip_ratio/low_mean": 0.006911559496074915, "clip_ratio/low_min": 0.0027797338552773, "clip_ratio/region_mean": 0.012984782457351685, "epoch": 0.007468643724917714, "grad_norm": 0.07644679397344589, "learning_rate": 1e-06, "loss": -0.0115, "step": 683 }, { "clip_ratio/high_max": 0.036952029913663864, "clip_ratio/high_mean": 0.008404362015426159, "clip_ratio/low_mean": 0.009939000010490417, "clip_ratio/low_min": 0.0037063118070364, "clip_ratio/region_mean": 0.01834336295723915, "epoch": 0.007479578781615983, "grad_norm": 0.05994987487792969, "learning_rate": 1e-06, "loss": -0.0117, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1528.4376220703125, "completions/mean_terminated_length": 1422.290283203125, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.007490513838314252, "grad_norm": 0.21511103212833405, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 17124379.0, "reward": 0.26607149839401245, "reward_std": 0.28377628326416016, "rewards/accuracy_reward/mean": 0.1696428507566452, "rewards/accuracy_reward/std": 0.37700554728507996, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641091883182526, "step": 685 }, { "clip_ratio/high_max": 0.00538763590157032, "clip_ratio/high_mean": 0.0019364503677934408, "clip_ratio/low_mean": 0.0035934937186539173, "clip_ratio/low_min": 0.0024763918481767178, "clip_ratio/region_mean": 0.005529944319278002, "epoch": 0.007501448895012521, "grad_norm": 0.13922932744026184, "learning_rate": 1e-06, "loss": 0.0031, "step": 686 }, { "clip_ratio/high_max": 0.011153924278914928, "clip_ratio/high_mean": 0.0037992503494024277, "clip_ratio/low_mean": 0.007318308111280203, "clip_ratio/low_min": 0.0038523836992681026, "clip_ratio/region_mean": 0.011117557995021343, "epoch": 0.007512383951710789, "grad_norm": 0.10817722231149673, "learning_rate": 1e-06, "loss": 0.0024, "step": 687 }, { "clip_ratio/high_max": 0.016325289383530617, "clip_ratio/high_mean": 0.0055883945897221565, "clip_ratio/low_mean": 0.01202795747667551, "clip_ratio/low_min": 0.0050781420432031155, "clip_ratio/region_mean": 0.017616352066397667, "epoch": 0.007523319008409059, "grad_norm": 0.09097017347812653, "learning_rate": 1e-06, "loss": 0.0019, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 409.1339416503906, "completions/mean_terminated_length": 409.1339416503906, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.007534254065107327, "grad_norm": 0.4790695607662201, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 17187730.0, "reward": 0.6294643878936768, "reward_std": 0.21156971156597137, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49935612082481384, "rewards/format_reward/mean": 0.7589285969734192, "rewards/format_reward/std": 0.4296559691429138, "step": 689 }, { "clip_ratio/high_max": 0.016032064333558083, "clip_ratio/high_mean": 0.005439884960651398, "clip_ratio/low_mean": 0.007007843814790249, "clip_ratio/low_min": 0.0014457831857725978, "clip_ratio/region_mean": 0.012447727844119072, "epoch": 0.007545189121805597, "grad_norm": 0.2710942327976227, "learning_rate": 1e-06, "loss": 0.0024, "step": 690 }, { "clip_ratio/high_max": 0.02633839100599289, "clip_ratio/high_mean": 0.009236923418939114, "clip_ratio/low_mean": 0.013879237696528435, "clip_ratio/low_min": 0.0024096386041492224, "clip_ratio/region_mean": 0.023116162046790123, "epoch": 0.007556124178503865, "grad_norm": 0.2873303294181824, "learning_rate": 1e-06, "loss": 0.0017, "step": 691 }, { "clip_ratio/high_max": 0.03091898001730442, "clip_ratio/high_mean": 0.010846777819097042, "clip_ratio/low_mean": 0.01849985122680664, "clip_ratio/low_min": 0.0028915663715451956, "clip_ratio/region_mean": 0.029346628114581108, "epoch": 0.007567059235202135, "grad_norm": 0.19043517112731934, "learning_rate": 1e-06, "loss": 0.0012, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 670.4107666015625, "completions/mean_terminated_length": 658.0, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.007577994291900403, "grad_norm": 0.3209439516067505, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 17280372.0, "reward": 0.2071429044008255, "reward_std": 0.20774491131305695, "rewards/accuracy_reward/mean": 0.1071428582072258, "rewards/accuracy_reward/std": 0.3106848895549774, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.005417118314653635, "clip_ratio/high_mean": 0.0021518722642213106, "clip_ratio/low_mean": 0.006791460793465376, "clip_ratio/low_min": 0.00248893816024065, "clip_ratio/region_mean": 0.008943332359194756, "epoch": 0.007588929348598673, "grad_norm": 0.21033544838428497, "learning_rate": 1e-06, "loss": 0.027, "step": 694 }, { "clip_ratio/high_max": 0.008883647620677948, "clip_ratio/high_mean": 0.003615178167819977, "clip_ratio/low_mean": 0.013820239342749119, "clip_ratio/low_min": 0.0038716814015060663, "clip_ratio/region_mean": 0.01743541657924652, "epoch": 0.007599864405296941, "grad_norm": 0.13244715332984924, "learning_rate": 1e-06, "loss": 0.0264, "step": 695 }, { "clip_ratio/high_max": 0.01139937061816454, "clip_ratio/high_mean": 0.004223725758492947, "clip_ratio/low_mean": 0.019765416160225868, "clip_ratio/low_min": 0.004563053138554096, "clip_ratio/region_mean": 0.02398914285004139, "epoch": 0.007610799461995211, "grad_norm": 0.10176566988229752, "learning_rate": 1e-06, "loss": 0.026, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2032.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1126.294677734375, "completions/mean_terminated_length": 1126.294677734375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.0076217345186934795, "grad_norm": 0.2405523657798767, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 17433677.0, "reward": 0.17767859995365143, "reward_std": 0.13017992675304413, "rewards/accuracy_reward/mean": 0.0803571417927742, "rewards/accuracy_reward/std": 0.27306708693504333, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 697 }, { "clip_ratio/high_max": 0.007393715437501669, "clip_ratio/high_mean": 0.0036715941969305277, "clip_ratio/low_mean": 0.003716070670634508, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007387664634734392, "epoch": 0.007632669575391748, "grad_norm": 0.13653628528118134, "learning_rate": 1e-06, "loss": 0.0085, "step": 698 }, { "clip_ratio/high_max": 0.013294468633830547, "clip_ratio/high_mean": 0.00529127661138773, "clip_ratio/low_mean": 0.007544178981333971, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012835456058382988, "epoch": 0.0076436046320900175, "grad_norm": 0.09255571663379669, "learning_rate": 1e-06, "loss": 0.008, "step": 699 }, { "clip_ratio/high_max": 0.015356178395450115, "clip_ratio/high_mean": 0.006531347054988146, "clip_ratio/low_mean": 0.010816042311489582, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017347389832139015, "epoch": 0.007654539688788286, "grad_norm": 0.09046933799982071, "learning_rate": 1e-06, "loss": 0.0078, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 878.9285888671875, "completions/mean_terminated_length": 878.9285888671875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.0076654747454865555, "grad_norm": 0.2731790840625763, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 17547517.0, "reward": 0.34910717606544495, "reward_std": 0.2496241331100464, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4349588453769684, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 701 }, { "clip_ratio/high_max": 0.008034279569983482, "clip_ratio/high_mean": 0.0030752907041460276, "clip_ratio/low_mean": 0.004940636456012726, "clip_ratio/low_min": 0.002723939251154661, "clip_ratio/region_mean": 0.008015927858650684, "epoch": 0.007676409802184824, "grad_norm": 0.1473335176706314, "learning_rate": 1e-06, "loss": 0.0025, "step": 702 }, { "clip_ratio/high_max": 0.014937808737158775, "clip_ratio/high_mean": 0.005581985227763653, "clip_ratio/low_mean": 0.010603217408061028, "clip_ratio/low_min": 0.003823991632089019, "clip_ratio/region_mean": 0.016185203567147255, "epoch": 0.007687344858883094, "grad_norm": 0.11631951481103897, "learning_rate": 1e-06, "loss": 0.002, "step": 703 }, { "clip_ratio/high_max": 0.02219841629266739, "clip_ratio/high_mean": 0.007728253025561571, "clip_ratio/low_mean": 0.015899354591965675, "clip_ratio/low_min": 0.004243059083819389, "clip_ratio/region_mean": 0.02362760715186596, "epoch": 0.007698279915581362, "grad_norm": 0.08448152244091034, "learning_rate": 1e-06, "loss": 0.0016, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 931.8928833007812, "completions/mean_terminated_length": 911.5999755859375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.007709214972279632, "grad_norm": 0.26633623242378235, "learning_rate": 1e-06, "loss": -0.0035, "num_tokens": 17671853.0, "reward": 0.3455357849597931, "reward_std": 0.22954946756362915, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4349588453769684, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 705 }, { "clip_ratio/high_max": 0.007868058979511261, "clip_ratio/high_mean": 0.0034268100280314684, "clip_ratio/low_mean": 0.0038733496330678463, "clip_ratio/low_min": 0.000505603791680187, "clip_ratio/region_mean": 0.007300159893929958, "epoch": 0.0077201500289779, "grad_norm": 0.11560755223035812, "learning_rate": 1e-06, "loss": -0.0041, "step": 706 }, { "clip_ratio/high_max": 0.013769102282822132, "clip_ratio/high_mean": 0.005367871839553118, "clip_ratio/low_mean": 0.007871835492551327, "clip_ratio/low_min": 0.0009269402362406254, "clip_ratio/region_mean": 0.013239708729088306, "epoch": 0.007731085085676169, "grad_norm": 0.0821533203125, "learning_rate": 1e-06, "loss": -0.0043, "step": 707 }, { "clip_ratio/high_max": 0.01740051433444023, "clip_ratio/high_mean": 0.00658798310905695, "clip_ratio/low_mean": 0.012264412827789783, "clip_ratio/low_min": 0.001643212279304862, "clip_ratio/region_mean": 0.018852395936846733, "epoch": 0.007742020142374438, "grad_norm": 0.07248670607805252, "learning_rate": 1e-06, "loss": -0.0045, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 875.3482666015625, "completions/mean_terminated_length": 875.3482666015625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.007752955199072707, "grad_norm": 0.3775704801082611, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 17786188.0, "reward": 0.3812500536441803, "reward_std": 0.30551981925964355, "rewards/accuracy_reward/mean": 0.2857142984867096, "rewards/accuracy_reward/std": 0.453784316778183, "rewards/format_reward/mean": 0.9553571343421936, "rewards/format_reward/std": 0.2074466347694397, "step": 709 }, { "clip_ratio/high_max": 0.00993430521339178, "clip_ratio/high_mean": 0.00473993644118309, "clip_ratio/low_mean": 0.004829951096326113, "clip_ratio/low_min": 0.00010346611816203222, "clip_ratio/region_mean": 0.00956988800317049, "epoch": 0.007763890255770976, "grad_norm": 0.24697385728359222, "learning_rate": 1e-06, "loss": 0.0056, "step": 710 }, { "clip_ratio/high_max": 0.020429417490959167, "clip_ratio/high_mean": 0.009091876447200775, "clip_ratio/low_mean": 0.009957291185855865, "clip_ratio/low_min": 0.00041386447264812887, "clip_ratio/region_mean": 0.01904916763305664, "epoch": 0.007774825312469245, "grad_norm": 0.17994379997253418, "learning_rate": 1e-06, "loss": 0.0045, "step": 711 }, { "clip_ratio/high_max": 0.028521070256829262, "clip_ratio/high_mean": 0.012312120757997036, "clip_ratio/low_mean": 0.015342465601861477, "clip_ratio/low_min": 0.0006207966944202781, "clip_ratio/region_mean": 0.027654588222503662, "epoch": 0.007785760369167514, "grad_norm": 0.1429011970758438, "learning_rate": 1e-06, "loss": 0.0039, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 625.1785888671875, "completions/mean_terminated_length": 612.3603515625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.007796695425865783, "grad_norm": 0.47629040479660034, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 17871076.0, "reward": 0.6687500476837158, "reward_std": 0.41023558378219604, "rewards/accuracy_reward/mean": 0.5714285969734192, "rewards/accuracy_reward/std": 0.4970957934856415, "rewards/format_reward/mean": 0.9732142686843872, "rewards/format_reward/std": 0.1621822714805603, "step": 713 }, { "clip_ratio/high_max": 0.016631416976451874, "clip_ratio/high_mean": 0.005460062529891729, "clip_ratio/low_mean": 0.005839328747242689, "clip_ratio/low_min": 0.0024781101383268833, "clip_ratio/region_mean": 0.011299391277134418, "epoch": 0.007807630482564052, "grad_norm": 0.3089068531990051, "learning_rate": 1e-06, "loss": 0.0174, "step": 714 }, { "clip_ratio/high_max": 0.03182080388069153, "clip_ratio/high_mean": 0.010893331840634346, "clip_ratio/low_mean": 0.014377961866557598, "clip_ratio/low_min": 0.00429539056494832, "clip_ratio/region_mean": 0.025271296501159668, "epoch": 0.007818565539262322, "grad_norm": 0.2542169988155365, "learning_rate": 1e-06, "loss": 0.0161, "step": 715 }, { "clip_ratio/high_max": 0.04335704818367958, "clip_ratio/high_mean": 0.015248594805598259, "clip_ratio/low_mean": 0.022256966680288315, "clip_ratio/low_min": 0.0062778787687420845, "clip_ratio/region_mean": 0.037505559623241425, "epoch": 0.00782950059596059, "grad_norm": 0.17658422887325287, "learning_rate": 1e-06, "loss": 0.015, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 625.75, "completions/mean_terminated_length": 612.9369506835938, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.007840435652658859, "grad_norm": 0.4567226469516754, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 17959248.0, "reward": 0.45357149839401245, "reward_std": 0.2681729197502136, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.48131096363067627, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641093373298645, "step": 717 }, { "clip_ratio/high_max": 0.009791731834411621, "clip_ratio/high_mean": 0.005075004417449236, "clip_ratio/low_mean": 0.0063598742708563805, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011434878222644329, "epoch": 0.007851370709357128, "grad_norm": 0.24534499645233154, "learning_rate": 1e-06, "loss": 0.0117, "step": 718 }, { "clip_ratio/high_max": 0.01605287939310074, "clip_ratio/high_mean": 0.008158298209309578, "clip_ratio/low_mean": 0.014301717281341553, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02246001549065113, "epoch": 0.007862305766055396, "grad_norm": 0.1625976413488388, "learning_rate": 1e-06, "loss": 0.0108, "step": 719 }, { "clip_ratio/high_max": 0.01951526664197445, "clip_ratio/high_mean": 0.01014054287225008, "clip_ratio/low_mean": 0.021096762269735336, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031237300485372543, "epoch": 0.007873240822753666, "grad_norm": 0.14523530006408691, "learning_rate": 1e-06, "loss": 0.0103, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 876.8214721679688, "completions/mean_terminated_length": 833.4444580078125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.007884175879451935, "grad_norm": 0.3409714996814728, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 18078144.0, "reward": 0.26875004172325134, "reward_std": 0.2165507972240448, "rewards/accuracy_reward/mean": 0.1696428507566452, "rewards/accuracy_reward/std": 0.37700554728507996, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 721 }, { "clip_ratio/high_max": 0.01064352411776781, "clip_ratio/high_mean": 0.0037424596957862377, "clip_ratio/low_mean": 0.0051302844658494, "clip_ratio/low_min": 0.0019699225667864084, "clip_ratio/region_mean": 0.008872744627296925, "epoch": 0.007895110936150205, "grad_norm": 0.22506526112556458, "learning_rate": 1e-06, "loss": 0.0003, "step": 722 }, { "clip_ratio/high_max": 0.026035696268081665, "clip_ratio/high_mean": 0.008174514397978783, "clip_ratio/low_mean": 0.012225942686200142, "clip_ratio/low_min": 0.0029789074324071407, "clip_ratio/region_mean": 0.020400455221533775, "epoch": 0.007906045992848472, "grad_norm": 0.17472919821739197, "learning_rate": 1e-06, "loss": -0.0009, "step": 723 }, { "clip_ratio/high_max": 0.038644179701805115, "clip_ratio/high_mean": 0.012155146338045597, "clip_ratio/low_mean": 0.019552340731024742, "clip_ratio/low_min": 0.003795704571530223, "clip_ratio/region_mean": 0.031707488000392914, "epoch": 0.007916981049546742, "grad_norm": 0.13761773705482483, "learning_rate": 1e-06, "loss": -0.0018, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 409.5714416503906, "completions/mean_terminated_length": 409.5714416503906, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.007927916106245011, "grad_norm": 0.5139058828353882, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 18139192.0, "reward": 0.4660714864730835, "reward_std": 0.28892889618873596, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.48389437794685364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.011796246282756329, "clip_ratio/high_mean": 0.005065215285867453, "clip_ratio/low_mean": 0.007518264930695295, "clip_ratio/low_min": 0.0005875440547242761, "clip_ratio/region_mean": 0.012583480216562748, "epoch": 0.00793885116294328, "grad_norm": 0.29257017374038696, "learning_rate": 1e-06, "loss": 0.0145, "step": 726 }, { "clip_ratio/high_max": 0.020857814699411392, "clip_ratio/high_mean": 0.009263821877539158, "clip_ratio/low_mean": 0.01892680861055851, "clip_ratio/low_min": 0.0032322227489203215, "clip_ratio/region_mean": 0.028190629556775093, "epoch": 0.007949786219641548, "grad_norm": 0.2193273901939392, "learning_rate": 1e-06, "loss": 0.0136, "step": 727 }, { "clip_ratio/high_max": 0.023795533925294876, "clip_ratio/high_mean": 0.010664718225598335, "clip_ratio/low_mean": 0.02763991989195347, "clip_ratio/low_min": 0.0032322227489203215, "clip_ratio/region_mean": 0.038304634392261505, "epoch": 0.007960721276339818, "grad_norm": 0.18318182229995728, "learning_rate": 1e-06, "loss": 0.0128, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 879.794677734375, "completions/mean_terminated_length": 858.5545043945312, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.007971656333038087, "grad_norm": 0.348341166973114, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 18259961.0, "reward": 0.5517858266830444, "reward_std": 0.30303955078125, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.5002412796020508, "rewards/format_reward/mean": 0.9642857313156128, "rewards/format_reward/std": 0.18641091883182526, "step": 729 }, { "clip_ratio/high_max": 0.016057979315519333, "clip_ratio/high_mean": 0.004814724437892437, "clip_ratio/low_mean": 0.0037899501621723175, "clip_ratio/low_min": 0.0016060739289969206, "clip_ratio/region_mean": 0.008604674600064754, "epoch": 0.007982591389736355, "grad_norm": 0.19863247871398926, "learning_rate": 1e-06, "loss": 0.0047, "step": 730 }, { "clip_ratio/high_max": 0.029700156301259995, "clip_ratio/high_mean": 0.009179094806313515, "clip_ratio/low_mean": 0.007731935940682888, "clip_ratio/low_min": 0.0038527012802660465, "clip_ratio/region_mean": 0.016911031678318977, "epoch": 0.007993526446434624, "grad_norm": 0.14161843061447144, "learning_rate": 1e-06, "loss": 0.004, "step": 731 }, { "clip_ratio/high_max": 0.03922126069664955, "clip_ratio/high_mean": 0.011994389817118645, "clip_ratio/low_mean": 0.011087740771472454, "clip_ratio/low_min": 0.0055550578981637955, "clip_ratio/region_mean": 0.023082133382558823, "epoch": 0.008004461503132894, "grad_norm": 0.12485579401254654, "learning_rate": 1e-06, "loss": 0.0036, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1250.0625, "completions/mean_terminated_length": 1204.896240234375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.008015396559831163, "grad_norm": 0.15707455575466156, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 18422872.0, "reward": 0.11785715818405151, "reward_std": 0.12488342821598053, "rewards/accuracy_reward/mean": 0.02678571455180645, "rewards/accuracy_reward/std": 0.1621822714805603, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.2864373028278351, "step": 733 }, { "clip_ratio/high_max": 0.0031760786660015583, "clip_ratio/high_mean": 0.0012067880015820265, "clip_ratio/low_mean": 0.002989143831655383, "clip_ratio/low_min": 0.0003467673668637872, "clip_ratio/region_mean": 0.00419593183323741, "epoch": 0.008026331616529431, "grad_norm": 0.06549470126628876, "learning_rate": 1e-06, "loss": 0.0018, "step": 734 }, { "clip_ratio/high_max": 0.005422573536634445, "clip_ratio/high_mean": 0.001827788189984858, "clip_ratio/low_mean": 0.00597138237208128, "clip_ratio/low_min": 0.0007705941097810864, "clip_ratio/region_mean": 0.007799169979989529, "epoch": 0.0080372666732277, "grad_norm": 0.053590647876262665, "learning_rate": 1e-06, "loss": 0.0016, "step": 735 }, { "clip_ratio/high_max": 0.006429622881114483, "clip_ratio/high_mean": 0.0021702146623283625, "clip_ratio/low_mean": 0.009377358481287956, "clip_ratio/low_min": 0.0013870694674551487, "clip_ratio/region_mean": 0.01154757384210825, "epoch": 0.00804820172992597, "grad_norm": 0.03523087874054909, "learning_rate": 1e-06, "loss": 0.0015, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 381.3482360839844, "completions/mean_terminated_length": 381.3482360839844, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.00805913678662424, "grad_norm": 0.443189799785614, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 18479079.0, "reward": 0.5901786684989929, "reward_std": 0.233891099691391, "rewards/accuracy_reward/mean": 0.4910714328289032, "rewards/accuracy_reward/std": 0.5021671056747437, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 737 }, { "clip_ratio/high_max": 0.008705114014446735, "clip_ratio/high_mean": 0.00577555363997817, "clip_ratio/low_mean": 0.005038383882492781, "clip_ratio/low_min": 0.0026931841857731342, "clip_ratio/region_mean": 0.010813937522470951, "epoch": 0.008070071843322507, "grad_norm": 0.1818084418773651, "learning_rate": 1e-06, "loss": 0.0032, "step": 738 }, { "clip_ratio/high_max": 0.015283379703760147, "clip_ratio/high_mean": 0.008698961697518826, "clip_ratio/low_mean": 0.009816798381507397, "clip_ratio/low_min": 0.003314688103273511, "clip_ratio/region_mean": 0.01851576194167137, "epoch": 0.008081006900020777, "grad_norm": 0.13034236431121826, "learning_rate": 1e-06, "loss": 0.0028, "step": 739 }, { "clip_ratio/high_max": 0.02101464569568634, "clip_ratio/high_mean": 0.011783486232161522, "clip_ratio/low_mean": 0.015999797731637955, "clip_ratio/low_min": 0.005179200321435928, "clip_ratio/region_mean": 0.027783283963799477, "epoch": 0.008091941956719046, "grad_norm": 0.11137273907661438, "learning_rate": 1e-06, "loss": 0.0024, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1047.0179443359375, "completions/mean_terminated_length": 959.5534057617188, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.008102877013417314, "grad_norm": 0.3460134267807007, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 18624541.0, "reward": 0.3133929371833801, "reward_std": 0.24658483266830444, "rewards/accuracy_reward/mean": 0.2142857164144516, "rewards/accuracy_reward/std": 0.41217002272605896, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 741 }, { "clip_ratio/high_max": 0.00831215176731348, "clip_ratio/high_mean": 0.0035947414580732584, "clip_ratio/low_mean": 0.005397192668169737, "clip_ratio/low_min": 0.0028956132009625435, "clip_ratio/region_mean": 0.008991934359073639, "epoch": 0.008113812070115583, "grad_norm": 0.24135182797908783, "learning_rate": 1e-06, "loss": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.02005907893180847, "clip_ratio/high_mean": 0.00842107180505991, "clip_ratio/low_mean": 0.012046420946717262, "clip_ratio/low_min": 0.004053858574479818, "clip_ratio/region_mean": 0.020467493683099747, "epoch": 0.008124747126813853, "grad_norm": 0.18042035400867462, "learning_rate": 1e-06, "loss": -0.0013, "step": 743 }, { "clip_ratio/high_max": 0.02802775241434574, "clip_ratio/high_mean": 0.011728174053132534, "clip_ratio/low_mean": 0.018203163519501686, "clip_ratio/low_min": 0.0041624438017606735, "clip_ratio/region_mean": 0.029931336641311646, "epoch": 0.008135682183512122, "grad_norm": 0.1283399611711502, "learning_rate": 1e-06, "loss": -0.0022, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 493.6607360839844, "completions/mean_terminated_length": 493.6607360839844, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.00814661724021039, "grad_norm": 0.6451539397239685, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 18693607.0, "reward": 0.6794643998146057, "reward_std": 0.3994313180446625, "rewards/accuracy_reward/mean": 0.5803571343421936, "rewards/accuracy_reward/std": 0.49571844935417175, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 745 }, { "clip_ratio/high_max": 0.023192020133137703, "clip_ratio/high_mean": 0.007843797095119953, "clip_ratio/low_mean": 0.007051930762827396, "clip_ratio/low_min": 0.002544234972447157, "clip_ratio/region_mean": 0.01489572785794735, "epoch": 0.00815755229690866, "grad_norm": 0.36681851744651794, "learning_rate": 1e-06, "loss": 0.0281, "step": 746 }, { "clip_ratio/high_max": 0.05187032371759415, "clip_ratio/high_mean": 0.01726372167468071, "clip_ratio/low_mean": 0.015198352746665478, "clip_ratio/low_min": 0.00728576397523284, "clip_ratio/region_mean": 0.03246207535266876, "epoch": 0.008168487353606929, "grad_norm": 0.2665427327156067, "learning_rate": 1e-06, "loss": 0.0265, "step": 747 }, { "clip_ratio/high_max": 0.07356608659029007, "clip_ratio/high_mean": 0.025166288018226624, "clip_ratio/low_mean": 0.02279573492705822, "clip_ratio/low_min": 0.012258586473762989, "clip_ratio/region_mean": 0.047962021082639694, "epoch": 0.008179422410305198, "grad_norm": 0.212394580245018, "learning_rate": 1e-06, "loss": 0.0253, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 1054.4732666015625, "completions/mean_terminated_length": 998.23583984375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.008190357467003466, "grad_norm": 0.3980249762535095, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 18835144.0, "reward": 0.4035714864730835, "reward_std": 0.37636902928352356, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.4655956029891968, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.28643733263015747, "step": 749 }, { "clip_ratio/high_max": 0.007952713407576084, "clip_ratio/high_mean": 0.0044593787752091885, "clip_ratio/low_mean": 0.004556109197437763, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009015487506985664, "epoch": 0.008201292523701735, "grad_norm": 0.3038676083087921, "learning_rate": 1e-06, "loss": 0.0217, "step": 750 }, { "clip_ratio/high_max": 0.015083531849086285, "clip_ratio/high_mean": 0.008330187760293484, "clip_ratio/low_mean": 0.009648213163018227, "clip_ratio/low_min": 3.7959307519486174e-05, "clip_ratio/region_mean": 0.017978401854634285, "epoch": 0.008212227580400005, "grad_norm": 0.19690334796905518, "learning_rate": 1e-06, "loss": 0.0206, "step": 751 }, { "clip_ratio/high_max": 0.023579951375722885, "clip_ratio/high_mean": 0.011997275985777378, "clip_ratio/low_mean": 0.01634816639125347, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028345447033643723, "epoch": 0.008223162637098273, "grad_norm": 0.15580826997756958, "learning_rate": 1e-06, "loss": 0.0197, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1587.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 534.1160888671875, "completions/mean_terminated_length": 534.1160888671875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.008234097693796542, "grad_norm": 0.5116481184959412, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 18909861.0, "reward": 0.9017858505249023, "reward_std": 0.26114949584007263, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3920665979385376, "rewards/format_reward/mean": 0.8928571343421936, "rewards/format_reward/std": 0.3106848895549774, "step": 753 }, { "clip_ratio/high_max": 0.014212438836693764, "clip_ratio/high_mean": 0.007314757443964481, "clip_ratio/low_mean": 0.005167030729353428, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012481787241995335, "epoch": 0.008245032750494811, "grad_norm": 0.3179207742214203, "learning_rate": 1e-06, "loss": 0.0039, "step": 754 }, { "clip_ratio/high_max": 0.023829877376556396, "clip_ratio/high_mean": 0.012220687232911587, "clip_ratio/low_mean": 0.00998397171497345, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022204658016562462, "epoch": 0.008255967807193081, "grad_norm": 0.1711912751197815, "learning_rate": 1e-06, "loss": 0.0029, "step": 755 }, { "clip_ratio/high_max": 0.03269929438829422, "clip_ratio/high_mean": 0.016848567873239517, "clip_ratio/low_mean": 0.013914257287979126, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030762825161218643, "epoch": 0.008266902863891349, "grad_norm": 0.13107645511627197, "learning_rate": 1e-06, "loss": 0.0022, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 596.4107666015625, "completions/mean_terminated_length": 596.4107666015625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.008277837920589618, "grad_norm": 0.7147883176803589, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 18995699.0, "reward": 0.6473215222358704, "reward_std": 0.46507367491722107, "rewards/accuracy_reward/mean": 0.5535714030265808, "rewards/accuracy_reward/std": 0.49935609102249146, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 757 }, { "clip_ratio/high_max": 0.016208428889513016, "clip_ratio/high_mean": 0.00780579773709178, "clip_ratio/low_mean": 0.007798945065587759, "clip_ratio/low_min": 0.0034890775568783283, "clip_ratio/region_mean": 0.015604743734002113, "epoch": 0.008288772977287888, "grad_norm": 0.4686950743198395, "learning_rate": 1e-06, "loss": 0.0232, "step": 758 }, { "clip_ratio/high_max": 0.038900226354599, "clip_ratio/high_mean": 0.017311694100499153, "clip_ratio/low_mean": 0.018392696976661682, "clip_ratio/low_min": 0.007357402704656124, "clip_ratio/region_mean": 0.035704392939805984, "epoch": 0.008299708033986157, "grad_norm": 0.3218750059604645, "learning_rate": 1e-06, "loss": 0.0208, "step": 759 }, { "clip_ratio/high_max": 0.06075159087777138, "clip_ratio/high_mean": 0.026445509865880013, "clip_ratio/low_mean": 0.028444642201066017, "clip_ratio/low_min": 0.014638956636190414, "clip_ratio/region_mean": 0.05489015206694603, "epoch": 0.008310643090684425, "grad_norm": 0.24479302763938904, "learning_rate": 1e-06, "loss": 0.019, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1209.9107666015625, "completions/mean_terminated_length": 1145.4423828125, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.008321578147382694, "grad_norm": 0.28159797191619873, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 19150949.0, "reward": 0.17678575217723846, "reward_std": 0.14433379471302032, "rewards/accuracy_reward/mean": 0.0892857164144516, "rewards/accuracy_reward/std": 0.28643733263015747, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33220529556274414, "step": 761 }, { "clip_ratio/high_max": 0.010417348705232143, "clip_ratio/high_mean": 0.0048586102202534676, "clip_ratio/low_mean": 0.0030388194136321545, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00789743009954691, "epoch": 0.008332513204080964, "grad_norm": 0.1762225329875946, "learning_rate": 1e-06, "loss": -0.0033, "step": 762 }, { "clip_ratio/high_max": 0.021620912477374077, "clip_ratio/high_mean": 0.008171909488737583, "clip_ratio/low_mean": 0.007931890897452831, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016103800386190414, "epoch": 0.008343448260779231, "grad_norm": 0.1394502967596054, "learning_rate": 1e-06, "loss": -0.0041, "step": 763 }, { "clip_ratio/high_max": 0.030531350523233414, "clip_ratio/high_mean": 0.011336500756442547, "clip_ratio/low_mean": 0.011900514364242554, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023237016052007675, "epoch": 0.0083543833174775, "grad_norm": 0.09656234085559845, "learning_rate": 1e-06, "loss": -0.0048, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1130.1429443359375, "completions/mean_terminated_length": 1078.188720703125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.00836531837417577, "grad_norm": 0.203691303730011, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 19298077.0, "reward": 0.1473214328289032, "reward_std": 0.1530575156211853, "rewards/accuracy_reward/mean": 0.0535714291036129, "rewards/accuracy_reward/std": 0.2261820137500763, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 765 }, { "clip_ratio/high_max": 0.0053910063579678535, "clip_ratio/high_mean": 0.0022784036118537188, "clip_ratio/low_mean": 0.003578033298254013, "clip_ratio/low_min": 0.0007160758832469583, "clip_ratio/region_mean": 0.005856436211615801, "epoch": 0.00837625343087404, "grad_norm": 0.1121174618601799, "learning_rate": 1e-06, "loss": -0.0101, "step": 766 }, { "clip_ratio/high_max": 0.009069575928151608, "clip_ratio/high_mean": 0.003662788076326251, "clip_ratio/low_mean": 0.007238627411425114, "clip_ratio/low_min": 0.0012620218330994248, "clip_ratio/region_mean": 0.010901415720582008, "epoch": 0.008387188487572307, "grad_norm": 0.07384546101093292, "learning_rate": 1e-06, "loss": -0.0105, "step": 767 }, { "clip_ratio/high_max": 0.011669943109154701, "clip_ratio/high_mean": 0.004608395043760538, "clip_ratio/low_mean": 0.011862481012940407, "clip_ratio/low_min": 0.0020018278155475855, "clip_ratio/region_mean": 0.016470875591039658, "epoch": 0.008398123544270577, "grad_norm": 0.05700743570923805, "learning_rate": 1e-06, "loss": -0.0107, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 1116.27685546875, "completions/mean_terminated_length": 1063.5377197265625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.008409058600968846, "grad_norm": 0.2753744423389435, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 19439200.0, "reward": 0.20446430146694183, "reward_std": 0.18953277170658112, "rewards/accuracy_reward/mean": 0.1160714253783226, "rewards/accuracy_reward/std": 0.3217501640319824, "rewards/format_reward/mean": 0.8839285969734192, "rewards/format_reward/std": 0.32175013422966003, "step": 769 }, { "clip_ratio/high_max": 0.010543680749833584, "clip_ratio/high_mean": 0.004239484667778015, "clip_ratio/low_mean": 0.0036365720443427563, "clip_ratio/low_min": 0.0004694100935012102, "clip_ratio/region_mean": 0.007876056246459484, "epoch": 0.008419993657667116, "grad_norm": 0.16620196402072906, "learning_rate": 1e-06, "loss": 0.0072, "step": 770 }, { "clip_ratio/high_max": 0.01936594396829605, "clip_ratio/high_mean": 0.006573711056262255, "clip_ratio/low_mean": 0.007656029425561428, "clip_ratio/low_min": 0.0007041151402518153, "clip_ratio/region_mean": 0.014229740016162395, "epoch": 0.008430928714365384, "grad_norm": 0.10802574455738068, "learning_rate": 1e-06, "loss": 0.0067, "step": 771 }, { "clip_ratio/high_max": 0.023956390097737312, "clip_ratio/high_mean": 0.008167299441993237, "clip_ratio/low_mean": 0.011305254884064198, "clip_ratio/low_min": 0.0022688154131174088, "clip_ratio/region_mean": 0.019472552463412285, "epoch": 0.008441863771063653, "grad_norm": 0.08295020461082458, "learning_rate": 1e-06, "loss": 0.0063, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 798.6160888671875, "completions/mean_terminated_length": 787.3603515625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.008452798827761922, "grad_norm": 0.5203120708465576, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 19544969.0, "reward": 0.5544643402099609, "reward_std": 0.47354254126548767, "rewards/accuracy_reward/mean": 0.4553571343421936, "rewards/accuracy_reward/std": 0.500241219997406, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 773 }, { "clip_ratio/high_max": 0.007098786067217588, "clip_ratio/high_mean": 0.005304626189172268, "clip_ratio/low_mean": 0.00574969407171011, "clip_ratio/low_min": 0.0029191270004957914, "clip_ratio/region_mean": 0.011054320260882378, "epoch": 0.00846373388446019, "grad_norm": 0.33358073234558105, "learning_rate": 1e-06, "loss": 0.0157, "step": 774 }, { "clip_ratio/high_max": 0.015922510996460915, "clip_ratio/high_mean": 0.010630419477820396, "clip_ratio/low_mean": 0.013667411170899868, "clip_ratio/low_min": 0.007165129762142897, "clip_ratio/region_mean": 0.02429782971739769, "epoch": 0.00847466894115846, "grad_norm": 0.22611504793167114, "learning_rate": 1e-06, "loss": 0.0142, "step": 775 }, { "clip_ratio/high_max": 0.024945266544818878, "clip_ratio/high_mean": 0.015270051546394825, "clip_ratio/low_mean": 0.022555099800229073, "clip_ratio/low_min": 0.01280435174703598, "clip_ratio/region_mean": 0.037825148552656174, "epoch": 0.008485603997856729, "grad_norm": 0.20297817885875702, "learning_rate": 1e-06, "loss": 0.0128, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1660.0, "completions/mean_length": 573.6964721679688, "completions/mean_terminated_length": 560.4144287109375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.008496539054554999, "grad_norm": 0.37112441658973694, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 19625211.0, "reward": 0.3142857551574707, "reward_std": 0.17003019154071808, "rewards/accuracy_reward/mean": 0.2142857164144516, "rewards/accuracy_reward/std": 0.41217002272605896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.00869443267583847, "clip_ratio/high_mean": 0.0024560322053730488, "clip_ratio/low_mean": 0.0061814673244953156, "clip_ratio/low_min": 0.003274712013080716, "clip_ratio/region_mean": 0.008637499995529652, "epoch": 0.008507474111253266, "grad_norm": 0.1966945230960846, "learning_rate": 1e-06, "loss": 0.0225, "step": 778 }, { "clip_ratio/high_max": 0.01710839942097664, "clip_ratio/high_mean": 0.004612747114151716, "clip_ratio/low_mean": 0.012004241347312927, "clip_ratio/low_min": 0.006306852679699659, "clip_ratio/region_mean": 0.01661699078977108, "epoch": 0.008518409167951536, "grad_norm": 0.16587376594543457, "learning_rate": 1e-06, "loss": 0.0218, "step": 779 }, { "clip_ratio/high_max": 0.022857943549752235, "clip_ratio/high_mean": 0.006333277560770512, "clip_ratio/low_mean": 0.01754133217036724, "clip_ratio/low_min": 0.008247422985732555, "clip_ratio/region_mean": 0.023874608799815178, "epoch": 0.008529344224649805, "grad_norm": 0.18182896077632904, "learning_rate": 1e-06, "loss": 0.0214, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 578.3392944335938, "completions/mean_terminated_length": 578.3392944335938, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.008540279281348075, "grad_norm": 0.6873217225074768, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 19702977.0, "reward": 0.5464286208152771, "reward_std": 0.3133915960788727, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49935612082481384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.02667766809463501, "clip_ratio/high_mean": 0.008835951797664165, "clip_ratio/low_mean": 0.006883976049721241, "clip_ratio/low_min": 0.002530626254156232, "clip_ratio/region_mean": 0.015719927847385406, "epoch": 0.008551214338046342, "grad_norm": 0.33417874574661255, "learning_rate": 1e-06, "loss": 0.0108, "step": 782 }, { "clip_ratio/high_max": 0.050605058670043945, "clip_ratio/high_mean": 0.016358477994799614, "clip_ratio/low_mean": 0.012361481785774231, "clip_ratio/low_min": 0.0035083682741969824, "clip_ratio/region_mean": 0.028719961643218994, "epoch": 0.008562149394744612, "grad_norm": 0.2937104105949402, "learning_rate": 1e-06, "loss": 0.01, "step": 783 }, { "clip_ratio/high_max": 0.06408140808343887, "clip_ratio/high_mean": 0.02048657275736332, "clip_ratio/low_mean": 0.01720830425620079, "clip_ratio/low_min": 0.004198539070785046, "clip_ratio/region_mean": 0.03769487142562866, "epoch": 0.008573084451442881, "grad_norm": 0.2077254205942154, "learning_rate": 1e-06, "loss": 0.0094, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 786.3660888671875, "completions/mean_terminated_length": 775.0, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.008584019508141149, "grad_norm": 0.4166552722454071, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 19810202.0, "reward": 0.5455358028411865, "reward_std": 0.3526986241340637, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49935609102249146, "rewards/format_reward/mean": 0.9910714030265808, "rewards/format_reward/std": 0.09449111670255661, "step": 785 }, { "clip_ratio/high_max": 0.009555310942232609, "clip_ratio/high_mean": 0.0051579647697508335, "clip_ratio/low_mean": 0.004344066139310598, "clip_ratio/low_min": 0.002613120712339878, "clip_ratio/region_mean": 0.009502030909061432, "epoch": 0.008594954564839418, "grad_norm": 0.25492680072784424, "learning_rate": 1e-06, "loss": 0.0005, "step": 786 }, { "clip_ratio/high_max": 0.017316320911049843, "clip_ratio/high_mean": 0.01010274887084961, "clip_ratio/low_mean": 0.009993023239076138, "clip_ratio/low_min": 0.002062989864498377, "clip_ratio/region_mean": 0.020095771178603172, "epoch": 0.008605889621537688, "grad_norm": 0.1915903240442276, "learning_rate": 1e-06, "loss": -0.0007, "step": 787 }, { "clip_ratio/high_max": 0.024957936257123947, "clip_ratio/high_mean": 0.0141238272190094, "clip_ratio/low_mean": 0.016000736504793167, "clip_ratio/low_min": 0.003300783922895789, "clip_ratio/region_mean": 0.030124563723802567, "epoch": 0.008616824678235957, "grad_norm": 0.14078226685523987, "learning_rate": 1e-06, "loss": -0.0017, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1056.0804443359375, "completions/mean_terminated_length": 1038.04541015625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.008627759734934225, "grad_norm": 0.31185683608055115, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 19954691.0, "reward": 0.18928572535514832, "reward_std": 0.12198752164840698, "rewards/accuracy_reward/mean": 0.0892857164144516, "rewards/accuracy_reward/std": 0.28643733263015747, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.010956902988255024, "clip_ratio/high_mean": 0.0024396958760917187, "clip_ratio/low_mean": 0.006565455812960863, "clip_ratio/low_min": 0.0018557289149612188, "clip_ratio/region_mean": 0.009005150757730007, "epoch": 0.008638694791632495, "grad_norm": 0.17656078934669495, "learning_rate": 1e-06, "loss": 0.0069, "step": 790 }, { "clip_ratio/high_max": 0.01771366037428379, "clip_ratio/high_mean": 0.003923672717064619, "clip_ratio/low_mean": 0.01204883772879839, "clip_ratio/low_min": 0.0027122192550450563, "clip_ratio/region_mean": 0.01597250998020172, "epoch": 0.008649629848330764, "grad_norm": 0.1417316049337387, "learning_rate": 1e-06, "loss": 0.0064, "step": 791 }, { "clip_ratio/high_max": 0.02063550055027008, "clip_ratio/high_mean": 0.004622672684490681, "clip_ratio/low_mean": 0.015119309537112713, "clip_ratio/low_min": 0.003568709595128894, "clip_ratio/region_mean": 0.019741982221603394, "epoch": 0.008660564905029033, "grad_norm": 0.1133890226483345, "learning_rate": 1e-06, "loss": 0.0061, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 704.2589721679688, "completions/mean_terminated_length": 704.2589721679688, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.008671499961727301, "grad_norm": 0.5069441795349121, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 20051712.0, "reward": 0.4196428954601288, "reward_std": 0.3156649172306061, "rewards/accuracy_reward/mean": 0.3214285671710968, "rewards/accuracy_reward/std": 0.46912387013435364, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 793 }, { "clip_ratio/high_max": 0.009909521788358688, "clip_ratio/high_mean": 0.004713957663625479, "clip_ratio/low_mean": 0.007085656281560659, "clip_ratio/low_min": 0.001978927059099078, "clip_ratio/region_mean": 0.011799613945186138, "epoch": 0.00868243501842557, "grad_norm": 0.3141786754131317, "learning_rate": 1e-06, "loss": -0.0026, "step": 794 }, { "clip_ratio/high_max": 0.019590606912970543, "clip_ratio/high_mean": 0.008620512671768665, "clip_ratio/low_mean": 0.014021366834640503, "clip_ratio/low_min": 0.0024068031925708055, "clip_ratio/region_mean": 0.022641880437731743, "epoch": 0.00869337007512384, "grad_norm": 0.19202712178230286, "learning_rate": 1e-06, "loss": -0.0034, "step": 795 }, { "clip_ratio/high_max": 0.02499058097600937, "clip_ratio/high_mean": 0.010858972556889057, "clip_ratio/low_mean": 0.02072094939649105, "clip_ratio/low_min": 0.0022998342756181955, "clip_ratio/region_mean": 0.03157992288470268, "epoch": 0.008704305131822108, "grad_norm": 0.19308532774448395, "learning_rate": 1e-06, "loss": -0.004, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 575.9910888671875, "completions/mean_terminated_length": 562.729736328125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.008715240188520377, "grad_norm": 0.5224965214729309, "learning_rate": 1e-06, "loss": 0.0401, "num_tokens": 20130979.0, "reward": 0.6160714626312256, "reward_std": 0.24877862632274628, "rewards/accuracy_reward/mean": 0.5178571343421936, "rewards/accuracy_reward/std": 0.5019267797470093, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 797 }, { "clip_ratio/high_max": 0.010973144322633743, "clip_ratio/high_mean": 0.005154285579919815, "clip_ratio/low_mean": 0.008016417734324932, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013170704245567322, "epoch": 0.008726175245218647, "grad_norm": 0.3340732455253601, "learning_rate": 1e-06, "loss": 0.0385, "step": 798 }, { "clip_ratio/high_max": 0.022523824125528336, "clip_ratio/high_mean": 0.009669343940913677, "clip_ratio/low_mean": 0.019339973106980324, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029009316116571426, "epoch": 0.008737110301916916, "grad_norm": 0.2489721029996872, "learning_rate": 1e-06, "loss": 0.0368, "step": 799 }, { "clip_ratio/high_max": 0.030416786670684814, "clip_ratio/high_mean": 0.012583106756210327, "clip_ratio/low_mean": 0.030634889379143715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.043217990547418594, "epoch": 0.008748045358615184, "grad_norm": 0.17902347445487976, "learning_rate": 1e-06, "loss": 0.0353, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 600.7678833007812, "completions/mean_terminated_length": 587.729736328125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.008758980415313453, "grad_norm": 0.41358280181884766, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 20215485.0, "reward": 0.602678656578064, "reward_std": 0.3210139870643616, "rewards/accuracy_reward/mean": 0.5089285969734192, "rewards/accuracy_reward/std": 0.5021671056747437, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24314938485622406, "step": 801 }, { "clip_ratio/high_max": 0.019527051597833633, "clip_ratio/high_mean": 0.006734281778335571, "clip_ratio/low_mean": 0.004694811999797821, "clip_ratio/low_min": 0.0003462603781372309, "clip_ratio/region_mean": 0.011429092846810818, "epoch": 0.008769915472011723, "grad_norm": 0.21487440168857574, "learning_rate": 1e-06, "loss": 0.0061, "step": 802 }, { "clip_ratio/high_max": 0.03780006989836693, "clip_ratio/high_mean": 0.012063483707606792, "clip_ratio/low_mean": 0.010431136004626751, "clip_ratio/low_min": 0.0017313018906861544, "clip_ratio/region_mean": 0.022494623437523842, "epoch": 0.008780850528709992, "grad_norm": 0.13981866836547852, "learning_rate": 1e-06, "loss": 0.0053, "step": 803 }, { "clip_ratio/high_max": 0.05141526460647583, "clip_ratio/high_mean": 0.016611265018582344, "clip_ratio/low_mean": 0.01592324674129486, "clip_ratio/low_min": 0.0022506925743073225, "clip_ratio/region_mean": 0.032534513622522354, "epoch": 0.00879178558540826, "grad_norm": 0.10852910578250885, "learning_rate": 1e-06, "loss": 0.0048, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 784.9375610351562, "completions/mean_terminated_length": 773.55859375, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.00880272064210653, "grad_norm": 0.326577752828598, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 20320478.0, "reward": 0.2500000596046448, "reward_std": 0.3189806342124939, "rewards/accuracy_reward/mean": 0.1517857164144516, "rewards/accuracy_reward/std": 0.3604257106781006, "rewards/format_reward/mean": 0.9821428656578064, "rewards/format_reward/std": 0.1330273300409317, "step": 805 }, { "clip_ratio/high_max": 0.009823443368077278, "clip_ratio/high_mean": 0.002354908036068082, "clip_ratio/low_mean": 0.005174161400645971, "clip_ratio/low_min": 0.002203775802627206, "clip_ratio/region_mean": 0.007529069669544697, "epoch": 0.008813655698804799, "grad_norm": 0.18228960037231445, "learning_rate": 1e-06, "loss": 0.0184, "step": 806 }, { "clip_ratio/high_max": 0.01566440984606743, "clip_ratio/high_mean": 0.003688623895868659, "clip_ratio/low_mean": 0.011758147738873959, "clip_ratio/low_min": 0.004808675963431597, "clip_ratio/region_mean": 0.015446772798895836, "epoch": 0.008824590755503067, "grad_norm": 0.14780576527118683, "learning_rate": 1e-06, "loss": 0.0177, "step": 807 }, { "clip_ratio/high_max": 0.01885039173066616, "clip_ratio/high_mean": 0.004579274915158749, "clip_ratio/low_mean": 0.01847652904689312, "clip_ratio/low_min": 0.008668185211718082, "clip_ratio/region_mean": 0.023055804893374443, "epoch": 0.008835525812201336, "grad_norm": 0.10403870046138763, "learning_rate": 1e-06, "loss": 0.0173, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 565.4285888671875, "completions/mean_terminated_length": 565.4285888671875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.008846460868899605, "grad_norm": 0.6891185641288757, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 20399514.0, "reward": 0.5107144117355347, "reward_std": 0.4512324631214142, "rewards/accuracy_reward/mean": 0.4107142984867096, "rewards/accuracy_reward/std": 0.4941745698451996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.009479671716690063, "clip_ratio/high_mean": 0.005271228961646557, "clip_ratio/low_mean": 0.0071245781145989895, "clip_ratio/low_min": 0.0031620552763342857, "clip_ratio/region_mean": 0.012395807541906834, "epoch": 0.008857395925597875, "grad_norm": 0.38028645515441895, "learning_rate": 1e-06, "loss": 0.0182, "step": 810 }, { "clip_ratio/high_max": 0.015593561343848705, "clip_ratio/high_mean": 0.010019190609455109, "clip_ratio/low_mean": 0.014472806826233864, "clip_ratio/low_min": 0.007246376946568489, "clip_ratio/region_mean": 0.02449199929833412, "epoch": 0.008868330982296143, "grad_norm": 0.29526934027671814, "learning_rate": 1e-06, "loss": 0.0168, "step": 811 }, { "clip_ratio/high_max": 0.027464788407087326, "clip_ratio/high_mean": 0.013788972981274128, "clip_ratio/low_mean": 0.02412463165819645, "clip_ratio/low_min": 0.014850798062980175, "clip_ratio/region_mean": 0.03791360184550285, "epoch": 0.008879266038994412, "grad_norm": 0.43785184621810913, "learning_rate": 1e-06, "loss": 0.0158, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1722.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 764.3392944335938, "completions/mean_terminated_length": 764.3392944335938, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.008890201095692682, "grad_norm": 0.5152819156646729, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 20504076.0, "reward": 0.2875000536441803, "reward_std": 0.31980836391448975, "rewards/accuracy_reward/mean": 0.1964285671710968, "rewards/accuracy_reward/std": 0.3990819752216339, "rewards/format_reward/mean": 0.9107142686843872, "rewards/format_reward/std": 0.28643733263015747, "step": 813 }, { "clip_ratio/high_max": 0.013293605297803879, "clip_ratio/high_mean": 0.005804696120321751, "clip_ratio/low_mean": 0.007584002334624529, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013388699851930141, "epoch": 0.00890113615239095, "grad_norm": 0.32395365834236145, "learning_rate": 1e-06, "loss": 0.0037, "step": 814 }, { "clip_ratio/high_max": 0.028191611170768738, "clip_ratio/high_mean": 0.011492626741528511, "clip_ratio/low_mean": 0.018982218578457832, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030474845319986343, "epoch": 0.008912071209089219, "grad_norm": 0.23724950850009918, "learning_rate": 1e-06, "loss": 0.0019, "step": 815 }, { "clip_ratio/high_max": 0.04400641843676567, "clip_ratio/high_mean": 0.016419801861047745, "clip_ratio/low_mean": 0.029671574011445045, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04609137400984764, "epoch": 0.008923006265787488, "grad_norm": 0.17210781574249268, "learning_rate": 1e-06, "loss": 0.0006, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1021.6607666015625, "completions/mean_terminated_length": 1012.4144287109375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.008933941322485758, "grad_norm": 0.44509416818618774, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 20635974.0, "reward": 0.36250007152557373, "reward_std": 0.26617300510406494, "rewards/accuracy_reward/mean": 0.2678571343421936, "rewards/accuracy_reward/std": 0.44483304023742676, "rewards/format_reward/mean": 0.9464285969734192, "rewards/format_reward/std": 0.2261820137500763, "step": 817 }, { "epoch": 0.008933941322485758, "step": 817, "total_flos": 0.0, "train_loss": 0.0056144091926679214, "train_runtime": 23301.359, "train_samples_per_second": 3.927, "train_steps_per_second": 0.035 } ], "logging_steps": 1, "max_steps": 817, "num_input_tokens_seen": 20635974, "num_train_epochs": 1, "save_steps": 41, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }