{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2857142857142857, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2700.5104370117188, "entropy": 0.3671875, "epoch": 0.001142857142857143, "grad_norm": 0.11866585910320282, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.7604166893288493, "reward_std": 0.4268697127699852, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.5104166669771075, "step": 1 }, { "completion_length": 3164.5729370117188, "entropy": 0.35498046875, "epoch": 0.002285714285714286, "grad_norm": 0.11806796491146088, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0, "reward": 0.6875000204890966, "reward_std": 0.36165641620755196, "rewards/accuracy_reward": 0.3020833386108279, "rewards/format_reward": 0.3854166818782687, "step": 2 }, { "completion_length": 3615.7500610351562, "entropy": 0.45654296875, "epoch": 0.0034285714285714284, "grad_norm": 0.13286341726779938, "kl": 4.506111145019531e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": 0.18750000279396772, "reward_std": 0.23272089660167694, "rewards/accuracy_reward": 0.05208333395421505, "rewards/format_reward": 0.13541666697710752, "step": 3 }, { "completion_length": 2482.416717529297, "entropy": 0.40869140625, "epoch": 0.004571428571428572, "grad_norm": 0.14147797226905823, "kl": 3.30805778503418e-05, "learning_rate": 1.2e-07, "loss": 0.0, "reward": 0.8645833730697632, "reward_std": 0.4684411771595478, "rewards/accuracy_reward": 0.18750000279396772, "rewards/format_reward": 0.677083358168602, "step": 4 }, { "completion_length": 3591.3646850585938, "entropy": 0.45947265625, "epoch": 0.005714285714285714, "grad_norm": 0.13485956192016602, "kl": 4.0203332901000977e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.3750000074505806, "reward_std": 0.4921039678156376, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.3333333432674408, "step": 5 }, { "completion_length": 3477.2396850585938, "entropy": 0.45654296875, "epoch": 0.006857142857142857, "grad_norm": 0.13642774522304535, "kl": 4.4018030166625977e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.2812500074505806, "reward_std": 0.3338681757450104, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.25, "step": 6 }, { "completion_length": 3389.4584350585938, "entropy": 0.3916015625, "epoch": 0.008, "grad_norm": 0.1285017877817154, "kl": 2.73287296295166e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.739583358168602, "reward_std": 0.6624889373779297, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.5104166865348816, "step": 7 }, { "completion_length": 2890.104248046875, "entropy": 0.343017578125, "epoch": 0.009142857142857144, "grad_norm": 0.07784460484981537, "kl": 2.562999725341797e-05, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.8541667014360428, "reward_std": 0.31141985207796097, "rewards/accuracy_reward": 0.3750000102445483, "rewards/format_reward": 0.4791666716337204, "step": 8 }, { "completion_length": 3353.1875610351562, "entropy": 0.4384765625, "epoch": 0.010285714285714285, "grad_norm": 0.1676989048719406, "kl": 4.4345855712890625e-05, "learning_rate": 3.2e-07, "loss": 0.0, "reward": 0.5000000074505806, "reward_std": 0.533780675381422, "rewards/accuracy_reward": 0.13541667070239782, "rewards/format_reward": 0.3645833432674408, "step": 9 }, { "completion_length": 2996.1875610351562, "entropy": 0.3466796875, "epoch": 0.011428571428571429, "grad_norm": 0.15292252600193024, "kl": 3.3229589462280273e-05, "learning_rate": 3.6e-07, "loss": 0.0, "reward": 0.552083358168602, "reward_std": 0.4631676971912384, "rewards/accuracy_reward": 0.13541667256504297, "rewards/format_reward": 0.4166666753590107, "step": 10 }, { "completion_length": 3595.4063720703125, "entropy": 0.38134765625, "epoch": 0.012571428571428572, "grad_norm": 0.15428116917610168, "kl": 3.191828727722168e-05, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.260416672565043, "reward_std": 0.4286932796239853, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.1666666679084301, "step": 11 }, { "completion_length": 2551.479217529297, "entropy": 0.41015625, "epoch": 0.013714285714285714, "grad_norm": 0.1346665471792221, "kl": 3.972649574279785e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.8125000447034836, "reward_std": 0.455630861222744, "rewards/accuracy_reward": 0.17708333395421505, "rewards/format_reward": 0.6354166865348816, "step": 12 }, { "completion_length": 3197.1146240234375, "entropy": 0.40625, "epoch": 0.014857142857142857, "grad_norm": 0.11476687341928482, "kl": 3.629922866821289e-05, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.541666679084301, "reward_std": 0.28364068269729614, "rewards/accuracy_reward": 0.15625000279396772, "rewards/format_reward": 0.3854166716337204, "step": 13 }, { "completion_length": 2990.2709350585938, "entropy": 0.36376953125, "epoch": 0.016, "grad_norm": 0.19115598499774933, "kl": 2.4765729904174805e-05, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.6875000298023224, "reward_std": 0.4306366816163063, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.4791666716337204, "step": 14 }, { "completion_length": 2998.2084350585938, "entropy": 0.376708984375, "epoch": 0.017142857142857144, "grad_norm": 0.08084020018577576, "kl": 2.4259090423583984e-05, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.65625, "reward_std": 0.2722426578402519, "rewards/accuracy_reward": 0.2604166679084301, "rewards/format_reward": 0.3958333358168602, "step": 15 }, { "completion_length": 3789.6251220703125, "entropy": 0.44580078125, "epoch": 0.018285714285714287, "grad_norm": 0.13743434846401215, "kl": 3.844499588012695e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.28125000558793545, "reward_std": 0.43100808560848236, "rewards/accuracy_reward": 0.11458333488553762, "rewards/format_reward": 0.16666666977107525, "step": 16 }, { "completion_length": 2505.7188110351562, "entropy": 0.45361328125, "epoch": 0.019428571428571427, "grad_norm": 0.1941273808479309, "kl": 3.5628676414489746e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 0.8541666865348816, "reward_std": 0.42319394648075104, "rewards/accuracy_reward": 0.2500000102445483, "rewards/format_reward": 0.6041666716337204, "step": 17 }, { "completion_length": 3116.479248046875, "entropy": 0.36669921875, "epoch": 0.02057142857142857, "grad_norm": 0.1034688651561737, "kl": 2.0712614059448242e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.645833358168602, "reward_std": 0.3843524754047394, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.4583333507180214, "step": 18 }, { "completion_length": 3206.135498046875, "entropy": 0.3837890625, "epoch": 0.021714285714285714, "grad_norm": 0.12397009134292603, "kl": 1.7814338207244873e-05, "learning_rate": 7.2e-07, "loss": 0.0, "reward": 0.6041666939854622, "reward_std": 0.49627041071653366, "rewards/accuracy_reward": 0.20833334419876337, "rewards/format_reward": 0.3958333432674408, "step": 19 }, { "completion_length": 2758.656280517578, "entropy": 0.346435546875, "epoch": 0.022857142857142857, "grad_norm": 0.14725361764431, "kl": 1.5437602996826172e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": 0.9270833879709244, "reward_std": 0.5184547901153564, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.614583358168602, "step": 20 }, { "completion_length": 2928.7083740234375, "entropy": 0.419921875, "epoch": 0.024, "grad_norm": 0.16036786139011383, "kl": 3.966689109802246e-05, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.6458333544433117, "reward_std": 0.45568280667066574, "rewards/accuracy_reward": 0.18750000465661287, "rewards/format_reward": 0.45833334140479565, "step": 21 }, { "completion_length": 1764.3541870117188, "entropy": 0.3837890625, "epoch": 0.025142857142857144, "grad_norm": 0.20493587851524353, "kl": 3.927946090698242e-05, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": 1.177083358168602, "reward_std": 0.3750041276216507, "rewards/accuracy_reward": 0.3125000102445483, "rewards/format_reward": 0.8645833432674408, "step": 22 }, { "completion_length": 2654.7188110351562, "entropy": 0.37841796875, "epoch": 0.026285714285714287, "grad_norm": 0.14049632847309113, "kl": 2.409517765045166e-05, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.7395833507180214, "reward_std": 0.5375720374286175, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.5520833507180214, "step": 23 }, { "completion_length": 2969.5000610351562, "entropy": 0.369140625, "epoch": 0.027428571428571427, "grad_norm": 0.16432780027389526, "kl": 3.295391798019409e-05, "learning_rate": 9.2e-07, "loss": 0.0, "reward": 0.7708333432674408, "reward_std": 0.46594493091106415, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.5208333432674408, "step": 24 }, { "completion_length": 2851.1458740234375, "entropy": 0.4326171875, "epoch": 0.02857142857142857, "grad_norm": 0.13283270597457886, "kl": 8.803606033325195e-05, "learning_rate": 9.6e-07, "loss": 0.0, "reward": 0.6666666865348816, "reward_std": 0.34967152029275894, "rewards/accuracy_reward": 0.2395833358168602, "rewards/format_reward": 0.4270833432674408, "step": 25 }, { "completion_length": 3068.9063110351562, "entropy": 0.40234375, "epoch": 0.029714285714285714, "grad_norm": 0.08674507588148117, "kl": 2.0168721675872803e-05, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.6666666865348816, "reward_std": 0.272128164768219, "rewards/accuracy_reward": 0.19791666977107525, "rewards/format_reward": 0.4687500149011612, "step": 26 }, { "completion_length": 3239.416748046875, "entropy": 0.43212890625, "epoch": 0.030857142857142857, "grad_norm": 0.13486000895500183, "kl": 4.280870780348778e-05, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.6562500223517418, "reward_std": 0.47437138110399246, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.4687500149011612, "step": 27 }, { "completion_length": 3000.885498046875, "entropy": 0.4033203125, "epoch": 0.032, "grad_norm": 0.12638980150222778, "kl": 7.337331771850586e-05, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.7604166865348816, "reward_std": 0.4810705706477165, "rewards/accuracy_reward": 0.3229166716337204, "rewards/format_reward": 0.4375000074505806, "step": 28 }, { "completion_length": 3736.104248046875, "entropy": 0.4453125, "epoch": 0.03314285714285714, "grad_norm": 0.17240460216999054, "kl": 0.00014406442642211914, "learning_rate": 9.996052735444862e-07, "loss": 0.0, "reward": 0.2708333469927311, "reward_std": 0.4822928011417389, "rewards/accuracy_reward": 0.07291666697710752, "rewards/format_reward": 0.19791667256504297, "step": 29 }, { "completion_length": 3230.916748046875, "entropy": 0.3857421875, "epoch": 0.03428571428571429, "grad_norm": 0.11792454868555069, "kl": 0.00036847591400146484, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.7500000353902578, "reward_std": 0.6145796477794647, "rewards/accuracy_reward": 0.281250006519258, "rewards/format_reward": 0.4687500102445483, "step": 30 }, { "completion_length": 3289.6771240234375, "entropy": 0.39013671875, "epoch": 0.03542857142857143, "grad_norm": 0.14808149635791779, "kl": 0.0001868605613708496, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.41467901691794395, "rewards/accuracy_reward": 0.17708333861082792, "rewards/format_reward": 0.322916679084301, "step": 31 }, { "completion_length": 3417.041748046875, "entropy": 0.44775390625, "epoch": 0.036571428571428574, "grad_norm": 0.10391438752412796, "kl": 0.00020647048950195312, "learning_rate": 9.98421786662277e-07, "loss": 0.0, "reward": 0.7291666977107525, "reward_std": 0.4650338739156723, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.43750001303851604, "step": 32 }, { "completion_length": 3650.1251220703125, "entropy": 0.3798828125, "epoch": 0.037714285714285714, "grad_norm": 0.10379055887460709, "kl": 0.0002885758876800537, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.42708334140479565, "reward_std": 0.5094060599803925, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.2604166781529784, "step": 33 }, { "completion_length": 2580.8438110351562, "entropy": 0.43701171875, "epoch": 0.038857142857142854, "grad_norm": 0.14274698495864868, "kl": 0.000626683235168457, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.8645833563059568, "reward_std": 0.5226760059595108, "rewards/accuracy_reward": 0.33333334885537624, "rewards/format_reward": 0.5312500149011612, "step": 34 }, { "completion_length": 3257.8333740234375, "entropy": 0.42333984375, "epoch": 0.04, "grad_norm": 0.15037870407104492, "kl": 0.0006988048553466797, "learning_rate": 9.964516155915151e-07, "loss": 0.0, "reward": 0.5520833507180214, "reward_std": 0.5667570382356644, "rewards/accuracy_reward": 0.19791666883975267, "rewards/format_reward": 0.35416668467223644, "step": 35 }, { "completion_length": 3751.5000610351562, "entropy": 0.50048828125, "epoch": 0.04114285714285714, "grad_norm": 0.1191205084323883, "kl": 0.0008706152439117432, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "reward": 0.22916667442768812, "reward_std": 0.4133975952863693, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.19791666697710752, "step": 36 }, { "completion_length": 3529.010498046875, "entropy": 0.4375, "epoch": 0.04228571428571429, "grad_norm": 0.09502461552619934, "kl": 0.0006368160247802734, "learning_rate": 9.947027716509488e-07, "loss": 0.0, "reward": 0.27083334140479565, "reward_std": 0.1726192608475685, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.23958333861082792, "step": 37 }, { "completion_length": 3582.3646240234375, "entropy": 0.45263671875, "epoch": 0.04342857142857143, "grad_norm": 0.057916510850191116, "kl": 0.0004200935363769531, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": 0.260416679084301, "reward_std": 0.18053755164146423, "rewards/accuracy_reward": 0.11458333395421505, "rewards/format_reward": 0.1458333358168602, "step": 38 }, { "completion_length": 2990.2188110351562, "entropy": 0.36572265625, "epoch": 0.044571428571428574, "grad_norm": 0.13697481155395508, "kl": 0.0011453032493591309, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": 0.729166679084301, "reward_std": 0.2874651923775673, "rewards/accuracy_reward": 0.2500000027939677, "rewards/format_reward": 0.4791666716337204, "step": 39 }, { "completion_length": 2710.760498046875, "entropy": 0.38818359375, "epoch": 0.045714285714285714, "grad_norm": 0.17243291437625885, "kl": 0.0023212432861328125, "learning_rate": 9.91429819907136e-07, "loss": 0.0001, "reward": 0.7395833656191826, "reward_std": 0.44072920083999634, "rewards/accuracy_reward": 0.19791667442768812, "rewards/format_reward": 0.541666679084301, "step": 40 }, { "completion_length": 3186.885498046875, "entropy": 0.380859375, "epoch": 0.046857142857142854, "grad_norm": 0.13304336369037628, "kl": 0.0006128549575805664, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "reward": 0.5000000074505806, "reward_std": 0.4803263619542122, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.4062500149011612, "step": 41 }, { "completion_length": 3108.7500610351562, "entropy": 0.45703125, "epoch": 0.048, "grad_norm": 0.05359484255313873, "kl": 0.0004253387451171875, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "reward": 0.43750000186264515, "reward_std": 0.13371453434228897, "rewards/accuracy_reward": 0.13541666697710752, "rewards/format_reward": 0.3020833348855376, "step": 42 }, { "completion_length": 3218.541748046875, "entropy": 0.42431640625, "epoch": 0.04914285714285714, "grad_norm": 0.12211965769529343, "kl": 0.0012423992156982422, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.510416692122817, "reward_std": 0.34035979211330414, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.3229166716337204, "step": 43 }, { "completion_length": 2851.635482788086, "entropy": 0.39013671875, "epoch": 0.05028571428571429, "grad_norm": 0.1306796669960022, "kl": 0.0009481906890869141, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": 0.7708333563059568, "reward_std": 0.42007729411125183, "rewards/accuracy_reward": 0.2708333386108279, "rewards/format_reward": 0.5000000176951289, "step": 44 }, { "completion_length": 3627.354248046875, "entropy": 0.43505859375, "epoch": 0.05142857142857143, "grad_norm": 0.1441727578639984, "kl": 0.0014786720275878906, "learning_rate": 9.8425742251254e-07, "loss": 0.0001, "reward": 0.5104166902601719, "reward_std": 0.4969704672694206, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.3229166828095913, "step": 45 }, { "completion_length": 3407.3438110351562, "entropy": 0.50341796875, "epoch": 0.052571428571428575, "grad_norm": 0.14224384725093842, "kl": 0.001373291015625, "learning_rate": 9.825677631722435e-07, "loss": 0.0001, "reward": 0.30208334885537624, "reward_std": 0.34669168293476105, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.28125001303851604, "step": 46 }, { "completion_length": 3059.5521850585938, "entropy": 0.38916015625, "epoch": 0.053714285714285714, "grad_norm": 0.1632174700498581, "kl": 0.001373887062072754, "learning_rate": 9.807937738894303e-07, "loss": 0.0001, "reward": 0.8437500149011612, "reward_std": 0.5346902906894684, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.5208333507180214, "step": 47 }, { "completion_length": 2942.260498046875, "entropy": 0.40576171875, "epoch": 0.054857142857142854, "grad_norm": 0.12902727723121643, "kl": 0.0034656524658203125, "learning_rate": 9.78935800506826e-07, "loss": 0.0001, "reward": 0.6562500335276127, "reward_std": 0.5015772953629494, "rewards/accuracy_reward": 0.19791667722165585, "rewards/format_reward": 0.4583333544433117, "step": 48 }, { "completion_length": 2611.0000610351562, "entropy": 0.38330078125, "epoch": 0.056, "grad_norm": 0.1312219202518463, "kl": 0.005061149597167969, "learning_rate": 9.769942052400235e-07, "loss": 0.0002, "reward": 0.7916666716337204, "reward_std": 0.38058819621801376, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.5729166716337204, "step": 49 }, { "completion_length": 3060.0000610351562, "entropy": 0.349609375, "epoch": 0.05714285714285714, "grad_norm": 0.08954072743654251, "kl": 0.0010552406311035156, "learning_rate": 9.749693666068663e-07, "loss": 0.0, "reward": 0.6250000027939677, "reward_std": 0.31528370827436447, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.3645833460614085, "step": 50 }, { "completion_length": 2443.1146240234375, "entropy": 0.46533203125, "epoch": 0.05828571428571429, "grad_norm": 0.15657995641231537, "kl": 0.007048606872558594, "learning_rate": 9.728616793536587e-07, "loss": 0.0003, "reward": 0.7083333432674408, "reward_std": 0.31336943060159683, "rewards/accuracy_reward": 0.1979166753590107, "rewards/format_reward": 0.5104166716337204, "step": 51 }, { "completion_length": 3218.322967529297, "entropy": 0.404296875, "epoch": 0.05942857142857143, "grad_norm": 0.09073984622955322, "kl": 0.0022614002227783203, "learning_rate": 9.706715543782064e-07, "loss": 0.0001, "reward": 0.791666716337204, "reward_std": 0.5437296032905579, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.4375, "step": 52 }, { "completion_length": 2848.8855590820312, "entropy": 0.42138671875, "epoch": 0.060571428571428575, "grad_norm": 0.16575101017951965, "kl": 0.001046299934387207, "learning_rate": 9.683994186497132e-07, "loss": 0.0, "reward": 0.979166679084301, "reward_std": 0.5605000704526901, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.6250000223517418, "step": 53 }, { "completion_length": 2977.2188110351562, "entropy": 0.39501953125, "epoch": 0.061714285714285715, "grad_norm": 0.1525822877883911, "kl": 0.0014376640319824219, "learning_rate": 9.66045715125541e-07, "loss": 0.0001, "reward": 0.9062500223517418, "reward_std": 0.6681454330682755, "rewards/accuracy_reward": 0.37500000558793545, "rewards/format_reward": 0.5312500149011612, "step": 54 }, { "completion_length": 3231.4063110351562, "entropy": 0.43701171875, "epoch": 0.06285714285714286, "grad_norm": 0.1321529597043991, "kl": 0.0012722015380859375, "learning_rate": 9.636109026648554e-07, "loss": 0.0001, "reward": 0.5416666902601719, "reward_std": 0.4359280541539192, "rewards/accuracy_reward": 0.18750000093132257, "rewards/format_reward": 0.35416668467223644, "step": 55 }, { "completion_length": 3173.5001220703125, "entropy": 0.421875, "epoch": 0.064, "grad_norm": 0.08983828872442245, "kl": 0.0008525848388671875, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": 0.6250000027939677, "reward_std": 0.23468155041337013, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.3958333460614085, "step": 56 }, { "completion_length": 3610.635498046875, "entropy": 0.33154296875, "epoch": 0.06514285714285714, "grad_norm": 0.07729873806238174, "kl": 0.0005555152893066406, "learning_rate": 9.58499865339809e-07, "loss": 0.0, "reward": 0.4479166669771075, "reward_std": 0.4583342596888542, "rewards/accuracy_reward": 0.11458333488553762, "rewards/format_reward": 0.3333333460614085, "step": 57 }, { "completion_length": 2336.0521545410156, "entropy": 0.329833984375, "epoch": 0.06628571428571428, "grad_norm": 0.15206408500671387, "kl": 0.0069732666015625, "learning_rate": 9.55824636882301e-07, "loss": 0.0003, "reward": 0.9895833656191826, "reward_std": 0.4796273037791252, "rewards/accuracy_reward": 0.3020833386108279, "rewards/format_reward": 0.6875000223517418, "step": 58 }, { "completion_length": 3113.2500610351562, "entropy": 0.36474609375, "epoch": 0.06742857142857143, "grad_norm": 0.09677103161811829, "kl": 0.0010061264038085938, "learning_rate": 9.530702921077358e-07, "loss": 0.0, "reward": 0.5312500260770321, "reward_std": 0.302716389298439, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.3750000111758709, "step": 59 }, { "completion_length": 3189.0313110351562, "entropy": 0.3974609375, "epoch": 0.06857142857142857, "grad_norm": 0.08244970440864563, "kl": 0.0017404556274414062, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": 0.4062500074505806, "reward_std": 0.31878524273633957, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.3229166828095913, "step": 60 }, { "completion_length": 3010.1251220703125, "entropy": 0.341796875, "epoch": 0.06971428571428571, "grad_norm": 0.11613977700471878, "kl": 0.0011224746704101562, "learning_rate": 9.473264167865171e-07, "loss": 0.0, "reward": 0.8437500111758709, "reward_std": 0.3721206858754158, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.5937500111758709, "step": 61 }, { "completion_length": 2635.2083740234375, "entropy": 0.35302734375, "epoch": 0.07085714285714285, "grad_norm": 0.13679753243923187, "kl": 0.002574920654296875, "learning_rate": 9.443380060197385e-07, "loss": 0.0001, "reward": 0.9375000298023224, "reward_std": 0.5850840508937836, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.6458333432674408, "step": 62 }, { "completion_length": 2299.1875610351562, "entropy": 0.38427734375, "epoch": 0.072, "grad_norm": 0.1331767588853836, "kl": 0.0034208297729492188, "learning_rate": 9.412727182773486e-07, "loss": 0.0001, "reward": 1.0729166865348816, "reward_std": 0.4625158831477165, "rewards/accuracy_reward": 0.36458334140479565, "rewards/format_reward": 0.7083333432674408, "step": 63 }, { "completion_length": 2695.5104370117188, "entropy": 0.3974609375, "epoch": 0.07314285714285715, "grad_norm": 0.13731464743614197, "kl": 0.0019664764404296875, "learning_rate": 9.381311511432658e-07, "loss": 0.0001, "reward": 0.6770833656191826, "reward_std": 0.4545453414320946, "rewards/accuracy_reward": 0.19791667256504297, "rewards/format_reward": 0.479166679084301, "step": 64 }, { "completion_length": 2934.0938110351562, "entropy": 0.370361328125, "epoch": 0.07428571428571429, "grad_norm": 0.1463320106267929, "kl": 0.002140045166015625, "learning_rate": 9.34913917072228e-07, "loss": 0.0001, "reward": 0.8229166772216558, "reward_std": 0.36905180662870407, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.48958334885537624, "step": 65 }, { "completion_length": 2158.9479598999023, "entropy": 0.34326171875, "epoch": 0.07542857142857143, "grad_norm": 0.168357253074646, "kl": 0.0015735626220703125, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": 0.9166666818782687, "reward_std": 0.25834736227989197, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.5416666669771075, "step": 66 }, { "completion_length": 3636.229248046875, "entropy": 0.36962890625, "epoch": 0.07657142857142857, "grad_norm": 0.10577098280191422, "kl": 0.0020542144775390625, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": 0.18750000279396772, "reward_std": 0.20090095698833466, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.1666666716337204, "step": 67 }, { "completion_length": 2369.2916870117188, "entropy": 0.401611328125, "epoch": 0.07771428571428571, "grad_norm": 0.20740464329719543, "kl": 0.00390625, "learning_rate": 9.248145583195447e-07, "loss": 0.0002, "reward": 0.802083358168602, "reward_std": 0.47093402594327927, "rewards/accuracy_reward": 0.2395833358168602, "rewards/format_reward": 0.5625000149011612, "step": 68 }, { "completion_length": 2438.010498046875, "entropy": 0.447265625, "epoch": 0.07885714285714286, "grad_norm": 0.20514391362667084, "kl": 0.005084991455078125, "learning_rate": 9.213010742252327e-07, "loss": 0.0002, "reward": 0.583333358168602, "reward_std": 0.31931574642658234, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.5208333507180214, "step": 69 }, { "completion_length": 3127.2083740234375, "entropy": 0.364501953125, "epoch": 0.08, "grad_norm": 0.08004138618707657, "kl": 0.0024518966674804688, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.5416666977107525, "reward_std": 0.31260205432772636, "rewards/accuracy_reward": 0.1145833358168602, "rewards/format_reward": 0.4270833544433117, "step": 70 }, { "completion_length": 2821.635467529297, "entropy": 0.435546875, "epoch": 0.08114285714285714, "grad_norm": 0.14604488015174866, "kl": 0.005157470703125, "learning_rate": 9.140576474687263e-07, "loss": 0.0002, "reward": 0.5833333507180214, "reward_std": 0.2736881971359253, "rewards/accuracy_reward": 0.1770833432674408, "rewards/format_reward": 0.4062500074505806, "step": 71 }, { "completion_length": 3006.6563110351562, "entropy": 0.505859375, "epoch": 0.08228571428571428, "grad_norm": 0.19332122802734375, "kl": 0.0032939910888671875, "learning_rate": 9.103291169269299e-07, "loss": 0.0001, "reward": 0.4583333507180214, "reward_std": 0.38849541172385216, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.4166666716337204, "step": 72 }, { "completion_length": 3805.1771850585938, "entropy": 0.5087890625, "epoch": 0.08342857142857144, "grad_norm": 0.08017224818468094, "kl": 0.00171661376953125, "learning_rate": 9.065303395098358e-07, "loss": 0.0001, "reward": 0.281250006519258, "reward_std": 0.3908010721206665, "rewards/accuracy_reward": 0.11458333395421505, "rewards/format_reward": 0.16666667256504297, "step": 73 }, { "completion_length": 3330.1563110351562, "entropy": 0.427734375, "epoch": 0.08457142857142858, "grad_norm": 0.1395280957221985, "kl": 0.0033082962036132812, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": 0.5729166939854622, "reward_std": 0.4739295169711113, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.322916679084301, "step": 74 }, { "completion_length": 3050.5313110351562, "entropy": 0.3994140625, "epoch": 0.08571428571428572, "grad_norm": 0.11711548268795013, "kl": 0.0038471221923828125, "learning_rate": 8.987250199168808e-07, "loss": 0.0002, "reward": 0.5312500223517418, "reward_std": 0.39057330042123795, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.447916679084301, "step": 75 }, { "completion_length": 2418.604217529297, "entropy": 0.4072265625, "epoch": 0.08685714285714285, "grad_norm": 0.13031832873821259, "kl": 0.001972198486328125, "learning_rate": 8.9471999940354e-07, "loss": 0.0001, "reward": 0.7083333507180214, "reward_std": 0.29223429784178734, "rewards/accuracy_reward": 0.08333333861082792, "rewards/format_reward": 0.6250000223517418, "step": 76 }, { "completion_length": 3237.010498046875, "entropy": 0.431640625, "epoch": 0.088, "grad_norm": 0.10760512948036194, "kl": 0.0021152496337890625, "learning_rate": 8.906477750432903e-07, "loss": 0.0001, "reward": 0.5520833544433117, "reward_std": 0.3883203938603401, "rewards/accuracy_reward": 0.13541666697710752, "rewards/format_reward": 0.4166666716337204, "step": 77 }, { "completion_length": 2933.1459350585938, "entropy": 0.38671875, "epoch": 0.08914285714285715, "grad_norm": 0.11242065578699112, "kl": 0.0015392303466796875, "learning_rate": 8.865091407243394e-07, "loss": 0.0001, "reward": 0.6666666679084301, "reward_std": 0.5230761393904686, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.4583333395421505, "step": 78 }, { "completion_length": 2425.1146240234375, "entropy": 0.359619140625, "epoch": 0.09028571428571429, "grad_norm": 0.08172761648893356, "kl": 0.0024118423461914062, "learning_rate": 8.823049032816478e-07, "loss": 0.0001, "reward": 0.822916679084301, "reward_std": 0.3384963124990463, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.6145833432674408, "step": 79 }, { "completion_length": 3421.666748046875, "entropy": 0.5087890625, "epoch": 0.09142857142857143, "grad_norm": 0.13740910589694977, "kl": 0.003353118896484375, "learning_rate": 8.780358823396352e-07, "loss": 0.0001, "reward": 0.6666666939854622, "reward_std": 0.408274307847023, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.416666679084301, "step": 80 }, { "completion_length": 3048.104217529297, "entropy": 0.58544921875, "epoch": 0.09257142857142857, "grad_norm": 0.13280166685581207, "kl": 0.006072998046875, "learning_rate": 8.737029101523929e-07, "loss": 0.0002, "reward": 0.5416666716337204, "reward_std": 0.29809625819325447, "rewards/accuracy_reward": 0.1354166679084301, "rewards/format_reward": 0.4062500149011612, "step": 81 }, { "completion_length": 2573.3126220703125, "entropy": 0.42626953125, "epoch": 0.09371428571428571, "grad_norm": 0.1755312979221344, "kl": 0.0029544830322265625, "learning_rate": 8.693068314414344e-07, "loss": 0.0001, "reward": 0.8437500149011612, "reward_std": 0.3547321856021881, "rewards/accuracy_reward": 0.3229166716337204, "rewards/format_reward": 0.5208333432674408, "step": 82 }, { "completion_length": 3012.9271240234375, "entropy": 0.48291015625, "epoch": 0.09485714285714286, "grad_norm": 0.13794995844364166, "kl": 0.0030269622802734375, "learning_rate": 8.648485032310144e-07, "loss": 0.0001, "reward": 0.5208333432674408, "reward_std": 0.3228641413152218, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.3541666716337204, "step": 83 }, { "completion_length": 3144.791748046875, "entropy": 0.46923828125, "epoch": 0.096, "grad_norm": 0.12894625961780548, "kl": 0.0023136138916015625, "learning_rate": 8.603287946810513e-07, "loss": 0.0001, "reward": 0.6979166902601719, "reward_std": 0.5448310598731041, "rewards/accuracy_reward": 0.2395833358168602, "rewards/format_reward": 0.4583333469927311, "step": 84 }, { "completion_length": 2702.4583740234375, "entropy": 0.3408203125, "epoch": 0.09714285714285714, "grad_norm": 0.10937459766864777, "kl": 0.002269744873046875, "learning_rate": 8.557485869176825e-07, "loss": 0.0001, "reward": 0.7604166939854622, "reward_std": 0.47234033048152924, "rewards/accuracy_reward": 0.15625000186264515, "rewards/format_reward": 0.6041666865348816, "step": 85 }, { "completion_length": 2702.3959350585938, "entropy": 0.48193359375, "epoch": 0.09828571428571428, "grad_norm": 0.14910289645195007, "kl": 0.004451751708984375, "learning_rate": 8.511087728614862e-07, "loss": 0.0002, "reward": 0.625, "reward_std": 0.2315434329211712, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.5000000074505806, "step": 86 }, { "completion_length": 2416.2709045410156, "entropy": 0.44677734375, "epoch": 0.09942857142857142, "grad_norm": 0.215481698513031, "kl": 0.004787445068359375, "learning_rate": 8.464102570534061e-07, "loss": 0.0002, "reward": 0.8750000298023224, "reward_std": 0.4938344843685627, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.6666666865348816, "step": 87 }, { "completion_length": 2382.1875610351562, "entropy": 0.4248046875, "epoch": 0.10057142857142858, "grad_norm": 0.19311273097991943, "kl": 0.00421142578125, "learning_rate": 8.416539554784089e-07, "loss": 0.0002, "reward": 0.9270833730697632, "reward_std": 0.6074926629662514, "rewards/accuracy_reward": 0.3020833358168602, "rewards/format_reward": 0.6250000149011612, "step": 88 }, { "completion_length": 2669.0209350585938, "entropy": 0.3544921875, "epoch": 0.10171428571428572, "grad_norm": 0.15478843450546265, "kl": 0.003444671630859375, "learning_rate": 8.368407953869103e-07, "loss": 0.0001, "reward": 0.6875000223517418, "reward_std": 0.4951842427253723, "rewards/accuracy_reward": 0.13541666977107525, "rewards/format_reward": 0.5520833656191826, "step": 89 }, { "completion_length": 2330.072967529297, "entropy": 0.6162109375, "epoch": 0.10285714285714286, "grad_norm": 0.14878880977630615, "kl": 0.01177978515625, "learning_rate": 8.319717151140072e-07, "loss": 0.0005, "reward": 0.6770833656191826, "reward_std": 0.27013952285051346, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.6041666744276881, "step": 90 }, { "completion_length": 2697.4375, "entropy": 0.44921875, "epoch": 0.104, "grad_norm": 0.1833600103855133, "kl": 0.005588531494140625, "learning_rate": 8.270476638965461e-07, "loss": 0.0002, "reward": 0.8750000298023224, "reward_std": 0.5495730713009834, "rewards/accuracy_reward": 0.260416679084301, "rewards/format_reward": 0.6145833507180214, "step": 91 }, { "completion_length": 2586.010498046875, "entropy": 0.39111328125, "epoch": 0.10514285714285715, "grad_norm": 0.14938902854919434, "kl": 0.007785797119140625, "learning_rate": 8.220696016880687e-07, "loss": 0.0003, "reward": 0.687500037252903, "reward_std": 0.3853628858923912, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.5625000074505806, "step": 92 }, { "completion_length": 3562.0000610351562, "entropy": 0.625, "epoch": 0.10628571428571429, "grad_norm": 0.1706770658493042, "kl": 0.0075836181640625, "learning_rate": 8.170384989716657e-07, "loss": 0.0003, "reward": 0.1354166679084301, "reward_std": 0.1874575838446617, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1354166679084301, "step": 93 }, { "completion_length": 2885.6876220703125, "entropy": 0.568359375, "epoch": 0.10742857142857143, "grad_norm": 0.12106288969516754, "kl": 0.005161285400390625, "learning_rate": 8.119553365707802e-07, "loss": 0.0002, "reward": 0.6562500149011612, "reward_std": 0.31557222083210945, "rewards/accuracy_reward": 0.22916666697710752, "rewards/format_reward": 0.4270833432674408, "step": 94 }, { "completion_length": 3645.1146850585938, "entropy": 0.4736328125, "epoch": 0.10857142857142857, "grad_norm": 0.0870869979262352, "kl": 0.0027751922607421875, "learning_rate": 8.068211054579943e-07, "loss": 0.0001, "reward": 0.4270833460614085, "reward_std": 0.4592607915401459, "rewards/accuracy_reward": 0.13541667442768812, "rewards/format_reward": 0.2916666744276881, "step": 95 }, { "completion_length": 2586.604217529297, "entropy": 0.39990234375, "epoch": 0.10971428571428571, "grad_norm": 0.16784177720546722, "kl": 0.0048675537109375, "learning_rate": 8.01636806561836e-07, "loss": 0.0002, "reward": 0.8854166865348816, "reward_std": 0.5198706425726414, "rewards/accuracy_reward": 0.31250000558793545, "rewards/format_reward": 0.572916679084301, "step": 96 }, { "completion_length": 2758.572998046875, "entropy": 0.4267578125, "epoch": 0.11085714285714286, "grad_norm": 0.21689672768115997, "kl": 0.003742218017578125, "learning_rate": 7.964034505716476e-07, "loss": 0.0001, "reward": 0.802083358168602, "reward_std": 0.5127636715769768, "rewards/accuracy_reward": 0.23958333861082792, "rewards/format_reward": 0.5625000149011612, "step": 97 }, { "completion_length": 2189.7395935058594, "entropy": 0.34521484375, "epoch": 0.112, "grad_norm": 0.17968373000621796, "kl": 0.003505706787109375, "learning_rate": 7.911220577405484e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.23535311594605446, "rewards/accuracy_reward": 0.15625000093132257, "rewards/format_reward": 0.6875000074505806, "step": 98 }, { "completion_length": 2465.3646240234375, "entropy": 0.38818359375, "epoch": 0.11314285714285714, "grad_norm": 0.16559617221355438, "kl": 0.004673004150390625, "learning_rate": 7.857936576865356e-07, "loss": 0.0002, "reward": 0.9166666716337204, "reward_std": 0.5209992416203022, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.5625000074505806, "step": 99 }, { "completion_length": 2335.166717529297, "entropy": 0.379150390625, "epoch": 0.11428571428571428, "grad_norm": 0.14200669527053833, "kl": 0.005016326904296875, "learning_rate": 7.804192891917571e-07, "loss": 0.0002, "reward": 1.0625000298023224, "reward_std": 0.47875121980905533, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.6666666865348816, "step": 100 }, { "completion_length": 2051.1459350585938, "entropy": 0.426513671875, "epoch": 0.11542857142857142, "grad_norm": 0.23151825368404388, "kl": 0.0040740966796875, "learning_rate": 7.75e-07, "loss": 0.0002, "reward": 0.8437500149011612, "reward_std": 0.22601452097296715, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.6979166716337204, "step": 101 }, { "completion_length": 1650.8438110351562, "entropy": 0.356201171875, "epoch": 0.11657142857142858, "grad_norm": 0.09349505603313446, "kl": 0.0050201416015625, "learning_rate": 7.695368466124296e-07, "loss": 0.0002, "reward": 0.895833358168602, "reward_std": 0.2152964137494564, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.8229166865348816, "step": 102 }, { "completion_length": 1967.0833892822266, "entropy": 0.361328125, "epoch": 0.11771428571428572, "grad_norm": 0.16049090027809143, "kl": 0.0059795379638671875, "learning_rate": 7.640308940816239e-07, "loss": 0.0002, "reward": 1.0937500298023224, "reward_std": 0.3835059180855751, "rewards/accuracy_reward": 0.2395833358168602, "rewards/format_reward": 0.8541666865348816, "step": 103 }, { "completion_length": 2380.3125610351562, "entropy": 0.44921875, "epoch": 0.11885714285714286, "grad_norm": 0.1609274446964264, "kl": 0.0071868896484375, "learning_rate": 7.584832158039378e-07, "loss": 0.0003, "reward": 0.7291666865348816, "reward_std": 0.40659596025943756, "rewards/accuracy_reward": 0.12500000186264515, "rewards/format_reward": 0.6041666865348816, "step": 104 }, { "completion_length": 2000.1876068115234, "entropy": 0.34033203125, "epoch": 0.12, "grad_norm": 0.19138775765895844, "kl": 0.00435638427734375, "learning_rate": 7.528948933102438e-07, "loss": 0.0002, "reward": 1.0312500149011612, "reward_std": 0.5785520151257515, "rewards/accuracy_reward": 0.3229166744276881, "rewards/format_reward": 0.7083333432674408, "step": 105 }, { "completion_length": 2208.072967529297, "entropy": 0.31982421875, "epoch": 0.12114285714285715, "grad_norm": 0.09388583153486252, "kl": 0.0026397705078125, "learning_rate": 7.472670160550848e-07, "loss": 0.0001, "reward": 1.1458333656191826, "reward_std": 0.35796716436743736, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.7083333507180214, "step": 106 }, { "completion_length": 2212.6771545410156, "entropy": 0.52099609375, "epoch": 0.12228571428571429, "grad_norm": 0.19971498847007751, "kl": 0.006237030029296875, "learning_rate": 7.416006812042827e-07, "loss": 0.0002, "reward": 0.8333333730697632, "reward_std": 0.3842815235257149, "rewards/accuracy_reward": 0.1145833358168602, "rewards/format_reward": 0.7187500298023224, "step": 107 }, { "completion_length": 2266.5313110351562, "entropy": 0.384765625, "epoch": 0.12342857142857143, "grad_norm": 0.1431254744529724, "kl": 0.00372314453125, "learning_rate": 7.358969934210438e-07, "loss": 0.0001, "reward": 0.9270833535119891, "reward_std": 0.38414933159947395, "rewards/accuracy_reward": 0.2395833358168602, "rewards/format_reward": 0.6875000102445483, "step": 108 }, { "completion_length": 2486.4584350585938, "entropy": 0.381103515625, "epoch": 0.12457142857142857, "grad_norm": 0.1384974718093872, "kl": 0.004756927490234375, "learning_rate": 7.301570646506027e-07, "loss": 0.0002, "reward": 0.6875000149011612, "reward_std": 0.3808614909648895, "rewards/accuracy_reward": 0.14583333674818277, "rewards/format_reward": 0.541666679084301, "step": 109 }, { "completion_length": 2373.7500610351562, "entropy": 0.43359375, "epoch": 0.12571428571428572, "grad_norm": 0.18450307846069336, "kl": 0.004810333251953125, "learning_rate": 7.243820139034464e-07, "loss": 0.0002, "reward": 0.9166666865348816, "reward_std": 0.5121813043951988, "rewards/accuracy_reward": 0.23958333767950535, "rewards/format_reward": 0.677083358168602, "step": 110 }, { "completion_length": 2390.156280517578, "entropy": 0.482666015625, "epoch": 0.12685714285714286, "grad_norm": 0.14442622661590576, "kl": 0.00649261474609375, "learning_rate": 7.185729670371604e-07, "loss": 0.0003, "reward": 0.7604166865348816, "reward_std": 0.3381837457418442, "rewards/accuracy_reward": 0.1666666753590107, "rewards/format_reward": 0.5937500074505806, "step": 111 }, { "completion_length": 2708.979248046875, "entropy": 0.46240234375, "epoch": 0.128, "grad_norm": 0.15874600410461426, "kl": 0.00481414794921875, "learning_rate": 7.127310565369415e-07, "loss": 0.0002, "reward": 0.8958333432674408, "reward_std": 0.41653573513031006, "rewards/accuracy_reward": 0.2916666707023978, "rewards/format_reward": 0.6041666939854622, "step": 112 }, { "completion_length": 1425.0208435058594, "entropy": 0.34765625, "epoch": 0.12914285714285714, "grad_norm": 0.2752454876899719, "kl": 0.01012420654296875, "learning_rate": 7.068574212948169e-07, "loss": 0.0004, "reward": 1.1041667014360428, "reward_std": 0.4169319197535515, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.833333358168602, "step": 113 }, { "completion_length": 1730.0833587646484, "entropy": 0.32763671875, "epoch": 0.13028571428571428, "grad_norm": 0.1002146303653717, "kl": 0.00527191162109375, "learning_rate": 7.009532063876148e-07, "loss": 0.0002, "reward": 0.9895833730697632, "reward_std": 0.24960162490606308, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.8958333432674408, "step": 114 }, { "completion_length": 2451.6771240234375, "entropy": 0.44873046875, "epoch": 0.13142857142857142, "grad_norm": 0.09600935876369476, "kl": 0.006961822509765625, "learning_rate": 6.950195628537299e-07, "loss": 0.0003, "reward": 0.7187500223517418, "reward_std": 0.19398127868771553, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.5625000074505806, "step": 115 }, { "completion_length": 3191.8751220703125, "entropy": 0.5556640625, "epoch": 0.13257142857142856, "grad_norm": 0.19031184911727905, "kl": 0.007049560546875, "learning_rate": 6.890576474687263e-07, "loss": 0.0003, "reward": 0.4479166939854622, "reward_std": 0.5110370628535748, "rewards/accuracy_reward": 0.13541667070239782, "rewards/format_reward": 0.3125000111758709, "step": 116 }, { "completion_length": 2564.8021240234375, "entropy": 0.493896484375, "epoch": 0.1337142857142857, "grad_norm": 0.2184651792049408, "kl": 0.009033203125, "learning_rate": 6.83068622519821e-07, "loss": 0.0004, "reward": 0.7187500223517418, "reward_std": 0.5183624178171158, "rewards/accuracy_reward": 0.11458333395421505, "rewards/format_reward": 0.6041666939854622, "step": 117 }, { "completion_length": 2493.3333740234375, "entropy": 0.38720703125, "epoch": 0.13485714285714287, "grad_norm": 0.12824109196662903, "kl": 0.00505828857421875, "learning_rate": 6.770536555792944e-07, "loss": 0.0002, "reward": 0.9375000298023224, "reward_std": 0.46352487802505493, "rewards/accuracy_reward": 0.28125000558793545, "rewards/format_reward": 0.6562500149011612, "step": 118 }, { "completion_length": 1656.8854522705078, "entropy": 0.4052734375, "epoch": 0.136, "grad_norm": 0.12852801382541656, "kl": 0.00701141357421875, "learning_rate": 6.710139192768694e-07, "loss": 0.0003, "reward": 1.0312500298023224, "reward_std": 0.25884300470352173, "rewards/accuracy_reward": 0.2395833358168602, "rewards/format_reward": 0.7916666865348816, "step": 119 }, { "completion_length": 1066.7917175292969, "entropy": 0.2958984375, "epoch": 0.13714285714285715, "grad_norm": 0.14502935111522675, "kl": 0.007476806640625, "learning_rate": 6.649505910711058e-07, "loss": 0.0003, "reward": 1.1979166865348816, "reward_std": 0.27546053379774094, "rewards/accuracy_reward": 0.2604166669771075, "rewards/format_reward": 0.9375000149011612, "step": 120 }, { "completion_length": 999.8542022705078, "entropy": 0.2783203125, "epoch": 0.1382857142857143, "grad_norm": 0.1992115080356598, "kl": 0.00730133056640625, "learning_rate": 6.588648530198504e-07, "loss": 0.0003, "reward": 1.2500000149011612, "reward_std": 0.29015830904245377, "rewards/accuracy_reward": 0.3020833386108279, "rewards/format_reward": 0.9479166865348816, "step": 121 }, { "completion_length": 2167.0834197998047, "entropy": 0.432373046875, "epoch": 0.13942857142857143, "grad_norm": 0.1561378836631775, "kl": 0.0062408447265625, "learning_rate": 6.527578915497951e-07, "loss": 0.0002, "reward": 0.9687500298023224, "reward_std": 0.4494870528578758, "rewards/accuracy_reward": 0.2604166781529784, "rewards/format_reward": 0.7083333432674408, "step": 122 }, { "completion_length": 2166.729248046875, "entropy": 0.4207763671875, "epoch": 0.14057142857142857, "grad_norm": 0.18212474882602692, "kl": 0.005710601806640625, "learning_rate": 6.466308972251785e-07, "loss": 0.0002, "reward": 0.9270833507180214, "reward_std": 0.37269312888383865, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.7187500074505806, "step": 123 }, { "completion_length": 1833.7396240234375, "entropy": 0.27783203125, "epoch": 0.1417142857142857, "grad_norm": 0.19929192960262299, "kl": 0.0140838623046875, "learning_rate": 6.404850645156841e-07, "loss": 0.0006, "reward": 1.1145833730697632, "reward_std": 0.5214307978749275, "rewards/accuracy_reward": 0.2604166679084301, "rewards/format_reward": 0.8541666865348816, "step": 124 }, { "completion_length": 2578.0209350585938, "entropy": 0.45166015625, "epoch": 0.14285714285714285, "grad_norm": 0.1536300778388977, "kl": 0.00638580322265625, "learning_rate": 6.343215915635761e-07, "loss": 0.0003, "reward": 0.5625000149011612, "reward_std": 0.30013205483555794, "rewards/accuracy_reward": 0.11458333861082792, "rewards/format_reward": 0.4479166865348816, "step": 125 }, { "completion_length": 2283.6875610351562, "entropy": 0.37451171875, "epoch": 0.144, "grad_norm": 0.16426944732666016, "kl": 0.005584716796875, "learning_rate": 6.281416799501187e-07, "loss": 0.0002, "reward": 0.9479167014360428, "reward_std": 0.5797486528754234, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.6979166865348816, "step": 126 }, { "completion_length": 1901.9063110351562, "entropy": 0.381591796875, "epoch": 0.14514285714285713, "grad_norm": 0.15447713434696198, "kl": 0.00717926025390625, "learning_rate": 6.219465344613258e-07, "loss": 0.0003, "reward": 0.8541666865348816, "reward_std": 0.3032348155975342, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.7812500149011612, "step": 127 }, { "completion_length": 1890.5729675292969, "entropy": 0.397705078125, "epoch": 0.1462857142857143, "grad_norm": 0.23616060614585876, "kl": 0.006561279296875, "learning_rate": 6.157373628530852e-07, "loss": 0.0003, "reward": 1.1041666865348816, "reward_std": 0.43167490512132645, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.770833358168602, "step": 128 }, { "completion_length": 2441.8021850585938, "entropy": 0.466796875, "epoch": 0.14742857142857144, "grad_norm": 0.19474731385707855, "kl": 0.00809478759765625, "learning_rate": 6.095153756157051e-07, "loss": 0.0003, "reward": 0.8125000447034836, "reward_std": 0.5053007081151009, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.6041666865348816, "step": 129 }, { "completion_length": 2335.4063110351562, "entropy": 0.4462890625, "epoch": 0.14857142857142858, "grad_norm": 0.2001771628856659, "kl": 0.00664520263671875, "learning_rate": 6.032817857379256e-07, "loss": 0.0003, "reward": 0.6875000298023224, "reward_std": 0.480606772005558, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.5937500149011612, "step": 130 }, { "completion_length": 2002.8438110351562, "entropy": 0.427978515625, "epoch": 0.14971428571428572, "grad_norm": 0.21314886212348938, "kl": 0.0078277587890625, "learning_rate": 5.97037808470444e-07, "loss": 0.0003, "reward": 1.0625000447034836, "reward_std": 0.5524623095989227, "rewards/accuracy_reward": 0.3229166781529784, "rewards/format_reward": 0.7395833432674408, "step": 131 }, { "completion_length": 2149.666778564453, "entropy": 0.39501953125, "epoch": 0.15085714285714286, "grad_norm": 0.15732061862945557, "kl": 0.0057373046875, "learning_rate": 5.907846610890011e-07, "loss": 0.0002, "reward": 0.9687500298023224, "reward_std": 0.48508264869451523, "rewards/accuracy_reward": 0.2812500102445483, "rewards/format_reward": 0.6875000149011612, "step": 132 }, { "completion_length": 2231.500030517578, "entropy": 0.435546875, "epoch": 0.152, "grad_norm": 0.21705371141433716, "kl": 0.00888824462890625, "learning_rate": 5.845235626570683e-07, "loss": 0.0004, "reward": 0.7708333395421505, "reward_std": 0.4052763059735298, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.625, "step": 133 }, { "completion_length": 2071.229278564453, "entropy": 0.4912109375, "epoch": 0.15314285714285714, "grad_norm": 0.18239766359329224, "kl": 0.01198577880859375, "learning_rate": 5.78255733788191e-07, "loss": 0.0005, "reward": 1.0208333730697632, "reward_std": 0.47505422681570053, "rewards/accuracy_reward": 0.2812500037252903, "rewards/format_reward": 0.7395833432674408, "step": 134 }, { "completion_length": 1155.7292175292969, "entropy": 0.304443359375, "epoch": 0.15428571428571428, "grad_norm": 0.1679978370666504, "kl": 0.01029205322265625, "learning_rate": 5.71982396408026e-07, "loss": 0.0004, "reward": 1.416666716337204, "reward_std": 0.4091631546616554, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.9166666716337204, "step": 135 }, { "completion_length": 1356.5521545410156, "entropy": 0.282470703125, "epoch": 0.15542857142857142, "grad_norm": 0.18188215792179108, "kl": 0.00759124755859375, "learning_rate": 5.657047735161255e-07, "loss": 0.0003, "reward": 1.1979166865348816, "reward_std": 0.3688688538968563, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.9270833432674408, "step": 136 }, { "completion_length": 1961.9167175292969, "entropy": 0.324462890625, "epoch": 0.15657142857142858, "grad_norm": 0.1938006430864334, "kl": 0.0077667236328125, "learning_rate": 5.594240889475106e-07, "loss": 0.0003, "reward": 0.833333358168602, "reward_std": 0.4214525818824768, "rewards/accuracy_reward": 0.08333333674818277, "rewards/format_reward": 0.7500000298023224, "step": 137 }, { "completion_length": 1700.7396240234375, "entropy": 0.302001953125, "epoch": 0.15771428571428572, "grad_norm": 0.16465067863464355, "kl": 0.00738525390625, "learning_rate": 5.531415671340826e-07, "loss": 0.0003, "reward": 1.0208333432674408, "reward_std": 0.37441620975732803, "rewards/accuracy_reward": 0.1979166753590107, "rewards/format_reward": 0.8229166865348816, "step": 138 }, { "completion_length": 1888.5208435058594, "entropy": 0.38720703125, "epoch": 0.15885714285714286, "grad_norm": 0.20309647917747498, "kl": 0.009521484375, "learning_rate": 5.468584328659172e-07, "loss": 0.0004, "reward": 1.0625000298023224, "reward_std": 0.43142497539520264, "rewards/accuracy_reward": 0.21875001024454832, "rewards/format_reward": 0.8437500298023224, "step": 139 }, { "completion_length": 1912.197998046875, "entropy": 0.442138671875, "epoch": 0.16, "grad_norm": 0.3316217064857483, "kl": 0.01258087158203125, "learning_rate": 5.405759110524894e-07, "loss": 0.0005, "reward": 0.9270833879709244, "reward_std": 0.42255595326423645, "rewards/accuracy_reward": 0.16666666883975267, "rewards/format_reward": 0.7604166865348816, "step": 140 }, { "completion_length": 1228.6042175292969, "entropy": 0.244873046875, "epoch": 0.16114285714285714, "grad_norm": 0.14592112600803375, "kl": 0.0072784423828125, "learning_rate": 5.342952264838747e-07, "loss": 0.0003, "reward": 1.2083333730697632, "reward_std": 0.3421749100089073, "rewards/accuracy_reward": 0.23958333395421505, "rewards/format_reward": 0.9687500298023224, "step": 141 }, { "completion_length": 1981.0209350585938, "entropy": 0.40869140625, "epoch": 0.16228571428571428, "grad_norm": 0.21422992646694183, "kl": 0.0090179443359375, "learning_rate": 5.28017603591974e-07, "loss": 0.0004, "reward": 0.9895833730697632, "reward_std": 0.4437461569905281, "rewards/accuracy_reward": 0.1562500037252903, "rewards/format_reward": 0.833333358168602, "step": 142 }, { "completion_length": 1735.0104675292969, "entropy": 0.399658203125, "epoch": 0.16342857142857142, "grad_norm": 0.2593076825141907, "kl": 0.01113128662109375, "learning_rate": 5.21744266211809e-07, "loss": 0.0004, "reward": 1.03125, "reward_std": 0.4072144068777561, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.8854166716337204, "step": 143 }, { "completion_length": 1934.3229675292969, "entropy": 0.434326171875, "epoch": 0.16457142857142856, "grad_norm": 0.2749219238758087, "kl": 0.01078033447265625, "learning_rate": 5.154764373429315e-07, "loss": 0.0004, "reward": 1.0625000149011612, "reward_std": 0.551102414727211, "rewards/accuracy_reward": 0.29166666977107525, "rewards/format_reward": 0.7708333507180214, "step": 144 }, { "completion_length": 1198.5937805175781, "entropy": 0.274169921875, "epoch": 0.1657142857142857, "grad_norm": 0.19994103908538818, "kl": 0.010986328125, "learning_rate": 5.09215338910999e-07, "loss": 0.0004, "reward": 1.2187500447034836, "reward_std": 0.374411478638649, "rewards/accuracy_reward": 0.3020833395421505, "rewards/format_reward": 0.9166666865348816, "step": 145 }, { "completion_length": 1426.0521240234375, "entropy": 0.300048828125, "epoch": 0.16685714285714287, "grad_norm": 0.1441003829240799, "kl": 0.0077667236328125, "learning_rate": 5.02962191529556e-07, "loss": 0.0003, "reward": 1.0312500298023224, "reward_std": 0.2671857923269272, "rewards/accuracy_reward": 0.1145833358168602, "rewards/format_reward": 0.9166666865348816, "step": 146 }, { "completion_length": 1535.447982788086, "entropy": 0.31689453125, "epoch": 0.168, "grad_norm": 0.19958704710006714, "kl": 0.01012420654296875, "learning_rate": 4.967182142620745e-07, "loss": 0.0004, "reward": 1.1562500298023224, "reward_std": 0.39162378013134, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.8750000149011612, "step": 147 }, { "completion_length": 1264.7396545410156, "entropy": 0.24658203125, "epoch": 0.16914285714285715, "grad_norm": 0.10701873153448105, "kl": 0.008941650390625, "learning_rate": 4.904846243842949e-07, "loss": 0.0004, "reward": 1.1354166865348816, "reward_std": 0.15690934658050537, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9479166716337204, "step": 148 }, { "completion_length": 2025.3021240234375, "entropy": 0.352783203125, "epoch": 0.1702857142857143, "grad_norm": 0.13325975835323334, "kl": 0.008880615234375, "learning_rate": 4.842626371469149e-07, "loss": 0.0004, "reward": 1.1458333730697632, "reward_std": 0.3877560868859291, "rewards/accuracy_reward": 0.3229166716337204, "rewards/format_reward": 0.8229166865348816, "step": 149 }, { "completion_length": 1229.2708740234375, "entropy": 0.274658203125, "epoch": 0.17142857142857143, "grad_norm": 0.1998661607503891, "kl": 0.0081634521484375, "learning_rate": 4.780534655386743e-07, "loss": 0.0003, "reward": 1.1354166865348816, "reward_std": 0.4279330112040043, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.927083358168602, "step": 150 }, { "completion_length": 1687.1146240234375, "entropy": 0.4375, "epoch": 0.17257142857142857, "grad_norm": 0.25493502616882324, "kl": 0.0125579833984375, "learning_rate": 4.7185832004988133e-07, "loss": 0.0005, "reward": 1.114583358168602, "reward_std": 0.5058320835232735, "rewards/accuracy_reward": 0.312500006519258, "rewards/format_reward": 0.802083358168602, "step": 151 }, { "completion_length": 1766.2083740234375, "entropy": 0.41552734375, "epoch": 0.1737142857142857, "grad_norm": 0.24902759492397308, "kl": 0.010467529296875, "learning_rate": 4.656784084364238e-07, "loss": 0.0004, "reward": 0.9895833432674408, "reward_std": 0.4082007445394993, "rewards/accuracy_reward": 0.19791666697710752, "rewards/format_reward": 0.7916666865348816, "step": 152 }, { "completion_length": 1433.1562805175781, "entropy": 0.31591796875, "epoch": 0.17485714285714285, "grad_norm": 0.21754246950149536, "kl": 0.0152740478515625, "learning_rate": 4.59514935484316e-07, "loss": 0.0006, "reward": 1.020833358168602, "reward_std": 0.2934442237019539, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.895833358168602, "step": 153 }, { "completion_length": 2112.354248046875, "entropy": 0.39599609375, "epoch": 0.176, "grad_norm": 0.20787468552589417, "kl": 0.009765625, "learning_rate": 4.5336910277482155e-07, "loss": 0.0004, "reward": 1.0833333432674408, "reward_std": 0.4751042574644089, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.7916666716337204, "step": 154 }, { "completion_length": 1740.2500610351562, "entropy": 0.38623046875, "epoch": 0.17714285714285713, "grad_norm": 0.19822287559509277, "kl": 0.0122833251953125, "learning_rate": 4.4724210845020494e-07, "loss": 0.0005, "reward": 1.020833358168602, "reward_std": 0.41686780750751495, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.8541666865348816, "step": 155 }, { "completion_length": 1723.6146545410156, "entropy": 0.29345703125, "epoch": 0.1782857142857143, "grad_norm": 0.12867745757102966, "kl": 0.00856781005859375, "learning_rate": 4.4113514698014953e-07, "loss": 0.0003, "reward": 1.1458334028720856, "reward_std": 0.4736599698662758, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.8229166865348816, "step": 156 }, { "completion_length": 2180.291748046875, "entropy": 0.4892578125, "epoch": 0.17942857142857144, "grad_norm": 0.27378663420677185, "kl": 0.0123748779296875, "learning_rate": 4.350494089288943e-07, "loss": 0.0005, "reward": 0.864583358168602, "reward_std": 0.4690057747066021, "rewards/accuracy_reward": 0.14583333488553762, "rewards/format_reward": 0.7187500298023224, "step": 157 }, { "completion_length": 1268.4479675292969, "entropy": 0.26611328125, "epoch": 0.18057142857142858, "grad_norm": 0.15639232099056244, "kl": 0.00994873046875, "learning_rate": 4.2898608072313045e-07, "loss": 0.0004, "reward": 1.1458333730697632, "reward_std": 0.3376114182174206, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.958333358168602, "step": 158 }, { "completion_length": 1842.7188110351562, "entropy": 0.332275390625, "epoch": 0.18171428571428572, "grad_norm": 0.22000983357429504, "kl": 0.0113372802734375, "learning_rate": 4.2294634442070553e-07, "loss": 0.0005, "reward": 0.895833358168602, "reward_std": 0.43413354456424713, "rewards/accuracy_reward": 0.10416667256504297, "rewards/format_reward": 0.7916666865348816, "step": 159 }, { "completion_length": 1532.4791870117188, "entropy": 0.39111328125, "epoch": 0.18285714285714286, "grad_norm": 0.5555780529975891, "kl": 0.017059326171875, "learning_rate": 4.1693137748017915e-07, "loss": 0.0007, "reward": 1.2291666865348816, "reward_std": 0.5317695289850235, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.8541666865348816, "step": 160 }, { "completion_length": 1631.375015258789, "entropy": 0.3583984375, "epoch": 0.184, "grad_norm": 0.20350432395935059, "kl": 0.01454925537109375, "learning_rate": 4.1094235253127374e-07, "loss": 0.0006, "reward": 1.1458333730697632, "reward_std": 0.44703245162963867, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.895833358168602, "step": 161 }, { "completion_length": 1646.5208740234375, "entropy": 0.4658203125, "epoch": 0.18514285714285714, "grad_norm": 0.2509794533252716, "kl": 0.014892578125, "learning_rate": 4.0498043714627006e-07, "loss": 0.0006, "reward": 1.0104167014360428, "reward_std": 0.4695500135421753, "rewards/accuracy_reward": 0.19791666977107525, "rewards/format_reward": 0.8125000223517418, "step": 162 }, { "completion_length": 1552.0000610351562, "entropy": 0.424560546875, "epoch": 0.18628571428571428, "grad_norm": 0.17314837872982025, "kl": 0.0128173828125, "learning_rate": 3.9904679361238526e-07, "loss": 0.0005, "reward": 1.1770833879709244, "reward_std": 0.33981742709875107, "rewards/accuracy_reward": 0.3541666828095913, "rewards/format_reward": 0.8229166865348816, "step": 163 }, { "completion_length": 1373.3333587646484, "entropy": 0.320556640625, "epoch": 0.18742857142857142, "grad_norm": 0.2898624539375305, "kl": 0.011077880859375, "learning_rate": 3.931425787051832e-07, "loss": 0.0004, "reward": 1.1562500149011612, "reward_std": 0.36444756016135216, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.9062500149011612, "step": 164 }, { "completion_length": 1446.5208740234375, "entropy": 0.34033203125, "epoch": 0.18857142857142858, "grad_norm": 0.36863088607788086, "kl": 0.0135345458984375, "learning_rate": 3.872689434630585e-07, "loss": 0.0005, "reward": 1.1562500298023224, "reward_std": 0.5087258517742157, "rewards/accuracy_reward": 0.30208334140479565, "rewards/format_reward": 0.8541666865348816, "step": 165 }, { "completion_length": 1569.2500610351562, "entropy": 0.2666015625, "epoch": 0.18971428571428572, "grad_norm": 0.14335811138153076, "kl": 0.00957489013671875, "learning_rate": 3.8142703296283953e-07, "loss": 0.0004, "reward": 1.0937500596046448, "reward_std": 0.3973645642399788, "rewards/accuracy_reward": 0.2395833432674408, "rewards/format_reward": 0.8541666865348816, "step": 166 }, { "completion_length": 1177.822982788086, "entropy": 0.25244140625, "epoch": 0.19085714285714286, "grad_norm": 0.11705330014228821, "kl": 0.0107421875, "learning_rate": 3.7561798609655373e-07, "loss": 0.0004, "reward": 1.041666716337204, "reward_std": 0.18237071484327316, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.9791666716337204, "step": 167 }, { "completion_length": 1497.2604370117188, "entropy": 0.283203125, "epoch": 0.192, "grad_norm": 0.22972875833511353, "kl": 0.0095367431640625, "learning_rate": 3.6984293534939737e-07, "loss": 0.0004, "reward": 1.1354167014360428, "reward_std": 0.3216959089040756, "rewards/accuracy_reward": 0.21875000838190317, "rewards/format_reward": 0.9166666865348816, "step": 168 }, { "completion_length": 1162.0416870117188, "entropy": 0.245849609375, "epoch": 0.19314285714285714, "grad_norm": 0.1791468858718872, "kl": 0.01068878173828125, "learning_rate": 3.641030065789562e-07, "loss": 0.0004, "reward": 1.479166716337204, "reward_std": 0.3380242697894573, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.9583333432674408, "step": 169 }, { "completion_length": 1761.4792175292969, "entropy": 0.392333984375, "epoch": 0.19428571428571428, "grad_norm": 0.22026270627975464, "kl": 0.0135498046875, "learning_rate": 3.5839931879571725e-07, "loss": 0.0005, "reward": 1.1145833432674408, "reward_std": 0.25935307145118713, "rewards/accuracy_reward": 0.35416666977107525, "rewards/format_reward": 0.7604166716337204, "step": 170 }, { "completion_length": 1874.7604675292969, "entropy": 0.406982421875, "epoch": 0.19542857142857142, "grad_norm": 0.18767470121383667, "kl": 0.0109100341796875, "learning_rate": 3.5273298394491515e-07, "loss": 0.0004, "reward": 0.9270833730697632, "reward_std": 0.3239624425768852, "rewards/accuracy_reward": 0.20833334140479565, "rewards/format_reward": 0.71875, "step": 171 }, { "completion_length": 1946.0000610351562, "entropy": 0.48876953125, "epoch": 0.19657142857142856, "grad_norm": 0.32672053575515747, "kl": 0.0180816650390625, "learning_rate": 3.471051066897562e-07, "loss": 0.0007, "reward": 1.1354166865348816, "reward_std": 0.5261635184288025, "rewards/accuracy_reward": 0.3958333460614085, "rewards/format_reward": 0.7395833432674408, "step": 172 }, { "completion_length": 1126.3541793823242, "entropy": 0.27978515625, "epoch": 0.1977142857142857, "grad_norm": 0.20509202778339386, "kl": 0.01155853271484375, "learning_rate": 3.4151678419606233e-07, "loss": 0.0005, "reward": 1.1562500298023224, "reward_std": 0.35620374977588654, "rewards/accuracy_reward": 0.2395833395421505, "rewards/format_reward": 0.9166666716337204, "step": 173 }, { "completion_length": 1285.9167175292969, "entropy": 0.331787109375, "epoch": 0.19885714285714284, "grad_norm": 0.17950935661792755, "kl": 0.01363372802734375, "learning_rate": 3.359691059183761e-07, "loss": 0.0005, "reward": 1.2187500447034836, "reward_std": 0.3375067636370659, "rewards/accuracy_reward": 0.2604166753590107, "rewards/format_reward": 0.9583333432674408, "step": 174 }, { "completion_length": 1471.7291870117188, "entropy": 0.380615234375, "epoch": 0.2, "grad_norm": 0.31842005252838135, "kl": 0.013336181640625, "learning_rate": 3.3046315338757026e-07, "loss": 0.0005, "reward": 1.0625000149011612, "reward_std": 0.361453078687191, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.8645833432674408, "step": 175 }, { "completion_length": 1566.8229675292969, "entropy": 0.343994140625, "epoch": 0.20114285714285715, "grad_norm": 0.280519962310791, "kl": 0.014129638671875, "learning_rate": 3.250000000000001e-07, "loss": 0.0006, "reward": 1.0520833730697632, "reward_std": 0.4207059293985367, "rewards/accuracy_reward": 0.16666667349636555, "rewards/format_reward": 0.8854166865348816, "step": 176 }, { "completion_length": 1703.625015258789, "entropy": 0.453125, "epoch": 0.2022857142857143, "grad_norm": 0.33665430545806885, "kl": 0.013824462890625, "learning_rate": 3.195807108082429e-07, "loss": 0.0006, "reward": 1.083333358168602, "reward_std": 0.4203081615269184, "rewards/accuracy_reward": 0.250000006519258, "rewards/format_reward": 0.8333333432674408, "step": 177 }, { "completion_length": 1527.3021240234375, "entropy": 0.39013671875, "epoch": 0.20342857142857143, "grad_norm": 0.2330997735261917, "kl": 0.017974853515625, "learning_rate": 3.142063423134644e-07, "loss": 0.0007, "reward": 1.2187500298023224, "reward_std": 0.5016858726739883, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9062500298023224, "step": 178 }, { "completion_length": 1635.5833740234375, "entropy": 0.365234375, "epoch": 0.20457142857142857, "grad_norm": 0.29855063557624817, "kl": 0.01336669921875, "learning_rate": 3.0887794225945143e-07, "loss": 0.0005, "reward": 0.979166716337204, "reward_std": 0.4029111787676811, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.8125000298023224, "step": 179 }, { "completion_length": 1145.468765258789, "entropy": 0.3056640625, "epoch": 0.2057142857142857, "grad_norm": 0.238195538520813, "kl": 0.0108795166015625, "learning_rate": 3.0359654942835247e-07, "loss": 0.0004, "reward": 1.5104166865348816, "reward_std": 0.42421412095427513, "rewards/accuracy_reward": 0.5520833358168602, "rewards/format_reward": 0.9583333432674408, "step": 180 }, { "completion_length": 1850.947998046875, "entropy": 0.420166015625, "epoch": 0.20685714285714285, "grad_norm": 0.36524245142936707, "kl": 0.01495361328125, "learning_rate": 2.9836319343816397e-07, "loss": 0.0006, "reward": 0.9791666865348816, "reward_std": 0.3578517735004425, "rewards/accuracy_reward": 0.19791666697710752, "rewards/format_reward": 0.7812500298023224, "step": 181 }, { "completion_length": 1749.0938415527344, "entropy": 0.359375, "epoch": 0.208, "grad_norm": 0.1934923529624939, "kl": 0.0108184814453125, "learning_rate": 2.931788945420058e-07, "loss": 0.0004, "reward": 1.0000000298023224, "reward_std": 0.3973938375711441, "rewards/accuracy_reward": 0.2187500037252903, "rewards/format_reward": 0.7812500149011612, "step": 182 }, { "completion_length": 1332.4479675292969, "entropy": 0.322021484375, "epoch": 0.20914285714285713, "grad_norm": 0.23912180960178375, "kl": 0.0157623291015625, "learning_rate": 2.8804466342921987e-07, "loss": 0.0006, "reward": 1.2187500596046448, "reward_std": 0.35980356484651566, "rewards/accuracy_reward": 0.260416679084301, "rewards/format_reward": 0.958333358168602, "step": 183 }, { "completion_length": 1341.1354675292969, "entropy": 0.35693359375, "epoch": 0.2102857142857143, "grad_norm": 0.22043198347091675, "kl": 0.0140228271484375, "learning_rate": 2.829615010283344e-07, "loss": 0.0006, "reward": 1.0833333432674408, "reward_std": 0.2259194441139698, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.9166666865348816, "step": 184 }, { "completion_length": 1516.4583892822266, "entropy": 0.30029296875, "epoch": 0.21142857142857144, "grad_norm": 0.23556554317474365, "kl": 0.0121002197265625, "learning_rate": 2.7793039831193133e-07, "loss": 0.0005, "reward": 1.0104166716337204, "reward_std": 0.31887371838092804, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.864583358168602, "step": 185 }, { "completion_length": 1785.104232788086, "entropy": 0.446533203125, "epoch": 0.21257142857142858, "grad_norm": 0.38155755400657654, "kl": 0.01576995849609375, "learning_rate": 2.729523361034538e-07, "loss": 0.0006, "reward": 0.9583333879709244, "reward_std": 0.44071806967258453, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.7708333432674408, "step": 186 }, { "completion_length": 1534.7917175292969, "entropy": 0.41748046875, "epoch": 0.21371428571428572, "grad_norm": 0.37235942482948303, "kl": 0.0169219970703125, "learning_rate": 2.6802828488599294e-07, "loss": 0.0007, "reward": 1.0312500149011612, "reward_std": 0.35790160298347473, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.8854166865348816, "step": 187 }, { "completion_length": 1874.6354675292969, "entropy": 0.521484375, "epoch": 0.21485714285714286, "grad_norm": 0.5140780210494995, "kl": 0.0186004638671875, "learning_rate": 2.631592046130896e-07, "loss": 0.0007, "reward": 0.8125000298023224, "reward_std": 0.4419962018728256, "rewards/accuracy_reward": 0.08333333674818277, "rewards/format_reward": 0.7291666716337204, "step": 188 }, { "completion_length": 1185.3958587646484, "entropy": 0.338623046875, "epoch": 0.216, "grad_norm": 0.28023380041122437, "kl": 0.01312255859375, "learning_rate": 2.583460445215911e-07, "loss": 0.0005, "reward": 1.020833358168602, "reward_std": 0.301351435482502, "rewards/accuracy_reward": 0.11458333674818277, "rewards/format_reward": 0.9062500149011612, "step": 189 }, { "completion_length": 1186.5417175292969, "entropy": 0.27587890625, "epoch": 0.21714285714285714, "grad_norm": 0.17898434400558472, "kl": 0.01284027099609375, "learning_rate": 2.5358974294659373e-07, "loss": 0.0005, "reward": 1.1770833432674408, "reward_std": 0.2806706018745899, "rewards/accuracy_reward": 0.23958333395421505, "rewards/format_reward": 0.9375, "step": 190 }, { "completion_length": 1232.6458740234375, "entropy": 0.323486328125, "epoch": 0.21828571428571428, "grad_norm": 0.19368711113929749, "kl": 0.0129547119140625, "learning_rate": 2.488912271385139e-07, "loss": 0.0005, "reward": 1.3125000596046448, "reward_std": 0.3853513225913048, "rewards/accuracy_reward": 0.35416668467223644, "rewards/format_reward": 0.9583333432674408, "step": 191 }, { "completion_length": 1928.1459350585938, "entropy": 0.443603515625, "epoch": 0.21942857142857142, "grad_norm": 0.19557389616966248, "kl": 0.01648712158203125, "learning_rate": 2.4425141308231765e-07, "loss": 0.0007, "reward": 0.8854166865348816, "reward_std": 0.25648824870586395, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.7500000149011612, "step": 192 }, { "completion_length": 2170.5625, "entropy": 0.6103515625, "epoch": 0.22057142857142858, "grad_norm": 0.5704598426818848, "kl": 0.02215576171875, "learning_rate": 2.3967120531894857e-07, "loss": 0.0009, "reward": 0.8541667014360428, "reward_std": 0.5056344047188759, "rewards/accuracy_reward": 0.15625000186264515, "rewards/format_reward": 0.6979166865348816, "step": 193 }, { "completion_length": 2294.229248046875, "entropy": 0.56884765625, "epoch": 0.22171428571428572, "grad_norm": 0.2927386164665222, "kl": 0.02191162109375, "learning_rate": 2.3515149676898552e-07, "loss": 0.0009, "reward": 1.239583358168602, "reward_std": 0.6518271863460541, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.802083358168602, "step": 194 }, { "completion_length": 1263.7917175292969, "entropy": 0.239990234375, "epoch": 0.22285714285714286, "grad_norm": 0.16641545295715332, "kl": 0.0110015869140625, "learning_rate": 2.306931685585657e-07, "loss": 0.0004, "reward": 1.1875000596046448, "reward_std": 0.38468754291534424, "rewards/accuracy_reward": 0.23958334140479565, "rewards/format_reward": 0.9479166865348816, "step": 195 }, { "completion_length": 2122.697967529297, "entropy": 0.60400390625, "epoch": 0.224, "grad_norm": 0.25676071643829346, "kl": 0.023529052734375, "learning_rate": 2.2629708984760706e-07, "loss": 0.0009, "reward": 0.9791666865348816, "reward_std": 0.43499240279197693, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.7500000149011612, "step": 196 }, { "completion_length": 1125.312515258789, "entropy": 0.328125, "epoch": 0.22514285714285714, "grad_norm": 0.19764328002929688, "kl": 0.011962890625, "learning_rate": 2.2196411766036487e-07, "loss": 0.0005, "reward": 1.3437500596046448, "reward_std": 0.39162378013134, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.96875, "step": 197 }, { "completion_length": 1463.8750305175781, "entropy": 0.341796875, "epoch": 0.22628571428571428, "grad_norm": 0.26102352142333984, "kl": 0.01641845703125, "learning_rate": 2.1769509671835223e-07, "loss": 0.0007, "reward": 1.1354167014360428, "reward_std": 0.33387480303645134, "rewards/accuracy_reward": 0.3229166716337204, "rewards/format_reward": 0.8125000149011612, "step": 198 }, { "completion_length": 1503.4792175292969, "entropy": 0.318115234375, "epoch": 0.22742857142857142, "grad_norm": 0.3636128902435303, "kl": 0.0154876708984375, "learning_rate": 2.134908592756607e-07, "loss": 0.0006, "reward": 0.9479167014360428, "reward_std": 0.40788237005472183, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.864583358168602, "step": 199 }, { "completion_length": 1183.2916717529297, "entropy": 0.2479248046875, "epoch": 0.22857142857142856, "grad_norm": 0.20531828701496124, "kl": 0.01239776611328125, "learning_rate": 2.0935222495670968e-07, "loss": 0.0005, "reward": 1.2291666865348816, "reward_std": 0.3643130548298359, "rewards/accuracy_reward": 0.2812500027939677, "rewards/format_reward": 0.9479166716337204, "step": 200 }, { "completion_length": 1661.4271240234375, "entropy": 0.385498046875, "epoch": 0.2297142857142857, "grad_norm": 0.3458462357521057, "kl": 0.020599365234375, "learning_rate": 2.0528000059645995e-07, "loss": 0.0008, "reward": 1.229166716337204, "reward_std": 0.3946245461702347, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.7708333432674408, "step": 201 }, { "completion_length": 1379.8541717529297, "entropy": 0.33154296875, "epoch": 0.23085714285714284, "grad_norm": 0.23957432806491852, "kl": 0.0141448974609375, "learning_rate": 2.0127498008311922e-07, "loss": 0.0006, "reward": 1.3229166865348816, "reward_std": 0.24646351113915443, "rewards/accuracy_reward": 0.46875000558793545, "rewards/format_reward": 0.8541666716337204, "step": 202 }, { "completion_length": 1426.6042175292969, "entropy": 0.337158203125, "epoch": 0.232, "grad_norm": 0.2487732470035553, "kl": 0.015777587890625, "learning_rate": 1.9733794420337213e-07, "loss": 0.0006, "reward": 1.1562500298023224, "reward_std": 0.26880528777837753, "rewards/accuracy_reward": 0.2916666669771075, "rewards/format_reward": 0.8645833432674408, "step": 203 }, { "completion_length": 1260.0312805175781, "entropy": 0.43408203125, "epoch": 0.23314285714285715, "grad_norm": 0.20831915736198425, "kl": 0.01959228515625, "learning_rate": 1.934696604901642e-07, "loss": 0.0008, "reward": 1.1875000298023224, "reward_std": 0.32854287326335907, "rewards/accuracy_reward": 0.2395833432674408, "rewards/format_reward": 0.9479166865348816, "step": 204 }, { "completion_length": 1440.3020935058594, "entropy": 0.331298828125, "epoch": 0.2342857142857143, "grad_norm": 0.22207972407341003, "kl": 0.013214111328125, "learning_rate": 1.8967088307307e-07, "loss": 0.0005, "reward": 1.3645833730697632, "reward_std": 0.46688663959503174, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.8958333432674408, "step": 205 }, { "completion_length": 1833.2084045410156, "entropy": 0.340576171875, "epoch": 0.23542857142857143, "grad_norm": 0.1765735149383545, "kl": 0.0128936767578125, "learning_rate": 1.8594235253127372e-07, "loss": 0.0005, "reward": 0.9375000298023224, "reward_std": 0.27143751084804535, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.8541666865348816, "step": 206 }, { "completion_length": 1413.0313110351562, "entropy": 0.407470703125, "epoch": 0.23657142857142857, "grad_norm": 0.21233247220516205, "kl": 0.017242431640625, "learning_rate": 1.822847957491922e-07, "loss": 0.0007, "reward": 1.0833333730697632, "reward_std": 0.3562712073326111, "rewards/accuracy_reward": 0.15625000651925802, "rewards/format_reward": 0.927083358168602, "step": 207 }, { "completion_length": 1336.4062805175781, "entropy": 0.294189453125, "epoch": 0.2377142857142857, "grad_norm": 0.2305799126625061, "kl": 0.012664794921875, "learning_rate": 1.7869892577476722e-07, "loss": 0.0005, "reward": 1.1875000149011612, "reward_std": 0.2691424489021301, "rewards/accuracy_reward": 0.2812500102445483, "rewards/format_reward": 0.9062500149011612, "step": 208 }, { "completion_length": 1310.2812957763672, "entropy": 0.389404296875, "epoch": 0.23885714285714285, "grad_norm": 0.24238798022270203, "kl": 0.01934814453125, "learning_rate": 1.7518544168045524e-07, "loss": 0.0008, "reward": 1.2395833730697632, "reward_std": 0.36760086938738823, "rewards/accuracy_reward": 0.3229166753590107, "rewards/format_reward": 0.9166666716337204, "step": 209 }, { "completion_length": 1589.3750610351562, "entropy": 0.323486328125, "epoch": 0.24, "grad_norm": 0.32202091813087463, "kl": 0.01515960693359375, "learning_rate": 1.7174502842694212e-07, "loss": 0.0006, "reward": 1.0833333730697632, "reward_std": 0.32458770275115967, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.8750000149011612, "step": 210 }, { "completion_length": 1605.1041717529297, "entropy": 0.4423828125, "epoch": 0.24114285714285713, "grad_norm": 0.37479087710380554, "kl": 0.020111083984375, "learning_rate": 1.6837835672960831e-07, "loss": 0.0008, "reward": 1.0937500298023224, "reward_std": 0.35892703384160995, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.802083358168602, "step": 211 }, { "completion_length": 1098.7396240234375, "entropy": 0.296630859375, "epoch": 0.2422857142857143, "grad_norm": 0.41751039028167725, "kl": 0.0150604248046875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0006, "reward": 1.1875000596046448, "reward_std": 0.309124119579792, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.927083358168602, "step": 212 }, { "completion_length": 1308.5520935058594, "entropy": 0.43505859375, "epoch": 0.24342857142857144, "grad_norm": 0.46686094999313354, "kl": 0.023101806640625, "learning_rate": 1.6186884885673413e-07, "loss": 0.0009, "reward": 1.1666666865348816, "reward_std": 0.4640028476715088, "rewards/accuracy_reward": 0.28125000558793545, "rewards/format_reward": 0.8854167014360428, "step": 213 }, { "completion_length": 1767.1875305175781, "entropy": 0.484619140625, "epoch": 0.24457142857142858, "grad_norm": 0.334074467420578, "kl": 0.02288818359375, "learning_rate": 1.5872728172265146e-07, "loss": 0.0009, "reward": 1.1354166865348816, "reward_std": 0.4487803429365158, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.8437500149011612, "step": 214 }, { "completion_length": 1314.2396545410156, "entropy": 0.354248046875, "epoch": 0.24571428571428572, "grad_norm": 0.10470432788133621, "kl": 0.012969970703125, "learning_rate": 1.5566199398026147e-07, "loss": 0.0005, "reward": 0.9791666865348816, "reward_std": 0.11020193248987198, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.8958333432674408, "step": 215 }, { "completion_length": 1227.4166870117188, "entropy": 0.43896484375, "epoch": 0.24685714285714286, "grad_norm": 0.5151819586753845, "kl": 0.020538330078125, "learning_rate": 1.5267358321348285e-07, "loss": 0.0008, "reward": 1.1979166865348816, "reward_std": 0.38816463202238083, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.8854166716337204, "step": 216 }, { "completion_length": 1324.9792175292969, "entropy": 0.314697265625, "epoch": 0.248, "grad_norm": 0.23024234175682068, "kl": 0.0144195556640625, "learning_rate": 1.4976263201891613e-07, "loss": 0.0006, "reward": 1.3125000298023224, "reward_std": 0.44106680899858475, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.9062500149011612, "step": 217 }, { "completion_length": 1387.0312957763672, "entropy": 0.253662109375, "epoch": 0.24914285714285714, "grad_norm": 0.3250998258590698, "kl": 0.01210784912109375, "learning_rate": 1.469297078922642e-07, "loss": 0.0005, "reward": 1.010416716337204, "reward_std": 0.378255732357502, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.9270833432674408, "step": 218 }, { "completion_length": 1540.6771087646484, "entropy": 0.5185546875, "epoch": 0.2502857142857143, "grad_norm": 0.24495770037174225, "kl": 0.0228729248046875, "learning_rate": 1.4417536311769885e-07, "loss": 0.0009, "reward": 1.1458334028720856, "reward_std": 0.47743887454271317, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.8541667014360428, "step": 219 }, { "completion_length": 1439.0938262939453, "entropy": 0.388427734375, "epoch": 0.25142857142857145, "grad_norm": 0.22012194991111755, "kl": 0.01434326171875, "learning_rate": 1.4150013466019114e-07, "loss": 0.0006, "reward": 0.8854166865348816, "reward_std": 0.17735834047198296, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.8750000149011612, "step": 220 }, { "completion_length": 1156.2604370117188, "entropy": 0.25439453125, "epoch": 0.25257142857142856, "grad_norm": 0.2978960871696472, "kl": 0.0113677978515625, "learning_rate": 1.3890454406082956e-07, "loss": 0.0005, "reward": 1.322916716337204, "reward_std": 0.3704235702753067, "rewards/accuracy_reward": 0.3645833469927311, "rewards/format_reward": 0.9583333432674408, "step": 221 }, { "completion_length": 1340.5104522705078, "entropy": 0.3916015625, "epoch": 0.2537142857142857, "grad_norm": 0.3874484896659851, "kl": 0.019073486328125, "learning_rate": 1.3638909733514452e-07, "loss": 0.0008, "reward": 1.2187500298023224, "reward_std": 0.33246491849422455, "rewards/accuracy_reward": 0.3020833358168602, "rewards/format_reward": 0.9166666865348816, "step": 222 }, { "completion_length": 1576.0208740234375, "entropy": 0.390625, "epoch": 0.25485714285714284, "grad_norm": 0.3034595847129822, "kl": 0.019012451171875, "learning_rate": 1.3395428487445914e-07, "loss": 0.0008, "reward": 1.0520833730697632, "reward_std": 0.32211463153362274, "rewards/accuracy_reward": 0.22916667722165585, "rewards/format_reward": 0.8229166716337204, "step": 223 }, { "completion_length": 2019.4895935058594, "entropy": 0.421875, "epoch": 0.256, "grad_norm": 0.19334331154823303, "kl": 0.0189056396484375, "learning_rate": 1.316005813502869e-07, "loss": 0.0008, "reward": 1.0208333432674408, "reward_std": 0.50140430778265, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.802083358168602, "step": 224 }, { "completion_length": 1698.7708435058594, "entropy": 0.53515625, "epoch": 0.2571428571428571, "grad_norm": 0.3179035782814026, "kl": 0.028045654296875, "learning_rate": 1.2932844562179352e-07, "loss": 0.0011, "reward": 1.0729166865348816, "reward_std": 0.4662906527519226, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.864583358168602, "step": 225 }, { "completion_length": 1244.3125610351562, "entropy": 0.2498779296875, "epoch": 0.2582857142857143, "grad_norm": 0.14828985929489136, "kl": 0.01190948486328125, "learning_rate": 1.2713832064634125e-07, "loss": 0.0005, "reward": 1.3333334028720856, "reward_std": 0.3380242735147476, "rewards/accuracy_reward": 0.3541666818782687, "rewards/format_reward": 0.9791666716337204, "step": 226 }, { "completion_length": 1184.6458587646484, "entropy": 0.3994140625, "epoch": 0.25942857142857145, "grad_norm": 0.25174474716186523, "kl": 0.021026611328125, "learning_rate": 1.2503063339313356e-07, "loss": 0.0008, "reward": 1.1458333730697632, "reward_std": 0.35975906252861023, "rewards/accuracy_reward": 0.17708334140479565, "rewards/format_reward": 0.9687500149011612, "step": 227 }, { "completion_length": 1297.875015258789, "entropy": 0.412353515625, "epoch": 0.26057142857142856, "grad_norm": 0.3392723500728607, "kl": 0.0189666748046875, "learning_rate": 1.2300579475997657e-07, "loss": 0.0008, "reward": 1.1250000149011612, "reward_std": 0.381549421697855, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.8437500149011612, "step": 228 }, { "completion_length": 1462.6667175292969, "entropy": 0.46826171875, "epoch": 0.26171428571428573, "grad_norm": 0.6046648621559143, "kl": 0.020416259765625, "learning_rate": 1.2106419949317388e-07, "loss": 0.0008, "reward": 0.927083358168602, "reward_std": 0.30044983327388763, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.8854166716337204, "step": 229 }, { "completion_length": 2096.229248046875, "entropy": 0.4873046875, "epoch": 0.26285714285714284, "grad_norm": 0.6036101579666138, "kl": 0.0241851806640625, "learning_rate": 1.1920622611056974e-07, "loss": 0.001, "reward": 0.7812500149011612, "reward_std": 0.3880816847085953, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.7291666865348816, "step": 230 }, { "completion_length": 1415.8437805175781, "entropy": 0.298095703125, "epoch": 0.264, "grad_norm": 0.29470252990722656, "kl": 0.01641845703125, "learning_rate": 1.1743223682775649e-07, "loss": 0.0007, "reward": 1.072916716337204, "reward_std": 0.36706580221652985, "rewards/accuracy_reward": 0.2812500037252903, "rewards/format_reward": 0.7916666865348816, "step": 231 }, { "completion_length": 1634.7708740234375, "entropy": 0.44384765625, "epoch": 0.2651428571428571, "grad_norm": 0.38455915451049805, "kl": 0.02178955078125, "learning_rate": 1.1574257748745986e-07, "loss": 0.0009, "reward": 0.947916716337204, "reward_std": 0.2808724343776703, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9062500149011612, "step": 232 }, { "completion_length": 1405.8541870117188, "entropy": 0.42626953125, "epoch": 0.2662857142857143, "grad_norm": 0.40585455298423767, "kl": 0.0194244384765625, "learning_rate": 1.1413757749211602e-07, "loss": 0.0008, "reward": 0.979166716337204, "reward_std": 0.3568150997161865, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.8854166716337204, "step": 233 }, { "completion_length": 1810.0937957763672, "entropy": 0.4306640625, "epoch": 0.2674285714285714, "grad_norm": 0.26030558347702026, "kl": 0.02484130859375, "learning_rate": 1.1261754973965422e-07, "loss": 0.001, "reward": 0.947916679084301, "reward_std": 0.30990852415561676, "rewards/accuracy_reward": 0.22916667256504297, "rewards/format_reward": 0.7187500149011612, "step": 234 }, { "completion_length": 1116.4167175292969, "entropy": 0.325439453125, "epoch": 0.26857142857142857, "grad_norm": 0.35170045495033264, "kl": 0.0147705078125, "learning_rate": 1.1118279056249653e-07, "loss": 0.0006, "reward": 1.4062500596046448, "reward_std": 0.36736297607421875, "rewards/accuracy_reward": 0.4687500223517418, "rewards/format_reward": 0.9375000149011612, "step": 235 }, { "completion_length": 1921.791748046875, "entropy": 0.501220703125, "epoch": 0.26971428571428574, "grad_norm": 0.23206467926502228, "kl": 0.0231475830078125, "learning_rate": 1.0983357966978745e-07, "loss": 0.0009, "reward": 1.0520833730697632, "reward_std": 0.5711337029933929, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.8229166716337204, "step": 236 }, { "completion_length": 1386.6146240234375, "entropy": 0.397705078125, "epoch": 0.27085714285714285, "grad_norm": 0.191674143075943, "kl": 0.0179901123046875, "learning_rate": 1.0857018009286381e-07, "loss": 0.0007, "reward": 1.1354166865348816, "reward_std": 0.26486562192440033, "rewards/accuracy_reward": 0.27083333395421505, "rewards/format_reward": 0.8645833432674408, "step": 237 }, { "completion_length": 1185.1875305175781, "entropy": 0.23828125, "epoch": 0.272, "grad_norm": 0.22915461659431458, "kl": 0.0135345458984375, "learning_rate": 1.0739283813397639e-07, "loss": 0.0005, "reward": 1.1041667014360428, "reward_std": 0.3254629634320736, "rewards/accuracy_reward": 0.3020833507180214, "rewards/format_reward": 0.8020833432674408, "step": 238 }, { "completion_length": 1499.385498046875, "entropy": 0.3544921875, "epoch": 0.27314285714285713, "grad_norm": 0.2647407352924347, "kl": 0.01690673828125, "learning_rate": 1.063017833182728e-07, "loss": 0.0007, "reward": 1.2812500298023224, "reward_std": 0.2977961152791977, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.8229166716337204, "step": 239 }, { "completion_length": 1829.4167175292969, "entropy": 0.8447265625, "epoch": 0.2742857142857143, "grad_norm": 0.652540385723114, "kl": 0.0433349609375, "learning_rate": 1.0529722834905125e-07, "loss": 0.0017, "reward": 0.7916666865348816, "reward_std": 0.4159542843699455, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.7291666716337204, "step": 240 }, { "completion_length": 1511.604248046875, "entropy": 0.53955078125, "epoch": 0.2754285714285714, "grad_norm": 0.4173026978969574, "kl": 0.02716064453125, "learning_rate": 1.0437936906629334e-07, "loss": 0.0011, "reward": 0.9062500447034836, "reward_std": 0.31576745957136154, "rewards/accuracy_reward": 0.06250000093132257, "rewards/format_reward": 0.8437500149011612, "step": 241 }, { "completion_length": 1336.7396087646484, "entropy": 0.57373046875, "epoch": 0.2765714285714286, "grad_norm": 0.49007970094680786, "kl": 0.03204345703125, "learning_rate": 1.0354838440848501e-07, "loss": 0.0013, "reward": 1.020833358168602, "reward_std": 0.23006567358970642, "rewards/accuracy_reward": 0.13541666697710752, "rewards/format_reward": 0.8854166865348816, "step": 242 }, { "completion_length": 1687.6875610351562, "entropy": 0.470703125, "epoch": 0.2777142857142857, "grad_norm": 0.37550440430641174, "kl": 0.024993896484375, "learning_rate": 1.0280443637773163e-07, "loss": 0.001, "reward": 1.0000000447034836, "reward_std": 0.3257058337330818, "rewards/accuracy_reward": 0.14583334140479565, "rewards/format_reward": 0.8541666865348816, "step": 243 }, { "completion_length": 1568.4166870117188, "entropy": 0.3994140625, "epoch": 0.27885714285714286, "grad_norm": 1.2306023836135864, "kl": 0.02081298828125, "learning_rate": 1.0214767000817596e-07, "loss": 0.0008, "reward": 1.2708333432674408, "reward_std": 0.46156562119722366, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.8958333432674408, "step": 244 }, { "completion_length": 1866.0729675292969, "entropy": 0.51953125, "epoch": 0.28, "grad_norm": 0.6619442105293274, "kl": 0.024658203125, "learning_rate": 1.0157821333772304e-07, "loss": 0.001, "reward": 1.135416716337204, "reward_std": 0.6320051997900009, "rewards/accuracy_reward": 0.3020833469927311, "rewards/format_reward": 0.8333333432674408, "step": 245 }, { "completion_length": 1490.1250305175781, "entropy": 0.380859375, "epoch": 0.28114285714285714, "grad_norm": 0.2870371639728546, "kl": 0.0225677490234375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0009, "reward": 1.0625000149011612, "reward_std": 0.3832716643810272, "rewards/accuracy_reward": 0.16666667349636555, "rewards/format_reward": 0.895833358168602, "step": 246 }, { "completion_length": 2113.1563110351562, "entropy": 0.701904296875, "epoch": 0.2822857142857143, "grad_norm": 0.5894800424575806, "kl": 0.0378875732421875, "learning_rate": 1.0070165611810855e-07, "loss": 0.0015, "reward": 0.7708333358168602, "reward_std": 0.5113655403256416, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.6770833358168602, "step": 247 }, { "completion_length": 1501.5000610351562, "entropy": 0.60888671875, "epoch": 0.2834285714285714, "grad_norm": 0.5986164212226868, "kl": 0.0314483642578125, "learning_rate": 1.0039472645551372e-07, "loss": 0.0013, "reward": 1.1979166865348816, "reward_std": 0.40703435614705086, "rewards/accuracy_reward": 0.38541667722165585, "rewards/format_reward": 0.8125000149011612, "step": 248 }, { "completion_length": 1404.7396240234375, "entropy": 0.4013671875, "epoch": 0.2845714285714286, "grad_norm": 0.29348820447921753, "kl": 0.0249481201171875, "learning_rate": 1.0017544823184055e-07, "loss": 0.001, "reward": 1.2812500298023224, "reward_std": 0.34860314428806305, "rewards/accuracy_reward": 0.4479166716337204, "rewards/format_reward": 0.8333333432674408, "step": 249 }, { "completion_length": 1369.3021240234375, "entropy": 0.5029296875, "epoch": 0.2857142857142857, "grad_norm": 0.801852822303772, "kl": 0.029754638671875, "learning_rate": 1.000438641958131e-07, "loss": 0.0012, "reward": 1.0937500298023224, "reward_std": 0.4080042615532875, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.9062500149011612, "step": 250 }, { "epoch": 0.2857142857142857, "step": 250, "total_flos": 0.0, "train_loss": 0.00036543175232403515, "train_runtime": 19061.8828, "train_samples_per_second": 1.259, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }