| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.2857142857142857, | |
| "eval_steps": 500, | |
| "global_step": 250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2700.5104370117188, | |
| "entropy": 0.3671875, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.11866585910320282, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0, | |
| "reward": 0.7604166893288493, | |
| "reward_std": 0.4268697127699852, | |
| "rewards/accuracy_reward": 0.25000001303851604, | |
| "rewards/format_reward": 0.5104166669771075, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 3164.5729370117188, | |
| "entropy": 0.35498046875, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.11806796491146088, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0, | |
| "reward": 0.6875000204890966, | |
| "reward_std": 0.36165641620755196, | |
| "rewards/accuracy_reward": 0.3020833386108279, | |
| "rewards/format_reward": 0.3854166818782687, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3615.7500610351562, | |
| "entropy": 0.45654296875, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.13286341726779938, | |
| "kl": 4.506111145019531e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": 0.18750000279396772, | |
| "reward_std": 0.23272089660167694, | |
| "rewards/accuracy_reward": 0.05208333395421505, | |
| "rewards/format_reward": 0.13541666697710752, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 2482.416717529297, | |
| "entropy": 0.40869140625, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.14147797226905823, | |
| "kl": 3.30805778503418e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.8645833730697632, | |
| "reward_std": 0.4684411771595478, | |
| "rewards/accuracy_reward": 0.18750000279396772, | |
| "rewards/format_reward": 0.677083358168602, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3591.3646850585938, | |
| "entropy": 0.45947265625, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.13485956192016602, | |
| "kl": 4.0203332901000977e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.3750000074505806, | |
| "reward_std": 0.4921039678156376, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 3477.2396850585938, | |
| "entropy": 0.45654296875, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.13642774522304535, | |
| "kl": 4.4018030166625977e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": 0.2812500074505806, | |
| "reward_std": 0.3338681757450104, | |
| "rewards/accuracy_reward": 0.031250000931322575, | |
| "rewards/format_reward": 0.25, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 3389.4584350585938, | |
| "entropy": 0.3916015625, | |
| "epoch": 0.008, | |
| "grad_norm": 0.1285017877817154, | |
| "kl": 2.73287296295166e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.739583358168602, | |
| "reward_std": 0.6624889373779297, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.5104166865348816, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2890.104248046875, | |
| "entropy": 0.343017578125, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.07784460484981537, | |
| "kl": 2.562999725341797e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.8541667014360428, | |
| "reward_std": 0.31141985207796097, | |
| "rewards/accuracy_reward": 0.3750000102445483, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3353.1875610351562, | |
| "entropy": 0.4384765625, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.1676989048719406, | |
| "kl": 4.4345855712890625e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.5000000074505806, | |
| "reward_std": 0.533780675381422, | |
| "rewards/accuracy_reward": 0.13541667070239782, | |
| "rewards/format_reward": 0.3645833432674408, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2996.1875610351562, | |
| "entropy": 0.3466796875, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.15292252600193024, | |
| "kl": 3.3229589462280273e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.552083358168602, | |
| "reward_std": 0.4631676971912384, | |
| "rewards/accuracy_reward": 0.13541667256504297, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3595.4063720703125, | |
| "entropy": 0.38134765625, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.15428116917610168, | |
| "kl": 3.191828727722168e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.260416672565043, | |
| "reward_std": 0.4286932796239853, | |
| "rewards/accuracy_reward": 0.09375000558793545, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2551.479217529297, | |
| "entropy": 0.41015625, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.1346665471792221, | |
| "kl": 3.972649574279785e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": 0.8125000447034836, | |
| "reward_std": 0.455630861222744, | |
| "rewards/accuracy_reward": 0.17708333395421505, | |
| "rewards/format_reward": 0.6354166865348816, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 3197.1146240234375, | |
| "entropy": 0.40625, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.11476687341928482, | |
| "kl": 3.629922866821289e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.541666679084301, | |
| "reward_std": 0.28364068269729614, | |
| "rewards/accuracy_reward": 0.15625000279396772, | |
| "rewards/format_reward": 0.3854166716337204, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2990.2709350585938, | |
| "entropy": 0.36376953125, | |
| "epoch": 0.016, | |
| "grad_norm": 0.19115598499774933, | |
| "kl": 2.4765729904174805e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.6875000298023224, | |
| "reward_std": 0.4306366816163063, | |
| "rewards/accuracy_reward": 0.2083333432674408, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2998.2084350585938, | |
| "entropy": 0.376708984375, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.08084020018577576, | |
| "kl": 2.4259090423583984e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.65625, | |
| "reward_std": 0.2722426578402519, | |
| "rewards/accuracy_reward": 0.2604166679084301, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3789.6251220703125, | |
| "entropy": 0.44580078125, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.13743434846401215, | |
| "kl": 3.844499588012695e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 0.28125000558793545, | |
| "reward_std": 0.43100808560848236, | |
| "rewards/accuracy_reward": 0.11458333488553762, | |
| "rewards/format_reward": 0.16666666977107525, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 2505.7188110351562, | |
| "entropy": 0.45361328125, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.1941273808479309, | |
| "kl": 3.5628676414489746e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.8541666865348816, | |
| "reward_std": 0.42319394648075104, | |
| "rewards/accuracy_reward": 0.2500000102445483, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 3116.479248046875, | |
| "entropy": 0.36669921875, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.1034688651561737, | |
| "kl": 2.0712614059448242e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 0.645833358168602, | |
| "reward_std": 0.3843524754047394, | |
| "rewards/accuracy_reward": 0.1875000111758709, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 3206.135498046875, | |
| "entropy": 0.3837890625, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.12397009134292603, | |
| "kl": 1.7814338207244873e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.6041666939854622, | |
| "reward_std": 0.49627041071653366, | |
| "rewards/accuracy_reward": 0.20833334419876337, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2758.656280517578, | |
| "entropy": 0.346435546875, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.14725361764431, | |
| "kl": 1.5437602996826172e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.9270833879709244, | |
| "reward_std": 0.5184547901153564, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 0.614583358168602, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2928.7083740234375, | |
| "entropy": 0.419921875, | |
| "epoch": 0.024, | |
| "grad_norm": 0.16036786139011383, | |
| "kl": 3.966689109802246e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": 0.6458333544433117, | |
| "reward_std": 0.45568280667066574, | |
| "rewards/accuracy_reward": 0.18750000465661287, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1764.3541870117188, | |
| "entropy": 0.3837890625, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.20493587851524353, | |
| "kl": 3.927946090698242e-05, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.177083358168602, | |
| "reward_std": 0.3750041276216507, | |
| "rewards/accuracy_reward": 0.3125000102445483, | |
| "rewards/format_reward": 0.8645833432674408, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2654.7188110351562, | |
| "entropy": 0.37841796875, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.14049632847309113, | |
| "kl": 2.409517765045166e-05, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.7395833507180214, | |
| "reward_std": 0.5375720374286175, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/format_reward": 0.5520833507180214, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2969.5000610351562, | |
| "entropy": 0.369140625, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.16432780027389526, | |
| "kl": 3.295391798019409e-05, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.7708333432674408, | |
| "reward_std": 0.46594493091106415, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2851.1458740234375, | |
| "entropy": 0.4326171875, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.13283270597457886, | |
| "kl": 8.803606033325195e-05, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.34967152029275894, | |
| "rewards/accuracy_reward": 0.2395833358168602, | |
| "rewards/format_reward": 0.4270833432674408, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 3068.9063110351562, | |
| "entropy": 0.40234375, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.08674507588148117, | |
| "kl": 2.0168721675872803e-05, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.272128164768219, | |
| "rewards/accuracy_reward": 0.19791666977107525, | |
| "rewards/format_reward": 0.4687500149011612, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 3239.416748046875, | |
| "entropy": 0.43212890625, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.13486000895500183, | |
| "kl": 4.280870780348778e-05, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0, | |
| "reward": 0.6562500223517418, | |
| "reward_std": 0.47437138110399246, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.4687500149011612, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 3000.885498046875, | |
| "entropy": 0.4033203125, | |
| "epoch": 0.032, | |
| "grad_norm": 0.12638980150222778, | |
| "kl": 7.337331771850586e-05, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0, | |
| "reward": 0.7604166865348816, | |
| "reward_std": 0.4810705706477165, | |
| "rewards/accuracy_reward": 0.3229166716337204, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3736.104248046875, | |
| "entropy": 0.4453125, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.17240460216999054, | |
| "kl": 0.00014406442642211914, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0, | |
| "reward": 0.2708333469927311, | |
| "reward_std": 0.4822928011417389, | |
| "rewards/accuracy_reward": 0.07291666697710752, | |
| "rewards/format_reward": 0.19791667256504297, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 3230.916748046875, | |
| "entropy": 0.3857421875, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.11792454868555069, | |
| "kl": 0.00036847591400146484, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0, | |
| "reward": 0.7500000353902578, | |
| "reward_std": 0.6145796477794647, | |
| "rewards/accuracy_reward": 0.281250006519258, | |
| "rewards/format_reward": 0.4687500102445483, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 3289.6771240234375, | |
| "entropy": 0.39013671875, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.14808149635791779, | |
| "kl": 0.0001868605613708496, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "reward": 0.5, | |
| "reward_std": 0.41467901691794395, | |
| "rewards/accuracy_reward": 0.17708333861082792, | |
| "rewards/format_reward": 0.322916679084301, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3417.041748046875, | |
| "entropy": 0.44775390625, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.10391438752412796, | |
| "kl": 0.00020647048950195312, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0, | |
| "reward": 0.7291666977107525, | |
| "reward_std": 0.4650338739156723, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3650.1251220703125, | |
| "entropy": 0.3798828125, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.10379055887460709, | |
| "kl": 0.0002885758876800537, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0, | |
| "reward": 0.42708334140479565, | |
| "reward_std": 0.5094060599803925, | |
| "rewards/accuracy_reward": 0.16666667070239782, | |
| "rewards/format_reward": 0.2604166781529784, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2580.8438110351562, | |
| "entropy": 0.43701171875, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.14274698495864868, | |
| "kl": 0.000626683235168457, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0, | |
| "reward": 0.8645833563059568, | |
| "reward_std": 0.5226760059595108, | |
| "rewards/accuracy_reward": 0.33333334885537624, | |
| "rewards/format_reward": 0.5312500149011612, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3257.8333740234375, | |
| "entropy": 0.42333984375, | |
| "epoch": 0.04, | |
| "grad_norm": 0.15037870407104492, | |
| "kl": 0.0006988048553466797, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0, | |
| "reward": 0.5520833507180214, | |
| "reward_std": 0.5667570382356644, | |
| "rewards/accuracy_reward": 0.19791666883975267, | |
| "rewards/format_reward": 0.35416668467223644, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 3751.5000610351562, | |
| "entropy": 0.50048828125, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.1191205084323883, | |
| "kl": 0.0008706152439117432, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0, | |
| "reward": 0.22916667442768812, | |
| "reward_std": 0.4133975952863693, | |
| "rewards/accuracy_reward": 0.031250000931322575, | |
| "rewards/format_reward": 0.19791666697710752, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3529.010498046875, | |
| "entropy": 0.4375, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.09502461552619934, | |
| "kl": 0.0006368160247802734, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0, | |
| "reward": 0.27083334140479565, | |
| "reward_std": 0.1726192608475685, | |
| "rewards/accuracy_reward": 0.031250000931322575, | |
| "rewards/format_reward": 0.23958333861082792, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3582.3646240234375, | |
| "entropy": 0.45263671875, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.057916510850191116, | |
| "kl": 0.0004200935363769531, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0, | |
| "reward": 0.260416679084301, | |
| "reward_std": 0.18053755164146423, | |
| "rewards/accuracy_reward": 0.11458333395421505, | |
| "rewards/format_reward": 0.1458333358168602, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2990.2188110351562, | |
| "entropy": 0.36572265625, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.13697481155395508, | |
| "kl": 0.0011453032493591309, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0, | |
| "reward": 0.729166679084301, | |
| "reward_std": 0.2874651923775673, | |
| "rewards/accuracy_reward": 0.2500000027939677, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2710.760498046875, | |
| "entropy": 0.38818359375, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.17243291437625885, | |
| "kl": 0.0023212432861328125, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7395833656191826, | |
| "reward_std": 0.44072920083999634, | |
| "rewards/accuracy_reward": 0.19791667442768812, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 3186.885498046875, | |
| "entropy": 0.380859375, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.13304336369037628, | |
| "kl": 0.0006128549575805664, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0, | |
| "reward": 0.5000000074505806, | |
| "reward_std": 0.4803263619542122, | |
| "rewards/accuracy_reward": 0.09375000279396772, | |
| "rewards/format_reward": 0.4062500149011612, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 3108.7500610351562, | |
| "entropy": 0.45703125, | |
| "epoch": 0.048, | |
| "grad_norm": 0.05359484255313873, | |
| "kl": 0.0004253387451171875, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0, | |
| "reward": 0.43750000186264515, | |
| "reward_std": 0.13371453434228897, | |
| "rewards/accuracy_reward": 0.13541666697710752, | |
| "rewards/format_reward": 0.3020833348855376, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 3218.541748046875, | |
| "entropy": 0.42431640625, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.12211965769529343, | |
| "kl": 0.0012423992156982422, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0, | |
| "reward": 0.510416692122817, | |
| "reward_std": 0.34035979211330414, | |
| "rewards/accuracy_reward": 0.1875000037252903, | |
| "rewards/format_reward": 0.3229166716337204, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2851.635482788086, | |
| "entropy": 0.39013671875, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.1306796669960022, | |
| "kl": 0.0009481906890869141, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0, | |
| "reward": 0.7708333563059568, | |
| "reward_std": 0.42007729411125183, | |
| "rewards/accuracy_reward": 0.2708333386108279, | |
| "rewards/format_reward": 0.5000000176951289, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3627.354248046875, | |
| "entropy": 0.43505859375, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.1441727578639984, | |
| "kl": 0.0014786720275878906, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5104166902601719, | |
| "reward_std": 0.4969704672694206, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 0.3229166828095913, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3407.3438110351562, | |
| "entropy": 0.50341796875, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.14224384725093842, | |
| "kl": 0.001373291015625, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0001, | |
| "reward": 0.30208334885537624, | |
| "reward_std": 0.34669168293476105, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.28125001303851604, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 3059.5521850585938, | |
| "entropy": 0.38916015625, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.1632174700498581, | |
| "kl": 0.001373887062072754, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0001, | |
| "reward": 0.8437500149011612, | |
| "reward_std": 0.5346902906894684, | |
| "rewards/accuracy_reward": 0.322916679084301, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2942.260498046875, | |
| "entropy": 0.40576171875, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.12902727723121643, | |
| "kl": 0.0034656524658203125, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6562500335276127, | |
| "reward_std": 0.5015772953629494, | |
| "rewards/accuracy_reward": 0.19791667722165585, | |
| "rewards/format_reward": 0.4583333544433117, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2611.0000610351562, | |
| "entropy": 0.38330078125, | |
| "epoch": 0.056, | |
| "grad_norm": 0.1312219202518463, | |
| "kl": 0.005061149597167969, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0002, | |
| "reward": 0.7916666716337204, | |
| "reward_std": 0.38058819621801376, | |
| "rewards/accuracy_reward": 0.2187500074505806, | |
| "rewards/format_reward": 0.5729166716337204, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 3060.0000610351562, | |
| "entropy": 0.349609375, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.08954072743654251, | |
| "kl": 0.0010552406311035156, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0, | |
| "reward": 0.6250000027939677, | |
| "reward_std": 0.31528370827436447, | |
| "rewards/accuracy_reward": 0.2604166716337204, | |
| "rewards/format_reward": 0.3645833460614085, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2443.1146240234375, | |
| "entropy": 0.46533203125, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.15657995641231537, | |
| "kl": 0.007048606872558594, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.31336943060159683, | |
| "rewards/accuracy_reward": 0.1979166753590107, | |
| "rewards/format_reward": 0.5104166716337204, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 3218.322967529297, | |
| "entropy": 0.404296875, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.09073984622955322, | |
| "kl": 0.0022614002227783203, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0001, | |
| "reward": 0.791666716337204, | |
| "reward_std": 0.5437296032905579, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.4375, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2848.8855590820312, | |
| "entropy": 0.42138671875, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.16575101017951965, | |
| "kl": 0.001046299934387207, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0, | |
| "reward": 0.979166679084301, | |
| "reward_std": 0.5605000704526901, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2977.2188110351562, | |
| "entropy": 0.39501953125, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.1525822877883911, | |
| "kl": 0.0014376640319824219, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9062500223517418, | |
| "reward_std": 0.6681454330682755, | |
| "rewards/accuracy_reward": 0.37500000558793545, | |
| "rewards/format_reward": 0.5312500149011612, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 3231.4063110351562, | |
| "entropy": 0.43701171875, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.1321529597043991, | |
| "kl": 0.0012722015380859375, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5416666902601719, | |
| "reward_std": 0.4359280541539192, | |
| "rewards/accuracy_reward": 0.18750000093132257, | |
| "rewards/format_reward": 0.35416668467223644, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 3173.5001220703125, | |
| "entropy": 0.421875, | |
| "epoch": 0.064, | |
| "grad_norm": 0.08983828872442245, | |
| "kl": 0.0008525848388671875, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0, | |
| "reward": 0.6250000027939677, | |
| "reward_std": 0.23468155041337013, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.3958333460614085, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3610.635498046875, | |
| "entropy": 0.33154296875, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.07729873806238174, | |
| "kl": 0.0005555152893066406, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0, | |
| "reward": 0.4479166669771075, | |
| "reward_std": 0.4583342596888542, | |
| "rewards/accuracy_reward": 0.11458333488553762, | |
| "rewards/format_reward": 0.3333333460614085, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 2336.0521545410156, | |
| "entropy": 0.329833984375, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.15206408500671387, | |
| "kl": 0.0069732666015625, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9895833656191826, | |
| "reward_std": 0.4796273037791252, | |
| "rewards/accuracy_reward": 0.3020833386108279, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 3113.2500610351562, | |
| "entropy": 0.36474609375, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.09677103161811829, | |
| "kl": 0.0010061264038085938, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0, | |
| "reward": 0.5312500260770321, | |
| "reward_std": 0.302716389298439, | |
| "rewards/accuracy_reward": 0.1562500037252903, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 3189.0313110351562, | |
| "entropy": 0.3974609375, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.08244970440864563, | |
| "kl": 0.0017404556274414062, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4062500074505806, | |
| "reward_std": 0.31878524273633957, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.3229166828095913, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 3010.1251220703125, | |
| "entropy": 0.341796875, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.11613977700471878, | |
| "kl": 0.0011224746704101562, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0, | |
| "reward": 0.8437500111758709, | |
| "reward_std": 0.3721206858754158, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.5937500111758709, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2635.2083740234375, | |
| "entropy": 0.35302734375, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.13679753243923187, | |
| "kl": 0.002574920654296875, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9375000298023224, | |
| "reward_std": 0.5850840508937836, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2299.1875610351562, | |
| "entropy": 0.38427734375, | |
| "epoch": 0.072, | |
| "grad_norm": 0.1331767588853836, | |
| "kl": 0.0034208297729492188, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0001, | |
| "reward": 1.0729166865348816, | |
| "reward_std": 0.4625158831477165, | |
| "rewards/accuracy_reward": 0.36458334140479565, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2695.5104370117188, | |
| "entropy": 0.3974609375, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.13731464743614197, | |
| "kl": 0.0019664764404296875, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6770833656191826, | |
| "reward_std": 0.4545453414320946, | |
| "rewards/accuracy_reward": 0.19791667256504297, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2934.0938110351562, | |
| "entropy": 0.370361328125, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.1463320106267929, | |
| "kl": 0.002140045166015625, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0001, | |
| "reward": 0.8229166772216558, | |
| "reward_std": 0.36905180662870407, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.48958334885537624, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2158.9479598999023, | |
| "entropy": 0.34326171875, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.168357253074646, | |
| "kl": 0.0015735626220703125, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9166666818782687, | |
| "reward_std": 0.25834736227989197, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.5416666669771075, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3636.229248046875, | |
| "entropy": 0.36962890625, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.10577098280191422, | |
| "kl": 0.0020542144775390625, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0001, | |
| "reward": 0.18750000279396772, | |
| "reward_std": 0.20090095698833466, | |
| "rewards/accuracy_reward": 0.02083333395421505, | |
| "rewards/format_reward": 0.1666666716337204, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 2369.2916870117188, | |
| "entropy": 0.401611328125, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.20740464329719543, | |
| "kl": 0.00390625, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0002, | |
| "reward": 0.802083358168602, | |
| "reward_std": 0.47093402594327927, | |
| "rewards/accuracy_reward": 0.2395833358168602, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2438.010498046875, | |
| "entropy": 0.447265625, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.20514391362667084, | |
| "kl": 0.005084991455078125, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0002, | |
| "reward": 0.583333358168602, | |
| "reward_std": 0.31931574642658234, | |
| "rewards/accuracy_reward": 0.06250000279396772, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3127.2083740234375, | |
| "entropy": 0.364501953125, | |
| "epoch": 0.08, | |
| "grad_norm": 0.08004138618707657, | |
| "kl": 0.0024518966674804688, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5416666977107525, | |
| "reward_std": 0.31260205432772636, | |
| "rewards/accuracy_reward": 0.1145833358168602, | |
| "rewards/format_reward": 0.4270833544433117, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2821.635467529297, | |
| "entropy": 0.435546875, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.14604488015174866, | |
| "kl": 0.005157470703125, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5833333507180214, | |
| "reward_std": 0.2736881971359253, | |
| "rewards/accuracy_reward": 0.1770833432674408, | |
| "rewards/format_reward": 0.4062500074505806, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 3006.6563110351562, | |
| "entropy": 0.505859375, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.19332122802734375, | |
| "kl": 0.0032939910888671875, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4583333507180214, | |
| "reward_std": 0.38849541172385216, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3805.1771850585938, | |
| "entropy": 0.5087890625, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.08017224818468094, | |
| "kl": 0.00171661376953125, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0001, | |
| "reward": 0.281250006519258, | |
| "reward_std": 0.3908010721206665, | |
| "rewards/accuracy_reward": 0.11458333395421505, | |
| "rewards/format_reward": 0.16666667256504297, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 3330.1563110351562, | |
| "entropy": 0.427734375, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.1395280957221985, | |
| "kl": 0.0033082962036132812, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5729166939854622, | |
| "reward_std": 0.4739295169711113, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.322916679084301, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 3050.5313110351562, | |
| "entropy": 0.3994140625, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.11711548268795013, | |
| "kl": 0.0038471221923828125, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5312500223517418, | |
| "reward_std": 0.39057330042123795, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.447916679084301, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2418.604217529297, | |
| "entropy": 0.4072265625, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.13031832873821259, | |
| "kl": 0.001972198486328125, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7083333507180214, | |
| "reward_std": 0.29223429784178734, | |
| "rewards/accuracy_reward": 0.08333333861082792, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3237.010498046875, | |
| "entropy": 0.431640625, | |
| "epoch": 0.088, | |
| "grad_norm": 0.10760512948036194, | |
| "kl": 0.0021152496337890625, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5520833544433117, | |
| "reward_std": 0.3883203938603401, | |
| "rewards/accuracy_reward": 0.13541666697710752, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 2933.1459350585938, | |
| "entropy": 0.38671875, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.11242065578699112, | |
| "kl": 0.0015392303466796875, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6666666679084301, | |
| "reward_std": 0.5230761393904686, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2425.1146240234375, | |
| "entropy": 0.359619140625, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.08172761648893356, | |
| "kl": 0.0024118423461914062, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0001, | |
| "reward": 0.822916679084301, | |
| "reward_std": 0.3384963124990463, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.6145833432674408, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3421.666748046875, | |
| "entropy": 0.5087890625, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.13740910589694977, | |
| "kl": 0.003353118896484375, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6666666939854622, | |
| "reward_std": 0.408274307847023, | |
| "rewards/accuracy_reward": 0.25000001303851604, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3048.104217529297, | |
| "entropy": 0.58544921875, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.13280166685581207, | |
| "kl": 0.006072998046875, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5416666716337204, | |
| "reward_std": 0.29809625819325447, | |
| "rewards/accuracy_reward": 0.1354166679084301, | |
| "rewards/format_reward": 0.4062500149011612, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2573.3126220703125, | |
| "entropy": 0.42626953125, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.1755312979221344, | |
| "kl": 0.0029544830322265625, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0001, | |
| "reward": 0.8437500149011612, | |
| "reward_std": 0.3547321856021881, | |
| "rewards/accuracy_reward": 0.3229166716337204, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 3012.9271240234375, | |
| "entropy": 0.48291015625, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.13794995844364166, | |
| "kl": 0.0030269622802734375, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5208333432674408, | |
| "reward_std": 0.3228641413152218, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 3144.791748046875, | |
| "entropy": 0.46923828125, | |
| "epoch": 0.096, | |
| "grad_norm": 0.12894625961780548, | |
| "kl": 0.0023136138916015625, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6979166902601719, | |
| "reward_std": 0.5448310598731041, | |
| "rewards/accuracy_reward": 0.2395833358168602, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 2702.4583740234375, | |
| "entropy": 0.3408203125, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.10937459766864777, | |
| "kl": 0.002269744873046875, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7604166939854622, | |
| "reward_std": 0.47234033048152924, | |
| "rewards/accuracy_reward": 0.15625000186264515, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2702.3959350585938, | |
| "entropy": 0.48193359375, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.14910289645195007, | |
| "kl": 0.004451751708984375, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0002, | |
| "reward": 0.625, | |
| "reward_std": 0.2315434329211712, | |
| "rewards/accuracy_reward": 0.12500000558793545, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2416.2709045410156, | |
| "entropy": 0.44677734375, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.215481698513031, | |
| "kl": 0.004787445068359375, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0002, | |
| "reward": 0.8750000298023224, | |
| "reward_std": 0.4938344843685627, | |
| "rewards/accuracy_reward": 0.2083333395421505, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2382.1875610351562, | |
| "entropy": 0.4248046875, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.19311273097991943, | |
| "kl": 0.00421142578125, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9270833730697632, | |
| "reward_std": 0.6074926629662514, | |
| "rewards/accuracy_reward": 0.3020833358168602, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 2669.0209350585938, | |
| "entropy": 0.3544921875, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.15478843450546265, | |
| "kl": 0.003444671630859375, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6875000223517418, | |
| "reward_std": 0.4951842427253723, | |
| "rewards/accuracy_reward": 0.13541666977107525, | |
| "rewards/format_reward": 0.5520833656191826, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2330.072967529297, | |
| "entropy": 0.6162109375, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.14878880977630615, | |
| "kl": 0.01177978515625, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0005, | |
| "reward": 0.6770833656191826, | |
| "reward_std": 0.27013952285051346, | |
| "rewards/accuracy_reward": 0.07291666883975267, | |
| "rewards/format_reward": 0.6041666744276881, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 2697.4375, | |
| "entropy": 0.44921875, | |
| "epoch": 0.104, | |
| "grad_norm": 0.1833600103855133, | |
| "kl": 0.005588531494140625, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0002, | |
| "reward": 0.8750000298023224, | |
| "reward_std": 0.5495730713009834, | |
| "rewards/accuracy_reward": 0.260416679084301, | |
| "rewards/format_reward": 0.6145833507180214, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2586.010498046875, | |
| "entropy": 0.39111328125, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.14938902854919434, | |
| "kl": 0.007785797119140625, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0003, | |
| "reward": 0.687500037252903, | |
| "reward_std": 0.3853628858923912, | |
| "rewards/accuracy_reward": 0.1250000074505806, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3562.0000610351562, | |
| "entropy": 0.625, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.1706770658493042, | |
| "kl": 0.0075836181640625, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1354166679084301, | |
| "reward_std": 0.1874575838446617, | |
| "rewards/accuracy_reward": 0.0, | |
| "rewards/format_reward": 0.1354166679084301, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 2885.6876220703125, | |
| "entropy": 0.568359375, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.12106288969516754, | |
| "kl": 0.005161285400390625, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6562500149011612, | |
| "reward_std": 0.31557222083210945, | |
| "rewards/accuracy_reward": 0.22916666697710752, | |
| "rewards/format_reward": 0.4270833432674408, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3645.1146850585938, | |
| "entropy": 0.4736328125, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.0870869979262352, | |
| "kl": 0.0027751922607421875, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4270833460614085, | |
| "reward_std": 0.4592607915401459, | |
| "rewards/accuracy_reward": 0.13541667442768812, | |
| "rewards/format_reward": 0.2916666744276881, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2586.604217529297, | |
| "entropy": 0.39990234375, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.16784177720546722, | |
| "kl": 0.0048675537109375, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0002, | |
| "reward": 0.8854166865348816, | |
| "reward_std": 0.5198706425726414, | |
| "rewards/accuracy_reward": 0.31250000558793545, | |
| "rewards/format_reward": 0.572916679084301, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 2758.572998046875, | |
| "entropy": 0.4267578125, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.21689672768115997, | |
| "kl": 0.003742218017578125, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0001, | |
| "reward": 0.802083358168602, | |
| "reward_std": 0.5127636715769768, | |
| "rewards/accuracy_reward": 0.23958333861082792, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2189.7395935058594, | |
| "entropy": 0.34521484375, | |
| "epoch": 0.112, | |
| "grad_norm": 0.17968373000621796, | |
| "kl": 0.003505706787109375, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0001, | |
| "reward": 0.84375, | |
| "reward_std": 0.23535311594605446, | |
| "rewards/accuracy_reward": 0.15625000093132257, | |
| "rewards/format_reward": 0.6875000074505806, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2465.3646240234375, | |
| "entropy": 0.38818359375, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.16559617221355438, | |
| "kl": 0.004673004150390625, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9166666716337204, | |
| "reward_std": 0.5209992416203022, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2335.166717529297, | |
| "entropy": 0.379150390625, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.14200669527053833, | |
| "kl": 0.005016326904296875, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0002, | |
| "reward": 1.0625000298023224, | |
| "reward_std": 0.47875121980905533, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2051.1459350585938, | |
| "entropy": 0.426513671875, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.23151825368404388, | |
| "kl": 0.0040740966796875, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0002, | |
| "reward": 0.8437500149011612, | |
| "reward_std": 0.22601452097296715, | |
| "rewards/accuracy_reward": 0.1458333432674408, | |
| "rewards/format_reward": 0.6979166716337204, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 1650.8438110351562, | |
| "entropy": 0.356201171875, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.09349505603313446, | |
| "kl": 0.0050201416015625, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0002, | |
| "reward": 0.895833358168602, | |
| "reward_std": 0.2152964137494564, | |
| "rewards/accuracy_reward": 0.07291666977107525, | |
| "rewards/format_reward": 0.8229166865348816, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 1967.0833892822266, | |
| "entropy": 0.361328125, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.16049090027809143, | |
| "kl": 0.0059795379638671875, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0002, | |
| "reward": 1.0937500298023224, | |
| "reward_std": 0.3835059180855751, | |
| "rewards/accuracy_reward": 0.2395833358168602, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2380.3125610351562, | |
| "entropy": 0.44921875, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.1609274446964264, | |
| "kl": 0.0071868896484375, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.40659596025943756, | |
| "rewards/accuracy_reward": 0.12500000186264515, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2000.1876068115234, | |
| "entropy": 0.34033203125, | |
| "epoch": 0.12, | |
| "grad_norm": 0.19138775765895844, | |
| "kl": 0.00435638427734375, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0002, | |
| "reward": 1.0312500149011612, | |
| "reward_std": 0.5785520151257515, | |
| "rewards/accuracy_reward": 0.3229166744276881, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2208.072967529297, | |
| "entropy": 0.31982421875, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.09388583153486252, | |
| "kl": 0.0026397705078125, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0001, | |
| "reward": 1.1458333656191826, | |
| "reward_std": 0.35796716436743736, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2212.6771545410156, | |
| "entropy": 0.52099609375, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.19971498847007751, | |
| "kl": 0.006237030029296875, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0002, | |
| "reward": 0.8333333730697632, | |
| "reward_std": 0.3842815235257149, | |
| "rewards/accuracy_reward": 0.1145833358168602, | |
| "rewards/format_reward": 0.7187500298023224, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2266.5313110351562, | |
| "entropy": 0.384765625, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.1431254744529724, | |
| "kl": 0.00372314453125, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9270833535119891, | |
| "reward_std": 0.38414933159947395, | |
| "rewards/accuracy_reward": 0.2395833358168602, | |
| "rewards/format_reward": 0.6875000102445483, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 2486.4584350585938, | |
| "entropy": 0.381103515625, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.1384974718093872, | |
| "kl": 0.004756927490234375, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6875000149011612, | |
| "reward_std": 0.3808614909648895, | |
| "rewards/accuracy_reward": 0.14583333674818277, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2373.7500610351562, | |
| "entropy": 0.43359375, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.18450307846069336, | |
| "kl": 0.004810333251953125, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9166666865348816, | |
| "reward_std": 0.5121813043951988, | |
| "rewards/accuracy_reward": 0.23958333767950535, | |
| "rewards/format_reward": 0.677083358168602, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 2390.156280517578, | |
| "entropy": 0.482666015625, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.14442622661590576, | |
| "kl": 0.00649261474609375, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7604166865348816, | |
| "reward_std": 0.3381837457418442, | |
| "rewards/accuracy_reward": 0.1666666753590107, | |
| "rewards/format_reward": 0.5937500074505806, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 2708.979248046875, | |
| "entropy": 0.46240234375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.15874600410461426, | |
| "kl": 0.00481414794921875, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0002, | |
| "reward": 0.8958333432674408, | |
| "reward_std": 0.41653573513031006, | |
| "rewards/accuracy_reward": 0.2916666707023978, | |
| "rewards/format_reward": 0.6041666939854622, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 1425.0208435058594, | |
| "entropy": 0.34765625, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.2752454876899719, | |
| "kl": 0.01012420654296875, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1041667014360428, | |
| "reward_std": 0.4169319197535515, | |
| "rewards/accuracy_reward": 0.2708333432674408, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 1730.0833587646484, | |
| "entropy": 0.32763671875, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.1002146303653717, | |
| "kl": 0.00527191162109375, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9895833730697632, | |
| "reward_std": 0.24960162490606308, | |
| "rewards/accuracy_reward": 0.09375000279396772, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2451.6771240234375, | |
| "entropy": 0.44873046875, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.09600935876369476, | |
| "kl": 0.006961822509765625, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7187500223517418, | |
| "reward_std": 0.19398127868771553, | |
| "rewards/accuracy_reward": 0.15625, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3191.8751220703125, | |
| "entropy": 0.5556640625, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.19031184911727905, | |
| "kl": 0.007049560546875, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0003, | |
| "reward": 0.4479166939854622, | |
| "reward_std": 0.5110370628535748, | |
| "rewards/accuracy_reward": 0.13541667070239782, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 2564.8021240234375, | |
| "entropy": 0.493896484375, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.2184651792049408, | |
| "kl": 0.009033203125, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0004, | |
| "reward": 0.7187500223517418, | |
| "reward_std": 0.5183624178171158, | |
| "rewards/accuracy_reward": 0.11458333395421505, | |
| "rewards/format_reward": 0.6041666939854622, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2493.3333740234375, | |
| "entropy": 0.38720703125, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.12824109196662903, | |
| "kl": 0.00505828857421875, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9375000298023224, | |
| "reward_std": 0.46352487802505493, | |
| "rewards/accuracy_reward": 0.28125000558793545, | |
| "rewards/format_reward": 0.6562500149011612, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 1656.8854522705078, | |
| "entropy": 0.4052734375, | |
| "epoch": 0.136, | |
| "grad_norm": 0.12852801382541656, | |
| "kl": 0.00701141357421875, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0312500298023224, | |
| "reward_std": 0.25884300470352173, | |
| "rewards/accuracy_reward": 0.2395833358168602, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 1066.7917175292969, | |
| "entropy": 0.2958984375, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.14502935111522675, | |
| "kl": 0.007476806640625, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1979166865348816, | |
| "reward_std": 0.27546053379774094, | |
| "rewards/accuracy_reward": 0.2604166669771075, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 999.8542022705078, | |
| "entropy": 0.2783203125, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.1992115080356598, | |
| "kl": 0.00730133056640625, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0003, | |
| "reward": 1.2500000149011612, | |
| "reward_std": 0.29015830904245377, | |
| "rewards/accuracy_reward": 0.3020833386108279, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2167.0834197998047, | |
| "entropy": 0.432373046875, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.1561378836631775, | |
| "kl": 0.0062408447265625, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9687500298023224, | |
| "reward_std": 0.4494870528578758, | |
| "rewards/accuracy_reward": 0.2604166781529784, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2166.729248046875, | |
| "entropy": 0.4207763671875, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.18212474882602692, | |
| "kl": 0.005710601806640625, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9270833507180214, | |
| "reward_std": 0.37269312888383865, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.7187500074505806, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 1833.7396240234375, | |
| "entropy": 0.27783203125, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.19929192960262299, | |
| "kl": 0.0140838623046875, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0006, | |
| "reward": 1.1145833730697632, | |
| "reward_std": 0.5214307978749275, | |
| "rewards/accuracy_reward": 0.2604166679084301, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2578.0209350585938, | |
| "entropy": 0.45166015625, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.1536300778388977, | |
| "kl": 0.00638580322265625, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.30013205483555794, | |
| "rewards/accuracy_reward": 0.11458333861082792, | |
| "rewards/format_reward": 0.4479166865348816, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2283.6875610351562, | |
| "entropy": 0.37451171875, | |
| "epoch": 0.144, | |
| "grad_norm": 0.16426944732666016, | |
| "kl": 0.005584716796875, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9479167014360428, | |
| "reward_std": 0.5797486528754234, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.6979166865348816, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 1901.9063110351562, | |
| "entropy": 0.381591796875, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.15447713434696198, | |
| "kl": 0.00717926025390625, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0003, | |
| "reward": 0.8541666865348816, | |
| "reward_std": 0.3032348155975342, | |
| "rewards/accuracy_reward": 0.07291666977107525, | |
| "rewards/format_reward": 0.7812500149011612, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 1890.5729675292969, | |
| "entropy": 0.397705078125, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.23616060614585876, | |
| "kl": 0.006561279296875, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1041666865348816, | |
| "reward_std": 0.43167490512132645, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 2441.8021850585938, | |
| "entropy": 0.466796875, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.19474731385707855, | |
| "kl": 0.00809478759765625, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0003, | |
| "reward": 0.8125000447034836, | |
| "reward_std": 0.5053007081151009, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 2335.4063110351562, | |
| "entropy": 0.4462890625, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.2001771628856659, | |
| "kl": 0.00664520263671875, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6875000298023224, | |
| "reward_std": 0.480606772005558, | |
| "rewards/accuracy_reward": 0.0937500037252903, | |
| "rewards/format_reward": 0.5937500149011612, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2002.8438110351562, | |
| "entropy": 0.427978515625, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.21314886212348938, | |
| "kl": 0.0078277587890625, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0625000447034836, | |
| "reward_std": 0.5524623095989227, | |
| "rewards/accuracy_reward": 0.3229166781529784, | |
| "rewards/format_reward": 0.7395833432674408, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2149.666778564453, | |
| "entropy": 0.39501953125, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.15732061862945557, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9687500298023224, | |
| "reward_std": 0.48508264869451523, | |
| "rewards/accuracy_reward": 0.2812500102445483, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 2231.500030517578, | |
| "entropy": 0.435546875, | |
| "epoch": 0.152, | |
| "grad_norm": 0.21705371141433716, | |
| "kl": 0.00888824462890625, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0004, | |
| "reward": 0.7708333395421505, | |
| "reward_std": 0.4052763059735298, | |
| "rewards/accuracy_reward": 0.14583333395421505, | |
| "rewards/format_reward": 0.625, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2071.229278564453, | |
| "entropy": 0.4912109375, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.18239766359329224, | |
| "kl": 0.01198577880859375, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0005, | |
| "reward": 1.0208333730697632, | |
| "reward_std": 0.47505422681570053, | |
| "rewards/accuracy_reward": 0.2812500037252903, | |
| "rewards/format_reward": 0.7395833432674408, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 1155.7292175292969, | |
| "entropy": 0.304443359375, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.1679978370666504, | |
| "kl": 0.01029205322265625, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0004, | |
| "reward": 1.416666716337204, | |
| "reward_std": 0.4091631546616554, | |
| "rewards/accuracy_reward": 0.5000000111758709, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 1356.5521545410156, | |
| "entropy": 0.282470703125, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.18188215792179108, | |
| "kl": 0.00759124755859375, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1979166865348816, | |
| "reward_std": 0.3688688538968563, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/format_reward": 0.9270833432674408, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 1961.9167175292969, | |
| "entropy": 0.324462890625, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.1938006430864334, | |
| "kl": 0.0077667236328125, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0003, | |
| "reward": 0.833333358168602, | |
| "reward_std": 0.4214525818824768, | |
| "rewards/accuracy_reward": 0.08333333674818277, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 1700.7396240234375, | |
| "entropy": 0.302001953125, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.16465067863464355, | |
| "kl": 0.00738525390625, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0208333432674408, | |
| "reward_std": 0.37441620975732803, | |
| "rewards/accuracy_reward": 0.1979166753590107, | |
| "rewards/format_reward": 0.8229166865348816, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 1888.5208435058594, | |
| "entropy": 0.38720703125, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.20309647917747498, | |
| "kl": 0.009521484375, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0625000298023224, | |
| "reward_std": 0.43142497539520264, | |
| "rewards/accuracy_reward": 0.21875001024454832, | |
| "rewards/format_reward": 0.8437500298023224, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 1912.197998046875, | |
| "entropy": 0.442138671875, | |
| "epoch": 0.16, | |
| "grad_norm": 0.3316217064857483, | |
| "kl": 0.01258087158203125, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0005, | |
| "reward": 0.9270833879709244, | |
| "reward_std": 0.42255595326423645, | |
| "rewards/accuracy_reward": 0.16666666883975267, | |
| "rewards/format_reward": 0.7604166865348816, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 1228.6042175292969, | |
| "entropy": 0.244873046875, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.14592112600803375, | |
| "kl": 0.0072784423828125, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0003, | |
| "reward": 1.2083333730697632, | |
| "reward_std": 0.3421749100089073, | |
| "rewards/accuracy_reward": 0.23958333395421505, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 1981.0209350585938, | |
| "entropy": 0.40869140625, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.21422992646694183, | |
| "kl": 0.0090179443359375, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9895833730697632, | |
| "reward_std": 0.4437461569905281, | |
| "rewards/accuracy_reward": 0.1562500037252903, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 1735.0104675292969, | |
| "entropy": 0.399658203125, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.2593076825141907, | |
| "kl": 0.01113128662109375, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0004, | |
| "reward": 1.03125, | |
| "reward_std": 0.4072144068777561, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/format_reward": 0.8854166716337204, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 1934.3229675292969, | |
| "entropy": 0.434326171875, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.2749219238758087, | |
| "kl": 0.01078033447265625, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0625000149011612, | |
| "reward_std": 0.551102414727211, | |
| "rewards/accuracy_reward": 0.29166666977107525, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 1198.5937805175781, | |
| "entropy": 0.274169921875, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.19994103908538818, | |
| "kl": 0.010986328125, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0004, | |
| "reward": 1.2187500447034836, | |
| "reward_std": 0.374411478638649, | |
| "rewards/accuracy_reward": 0.3020833395421505, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 1426.0521240234375, | |
| "entropy": 0.300048828125, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.1441003829240799, | |
| "kl": 0.0077667236328125, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0312500298023224, | |
| "reward_std": 0.2671857923269272, | |
| "rewards/accuracy_reward": 0.1145833358168602, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 1535.447982788086, | |
| "entropy": 0.31689453125, | |
| "epoch": 0.168, | |
| "grad_norm": 0.19958704710006714, | |
| "kl": 0.01012420654296875, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1562500298023224, | |
| "reward_std": 0.39162378013134, | |
| "rewards/accuracy_reward": 0.2812500074505806, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 1264.7396545410156, | |
| "entropy": 0.24658203125, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.10701873153448105, | |
| "kl": 0.008941650390625, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1354166865348816, | |
| "reward_std": 0.15690934658050537, | |
| "rewards/accuracy_reward": 0.1875, | |
| "rewards/format_reward": 0.9479166716337204, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2025.3021240234375, | |
| "entropy": 0.352783203125, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.13325975835323334, | |
| "kl": 0.008880615234375, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1458333730697632, | |
| "reward_std": 0.3877560868859291, | |
| "rewards/accuracy_reward": 0.3229166716337204, | |
| "rewards/format_reward": 0.8229166865348816, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 1229.2708740234375, | |
| "entropy": 0.274658203125, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.1998661607503891, | |
| "kl": 0.0081634521484375, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1354166865348816, | |
| "reward_std": 0.4279330112040043, | |
| "rewards/accuracy_reward": 0.2083333432674408, | |
| "rewards/format_reward": 0.927083358168602, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 1687.1146240234375, | |
| "entropy": 0.4375, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.25493502616882324, | |
| "kl": 0.0125579833984375, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0005, | |
| "reward": 1.114583358168602, | |
| "reward_std": 0.5058320835232735, | |
| "rewards/accuracy_reward": 0.312500006519258, | |
| "rewards/format_reward": 0.802083358168602, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 1766.2083740234375, | |
| "entropy": 0.41552734375, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.24902759492397308, | |
| "kl": 0.010467529296875, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9895833432674408, | |
| "reward_std": 0.4082007445394993, | |
| "rewards/accuracy_reward": 0.19791666697710752, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 1433.1562805175781, | |
| "entropy": 0.31591796875, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.21754246950149536, | |
| "kl": 0.0152740478515625, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0006, | |
| "reward": 1.020833358168602, | |
| "reward_std": 0.2934442237019539, | |
| "rewards/accuracy_reward": 0.12500000279396772, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 2112.354248046875, | |
| "entropy": 0.39599609375, | |
| "epoch": 0.176, | |
| "grad_norm": 0.20787468552589417, | |
| "kl": 0.009765625, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0833333432674408, | |
| "reward_std": 0.4751042574644089, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 1740.2500610351562, | |
| "entropy": 0.38623046875, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.19822287559509277, | |
| "kl": 0.0122833251953125, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0005, | |
| "reward": 1.020833358168602, | |
| "reward_std": 0.41686780750751495, | |
| "rewards/accuracy_reward": 0.16666666977107525, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 1723.6146545410156, | |
| "entropy": 0.29345703125, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.12867745757102966, | |
| "kl": 0.00856781005859375, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1458334028720856, | |
| "reward_std": 0.4736599698662758, | |
| "rewards/accuracy_reward": 0.322916679084301, | |
| "rewards/format_reward": 0.8229166865348816, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2180.291748046875, | |
| "entropy": 0.4892578125, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.27378663420677185, | |
| "kl": 0.0123748779296875, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0005, | |
| "reward": 0.864583358168602, | |
| "reward_std": 0.4690057747066021, | |
| "rewards/accuracy_reward": 0.14583333488553762, | |
| "rewards/format_reward": 0.7187500298023224, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 1268.4479675292969, | |
| "entropy": 0.26611328125, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.15639232099056244, | |
| "kl": 0.00994873046875, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1458333730697632, | |
| "reward_std": 0.3376114182174206, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/format_reward": 0.958333358168602, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 1842.7188110351562, | |
| "entropy": 0.332275390625, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.22000983357429504, | |
| "kl": 0.0113372802734375, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0005, | |
| "reward": 0.895833358168602, | |
| "reward_std": 0.43413354456424713, | |
| "rewards/accuracy_reward": 0.10416667256504297, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 1532.4791870117188, | |
| "entropy": 0.39111328125, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.5555780529975891, | |
| "kl": 0.017059326171875, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0007, | |
| "reward": 1.2291666865348816, | |
| "reward_std": 0.5317695289850235, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 1631.375015258789, | |
| "entropy": 0.3583984375, | |
| "epoch": 0.184, | |
| "grad_norm": 0.20350432395935059, | |
| "kl": 0.01454925537109375, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0006, | |
| "reward": 1.1458333730697632, | |
| "reward_std": 0.44703245162963867, | |
| "rewards/accuracy_reward": 0.25000000558793545, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 1646.5208740234375, | |
| "entropy": 0.4658203125, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.2509794533252716, | |
| "kl": 0.014892578125, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0006, | |
| "reward": 1.0104167014360428, | |
| "reward_std": 0.4695500135421753, | |
| "rewards/accuracy_reward": 0.19791666977107525, | |
| "rewards/format_reward": 0.8125000223517418, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 1552.0000610351562, | |
| "entropy": 0.424560546875, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.17314837872982025, | |
| "kl": 0.0128173828125, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0005, | |
| "reward": 1.1770833879709244, | |
| "reward_std": 0.33981742709875107, | |
| "rewards/accuracy_reward": 0.3541666828095913, | |
| "rewards/format_reward": 0.8229166865348816, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 1373.3333587646484, | |
| "entropy": 0.320556640625, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.2898624539375305, | |
| "kl": 0.011077880859375, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1562500149011612, | |
| "reward_std": 0.36444756016135216, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 1446.5208740234375, | |
| "entropy": 0.34033203125, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.36863088607788086, | |
| "kl": 0.0135345458984375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0005, | |
| "reward": 1.1562500298023224, | |
| "reward_std": 0.5087258517742157, | |
| "rewards/accuracy_reward": 0.30208334140479565, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 1569.2500610351562, | |
| "entropy": 0.2666015625, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.14335811138153076, | |
| "kl": 0.00957489013671875, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0937500596046448, | |
| "reward_std": 0.3973645642399788, | |
| "rewards/accuracy_reward": 0.2395833432674408, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 1177.822982788086, | |
| "entropy": 0.25244140625, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.11705330014228821, | |
| "kl": 0.0107421875, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0004, | |
| "reward": 1.041666716337204, | |
| "reward_std": 0.18237071484327316, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 1497.2604370117188, | |
| "entropy": 0.283203125, | |
| "epoch": 0.192, | |
| "grad_norm": 0.22972875833511353, | |
| "kl": 0.0095367431640625, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1354167014360428, | |
| "reward_std": 0.3216959089040756, | |
| "rewards/accuracy_reward": 0.21875000838190317, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 1162.0416870117188, | |
| "entropy": 0.245849609375, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.1791468858718872, | |
| "kl": 0.01068878173828125, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0004, | |
| "reward": 1.479166716337204, | |
| "reward_std": 0.3380242697894573, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 1761.4792175292969, | |
| "entropy": 0.392333984375, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.22026270627975464, | |
| "kl": 0.0135498046875, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0005, | |
| "reward": 1.1145833432674408, | |
| "reward_std": 0.25935307145118713, | |
| "rewards/accuracy_reward": 0.35416666977107525, | |
| "rewards/format_reward": 0.7604166716337204, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 1874.7604675292969, | |
| "entropy": 0.406982421875, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.18767470121383667, | |
| "kl": 0.0109100341796875, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9270833730697632, | |
| "reward_std": 0.3239624425768852, | |
| "rewards/accuracy_reward": 0.20833334140479565, | |
| "rewards/format_reward": 0.71875, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 1946.0000610351562, | |
| "entropy": 0.48876953125, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.32672053575515747, | |
| "kl": 0.0180816650390625, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0007, | |
| "reward": 1.1354166865348816, | |
| "reward_std": 0.5261635184288025, | |
| "rewards/accuracy_reward": 0.3958333460614085, | |
| "rewards/format_reward": 0.7395833432674408, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 1126.3541793823242, | |
| "entropy": 0.27978515625, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.20509202778339386, | |
| "kl": 0.01155853271484375, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0005, | |
| "reward": 1.1562500298023224, | |
| "reward_std": 0.35620374977588654, | |
| "rewards/accuracy_reward": 0.2395833395421505, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 1285.9167175292969, | |
| "entropy": 0.331787109375, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.17950935661792755, | |
| "kl": 0.01363372802734375, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0005, | |
| "reward": 1.2187500447034836, | |
| "reward_std": 0.3375067636370659, | |
| "rewards/accuracy_reward": 0.2604166753590107, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 1471.7291870117188, | |
| "entropy": 0.380615234375, | |
| "epoch": 0.2, | |
| "grad_norm": 0.31842005252838135, | |
| "kl": 0.013336181640625, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0005, | |
| "reward": 1.0625000149011612, | |
| "reward_std": 0.361453078687191, | |
| "rewards/accuracy_reward": 0.1979166716337204, | |
| "rewards/format_reward": 0.8645833432674408, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 1566.8229675292969, | |
| "entropy": 0.343994140625, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.280519962310791, | |
| "kl": 0.014129638671875, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0006, | |
| "reward": 1.0520833730697632, | |
| "reward_std": 0.4207059293985367, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/format_reward": 0.8854166865348816, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 1703.625015258789, | |
| "entropy": 0.453125, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.33665430545806885, | |
| "kl": 0.013824462890625, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0006, | |
| "reward": 1.083333358168602, | |
| "reward_std": 0.4203081615269184, | |
| "rewards/accuracy_reward": 0.250000006519258, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 1527.3021240234375, | |
| "entropy": 0.39013671875, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.2330997735261917, | |
| "kl": 0.017974853515625, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0007, | |
| "reward": 1.2187500298023224, | |
| "reward_std": 0.5016858726739883, | |
| "rewards/accuracy_reward": 0.3125, | |
| "rewards/format_reward": 0.9062500298023224, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 1635.5833740234375, | |
| "entropy": 0.365234375, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.29855063557624817, | |
| "kl": 0.01336669921875, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0005, | |
| "reward": 0.979166716337204, | |
| "reward_std": 0.4029111787676811, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 1145.468765258789, | |
| "entropy": 0.3056640625, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.238195538520813, | |
| "kl": 0.0108795166015625, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0004, | |
| "reward": 1.5104166865348816, | |
| "reward_std": 0.42421412095427513, | |
| "rewards/accuracy_reward": 0.5520833358168602, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 1850.947998046875, | |
| "entropy": 0.420166015625, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.36524245142936707, | |
| "kl": 0.01495361328125, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0006, | |
| "reward": 0.9791666865348816, | |
| "reward_std": 0.3578517735004425, | |
| "rewards/accuracy_reward": 0.19791666697710752, | |
| "rewards/format_reward": 0.7812500298023224, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 1749.0938415527344, | |
| "entropy": 0.359375, | |
| "epoch": 0.208, | |
| "grad_norm": 0.1934923529624939, | |
| "kl": 0.0108184814453125, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0000000298023224, | |
| "reward_std": 0.3973938375711441, | |
| "rewards/accuracy_reward": 0.2187500037252903, | |
| "rewards/format_reward": 0.7812500149011612, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 1332.4479675292969, | |
| "entropy": 0.322021484375, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.23912180960178375, | |
| "kl": 0.0157623291015625, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0006, | |
| "reward": 1.2187500596046448, | |
| "reward_std": 0.35980356484651566, | |
| "rewards/accuracy_reward": 0.260416679084301, | |
| "rewards/format_reward": 0.958333358168602, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 1341.1354675292969, | |
| "entropy": 0.35693359375, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.22043198347091675, | |
| "kl": 0.0140228271484375, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0006, | |
| "reward": 1.0833333432674408, | |
| "reward_std": 0.2259194441139698, | |
| "rewards/accuracy_reward": 0.1666666716337204, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 1516.4583892822266, | |
| "entropy": 0.30029296875, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.23556554317474365, | |
| "kl": 0.0121002197265625, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0005, | |
| "reward": 1.0104166716337204, | |
| "reward_std": 0.31887371838092804, | |
| "rewards/accuracy_reward": 0.1458333432674408, | |
| "rewards/format_reward": 0.864583358168602, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 1785.104232788086, | |
| "entropy": 0.446533203125, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.38155755400657654, | |
| "kl": 0.01576995849609375, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0006, | |
| "reward": 0.9583333879709244, | |
| "reward_std": 0.44071806967258453, | |
| "rewards/accuracy_reward": 0.18750000558793545, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 1534.7917175292969, | |
| "entropy": 0.41748046875, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.37235942482948303, | |
| "kl": 0.0169219970703125, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0007, | |
| "reward": 1.0312500149011612, | |
| "reward_std": 0.35790160298347473, | |
| "rewards/accuracy_reward": 0.1458333395421505, | |
| "rewards/format_reward": 0.8854166865348816, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 1874.6354675292969, | |
| "entropy": 0.521484375, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.5140780210494995, | |
| "kl": 0.0186004638671875, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0007, | |
| "reward": 0.8125000298023224, | |
| "reward_std": 0.4419962018728256, | |
| "rewards/accuracy_reward": 0.08333333674818277, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 1185.3958587646484, | |
| "entropy": 0.338623046875, | |
| "epoch": 0.216, | |
| "grad_norm": 0.28023380041122437, | |
| "kl": 0.01312255859375, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0005, | |
| "reward": 1.020833358168602, | |
| "reward_std": 0.301351435482502, | |
| "rewards/accuracy_reward": 0.11458333674818277, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 1186.5417175292969, | |
| "entropy": 0.27587890625, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.17898434400558472, | |
| "kl": 0.01284027099609375, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0005, | |
| "reward": 1.1770833432674408, | |
| "reward_std": 0.2806706018745899, | |
| "rewards/accuracy_reward": 0.23958333395421505, | |
| "rewards/format_reward": 0.9375, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 1232.6458740234375, | |
| "entropy": 0.323486328125, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.19368711113929749, | |
| "kl": 0.0129547119140625, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0005, | |
| "reward": 1.3125000596046448, | |
| "reward_std": 0.3853513225913048, | |
| "rewards/accuracy_reward": 0.35416668467223644, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 1928.1459350585938, | |
| "entropy": 0.443603515625, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.19557389616966248, | |
| "kl": 0.01648712158203125, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0007, | |
| "reward": 0.8854166865348816, | |
| "reward_std": 0.25648824870586395, | |
| "rewards/accuracy_reward": 0.1354166716337204, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 2170.5625, | |
| "entropy": 0.6103515625, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.5704598426818848, | |
| "kl": 0.02215576171875, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0009, | |
| "reward": 0.8541667014360428, | |
| "reward_std": 0.5056344047188759, | |
| "rewards/accuracy_reward": 0.15625000186264515, | |
| "rewards/format_reward": 0.6979166865348816, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 2294.229248046875, | |
| "entropy": 0.56884765625, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.2927386164665222, | |
| "kl": 0.02191162109375, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0009, | |
| "reward": 1.239583358168602, | |
| "reward_std": 0.6518271863460541, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.802083358168602, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 1263.7917175292969, | |
| "entropy": 0.239990234375, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.16641545295715332, | |
| "kl": 0.0110015869140625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1875000596046448, | |
| "reward_std": 0.38468754291534424, | |
| "rewards/accuracy_reward": 0.23958334140479565, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 2122.697967529297, | |
| "entropy": 0.60400390625, | |
| "epoch": 0.224, | |
| "grad_norm": 0.25676071643829346, | |
| "kl": 0.023529052734375, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0009, | |
| "reward": 0.9791666865348816, | |
| "reward_std": 0.43499240279197693, | |
| "rewards/accuracy_reward": 0.229166679084301, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 1125.312515258789, | |
| "entropy": 0.328125, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.19764328002929688, | |
| "kl": 0.011962890625, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0005, | |
| "reward": 1.3437500596046448, | |
| "reward_std": 0.39162378013134, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.96875, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 1463.8750305175781, | |
| "entropy": 0.341796875, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.26102352142333984, | |
| "kl": 0.01641845703125, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0007, | |
| "reward": 1.1354167014360428, | |
| "reward_std": 0.33387480303645134, | |
| "rewards/accuracy_reward": 0.3229166716337204, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 1503.4792175292969, | |
| "entropy": 0.318115234375, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.3636128902435303, | |
| "kl": 0.0154876708984375, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0006, | |
| "reward": 0.9479167014360428, | |
| "reward_std": 0.40788237005472183, | |
| "rewards/accuracy_reward": 0.08333333395421505, | |
| "rewards/format_reward": 0.864583358168602, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1183.2916717529297, | |
| "entropy": 0.2479248046875, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.20531828701496124, | |
| "kl": 0.01239776611328125, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0005, | |
| "reward": 1.2291666865348816, | |
| "reward_std": 0.3643130548298359, | |
| "rewards/accuracy_reward": 0.2812500027939677, | |
| "rewards/format_reward": 0.9479166716337204, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 1661.4271240234375, | |
| "entropy": 0.385498046875, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.3458462357521057, | |
| "kl": 0.020599365234375, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0008, | |
| "reward": 1.229166716337204, | |
| "reward_std": 0.3946245461702347, | |
| "rewards/accuracy_reward": 0.4583333507180214, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1379.8541717529297, | |
| "entropy": 0.33154296875, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.23957432806491852, | |
| "kl": 0.0141448974609375, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0006, | |
| "reward": 1.3229166865348816, | |
| "reward_std": 0.24646351113915443, | |
| "rewards/accuracy_reward": 0.46875000558793545, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 1426.6042175292969, | |
| "entropy": 0.337158203125, | |
| "epoch": 0.232, | |
| "grad_norm": 0.2487732470035553, | |
| "kl": 0.015777587890625, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0006, | |
| "reward": 1.1562500298023224, | |
| "reward_std": 0.26880528777837753, | |
| "rewards/accuracy_reward": 0.2916666669771075, | |
| "rewards/format_reward": 0.8645833432674408, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1260.0312805175781, | |
| "entropy": 0.43408203125, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.20831915736198425, | |
| "kl": 0.01959228515625, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0008, | |
| "reward": 1.1875000298023224, | |
| "reward_std": 0.32854287326335907, | |
| "rewards/accuracy_reward": 0.2395833432674408, | |
| "rewards/format_reward": 0.9479166865348816, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 1440.3020935058594, | |
| "entropy": 0.331298828125, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.22207972407341003, | |
| "kl": 0.013214111328125, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0005, | |
| "reward": 1.3645833730697632, | |
| "reward_std": 0.46688663959503174, | |
| "rewards/accuracy_reward": 0.4687500149011612, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 1833.2084045410156, | |
| "entropy": 0.340576171875, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.1765735149383545, | |
| "kl": 0.0128936767578125, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0005, | |
| "reward": 0.9375000298023224, | |
| "reward_std": 0.27143751084804535, | |
| "rewards/accuracy_reward": 0.08333333488553762, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 1413.0313110351562, | |
| "entropy": 0.407470703125, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.21233247220516205, | |
| "kl": 0.017242431640625, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0007, | |
| "reward": 1.0833333730697632, | |
| "reward_std": 0.3562712073326111, | |
| "rewards/accuracy_reward": 0.15625000651925802, | |
| "rewards/format_reward": 0.927083358168602, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 1336.4062805175781, | |
| "entropy": 0.294189453125, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.2305799126625061, | |
| "kl": 0.012664794921875, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0005, | |
| "reward": 1.1875000149011612, | |
| "reward_std": 0.2691424489021301, | |
| "rewards/accuracy_reward": 0.2812500102445483, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 1310.2812957763672, | |
| "entropy": 0.389404296875, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.24238798022270203, | |
| "kl": 0.01934814453125, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0008, | |
| "reward": 1.2395833730697632, | |
| "reward_std": 0.36760086938738823, | |
| "rewards/accuracy_reward": 0.3229166753590107, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 1589.3750610351562, | |
| "entropy": 0.323486328125, | |
| "epoch": 0.24, | |
| "grad_norm": 0.32202091813087463, | |
| "kl": 0.01515960693359375, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0006, | |
| "reward": 1.0833333730697632, | |
| "reward_std": 0.32458770275115967, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 1605.1041717529297, | |
| "entropy": 0.4423828125, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.37479087710380554, | |
| "kl": 0.020111083984375, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0008, | |
| "reward": 1.0937500298023224, | |
| "reward_std": 0.35892703384160995, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.802083358168602, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 1098.7396240234375, | |
| "entropy": 0.296630859375, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.41751039028167725, | |
| "kl": 0.0150604248046875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0006, | |
| "reward": 1.1875000596046448, | |
| "reward_std": 0.309124119579792, | |
| "rewards/accuracy_reward": 0.2604166716337204, | |
| "rewards/format_reward": 0.927083358168602, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 1308.5520935058594, | |
| "entropy": 0.43505859375, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.46686094999313354, | |
| "kl": 0.023101806640625, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0009, | |
| "reward": 1.1666666865348816, | |
| "reward_std": 0.4640028476715088, | |
| "rewards/accuracy_reward": 0.28125000558793545, | |
| "rewards/format_reward": 0.8854167014360428, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 1767.1875305175781, | |
| "entropy": 0.484619140625, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.334074467420578, | |
| "kl": 0.02288818359375, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0009, | |
| "reward": 1.1354166865348816, | |
| "reward_std": 0.4487803429365158, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.8437500149011612, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 1314.2396545410156, | |
| "entropy": 0.354248046875, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.10470432788133621, | |
| "kl": 0.012969970703125, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0005, | |
| "reward": 0.9791666865348816, | |
| "reward_std": 0.11020193248987198, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 1227.4166870117188, | |
| "entropy": 0.43896484375, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.5151819586753845, | |
| "kl": 0.020538330078125, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0008, | |
| "reward": 1.1979166865348816, | |
| "reward_std": 0.38816463202238083, | |
| "rewards/accuracy_reward": 0.3125000149011612, | |
| "rewards/format_reward": 0.8854166716337204, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 1324.9792175292969, | |
| "entropy": 0.314697265625, | |
| "epoch": 0.248, | |
| "grad_norm": 0.23024234175682068, | |
| "kl": 0.0144195556640625, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0006, | |
| "reward": 1.3125000298023224, | |
| "reward_std": 0.44106680899858475, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 1387.0312957763672, | |
| "entropy": 0.253662109375, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.3250998258590698, | |
| "kl": 0.01210784912109375, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0005, | |
| "reward": 1.010416716337204, | |
| "reward_std": 0.378255732357502, | |
| "rewards/accuracy_reward": 0.0833333358168602, | |
| "rewards/format_reward": 0.9270833432674408, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 1540.6771087646484, | |
| "entropy": 0.5185546875, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.24495770037174225, | |
| "kl": 0.0228729248046875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0009, | |
| "reward": 1.1458334028720856, | |
| "reward_std": 0.47743887454271317, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.8541667014360428, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 1439.0938262939453, | |
| "entropy": 0.388427734375, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.22012194991111755, | |
| "kl": 0.01434326171875, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0006, | |
| "reward": 0.8854166865348816, | |
| "reward_std": 0.17735834047198296, | |
| "rewards/accuracy_reward": 0.010416666977107525, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 1156.2604370117188, | |
| "entropy": 0.25439453125, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.2978960871696472, | |
| "kl": 0.0113677978515625, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0005, | |
| "reward": 1.322916716337204, | |
| "reward_std": 0.3704235702753067, | |
| "rewards/accuracy_reward": 0.3645833469927311, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 1340.5104522705078, | |
| "entropy": 0.3916015625, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.3874484896659851, | |
| "kl": 0.019073486328125, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0008, | |
| "reward": 1.2187500298023224, | |
| "reward_std": 0.33246491849422455, | |
| "rewards/accuracy_reward": 0.3020833358168602, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 1576.0208740234375, | |
| "entropy": 0.390625, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.3034595847129822, | |
| "kl": 0.019012451171875, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0008, | |
| "reward": 1.0520833730697632, | |
| "reward_std": 0.32211463153362274, | |
| "rewards/accuracy_reward": 0.22916667722165585, | |
| "rewards/format_reward": 0.8229166716337204, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 2019.4895935058594, | |
| "entropy": 0.421875, | |
| "epoch": 0.256, | |
| "grad_norm": 0.19334331154823303, | |
| "kl": 0.0189056396484375, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0008, | |
| "reward": 1.0208333432674408, | |
| "reward_std": 0.50140430778265, | |
| "rewards/accuracy_reward": 0.2187500111758709, | |
| "rewards/format_reward": 0.802083358168602, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 1698.7708435058594, | |
| "entropy": 0.53515625, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.3179035782814026, | |
| "kl": 0.028045654296875, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0011, | |
| "reward": 1.0729166865348816, | |
| "reward_std": 0.4662906527519226, | |
| "rewards/accuracy_reward": 0.2083333358168602, | |
| "rewards/format_reward": 0.864583358168602, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 1244.3125610351562, | |
| "entropy": 0.2498779296875, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.14828985929489136, | |
| "kl": 0.01190948486328125, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0005, | |
| "reward": 1.3333334028720856, | |
| "reward_std": 0.3380242735147476, | |
| "rewards/accuracy_reward": 0.3541666818782687, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1184.6458587646484, | |
| "entropy": 0.3994140625, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.25174474716186523, | |
| "kl": 0.021026611328125, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0008, | |
| "reward": 1.1458333730697632, | |
| "reward_std": 0.35975906252861023, | |
| "rewards/accuracy_reward": 0.17708334140479565, | |
| "rewards/format_reward": 0.9687500149011612, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 1297.875015258789, | |
| "entropy": 0.412353515625, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.3392723500728607, | |
| "kl": 0.0189666748046875, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0008, | |
| "reward": 1.1250000149011612, | |
| "reward_std": 0.381549421697855, | |
| "rewards/accuracy_reward": 0.2812500074505806, | |
| "rewards/format_reward": 0.8437500149011612, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 1462.6667175292969, | |
| "entropy": 0.46826171875, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.6046648621559143, | |
| "kl": 0.020416259765625, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0008, | |
| "reward": 0.927083358168602, | |
| "reward_std": 0.30044983327388763, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.8854166716337204, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 2096.229248046875, | |
| "entropy": 0.4873046875, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.6036101579666138, | |
| "kl": 0.0241851806640625, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.001, | |
| "reward": 0.7812500149011612, | |
| "reward_std": 0.3880816847085953, | |
| "rewards/accuracy_reward": 0.0520833358168602, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 1415.8437805175781, | |
| "entropy": 0.298095703125, | |
| "epoch": 0.264, | |
| "grad_norm": 0.29470252990722656, | |
| "kl": 0.01641845703125, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0007, | |
| "reward": 1.072916716337204, | |
| "reward_std": 0.36706580221652985, | |
| "rewards/accuracy_reward": 0.2812500037252903, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 1634.7708740234375, | |
| "entropy": 0.44384765625, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.38455915451049805, | |
| "kl": 0.02178955078125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0009, | |
| "reward": 0.947916716337204, | |
| "reward_std": 0.2808724343776703, | |
| "rewards/accuracy_reward": 0.0416666679084301, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 1405.8541870117188, | |
| "entropy": 0.42626953125, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.40585455298423767, | |
| "kl": 0.0194244384765625, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0008, | |
| "reward": 0.979166716337204, | |
| "reward_std": 0.3568150997161865, | |
| "rewards/accuracy_reward": 0.0937500037252903, | |
| "rewards/format_reward": 0.8854166716337204, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 1810.0937957763672, | |
| "entropy": 0.4306640625, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.26030558347702026, | |
| "kl": 0.02484130859375, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.001, | |
| "reward": 0.947916679084301, | |
| "reward_std": 0.30990852415561676, | |
| "rewards/accuracy_reward": 0.22916667256504297, | |
| "rewards/format_reward": 0.7187500149011612, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 1116.4167175292969, | |
| "entropy": 0.325439453125, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.35170045495033264, | |
| "kl": 0.0147705078125, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0006, | |
| "reward": 1.4062500596046448, | |
| "reward_std": 0.36736297607421875, | |
| "rewards/accuracy_reward": 0.4687500223517418, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 1921.791748046875, | |
| "entropy": 0.501220703125, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.23206467926502228, | |
| "kl": 0.0231475830078125, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0009, | |
| "reward": 1.0520833730697632, | |
| "reward_std": 0.5711337029933929, | |
| "rewards/accuracy_reward": 0.2291666753590107, | |
| "rewards/format_reward": 0.8229166716337204, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 1386.6146240234375, | |
| "entropy": 0.397705078125, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.191674143075943, | |
| "kl": 0.0179901123046875, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0007, | |
| "reward": 1.1354166865348816, | |
| "reward_std": 0.26486562192440033, | |
| "rewards/accuracy_reward": 0.27083333395421505, | |
| "rewards/format_reward": 0.8645833432674408, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 1185.1875305175781, | |
| "entropy": 0.23828125, | |
| "epoch": 0.272, | |
| "grad_norm": 0.22915461659431458, | |
| "kl": 0.0135345458984375, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0005, | |
| "reward": 1.1041667014360428, | |
| "reward_std": 0.3254629634320736, | |
| "rewards/accuracy_reward": 0.3020833507180214, | |
| "rewards/format_reward": 0.8020833432674408, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1499.385498046875, | |
| "entropy": 0.3544921875, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.2647407352924347, | |
| "kl": 0.01690673828125, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0007, | |
| "reward": 1.2812500298023224, | |
| "reward_std": 0.2977961152791977, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.8229166716337204, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 1829.4167175292969, | |
| "entropy": 0.8447265625, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.652540385723114, | |
| "kl": 0.0433349609375, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0017, | |
| "reward": 0.7916666865348816, | |
| "reward_std": 0.4159542843699455, | |
| "rewards/accuracy_reward": 0.06250000186264515, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 1511.604248046875, | |
| "entropy": 0.53955078125, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.4173026978969574, | |
| "kl": 0.02716064453125, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0011, | |
| "reward": 0.9062500447034836, | |
| "reward_std": 0.31576745957136154, | |
| "rewards/accuracy_reward": 0.06250000093132257, | |
| "rewards/format_reward": 0.8437500149011612, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 1336.7396087646484, | |
| "entropy": 0.57373046875, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.49007970094680786, | |
| "kl": 0.03204345703125, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0013, | |
| "reward": 1.020833358168602, | |
| "reward_std": 0.23006567358970642, | |
| "rewards/accuracy_reward": 0.13541666697710752, | |
| "rewards/format_reward": 0.8854166865348816, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 1687.6875610351562, | |
| "entropy": 0.470703125, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.37550440430641174, | |
| "kl": 0.024993896484375, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.001, | |
| "reward": 1.0000000447034836, | |
| "reward_std": 0.3257058337330818, | |
| "rewards/accuracy_reward": 0.14583334140479565, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 1568.4166870117188, | |
| "entropy": 0.3994140625, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 1.2306023836135864, | |
| "kl": 0.02081298828125, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0008, | |
| "reward": 1.2708333432674408, | |
| "reward_std": 0.46156562119722366, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 1866.0729675292969, | |
| "entropy": 0.51953125, | |
| "epoch": 0.28, | |
| "grad_norm": 0.6619442105293274, | |
| "kl": 0.024658203125, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.001, | |
| "reward": 1.135416716337204, | |
| "reward_std": 0.6320051997900009, | |
| "rewards/accuracy_reward": 0.3020833469927311, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 1490.1250305175781, | |
| "entropy": 0.380859375, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.2870371639728546, | |
| "kl": 0.0225677490234375, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0009, | |
| "reward": 1.0625000149011612, | |
| "reward_std": 0.3832716643810272, | |
| "rewards/accuracy_reward": 0.16666667349636555, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 2113.1563110351562, | |
| "entropy": 0.701904296875, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.5894800424575806, | |
| "kl": 0.0378875732421875, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0015, | |
| "reward": 0.7708333358168602, | |
| "reward_std": 0.5113655403256416, | |
| "rewards/accuracy_reward": 0.09375, | |
| "rewards/format_reward": 0.6770833358168602, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 1501.5000610351562, | |
| "entropy": 0.60888671875, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.5986164212226868, | |
| "kl": 0.0314483642578125, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0013, | |
| "reward": 1.1979166865348816, | |
| "reward_std": 0.40703435614705086, | |
| "rewards/accuracy_reward": 0.38541667722165585, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 1404.7396240234375, | |
| "entropy": 0.4013671875, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.29348820447921753, | |
| "kl": 0.0249481201171875, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.001, | |
| "reward": 1.2812500298023224, | |
| "reward_std": 0.34860314428806305, | |
| "rewards/accuracy_reward": 0.4479166716337204, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 1369.3021240234375, | |
| "entropy": 0.5029296875, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.801852822303772, | |
| "kl": 0.029754638671875, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0012, | |
| "reward": 1.0937500298023224, | |
| "reward_std": 0.4080042615532875, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/format_reward": 0.9062500149011612, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.2857142857142857, | |
| "step": 250, | |
| "total_flos": 0.0, | |
| "train_loss": 0.00036543175232403515, | |
| "train_runtime": 19061.8828, | |
| "train_samples_per_second": 1.259, | |
| "train_steps_per_second": 0.013 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
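
The `log_history` array above is a flat list of per-step records keyed by `step`, `reward`, `rewards/accuracy_reward`, `rewards/format_reward`, `kl`, `learning_rate`, and `completion_length`, with a final summary record that carries only the run totals. A minimal sketch of how such a state file could be loaded and turned into reward curves is shown below; the file name `trainer_state.json` and the use of matplotlib are assumptions for illustration, not part of the logged run.

```python
import json
import matplotlib.pyplot as plt

# Assumed path for illustration; point it at the actual trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step records; the trailing summary entry has no "reward" key.
logs = [entry for entry in state["log_history"] if "reward" in entry]

steps = [entry["step"] for entry in logs]
reward = [entry["reward"] for entry in logs]
accuracy = [entry["rewards/accuracy_reward"] for entry in logs]
fmt = [entry["rewards/format_reward"] for entry in logs]

plt.plot(steps, reward, label="reward")
plt.plot(steps, accuracy, label="accuracy_reward")
plt.plot(steps, fmt, label="format_reward")
plt.xlabel("step")
plt.ylabel("mean reward")
plt.legend()
plt.savefig("reward_curves.png")
```

Under this reading of the log, total reward is the sum of the accuracy and format components at each step, so plotting all three together makes it easy to see which component drives changes in the overall curve.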