{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987438399845395, "eval_steps": 100, "global_step": 646, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 893.5828884124755, "epoch": 0.015460430959512996, "grad_norm": 0.05477782406272581, "kl": 0.0004816532135009766, "learning_rate": 3.0769230769230774e-06, "loss": 0.0, "reward": 0.21403060818556696, "reward_std": 0.18781698173843325, "rewards/accuracy_reward": 0.21403060818556696, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 817.3508731842041, "epoch": 0.03092086191902599, "grad_norm": 0.05164180394901766, "kl": 0.005183982849121094, "learning_rate": 6.153846153846155e-06, "loss": 0.0002, "reward": 0.36135203461162746, "reward_std": 0.1941540485713631, "rewards/accuracy_reward": 0.36135203461162746, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 798.6872316360474, "epoch": 0.04638129287853899, "grad_norm": 0.054602332205036755, "kl": 0.009011650085449218, "learning_rate": 9.230769230769232e-06, "loss": 0.0004, "reward": 0.43112243991345167, "reward_std": 0.21865551262162625, "rewards/accuracy_reward": 0.43112243991345167, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 757.7996000289917, "epoch": 0.06184172383805198, "grad_norm": 0.056117459068221236, "kl": 0.16724586486816406, "learning_rate": 1.230769230769231e-05, "loss": 0.0067, "reward": 0.49477039841003717, "reward_std": 0.20602972037158906, "rewards/accuracy_reward": 0.49477039841003717, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 748.4751132965088, "epoch": 0.07730215479756498, "grad_norm": 0.06290510173535674, "kl": 0.023724746704101563, "learning_rate": 1.5384615384615387e-05, "loss": 0.0009, "reward": 0.5140306028537452, "reward_std": 0.2120750412810594, "rewards/accuracy_reward": 0.5140306028537452, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 739.1664382934571, "epoch": 0.09276258575707798, "grad_norm": 0.05984969387434123, "kl": 0.03551025390625, "learning_rate": 1.8461538461538465e-05, "loss": 0.0014, "reward": 0.526785704959184, "reward_std": 0.20515552521683275, "rewards/accuracy_reward": 0.526785704959184, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 705.585445022583, "epoch": 0.10822301671659097, "grad_norm": 0.14192296132745433, "kl": 0.0682769775390625, "learning_rate": 1.999634547413886e-05, "loss": 0.0027, "reward": 0.5465561124496162, "reward_std": 0.23511593420989813, "rewards/accuracy_reward": 0.5465561124496162, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 655.4872343063355, "epoch": 0.12368344767610397, "grad_norm": 0.10437024812626647, "kl": 0.12667694091796874, "learning_rate": 1.9967125291968495e-05, "loss": 0.0051, "reward": 0.5284438657574355, "reward_std": 0.23128830096684397, "rewards/accuracy_reward": 0.5284438657574355, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 646.3841724395752, "epoch": 0.13914387863561697, "grad_norm": 0.10113391204729148, "kl": 0.186383056640625, "learning_rate": 1.990877034074683e-05, "loss": 0.0075, "reward": 0.5177295829169453, "reward_std": 0.23487156317569316, "rewards/accuracy_reward": 0.5177295829169453, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 670.7409303665161, "epoch": 0.15460430959512997, "grad_norm": 0.13002298056265418, "kl": 0.3470306396484375, "learning_rate": 1.9821451197042028e-05, "loss": 0.0139, "reward": 0.5079081559553742, "reward_std": 0.23903855890966952, "rewards/accuracy_reward": 0.5079081559553742, "rewards/format_reward": 0.0, "step": 100 }, { "epoch": 0.15460430959512997, "eval_completion_length": 551.0136389160157, "eval_kl": 0.2697265625, "eval_loss": 0.011112870648503304, "eval_reward": 0.530000029206276, "eval_reward_std": 0.2772822642326355, "eval_rewards/accuracy_reward": 0.530000029206276, "eval_rewards/format_reward": 0.0, "eval_runtime": 110.8876, "eval_samples_per_second": 0.893, "eval_steps_per_second": 0.036, "step": 100 }, { "completion_length": 549.0640195846557, "epoch": 0.17006474055464296, "grad_norm": 0.28429562437188954, "kl": 0.2885589599609375, "learning_rate": 1.9705423102261324e-05, "loss": 0.0115, "reward": 0.4946428484516218, "reward_std": 0.2506483959499747, "rewards/accuracy_reward": 0.4946428484516218, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 594.3490919113159, "epoch": 0.18552517151415596, "grad_norm": 0.09104387343519704, "kl": 0.21728363037109374, "learning_rate": 1.956102521655831e-05, "loss": 0.0087, "reward": 0.517984684323892, "reward_std": 0.2393535960931331, "rewards/accuracy_reward": 0.517984684323892, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 731.2531753540039, "epoch": 0.20098560247366895, "grad_norm": 0.5351787744232117, "kl": 0.826373291015625, "learning_rate": 1.9388679627438486e-05, "loss": 0.0331, "reward": 0.3987244822550565, "reward_std": 0.23079254203476013, "rewards/accuracy_reward": 0.3987244822550565, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 772.120905303955, "epoch": 0.21644603343318194, "grad_norm": 0.08975704690138181, "kl": 1.55367431640625, "learning_rate": 1.9188890115960967e-05, "loss": 0.0622, "reward": 0.4589285627473146, "reward_std": 0.23901810268871487, "rewards/accuracy_reward": 0.4589285627473146, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 749.2706487655639, "epoch": 0.23190646439269494, "grad_norm": 0.11341391574096771, "kl": 0.0939483642578125, "learning_rate": 1.8962240684142923e-05, "loss": 0.0038, "reward": 0.49234693106263877, "reward_std": 0.22801345195621253, "rewards/accuracy_reward": 0.49234693106263877, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 791.6997276306153, "epoch": 0.24736689535220793, "grad_norm": 0.3224456998961169, "kl": 0.3746490478515625, "learning_rate": 1.8709393847871146e-05, "loss": 0.015, "reward": 0.3957908083219081, "reward_std": 0.27026979019865394, "rewards/accuracy_reward": 0.3957908083219081, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 817.1753665924073, "epoch": 0.26282732631172095, "grad_norm": 0.24361295168325228, "kl": 0.87244873046875, "learning_rate": 1.8431088700310846e-05, "loss": 0.0349, "reward": 0.29821428112918513, "reward_std": 0.22583510340191423, "rewards/accuracy_reward": 0.29821428112918513, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 753.5609531402588, "epoch": 0.27828775727123395, "grad_norm": 0.1166452373188838, "kl": 0.372216796875, "learning_rate": 1.8128138751472432e-05, "loss": 0.0149, "reward": 0.3784438705071807, "reward_std": 0.23936128611676394, "rewards/accuracy_reward": 0.3784438705071807, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 655.2237131118775, "epoch": 0.29374818823074694, "grad_norm": 0.07909899864987424, "kl": 0.3955078125, "learning_rate": 1.780142955025139e-05, "loss": 0.0158, "reward": 0.4104591752868146, "reward_std": 0.2427055777516216, "rewards/accuracy_reward": 0.4104591752868146, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 569.6598121643067, "epoch": 0.30920861919025994, "grad_norm": 0.08783730023470541, "kl": 0.37242431640625, "learning_rate": 1.745191609589231e-05, "loss": 0.0149, "reward": 0.43775509400293233, "reward_std": 0.2702743059489876, "rewards/accuracy_reward": 0.43775509400293233, "rewards/format_reward": 0.0, "step": 200 }, { "epoch": 0.30920861919025994, "eval_completion_length": 592.9845031738281, "eval_kl": 0.38939453125, "eval_loss": 0.015397945418953896, "eval_reward": 0.4400000186264515, "eval_reward_std": 0.28451553016901016, "eval_rewards/accuracy_reward": 0.4400000186264515, "eval_rewards/format_reward": 0.0, "eval_runtime": 113.6553, "eval_samples_per_second": 0.871, "eval_steps_per_second": 0.035, "step": 200 }, { "completion_length": 590.8179740905762, "epoch": 0.32466905014977293, "grad_norm": 0.4429670525609896, "kl": 0.35711669921875, "learning_rate": 1.7080620046443503e-05, "loss": 0.0143, "reward": 0.4413265222683549, "reward_std": 0.26584886815398934, "rewards/accuracy_reward": 0.4413265222683549, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 590.4183538436889, "epoch": 0.3401294811092859, "grad_norm": 6.707552360764972, "kl": 0.8987060546875, "learning_rate": 1.6688626732362192e-05, "loss": 0.036, "reward": 0.4778061142191291, "reward_std": 0.264619250735268, "rewards/accuracy_reward": 0.4778061142191291, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 658.2447557449341, "epoch": 0.3555899120687989, "grad_norm": 0.11712079199179656, "kl": 0.389324951171875, "learning_rate": 1.6277081983999742e-05, "loss": 0.0156, "reward": 0.42512754308991135, "reward_std": 0.2852953039575368, "rewards/accuracy_reward": 0.42512754308991135, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 632.0840436935425, "epoch": 0.3710503430283119, "grad_norm": 0.07151809962617796, "kl": 0.4206787109375, "learning_rate": 1.5847188782240473e-05, "loss": 0.0168, "reward": 0.4248724417295307, "reward_std": 0.26311199530027807, "rewards/accuracy_reward": 0.4248724417295307, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 592.4107023239136, "epoch": 0.3865107739878249, "grad_norm": 0.08333585854235208, "kl": 0.354058837890625, "learning_rate": 1.5400203742084508e-05, "loss": 0.0142, "reward": 0.45089285019785164, "reward_std": 0.2697589965071529, "rewards/accuracy_reward": 0.45089285019785164, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 613.9152961730957, "epoch": 0.4019712049473379, "grad_norm": 0.08634652030847269, "kl": 0.39644775390625, "learning_rate": 1.4937433439453465e-05, "loss": 0.0159, "reward": 0.42321427781134846, "reward_std": 0.2768237174488604, "rewards/accuracy_reward": 0.42321427781134846, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 602.9336614608765, "epoch": 0.4174316359068509, "grad_norm": 0.09339307778633552, "kl": 0.427685546875, "learning_rate": 1.4460230591956097e-05, "loss": 0.0171, "reward": 0.441454073600471, "reward_std": 0.28628530018031595, "rewards/accuracy_reward": 0.441454073600471, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 607.3401681900025, "epoch": 0.4328920668663639, "grad_norm": 0.09497207943977368, "kl": 0.49241943359375, "learning_rate": 1.3969990104777712e-05, "loss": 0.0197, "reward": 0.3746173400897533, "reward_std": 0.287894579814747, "rewards/accuracy_reward": 0.3746173400897533, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 554.3765167236328, "epoch": 0.4483524978258769, "grad_norm": 0.10760598840353774, "kl": 0.4693603515625, "learning_rate": 1.3468144993251735e-05, "loss": 0.0188, "reward": 0.3869897902011871, "reward_std": 0.27005039355717597, "rewards/accuracy_reward": 0.3869897902011871, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 565.3260089874268, "epoch": 0.4638129287853899, "grad_norm": 0.08708785017207243, "kl": 0.47398681640625, "learning_rate": 1.295616219403197e-05, "loss": 0.019, "reward": 0.3772959113586694, "reward_std": 0.25850353664718567, "rewards/accuracy_reward": 0.3772959113586694, "rewards/format_reward": 0.0, "step": 300 }, { "epoch": 0.4638129287853899, "eval_completion_length": 563.6594519042969, "eval_kl": 0.46421875, "eval_loss": 0.016797564923763275, "eval_reward": 0.4000000201165676, "eval_reward_std": 0.32052057892084124, "eval_rewards/accuracy_reward": 0.4000000201165676, "eval_rewards/format_reward": 0.0, "eval_runtime": 112.3506, "eval_samples_per_second": 0.881, "eval_steps_per_second": 0.036, "step": 300 }, { "completion_length": 561.7035604476929, "epoch": 0.47927335974490287, "grad_norm": 0.09043618938604044, "kl": 0.439971923828125, "learning_rate": 1.2435538277109919e-05, "loss": 0.0176, "reward": 0.38673468665219846, "reward_std": 0.2600894993636757, "rewards/accuracy_reward": 0.38673468665219846, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 596.5308570861816, "epoch": 0.49473379070441587, "grad_norm": 0.14081320533250113, "kl": 0.438336181640625, "learning_rate": 1.19077950712113e-05, "loss": 0.0175, "reward": 0.4330357064725831, "reward_std": 0.2756219625007361, "rewards/accuracy_reward": 0.4330357064725831, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 607.1673355102539, "epoch": 0.5101942216639289, "grad_norm": 0.10027936642706035, "kl": 0.46512451171875, "learning_rate": 1.137447521535908e-05, "loss": 0.0186, "reward": 0.388903054734692, "reward_std": 0.27682310505770147, "rewards/accuracy_reward": 0.388903054734692, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 621.5595531463623, "epoch": 0.5256546526234419, "grad_norm": 0.11573394826594872, "kl": 0.507958984375, "learning_rate": 1.0837137649606241e-05, "loss": 0.0203, "reward": 0.38035713706631213, "reward_std": 0.280763331009075, "rewards/accuracy_reward": 0.38035713706631213, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 563.3604457855224, "epoch": 0.5411150835829549, "grad_norm": 0.1835125826838055, "kl": 0.409375, "learning_rate": 1.0297353058119209e-05, "loss": 0.0164, "reward": 0.415816318616271, "reward_std": 0.27372801350429654, "rewards/accuracy_reward": 0.415816318616271, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 587.2214157104493, "epoch": 0.5565755145424679, "grad_norm": 0.1000658792975156, "kl": 0.43155517578125, "learning_rate": 9.756699277932196e-06, "loss": 0.0173, "reward": 0.4036989708431065, "reward_std": 0.2821080778259784, "rewards/accuracy_reward": 0.4036989708431065, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 595.8506248474121, "epoch": 0.5720359455019809, "grad_norm": 0.8763257922920367, "kl": 0.4752197265625, "learning_rate": 9.216756686793163e-06, "loss": 0.019, "reward": 0.40025509386323394, "reward_std": 0.2795922602061182, "rewards/accuracy_reward": 0.40025509386323394, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 590.1071298599243, "epoch": 0.5874963764614939, "grad_norm": 0.10389955173536307, "kl": 0.515869140625, "learning_rate": 8.67910358358298e-06, "loss": 0.0206, "reward": 0.3820152990985662, "reward_std": 0.29924757215194403, "rewards/accuracy_reward": 0.3820152990985662, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 559.9300924301148, "epoch": 0.6029568074210069, "grad_norm": 0.1070276652241689, "kl": 0.4572998046875, "learning_rate": 8.145311574811325e-06, "loss": 0.0183, "reward": 0.38852040134370325, "reward_std": 0.2695158838760108, "rewards/accuracy_reward": 0.38852040134370325, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 585.0901662826539, "epoch": 0.6184172383805199, "grad_norm": 0.1240698442943613, "kl": 0.468115234375, "learning_rate": 7.616940980675004e-06, "loss": 0.0187, "reward": 0.39999999292194843, "reward_std": 0.2908332996070385, "rewards/accuracy_reward": 0.39999999292194843, "rewards/format_reward": 0.0, "step": 400 }, { "epoch": 0.6184172383805199, "eval_completion_length": 579.0039282226562, "eval_kl": 0.454453125, "eval_loss": 0.018264248967170715, "eval_reward": 0.4242857307195663, "eval_reward_std": 0.3304371988773346, "eval_rewards/accuracy_reward": 0.4242857307195663, "eval_rewards/format_reward": 0.0, "eval_runtime": 112.7887, "eval_samples_per_second": 0.878, "eval_steps_per_second": 0.035, "step": 400 }, { "completion_length": 596.5103164672852, "epoch": 0.6338776693400329, "grad_norm": 0.12074064710856633, "kl": 0.485009765625, "learning_rate": 7.095536274107046e-06, "loss": 0.0194, "reward": 0.38278060434386135, "reward_std": 0.295634587854147, "rewards/accuracy_reward": 0.38278060434386135, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 598.0414419174194, "epoch": 0.6493381002995459, "grad_norm": 0.12233619499866348, "kl": 0.53099365234375, "learning_rate": 6.58262156614881e-06, "loss": 0.0212, "reward": 0.36505101402290163, "reward_std": 0.2934939767234027, "rewards/accuracy_reward": 0.36505101402290163, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 567.2344249725342, "epoch": 0.6647985312590589, "grad_norm": 0.14290575648298232, "kl": 0.54254150390625, "learning_rate": 6.079696150841634e-06, "loss": 0.0217, "reward": 0.3419642790220678, "reward_std": 0.27319411835633217, "rewards/accuracy_reward": 0.3419642790220678, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 567.8533065795898, "epoch": 0.6802589622185718, "grad_norm": 0.1332021338177752, "kl": 0.55421142578125, "learning_rate": 5.588230122660672e-06, "loss": 0.0222, "reward": 0.3531887697754428, "reward_std": 0.27350661787204444, "rewards/accuracy_reward": 0.3531887697754428, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 572.4235851287842, "epoch": 0.6957193931780848, "grad_norm": 0.1659377031735989, "kl": 0.539794921875, "learning_rate": 5.109660079301668e-06, "loss": 0.0216, "reward": 0.36785713671706616, "reward_std": 0.2756811453495175, "rewards/accuracy_reward": 0.36785713671706616, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 570.152794265747, "epoch": 0.7111798241375978, "grad_norm": 0.12626468054028842, "kl": 0.51593017578125, "learning_rate": 4.64538492238166e-06, "loss": 0.0206, "reward": 0.38137754444032906, "reward_std": 0.2765601733699441, "rewards/accuracy_reward": 0.38137754444032906, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 593.0752410888672, "epoch": 0.7266402550971108, "grad_norm": 0.4239556346834354, "kl": 0.49893798828125, "learning_rate": 4.196761768328599e-06, "loss": 0.02, "reward": 0.3998724427074194, "reward_std": 0.2849443768151104, "rewards/accuracy_reward": 0.3998724427074194, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 610.4562358856201, "epoch": 0.7421006860566238, "grad_norm": 0.13103766064000663, "kl": 0.47908935546875, "learning_rate": 3.7651019814126656e-06, "loss": 0.0192, "reward": 0.39668366676196454, "reward_std": 0.2925532532390207, "rewards/accuracy_reward": 0.39668366676196454, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 600.7034322738648, "epoch": 0.7575611170161368, "grad_norm": 0.12932522707643554, "kl": 0.51405029296875, "learning_rate": 3.3516673405151546e-06, "loss": 0.0206, "reward": 0.3905612169764936, "reward_std": 0.28836513138376174, "rewards/accuracy_reward": 0.3905612169764936, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 591.6121044158936, "epoch": 0.7730215479756498, "grad_norm": 0.12487727544118296, "kl": 0.49349365234375, "learning_rate": 2.957666350839663e-06, "loss": 0.0197, "reward": 0.38303570710122586, "reward_std": 0.28170021236874165, "rewards/accuracy_reward": 0.38303570710122586, "rewards/format_reward": 0.0, "step": 500 }, { "epoch": 0.7730215479756498, "eval_completion_length": 594.4429809570313, "eval_kl": 0.496015625, "eval_loss": 0.019787130877375603, "eval_reward": 0.3971428729593754, "eval_reward_std": 0.3095328077673912, "eval_rewards/accuracy_reward": 0.3971428729593754, "eval_rewards/format_reward": 0.0, "eval_runtime": 113.39, "eval_samples_per_second": 0.873, "eval_steps_per_second": 0.035, "step": 500 }, { "completion_length": 599.3793228149414, "epoch": 0.7884819789351628, "grad_norm": 0.21375062613439136, "kl": 0.515185546875, "learning_rate": 2.5842507113469307e-06, "loss": 0.0206, "reward": 0.375255094806198, "reward_std": 0.27221947656944395, "rewards/accuracy_reward": 0.375255094806198, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 585.9983297348023, "epoch": 0.8039424098946758, "grad_norm": 0.1500149911042034, "kl": 0.5186767578125, "learning_rate": 2.2325119482391466e-06, "loss": 0.0207, "reward": 0.37665815658401697, "reward_std": 0.26996592339128256, "rewards/accuracy_reward": 0.37665815658401697, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 590.9803451538086, "epoch": 0.8194028408541888, "grad_norm": 0.46915042082176195, "kl": 0.5627685546875, "learning_rate": 1.9034782243345074e-06, "loss": 0.0225, "reward": 0.3559948909911327, "reward_std": 0.2665759083814919, "rewards/accuracy_reward": 0.3559948909911327, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 587.5531749725342, "epoch": 0.8348632718137018, "grad_norm": 0.35357251691875513, "kl": 0.5494384765625, "learning_rate": 1.5981113336584041e-06, "loss": 0.022, "reward": 0.3682397903525271, "reward_std": 0.2814167995005846, "rewards/accuracy_reward": 0.3682397903525271, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 567.9155498504639, "epoch": 0.8503237027732148, "grad_norm": 1.5999018510524907, "kl": 0.89453125, "learning_rate": 1.3173038900362977e-06, "loss": 0.0358, "reward": 0.3917091765906662, "reward_std": 0.28189596980810167, "rewards/accuracy_reward": 0.3917091765906662, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 558.6094276428223, "epoch": 0.8657841337327278, "grad_norm": 1.9953500059119544, "kl": 1.2019775390625, "learning_rate": 1.0618767179063416e-06, "loss": 0.0481, "reward": 0.3970663193613291, "reward_std": 0.2844697198830545, "rewards/accuracy_reward": 0.3970663193613291, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 590.6031772613526, "epoch": 0.8812445646922408, "grad_norm": 1.7788121098553087, "kl": 1.5089599609375, "learning_rate": 8.325764529785851e-07, "loss": 0.0604, "reward": 0.3730867275618948, "reward_std": 0.26905877529643474, "rewards/accuracy_reward": 0.3730867275618948, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 608.6429714202881, "epoch": 0.8967049956517538, "grad_norm": 23.56674292032029, "kl": 1.8998779296875, "learning_rate": 6.300733597542086e-07, "loss": 0.076, "reward": 0.3751275438349694, "reward_std": 0.28051954903639853, "rewards/accuracy_reward": 0.3751275438349694, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 600.8526664733887, "epoch": 0.9121654266112668, "grad_norm": 0.987276324273893, "kl": 1.8114501953125, "learning_rate": 4.549593722844492e-07, "loss": 0.0725, "reward": 0.38966836063191296, "reward_std": 0.2809427470434457, "rewards/accuracy_reward": 0.38966836063191296, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 589.8212898254394, "epoch": 0.9276258575707798, "grad_norm": 0.7872302567234748, "kl": 1.91683349609375, "learning_rate": 3.0774636389618196e-07, "loss": 0.0767, "reward": 0.4075255025178194, "reward_std": 0.2732542660087347, "rewards/accuracy_reward": 0.4075255025178194, "rewards/format_reward": 0.0, "step": 600 }, { "epoch": 0.9276258575707798, "eval_completion_length": 574.0087805175781, "eval_kl": 1.86640625, "eval_loss": 0.07402127236127853, "eval_reward": 0.39571430325508117, "eval_reward_std": 0.3305768924951553, "eval_rewards/accuracy_reward": 0.39571430325508117, "eval_rewards/format_reward": 0.0, "eval_runtime": 112.1819, "eval_samples_per_second": 0.882, "eval_steps_per_second": 0.036, "step": 600 }, { "completion_length": 582.802791595459, "epoch": 0.9430862885302927, "grad_norm": 0.8922411277504559, "kl": 1.8591552734375, "learning_rate": 1.8886465094192895e-07, "loss": 0.0744, "reward": 0.37487244252115487, "reward_std": 0.2767856624443084, "rewards/accuracy_reward": 0.37487244252115487, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 577.6701448440551, "epoch": 0.9585467194898057, "grad_norm": 0.6713673354373313, "kl": 1.8814208984375, "learning_rate": 9.866173494794462e-08, "loss": 0.0753, "reward": 0.3966836669947952, "reward_std": 0.27219036710448563, "rewards/accuracy_reward": 0.3966836669947952, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 571.5952686309814, "epoch": 0.9740071504493187, "grad_norm": 1.6739477461143617, "kl": 1.7553955078125, "learning_rate": 3.7401286837214224e-08, "loss": 0.0702, "reward": 0.38431122010806573, "reward_std": 0.2745078310370445, "rewards/accuracy_reward": 0.38431122010806573, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 575.404195022583, "epoch": 0.9894675814088317, "grad_norm": 0.5847733761169498, "kl": 1.7476806640625, "learning_rate": 5.262376196544239e-09, "loss": 0.0699, "reward": 0.37410713639110327, "reward_std": 0.28336770869791506, "rewards/accuracy_reward": 0.37410713639110327, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 569.2087484995524, "epoch": 0.9987438399845395, "kl": 1.7266031901041667, "reward": 0.40284863238533336, "reward_std": 0.28753911207119626, "rewards/accuracy_reward": 0.40284863238533336, "rewards/format_reward": 0.0, "step": 646, "total_flos": 0.0, "train_loss": 0.025075771342079053, "train_runtime": 60646.5116, "train_samples_per_second": 1.194, "train_steps_per_second": 0.011 } ], "logging_steps": 10, "max_steps": 646, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 7, "trial_name": null, "trial_params": null }