{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9987438399845395,
  "eval_steps": 100,
  "global_step": 646,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 893.5828884124755,
      "epoch": 0.015460430959512996,
      "grad_norm": 0.05477782406272581,
      "kl": 0.0004816532135009766,
      "learning_rate": 3.0769230769230774e-06,
      "loss": 0.0,
      "reward": 0.21403060818556696,
      "reward_std": 0.18781698173843325,
      "rewards/accuracy_reward": 0.21403060818556696,
      "rewards/format_reward": 0.0,
      "step": 10
    },
    {
      "completion_length": 817.3508731842041,
      "epoch": 0.03092086191902599,
      "grad_norm": 0.05164180394901766,
      "kl": 0.005183982849121094,
      "learning_rate": 6.153846153846155e-06,
      "loss": 0.0002,
      "reward": 0.36135203461162746,
      "reward_std": 0.1941540485713631,
      "rewards/accuracy_reward": 0.36135203461162746,
      "rewards/format_reward": 0.0,
      "step": 20
    },
    {
      "completion_length": 798.6872316360474,
      "epoch": 0.04638129287853899,
      "grad_norm": 0.054602332205036755,
      "kl": 0.009011650085449218,
      "learning_rate": 9.230769230769232e-06,
      "loss": 0.0004,
      "reward": 0.43112243991345167,
      "reward_std": 0.21865551262162625,
      "rewards/accuracy_reward": 0.43112243991345167,
      "rewards/format_reward": 0.0,
      "step": 30
    },
    {
      "completion_length": 757.7996000289917,
      "epoch": 0.06184172383805198,
      "grad_norm": 0.056117459068221236,
      "kl": 0.16724586486816406,
      "learning_rate": 1.230769230769231e-05,
      "loss": 0.0067,
      "reward": 0.49477039841003717,
      "reward_std": 0.20602972037158906,
      "rewards/accuracy_reward": 0.49477039841003717,
      "rewards/format_reward": 0.0,
      "step": 40
    },
    {
      "completion_length": 748.4751132965088,
      "epoch": 0.07730215479756498,
      "grad_norm": 0.06290510173535674,
      "kl": 0.023724746704101563,
      "learning_rate": 1.5384615384615387e-05,
      "loss": 0.0009,
      "reward": 0.5140306028537452,
      "reward_std": 0.2120750412810594,
      "rewards/accuracy_reward": 0.5140306028537452,
      "rewards/format_reward": 0.0,
      "step": 50
    },
    {
      "completion_length": 739.1664382934571,
      "epoch": 0.09276258575707798,
      "grad_norm": 0.05984969387434123,
      "kl": 0.03551025390625,
      "learning_rate": 1.8461538461538465e-05,
      "loss": 0.0014,
      "reward": 0.526785704959184,
      "reward_std": 0.20515552521683275,
      "rewards/accuracy_reward": 0.526785704959184,
      "rewards/format_reward": 0.0,
      "step": 60
    },
    {
      "completion_length": 705.585445022583,
      "epoch": 0.10822301671659097,
      "grad_norm": 0.14192296132745433,
      "kl": 0.0682769775390625,
      "learning_rate": 1.999634547413886e-05,
      "loss": 0.0027,
      "reward": 0.5465561124496162,
      "reward_std": 0.23511593420989813,
      "rewards/accuracy_reward": 0.5465561124496162,
      "rewards/format_reward": 0.0,
      "step": 70
    },
    {
      "completion_length": 655.4872343063355,
      "epoch": 0.12368344767610397,
      "grad_norm": 0.10437024812626647,
      "kl": 0.12667694091796874,
      "learning_rate": 1.9967125291968495e-05,
      "loss": 0.0051,
      "reward": 0.5284438657574355,
      "reward_std": 0.23128830096684397,
      "rewards/accuracy_reward": 0.5284438657574355,
      "rewards/format_reward": 0.0,
      "step": 80
    },
    {
      "completion_length": 646.3841724395752,
      "epoch": 0.13914387863561697,
      "grad_norm": 0.10113391204729148,
      "kl": 0.186383056640625,
      "learning_rate": 1.990877034074683e-05,
      "loss": 0.0075,
      "reward": 0.5177295829169453,
      "reward_std": 0.23487156317569316,
      "rewards/accuracy_reward": 0.5177295829169453,
      "rewards/format_reward": 0.0,
      "step": 90
    },
    {
      "completion_length": 670.7409303665161,
      "epoch": 0.15460430959512997,
      "grad_norm": 0.13002298056265418,
      "kl": 0.3470306396484375,
      "learning_rate": 1.9821451197042028e-05,
      "loss": 0.0139,
      "reward": 0.5079081559553742,
      "reward_std": 0.23903855890966952,
      "rewards/accuracy_reward": 0.5079081559553742,
      "rewards/format_reward": 0.0,
      "step": 100
    },
    {
      "epoch": 0.15460430959512997,
      "eval_completion_length": 551.0136389160157,
      "eval_kl": 0.2697265625,
      "eval_loss": 0.011112870648503304,
      "eval_reward": 0.530000029206276,
      "eval_reward_std": 0.2772822642326355,
      "eval_rewards/accuracy_reward": 0.530000029206276,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 110.8876,
      "eval_samples_per_second": 0.893,
      "eval_steps_per_second": 0.036,
      "step": 100
    },
    {
      "completion_length": 549.0640195846557,
      "epoch": 0.17006474055464296,
      "grad_norm": 0.28429562437188954,
      "kl": 0.2885589599609375,
      "learning_rate": 1.9705423102261324e-05,
      "loss": 0.0115,
      "reward": 0.4946428484516218,
      "reward_std": 0.2506483959499747,
      "rewards/accuracy_reward": 0.4946428484516218,
      "rewards/format_reward": 0.0,
      "step": 110
    },
    {
      "completion_length": 594.3490919113159,
      "epoch": 0.18552517151415596,
      "grad_norm": 0.09104387343519704,
      "kl": 0.21728363037109374,
      "learning_rate": 1.956102521655831e-05,
      "loss": 0.0087,
      "reward": 0.517984684323892,
      "reward_std": 0.2393535960931331,
      "rewards/accuracy_reward": 0.517984684323892,
      "rewards/format_reward": 0.0,
      "step": 120
    },
    {
      "completion_length": 731.2531753540039,
      "epoch": 0.20098560247366895,
      "grad_norm": 0.5351787744232117,
      "kl": 0.826373291015625,
      "learning_rate": 1.9388679627438486e-05,
      "loss": 0.0331,
      "reward": 0.3987244822550565,
      "reward_std": 0.23079254203476013,
      "rewards/accuracy_reward": 0.3987244822550565,
      "rewards/format_reward": 0.0,
      "step": 130
    },
    {
      "completion_length": 772.120905303955,
      "epoch": 0.21644603343318194,
      "grad_norm": 0.08975704690138181,
      "kl": 1.55367431640625,
      "learning_rate": 1.9188890115960967e-05,
      "loss": 0.0622,
      "reward": 0.4589285627473146,
      "reward_std": 0.23901810268871487,
      "rewards/accuracy_reward": 0.4589285627473146,
      "rewards/format_reward": 0.0,
      "step": 140
    },
    {
      "completion_length": 749.2706487655639,
      "epoch": 0.23190646439269494,
      "grad_norm": 0.11341391574096771,
      "kl": 0.0939483642578125,
      "learning_rate": 1.8962240684142923e-05,
      "loss": 0.0038,
      "reward": 0.49234693106263877,
      "reward_std": 0.22801345195621253,
      "rewards/accuracy_reward": 0.49234693106263877,
      "rewards/format_reward": 0.0,
      "step": 150
    },
    {
      "completion_length": 791.6997276306153,
      "epoch": 0.24736689535220793,
      "grad_norm": 0.3224456998961169,
      "kl": 0.3746490478515625,
      "learning_rate": 1.8709393847871146e-05,
      "loss": 0.015,
      "reward": 0.3957908083219081,
      "reward_std": 0.27026979019865394,
      "rewards/accuracy_reward": 0.3957908083219081,
      "rewards/format_reward": 0.0,
      "step": 160
    },
    {
      "completion_length": 817.1753665924073,
      "epoch": 0.26282732631172095,
      "grad_norm": 0.24361295168325228,
      "kl": 0.87244873046875,
      "learning_rate": 1.8431088700310846e-05,
      "loss": 0.0349,
      "reward": 0.29821428112918513,
      "reward_std": 0.22583510340191423,
      "rewards/accuracy_reward": 0.29821428112918513,
      "rewards/format_reward": 0.0,
      "step": 170
    },
    {
      "completion_length": 753.5609531402588,
      "epoch": 0.27828775727123395,
      "grad_norm": 0.1166452373188838,
      "kl": 0.372216796875,
      "learning_rate": 1.8128138751472432e-05,
      "loss": 0.0149,
      "reward": 0.3784438705071807,
      "reward_std": 0.23936128611676394,
      "rewards/accuracy_reward": 0.3784438705071807,
      "rewards/format_reward": 0.0,
      "step": 180
    },
    {
      "completion_length": 655.2237131118775,
      "epoch": 0.29374818823074694,
      "grad_norm": 0.07909899864987424,
      "kl": 0.3955078125,
      "learning_rate": 1.780142955025139e-05,
      "loss": 0.0158,
      "reward": 0.4104591752868146,
      "reward_std": 0.2427055777516216,
      "rewards/accuracy_reward": 0.4104591752868146,
      "rewards/format_reward": 0.0,
      "step": 190
    },
    {
      "completion_length": 569.6598121643067,
      "epoch": 0.30920861919025994,
      "grad_norm": 0.08783730023470541,
      "kl": 0.37242431640625,
      "learning_rate": 1.745191609589231e-05,
      "loss": 0.0149,
      "reward": 0.43775509400293233,
      "reward_std": 0.2702743059489876,
      "rewards/accuracy_reward": 0.43775509400293233,
      "rewards/format_reward": 0.0,
      "step": 200
    },
    {
      "epoch": 0.30920861919025994,
      "eval_completion_length": 592.9845031738281,
      "eval_kl": 0.38939453125,
      "eval_loss": 0.015397945418953896,
      "eval_reward": 0.4400000186264515,
      "eval_reward_std": 0.28451553016901016,
      "eval_rewards/accuracy_reward": 0.4400000186264515,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 113.6553,
      "eval_samples_per_second": 0.871,
      "eval_steps_per_second": 0.035,
      "step": 200
    },
    {
      "completion_length": 590.8179740905762,
      "epoch": 0.32466905014977293,
      "grad_norm": 0.4429670525609896,
      "kl": 0.35711669921875,
      "learning_rate": 1.7080620046443503e-05,
      "loss": 0.0143,
      "reward": 0.4413265222683549,
      "reward_std": 0.26584886815398934,
      "rewards/accuracy_reward": 0.4413265222683549,
      "rewards/format_reward": 0.0,
      "step": 210
    },
    {
      "completion_length": 590.4183538436889,
      "epoch": 0.3401294811092859,
      "grad_norm": 6.707552360764972,
      "kl": 0.8987060546875,
      "learning_rate": 1.6688626732362192e-05,
      "loss": 0.036,
      "reward": 0.4778061142191291,
      "reward_std": 0.264619250735268,
      "rewards/accuracy_reward": 0.4778061142191291,
      "rewards/format_reward": 0.0,
      "step": 220
    },
    {
      "completion_length": 658.2447557449341,
      "epoch": 0.3555899120687989,
      "grad_norm": 0.11712079199179656,
      "kl": 0.389324951171875,
      "learning_rate": 1.6277081983999742e-05,
      "loss": 0.0156,
      "reward": 0.42512754308991135,
      "reward_std": 0.2852953039575368,
      "rewards/accuracy_reward": 0.42512754308991135,
      "rewards/format_reward": 0.0,
      "step": 230
    },
    {
      "completion_length": 632.0840436935425,
      "epoch": 0.3710503430283119,
      "grad_norm": 0.07151809962617796,
      "kl": 0.4206787109375,
      "learning_rate": 1.5847188782240473e-05,
      "loss": 0.0168,
      "reward": 0.4248724417295307,
      "reward_std": 0.26311199530027807,
      "rewards/accuracy_reward": 0.4248724417295307,
      "rewards/format_reward": 0.0,
      "step": 240
    },
    {
      "completion_length": 592.4107023239136,
      "epoch": 0.3865107739878249,
      "grad_norm": 0.08333585854235208,
      "kl": 0.354058837890625,
      "learning_rate": 1.5400203742084508e-05,
      "loss": 0.0142,
      "reward": 0.45089285019785164,
      "reward_std": 0.2697589965071529,
      "rewards/accuracy_reward": 0.45089285019785164,
      "rewards/format_reward": 0.0,
      "step": 250
    },
    {
      "completion_length": 613.9152961730957,
      "epoch": 0.4019712049473379,
      "grad_norm": 0.08634652030847269,
      "kl": 0.39644775390625,
      "learning_rate": 1.4937433439453465e-05,
      "loss": 0.0159,
      "reward": 0.42321427781134846,
      "reward_std": 0.2768237174488604,
      "rewards/accuracy_reward": 0.42321427781134846,
      "rewards/format_reward": 0.0,
      "step": 260
    },
    {
      "completion_length": 602.9336614608765,
      "epoch": 0.4174316359068509,
      "grad_norm": 0.09339307778633552,
      "kl": 0.427685546875,
      "learning_rate": 1.4460230591956097e-05,
      "loss": 0.0171,
      "reward": 0.441454073600471,
      "reward_std": 0.28628530018031595,
      "rewards/accuracy_reward": 0.441454073600471,
      "rewards/format_reward": 0.0,
      "step": 270
    },
    {
      "completion_length": 607.3401681900025,
      "epoch": 0.4328920668663639,
      "grad_norm": 0.09497207943977368,
      "kl": 0.49241943359375,
      "learning_rate": 1.3969990104777712e-05,
      "loss": 0.0197,
      "reward": 0.3746173400897533,
      "reward_std": 0.287894579814747,
      "rewards/accuracy_reward": 0.3746173400897533,
      "rewards/format_reward": 0.0,
      "step": 280
    },
    {
      "completion_length": 554.3765167236328,
      "epoch": 0.4483524978258769,
      "grad_norm": 0.10760598840353774,
      "kl": 0.4693603515625,
      "learning_rate": 1.3468144993251735e-05,
      "loss": 0.0188,
      "reward": 0.3869897902011871,
      "reward_std": 0.27005039355717597,
      "rewards/accuracy_reward": 0.3869897902011871,
      "rewards/format_reward": 0.0,
      "step": 290
    },
    {
      "completion_length": 565.3260089874268,
      "epoch": 0.4638129287853899,
      "grad_norm": 0.08708785017207243,
      "kl": 0.47398681640625,
      "learning_rate": 1.295616219403197e-05,
      "loss": 0.019,
      "reward": 0.3772959113586694,
      "reward_std": 0.25850353664718567,
      "rewards/accuracy_reward": 0.3772959113586694,
      "rewards/format_reward": 0.0,
      "step": 300
    },
    {
      "epoch": 0.4638129287853899,
      "eval_completion_length": 563.6594519042969,
      "eval_kl": 0.46421875,
      "eval_loss": 0.016797564923763275,
      "eval_reward": 0.4000000201165676,
      "eval_reward_std": 0.32052057892084124,
      "eval_rewards/accuracy_reward": 0.4000000201165676,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 112.3506,
      "eval_samples_per_second": 0.881,
      "eval_steps_per_second": 0.036,
      "step": 300
    },
    {
      "completion_length": 561.7035604476929,
      "epoch": 0.47927335974490287,
      "grad_norm": 0.09043618938604044,
      "kl": 0.439971923828125,
      "learning_rate": 1.2435538277109919e-05,
      "loss": 0.0176,
      "reward": 0.38673468665219846,
      "reward_std": 0.2600894993636757,
      "rewards/accuracy_reward": 0.38673468665219846,
      "rewards/format_reward": 0.0,
      "step": 310
    },
    {
      "completion_length": 596.5308570861816,
      "epoch": 0.49473379070441587,
      "grad_norm": 0.14081320533250113,
      "kl": 0.438336181640625,
      "learning_rate": 1.19077950712113e-05,
      "loss": 0.0175,
      "reward": 0.4330357064725831,
      "reward_std": 0.2756219625007361,
      "rewards/accuracy_reward": 0.4330357064725831,
      "rewards/format_reward": 0.0,
      "step": 320
    },
    {
      "completion_length": 607.1673355102539,
      "epoch": 0.5101942216639289,
      "grad_norm": 0.10027936642706035,
      "kl": 0.46512451171875,
      "learning_rate": 1.137447521535908e-05,
      "loss": 0.0186,
      "reward": 0.388903054734692,
      "reward_std": 0.27682310505770147,
      "rewards/accuracy_reward": 0.388903054734692,
      "rewards/format_reward": 0.0,
      "step": 330
    },
    {
      "completion_length": 621.5595531463623,
      "epoch": 0.5256546526234419,
      "grad_norm": 0.11573394826594872,
      "kl": 0.507958984375,
      "learning_rate": 1.0837137649606241e-05,
      "loss": 0.0203,
      "reward": 0.38035713706631213,
      "reward_std": 0.280763331009075,
      "rewards/accuracy_reward": 0.38035713706631213,
      "rewards/format_reward": 0.0,
      "step": 340
    },
    {
      "completion_length": 563.3604457855224,
      "epoch": 0.5411150835829549,
      "grad_norm": 0.1835125826838055,
      "kl": 0.409375,
      "learning_rate": 1.0297353058119209e-05,
      "loss": 0.0164,
      "reward": 0.415816318616271,
      "reward_std": 0.27372801350429654,
      "rewards/accuracy_reward": 0.415816318616271,
      "rewards/format_reward": 0.0,
      "step": 350
    },
    {
      "completion_length": 587.2214157104493,
      "epoch": 0.5565755145424679,
      "grad_norm": 0.1000658792975156,
      "kl": 0.43155517578125,
      "learning_rate": 9.756699277932196e-06,
      "loss": 0.0173,
      "reward": 0.4036989708431065,
      "reward_std": 0.2821080778259784,
      "rewards/accuracy_reward": 0.4036989708431065,
      "rewards/format_reward": 0.0,
      "step": 360
    },
    {
      "completion_length": 595.8506248474121,
      "epoch": 0.5720359455019809,
      "grad_norm": 0.8763257922920367,
      "kl": 0.4752197265625,
      "learning_rate": 9.216756686793163e-06,
      "loss": 0.019,
      "reward": 0.40025509386323394,
      "reward_std": 0.2795922602061182,
      "rewards/accuracy_reward": 0.40025509386323394,
      "rewards/format_reward": 0.0,
      "step": 370
    },
    {
      "completion_length": 590.1071298599243,
      "epoch": 0.5874963764614939,
      "grad_norm": 0.10389955173536307,
      "kl": 0.515869140625,
      "learning_rate": 8.67910358358298e-06,
      "loss": 0.0206,
      "reward": 0.3820152990985662,
      "reward_std": 0.29924757215194403,
      "rewards/accuracy_reward": 0.3820152990985662,
      "rewards/format_reward": 0.0,
      "step": 380
    },
    {
      "completion_length": 559.9300924301148,
      "epoch": 0.6029568074210069,
      "grad_norm": 0.1070276652241689,
      "kl": 0.4572998046875,
      "learning_rate": 8.145311574811325e-06,
      "loss": 0.0183,
      "reward": 0.38852040134370325,
      "reward_std": 0.2695158838760108,
      "rewards/accuracy_reward": 0.38852040134370325,
      "rewards/format_reward": 0.0,
      "step": 390
    },
    {
      "completion_length": 585.0901662826539,
      "epoch": 0.6184172383805199,
      "grad_norm": 0.1240698442943613,
      "kl": 0.468115234375,
      "learning_rate": 7.616940980675004e-06,
      "loss": 0.0187,
      "reward": 0.39999999292194843,
      "reward_std": 0.2908332996070385,
      "rewards/accuracy_reward": 0.39999999292194843,
      "rewards/format_reward": 0.0,
      "step": 400
    },
    {
      "epoch": 0.6184172383805199,
      "eval_completion_length": 579.0039282226562,
      "eval_kl": 0.454453125,
      "eval_loss": 0.018264248967170715,
      "eval_reward": 0.4242857307195663,
      "eval_reward_std": 0.3304371988773346,
      "eval_rewards/accuracy_reward": 0.4242857307195663,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 112.7887,
      "eval_samples_per_second": 0.878,
      "eval_steps_per_second": 0.035,
      "step": 400
    },
    {
      "completion_length": 596.5103164672852,
      "epoch": 0.6338776693400329,
      "grad_norm": 0.12074064710856633,
      "kl": 0.485009765625,
      "learning_rate": 7.095536274107046e-06,
      "loss": 0.0194,
      "reward": 0.38278060434386135,
      "reward_std": 0.295634587854147,
      "rewards/accuracy_reward": 0.38278060434386135,
      "rewards/format_reward": 0.0,
      "step": 410
    },
    {
      "completion_length": 598.0414419174194,
      "epoch": 0.6493381002995459,
      "grad_norm": 0.12233619499866348,
      "kl": 0.53099365234375,
      "learning_rate": 6.58262156614881e-06,
      "loss": 0.0212,
      "reward": 0.36505101402290163,
      "reward_std": 0.2934939767234027,
      "rewards/accuracy_reward": 0.36505101402290163,
      "rewards/format_reward": 0.0,
      "step": 420
    },
    {
      "completion_length": 567.2344249725342,
      "epoch": 0.6647985312590589,
      "grad_norm": 0.14290575648298232,
      "kl": 0.54254150390625,
      "learning_rate": 6.079696150841634e-06,
      "loss": 0.0217,
      "reward": 0.3419642790220678,
      "reward_std": 0.27319411835633217,
      "rewards/accuracy_reward": 0.3419642790220678,
      "rewards/format_reward": 0.0,
      "step": 430
    },
    {
      "completion_length": 567.8533065795898,
      "epoch": 0.6802589622185718,
      "grad_norm": 0.1332021338177752,
      "kl": 0.55421142578125,
      "learning_rate": 5.588230122660672e-06,
      "loss": 0.0222,
      "reward": 0.3531887697754428,
      "reward_std": 0.27350661787204444,
      "rewards/accuracy_reward": 0.3531887697754428,
      "rewards/format_reward": 0.0,
      "step": 440
    },
    {
      "completion_length": 572.4235851287842,
      "epoch": 0.6957193931780848,
      "grad_norm": 0.1659377031735989,
      "kl": 0.539794921875,
      "learning_rate": 5.109660079301668e-06,
      "loss": 0.0216,
      "reward": 0.36785713671706616,
      "reward_std": 0.2756811453495175,
      "rewards/accuracy_reward": 0.36785713671706616,
      "rewards/format_reward": 0.0,
      "step": 450
    },
    {
      "completion_length": 570.152794265747,
      "epoch": 0.7111798241375978,
      "grad_norm": 0.12626468054028842,
      "kl": 0.51593017578125,
      "learning_rate": 4.64538492238166e-06,
      "loss": 0.0206,
      "reward": 0.38137754444032906,
      "reward_std": 0.2765601733699441,
      "rewards/accuracy_reward": 0.38137754444032906,
      "rewards/format_reward": 0.0,
      "step": 460
    },
    {
      "completion_length": 593.0752410888672,
      "epoch": 0.7266402550971108,
      "grad_norm": 0.4239556346834354,
      "kl": 0.49893798828125,
      "learning_rate": 4.196761768328599e-06,
      "loss": 0.02,
      "reward": 0.3998724427074194,
      "reward_std": 0.2849443768151104,
      "rewards/accuracy_reward": 0.3998724427074194,
      "rewards/format_reward": 0.0,
      "step": 470
    },
    {
      "completion_length": 610.4562358856201,
      "epoch": 0.7421006860566238,
      "grad_norm": 0.13103766064000663,
      "kl": 0.47908935546875,
      "learning_rate": 3.7651019814126656e-06,
      "loss": 0.0192,
      "reward": 0.39668366676196454,
      "reward_std": 0.2925532532390207,
      "rewards/accuracy_reward": 0.39668366676196454,
      "rewards/format_reward": 0.0,
      "step": 480
    },
    {
      "completion_length": 600.7034322738648,
      "epoch": 0.7575611170161368,
      "grad_norm": 0.12932522707643554,
      "kl": 0.51405029296875,
      "learning_rate": 3.3516673405151546e-06,
      "loss": 0.0206,
      "reward": 0.3905612169764936,
      "reward_std": 0.28836513138376174,
      "rewards/accuracy_reward": 0.3905612169764936,
      "rewards/format_reward": 0.0,
      "step": 490
    },
    {
      "completion_length": 591.6121044158936,
      "epoch": 0.7730215479756498,
      "grad_norm": 0.12487727544118296,
      "kl": 0.49349365234375,
      "learning_rate": 2.957666350839663e-06,
      "loss": 0.0197,
      "reward": 0.38303570710122586,
      "reward_std": 0.28170021236874165,
      "rewards/accuracy_reward": 0.38303570710122586,
      "rewards/format_reward": 0.0,
      "step": 500
    },
    {
      "epoch": 0.7730215479756498,
      "eval_completion_length": 594.4429809570313,
      "eval_kl": 0.496015625,
      "eval_loss": 0.019787130877375603,
      "eval_reward": 0.3971428729593754,
      "eval_reward_std": 0.3095328077673912,
      "eval_rewards/accuracy_reward": 0.3971428729593754,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 113.39,
      "eval_samples_per_second": 0.873,
      "eval_steps_per_second": 0.035,
      "step": 500
    },
    {
      "completion_length": 599.3793228149414,
      "epoch": 0.7884819789351628,
      "grad_norm": 0.21375062613439136,
      "kl": 0.515185546875,
      "learning_rate": 2.5842507113469307e-06,
      "loss": 0.0206,
      "reward": 0.375255094806198,
      "reward_std": 0.27221947656944395,
      "rewards/accuracy_reward": 0.375255094806198,
      "rewards/format_reward": 0.0,
      "step": 510
    },
    {
      "completion_length": 585.9983297348023,
      "epoch": 0.8039424098946758,
      "grad_norm": 0.1500149911042034,
      "kl": 0.5186767578125,
      "learning_rate": 2.2325119482391466e-06,
      "loss": 0.0207,
      "reward": 0.37665815658401697,
      "reward_std": 0.26996592339128256,
      "rewards/accuracy_reward": 0.37665815658401697,
      "rewards/format_reward": 0.0,
      "step": 520
    },
    {
      "completion_length": 590.9803451538086,
      "epoch": 0.8194028408541888,
      "grad_norm": 0.46915042082176195,
      "kl": 0.5627685546875,
      "learning_rate": 1.9034782243345074e-06,
      "loss": 0.0225,
      "reward": 0.3559948909911327,
      "reward_std": 0.2665759083814919,
      "rewards/accuracy_reward": 0.3559948909911327,
      "rewards/format_reward": 0.0,
      "step": 530
    },
    {
      "completion_length": 587.5531749725342,
      "epoch": 0.8348632718137018,
      "grad_norm": 0.35357251691875513,
      "kl": 0.5494384765625,
      "learning_rate": 1.5981113336584041e-06,
      "loss": 0.022,
      "reward": 0.3682397903525271,
      "reward_std": 0.2814167995005846,
      "rewards/accuracy_reward": 0.3682397903525271,
      "rewards/format_reward": 0.0,
      "step": 540
    },
    {
      "completion_length": 567.9155498504639,
      "epoch": 0.8503237027732148,
      "grad_norm": 1.5999018510524907,
      "kl": 0.89453125,
      "learning_rate": 1.3173038900362977e-06,
      "loss": 0.0358,
      "reward": 0.3917091765906662,
      "reward_std": 0.28189596980810167,
      "rewards/accuracy_reward": 0.3917091765906662,
      "rewards/format_reward": 0.0,
      "step": 550
    },
    {
      "completion_length": 558.6094276428223,
      "epoch": 0.8657841337327278,
      "grad_norm": 1.9953500059119544,
      "kl": 1.2019775390625,
      "learning_rate": 1.0618767179063416e-06,
      "loss": 0.0481,
      "reward": 0.3970663193613291,
      "reward_std": 0.2844697198830545,
      "rewards/accuracy_reward": 0.3970663193613291,
      "rewards/format_reward": 0.0,
      "step": 560
    },
    {
      "completion_length": 590.6031772613526,
      "epoch": 0.8812445646922408,
      "grad_norm": 1.7788121098553087,
      "kl": 1.5089599609375,
      "learning_rate": 8.325764529785851e-07,
      "loss": 0.0604,
      "reward": 0.3730867275618948,
      "reward_std": 0.26905877529643474,
      "rewards/accuracy_reward": 0.3730867275618948,
      "rewards/format_reward": 0.0,
      "step": 570
    },
    {
      "completion_length": 608.6429714202881,
      "epoch": 0.8967049956517538,
      "grad_norm": 23.56674292032029,
      "kl": 1.8998779296875,
      "learning_rate": 6.300733597542086e-07,
      "loss": 0.076,
      "reward": 0.3751275438349694,
      "reward_std": 0.28051954903639853,
      "rewards/accuracy_reward": 0.3751275438349694,
      "rewards/format_reward": 0.0,
      "step": 580
    },
    {
      "completion_length": 600.8526664733887,
      "epoch": 0.9121654266112668,
      "grad_norm": 0.987276324273893,
      "kl": 1.8114501953125,
      "learning_rate": 4.549593722844492e-07,
      "loss": 0.0725,
      "reward": 0.38966836063191296,
      "reward_std": 0.2809427470434457,
      "rewards/accuracy_reward": 0.38966836063191296,
      "rewards/format_reward": 0.0,
      "step": 590
    },
    {
      "completion_length": 589.8212898254394,
      "epoch": 0.9276258575707798,
      "grad_norm": 0.7872302567234748,
      "kl": 1.91683349609375,
      "learning_rate": 3.0774636389618196e-07,
      "loss": 0.0767,
      "reward": 0.4075255025178194,
      "reward_std": 0.2732542660087347,
      "rewards/accuracy_reward": 0.4075255025178194,
      "rewards/format_reward": 0.0,
      "step": 600
    },
    {
      "epoch": 0.9276258575707798,
      "eval_completion_length": 574.0087805175781,
      "eval_kl": 1.86640625,
      "eval_loss": 0.07402127236127853,
      "eval_reward": 0.39571430325508117,
      "eval_reward_std": 0.3305768924951553,
      "eval_rewards/accuracy_reward": 0.39571430325508117,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 112.1819,
      "eval_samples_per_second": 0.882,
      "eval_steps_per_second": 0.036,
      "step": 600
    },
    {
      "completion_length": 582.802791595459,
      "epoch": 0.9430862885302927,
      "grad_norm": 0.8922411277504559,
      "kl": 1.8591552734375,
      "learning_rate": 1.8886465094192895e-07,
      "loss": 0.0744,
      "reward": 0.37487244252115487,
      "reward_std": 0.2767856624443084,
      "rewards/accuracy_reward": 0.37487244252115487,
      "rewards/format_reward": 0.0,
      "step": 610
    },
    {
      "completion_length": 577.6701448440551,
      "epoch": 0.9585467194898057,
      "grad_norm": 0.6713673354373313,
      "kl": 1.8814208984375,
      "learning_rate": 9.866173494794462e-08,
      "loss": 0.0753,
      "reward": 0.3966836669947952,
      "reward_std": 0.27219036710448563,
      "rewards/accuracy_reward": 0.3966836669947952,
      "rewards/format_reward": 0.0,
      "step": 620
    },
    {
      "completion_length": 571.5952686309814,
      "epoch": 0.9740071504493187,
      "grad_norm": 1.6739477461143617,
      "kl": 1.7553955078125,
      "learning_rate": 3.7401286837214224e-08,
      "loss": 0.0702,
      "reward": 0.38431122010806573,
      "reward_std": 0.2745078310370445,
      "rewards/accuracy_reward": 0.38431122010806573,
      "rewards/format_reward": 0.0,
      "step": 630
    },
    {
      "completion_length": 575.404195022583,
      "epoch": 0.9894675814088317,
      "grad_norm": 0.5847733761169498,
      "kl": 1.7476806640625,
      "learning_rate": 5.262376196544239e-09,
      "loss": 0.0699,
      "reward": 0.37410713639110327,
      "reward_std": 0.28336770869791506,
      "rewards/accuracy_reward": 0.37410713639110327,
      "rewards/format_reward": 0.0,
      "step": 640
    },
    {
      "completion_length": 569.2087484995524,
      "epoch": 0.9987438399845395,
      "kl": 1.7266031901041667,
      "reward": 0.40284863238533336,
      "reward_std": 0.28753911207119626,
      "rewards/accuracy_reward": 0.40284863238533336,
      "rewards/format_reward": 0.0,
      "step": 646,
      "total_flos": 0.0,
      "train_loss": 0.025075771342079053,
      "train_runtime": 60646.5116,
      "train_samples_per_second": 1.194,
      "train_steps_per_second": 0.011
    }
  ],
  "logging_steps": 10,
  "max_steps": 646,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 7,
  "trial_name": null,
  "trial_params": null
}