Qwen2.5-1.5B-Open-R1-GRPO1st / trainer_state.json
ibndias's picture
Model save
6423c7f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9987438399845395,
"eval_steps": 100,
"global_step": 646,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 893.5828884124755,
"epoch": 0.015460430959512996,
"grad_norm": 0.05477782406272581,
"kl": 0.0004816532135009766,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.0,
"reward": 0.21403060818556696,
"reward_std": 0.18781698173843325,
"rewards/accuracy_reward": 0.21403060818556696,
"rewards/format_reward": 0.0,
"step": 10
},
{
"completion_length": 817.3508731842041,
"epoch": 0.03092086191902599,
"grad_norm": 0.05164180394901766,
"kl": 0.005183982849121094,
"learning_rate": 6.153846153846155e-06,
"loss": 0.0002,
"reward": 0.36135203461162746,
"reward_std": 0.1941540485713631,
"rewards/accuracy_reward": 0.36135203461162746,
"rewards/format_reward": 0.0,
"step": 20
},
{
"completion_length": 798.6872316360474,
"epoch": 0.04638129287853899,
"grad_norm": 0.054602332205036755,
"kl": 0.009011650085449218,
"learning_rate": 9.230769230769232e-06,
"loss": 0.0004,
"reward": 0.43112243991345167,
"reward_std": 0.21865551262162625,
"rewards/accuracy_reward": 0.43112243991345167,
"rewards/format_reward": 0.0,
"step": 30
},
{
"completion_length": 757.7996000289917,
"epoch": 0.06184172383805198,
"grad_norm": 0.056117459068221236,
"kl": 0.16724586486816406,
"learning_rate": 1.230769230769231e-05,
"loss": 0.0067,
"reward": 0.49477039841003717,
"reward_std": 0.20602972037158906,
"rewards/accuracy_reward": 0.49477039841003717,
"rewards/format_reward": 0.0,
"step": 40
},
{
"completion_length": 748.4751132965088,
"epoch": 0.07730215479756498,
"grad_norm": 0.06290510173535674,
"kl": 0.023724746704101563,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.0009,
"reward": 0.5140306028537452,
"reward_std": 0.2120750412810594,
"rewards/accuracy_reward": 0.5140306028537452,
"rewards/format_reward": 0.0,
"step": 50
},
{
"completion_length": 739.1664382934571,
"epoch": 0.09276258575707798,
"grad_norm": 0.05984969387434123,
"kl": 0.03551025390625,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.0014,
"reward": 0.526785704959184,
"reward_std": 0.20515552521683275,
"rewards/accuracy_reward": 0.526785704959184,
"rewards/format_reward": 0.0,
"step": 60
},
{
"completion_length": 705.585445022583,
"epoch": 0.10822301671659097,
"grad_norm": 0.14192296132745433,
"kl": 0.0682769775390625,
"learning_rate": 1.999634547413886e-05,
"loss": 0.0027,
"reward": 0.5465561124496162,
"reward_std": 0.23511593420989813,
"rewards/accuracy_reward": 0.5465561124496162,
"rewards/format_reward": 0.0,
"step": 70
},
{
"completion_length": 655.4872343063355,
"epoch": 0.12368344767610397,
"grad_norm": 0.10437024812626647,
"kl": 0.12667694091796874,
"learning_rate": 1.9967125291968495e-05,
"loss": 0.0051,
"reward": 0.5284438657574355,
"reward_std": 0.23128830096684397,
"rewards/accuracy_reward": 0.5284438657574355,
"rewards/format_reward": 0.0,
"step": 80
},
{
"completion_length": 646.3841724395752,
"epoch": 0.13914387863561697,
"grad_norm": 0.10113391204729148,
"kl": 0.186383056640625,
"learning_rate": 1.990877034074683e-05,
"loss": 0.0075,
"reward": 0.5177295829169453,
"reward_std": 0.23487156317569316,
"rewards/accuracy_reward": 0.5177295829169453,
"rewards/format_reward": 0.0,
"step": 90
},
{
"completion_length": 670.7409303665161,
"epoch": 0.15460430959512997,
"grad_norm": 0.13002298056265418,
"kl": 0.3470306396484375,
"learning_rate": 1.9821451197042028e-05,
"loss": 0.0139,
"reward": 0.5079081559553742,
"reward_std": 0.23903855890966952,
"rewards/accuracy_reward": 0.5079081559553742,
"rewards/format_reward": 0.0,
"step": 100
},
{
"epoch": 0.15460430959512997,
"eval_completion_length": 551.0136389160157,
"eval_kl": 0.2697265625,
"eval_loss": 0.011112870648503304,
"eval_reward": 0.530000029206276,
"eval_reward_std": 0.2772822642326355,
"eval_rewards/accuracy_reward": 0.530000029206276,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 110.8876,
"eval_samples_per_second": 0.893,
"eval_steps_per_second": 0.036,
"step": 100
},
{
"completion_length": 549.0640195846557,
"epoch": 0.17006474055464296,
"grad_norm": 0.28429562437188954,
"kl": 0.2885589599609375,
"learning_rate": 1.9705423102261324e-05,
"loss": 0.0115,
"reward": 0.4946428484516218,
"reward_std": 0.2506483959499747,
"rewards/accuracy_reward": 0.4946428484516218,
"rewards/format_reward": 0.0,
"step": 110
},
{
"completion_length": 594.3490919113159,
"epoch": 0.18552517151415596,
"grad_norm": 0.09104387343519704,
"kl": 0.21728363037109374,
"learning_rate": 1.956102521655831e-05,
"loss": 0.0087,
"reward": 0.517984684323892,
"reward_std": 0.2393535960931331,
"rewards/accuracy_reward": 0.517984684323892,
"rewards/format_reward": 0.0,
"step": 120
},
{
"completion_length": 731.2531753540039,
"epoch": 0.20098560247366895,
"grad_norm": 0.5351787744232117,
"kl": 0.826373291015625,
"learning_rate": 1.9388679627438486e-05,
"loss": 0.0331,
"reward": 0.3987244822550565,
"reward_std": 0.23079254203476013,
"rewards/accuracy_reward": 0.3987244822550565,
"rewards/format_reward": 0.0,
"step": 130
},
{
"completion_length": 772.120905303955,
"epoch": 0.21644603343318194,
"grad_norm": 0.08975704690138181,
"kl": 1.55367431640625,
"learning_rate": 1.9188890115960967e-05,
"loss": 0.0622,
"reward": 0.4589285627473146,
"reward_std": 0.23901810268871487,
"rewards/accuracy_reward": 0.4589285627473146,
"rewards/format_reward": 0.0,
"step": 140
},
{
"completion_length": 749.2706487655639,
"epoch": 0.23190646439269494,
"grad_norm": 0.11341391574096771,
"kl": 0.0939483642578125,
"learning_rate": 1.8962240684142923e-05,
"loss": 0.0038,
"reward": 0.49234693106263877,
"reward_std": 0.22801345195621253,
"rewards/accuracy_reward": 0.49234693106263877,
"rewards/format_reward": 0.0,
"step": 150
},
{
"completion_length": 791.6997276306153,
"epoch": 0.24736689535220793,
"grad_norm": 0.3224456998961169,
"kl": 0.3746490478515625,
"learning_rate": 1.8709393847871146e-05,
"loss": 0.015,
"reward": 0.3957908083219081,
"reward_std": 0.27026979019865394,
"rewards/accuracy_reward": 0.3957908083219081,
"rewards/format_reward": 0.0,
"step": 160
},
{
"completion_length": 817.1753665924073,
"epoch": 0.26282732631172095,
"grad_norm": 0.24361295168325228,
"kl": 0.87244873046875,
"learning_rate": 1.8431088700310846e-05,
"loss": 0.0349,
"reward": 0.29821428112918513,
"reward_std": 0.22583510340191423,
"rewards/accuracy_reward": 0.29821428112918513,
"rewards/format_reward": 0.0,
"step": 170
},
{
"completion_length": 753.5609531402588,
"epoch": 0.27828775727123395,
"grad_norm": 0.1166452373188838,
"kl": 0.372216796875,
"learning_rate": 1.8128138751472432e-05,
"loss": 0.0149,
"reward": 0.3784438705071807,
"reward_std": 0.23936128611676394,
"rewards/accuracy_reward": 0.3784438705071807,
"rewards/format_reward": 0.0,
"step": 180
},
{
"completion_length": 655.2237131118775,
"epoch": 0.29374818823074694,
"grad_norm": 0.07909899864987424,
"kl": 0.3955078125,
"learning_rate": 1.780142955025139e-05,
"loss": 0.0158,
"reward": 0.4104591752868146,
"reward_std": 0.2427055777516216,
"rewards/accuracy_reward": 0.4104591752868146,
"rewards/format_reward": 0.0,
"step": 190
},
{
"completion_length": 569.6598121643067,
"epoch": 0.30920861919025994,
"grad_norm": 0.08783730023470541,
"kl": 0.37242431640625,
"learning_rate": 1.745191609589231e-05,
"loss": 0.0149,
"reward": 0.43775509400293233,
"reward_std": 0.2702743059489876,
"rewards/accuracy_reward": 0.43775509400293233,
"rewards/format_reward": 0.0,
"step": 200
},
{
"epoch": 0.30920861919025994,
"eval_completion_length": 592.9845031738281,
"eval_kl": 0.38939453125,
"eval_loss": 0.015397945418953896,
"eval_reward": 0.4400000186264515,
"eval_reward_std": 0.28451553016901016,
"eval_rewards/accuracy_reward": 0.4400000186264515,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 113.6553,
"eval_samples_per_second": 0.871,
"eval_steps_per_second": 0.035,
"step": 200
},
{
"completion_length": 590.8179740905762,
"epoch": 0.32466905014977293,
"grad_norm": 0.4429670525609896,
"kl": 0.35711669921875,
"learning_rate": 1.7080620046443503e-05,
"loss": 0.0143,
"reward": 0.4413265222683549,
"reward_std": 0.26584886815398934,
"rewards/accuracy_reward": 0.4413265222683549,
"rewards/format_reward": 0.0,
"step": 210
},
{
"completion_length": 590.4183538436889,
"epoch": 0.3401294811092859,
"grad_norm": 6.707552360764972,
"kl": 0.8987060546875,
"learning_rate": 1.6688626732362192e-05,
"loss": 0.036,
"reward": 0.4778061142191291,
"reward_std": 0.264619250735268,
"rewards/accuracy_reward": 0.4778061142191291,
"rewards/format_reward": 0.0,
"step": 220
},
{
"completion_length": 658.2447557449341,
"epoch": 0.3555899120687989,
"grad_norm": 0.11712079199179656,
"kl": 0.389324951171875,
"learning_rate": 1.6277081983999742e-05,
"loss": 0.0156,
"reward": 0.42512754308991135,
"reward_std": 0.2852953039575368,
"rewards/accuracy_reward": 0.42512754308991135,
"rewards/format_reward": 0.0,
"step": 230
},
{
"completion_length": 632.0840436935425,
"epoch": 0.3710503430283119,
"grad_norm": 0.07151809962617796,
"kl": 0.4206787109375,
"learning_rate": 1.5847188782240473e-05,
"loss": 0.0168,
"reward": 0.4248724417295307,
"reward_std": 0.26311199530027807,
"rewards/accuracy_reward": 0.4248724417295307,
"rewards/format_reward": 0.0,
"step": 240
},
{
"completion_length": 592.4107023239136,
"epoch": 0.3865107739878249,
"grad_norm": 0.08333585854235208,
"kl": 0.354058837890625,
"learning_rate": 1.5400203742084508e-05,
"loss": 0.0142,
"reward": 0.45089285019785164,
"reward_std": 0.2697589965071529,
"rewards/accuracy_reward": 0.45089285019785164,
"rewards/format_reward": 0.0,
"step": 250
},
{
"completion_length": 613.9152961730957,
"epoch": 0.4019712049473379,
"grad_norm": 0.08634652030847269,
"kl": 0.39644775390625,
"learning_rate": 1.4937433439453465e-05,
"loss": 0.0159,
"reward": 0.42321427781134846,
"reward_std": 0.2768237174488604,
"rewards/accuracy_reward": 0.42321427781134846,
"rewards/format_reward": 0.0,
"step": 260
},
{
"completion_length": 602.9336614608765,
"epoch": 0.4174316359068509,
"grad_norm": 0.09339307778633552,
"kl": 0.427685546875,
"learning_rate": 1.4460230591956097e-05,
"loss": 0.0171,
"reward": 0.441454073600471,
"reward_std": 0.28628530018031595,
"rewards/accuracy_reward": 0.441454073600471,
"rewards/format_reward": 0.0,
"step": 270
},
{
"completion_length": 607.3401681900025,
"epoch": 0.4328920668663639,
"grad_norm": 0.09497207943977368,
"kl": 0.49241943359375,
"learning_rate": 1.3969990104777712e-05,
"loss": 0.0197,
"reward": 0.3746173400897533,
"reward_std": 0.287894579814747,
"rewards/accuracy_reward": 0.3746173400897533,
"rewards/format_reward": 0.0,
"step": 280
},
{
"completion_length": 554.3765167236328,
"epoch": 0.4483524978258769,
"grad_norm": 0.10760598840353774,
"kl": 0.4693603515625,
"learning_rate": 1.3468144993251735e-05,
"loss": 0.0188,
"reward": 0.3869897902011871,
"reward_std": 0.27005039355717597,
"rewards/accuracy_reward": 0.3869897902011871,
"rewards/format_reward": 0.0,
"step": 290
},
{
"completion_length": 565.3260089874268,
"epoch": 0.4638129287853899,
"grad_norm": 0.08708785017207243,
"kl": 0.47398681640625,
"learning_rate": 1.295616219403197e-05,
"loss": 0.019,
"reward": 0.3772959113586694,
"reward_std": 0.25850353664718567,
"rewards/accuracy_reward": 0.3772959113586694,
"rewards/format_reward": 0.0,
"step": 300
},
{
"epoch": 0.4638129287853899,
"eval_completion_length": 563.6594519042969,
"eval_kl": 0.46421875,
"eval_loss": 0.016797564923763275,
"eval_reward": 0.4000000201165676,
"eval_reward_std": 0.32052057892084124,
"eval_rewards/accuracy_reward": 0.4000000201165676,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 112.3506,
"eval_samples_per_second": 0.881,
"eval_steps_per_second": 0.036,
"step": 300
},
{
"completion_length": 561.7035604476929,
"epoch": 0.47927335974490287,
"grad_norm": 0.09043618938604044,
"kl": 0.439971923828125,
"learning_rate": 1.2435538277109919e-05,
"loss": 0.0176,
"reward": 0.38673468665219846,
"reward_std": 0.2600894993636757,
"rewards/accuracy_reward": 0.38673468665219846,
"rewards/format_reward": 0.0,
"step": 310
},
{
"completion_length": 596.5308570861816,
"epoch": 0.49473379070441587,
"grad_norm": 0.14081320533250113,
"kl": 0.438336181640625,
"learning_rate": 1.19077950712113e-05,
"loss": 0.0175,
"reward": 0.4330357064725831,
"reward_std": 0.2756219625007361,
"rewards/accuracy_reward": 0.4330357064725831,
"rewards/format_reward": 0.0,
"step": 320
},
{
"completion_length": 607.1673355102539,
"epoch": 0.5101942216639289,
"grad_norm": 0.10027936642706035,
"kl": 0.46512451171875,
"learning_rate": 1.137447521535908e-05,
"loss": 0.0186,
"reward": 0.388903054734692,
"reward_std": 0.27682310505770147,
"rewards/accuracy_reward": 0.388903054734692,
"rewards/format_reward": 0.0,
"step": 330
},
{
"completion_length": 621.5595531463623,
"epoch": 0.5256546526234419,
"grad_norm": 0.11573394826594872,
"kl": 0.507958984375,
"learning_rate": 1.0837137649606241e-05,
"loss": 0.0203,
"reward": 0.38035713706631213,
"reward_std": 0.280763331009075,
"rewards/accuracy_reward": 0.38035713706631213,
"rewards/format_reward": 0.0,
"step": 340
},
{
"completion_length": 563.3604457855224,
"epoch": 0.5411150835829549,
"grad_norm": 0.1835125826838055,
"kl": 0.409375,
"learning_rate": 1.0297353058119209e-05,
"loss": 0.0164,
"reward": 0.415816318616271,
"reward_std": 0.27372801350429654,
"rewards/accuracy_reward": 0.415816318616271,
"rewards/format_reward": 0.0,
"step": 350
},
{
"completion_length": 587.2214157104493,
"epoch": 0.5565755145424679,
"grad_norm": 0.1000658792975156,
"kl": 0.43155517578125,
"learning_rate": 9.756699277932196e-06,
"loss": 0.0173,
"reward": 0.4036989708431065,
"reward_std": 0.2821080778259784,
"rewards/accuracy_reward": 0.4036989708431065,
"rewards/format_reward": 0.0,
"step": 360
},
{
"completion_length": 595.8506248474121,
"epoch": 0.5720359455019809,
"grad_norm": 0.8763257922920367,
"kl": 0.4752197265625,
"learning_rate": 9.216756686793163e-06,
"loss": 0.019,
"reward": 0.40025509386323394,
"reward_std": 0.2795922602061182,
"rewards/accuracy_reward": 0.40025509386323394,
"rewards/format_reward": 0.0,
"step": 370
},
{
"completion_length": 590.1071298599243,
"epoch": 0.5874963764614939,
"grad_norm": 0.10389955173536307,
"kl": 0.515869140625,
"learning_rate": 8.67910358358298e-06,
"loss": 0.0206,
"reward": 0.3820152990985662,
"reward_std": 0.29924757215194403,
"rewards/accuracy_reward": 0.3820152990985662,
"rewards/format_reward": 0.0,
"step": 380
},
{
"completion_length": 559.9300924301148,
"epoch": 0.6029568074210069,
"grad_norm": 0.1070276652241689,
"kl": 0.4572998046875,
"learning_rate": 8.145311574811325e-06,
"loss": 0.0183,
"reward": 0.38852040134370325,
"reward_std": 0.2695158838760108,
"rewards/accuracy_reward": 0.38852040134370325,
"rewards/format_reward": 0.0,
"step": 390
},
{
"completion_length": 585.0901662826539,
"epoch": 0.6184172383805199,
"grad_norm": 0.1240698442943613,
"kl": 0.468115234375,
"learning_rate": 7.616940980675004e-06,
"loss": 0.0187,
"reward": 0.39999999292194843,
"reward_std": 0.2908332996070385,
"rewards/accuracy_reward": 0.39999999292194843,
"rewards/format_reward": 0.0,
"step": 400
},
{
"epoch": 0.6184172383805199,
"eval_completion_length": 579.0039282226562,
"eval_kl": 0.454453125,
"eval_loss": 0.018264248967170715,
"eval_reward": 0.4242857307195663,
"eval_reward_std": 0.3304371988773346,
"eval_rewards/accuracy_reward": 0.4242857307195663,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 112.7887,
"eval_samples_per_second": 0.878,
"eval_steps_per_second": 0.035,
"step": 400
},
{
"completion_length": 596.5103164672852,
"epoch": 0.6338776693400329,
"grad_norm": 0.12074064710856633,
"kl": 0.485009765625,
"learning_rate": 7.095536274107046e-06,
"loss": 0.0194,
"reward": 0.38278060434386135,
"reward_std": 0.295634587854147,
"rewards/accuracy_reward": 0.38278060434386135,
"rewards/format_reward": 0.0,
"step": 410
},
{
"completion_length": 598.0414419174194,
"epoch": 0.6493381002995459,
"grad_norm": 0.12233619499866348,
"kl": 0.53099365234375,
"learning_rate": 6.58262156614881e-06,
"loss": 0.0212,
"reward": 0.36505101402290163,
"reward_std": 0.2934939767234027,
"rewards/accuracy_reward": 0.36505101402290163,
"rewards/format_reward": 0.0,
"step": 420
},
{
"completion_length": 567.2344249725342,
"epoch": 0.6647985312590589,
"grad_norm": 0.14290575648298232,
"kl": 0.54254150390625,
"learning_rate": 6.079696150841634e-06,
"loss": 0.0217,
"reward": 0.3419642790220678,
"reward_std": 0.27319411835633217,
"rewards/accuracy_reward": 0.3419642790220678,
"rewards/format_reward": 0.0,
"step": 430
},
{
"completion_length": 567.8533065795898,
"epoch": 0.6802589622185718,
"grad_norm": 0.1332021338177752,
"kl": 0.55421142578125,
"learning_rate": 5.588230122660672e-06,
"loss": 0.0222,
"reward": 0.3531887697754428,
"reward_std": 0.27350661787204444,
"rewards/accuracy_reward": 0.3531887697754428,
"rewards/format_reward": 0.0,
"step": 440
},
{
"completion_length": 572.4235851287842,
"epoch": 0.6957193931780848,
"grad_norm": 0.1659377031735989,
"kl": 0.539794921875,
"learning_rate": 5.109660079301668e-06,
"loss": 0.0216,
"reward": 0.36785713671706616,
"reward_std": 0.2756811453495175,
"rewards/accuracy_reward": 0.36785713671706616,
"rewards/format_reward": 0.0,
"step": 450
},
{
"completion_length": 570.152794265747,
"epoch": 0.7111798241375978,
"grad_norm": 0.12626468054028842,
"kl": 0.51593017578125,
"learning_rate": 4.64538492238166e-06,
"loss": 0.0206,
"reward": 0.38137754444032906,
"reward_std": 0.2765601733699441,
"rewards/accuracy_reward": 0.38137754444032906,
"rewards/format_reward": 0.0,
"step": 460
},
{
"completion_length": 593.0752410888672,
"epoch": 0.7266402550971108,
"grad_norm": 0.4239556346834354,
"kl": 0.49893798828125,
"learning_rate": 4.196761768328599e-06,
"loss": 0.02,
"reward": 0.3998724427074194,
"reward_std": 0.2849443768151104,
"rewards/accuracy_reward": 0.3998724427074194,
"rewards/format_reward": 0.0,
"step": 470
},
{
"completion_length": 610.4562358856201,
"epoch": 0.7421006860566238,
"grad_norm": 0.13103766064000663,
"kl": 0.47908935546875,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.0192,
"reward": 0.39668366676196454,
"reward_std": 0.2925532532390207,
"rewards/accuracy_reward": 0.39668366676196454,
"rewards/format_reward": 0.0,
"step": 480
},
{
"completion_length": 600.7034322738648,
"epoch": 0.7575611170161368,
"grad_norm": 0.12932522707643554,
"kl": 0.51405029296875,
"learning_rate": 3.3516673405151546e-06,
"loss": 0.0206,
"reward": 0.3905612169764936,
"reward_std": 0.28836513138376174,
"rewards/accuracy_reward": 0.3905612169764936,
"rewards/format_reward": 0.0,
"step": 490
},
{
"completion_length": 591.6121044158936,
"epoch": 0.7730215479756498,
"grad_norm": 0.12487727544118296,
"kl": 0.49349365234375,
"learning_rate": 2.957666350839663e-06,
"loss": 0.0197,
"reward": 0.38303570710122586,
"reward_std": 0.28170021236874165,
"rewards/accuracy_reward": 0.38303570710122586,
"rewards/format_reward": 0.0,
"step": 500
},
{
"epoch": 0.7730215479756498,
"eval_completion_length": 594.4429809570313,
"eval_kl": 0.496015625,
"eval_loss": 0.019787130877375603,
"eval_reward": 0.3971428729593754,
"eval_reward_std": 0.3095328077673912,
"eval_rewards/accuracy_reward": 0.3971428729593754,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 113.39,
"eval_samples_per_second": 0.873,
"eval_steps_per_second": 0.035,
"step": 500
},
{
"completion_length": 599.3793228149414,
"epoch": 0.7884819789351628,
"grad_norm": 0.21375062613439136,
"kl": 0.515185546875,
"learning_rate": 2.5842507113469307e-06,
"loss": 0.0206,
"reward": 0.375255094806198,
"reward_std": 0.27221947656944395,
"rewards/accuracy_reward": 0.375255094806198,
"rewards/format_reward": 0.0,
"step": 510
},
{
"completion_length": 585.9983297348023,
"epoch": 0.8039424098946758,
"grad_norm": 0.1500149911042034,
"kl": 0.5186767578125,
"learning_rate": 2.2325119482391466e-06,
"loss": 0.0207,
"reward": 0.37665815658401697,
"reward_std": 0.26996592339128256,
"rewards/accuracy_reward": 0.37665815658401697,
"rewards/format_reward": 0.0,
"step": 520
},
{
"completion_length": 590.9803451538086,
"epoch": 0.8194028408541888,
"grad_norm": 0.46915042082176195,
"kl": 0.5627685546875,
"learning_rate": 1.9034782243345074e-06,
"loss": 0.0225,
"reward": 0.3559948909911327,
"reward_std": 0.2665759083814919,
"rewards/accuracy_reward": 0.3559948909911327,
"rewards/format_reward": 0.0,
"step": 530
},
{
"completion_length": 587.5531749725342,
"epoch": 0.8348632718137018,
"grad_norm": 0.35357251691875513,
"kl": 0.5494384765625,
"learning_rate": 1.5981113336584041e-06,
"loss": 0.022,
"reward": 0.3682397903525271,
"reward_std": 0.2814167995005846,
"rewards/accuracy_reward": 0.3682397903525271,
"rewards/format_reward": 0.0,
"step": 540
},
{
"completion_length": 567.9155498504639,
"epoch": 0.8503237027732148,
"grad_norm": 1.5999018510524907,
"kl": 0.89453125,
"learning_rate": 1.3173038900362977e-06,
"loss": 0.0358,
"reward": 0.3917091765906662,
"reward_std": 0.28189596980810167,
"rewards/accuracy_reward": 0.3917091765906662,
"rewards/format_reward": 0.0,
"step": 550
},
{
"completion_length": 558.6094276428223,
"epoch": 0.8657841337327278,
"grad_norm": 1.9953500059119544,
"kl": 1.2019775390625,
"learning_rate": 1.0618767179063416e-06,
"loss": 0.0481,
"reward": 0.3970663193613291,
"reward_std": 0.2844697198830545,
"rewards/accuracy_reward": 0.3970663193613291,
"rewards/format_reward": 0.0,
"step": 560
},
{
"completion_length": 590.6031772613526,
"epoch": 0.8812445646922408,
"grad_norm": 1.7788121098553087,
"kl": 1.5089599609375,
"learning_rate": 8.325764529785851e-07,
"loss": 0.0604,
"reward": 0.3730867275618948,
"reward_std": 0.26905877529643474,
"rewards/accuracy_reward": 0.3730867275618948,
"rewards/format_reward": 0.0,
"step": 570
},
{
"completion_length": 608.6429714202881,
"epoch": 0.8967049956517538,
"grad_norm": 23.56674292032029,
"kl": 1.8998779296875,
"learning_rate": 6.300733597542086e-07,
"loss": 0.076,
"reward": 0.3751275438349694,
"reward_std": 0.28051954903639853,
"rewards/accuracy_reward": 0.3751275438349694,
"rewards/format_reward": 0.0,
"step": 580
},
{
"completion_length": 600.8526664733887,
"epoch": 0.9121654266112668,
"grad_norm": 0.987276324273893,
"kl": 1.8114501953125,
"learning_rate": 4.549593722844492e-07,
"loss": 0.0725,
"reward": 0.38966836063191296,
"reward_std": 0.2809427470434457,
"rewards/accuracy_reward": 0.38966836063191296,
"rewards/format_reward": 0.0,
"step": 590
},
{
"completion_length": 589.8212898254394,
"epoch": 0.9276258575707798,
"grad_norm": 0.7872302567234748,
"kl": 1.91683349609375,
"learning_rate": 3.0774636389618196e-07,
"loss": 0.0767,
"reward": 0.4075255025178194,
"reward_std": 0.2732542660087347,
"rewards/accuracy_reward": 0.4075255025178194,
"rewards/format_reward": 0.0,
"step": 600
},
{
"epoch": 0.9276258575707798,
"eval_completion_length": 574.0087805175781,
"eval_kl": 1.86640625,
"eval_loss": 0.07402127236127853,
"eval_reward": 0.39571430325508117,
"eval_reward_std": 0.3305768924951553,
"eval_rewards/accuracy_reward": 0.39571430325508117,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 112.1819,
"eval_samples_per_second": 0.882,
"eval_steps_per_second": 0.036,
"step": 600
},
{
"completion_length": 582.802791595459,
"epoch": 0.9430862885302927,
"grad_norm": 0.8922411277504559,
"kl": 1.8591552734375,
"learning_rate": 1.8886465094192895e-07,
"loss": 0.0744,
"reward": 0.37487244252115487,
"reward_std": 0.2767856624443084,
"rewards/accuracy_reward": 0.37487244252115487,
"rewards/format_reward": 0.0,
"step": 610
},
{
"completion_length": 577.6701448440551,
"epoch": 0.9585467194898057,
"grad_norm": 0.6713673354373313,
"kl": 1.8814208984375,
"learning_rate": 9.866173494794462e-08,
"loss": 0.0753,
"reward": 0.3966836669947952,
"reward_std": 0.27219036710448563,
"rewards/accuracy_reward": 0.3966836669947952,
"rewards/format_reward": 0.0,
"step": 620
},
{
"completion_length": 571.5952686309814,
"epoch": 0.9740071504493187,
"grad_norm": 1.6739477461143617,
"kl": 1.7553955078125,
"learning_rate": 3.7401286837214224e-08,
"loss": 0.0702,
"reward": 0.38431122010806573,
"reward_std": 0.2745078310370445,
"rewards/accuracy_reward": 0.38431122010806573,
"rewards/format_reward": 0.0,
"step": 630
},
{
"completion_length": 575.404195022583,
"epoch": 0.9894675814088317,
"grad_norm": 0.5847733761169498,
"kl": 1.7476806640625,
"learning_rate": 5.262376196544239e-09,
"loss": 0.0699,
"reward": 0.37410713639110327,
"reward_std": 0.28336770869791506,
"rewards/accuracy_reward": 0.37410713639110327,
"rewards/format_reward": 0.0,
"step": 640
},
{
"completion_length": 569.2087484995524,
"epoch": 0.9987438399845395,
"kl": 1.7266031901041667,
"reward": 0.40284863238533336,
"reward_std": 0.28753911207119626,
"rewards/accuracy_reward": 0.40284863238533336,
"rewards/format_reward": 0.0,
"step": 646,
"total_flos": 0.0,
"train_loss": 0.025075771342079053,
"train_runtime": 60646.5116,
"train_samples_per_second": 1.194,
"train_steps_per_second": 0.011
}
],
"logging_steps": 10,
"max_steps": 646,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 7,
"trial_name": null,
"trial_params": null
}