PEFT
Safetensors
English
j1-nano-0.6B / trainer_state.json
leonardtang's picture
Upload folder using huggingface_hub
1d4dd3e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.037392884964944,
"eval_steps": 1000,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 775.203125,
"epoch": 0.002077382498052454,
"grad_norm": 0.16490910947322845,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 0.2167968787252903,
"reward_std": 0.11324757407419384,
"rewards/argmax_reward_func": 0.0625,
"rewards/format_reward_func": 0.154296875,
"step": 1
},
{
"completion_length": 820.609375,
"epoch": 0.004154764996104908,
"grad_norm": 0.15733271837234497,
"kl": 0.0,
"learning_rate": 2e-05,
"loss": 0.0,
"reward": 0.1472656298428774,
"reward_std": 0.020439805870410055,
"rewards/argmax_reward_func": 0.0,
"rewards/format_reward_func": 0.14726562798023224,
"step": 2
},
{
"completion_length": 901.25,
"epoch": 0.006232147494157362,
"grad_norm": 0.14691142737865448,
"kl": 0.0010660013067536056,
"learning_rate": 4e-05,
"loss": 0.0,
"reward": 0.20703125,
"reward_std": 0.11269514623563737,
"rewards/argmax_reward_func": 0.0625,
"rewards/format_reward_func": 0.14453125,
"step": 3
},
{
"completion_length": 873.015625,
"epoch": 0.008309529992209816,
"grad_norm": 0.14998185634613037,
"kl": 0.0019050340633839369,
"learning_rate": 6e-05,
"loss": 0.0,
"reward": 0.2011718824505806,
"reward_std": 0.09004563023336232,
"rewards/argmax_reward_func": 0.046875,
"rewards/format_reward_func": 0.1542968787252903,
"step": 4
},
{
"completion_length": 870.546875,
"epoch": 0.01038691249026227,
"grad_norm": 0.1567591279745102,
"kl": 0.005349995743017644,
"learning_rate": 8e-05,
"loss": 0.0,
"reward": 0.2285156361758709,
"reward_std": 0.1110378596931696,
"rewards/argmax_reward_func": 0.0625,
"rewards/format_reward_func": 0.1660156287252903,
"step": 5
},
{
"completion_length": 849.125,
"epoch": 0.012464294988314724,
"grad_norm": 0.10938515514135361,
"kl": 0.01296996301971376,
"learning_rate": 0.0001,
"loss": 0.0,
"reward": 0.24414063058793545,
"reward_std": 0.0999893163680099,
"rewards/argmax_reward_func": 0.0625,
"rewards/format_reward_func": 0.1816406324505806,
"step": 6
},
{
"completion_length": 901.015625,
"epoch": 0.014541677486367177,
"grad_norm": 0.12581659853458405,
"kl": 0.02171943092253059,
"learning_rate": 9.999973058889791e-05,
"loss": 0.0,
"reward": 0.2585937548428774,
"reward_std": 0.12816310487687588,
"rewards/argmax_reward_func": 0.078125,
"rewards/format_reward_func": 0.18046875298023224,
"step": 7
},
{
"completion_length": 916.671875,
"epoch": 0.01661905998441963,
"grad_norm": 0.12178487330675125,
"kl": 0.04081101668998599,
"learning_rate": 9.999892235849491e-05,
"loss": 0.0,
"reward": 0.3437500111758709,
"reward_std": 0.1900349531788379,
"rewards/argmax_reward_func": 0.15625,
"rewards/format_reward_func": 0.1875,
"step": 8
},
{
"completion_length": 803.90625,
"epoch": 0.018696442482472084,
"grad_norm": 0.12499672174453735,
"kl": 0.06826442573219538,
"learning_rate": 9.999757531750085e-05,
"loss": 0.0,
"reward": 0.45625001564621925,
"reward_std": 0.25411650398746133,
"rewards/argmax_reward_func": 0.265625,
"rewards/format_reward_func": 0.1906250026077032,
"step": 9
},
{
"completion_length": 953.875,
"epoch": 0.02077382498052454,
"grad_norm": 0.11061865091323853,
"kl": 0.06516677932813764,
"learning_rate": 9.999568948043205e-05,
"loss": 0.0,
"reward": 0.3804687615483999,
"reward_std": 0.23091456340625882,
"rewards/argmax_reward_func": 0.1875,
"rewards/format_reward_func": 0.19296875037252903,
"step": 10
},
{
"completion_length": 824.546875,
"epoch": 0.022851207478576992,
"grad_norm": 0.10025237500667572,
"kl": 0.10202133795246482,
"learning_rate": 9.999326486761114e-05,
"loss": 0.0001,
"reward": 0.4562500212341547,
"reward_std": 0.203293202444911,
"rewards/argmax_reward_func": 0.265625,
"rewards/format_reward_func": 0.1906250026077032,
"step": 11
},
{
"completion_length": 925.234375,
"epoch": 0.02492858997662945,
"grad_norm": 0.12423845380544662,
"kl": 0.14641187246888876,
"learning_rate": 9.99903015051668e-05,
"loss": 0.0001,
"reward": 0.6261719018220901,
"reward_std": 0.22925727342953905,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.1886718738824129,
"step": 12
},
{
"completion_length": 810.546875,
"epoch": 0.0270059724746819,
"grad_norm": 0.1263190507888794,
"kl": 0.23557536769658327,
"learning_rate": 9.998679942503358e-05,
"loss": 0.0001,
"reward": 0.5953125320374966,
"reward_std": 0.2717941626906395,
"rewards/argmax_reward_func": 0.40625,
"rewards/format_reward_func": 0.18906250409781933,
"step": 13
},
{
"completion_length": 738.953125,
"epoch": 0.029083354972734354,
"grad_norm": 0.09476204961538315,
"kl": 0.29587008990347385,
"learning_rate": 9.998275866495138e-05,
"loss": 0.0001,
"reward": 0.7289062887430191,
"reward_std": 0.18009126000106335,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.19765625335276127,
"step": 14
},
{
"completion_length": 699.109375,
"epoch": 0.03116073747078681,
"grad_norm": 0.15413929522037506,
"kl": 0.2644388508051634,
"learning_rate": 9.997817926846529e-05,
"loss": 0.0001,
"reward": 0.6968750357627869,
"reward_std": 0.4021669775247574,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.19687500223517418,
"step": 15
},
{
"completion_length": 716.4375,
"epoch": 0.03323811996883926,
"grad_norm": 0.13675570487976074,
"kl": 0.41714945435523987,
"learning_rate": 9.99730612849249e-05,
"loss": 0.0002,
"reward": 0.6320312805473804,
"reward_std": 0.27289901627227664,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.19453125074505806,
"step": 16
},
{
"completion_length": 688.609375,
"epoch": 0.03531550246689172,
"grad_norm": 0.14246560633182526,
"kl": 0.35029047913849354,
"learning_rate": 9.996740476948385e-05,
"loss": 0.0002,
"reward": 0.6304687857627869,
"reward_std": 0.31930290907621384,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.19296875223517418,
"step": 17
},
{
"completion_length": 630.828125,
"epoch": 0.03739288496494417,
"grad_norm": 2.1836376190185547,
"kl": 10.275608837604523,
"learning_rate": 9.996120978309931e-05,
"loss": 0.0051,
"reward": 0.5742187947034836,
"reward_std": 0.39885240606963634,
"rewards/argmax_reward_func": 0.375,
"rewards/format_reward_func": 0.1992187537252903,
"step": 18
},
{
"completion_length": 647.890625,
"epoch": 0.039470267462996624,
"grad_norm": 0.1236676499247551,
"kl": 0.366399560123682,
"learning_rate": 9.995447639253115e-05,
"loss": 0.0002,
"reward": 0.7765625417232513,
"reward_std": 0.2894718423485756,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19843750074505806,
"step": 19
},
{
"completion_length": 567.578125,
"epoch": 0.04154764996104908,
"grad_norm": 0.12248539924621582,
"kl": 0.28800770081579685,
"learning_rate": 9.994720467034142e-05,
"loss": 0.0001,
"reward": 0.807812537997961,
"reward_std": 0.24527766928076744,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.1984375026077032,
"step": 20
},
{
"completion_length": 538.796875,
"epoch": 0.04362503245910153,
"grad_norm": 0.14135704934597015,
"kl": 0.5838185884058475,
"learning_rate": 9.993939469489342e-05,
"loss": 0.0003,
"reward": 0.6835937947034836,
"reward_std": 0.24417280592024326,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.19921875186264515,
"step": 21
},
{
"completion_length": 534.40625,
"epoch": 0.045702414957153985,
"grad_norm": 0.16315752267837524,
"kl": 0.37346063926815987,
"learning_rate": 9.993104655035088e-05,
"loss": 0.0002,
"reward": 0.6835937909781933,
"reward_std": 0.37675532698631287,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.1992187537252903,
"step": 22
},
{
"completion_length": 583.4375,
"epoch": 0.04777979745520644,
"grad_norm": 0.13479600846767426,
"kl": 0.5061899088323116,
"learning_rate": 9.992216032667716e-05,
"loss": 0.0003,
"reward": 0.5878906548023224,
"reward_std": 0.2911291141062975,
"rewards/argmax_reward_func": 0.390625,
"rewards/format_reward_func": 0.19726562686264515,
"step": 23
},
{
"completion_length": 532.203125,
"epoch": 0.0498571799532589,
"grad_norm": 0.12497097253799438,
"kl": 0.5098075568675995,
"learning_rate": 9.991273611963412e-05,
"loss": 0.0003,
"reward": 0.8250000476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 24
},
{
"completion_length": 503.4375,
"epoch": 0.051934562451311346,
"grad_norm": 0.153394415974617,
"kl": 0.35232703387737274,
"learning_rate": 9.990277403078122e-05,
"loss": 0.0002,
"reward": 0.7156250439584255,
"reward_std": 0.3314562924206257,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 25
},
{
"completion_length": 491.25,
"epoch": 0.0540119449493638,
"grad_norm": 0.15910868346691132,
"kl": 0.41421468555927277,
"learning_rate": 9.989227416747434e-05,
"loss": 0.0002,
"reward": 0.7625000476837158,
"reward_std": 0.35355337895452976,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 26
},
{
"completion_length": 485.390625,
"epoch": 0.05608932744741626,
"grad_norm": 34.968101501464844,
"kl": 504.1205723620951,
"learning_rate": 9.988123664286469e-05,
"loss": 0.2521,
"reward": 0.6375000439584255,
"reward_std": 0.39774755388498306,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 27
},
{
"completion_length": 543.859375,
"epoch": 0.05816670994546871,
"grad_norm": 0.1266699880361557,
"kl": 0.3936588950455189,
"learning_rate": 9.98696615758975e-05,
"loss": 0.0002,
"reward": 0.7296875417232513,
"reward_std": 0.2673747483640909,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.1984375026077032,
"step": 28
},
{
"completion_length": 520.84375,
"epoch": 0.06024409244352116,
"grad_norm": 0.1384185552597046,
"kl": 0.41820336878299713,
"learning_rate": 9.985754909131085e-05,
"loss": 0.0002,
"reward": 0.6523437947034836,
"reward_std": 0.2883669789880514,
"rewards/argmax_reward_func": 0.453125,
"rewards/format_reward_func": 0.19921875186264515,
"step": 29
},
{
"completion_length": 566.515625,
"epoch": 0.06232147494157362,
"grad_norm": 0.12411382049322128,
"kl": 0.37708618491888046,
"learning_rate": 9.984489931963428e-05,
"loss": 0.0002,
"reward": 0.7304687909781933,
"reward_std": 0.26626989617943764,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.1992187537252903,
"step": 30
},
{
"completion_length": 557.71875,
"epoch": 0.06439885743962608,
"grad_norm": 0.1417299211025238,
"kl": 0.5239567384123802,
"learning_rate": 9.98317123971873e-05,
"loss": 0.0003,
"reward": 0.5867187865078449,
"reward_std": 0.33698057383298874,
"rewards/argmax_reward_func": 0.390625,
"rewards/format_reward_func": 0.19609375298023224,
"step": 31
},
{
"completion_length": 571.46875,
"epoch": 0.06647623993767852,
"grad_norm": 0.12581190466880798,
"kl": 0.3905966766178608,
"learning_rate": 9.981798846607808e-05,
"loss": 0.0002,
"reward": 0.8238281756639481,
"reward_std": 0.31101649068295956,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19882812909781933,
"step": 32
},
{
"completion_length": 617.171875,
"epoch": 0.06855362243573097,
"grad_norm": 0.119756318628788,
"kl": 0.37264879420399666,
"learning_rate": 9.980372767420177e-05,
"loss": 0.0002,
"reward": 0.6210937835276127,
"reward_std": 0.2883669789880514,
"rewards/argmax_reward_func": 0.421875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 33
},
{
"completion_length": 604.265625,
"epoch": 0.07063100493378344,
"grad_norm": 0.12877410650253296,
"kl": 0.4410099685192108,
"learning_rate": 9.978893017523903e-05,
"loss": 0.0002,
"reward": 0.6687500476837158,
"reward_std": 0.3535533808171749,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 34
},
{
"completion_length": 664.875,
"epoch": 0.07270838743183589,
"grad_norm": 0.10366171598434448,
"kl": 0.5219907499849796,
"learning_rate": 9.977359612865423e-05,
"loss": 0.0003,
"reward": 0.726562537252903,
"reward_std": 0.26958445459604263,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.1953125037252903,
"step": 35
},
{
"completion_length": 658.515625,
"epoch": 0.07478576992988833,
"grad_norm": 0.1170380637049675,
"kl": 0.46299856156110764,
"learning_rate": 9.97577256996939e-05,
"loss": 0.0002,
"reward": 0.8226562887430191,
"reward_std": 0.3126737759448588,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19765625335276127,
"step": 36
},
{
"completion_length": 621.890625,
"epoch": 0.0768631524279408,
"grad_norm": 0.09539435803890228,
"kl": 0.39572376012802124,
"learning_rate": 9.974131905938483e-05,
"loss": 0.0002,
"reward": 0.851562537252903,
"reward_std": 0.18119611439760774,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.1953125037252903,
"step": 37
},
{
"completion_length": 616.671875,
"epoch": 0.07894053492599325,
"grad_norm": 0.1066877692937851,
"kl": 0.3970871977508068,
"learning_rate": 9.972437638453227e-05,
"loss": 0.0002,
"reward": 0.5734375342726707,
"reward_std": 0.2673747483640909,
"rewards/argmax_reward_func": 0.375,
"rewards/format_reward_func": 0.1984375026077032,
"step": 38
},
{
"completion_length": 620.515625,
"epoch": 0.0810179174240457,
"grad_norm": 0.09872303903102875,
"kl": 0.4494887478649616,
"learning_rate": 9.970689785771798e-05,
"loss": 0.0002,
"reward": 0.8539062887430191,
"reward_std": 0.22428543493151665,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19765625521540642,
"step": 39
},
{
"completion_length": 646.25,
"epoch": 0.08309529992209816,
"grad_norm": 0.10916193574666977,
"kl": 0.3997967578470707,
"learning_rate": 9.968888366729835e-05,
"loss": 0.0002,
"reward": 0.7867187820374966,
"reward_std": 0.31156892515718937,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19296875223517418,
"step": 40
},
{
"completion_length": 636.578125,
"epoch": 0.08517268242015061,
"grad_norm": 0.109690822660923,
"kl": 0.4680747017264366,
"learning_rate": 9.967033400740227e-05,
"loss": 0.0002,
"reward": 0.7125000357627869,
"reward_std": 0.2916815411299467,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.19687500409781933,
"step": 41
},
{
"completion_length": 633.8125,
"epoch": 0.08725006491820306,
"grad_norm": 0.11978733539581299,
"kl": 0.42677244916558266,
"learning_rate": 9.965124907792915e-05,
"loss": 0.0002,
"reward": 0.6804687902331352,
"reward_std": 0.3369805682450533,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.19609375298023224,
"step": 42
},
{
"completion_length": 641.328125,
"epoch": 0.08932744741625552,
"grad_norm": 0.10323718935251236,
"kl": 0.4870793893933296,
"learning_rate": 9.963162908454664e-05,
"loss": 0.0002,
"reward": 0.6820312812924385,
"reward_std": 0.2905766926705837,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.19765625335276127,
"step": 43
},
{
"completion_length": 672.90625,
"epoch": 0.09140482991430797,
"grad_norm": 0.10177203267812729,
"kl": 0.4337821826338768,
"learning_rate": 9.96114742386885e-05,
"loss": 0.0002,
"reward": 0.7445312850177288,
"reward_std": 0.3347708657383919,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.19765625335276127,
"step": 44
},
{
"completion_length": 639.203125,
"epoch": 0.09348221241236043,
"grad_norm": 0.07864588499069214,
"kl": 0.6409419141709805,
"learning_rate": 9.95907847575523e-05,
"loss": 0.0003,
"reward": 0.6367187835276127,
"reward_std": 0.17788154818117619,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.1992187537252903,
"step": 45
},
{
"completion_length": 662.859375,
"epoch": 0.09555959491041288,
"grad_norm": 0.1133044883608818,
"kl": 0.49651604518294334,
"learning_rate": 9.95695608640971e-05,
"loss": 0.0002,
"reward": 0.6664062887430191,
"reward_std": 0.31267377361655235,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.19765624962747097,
"step": 46
},
{
"completion_length": 615.1875,
"epoch": 0.09763697740846533,
"grad_norm": 0.09465198963880539,
"kl": 0.42672090977430344,
"learning_rate": 9.954780278704097e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.2651650384068489,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 47
},
{
"completion_length": 624.234375,
"epoch": 0.0997143599065178,
"grad_norm": 0.12003368884325027,
"kl": 0.45439790561795235,
"learning_rate": 9.952551076085864e-05,
"loss": 0.0002,
"reward": 0.6531250402331352,
"reward_std": 0.375650467351079,
"rewards/argmax_reward_func": 0.453125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 48
},
{
"completion_length": 657.6875,
"epoch": 0.10179174240457024,
"grad_norm": 0.10603732615709305,
"kl": 0.4523283280432224,
"learning_rate": 9.950268502577884e-05,
"loss": 0.0002,
"reward": 0.823437537997961,
"reward_std": 0.3115689232945442,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19843750074505806,
"step": 49
},
{
"completion_length": 611.640625,
"epoch": 0.10386912490262269,
"grad_norm": 0.11261381953954697,
"kl": 0.5920008532702923,
"learning_rate": 9.947932582778188e-05,
"loss": 0.0003,
"reward": 0.7765625417232513,
"reward_std": 0.33366600796580315,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.1984375026077032,
"step": 50
},
{
"completion_length": 662.359375,
"epoch": 0.10594650740067516,
"grad_norm": 0.11194069683551788,
"kl": 0.40367136895656586,
"learning_rate": 9.94554334185968e-05,
"loss": 0.0002,
"reward": 0.6375000365078449,
"reward_std": 0.35355338267982006,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 51
},
{
"completion_length": 686.265625,
"epoch": 0.1080238898987276,
"grad_norm": 0.10016939043998718,
"kl": 0.429857462644577,
"learning_rate": 9.943100805569887e-05,
"loss": 0.0002,
"reward": 0.7468750476837158,
"reward_std": 0.33145629055798054,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 52
},
{
"completion_length": 613.890625,
"epoch": 0.11010127239678005,
"grad_norm": 0.11434896290302277,
"kl": 0.41631242260336876,
"learning_rate": 9.94060500023066e-05,
"loss": 0.0002,
"reward": 0.620312537997961,
"reward_std": 0.333666006103158,
"rewards/argmax_reward_func": 0.421875,
"rewards/format_reward_func": 0.1984375026077032,
"step": 53
},
{
"completion_length": 671.75,
"epoch": 0.11217865489483252,
"grad_norm": 0.0824907198548317,
"kl": 0.4247642531991005,
"learning_rate": 9.938055952737907e-05,
"loss": 0.0002,
"reward": 0.7156250476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 54
},
{
"completion_length": 711.078125,
"epoch": 0.11425603739288497,
"grad_norm": 0.09910566359758377,
"kl": 0.46975456923246384,
"learning_rate": 9.935453690561297e-05,
"loss": 0.0002,
"reward": 0.7906250469386578,
"reward_std": 0.3137786239385605,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 55
},
{
"completion_length": 659.5625,
"epoch": 0.11633341989093741,
"grad_norm": 0.0940733402967453,
"kl": 0.42034388333559036,
"learning_rate": 9.932798241743961e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.2872621212154627,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 56
},
{
"completion_length": 694.796875,
"epoch": 0.11841080238898988,
"grad_norm": 0.20298048853874207,
"kl": 1.01119814068079,
"learning_rate": 9.930089634902197e-05,
"loss": 0.0005,
"reward": 0.714062537997961,
"reward_std": 0.28947182931005955,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.19843750447034836,
"step": 57
},
{
"completion_length": 679.421875,
"epoch": 0.12048818488704233,
"grad_norm": 0.10724397003650665,
"kl": 0.45981432124972343,
"learning_rate": 9.927327899225151e-05,
"loss": 0.0002,
"reward": 0.7156250476837158,
"reward_std": 0.375650467351079,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 58
},
{
"completion_length": 666.390625,
"epoch": 0.12256556738509478,
"grad_norm": 0.09369952231645584,
"kl": 0.5710588954389095,
"learning_rate": 9.924513064474519e-05,
"loss": 0.0003,
"reward": 0.8085937909781933,
"reward_std": 0.24417280405759811,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19921875186264515,
"step": 59
},
{
"completion_length": 667.3125,
"epoch": 0.12464294988314724,
"grad_norm": 0.10410826653242111,
"kl": 0.6697803623974323,
"learning_rate": 9.921645160984206e-05,
"loss": 0.0003,
"reward": 0.7625000476837158,
"reward_std": 0.35355337895452976,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 60
},
{
"completion_length": 694.453125,
"epoch": 0.1267203323811997,
"grad_norm": 0.10938889533281326,
"kl": 0.42841707170009613,
"learning_rate": 9.918724219660013e-05,
"loss": 0.0002,
"reward": 0.7781250476837158,
"reward_std": 0.3756504636257887,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 61
},
{
"completion_length": 767.796875,
"epoch": 0.12879771487925215,
"grad_norm": 0.08572812378406525,
"kl": 0.42277197539806366,
"learning_rate": 9.915750271979305e-05,
"loss": 0.0002,
"reward": 0.6843750402331352,
"reward_std": 0.28726212307810783,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 62
},
{
"completion_length": 746.703125,
"epoch": 0.1308750973773046,
"grad_norm": 0.08672405034303665,
"kl": 0.4743144288659096,
"learning_rate": 9.91272334999066e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.2872621212154627,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 63
},
{
"completion_length": 785.140625,
"epoch": 0.13295247987535705,
"grad_norm": 0.07892299443483353,
"kl": 0.5303685143589973,
"learning_rate": 9.909643486313533e-05,
"loss": 0.0003,
"reward": 0.7312500402331352,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 64
},
{
"completion_length": 816.015625,
"epoch": 0.1350298623734095,
"grad_norm": 0.071454256772995,
"kl": 0.39743437245488167,
"learning_rate": 9.906510714137905e-05,
"loss": 0.0002,
"reward": 0.6218750402331352,
"reward_std": 0.24306794628500938,
"rewards/argmax_reward_func": 0.421875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 65
},
{
"completion_length": 836.4375,
"epoch": 0.13710724487146195,
"grad_norm": 0.08313830941915512,
"kl": 0.3903077654540539,
"learning_rate": 9.903325067223919e-05,
"loss": 0.0002,
"reward": 0.6367187909781933,
"reward_std": 0.31046406738460064,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.1992187537252903,
"step": 66
},
{
"completion_length": 787.484375,
"epoch": 0.13918462736951442,
"grad_norm": 0.08504212647676468,
"kl": 0.5619952343404293,
"learning_rate": 9.90008657990152e-05,
"loss": 0.0003,
"reward": 0.7464844211935997,
"reward_std": 0.28781455010175705,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.19960937649011612,
"step": 67
},
{
"completion_length": 807.921875,
"epoch": 0.14126200986756687,
"grad_norm": 0.08398205786943436,
"kl": 0.47908810153603554,
"learning_rate": 9.896795287070086e-05,
"loss": 0.0002,
"reward": 0.7468750476837158,
"reward_std": 0.331456296145916,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 68
},
{
"completion_length": 837.234375,
"epoch": 0.14333939236561932,
"grad_norm": 0.054244451224803925,
"kl": 0.39820099994540215,
"learning_rate": 9.893451224198052e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 69
},
{
"completion_length": 910.0625,
"epoch": 0.14541677486367177,
"grad_norm": 0.08078251034021378,
"kl": 0.4756108485162258,
"learning_rate": 9.890054427322521e-05,
"loss": 0.0002,
"reward": 0.7781250439584255,
"reward_std": 0.331456296145916,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 70
},
{
"completion_length": 867.84375,
"epoch": 0.14749415736172422,
"grad_norm": 0.08043571561574936,
"kl": 0.3970469869673252,
"learning_rate": 9.886604933048888e-05,
"loss": 0.0002,
"reward": 0.6679687947034836,
"reward_std": 0.3104640601668507,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 71
},
{
"completion_length": 882.265625,
"epoch": 0.14957153985977667,
"grad_norm": 0.09208390861749649,
"kl": 0.40190327540040016,
"learning_rate": 9.883102778550434e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.3977475520223379,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 72
},
{
"completion_length": 889.78125,
"epoch": 0.15164892235782915,
"grad_norm": 0.09202940762042999,
"kl": 0.38338571041822433,
"learning_rate": 9.879548001567931e-05,
"loss": 0.0002,
"reward": 0.7000000476837158,
"reward_std": 0.4419417232275009,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.20000000298023224,
"step": 73
},
{
"completion_length": 942.609375,
"epoch": 0.1537263048558816,
"grad_norm": 0.06312800943851471,
"kl": 0.4080694951117039,
"learning_rate": 9.875940640409234e-05,
"loss": 0.0002,
"reward": 0.5750000402331352,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 74
},
{
"completion_length": 948.859375,
"epoch": 0.15580368735393405,
"grad_norm": 0.0712570995092392,
"kl": 0.4405221752822399,
"learning_rate": 9.872280733948867e-05,
"loss": 0.0002,
"reward": 0.8085937947034836,
"reward_std": 0.2883669827133417,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19921875186264515,
"step": 75
},
{
"completion_length": 1053.21875,
"epoch": 0.1578810698519865,
"grad_norm": 0.05858299508690834,
"kl": 0.4397047348320484,
"learning_rate": 9.868568321627611e-05,
"loss": 0.0002,
"reward": 0.7000000383704901,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.20000000298023224,
"step": 76
},
{
"completion_length": 1019.96875,
"epoch": 0.15995845235003894,
"grad_norm": 0.07670939713716507,
"kl": 0.40835118666291237,
"learning_rate": 9.86480344345207e-05,
"loss": 0.0002,
"reward": 0.7781250439584255,
"reward_std": 0.33145629800856113,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 77
},
{
"completion_length": 1075.15625,
"epoch": 0.1620358348480914,
"grad_norm": 0.06651510298252106,
"kl": 0.42486657947301865,
"learning_rate": 9.860986139994239e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.28726211935281754,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 78
},
{
"completion_length": 1096.828125,
"epoch": 0.16411321734614387,
"grad_norm": 0.06264790147542953,
"kl": 0.3813174143433571,
"learning_rate": 9.857116452391079e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 79
},
{
"completion_length": 1159.390625,
"epoch": 0.16619059984419632,
"grad_norm": 0.06721258908510208,
"kl": 0.41810835897922516,
"learning_rate": 9.85319442234406e-05,
"loss": 0.0002,
"reward": 0.7617187947034836,
"reward_std": 0.3104640692472458,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.1992187537252903,
"step": 80
},
{
"completion_length": 1207.40625,
"epoch": 0.16826798234224877,
"grad_norm": 0.07961631566286087,
"kl": 0.353565227240324,
"learning_rate": 9.84922009211872e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.4419417269527912,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 81
},
{
"completion_length": 1267.5625,
"epoch": 0.17034536484030122,
"grad_norm": 0.06159353628754616,
"kl": 0.3608316369354725,
"learning_rate": 9.845193504544209e-05,
"loss": 0.0002,
"reward": 0.6218750365078449,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.421875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 82
},
{
"completion_length": 1271.1875,
"epoch": 0.17242274733835367,
"grad_norm": 0.0616268515586853,
"kl": 0.3721548244357109,
"learning_rate": 9.841114703012817e-05,
"loss": 0.0002,
"reward": 0.7613281682133675,
"reward_std": 0.26682231575250626,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.19882812723517418,
"step": 83
},
{
"completion_length": 1197.84375,
"epoch": 0.17450012983640611,
"grad_norm": 0.06743966042995453,
"kl": 0.46105678752064705,
"learning_rate": 9.836983731479525e-05,
"loss": 0.0002,
"reward": 0.7625000476837158,
"reward_std": 0.30935920774936676,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 84
},
{
"completion_length": 1239.859375,
"epoch": 0.1765775123344586,
"grad_norm": 0.07362944632768631,
"kl": 0.35114892572164536,
"learning_rate": 9.832800634461518e-05,
"loss": 0.0002,
"reward": 0.6828125417232513,
"reward_std": 0.3336660098284483,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.19843750074505806,
"step": 85
},
{
"completion_length": 1253.21875,
"epoch": 0.17865489483251104,
"grad_norm": 0.060973405838012695,
"kl": 0.3400215059518814,
"learning_rate": 9.828565457037703e-05,
"loss": 0.0002,
"reward": 0.7613281719386578,
"reward_std": 0.2668223213404417,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.19882812723517418,
"step": 86
},
{
"completion_length": 1251.828125,
"epoch": 0.1807322773305635,
"grad_norm": 0.06071100011467934,
"kl": 0.3388819098472595,
"learning_rate": 9.824278244848235e-05,
"loss": 0.0002,
"reward": 0.6843750402331352,
"reward_std": 0.28726212307810783,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 87
},
{
"completion_length": 1177.9375,
"epoch": 0.18280965982861594,
"grad_norm": 0.07785635441541672,
"kl": 0.39376673474907875,
"learning_rate": 9.819939044094016e-05,
"loss": 0.0002,
"reward": 0.6687500476837158,
"reward_std": 0.3977475520223379,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 88
},
{
"completion_length": 1312.515625,
"epoch": 0.1848870423266684,
"grad_norm": 0.06982032209634781,
"kl": 0.3353493846952915,
"learning_rate": 9.815547901536201e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.33145629428327084,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 89
},
{
"completion_length": 1360.75,
"epoch": 0.18696442482472087,
"grad_norm": 0.06107737869024277,
"kl": 0.45528167858719826,
"learning_rate": 9.811104864495691e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.24306794814765453,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 90
},
{
"completion_length": 1353.296875,
"epoch": 0.18904180732277331,
"grad_norm": 0.06465540081262589,
"kl": 0.353522464632988,
"learning_rate": 9.806609980852628e-05,
"loss": 0.0002,
"reward": 0.8046875409781933,
"reward_std": 0.2938912510871887,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19531250186264515,
"step": 91
},
{
"completion_length": 1441.984375,
"epoch": 0.19111918982082576,
"grad_norm": 0.0610247403383255,
"kl": 0.36326174437999725,
"learning_rate": 9.802063299045873e-05,
"loss": 0.0002,
"reward": 0.7468750402331352,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 92
},
{
"completion_length": 1441.265625,
"epoch": 0.1931965723188782,
"grad_norm": 0.05115514621138573,
"kl": 0.411540150642395,
"learning_rate": 9.797464868072488e-05,
"loss": 0.0002,
"reward": 0.6812500357627869,
"reward_std": 0.2032931987196207,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 93
},
{
"completion_length": 1397.03125,
"epoch": 0.19527395481693066,
"grad_norm": 0.053147751837968826,
"kl": 0.4489905573427677,
"learning_rate": 9.792814737487207e-05,
"loss": 0.0002,
"reward": 0.7937500439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 94
},
{
"completion_length": 1407.515625,
"epoch": 0.1973513373149831,
"grad_norm": 0.0552426278591156,
"kl": 0.3701773174107075,
"learning_rate": 9.788112957401903e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 95
},
{
"completion_length": 1505.546875,
"epoch": 0.1994287198130356,
"grad_norm": 0.05075477808713913,
"kl": 0.39650479704141617,
"learning_rate": 9.783359578485047e-05,
"loss": 0.0002,
"reward": 0.8855469226837158,
"reward_std": 0.17953883111476898,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19804687798023224,
"step": 96
},
{
"completion_length": 1542.90625,
"epoch": 0.20150610231108804,
"grad_norm": 0.053789589554071426,
"kl": 0.35163769498467445,
"learning_rate": 9.778554651961159e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 97
},
{
"completion_length": 1533.46875,
"epoch": 0.20358348480914049,
"grad_norm": 0.05969106778502464,
"kl": 0.40055200457572937,
"learning_rate": 9.773698229610263e-05,
"loss": 0.0002,
"reward": 0.8664062917232513,
"reward_std": 0.29499610885977745,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.1945312526077032,
"step": 98
},
{
"completion_length": 1658.3125,
"epoch": 0.20566086730719293,
"grad_norm": 0.05904076248407364,
"kl": 0.3737713471055031,
"learning_rate": 9.768790363767322e-05,
"loss": 0.0002,
"reward": 0.7132812924683094,
"reward_std": 0.2905766908079386,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.19765625149011612,
"step": 99
},
{
"completion_length": 1522.9375,
"epoch": 0.20773824980524538,
"grad_norm": 0.04626452177762985,
"kl": 0.3718419596552849,
"learning_rate": 9.763831107321678e-05,
"loss": 0.0002,
"reward": 0.6843750439584255,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 100
},
{
"completion_length": 1437.34375,
"epoch": 0.20981563230329783,
"grad_norm": 0.0583551786839962,
"kl": 0.3823527656495571,
"learning_rate": 9.75882051371648e-05,
"loss": 0.0002,
"reward": 0.7929687947034836,
"reward_std": 0.26626989617943764,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19921875186264515,
"step": 101
},
{
"completion_length": 1665.859375,
"epoch": 0.2118930148013503,
"grad_norm": 0.071258544921875,
"kl": 0.3524062894284725,
"learning_rate": 9.753758636948111e-05,
"loss": 0.0002,
"reward": 0.7121094167232513,
"reward_std": 0.3806223217397928,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.19648437574505806,
"step": 102
},
{
"completion_length": 1570.484375,
"epoch": 0.21397039729940276,
"grad_norm": 0.06221286952495575,
"kl": 0.4012618362903595,
"learning_rate": 9.748645531565604e-05,
"loss": 0.0002,
"reward": 0.8691406697034836,
"reward_std": 0.2911291141062975,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19726562686264515,
"step": 103
},
{
"completion_length": 1470.140625,
"epoch": 0.2160477797974552,
"grad_norm": 0.05706779286265373,
"kl": 0.37375468015670776,
"learning_rate": 9.743481252670049e-05,
"loss": 0.0002,
"reward": 0.7136719189584255,
"reward_std": 0.24583008396439254,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.19804687798023224,
"step": 104
},
{
"completion_length": 1487.796875,
"epoch": 0.21812516229550766,
"grad_norm": 0.04427757114171982,
"kl": 0.40517764165997505,
"learning_rate": 9.738265855914013e-05,
"loss": 0.0002,
"reward": 0.7468750476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 105
},
{
"completion_length": 1542.703125,
"epoch": 0.2202025447935601,
"grad_norm": 0.060884129256010056,
"kl": 0.41977495700120926,
"learning_rate": 9.732999397500926e-05,
"loss": 0.0002,
"reward": 0.6503906659781933,
"reward_std": 0.24693494127131999,
"rewards/argmax_reward_func": 0.453125,
"rewards/format_reward_func": 0.19726562686264515,
"step": 106
},
{
"completion_length": 1549.03125,
"epoch": 0.22227992729161256,
"grad_norm": 0.04595618322491646,
"kl": 0.47253532335162163,
"learning_rate": 9.727681934184481e-05,
"loss": 0.0002,
"reward": 0.9000000506639481,
"reward_std": 0.1590990237891674,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.19687500223517418,
"step": 107
},
{
"completion_length": 1636.546875,
"epoch": 0.22435730978966503,
"grad_norm": 0.03207004442811012,
"kl": 0.37253231182694435,
"learning_rate": 9.722313523268028e-05,
"loss": 0.0002,
"reward": 0.8875000439584255,
"reward_std": 0.0883883461356163,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 108
},
{
"completion_length": 1696.84375,
"epoch": 0.22643469228771748,
"grad_norm": 0.08920740336179733,
"kl": 0.7168225161731243,
"learning_rate": 9.716894222603942e-05,
"loss": 0.0004,
"reward": 0.8093750476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 109
},
{
"completion_length": 1443.65625,
"epoch": 0.22851207478576993,
"grad_norm": 0.06247260421514511,
"kl": 0.3850158527493477,
"learning_rate": 9.711424090593019e-05,
"loss": 0.0002,
"reward": 0.7617187947034836,
"reward_std": 0.2662698905915022,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.1992187537252903,
"step": 110
},
{
"completion_length": 1509.8125,
"epoch": 0.23058945728382238,
"grad_norm": 0.06556280702352524,
"kl": 0.3532305136322975,
"learning_rate": 9.705903186183828e-05,
"loss": 0.0002,
"reward": 0.7281250506639481,
"reward_std": 0.3137786276638508,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.19687500223517418,
"step": 111
},
{
"completion_length": 1467.875,
"epoch": 0.23266683978187483,
"grad_norm": 0.06337332725524902,
"kl": 0.3515300862491131,
"learning_rate": 9.700331568872086e-05,
"loss": 0.0002,
"reward": 0.8054687976837158,
"reward_std": 0.2905766889452934,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19609375298023224,
"step": 112
},
{
"completion_length": 1409.40625,
"epoch": 0.23474422227992728,
"grad_norm": 0.06349179893732071,
"kl": 0.35185598209500313,
"learning_rate": 9.694709298700016e-05,
"loss": 0.0002,
"reward": 0.7750000469386578,
"reward_std": 0.24748736945912242,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19687500223517418,
"step": 113
},
{
"completion_length": 1412.53125,
"epoch": 0.23682160477797976,
"grad_norm": 0.064293272793293,
"kl": 0.506983544677496,
"learning_rate": 9.689036436255699e-05,
"loss": 0.0003,
"reward": 0.724609412252903,
"reward_std": 0.22483785497024655,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.19335937686264515,
"step": 114
},
{
"completion_length": 1372.859375,
"epoch": 0.2388989872760322,
"grad_norm": 0.06638536602258682,
"kl": 0.35727328434586525,
"learning_rate": 9.683313042672418e-05,
"loss": 0.0002,
"reward": 0.7781250439584255,
"reward_std": 0.287262124940753,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 115
},
{
"completion_length": 1373.09375,
"epoch": 0.24097636977408465,
"grad_norm": 0.06149492412805557,
"kl": 0.3754408285021782,
"learning_rate": 9.677539179628005e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 116
},
{
"completion_length": 1313.734375,
"epoch": 0.2430537522721371,
"grad_norm": 0.062166426330804825,
"kl": 0.40280015021562576,
"learning_rate": 9.671714909344174e-05,
"loss": 0.0002,
"reward": 0.8531250506639481,
"reward_std": 0.2695844564586878,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19687500223517418,
"step": 117
},
{
"completion_length": 1132.78125,
"epoch": 0.24513113477018955,
"grad_norm": 0.06206024810671806,
"kl": 0.4296950623393059,
"learning_rate": 9.665840294585845e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 118
},
{
"completion_length": 1270.625,
"epoch": 0.24720851726824203,
"grad_norm": 0.05055106431245804,
"kl": 0.3436691351234913,
"learning_rate": 9.659915398660477e-05,
"loss": 0.0002,
"reward": 0.7742187902331352,
"reward_std": 0.16020388156175613,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19609375298023224,
"step": 119
},
{
"completion_length": 1221.015625,
"epoch": 0.24928589976629448,
"grad_norm": 0.06833141297101974,
"kl": 0.3341045156121254,
"learning_rate": 9.65394028541738e-05,
"loss": 0.0002,
"reward": 0.7937500439584255,
"reward_std": 0.2651650384068489,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 120
},
{
"completion_length": 1273.125,
"epoch": 0.2513632822643469,
"grad_norm": 0.06194274127483368,
"kl": 0.3503304682672024,
"learning_rate": 9.647915019247029e-05,
"loss": 0.0002,
"reward": 0.6687500439584255,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 121
},
{
"completion_length": 1169.4375,
"epoch": 0.2534406647623994,
"grad_norm": 0.05682160705327988,
"kl": 0.4410577192902565,
"learning_rate": 9.641839665080363e-05,
"loss": 0.0002,
"reward": 0.7781250476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 122
},
{
"completion_length": 1201.921875,
"epoch": 0.25551804726045185,
"grad_norm": 0.061100929975509644,
"kl": 0.3569498844444752,
"learning_rate": 9.635714288388102e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 123
},
{
"completion_length": 1296.234375,
"epoch": 0.2575954297585043,
"grad_norm": 0.0515943244099617,
"kl": 0.34240079671144485,
"learning_rate": 9.629538955180021e-05,
"loss": 0.0002,
"reward": 0.6835937909781933,
"reward_std": 0.15578446350991726,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.19921875186264515,
"step": 124
},
{
"completion_length": 1237.078125,
"epoch": 0.25967281225655675,
"grad_norm": 0.07088616490364075,
"kl": 0.34578079730272293,
"learning_rate": 9.623313732004258e-05,
"loss": 0.0002,
"reward": 0.6687500402331352,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 125
},
{
"completion_length": 1213.71875,
"epoch": 0.2617501947546092,
"grad_norm": 0.05374123901128769,
"kl": 0.3377624601125717,
"learning_rate": 9.617038685946578e-05,
"loss": 0.0002,
"reward": 0.7468750402331352,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 126
},
{
"completion_length": 1325.421875,
"epoch": 0.26382757725266165,
"grad_norm": 0.05408313870429993,
"kl": 0.3581954091787338,
"learning_rate": 9.610713884629666e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 127
},
{
"completion_length": 1305.421875,
"epoch": 0.2659049597507141,
"grad_norm": 0.05651029199361801,
"kl": 0.3299425356090069,
"learning_rate": 9.60433939621239e-05,
"loss": 0.0002,
"reward": 0.6500000394880772,
"reward_std": 0.15909902285784483,
"rewards/argmax_reward_func": 0.453125,
"rewards/format_reward_func": 0.19687500223517418,
"step": 128
},
{
"completion_length": 1394.34375,
"epoch": 0.26798234224876655,
"grad_norm": 0.05847406014800072,
"kl": 0.3248457871377468,
"learning_rate": 9.597915289389066e-05,
"loss": 0.0002,
"reward": 0.8847656697034836,
"reward_std": 0.22483785450458527,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19726562686264515,
"step": 129
},
{
"completion_length": 1361.125,
"epoch": 0.270059724746819,
"grad_norm": 0.03918185085058212,
"kl": 0.29694442078471184,
"learning_rate": 9.591441633388724e-05,
"loss": 0.0001,
"reward": 0.8687500506639481,
"reward_std": 0.11490485025569797,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19687500223517418,
"step": 130
},
{
"completion_length": 1294.34375,
"epoch": 0.27213710724487145,
"grad_norm": 0.06627894192934036,
"kl": 0.317622110247612,
"learning_rate": 9.584918497974354e-05,
"loss": 0.0002,
"reward": 0.8031250387430191,
"reward_std": 0.2519067842513323,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19375000149011612,
"step": 131
},
{
"completion_length": 1264.578125,
"epoch": 0.2742144897429239,
"grad_norm": 0.05716657266020775,
"kl": 0.33274614438414574,
"learning_rate": 9.578345953442162e-05,
"loss": 0.0002,
"reward": 0.7093750424683094,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.19375000335276127,
"step": 132
},
{
"completion_length": 1101.5625,
"epoch": 0.27629187224097634,
"grad_norm": 0.06597350537776947,
"kl": 0.3318898268043995,
"learning_rate": 9.571724070620806e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 133
},
{
"completion_length": 1340.328125,
"epoch": 0.27836925473902885,
"grad_norm": 0.06743122637271881,
"kl": 0.2939135618507862,
"learning_rate": 9.565052920870636e-05,
"loss": 0.0001,
"reward": 0.6312500461935997,
"reward_std": 0.27400387404486537,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.19375000149011612,
"step": 134
},
{
"completion_length": 1385.546875,
"epoch": 0.2804466372370813,
"grad_norm": 0.05118987336754799,
"kl": 0.27961407601833344,
"learning_rate": 9.558332576082925e-05,
"loss": 0.0001,
"reward": 0.8664062991738319,
"reward_std": 0.20660776272416115,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19453125074505806,
"step": 135
},
{
"completion_length": 1284.90625,
"epoch": 0.28252401973513375,
"grad_norm": 0.060963716357946396,
"kl": 0.310220867395401,
"learning_rate": 9.551563108679091e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 136
},
{
"completion_length": 1260.078125,
"epoch": 0.2846014022331862,
"grad_norm": 0.0460037924349308,
"kl": 0.39314381033182144,
"learning_rate": 9.544744591609922e-05,
"loss": 0.0002,
"reward": 0.7781250402331352,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 137
},
{
"completion_length": 1031.03125,
"epoch": 0.28667878473123865,
"grad_norm": 0.06592284142971039,
"kl": 0.4330439232289791,
"learning_rate": 9.537877098354786e-05,
"loss": 0.0002,
"reward": 0.9343750476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 138
},
{
"completion_length": 1101.796875,
"epoch": 0.2887561672292911,
"grad_norm": 0.0644359141588211,
"kl": 0.2887462917715311,
"learning_rate": 9.53096070292084e-05,
"loss": 0.0001,
"reward": 0.8218750432133675,
"reward_std": 0.22539028152823448,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19687500223517418,
"step": 139
},
{
"completion_length": 1065.90625,
"epoch": 0.29083354972734354,
"grad_norm": 0.065298892557621,
"kl": 0.30470659770071507,
"learning_rate": 9.523995479842232e-05,
"loss": 0.0002,
"reward": 0.6218750365078449,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.421875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 140
},
{
"completion_length": 978.953125,
"epoch": 0.292910932225396,
"grad_norm": 0.05792571231722832,
"kl": 0.4863986298441887,
"learning_rate": 9.516981504179299e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 141
},
{
"completion_length": 1087.9375,
"epoch": 0.29498831472344844,
"grad_norm": 0.06688184291124344,
"kl": 0.29886077158153057,
"learning_rate": 9.509918851517758e-05,
"loss": 0.0001,
"reward": 0.8562500476837158,
"reward_std": 0.2651650384068489,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 142
},
{
"completion_length": 1019.140625,
"epoch": 0.2970656972215009,
"grad_norm": 0.06881757080554962,
"kl": 0.3445068225264549,
"learning_rate": 9.502807597967893e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 143
},
{
"completion_length": 1186.5625,
"epoch": 0.29914307971955334,
"grad_norm": 0.05837235972285271,
"kl": 0.32853276655077934,
"learning_rate": 9.495647820163725e-05,
"loss": 0.0002,
"reward": 0.8855469226837158,
"reward_std": 0.17953882738947868,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19804687798023224,
"step": 144
},
{
"completion_length": 931.265625,
"epoch": 0.30122046221760584,
"grad_norm": 0.046699460595846176,
"kl": 0.33741075173020363,
"learning_rate": 9.488439595262204e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 145
},
{
"completion_length": 944.25,
"epoch": 0.3032978447156583,
"grad_norm": 0.06217503920197487,
"kl": 0.3317374251782894,
"learning_rate": 9.48118300094236e-05,
"loss": 0.0002,
"reward": 0.7312500476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 146
},
{
"completion_length": 1001.078125,
"epoch": 0.30537522721371074,
"grad_norm": 0.06545262783765793,
"kl": 0.3109145648777485,
"learning_rate": 9.473878115404477e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 147
},
{
"completion_length": 910.796875,
"epoch": 0.3074526097117632,
"grad_norm": 0.05757139250636101,
"kl": 0.3004848547279835,
"learning_rate": 9.466525017369243e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 148
},
{
"completion_length": 1078.265625,
"epoch": 0.30952999220981564,
"grad_norm": 0.07616781443357468,
"kl": 0.28462448343634605,
"learning_rate": 9.459123786076912e-05,
"loss": 0.0001,
"reward": 0.8093750476837158,
"reward_std": 0.33145629428327084,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 149
},
{
"completion_length": 963.71875,
"epoch": 0.3116073747078681,
"grad_norm": 0.06607849150896072,
"kl": 0.2945715934038162,
"learning_rate": 9.451674501286436e-05,
"loss": 0.0001,
"reward": 0.7468750402331352,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 150
},
{
"completion_length": 925.484375,
"epoch": 0.31368475720592054,
"grad_norm": 0.08415860682725906,
"kl": 0.326167568564415,
"learning_rate": 9.444177243274618e-05,
"loss": 0.0002,
"reward": 0.7000000439584255,
"reward_std": 0.35355337895452976,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.20000000298023224,
"step": 151
},
{
"completion_length": 814.296875,
"epoch": 0.315762139703973,
"grad_norm": 0.049171049147844315,
"kl": 0.312137458473444,
"learning_rate": 9.436632092835239e-05,
"loss": 0.0002,
"reward": 1.0281250476837158,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.828125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 152
},
{
"completion_length": 783.484375,
"epoch": 0.31783952220202544,
"grad_norm": 0.06367822736501694,
"kl": 0.33039499446749687,
"learning_rate": 9.42903913127819e-05,
"loss": 0.0002,
"reward": 0.7281250394880772,
"reward_std": 0.18119611032307148,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.19687500223517418,
"step": 153
},
{
"completion_length": 861.609375,
"epoch": 0.3199169047000779,
"grad_norm": 0.06421905755996704,
"kl": 0.3065376691520214,
"learning_rate": 9.421398440428597e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 154
},
{
"completion_length": 902.390625,
"epoch": 0.32199428719813034,
"grad_norm": 0.07363509386777878,
"kl": 0.33688198402523994,
"learning_rate": 9.413710102625938e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.287262124940753,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 155
},
{
"completion_length": 922.0625,
"epoch": 0.3240716696961828,
"grad_norm": 0.06810685992240906,
"kl": 0.34481339529156685,
"learning_rate": 9.405974200723155e-05,
"loss": 0.0002,
"reward": 0.7937500476837158,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 156
},
{
"completion_length": 884.640625,
"epoch": 0.3261490521942353,
"grad_norm": 0.06919455528259277,
"kl": 0.3362896367907524,
"learning_rate": 9.398190818085763e-05,
"loss": 0.0002,
"reward": 0.8398437947034836,
"reward_std": 0.2441728077828884,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.19921875186264515,
"step": 157
},
{
"completion_length": 831.109375,
"epoch": 0.32822643469228774,
"grad_norm": 0.08263985067605972,
"kl": 0.882828488945961,
"learning_rate": 9.390360038590951e-05,
"loss": 0.0004,
"reward": 0.8531250506639481,
"reward_std": 0.22539028339087963,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19687500223517418,
"step": 158
},
{
"completion_length": 879.40625,
"epoch": 0.3303038171903402,
"grad_norm": 0.0637197494506836,
"kl": 0.31912703067064285,
"learning_rate": 9.382481946626674e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 159
},
{
"completion_length": 696.3125,
"epoch": 0.33238119968839264,
"grad_norm": 0.08041277527809143,
"kl": 0.3748646304011345,
"learning_rate": 9.374556627090749e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.2651650384068489,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 160
},
{
"completion_length": 891.265625,
"epoch": 0.3344585821864451,
"grad_norm": 0.06974095106124878,
"kl": 0.3626530338078737,
"learning_rate": 9.366584165389941e-05,
"loss": 0.0002,
"reward": 0.8406250439584255,
"reward_std": 0.24306794814765453,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 161
},
{
"completion_length": 813.03125,
"epoch": 0.33653596468449753,
"grad_norm": 0.092588409781456,
"kl": 0.3759094402194023,
"learning_rate": 9.358564647439037e-05,
"loss": 0.0002,
"reward": 0.7593750506639481,
"reward_std": 0.35797279700636864,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.19687500223517418,
"step": 162
},
{
"completion_length": 838.296875,
"epoch": 0.33861334718255,
"grad_norm": 0.06730964034795761,
"kl": 0.3410007916390896,
"learning_rate": 9.350498159659924e-05,
"loss": 0.0002,
"reward": 0.8406250439584255,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 163
},
{
"completion_length": 775.375,
"epoch": 0.34069072968060243,
"grad_norm": 0.06589485704898834,
"kl": 0.3439077027142048,
"learning_rate": 9.342384788980656e-05,
"loss": 0.0002,
"reward": 0.7312500420957804,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 164
},
{
"completion_length": 781.0,
"epoch": 0.3427681121786549,
"grad_norm": 0.07955412566661835,
"kl": 0.359022606164217,
"learning_rate": 9.33422462283452e-05,
"loss": 0.0002,
"reward": 0.7468750402331352,
"reward_std": 0.287262124940753,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 165
},
{
"completion_length": 895.96875,
"epoch": 0.34484549467670733,
"grad_norm": 0.06293340772390366,
"kl": 0.4317344203591347,
"learning_rate": 9.326017749159087e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 166
},
{
"completion_length": 884.25,
"epoch": 0.3469228771747598,
"grad_norm": 0.07700355350971222,
"kl": 0.5744567923247814,
"learning_rate": 9.317764256395275e-05,
"loss": 0.0003,
"reward": 0.6031250357627869,
"reward_std": 0.26958445459604263,
"rewards/argmax_reward_func": 0.40625,
"rewards/format_reward_func": 0.19687500409781933,
"step": 167
},
{
"completion_length": 889.921875,
"epoch": 0.34900025967281223,
"grad_norm": 0.062210842967033386,
"kl": 0.32679086178541183,
"learning_rate": 9.309464233486387e-05,
"loss": 0.0002,
"reward": 0.7468750402331352,
"reward_std": 0.19887377507984638,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 168
},
{
"completion_length": 788.84375,
"epoch": 0.35107764217086473,
"grad_norm": 0.0710466280579567,
"kl": 0.35585347935557365,
"learning_rate": 9.301117769877153e-05,
"loss": 0.0002,
"reward": 0.8187500350177288,
"reward_std": 0.22207572124898434,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19375000149011612,
"step": 169
},
{
"completion_length": 849.484375,
"epoch": 0.3531550246689172,
"grad_norm": 0.0648435726761818,
"kl": 0.32205165177583694,
"learning_rate": 9.292724955512774e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 170
},
{
"completion_length": 856.984375,
"epoch": 0.35523240716696963,
"grad_norm": 0.06077580899000168,
"kl": 0.34612051025032997,
"learning_rate": 9.284285880837946e-05,
"loss": 0.0002,
"reward": 0.7156250365078449,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 171
},
{
"completion_length": 776.5,
"epoch": 0.3573097896650221,
"grad_norm": 0.07481009513139725,
"kl": 0.3612271770834923,
"learning_rate": 9.275800636795884e-05,
"loss": 0.0002,
"reward": 0.7773437909781933,
"reward_std": 0.28836698085069656,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.1992187537252903,
"step": 172
},
{
"completion_length": 789.046875,
"epoch": 0.35938717216307453,
"grad_norm": 0.07435107976198196,
"kl": 0.3340052030980587,
"learning_rate": 9.267269314827345e-05,
"loss": 0.0002,
"reward": 0.8398437947034836,
"reward_std": 0.28836698085069656,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.19921875186264515,
"step": 173
},
{
"completion_length": 768.390625,
"epoch": 0.361464554661127,
"grad_norm": 0.08201409131288528,
"kl": 0.3197612836956978,
"learning_rate": 9.258692006869643e-05,
"loss": 0.0002,
"reward": 0.621093787252903,
"reward_std": 0.3325611485633999,
"rewards/argmax_reward_func": 0.421875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 174
},
{
"completion_length": 740.515625,
"epoch": 0.36354193715917943,
"grad_norm": 0.08215157687664032,
"kl": 0.3205004744231701,
"learning_rate": 9.250068805355658e-05,
"loss": 0.0002,
"reward": 0.7781250476837158,
"reward_std": 0.2872621212154627,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 175
},
{
"completion_length": 833.796875,
"epoch": 0.3656193196572319,
"grad_norm": 0.06702969968318939,
"kl": 0.31005076318979263,
"learning_rate": 9.24139980321284e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.28726212307810783,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 176
},
{
"completion_length": 790.546875,
"epoch": 0.3676967021552843,
"grad_norm": 0.08229520171880722,
"kl": 0.32232359051704407,
"learning_rate": 9.232685093862204e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.33145629428327084,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 177
},
{
"completion_length": 814.953125,
"epoch": 0.3697740846533368,
"grad_norm": 0.0782497227191925,
"kl": 0.3153250627219677,
"learning_rate": 9.22392477121733e-05,
"loss": 0.0002,
"reward": 0.7750000506639481,
"reward_std": 0.3358757123351097,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19687500223517418,
"step": 178
},
{
"completion_length": 972.390625,
"epoch": 0.3718514671513892,
"grad_norm": 0.07078168541193008,
"kl": 0.3324251137673855,
"learning_rate": 9.215118929683344e-05,
"loss": 0.0002,
"reward": 0.7750000469386578,
"reward_std": 0.29168154671788216,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19687500223517418,
"step": 179
},
{
"completion_length": 757.90625,
"epoch": 0.37392884964944173,
"grad_norm": 0.09468799084424973,
"kl": 0.33874499425292015,
"learning_rate": 9.206267664155907e-05,
"loss": 0.0002,
"reward": 0.7781250476837158,
"reward_std": 0.4640388172119856,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 180
},
{
"completion_length": 844.265625,
"epoch": 0.3760062321474942,
"grad_norm": 0.08337994664907455,
"kl": 0.31716278567910194,
"learning_rate": 9.197371070020184e-05,
"loss": 0.0002,
"reward": 0.7906250506639481,
"reward_std": 0.3579728025943041,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 181
},
{
"completion_length": 864.234375,
"epoch": 0.37808361464554663,
"grad_norm": 0.0694384053349495,
"kl": 0.32097451388835907,
"learning_rate": 9.188429243149824e-05,
"loss": 0.0002,
"reward": 0.6472656652331352,
"reward_std": 0.24362037517130375,
"rewards/argmax_reward_func": 0.453125,
"rewards/format_reward_func": 0.19414062798023224,
"step": 182
},
{
"completion_length": 749.046875,
"epoch": 0.3801609971435991,
"grad_norm": 0.062498513609170914,
"kl": 0.3336629420518875,
"learning_rate": 9.179442279905928e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 183
},
{
"completion_length": 923.640625,
"epoch": 0.3822383796416515,
"grad_norm": 0.07083828747272491,
"kl": 0.32143479958176613,
"learning_rate": 9.170410277135999e-05,
"loss": 0.0002,
"reward": 0.75625004991889,
"reward_std": 0.31819804944097996,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.19375000149011612,
"step": 184
},
{
"completion_length": 884.015625,
"epoch": 0.384315762139704,
"grad_norm": 0.0693785548210144,
"kl": 0.4987417571246624,
"learning_rate": 9.161333332172912e-05,
"loss": 0.0002,
"reward": 0.7062500454485416,
"reward_std": 0.24748736806213856,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.1906250026077032,
"step": 185
},
{
"completion_length": 795.234375,
"epoch": 0.3863931446377564,
"grad_norm": 0.07442086935043335,
"kl": 0.33293722197413445,
"learning_rate": 9.152211542833857e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.3535533845424652,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 186
},
{
"completion_length": 929.25,
"epoch": 0.3884705271358089,
"grad_norm": 0.05656367912888527,
"kl": 0.31210994347929955,
"learning_rate": 9.143045007419284e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 187
},
{
"completion_length": 902.828125,
"epoch": 0.3905479096338613,
"grad_norm": 0.06982313841581345,
"kl": 0.3056885749101639,
"learning_rate": 9.133833824711853e-05,
"loss": 0.0002,
"reward": 0.7156250439584255,
"reward_std": 0.28726212307810783,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 188
},
{
"completion_length": 995.71875,
"epoch": 0.3926252921319138,
"grad_norm": 0.07361900061368942,
"kl": 0.3039589188992977,
"learning_rate": 9.124578093975358e-05,
"loss": 0.0002,
"reward": 0.7300781719386578,
"reward_std": 0.35521066188812256,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.19882812909781933,
"step": 189
},
{
"completion_length": 964.359375,
"epoch": 0.3947026746299662,
"grad_norm": 0.07159875333309174,
"kl": 0.3399963229894638,
"learning_rate": 9.115277914953662e-05,
"loss": 0.0002,
"reward": 0.7781250476837158,
"reward_std": 0.33145629428327084,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 190
},
{
"completion_length": 1034.03125,
"epoch": 0.39678005712801867,
"grad_norm": 0.07782501727342606,
"kl": 0.3184865601360798,
"learning_rate": 9.105933387869628e-05,
"loss": 0.0002,
"reward": 0.6910156607627869,
"reward_std": 0.4104533866047859,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.19101562723517418,
"step": 191
},
{
"completion_length": 1139.8125,
"epoch": 0.3988574396260712,
"grad_norm": 0.05474551394581795,
"kl": 0.29825419560074806,
"learning_rate": 9.096544613424025e-05,
"loss": 0.0001,
"reward": 0.8804688006639481,
"reward_std": 0.27510872669517994,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19296875223517418,
"step": 192
},
{
"completion_length": 1003.0,
"epoch": 0.4009348221241236,
"grad_norm": 0.0725637748837471,
"kl": 0.3301442116498947,
"learning_rate": 9.087111692794459e-05,
"loss": 0.0002,
"reward": 0.7304687947034836,
"reward_std": 0.31046406365931034,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.1992187537252903,
"step": 193
},
{
"completion_length": 1139.734375,
"epoch": 0.4030122046221761,
"grad_norm": 0.057006120681762695,
"kl": 0.31723184883594513,
"learning_rate": 9.077634727634272e-05,
"loss": 0.0002,
"reward": 0.8449219167232513,
"reward_std": 0.23146697832271457,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.1886718776077032,
"step": 194
},
{
"completion_length": 1048.71875,
"epoch": 0.4050895871202285,
"grad_norm": 0.07205278426408768,
"kl": 0.33233997970819473,
"learning_rate": 9.068113820071447e-05,
"loss": 0.0002,
"reward": 0.7875000387430191,
"reward_std": 0.3181980513036251,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19375000335276127,
"step": 195
},
{
"completion_length": 955.6875,
"epoch": 0.40716696961828097,
"grad_norm": 0.057774197310209274,
"kl": 0.3174768090248108,
"learning_rate": 9.058549072707513e-05,
"loss": 0.0002,
"reward": 0.8347656726837158,
"reward_std": 0.1994262058287859,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.19414062798023224,
"step": 196
},
{
"completion_length": 1300.125,
"epoch": 0.4092443521163334,
"grad_norm": 0.05007508769631386,
"kl": 0.30298993550240993,
"learning_rate": 9.048940588616435e-05,
"loss": 0.0002,
"reward": 0.7843750491738319,
"reward_std": 0.22539028525352478,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.1906250026077032,
"step": 197
},
{
"completion_length": 1237.953125,
"epoch": 0.41132173461438587,
"grad_norm": 0.060646846890449524,
"kl": 0.3001830168068409,
"learning_rate": 9.039288471343504e-05,
"loss": 0.0002,
"reward": 0.8812500461935997,
"reward_std": 0.27400387451052666,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19375000335276127,
"step": 198
},
{
"completion_length": 1186.453125,
"epoch": 0.4133991171124383,
"grad_norm": 0.05342816561460495,
"kl": 0.30144498124718666,
"learning_rate": 9.029592824904225e-05,
"loss": 0.0002,
"reward": 0.8074219226837158,
"reward_std": 0.24583008885383606,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19804687798023224,
"step": 199
},
{
"completion_length": 1274.375,
"epoch": 0.41547649961049077,
"grad_norm": 0.05866052210330963,
"kl": 0.3122952822595835,
"learning_rate": 9.019853753783185e-05,
"loss": 0.0002,
"reward": 0.652343787252903,
"reward_std": 0.2264951393008232,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.1835937537252903,
"step": 200
},
{
"completion_length": 1241.5,
"epoch": 0.4175538821085432,
"grad_norm": 0.05399727076292038,
"kl": 0.33987458795309067,
"learning_rate": 9.010071362932944e-05,
"loss": 0.0002,
"reward": 0.8687500432133675,
"reward_std": 0.2032931987196207,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19687500223517418,
"step": 201
},
{
"completion_length": 1336.140625,
"epoch": 0.41963126460659567,
"grad_norm": 0.06403433531522751,
"kl": 0.28705168329179287,
"learning_rate": 9.000245757772885e-05,
"loss": 0.0001,
"reward": 0.8281250521540642,
"reward_std": 0.29610096476972103,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.18750000558793545,
"step": 202
},
{
"completion_length": 1035.203125,
"epoch": 0.42170864710464817,
"grad_norm": 0.0628470629453659,
"kl": 0.30243775993585587,
"learning_rate": 8.990377044188098e-05,
"loss": 0.0002,
"reward": 0.85000004991889,
"reward_std": 0.22980970283970237,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19375000149011612,
"step": 203
},
{
"completion_length": 1175.90625,
"epoch": 0.4237860296027006,
"grad_norm": 0.05064735934138298,
"kl": 0.3158372975885868,
"learning_rate": 8.980465328528219e-05,
"loss": 0.0002,
"reward": 0.7906250394880772,
"reward_std": 0.18119611404836178,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 204
},
{
"completion_length": 1109.28125,
"epoch": 0.42586341210075307,
"grad_norm": 0.060826126486063004,
"kl": 0.2915416620671749,
"learning_rate": 8.9705107176063e-05,
"loss": 0.0001,
"reward": 0.9437500461935997,
"reward_std": 0.22980970703065395,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.19375000149011612,
"step": 205
},
{
"completion_length": 979.453125,
"epoch": 0.4279407945988055,
"grad_norm": 0.062372464686632156,
"kl": 0.3590022251009941,
"learning_rate": 8.960513318697647e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 206
},
{
"completion_length": 1014.40625,
"epoch": 0.43001817709685797,
"grad_norm": 0.0675223246216774,
"kl": 0.3203696608543396,
"learning_rate": 8.950473239538673e-05,
"loss": 0.0002,
"reward": 0.8835937976837158,
"reward_std": 0.27068931609392166,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.1960937511175871,
"step": 207
},
{
"completion_length": 1077.65625,
"epoch": 0.4320955595949104,
"grad_norm": 0.07710019499063492,
"kl": 0.28732946887612343,
"learning_rate": 8.940390588325727e-05,
"loss": 0.0001,
"reward": 0.8781250491738319,
"reward_std": 0.4110058154910803,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19062500447034836,
"step": 208
},
{
"completion_length": 985.4375,
"epoch": 0.43417294209296287,
"grad_norm": 0.04350803792476654,
"kl": 0.3246513232588768,
"learning_rate": 8.930265473713938e-05,
"loss": 0.0002,
"reward": 0.8531250469386578,
"reward_std": 0.13700193725526333,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19687500409781933,
"step": 209
},
{
"completion_length": 984.140625,
"epoch": 0.4362503245910153,
"grad_norm": 0.06984654814004898,
"kl": 0.32982902973890305,
"learning_rate": 8.920098004816036e-05,
"loss": 0.0002,
"reward": 0.8710937947034836,
"reward_std": 0.24417280592024326,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19921875186264515,
"step": 210
},
{
"completion_length": 870.03125,
"epoch": 0.43832770708906776,
"grad_norm": 0.06809406727552414,
"kl": 0.29028210788965225,
"learning_rate": 8.909888291201182e-05,
"loss": 0.0001,
"reward": 0.8718750439584255,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 211
},
{
"completion_length": 906.40625,
"epoch": 0.4404050895871202,
"grad_norm": 0.08602919429540634,
"kl": 0.2871505431830883,
"learning_rate": 8.899636442893783e-05,
"loss": 0.0001,
"reward": 0.8062500469386578,
"reward_std": 0.3800698835402727,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 212
},
{
"completion_length": 869.421875,
"epoch": 0.44248247208517266,
"grad_norm": 0.05844723433256149,
"kl": 0.26848769187927246,
"learning_rate": 8.88934257037231e-05,
"loss": 0.0001,
"reward": 0.8710937947034836,
"reward_std": 0.19997863844037056,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 213
},
{
"completion_length": 864.75,
"epoch": 0.4445598545832251,
"grad_norm": 0.07575644552707672,
"kl": 0.4048551693558693,
"learning_rate": 8.879006784568104e-05,
"loss": 0.0002,
"reward": 0.7613281682133675,
"reward_std": 0.26682231947779655,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.19882812723517418,
"step": 214
},
{
"completion_length": 877.96875,
"epoch": 0.4466372370812776,
"grad_norm": 0.07090688496828079,
"kl": 0.2992668803781271,
"learning_rate": 8.868629196864182e-05,
"loss": 0.0001,
"reward": 0.8035156689584255,
"reward_std": 0.24362037889659405,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19414062798023224,
"step": 215
},
{
"completion_length": 779.8125,
"epoch": 0.44871461957933007,
"grad_norm": 0.069987952709198,
"kl": 0.2883603498339653,
"learning_rate": 8.858209919094039e-05,
"loss": 0.0001,
"reward": 0.7156250402331352,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 216
},
{
"completion_length": 748.59375,
"epoch": 0.4507920020773825,
"grad_norm": 0.07478881627321243,
"kl": 0.2929369006305933,
"learning_rate": 8.847749063540439e-05,
"loss": 0.0001,
"reward": 0.8066406697034836,
"reward_std": 0.24693494103848934,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19726562686264515,
"step": 217
},
{
"completion_length": 749.8125,
"epoch": 0.45286938457543496,
"grad_norm": 0.08688110113143921,
"kl": 0.3713537007570267,
"learning_rate": 8.837246742934207e-05,
"loss": 0.0002,
"reward": 0.7765625454485416,
"reward_std": 0.33366601169109344,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.1984375026077032,
"step": 218
},
{
"completion_length": 697.546875,
"epoch": 0.4549467670734874,
"grad_norm": 0.08973264694213867,
"kl": 0.36194442212581635,
"learning_rate": 8.826703070453015e-05,
"loss": 0.0002,
"reward": 0.8511719219386578,
"reward_std": 0.316540764644742,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19492187909781933,
"step": 219
},
{
"completion_length": 736.953125,
"epoch": 0.45702414957153986,
"grad_norm": 0.06507878005504608,
"kl": 0.32681479677557945,
"learning_rate": 8.816118159720156e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 220
},
{
"completion_length": 686.046875,
"epoch": 0.4591015320695923,
"grad_norm": 0.08443711698055267,
"kl": 0.27842542715370655,
"learning_rate": 8.805492124803331e-05,
"loss": 0.0001,
"reward": 0.7750000506639481,
"reward_std": 0.2474873699247837,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19687500223517418,
"step": 221
},
{
"completion_length": 636.28125,
"epoch": 0.46117891456764476,
"grad_norm": 0.08033400774002075,
"kl": 0.2751711644232273,
"learning_rate": 8.794825080213414e-05,
"loss": 0.0001,
"reward": 0.7781250476837158,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 222
},
{
"completion_length": 652.40625,
"epoch": 0.4632562970656972,
"grad_norm": 0.07973612844944,
"kl": 0.28703486546874046,
"learning_rate": 8.78411714090321e-05,
"loss": 0.0001,
"reward": 0.7937500439584255,
"reward_std": 0.2651650384068489,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 223
},
{
"completion_length": 656.5,
"epoch": 0.46533367956374966,
"grad_norm": 0.0935521349310875,
"kl": 0.28887104988098145,
"learning_rate": 8.77336842226623e-05,
"loss": 0.0001,
"reward": 0.7937500439584255,
"reward_std": 0.3535533845424652,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 224
},
{
"completion_length": 601.90625,
"epoch": 0.4674110620618021,
"grad_norm": 0.08775703608989716,
"kl": 0.2865128982812166,
"learning_rate": 8.76257904013544e-05,
"loss": 0.0001,
"reward": 0.7468750439584255,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 225
},
{
"completion_length": 649.09375,
"epoch": 0.46948844455985456,
"grad_norm": 0.07471180707216263,
"kl": 0.3112582378089428,
"learning_rate": 8.751749110782012e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 226
},
{
"completion_length": 651.375,
"epoch": 0.47156582705790706,
"grad_norm": 0.08434654772281647,
"kl": 0.33592014387249947,
"learning_rate": 8.740878750914076e-05,
"loss": 0.0002,
"reward": 0.8390625491738319,
"reward_std": 0.24527766555547714,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.1984375026077032,
"step": 227
},
{
"completion_length": 592.15625,
"epoch": 0.4736432095559595,
"grad_norm": 0.10138159990310669,
"kl": 0.3475854229182005,
"learning_rate": 8.729968077675454e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.33145629428327084,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 228
},
{
"completion_length": 618.953125,
"epoch": 0.47572059205401196,
"grad_norm": 0.08923006802797318,
"kl": 0.32317574694752693,
"learning_rate": 8.71901720864441e-05,
"loss": 0.0002,
"reward": 0.8085937947034836,
"reward_std": 0.28836698085069656,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.1992187537252903,
"step": 229
},
{
"completion_length": 606.46875,
"epoch": 0.4777979745520644,
"grad_norm": 0.07547228038311005,
"kl": 0.4202072508633137,
"learning_rate": 8.70802626183237e-05,
"loss": 0.0002,
"reward": 0.7757812924683094,
"reward_std": 0.20218834839761257,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19765625149011612,
"step": 230
},
{
"completion_length": 567.375,
"epoch": 0.47987535705011686,
"grad_norm": 0.07534275949001312,
"kl": 0.5509752966463566,
"learning_rate": 8.696995355682656e-05,
"loss": 0.0003,
"reward": 0.8250000439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 231
},
{
"completion_length": 615.265625,
"epoch": 0.4819527395481693,
"grad_norm": 0.08038201183080673,
"kl": 0.3771616071462631,
"learning_rate": 8.685924609069214e-05,
"loss": 0.0002,
"reward": 0.8695312887430191,
"reward_std": 0.20218834280967712,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19765625149011612,
"step": 232
},
{
"completion_length": 602.453125,
"epoch": 0.48403012204622176,
"grad_norm": 0.07698789983987808,
"kl": 0.6121297106146812,
"learning_rate": 8.674814141295324e-05,
"loss": 0.0003,
"reward": 0.8718750439584255,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 233
},
{
"completion_length": 592.75,
"epoch": 0.4861075045442742,
"grad_norm": 0.09831973165273666,
"kl": 0.31690799072384834,
"learning_rate": 8.663664072092323e-05,
"loss": 0.0002,
"reward": 0.8246094211935997,
"reward_std": 0.3099116366356611,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19960937835276127,
"step": 234
},
{
"completion_length": 558.90625,
"epoch": 0.48818488704232665,
"grad_norm": 0.09684620797634125,
"kl": 0.3237866424024105,
"learning_rate": 8.652474521618306e-05,
"loss": 0.0002,
"reward": 0.7937500439584255,
"reward_std": 0.3093592096120119,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 235
},
{
"completion_length": 626.078125,
"epoch": 0.4902622695403791,
"grad_norm": 0.06933271139860153,
"kl": 0.3663709722459316,
"learning_rate": 8.641245610456838e-05,
"loss": 0.0002,
"reward": 0.9812500476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.78125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 236
},
{
"completion_length": 579.40625,
"epoch": 0.49233965203843155,
"grad_norm": 0.08088324964046478,
"kl": 0.3435916490852833,
"learning_rate": 8.629977459615655e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 237
},
{
"completion_length": 607.125,
"epoch": 0.49441703453648406,
"grad_norm": 0.07071245461702347,
"kl": 0.2806865181773901,
"learning_rate": 8.618670190525352e-05,
"loss": 0.0001,
"reward": 0.8250000439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 238
},
{
"completion_length": 558.390625,
"epoch": 0.4964944170345365,
"grad_norm": 0.08282584697008133,
"kl": 0.40584639832377434,
"learning_rate": 8.607323925038082e-05,
"loss": 0.0002,
"reward": 0.7156250439584255,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 239
},
{
"completion_length": 667.71875,
"epoch": 0.49857179953258896,
"grad_norm": 0.08190900087356567,
"kl": 0.35680179484188557,
"learning_rate": 8.595938785426241e-05,
"loss": 0.0002,
"reward": 0.9343750476837158,
"reward_std": 0.24306794814765453,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 240
},
{
"completion_length": 670.8125,
"epoch": 0.5006491820306413,
"grad_norm": 0.08591850101947784,
"kl": 0.3619570918381214,
"learning_rate": 8.584514894381151e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 241
},
{
"completion_length": 676.25,
"epoch": 0.5027265645286938,
"grad_norm": 0.08506251126527786,
"kl": 0.3224334083497524,
"learning_rate": 8.573052375011733e-05,
"loss": 0.0002,
"reward": 0.8867187947034836,
"reward_std": 0.2662698905915022,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 242
},
{
"completion_length": 673.1875,
"epoch": 0.5048039470267462,
"grad_norm": 0.06150234118103981,
"kl": 0.3478453829884529,
"learning_rate": 8.561551350843186e-05,
"loss": 0.0002,
"reward": 0.9656250476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.765625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 243
},
{
"completion_length": 692.546875,
"epoch": 0.5068813295247988,
"grad_norm": 0.06618204712867737,
"kl": 0.29330621659755707,
"learning_rate": 8.550011945815655e-05,
"loss": 0.0001,
"reward": 0.8562500476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 244
},
{
"completion_length": 687.03125,
"epoch": 0.5089587120228513,
"grad_norm": 0.07623764872550964,
"kl": 0.3498356007039547,
"learning_rate": 8.538434284282892e-05,
"loss": 0.0002,
"reward": 0.7937500476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 245
},
{
"completion_length": 660.703125,
"epoch": 0.5110360945209037,
"grad_norm": 0.04135030135512352,
"kl": 0.32844917103648186,
"learning_rate": 8.526818491010922e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.0883883461356163,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 246
},
{
"completion_length": 716.609375,
"epoch": 0.5131134770189562,
"grad_norm": 0.0865129679441452,
"kl": 0.3059841375797987,
"learning_rate": 8.515164691176687e-05,
"loss": 0.0002,
"reward": 0.7312500439584255,
"reward_std": 0.3093592096120119,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 247
},
{
"completion_length": 757.4375,
"epoch": 0.5151908595170086,
"grad_norm": 0.07157998532056808,
"kl": 0.2929275669157505,
"learning_rate": 8.503473010366713e-05,
"loss": 0.0001,
"reward": 0.8867187909781933,
"reward_std": 0.22207572311162949,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19921875186264515,
"step": 248
},
{
"completion_length": 662.859375,
"epoch": 0.517268242015061,
"grad_norm": 0.06820650398731232,
"kl": 0.31902188807725906,
"learning_rate": 8.491743574575743e-05,
"loss": 0.0002,
"reward": 0.7781250476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 249
},
{
"completion_length": 721.625,
"epoch": 0.5193456245131135,
"grad_norm": 0.0756940096616745,
"kl": 0.31403973512351513,
"learning_rate": 8.479976510205387e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.22097086161375046,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 250
},
{
"completion_length": 748.046875,
"epoch": 0.521423007011166,
"grad_norm": 0.07090619206428528,
"kl": 0.2569838650524616,
"learning_rate": 8.468171944062755e-05,
"loss": 0.0001,
"reward": 0.7929687947034836,
"reward_std": 0.22207572311162949,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19921875186264515,
"step": 251
},
{
"completion_length": 693.609375,
"epoch": 0.5235003895092184,
"grad_norm": 0.06538081914186478,
"kl": 0.2928556613624096,
"learning_rate": 8.456330003359093e-05,
"loss": 0.0001,
"reward": 0.8250000476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 252
},
{
"completion_length": 728.953125,
"epoch": 0.5255777720072708,
"grad_norm": 0.09474781900644302,
"kl": 0.27900537475943565,
"learning_rate": 8.444450815708415e-05,
"loss": 0.0001,
"reward": 0.8250000476837158,
"reward_std": 0.3977475557476282,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 253
},
{
"completion_length": 737.625,
"epoch": 0.5276551545053233,
"grad_norm": 0.06914320588111877,
"kl": 0.26191011257469654,
"learning_rate": 8.432534509126122e-05,
"loss": 0.0001,
"reward": 0.7468750439584255,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 254
},
{
"completion_length": 743.609375,
"epoch": 0.5297325370033757,
"grad_norm": 0.05855982005596161,
"kl": 0.26190576888620853,
"learning_rate": 8.420581212027624e-05,
"loss": 0.0001,
"reward": 0.8875000476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 255
},
{
"completion_length": 768.265625,
"epoch": 0.5318099195014282,
"grad_norm": 0.058118585497140884,
"kl": 0.2930552177131176,
"learning_rate": 8.408591053226964e-05,
"loss": 0.0001,
"reward": 0.9492187947034836,
"reward_std": 0.13368737325072289,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.1992187537252903,
"step": 256
},
{
"completion_length": 763.0,
"epoch": 0.5338873019994806,
"grad_norm": 0.07115372270345688,
"kl": 0.35762836039066315,
"learning_rate": 8.396564161935411e-05,
"loss": 0.0002,
"reward": 0.8710937947034836,
"reward_std": 0.1999786328524351,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 257
},
{
"completion_length": 865.8125,
"epoch": 0.5359646844975331,
"grad_norm": 0.06384899467229843,
"kl": 0.2847513500601053,
"learning_rate": 8.38450066776009e-05,
"loss": 0.0001,
"reward": 0.8375000506639481,
"reward_std": 0.2032931987196207,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.19687500223517418,
"step": 258
},
{
"completion_length": 691.734375,
"epoch": 0.5380420669955855,
"grad_norm": 0.08122014999389648,
"kl": 0.2869179602712393,
"learning_rate": 8.37240070070257e-05,
"loss": 0.0001,
"reward": 0.7937500439584255,
"reward_std": 0.30935921147465706,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 259
},
{
"completion_length": 759.203125,
"epoch": 0.540119449493638,
"grad_norm": 0.06432370841503143,
"kl": 0.3190025221556425,
"learning_rate": 8.360264391157471e-05,
"loss": 0.0002,
"reward": 0.9500000476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.20000000298023224,
"step": 260
},
{
"completion_length": 872.890625,
"epoch": 0.5421968319916904,
"grad_norm": 0.08087541162967682,
"kl": 0.2903926521539688,
"learning_rate": 8.348091869911054e-05,
"loss": 0.0001,
"reward": 0.7554687969386578,
"reward_std": 0.27510873042047024,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.19296875409781933,
"step": 261
},
{
"completion_length": 868.875,
"epoch": 0.5442742144897429,
"grad_norm": 0.06983164697885513,
"kl": 0.25976957008242607,
"learning_rate": 8.335883268139813e-05,
"loss": 0.0001,
"reward": 0.8062500506639481,
"reward_std": 0.247487373650074,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 262
},
{
"completion_length": 781.53125,
"epoch": 0.5463515969877953,
"grad_norm": 0.07666690647602081,
"kl": 0.286643173545599,
"learning_rate": 8.323638717409061e-05,
"loss": 0.0001,
"reward": 0.8406250476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 263
},
{
"completion_length": 768.796875,
"epoch": 0.5484289794858478,
"grad_norm": 0.06480922549962997,
"kl": 0.30170151591300964,
"learning_rate": 8.311358349671517e-05,
"loss": 0.0002,
"reward": 0.7625000476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 264
},
{
"completion_length": 874.328125,
"epoch": 0.5505063619839002,
"grad_norm": 0.06416033208370209,
"kl": 0.28673115372657776,
"learning_rate": 8.299042297265876e-05,
"loss": 0.0001,
"reward": 0.8843750506639481,
"reward_std": 0.22539028525352478,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19687500223517418,
"step": 265
},
{
"completion_length": 790.890625,
"epoch": 0.5525837444819527,
"grad_norm": 0.06319725513458252,
"kl": 0.3224434554576874,
"learning_rate": 8.286690692915386e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 266
},
{
"completion_length": 837.4375,
"epoch": 0.5546611269800052,
"grad_norm": 0.07317644357681274,
"kl": 0.3563056066632271,
"learning_rate": 8.274303669726426e-05,
"loss": 0.0002,
"reward": 0.7906250432133675,
"reward_std": 0.22539028525352478,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 267
},
{
"completion_length": 718.125,
"epoch": 0.5567385094780577,
"grad_norm": 0.06230226531624794,
"kl": 0.30831460282206535,
"learning_rate": 8.261881361187054e-05,
"loss": 0.0002,
"reward": 0.9500000476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.20000000298023224,
"step": 268
},
{
"completion_length": 879.25,
"epoch": 0.5588158919761101,
"grad_norm": 0.07465776056051254,
"kl": 0.47362302988767624,
"learning_rate": 8.249423901165584e-05,
"loss": 0.0002,
"reward": 0.8535156697034836,
"reward_std": 0.22483785264194012,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19726562686264515,
"step": 269
},
{
"completion_length": 669.40625,
"epoch": 0.5608932744741626,
"grad_norm": 0.07374807447195053,
"kl": 0.329727228730917,
"learning_rate": 8.236931423909138e-05,
"loss": 0.0002,
"reward": 0.7773437947034836,
"reward_std": 0.19997863098978996,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.1992187537252903,
"step": 270
},
{
"completion_length": 732.3125,
"epoch": 0.562970656972215,
"grad_norm": 0.0676698312163353,
"kl": 0.3591331150382757,
"learning_rate": 8.2244040640422e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 271
},
{
"completion_length": 840.921875,
"epoch": 0.5650480394702675,
"grad_norm": 0.0682259052991867,
"kl": 0.35003719478845596,
"learning_rate": 8.21184195656516e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 272
},
{
"completion_length": 722.375,
"epoch": 0.5671254219683199,
"grad_norm": 0.05751950666308403,
"kl": 0.37356993556022644,
"learning_rate": 8.199245236852871e-05,
"loss": 0.0002,
"reward": 0.6843750476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 273
},
{
"completion_length": 679.171875,
"epoch": 0.5692028044663724,
"grad_norm": 0.09351193159818649,
"kl": 0.3799058124423027,
"learning_rate": 8.186614040653176e-05,
"loss": 0.0002,
"reward": 0.9343750476837158,
"reward_std": 0.33145629428327084,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 274
},
{
"completion_length": 767.84375,
"epoch": 0.5712801869644248,
"grad_norm": 0.06785906106233597,
"kl": 0.3164171427488327,
"learning_rate": 8.173948504085454e-05,
"loss": 0.0002,
"reward": 0.8242187947034836,
"reward_std": 0.17788154631853104,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19921875186264515,
"step": 275
},
{
"completion_length": 700.25,
"epoch": 0.5733575694624773,
"grad_norm": 0.06914710998535156,
"kl": 0.3422697074711323,
"learning_rate": 8.161248763639153e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 276
},
{
"completion_length": 662.9375,
"epoch": 0.5754349519605297,
"grad_norm": 0.05800582095980644,
"kl": 0.37523847445845604,
"learning_rate": 8.148514956172315e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.1325825173407793,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 277
},
{
"completion_length": 834.5625,
"epoch": 0.5775123344585822,
"grad_norm": 0.06169675290584564,
"kl": 0.31875982135534286,
"learning_rate": 8.135747218910104e-05,
"loss": 0.0002,
"reward": 0.8367187976837158,
"reward_std": 0.20439805276691914,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.1960937511175871,
"step": 278
},
{
"completion_length": 766.546875,
"epoch": 0.5795897169566346,
"grad_norm": 0.08040869235992432,
"kl": 1.0613461509346962,
"learning_rate": 8.122945689443328e-05,
"loss": 0.0005,
"reward": 0.8703125417232513,
"reward_std": 0.1568893175572157,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19843750074505806,
"step": 279
},
{
"completion_length": 737.5625,
"epoch": 0.5816670994546871,
"grad_norm": 0.0702415257692337,
"kl": 0.34963829442858696,
"learning_rate": 8.11011050572695e-05,
"loss": 0.0002,
"reward": 0.8222656659781933,
"reward_std": 0.22483785450458527,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19726562686264515,
"step": 280
},
{
"completion_length": 753.703125,
"epoch": 0.5837444819527395,
"grad_norm": 0.07661338895559311,
"kl": 0.38233664259314537,
"learning_rate": 8.097241806078615e-05,
"loss": 0.0002,
"reward": 0.7929687909781933,
"reward_std": 0.26626989245414734,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19921875186264515,
"step": 281
},
{
"completion_length": 880.609375,
"epoch": 0.585821864450792,
"grad_norm": 0.07432933151721954,
"kl": 0.42199838161468506,
"learning_rate": 8.084339729177142e-05,
"loss": 0.0002,
"reward": 0.8500000461935997,
"reward_std": 0.27400387451052666,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19375000335276127,
"step": 282
},
{
"completion_length": 778.453125,
"epoch": 0.5878992469488444,
"grad_norm": 0.07835783809423447,
"kl": 0.36370869539678097,
"learning_rate": 8.071404414061041e-05,
"loss": 0.0002,
"reward": 0.8207031637430191,
"reward_std": 0.2712417396251112,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19570313021540642,
"step": 283
},
{
"completion_length": 806.515625,
"epoch": 0.5899766294468969,
"grad_norm": 0.048540204763412476,
"kl": 0.3912508450448513,
"learning_rate": 8.058436000127014e-05,
"loss": 0.0002,
"reward": 0.8679687865078449,
"reward_std": 0.1602038759738207,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.1960937511175871,
"step": 284
},
{
"completion_length": 859.765625,
"epoch": 0.5920540119449493,
"grad_norm": 0.06207654997706413,
"kl": 0.31765272468328476,
"learning_rate": 8.045434627128446e-05,
"loss": 0.0002,
"reward": 0.9312500506639481,
"reward_std": 0.2032931987196207,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 285
},
{
"completion_length": 710.453125,
"epoch": 0.5941313944430018,
"grad_norm": 0.08810100704431534,
"kl": 0.40968091040849686,
"learning_rate": 8.032400435173907e-05,
"loss": 0.0002,
"reward": 0.8542969226837158,
"reward_std": 0.31212134286761284,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19804687798023224,
"step": 286
},
{
"completion_length": 700.296875,
"epoch": 0.5962087769410542,
"grad_norm": 0.07407598942518234,
"kl": 0.3017115257680416,
"learning_rate": 8.019333564725639e-05,
"loss": 0.0002,
"reward": 0.9476562887430191,
"reward_std": 0.18009125301614404,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.19765625521540642,
"step": 287
},
{
"completion_length": 628.984375,
"epoch": 0.5982861594391067,
"grad_norm": 0.05131203308701515,
"kl": 0.3888060562312603,
"learning_rate": 8.006234156598042e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.0883883461356163,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 288
},
{
"completion_length": 648.28125,
"epoch": 0.6003635419371591,
"grad_norm": 0.07319964468479156,
"kl": 0.3936074487864971,
"learning_rate": 7.99310235195615e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 289
},
{
"completion_length": 788.953125,
"epoch": 0.6024409244352117,
"grad_norm": 0.07722538709640503,
"kl": 0.35653146356344223,
"learning_rate": 7.979938292314129e-05,
"loss": 0.0002,
"reward": 0.8386719189584255,
"reward_std": 0.24583008512854576,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.19804687798023224,
"step": 290
},
{
"completion_length": 679.46875,
"epoch": 0.6045183069332641,
"grad_norm": 0.03349410742521286,
"kl": 0.35145866870880127,
"learning_rate": 7.966742119533723e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.04419417306780815,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 291
},
{
"completion_length": 761.25,
"epoch": 0.6065956894313166,
"grad_norm": 0.06922980397939682,
"kl": 0.33772632107138634,
"learning_rate": 7.953513975822755e-05,
"loss": 0.0002,
"reward": 0.8242187947034836,
"reward_std": 0.2220757193863392,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19921875186264515,
"step": 292
},
{
"completion_length": 618.25,
"epoch": 0.608673071929369,
"grad_norm": 0.07786116003990173,
"kl": 0.5136113204061985,
"learning_rate": 7.940254003733578e-05,
"loss": 0.0003,
"reward": 0.7781250476837158,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 293
},
{
"completion_length": 704.921875,
"epoch": 0.6107504544274215,
"grad_norm": 0.0848776176571846,
"kl": 0.4174853079020977,
"learning_rate": 7.926962346161535e-05,
"loss": 0.0002,
"reward": 0.699218787252903,
"reward_std": 0.22207571775652468,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.1992187537252903,
"step": 294
},
{
"completion_length": 657.734375,
"epoch": 0.6128278369254739,
"grad_norm": 0.0675949826836586,
"kl": 0.4570797383785248,
"learning_rate": 7.913639146343435e-05,
"loss": 0.0002,
"reward": 0.7937500439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 295
},
{
"completion_length": 689.328125,
"epoch": 0.6149052194235264,
"grad_norm": 0.07435144484043121,
"kl": 0.3593181371688843,
"learning_rate": 7.900284547855991e-05,
"loss": 0.0002,
"reward": 0.8691406697034836,
"reward_std": 0.2469349391758442,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19726562686264515,
"step": 296
},
{
"completion_length": 783.453125,
"epoch": 0.6169826019215788,
"grad_norm": 0.07517191022634506,
"kl": 0.7363171242177486,
"learning_rate": 7.886898694614291e-05,
"loss": 0.0004,
"reward": 0.8375000469386578,
"reward_std": 0.20329319685697556,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.19687500223517418,
"step": 297
},
{
"completion_length": 665.171875,
"epoch": 0.6190599844196313,
"grad_norm": 0.07602944225072861,
"kl": 0.4283002242445946,
"learning_rate": 7.873481730870232e-05,
"loss": 0.0002,
"reward": 0.7781250439584255,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 298
},
{
"completion_length": 741.75,
"epoch": 0.6211373669176837,
"grad_norm": 0.07438351958990097,
"kl": 0.2955322675406933,
"learning_rate": 7.860033801210976e-05,
"loss": 0.0001,
"reward": 0.8250000439584255,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 299
},
{
"completion_length": 719.9375,
"epoch": 0.6232147494157362,
"grad_norm": 0.08875050395727158,
"kl": 0.34281647577881813,
"learning_rate": 7.84655505055738e-05,
"loss": 0.0002,
"reward": 0.7125000432133675,
"reward_std": 0.38006988912820816,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.19687500409781933,
"step": 300
},
{
"completion_length": 755.453125,
"epoch": 0.6252921319137886,
"grad_norm": 0.07758081704378128,
"kl": 0.29608317092061043,
"learning_rate": 7.833045624162452e-05,
"loss": 0.0001,
"reward": 0.7781250476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 301
},
{
"completion_length": 721.40625,
"epoch": 0.6273695144118411,
"grad_norm": 0.07114533334970474,
"kl": 0.5064779743552208,
"learning_rate": 7.819505667609767e-05,
"loss": 0.0003,
"reward": 0.7468750439584255,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 302
},
{
"completion_length": 804.59375,
"epoch": 0.6294468969098935,
"grad_norm": 0.06897041946649551,
"kl": 0.3391858469694853,
"learning_rate": 7.805935326811912e-05,
"loss": 0.0002,
"reward": 0.9500000476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.20000000298023224,
"step": 303
},
{
"completion_length": 723.96875,
"epoch": 0.631524279407946,
"grad_norm": 0.07760775089263916,
"kl": 0.3714125622063875,
"learning_rate": 7.792334748008905e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 304
},
{
"completion_length": 694.328125,
"epoch": 0.6336016619059984,
"grad_norm": 0.08604968339204788,
"kl": 0.3233291208744049,
"learning_rate": 7.77870407776662e-05,
"loss": 0.0002,
"reward": 0.7000000476837158,
"reward_std": 0.3093592058867216,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.20000000298023224,
"step": 305
},
{
"completion_length": 875.234375,
"epoch": 0.6356790444040509,
"grad_norm": 0.07271739840507507,
"kl": 0.2942599691450596,
"learning_rate": 7.765043462975217e-05,
"loss": 0.0001,
"reward": 0.7464844100177288,
"reward_std": 0.1817485373467207,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.18398437649011612,
"step": 306
},
{
"completion_length": 674.65625,
"epoch": 0.6377564269021033,
"grad_norm": 0.06499814242124557,
"kl": 0.6551753357052803,
"learning_rate": 7.751353050847545e-05,
"loss": 0.0003,
"reward": 0.6683594062924385,
"reward_std": 0.13313494622707367,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.19960937649011612,
"step": 307
},
{
"completion_length": 794.671875,
"epoch": 0.6398338094001558,
"grad_norm": 0.07812398672103882,
"kl": 0.29106237180531025,
"learning_rate": 7.737632988917564e-05,
"loss": 0.0001,
"reward": 0.8218750506639481,
"reward_std": 0.3137786276638508,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19687500223517418,
"step": 308
},
{
"completion_length": 726.0,
"epoch": 0.6419111918982082,
"grad_norm": 0.08285919576883316,
"kl": 0.3495354764163494,
"learning_rate": 7.723883425038758e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 309
},
{
"completion_length": 862.09375,
"epoch": 0.6439885743962607,
"grad_norm": 0.06892167776823044,
"kl": 0.318182036280632,
"learning_rate": 7.710104507382531e-05,
"loss": 0.0002,
"reward": 0.7753906697034836,
"reward_std": 0.24693494103848934,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19726562686264515,
"step": 310
},
{
"completion_length": 705.40625,
"epoch": 0.6460659568943131,
"grad_norm": 0.053015708923339844,
"kl": 0.30275189504027367,
"learning_rate": 7.696296384436619e-05,
"loss": 0.0002,
"reward": 0.7781250402331352,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 311
},
{
"completion_length": 689.0,
"epoch": 0.6481433393923656,
"grad_norm": 0.08785798400640488,
"kl": 0.3134246002882719,
"learning_rate": 7.682459205003483e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.2872621212154627,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 312
},
{
"completion_length": 875.421875,
"epoch": 0.650220721890418,
"grad_norm": 0.07502438127994537,
"kl": 0.33200008049607277,
"learning_rate": 7.668593118198719e-05,
"loss": 0.0002,
"reward": 0.8218750506639481,
"reward_std": 0.26958445087075233,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19687500409781933,
"step": 313
},
{
"completion_length": 708.828125,
"epoch": 0.6522981043884706,
"grad_norm": 0.08758591115474701,
"kl": 0.3114005923271179,
"learning_rate": 7.654698273449435e-05,
"loss": 0.0002,
"reward": 0.9179687947034836,
"reward_std": 0.31046406738460064,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 314
},
{
"completion_length": 843.875,
"epoch": 0.654375486886523,
"grad_norm": 0.06280484795570374,
"kl": 0.2624143324792385,
"learning_rate": 7.640774820492647e-05,
"loss": 0.0001,
"reward": 0.7937500439584255,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 315
},
{
"completion_length": 676.296875,
"epoch": 0.6564528693845755,
"grad_norm": 0.06573140621185303,
"kl": 0.29568540304899216,
"learning_rate": 7.626822909373667e-05,
"loss": 0.0001,
"reward": 0.7781250402331352,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 316
},
{
"completion_length": 708.625,
"epoch": 0.6585302518826279,
"grad_norm": 0.07694177329540253,
"kl": 0.2915249727666378,
"learning_rate": 7.612842690444486e-05,
"loss": 0.0001,
"reward": 0.9648437947034836,
"reward_std": 0.2441728077828884,
"rewards/argmax_reward_func": 0.765625,
"rewards/format_reward_func": 0.1992187537252903,
"step": 317
},
{
"completion_length": 656.375,
"epoch": 0.6606076343806804,
"grad_norm": 0.09348881989717484,
"kl": 0.3169392794370651,
"learning_rate": 7.598834314362151e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.3314562924206257,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 318
},
{
"completion_length": 640.75,
"epoch": 0.6626850168787328,
"grad_norm": 0.07280497252941132,
"kl": 0.2968177553266287,
"learning_rate": 7.584797932087145e-05,
"loss": 0.0001,
"reward": 0.8710937947034836,
"reward_std": 0.19997863844037056,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19921875186264515,
"step": 319
},
{
"completion_length": 694.359375,
"epoch": 0.6647623993767853,
"grad_norm": 0.09892084449529648,
"kl": 0.5367627218365669,
"learning_rate": 7.570733694881755e-05,
"loss": 0.0003,
"reward": 0.9031250439584255,
"reward_std": 0.28726212307810783,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 320
},
{
"completion_length": 653.953125,
"epoch": 0.6668397818748377,
"grad_norm": 0.07763518393039703,
"kl": 0.31165359169244766,
"learning_rate": 7.556641754308447e-05,
"loss": 0.0002,
"reward": 0.8406250439584255,
"reward_std": 0.24306794814765453,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 321
},
{
"completion_length": 738.703125,
"epoch": 0.6689171643728902,
"grad_norm": 0.0881708562374115,
"kl": 0.3136756382882595,
"learning_rate": 7.542522262228231e-05,
"loss": 0.0002,
"reward": 0.8085937947034836,
"reward_std": 0.33256115205585957,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.1992187537252903,
"step": 322
},
{
"completion_length": 757.421875,
"epoch": 0.6709945468709426,
"grad_norm": 0.0727957934141159,
"kl": 0.28189600445330143,
"learning_rate": 7.528375370799024e-05,
"loss": 0.0001,
"reward": 0.8093750476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 323
},
{
"completion_length": 660.125,
"epoch": 0.6730719293689951,
"grad_norm": 0.068515844643116,
"kl": 0.29710386879742146,
"learning_rate": 7.514201232474011e-05,
"loss": 0.0001,
"reward": 0.8562500439584255,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 324
},
{
"completion_length": 704.8125,
"epoch": 0.6751493118670475,
"grad_norm": 0.07097381353378296,
"kl": 0.31055452302098274,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0002,
"reward": 0.8406250439584255,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 325
},
{
"completion_length": 680.921875,
"epoch": 0.6772266943651,
"grad_norm": 0.06986773759126663,
"kl": 0.3125472627580166,
"learning_rate": 7.48577182641578e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 326
},
{
"completion_length": 744.046875,
"epoch": 0.6793040768631524,
"grad_norm": 0.06576069444417953,
"kl": 0.3361051678657532,
"learning_rate": 7.471516865050467e-05,
"loss": 0.0002,
"reward": 0.9343750476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 327
},
{
"completion_length": 654.0,
"epoch": 0.6813814593612049,
"grad_norm": 0.07205154001712799,
"kl": 0.30227479338645935,
"learning_rate": 7.457235269521856e-05,
"loss": 0.0002,
"reward": 0.7617187909781933,
"reward_std": 0.17788154468871653,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.1992187537252903,
"step": 328
},
{
"completion_length": 647.859375,
"epoch": 0.6834588418592573,
"grad_norm": 0.0794130265712738,
"kl": 0.4595659039914608,
"learning_rate": 7.44292719373476e-05,
"loss": 0.0002,
"reward": 0.9343750476837158,
"reward_std": 0.24306794628500938,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 329
},
{
"completion_length": 735.078125,
"epoch": 0.6855362243573098,
"grad_norm": 0.0897228941321373,
"kl": 0.3426021710038185,
"learning_rate": 7.428592791879361e-05,
"loss": 0.0002,
"reward": 0.7781250439584255,
"reward_std": 0.33145629800856113,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 330
},
{
"completion_length": 853.703125,
"epoch": 0.6876136068553622,
"grad_norm": 0.07609646022319794,
"kl": 0.26740806736052036,
"learning_rate": 7.414232218429537e-05,
"loss": 0.0001,
"reward": 0.9156250506639481,
"reward_std": 0.26958445832133293,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.19687500223517418,
"step": 331
},
{
"completion_length": 616.890625,
"epoch": 0.6896909893534147,
"grad_norm": 0.09115231037139893,
"kl": 0.334526427090168,
"learning_rate": 7.399845628141206e-05,
"loss": 0.0002,
"reward": 0.8718750439584255,
"reward_std": 0.2872621212154627,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 332
},
{
"completion_length": 622.765625,
"epoch": 0.6917683718514671,
"grad_norm": 0.08646494895219803,
"kl": 0.3055717647075653,
"learning_rate": 7.385433176050653e-05,
"loss": 0.0002,
"reward": 0.8710937909781933,
"reward_std": 0.2883669827133417,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19921875186264515,
"step": 333
},
{
"completion_length": 686.921875,
"epoch": 0.6938457543495196,
"grad_norm": 0.0787225142121315,
"kl": 0.3159499131143093,
"learning_rate": 7.370995017472863e-05,
"loss": 0.0002,
"reward": 0.8531250506639481,
"reward_std": 0.26958445459604263,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19687500223517418,
"step": 334
},
{
"completion_length": 634.703125,
"epoch": 0.695923136847572,
"grad_norm": 0.09521856158971786,
"kl": 0.3115619271993637,
"learning_rate": 7.356531307999843e-05,
"loss": 0.0002,
"reward": 0.7468750476837158,
"reward_std": 0.375650467351079,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 335
},
{
"completion_length": 753.703125,
"epoch": 0.6980005193456245,
"grad_norm": 0.09729248285293579,
"kl": 0.3037104904651642,
"learning_rate": 7.342042203498951e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.3977475520223379,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 336
},
{
"completion_length": 674.0,
"epoch": 0.700077901843677,
"grad_norm": 0.09549879282712936,
"kl": 0.3098057843744755,
"learning_rate": 7.32752786011121e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.37565046921372414,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 337
},
{
"completion_length": 673.1875,
"epoch": 0.7021552843417295,
"grad_norm": 0.08708694577217102,
"kl": 0.29914069548249245,
"learning_rate": 7.312988434249632e-05,
"loss": 0.0001,
"reward": 0.9031250476837158,
"reward_std": 0.33145629800856113,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 338
},
{
"completion_length": 699.65625,
"epoch": 0.7042326668397819,
"grad_norm": 0.09121581166982651,
"kl": 0.31991639360785484,
"learning_rate": 7.298424082597526e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.3093592096120119,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 339
},
{
"completion_length": 667.546875,
"epoch": 0.7063100493378344,
"grad_norm": 0.08295177668333054,
"kl": 0.3119734339416027,
"learning_rate": 7.283834962106811e-05,
"loss": 0.0002,
"reward": 0.6656250394880772,
"reward_std": 0.31377863325178623,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.19687500409781933,
"step": 340
},
{
"completion_length": 734.140625,
"epoch": 0.7083874318358868,
"grad_norm": 0.07721901684999466,
"kl": 0.2920740433037281,
"learning_rate": 7.269221229996331e-05,
"loss": 0.0001,
"reward": 0.8875000476837158,
"reward_std": 0.30935920774936676,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 341
},
{
"completion_length": 723.78125,
"epoch": 0.7104648143339393,
"grad_norm": 0.07446262985467911,
"kl": 0.31096627190709114,
"learning_rate": 7.254583043750151e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 342
},
{
"completion_length": 598.140625,
"epoch": 0.7125421968319917,
"grad_norm": 0.07494507730007172,
"kl": 0.3281702548265457,
"learning_rate": 7.239920561115867e-05,
"loss": 0.0002,
"reward": 0.7000000476837158,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.20000000298023224,
"step": 343
},
{
"completion_length": 669.484375,
"epoch": 0.7146195793300442,
"grad_norm": 0.06954500079154968,
"kl": 0.29709911718964577,
"learning_rate": 7.225233940102906e-05,
"loss": 0.0001,
"reward": 0.9343750476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 344
},
{
"completion_length": 738.015625,
"epoch": 0.7166969618280966,
"grad_norm": 0.08409620076417923,
"kl": 0.3333327900618315,
"learning_rate": 7.210523338980813e-05,
"loss": 0.0002,
"reward": 0.8398437947034836,
"reward_std": 0.2883669827133417,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.1992187537252903,
"step": 345
},
{
"completion_length": 596.5,
"epoch": 0.7187743443261491,
"grad_norm": 0.07967247068881989,
"kl": 0.3089658170938492,
"learning_rate": 7.195788916277565e-05,
"loss": 0.0002,
"reward": 0.7929687947034836,
"reward_std": 0.22207571775652468,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19921875186264515,
"step": 346
},
{
"completion_length": 717.390625,
"epoch": 0.7208517268242015,
"grad_norm": 0.0864432230591774,
"kl": 0.3028757870197296,
"learning_rate": 7.181030830777837e-05,
"loss": 0.0002,
"reward": 0.8843750506639481,
"reward_std": 0.2695844564586878,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19687500223517418,
"step": 347
},
{
"completion_length": 716.515625,
"epoch": 0.722929109322254,
"grad_norm": 0.07595375925302505,
"kl": 0.30636318400502205,
"learning_rate": 7.166249241521318e-05,
"loss": 0.0002,
"reward": 0.7898437976837158,
"reward_std": 0.22649514116346836,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19609375298023224,
"step": 348
},
{
"completion_length": 584.3125,
"epoch": 0.7250064918203064,
"grad_norm": 0.10229937732219696,
"kl": 0.32858528569340706,
"learning_rate": 7.151444307800975e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.3756504710763693,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 349
},
{
"completion_length": 590.1875,
"epoch": 0.7270838743183589,
"grad_norm": 0.07948501408100128,
"kl": 0.3115417957305908,
"learning_rate": 7.13661618916135e-05,
"loss": 0.0002,
"reward": 0.7937500439584255,
"reward_std": 0.2651650346815586,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 350
},
{
"completion_length": 618.46875,
"epoch": 0.7291612568164113,
"grad_norm": 0.06686828285455704,
"kl": 0.32769910246133804,
"learning_rate": 7.121765045396834e-05,
"loss": 0.0002,
"reward": 0.8867187947034836,
"reward_std": 0.17788154655136168,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 351
},
{
"completion_length": 600.5625,
"epoch": 0.7312386393144638,
"grad_norm": 0.07947742938995361,
"kl": 0.3473210446536541,
"learning_rate": 7.106891036549945e-05,
"loss": 0.0002,
"reward": 0.7937500439584255,
"reward_std": 0.2651650384068489,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 352
},
{
"completion_length": 551.78125,
"epoch": 0.7333160218125162,
"grad_norm": 0.04208023473620415,
"kl": 0.35688477009534836,
"learning_rate": 7.091994322909611e-05,
"loss": 0.0002,
"reward": 0.9968750476837158,
"reward_std": 0.06629125960171223,
"rewards/argmax_reward_func": 0.796875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 353
},
{
"completion_length": 576.609375,
"epoch": 0.7353934043105687,
"grad_norm": 0.06657633185386658,
"kl": 0.32345687225461006,
"learning_rate": 7.077075065009433e-05,
"loss": 0.0002,
"reward": 0.7156250476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 354
},
{
"completion_length": 552.921875,
"epoch": 0.7374707868086211,
"grad_norm": 0.0544576533138752,
"kl": 0.3251136727631092,
"learning_rate": 7.062133423625959e-05,
"loss": 0.0002,
"reward": 0.8406250439584255,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 355
},
{
"completion_length": 606.703125,
"epoch": 0.7395481693066736,
"grad_norm": 0.07147221267223358,
"kl": 0.3404123783111572,
"learning_rate": 7.04716955977695e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 356
},
{
"completion_length": 605.28125,
"epoch": 0.741625551804726,
"grad_norm": 0.05363324284553528,
"kl": 0.3204925172030926,
"learning_rate": 7.03218363471965e-05,
"loss": 0.0002,
"reward": 0.9500000439584255,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.20000000298023224,
"step": 357
},
{
"completion_length": 563.65625,
"epoch": 0.7437029343027785,
"grad_norm": 0.041013430804014206,
"kl": 0.3419278897345066,
"learning_rate": 7.017175809949044e-05,
"loss": 0.0002,
"reward": 0.8562500439584255,
"reward_std": 0.0883883461356163,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 358
},
{
"completion_length": 557.15625,
"epoch": 0.7457803168008309,
"grad_norm": 0.06874032318592072,
"kl": 0.3553139455616474,
"learning_rate": 7.002146247196113e-05,
"loss": 0.0002,
"reward": 0.776562537997961,
"reward_std": 0.1568893138319254,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.1984375026077032,
"step": 359
},
{
"completion_length": 567.15625,
"epoch": 0.7478576992988835,
"grad_norm": 0.0703793615102768,
"kl": 0.33949872851371765,
"learning_rate": 6.987095108426101e-05,
"loss": 0.0002,
"reward": 0.7312500402331352,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 360
},
{
"completion_length": 552.984375,
"epoch": 0.7499350817969359,
"grad_norm": 0.06514879316091537,
"kl": 0.3468449302017689,
"learning_rate": 6.972022555836764e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 361
},
{
"completion_length": 540.453125,
"epoch": 0.7520124642949884,
"grad_norm": 0.08254203200340271,
"kl": 0.36483363062143326,
"learning_rate": 6.956928751856623e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 362
},
{
"completion_length": 553.078125,
"epoch": 0.7540898467930408,
"grad_norm": 0.08209247887134552,
"kl": 0.37783167138695717,
"learning_rate": 6.94181385914321e-05,
"loss": 0.0002,
"reward": 0.667187537997961,
"reward_std": 0.17898640409111977,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.1984375026077032,
"step": 363
},
{
"completion_length": 610.234375,
"epoch": 0.7561672292910933,
"grad_norm": 0.062447499483823776,
"kl": 0.34449223801493645,
"learning_rate": 6.926678040581323e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 364
},
{
"completion_length": 527.78125,
"epoch": 0.7582446117891457,
"grad_norm": 0.07009898126125336,
"kl": 0.36786164715886116,
"learning_rate": 6.911521459281265e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 365
},
{
"completion_length": 587.296875,
"epoch": 0.7603219942871982,
"grad_norm": 0.06519950181245804,
"kl": 0.35170425847172737,
"learning_rate": 6.896344278577083e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 366
},
{
"completion_length": 583.96875,
"epoch": 0.7623993767852506,
"grad_norm": 0.051437534391880035,
"kl": 0.34650370851159096,
"learning_rate": 6.881146662024822e-05,
"loss": 0.0002,
"reward": 1.0593750476837158,
"reward_std": 0.11048543080687523,
"rewards/argmax_reward_func": 0.859375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 367
},
{
"completion_length": 538.484375,
"epoch": 0.764476759283303,
"grad_norm": 0.06923159956932068,
"kl": 0.38169170916080475,
"learning_rate": 6.865928773400743e-05,
"loss": 0.0002,
"reward": 0.8250000439584255,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 368
},
{
"completion_length": 580.03125,
"epoch": 0.7665541417813555,
"grad_norm": 0.0665920302271843,
"kl": 0.3636031821370125,
"learning_rate": 6.850690776699573e-05,
"loss": 0.0002,
"reward": 0.7289062924683094,
"reward_std": 0.13589708344079554,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.19765625149011612,
"step": 369
},
{
"completion_length": 562.015625,
"epoch": 0.768631524279408,
"grad_norm": 0.06946459412574768,
"kl": 0.4947234131395817,
"learning_rate": 6.835432836132731e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 370
},
{
"completion_length": 575.0625,
"epoch": 0.7707089067774604,
"grad_norm": 0.0689174011349678,
"kl": 0.3747940734028816,
"learning_rate": 6.820155116126561e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 371
},
{
"completion_length": 571.109375,
"epoch": 0.7727862892755128,
"grad_norm": 0.08710569888353348,
"kl": 0.39623570069670677,
"learning_rate": 6.804857781320558e-05,
"loss": 0.0002,
"reward": 0.7464844174683094,
"reward_std": 0.28670969791710377,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.19960937835276127,
"step": 372
},
{
"completion_length": 607.59375,
"epoch": 0.7748636717735653,
"grad_norm": 0.0731528028845787,
"kl": 0.3582250289618969,
"learning_rate": 6.789540996565593e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 373
},
{
"completion_length": 579.28125,
"epoch": 0.7769410542716177,
"grad_norm": 0.0625411793589592,
"kl": 0.3635551296174526,
"learning_rate": 6.774204926922145e-05,
"loss": 0.0002,
"reward": 0.8875000439584255,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 374
},
{
"completion_length": 599.3125,
"epoch": 0.7790184367696702,
"grad_norm": 0.08092815428972244,
"kl": 0.4265919253230095,
"learning_rate": 6.758849737658509e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 375
},
{
"completion_length": 547.796875,
"epoch": 0.7810958192677226,
"grad_norm": 0.07175435870885849,
"kl": 0.3616880625486374,
"learning_rate": 6.743475594249021e-05,
"loss": 0.0002,
"reward": 0.8843750432133675,
"reward_std": 0.18119611218571663,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.19687500223517418,
"step": 376
},
{
"completion_length": 591.609375,
"epoch": 0.7831732017657751,
"grad_norm": 0.07784335315227509,
"kl": 0.42067378014326096,
"learning_rate": 6.728082662372282e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 377
},
{
"completion_length": 531.859375,
"epoch": 0.7852505842638275,
"grad_norm": 0.07652134448289871,
"kl": 0.3934118077158928,
"learning_rate": 6.712671107909359e-05,
"loss": 0.0002,
"reward": 0.8875000439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 378
},
{
"completion_length": 542.640625,
"epoch": 0.78732796676188,
"grad_norm": 0.054136764258146286,
"kl": 0.4239979311823845,
"learning_rate": 6.697241096942006e-05,
"loss": 0.0002,
"reward": 0.8562500439584255,
"reward_std": 0.0883883461356163,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 379
},
{
"completion_length": 548.53125,
"epoch": 0.7894053492599324,
"grad_norm": 0.07836976647377014,
"kl": 0.4376937076449394,
"learning_rate": 6.681792795750875e-05,
"loss": 0.0002,
"reward": 0.7308594211935997,
"reward_std": 0.17732911929488182,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.19960937649011612,
"step": 380
},
{
"completion_length": 545.140625,
"epoch": 0.7914827317579849,
"grad_norm": 0.06161171570420265,
"kl": 0.526831716299057,
"learning_rate": 6.666326370813723e-05,
"loss": 0.0003,
"reward": 0.8562500476837158,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 381
},
{
"completion_length": 539.4375,
"epoch": 0.7935601142560373,
"grad_norm": 0.050724372267723083,
"kl": 0.4309442602097988,
"learning_rate": 6.650841988803606e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.0883883461356163,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 382
},
{
"completion_length": 515.109375,
"epoch": 0.7956374967540899,
"grad_norm": 0.08242635428905487,
"kl": 0.4333142638206482,
"learning_rate": 6.635339816587109e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 383
},
{
"completion_length": 582.0,
"epoch": 0.7977148792521424,
"grad_norm": 0.07576624304056168,
"kl": 0.40080199763178825,
"learning_rate": 6.619820021222518e-05,
"loss": 0.0002,
"reward": 0.8406250439584255,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 384
},
{
"completion_length": 563.59375,
"epoch": 0.7997922617501948,
"grad_norm": 0.08377435803413391,
"kl": 0.43326959386467934,
"learning_rate": 6.604282769958044e-05,
"loss": 0.0002,
"reward": 0.8089844211935997,
"reward_std": 0.2436203770339489,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.19960937649011612,
"step": 385
},
{
"completion_length": 548.859375,
"epoch": 0.8018696442482472,
"grad_norm": 0.09505198895931244,
"kl": 0.625109825283289,
"learning_rate": 6.588728230230004e-05,
"loss": 0.0003,
"reward": 0.7933594211935997,
"reward_std": 0.3088067825883627,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19960937649011612,
"step": 386
},
{
"completion_length": 534.046875,
"epoch": 0.8039470267462997,
"grad_norm": 0.09738834947347641,
"kl": 0.5551509782671928,
"learning_rate": 6.573156569661025e-05,
"loss": 0.0003,
"reward": 0.8703125491738319,
"reward_std": 0.289471834897995,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.1984375026077032,
"step": 387
},
{
"completion_length": 557.40625,
"epoch": 0.8060244092443521,
"grad_norm": 0.0654783695936203,
"kl": 0.42286501079797745,
"learning_rate": 6.557567956058239e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 388
},
{
"completion_length": 509.921875,
"epoch": 0.8081017917424046,
"grad_norm": 0.07516364008188248,
"kl": 0.5226034559309483,
"learning_rate": 6.541962557411469e-05,
"loss": 0.0003,
"reward": 0.8250000476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 389
},
{
"completion_length": 540.953125,
"epoch": 0.810179174240457,
"grad_norm": 0.08237718045711517,
"kl": 0.49187011271715164,
"learning_rate": 6.526340541891418e-05,
"loss": 0.0002,
"reward": 0.7937500476837158,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 390
},
{
"completion_length": 538.03125,
"epoch": 0.8122565567385095,
"grad_norm": 0.08174508810043335,
"kl": 0.45481956005096436,
"learning_rate": 6.510702077847863e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 391
},
{
"completion_length": 552.703125,
"epoch": 0.8143339392365619,
"grad_norm": 0.09640171378850937,
"kl": 0.45114999637007713,
"learning_rate": 6.495047333807842e-05,
"loss": 0.0002,
"reward": 0.7621094211935997,
"reward_std": 0.309911634773016,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.19960937835276127,
"step": 392
},
{
"completion_length": 555.75,
"epoch": 0.8164113217346144,
"grad_norm": 0.07019418478012085,
"kl": 0.44265756756067276,
"learning_rate": 6.479376478473823e-05,
"loss": 0.0002,
"reward": 0.9500000476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.20000000298023224,
"step": 393
},
{
"completion_length": 588.09375,
"epoch": 0.8184887042326668,
"grad_norm": 0.05258520692586899,
"kl": 0.4588502533733845,
"learning_rate": 6.463689680721904e-05,
"loss": 0.0002,
"reward": 0.8718750439584255,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 394
},
{
"completion_length": 527.578125,
"epoch": 0.8205660867307193,
"grad_norm": 0.08728921413421631,
"kl": 0.44911035895347595,
"learning_rate": 6.447987109599986e-05,
"loss": 0.0002,
"reward": 0.7937500476837158,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 395
},
{
"completion_length": 635.40625,
"epoch": 0.8226434692287717,
"grad_norm": 0.06634779274463654,
"kl": 0.38350560516119003,
"learning_rate": 6.432268934325946e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 396
},
{
"completion_length": 532.09375,
"epoch": 0.8247208517268242,
"grad_norm": 0.09231170266866684,
"kl": 0.46880777925252914,
"learning_rate": 6.416535324285824e-05,
"loss": 0.0002,
"reward": 0.6843750402331352,
"reward_std": 0.2872621212154627,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 397
},
{
"completion_length": 588.875,
"epoch": 0.8267982342248766,
"grad_norm": 0.07496833801269531,
"kl": 0.850627463310957,
"learning_rate": 6.400786449031986e-05,
"loss": 0.0004,
"reward": 0.8875000476837158,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 398
},
{
"completion_length": 511.953125,
"epoch": 0.8288756167229291,
"grad_norm": 0.06271515041589737,
"kl": 0.4049219489097595,
"learning_rate": 6.385022478281306e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 399
},
{
"completion_length": 555.046875,
"epoch": 0.8309529992209815,
"grad_norm": 0.07291208207607269,
"kl": 0.4224717430770397,
"learning_rate": 6.369243581913336e-05,
"loss": 0.0002,
"reward": 0.7781250439584255,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 400
},
{
"completion_length": 573.515625,
"epoch": 0.833030381719034,
"grad_norm": 0.06133547052741051,
"kl": 0.42930199950933456,
"learning_rate": 6.353449929968465e-05,
"loss": 0.0002,
"reward": 0.7781250439584255,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 401
},
{
"completion_length": 522.8125,
"epoch": 0.8351077642170864,
"grad_norm": 0.06863158941268921,
"kl": 0.4221891984343529,
"learning_rate": 6.337641692646106e-05,
"loss": 0.0002,
"reward": 0.9019531756639481,
"reward_std": 0.15633688867092133,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.19882812723517418,
"step": 402
},
{
"completion_length": 544.0,
"epoch": 0.8371851467151389,
"grad_norm": 0.07222079485654831,
"kl": 0.45428359508514404,
"learning_rate": 6.321819040302839e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 403
},
{
"completion_length": 527.671875,
"epoch": 0.8392625292131913,
"grad_norm": 0.08342251926660538,
"kl": 0.4082505330443382,
"learning_rate": 6.305982143450597e-05,
"loss": 0.0002,
"reward": 0.8402344286441803,
"reward_std": 0.24362037930404767,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.19960937649011612,
"step": 404
},
{
"completion_length": 544.171875,
"epoch": 0.8413399117112438,
"grad_norm": 0.064121775329113,
"kl": 0.43093303963541985,
"learning_rate": 6.290131172754811e-05,
"loss": 0.0002,
"reward": 0.9949219226837158,
"reward_std": 0.15744174271821976,
"rewards/argmax_reward_func": 0.796875,
"rewards/format_reward_func": 0.19804687798023224,
"step": 405
},
{
"completion_length": 517.578125,
"epoch": 0.8434172942092963,
"grad_norm": 0.08640465885400772,
"kl": 0.44819287210702896,
"learning_rate": 6.274266299032582e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 406
},
{
"completion_length": 536.734375,
"epoch": 0.8454946767073488,
"grad_norm": 0.09182075411081314,
"kl": 0.39375099167227745,
"learning_rate": 6.25838769325083e-05,
"loss": 0.0002,
"reward": 0.7625000476837158,
"reward_std": 0.3093592096120119,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 407
},
{
"completion_length": 537.265625,
"epoch": 0.8475720592054012,
"grad_norm": 0.05770527943968773,
"kl": 0.48194558918476105,
"learning_rate": 6.24249552652447e-05,
"loss": 0.0002,
"reward": 0.7292969226837158,
"reward_std": 0.13534465618431568,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.19804687798023224,
"step": 408
},
{
"completion_length": 536.84375,
"epoch": 0.8496494417034537,
"grad_norm": 0.07029449939727783,
"kl": 0.4273468554019928,
"learning_rate": 6.226589970114543e-05,
"loss": 0.0002,
"reward": 0.8406250439584255,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 409
},
{
"completion_length": 539.140625,
"epoch": 0.8517268242015061,
"grad_norm": 0.0789664089679718,
"kl": 0.4228878915309906,
"learning_rate": 6.210671195426387e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 410
},
{
"completion_length": 582.265625,
"epoch": 0.8538042066995586,
"grad_norm": 0.05472075939178467,
"kl": 0.39347052946686745,
"learning_rate": 6.194739374007792e-05,
"loss": 0.0002,
"reward": 0.8562500439584255,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 411
},
{
"completion_length": 511.125,
"epoch": 0.855881589197611,
"grad_norm": 0.08022020757198334,
"kl": 0.44926824048161507,
"learning_rate": 6.178794677547137e-05,
"loss": 0.0002,
"reward": 0.7312500439584255,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 412
},
{
"completion_length": 502.40625,
"epoch": 0.8579589716956635,
"grad_norm": 0.08022835850715637,
"kl": 0.4819503165781498,
"learning_rate": 6.162837277871553e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.24306794814765453,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 413
},
{
"completion_length": 537.1875,
"epoch": 0.8600363541937159,
"grad_norm": 0.06297382712364197,
"kl": 0.49380555003881454,
"learning_rate": 6.146867346945066e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 414
},
{
"completion_length": 521.859375,
"epoch": 0.8621137366917684,
"grad_norm": 0.07238580286502838,
"kl": 0.4831845983862877,
"learning_rate": 6.130885056866742e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 415
},
{
"completion_length": 538.84375,
"epoch": 0.8641911191898208,
"grad_norm": 0.070571668446064,
"kl": 0.5007887817919254,
"learning_rate": 6.114890579868837e-05,
"loss": 0.0003,
"reward": 0.8250000439584255,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 416
},
{
"completion_length": 573.25,
"epoch": 0.8662685016878733,
"grad_norm": 0.0768335610628128,
"kl": 0.4633421525359154,
"learning_rate": 6.098884088314938e-05,
"loss": 0.0002,
"reward": 0.8875000439584255,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 417
},
{
"completion_length": 627.6875,
"epoch": 0.8683458841859257,
"grad_norm": 0.07244177162647247,
"kl": 0.45967796072363853,
"learning_rate": 6.082865754698109e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 418
},
{
"completion_length": 558.171875,
"epoch": 0.8704232666839782,
"grad_norm": 0.07271222770214081,
"kl": 0.46447786316275597,
"learning_rate": 6.066835751639022e-05,
"loss": 0.0002,
"reward": 0.7925781682133675,
"reward_std": 0.22152329608798027,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.19882812909781933,
"step": 419
},
{
"completion_length": 539.90625,
"epoch": 0.8725006491820306,
"grad_norm": 0.05372535437345505,
"kl": 0.47306570410728455,
"learning_rate": 6.050794251884112e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 420
},
{
"completion_length": 579.59375,
"epoch": 0.8745780316800831,
"grad_norm": 0.06956978142261505,
"kl": 0.4903941936790943,
"learning_rate": 6.0347414283037004e-05,
"loss": 0.0002,
"reward": 0.7937500476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 421
},
{
"completion_length": 543.5625,
"epoch": 0.8766554141781355,
"grad_norm": 0.05839576572179794,
"kl": 0.46378039941191673,
"learning_rate": 6.018677453890149e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 422
},
{
"completion_length": 571.265625,
"epoch": 0.878732796676188,
"grad_norm": 0.08274129778146744,
"kl": 0.4939221628010273,
"learning_rate": 6.002602501755974e-05,
"loss": 0.0002,
"reward": 0.9656250476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.765625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 423
},
{
"completion_length": 562.875,
"epoch": 0.8808101791742404,
"grad_norm": 0.052250444889068604,
"kl": 0.4885864891111851,
"learning_rate": 5.9865167451320005e-05,
"loss": 0.0002,
"reward": 0.9031250439584255,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 424
},
{
"completion_length": 564.171875,
"epoch": 0.8828875616722929,
"grad_norm": 0.07596340775489807,
"kl": 0.4957350380718708,
"learning_rate": 5.970420357365486e-05,
"loss": 0.0002,
"reward": 0.6843750439584255,
"reward_std": 0.24306794814765453,
"rewards/argmax_reward_func": 0.484375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 425
},
{
"completion_length": 563.375,
"epoch": 0.8849649441703453,
"grad_norm": 0.0591680072247982,
"kl": 0.47624582052230835,
"learning_rate": 5.9543135119182514e-05,
"loss": 0.0002,
"reward": 0.7757812812924385,
"reward_std": 0.11269513890147209,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19765625335276127,
"step": 426
},
{
"completion_length": 607.78125,
"epoch": 0.8870423266683978,
"grad_norm": 0.0683642029762268,
"kl": 0.5224468521773815,
"learning_rate": 5.938196382364818e-05,
"loss": 0.0003,
"reward": 0.8718750439584255,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 427
},
{
"completion_length": 584.8125,
"epoch": 0.8891197091664502,
"grad_norm": 0.05711337924003601,
"kl": 0.4908281937241554,
"learning_rate": 5.9220691423905305e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.1325825173407793,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 428
},
{
"completion_length": 626.34375,
"epoch": 0.8911970916645027,
"grad_norm": 0.060382284224033356,
"kl": 0.586872935295105,
"learning_rate": 5.9059319657896884e-05,
"loss": 0.0003,
"reward": 0.9343750476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 429
},
{
"completion_length": 597.703125,
"epoch": 0.8932744741625552,
"grad_norm": 0.07129113376140594,
"kl": 0.6612692400813103,
"learning_rate": 5.889785026463672e-05,
"loss": 0.0003,
"reward": 0.8554687947034836,
"reward_std": 0.17788155190646648,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19921875186264515,
"step": 430
},
{
"completion_length": 568.046875,
"epoch": 0.8953518566606077,
"grad_norm": 0.07546839118003845,
"kl": 0.5523902028799057,
"learning_rate": 5.873628498419073e-05,
"loss": 0.0003,
"reward": 0.8250000476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 431
},
{
"completion_length": 573.609375,
"epoch": 0.8974292391586601,
"grad_norm": 0.06730344146490097,
"kl": 0.5382697433233261,
"learning_rate": 5.8574625557658095e-05,
"loss": 0.0003,
"reward": 0.7312500402331352,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.53125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 432
},
{
"completion_length": 576.84375,
"epoch": 0.8995066216567126,
"grad_norm": 0.04371188208460808,
"kl": 0.49328725039958954,
"learning_rate": 5.8412873727152595e-05,
"loss": 0.0002,
"reward": 0.8875000439584255,
"reward_std": 0.0883883461356163,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 433
},
{
"completion_length": 634.609375,
"epoch": 0.901584004154765,
"grad_norm": 0.05455589294433594,
"kl": 0.4651510939002037,
"learning_rate": 5.825103123578379e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 434
},
{
"completion_length": 607.640625,
"epoch": 0.9036613866528175,
"grad_norm": 0.06638182699680328,
"kl": 0.4994208887219429,
"learning_rate": 5.808909982763825e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 435
},
{
"completion_length": 577.171875,
"epoch": 0.9057387691508699,
"grad_norm": 0.05038674548268318,
"kl": 0.4841819517314434,
"learning_rate": 5.792708124776072e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 436
},
{
"completion_length": 618.765625,
"epoch": 0.9078161516489224,
"grad_norm": 0.06103122606873512,
"kl": 0.45625371113419533,
"learning_rate": 5.776497724213536e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 437
},
{
"completion_length": 577.34375,
"epoch": 0.9098935341469748,
"grad_norm": 0.06659764796495438,
"kl": 0.5014519467949867,
"learning_rate": 5.760278955766695e-05,
"loss": 0.0003,
"reward": 0.7937500439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.59375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 438
},
{
"completion_length": 588.828125,
"epoch": 0.9119709166450273,
"grad_norm": 0.06549356877803802,
"kl": 0.4932373948395252,
"learning_rate": 5.744051994216201e-05,
"loss": 0.0002,
"reward": 0.7781250476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 439
},
{
"completion_length": 587.828125,
"epoch": 0.9140482991430797,
"grad_norm": 0.08965161442756653,
"kl": 0.4944054037332535,
"learning_rate": 5.727817014430992e-05,
"loss": 0.0002,
"reward": 0.7777344174683094,
"reward_std": 0.28781455382704735,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19960937649011612,
"step": 440
},
{
"completion_length": 580.125,
"epoch": 0.9161256816411322,
"grad_norm": 0.07723158597946167,
"kl": 0.4910140074789524,
"learning_rate": 5.7115741913664264e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.24306795187294483,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 441
},
{
"completion_length": 598.890625,
"epoch": 0.9182030641391846,
"grad_norm": 0.068883016705513,
"kl": 0.481427326798439,
"learning_rate": 5.695323700062375e-05,
"loss": 0.0002,
"reward": 0.7000000439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.5,
"rewards/format_reward_func": 0.20000000298023224,
"step": 442
},
{
"completion_length": 600.140625,
"epoch": 0.9202804466372371,
"grad_norm": 0.07069353759288788,
"kl": 0.5068237520754337,
"learning_rate": 5.6790657156413504e-05,
"loss": 0.0003,
"reward": 0.714843787252903,
"reward_std": 0.1999786365777254,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.19921875186264515,
"step": 443
},
{
"completion_length": 587.625,
"epoch": 0.9223578291352895,
"grad_norm": 1.0064716339111328,
"kl": 9.217760100960732,
"learning_rate": 5.66280041330661e-05,
"loss": 0.0046,
"reward": 0.7781250439584255,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 444
},
{
"completion_length": 590.78125,
"epoch": 0.924435211633342,
"grad_norm": 0.2128666639328003,
"kl": 3.3808604292571545,
"learning_rate": 5.646527968340278e-05,
"loss": 0.0017,
"reward": 0.7625000476837158,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 445
},
{
"completion_length": 633.46875,
"epoch": 0.9265125941313944,
"grad_norm": 0.05456709861755371,
"kl": 0.49203020706772804,
"learning_rate": 5.6302485561014475e-05,
"loss": 0.0002,
"reward": 0.9500000439584255,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.20000000298023224,
"step": 446
},
{
"completion_length": 584.015625,
"epoch": 0.9285899766294469,
"grad_norm": 0.06649811565876007,
"kl": 0.47326431795954704,
"learning_rate": 5.613962352024292e-05,
"loss": 0.0002,
"reward": 0.7625000439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 447
},
{
"completion_length": 657.1875,
"epoch": 0.9306673591274993,
"grad_norm": 0.08579502999782562,
"kl": 0.4674902521073818,
"learning_rate": 5.597669531616181e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.331456296145916,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 448
},
{
"completion_length": 664.71875,
"epoch": 0.9327447416255518,
"grad_norm": 0.056638821959495544,
"kl": 0.470287274569273,
"learning_rate": 5.5813702704557814e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 449
},
{
"completion_length": 618.296875,
"epoch": 0.9348221241236042,
"grad_norm": 0.040137626230716705,
"kl": 0.46139009296894073,
"learning_rate": 5.5650647441911706e-05,
"loss": 0.0002,
"reward": 0.9656250476837158,
"reward_std": 0.06629125960171223,
"rewards/argmax_reward_func": 0.765625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 450
},
{
"completion_length": 632.96875,
"epoch": 0.9368995066216567,
"grad_norm": 0.06284568458795547,
"kl": 0.517881490290165,
"learning_rate": 5.548753128537939e-05,
"loss": 0.0003,
"reward": 0.8718750476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 451
},
{
"completion_length": 677.46875,
"epoch": 0.9389768891197091,
"grad_norm": 0.07211080193519592,
"kl": 0.46745334565639496,
"learning_rate": 5.532435599277303e-05,
"loss": 0.0002,
"reward": 0.7781250439584255,
"reward_std": 0.24306794814765453,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 452
},
{
"completion_length": 697.96875,
"epoch": 0.9410542716177617,
"grad_norm": 0.06623782962560654,
"kl": 0.4283139891922474,
"learning_rate": 5.516112332254203e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.2209708634763956,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 453
},
{
"completion_length": 619.375,
"epoch": 0.9431316541158141,
"grad_norm": 0.08626007288694382,
"kl": 0.5277771130204201,
"learning_rate": 5.499783503375412e-05,
"loss": 0.0003,
"reward": 0.8250000476837158,
"reward_std": 0.30935920774936676,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 454
},
{
"completion_length": 674.0,
"epoch": 0.9452090366138666,
"grad_norm": 0.06324354559183121,
"kl": 0.4568277336657047,
"learning_rate": 5.4834492886076446e-05,
"loss": 0.0002,
"reward": 0.8714844174683094,
"reward_std": 0.19942620425717905,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.19960937835276127,
"step": 455
},
{
"completion_length": 701.0625,
"epoch": 0.947286419111919,
"grad_norm": 0.07262270897626877,
"kl": 0.4528024010360241,
"learning_rate": 5.4671098639756504e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 456
},
{
"completion_length": 693.90625,
"epoch": 0.9493638016099715,
"grad_norm": 0.06315562129020691,
"kl": 0.43744752556085587,
"learning_rate": 5.4507654055603275e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 457
},
{
"completion_length": 664.984375,
"epoch": 0.9514411841080239,
"grad_norm": 0.06270638853311539,
"kl": 0.4934372082352638,
"learning_rate": 5.4344160894968145e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 458
},
{
"completion_length": 679.78125,
"epoch": 0.9535185666060764,
"grad_norm": 0.06320095807313919,
"kl": 0.4636671505868435,
"learning_rate": 5.418062091972604e-05,
"loss": 0.0002,
"reward": 0.9019531756639481,
"reward_std": 0.15633688890375197,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.19882812537252903,
"step": 459
},
{
"completion_length": 768.765625,
"epoch": 0.9555959491041288,
"grad_norm": 0.08169972896575928,
"kl": 0.4431908018887043,
"learning_rate": 5.4017035892256365e-05,
"loss": 0.0002,
"reward": 0.7773437909781933,
"reward_std": 0.33256115578114986,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.19921875186264515,
"step": 460
},
{
"completion_length": 649.203125,
"epoch": 0.9576733316021813,
"grad_norm": 0.06060326099395752,
"kl": 0.4794473238289356,
"learning_rate": 5.385340757542402e-05,
"loss": 0.0002,
"reward": 0.6375000365078449,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.4375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 461
},
{
"completion_length": 752.859375,
"epoch": 0.9597507141002337,
"grad_norm": 0.07235154509544373,
"kl": 0.43572117015719414,
"learning_rate": 5.36897377325604e-05,
"loss": 0.0002,
"reward": 0.9328125417232513,
"reward_std": 0.2452776599675417,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.1984375026077032,
"step": 462
},
{
"completion_length": 680.78125,
"epoch": 0.9618280965982862,
"grad_norm": 0.07361527532339096,
"kl": 0.45767712593078613,
"learning_rate": 5.352602812744441e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.26516503654420376,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 463
},
{
"completion_length": 702.609375,
"epoch": 0.9639054790963386,
"grad_norm": 0.0740986093878746,
"kl": 0.493575606495142,
"learning_rate": 5.336228052428348e-05,
"loss": 0.0002,
"reward": 0.9031250476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.703125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 464
},
{
"completion_length": 733.359375,
"epoch": 0.9659828615943911,
"grad_norm": 0.07798778265714645,
"kl": 0.45794639363884926,
"learning_rate": 5.319849668769449e-05,
"loss": 0.0002,
"reward": 0.6675781644880772,
"reward_std": 0.2668223176151514,
"rewards/argmax_reward_func": 0.46875,
"rewards/format_reward_func": 0.19882812723517418,
"step": 465
},
{
"completion_length": 673.03125,
"epoch": 0.9680602440924435,
"grad_norm": 0.06236180663108826,
"kl": 0.4699827618896961,
"learning_rate": 5.303467838268478e-05,
"loss": 0.0002,
"reward": 0.8718750439584255,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 466
},
{
"completion_length": 695.03125,
"epoch": 0.970137626590496,
"grad_norm": 0.06047491356730461,
"kl": 0.42734822258353233,
"learning_rate": 5.287082737463317e-05,
"loss": 0.0002,
"reward": 0.8875000439584255,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 467
},
{
"completion_length": 700.515625,
"epoch": 0.9722150090885484,
"grad_norm": 0.05264519900083542,
"kl": 0.514576718211174,
"learning_rate": 5.270694542927088e-05,
"loss": 0.0003,
"reward": 0.9500000476837158,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.20000000298023224,
"step": 468
},
{
"completion_length": 685.015625,
"epoch": 0.9742923915866009,
"grad_norm": 0.07000822573900223,
"kl": 0.47352610528469086,
"learning_rate": 5.254303431266254e-05,
"loss": 0.0002,
"reward": 0.8382812961935997,
"reward_std": 0.24638251960277557,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.19765625149011612,
"step": 469
},
{
"completion_length": 778.0,
"epoch": 0.9763697740846533,
"grad_norm": 0.0691061019897461,
"kl": 0.44779016450047493,
"learning_rate": 5.2379095791187124e-05,
"loss": 0.0002,
"reward": 0.8238281644880772,
"reward_std": 0.2226281464099884,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.19882812723517418,
"step": 470
},
{
"completion_length": 721.953125,
"epoch": 0.9784471565827058,
"grad_norm": 0.056446801871061325,
"kl": 0.4972013346850872,
"learning_rate": 5.2215131631518945e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 471
},
{
"completion_length": 734.296875,
"epoch": 0.9805245390807582,
"grad_norm": 0.04742085933685303,
"kl": 0.4290156289935112,
"learning_rate": 5.20511436006086e-05,
"loss": 0.0002,
"reward": 0.9187500439584255,
"reward_std": 0.13258251920342445,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 472
},
{
"completion_length": 699.078125,
"epoch": 0.9826019215788107,
"grad_norm": 0.06520857661962509,
"kl": 0.44061052426695824,
"learning_rate": 5.188713346566393e-05,
"loss": 0.0002,
"reward": 0.9187500476837158,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 473
},
{
"completion_length": 816.515625,
"epoch": 0.9846793040768631,
"grad_norm": 0.06763774901628494,
"kl": 0.462362315505743,
"learning_rate": 5.172310299413099e-05,
"loss": 0.0002,
"reward": 0.8875000476837158,
"reward_std": 0.2651650384068489,
"rewards/argmax_reward_func": 0.6875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 474
},
{
"completion_length": 703.703125,
"epoch": 0.9867566865749156,
"grad_norm": 0.06497927010059357,
"kl": 0.4288316182792187,
"learning_rate": 5.1559053953674975e-05,
"loss": 0.0002,
"reward": 0.8562500476837158,
"reward_std": 0.1767766922712326,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 475
},
{
"completion_length": 734.265625,
"epoch": 0.9888340690729681,
"grad_norm": 0.061615679413080215,
"kl": 0.4229474924504757,
"learning_rate": 5.139498811216122e-05,
"loss": 0.0002,
"reward": 0.7156250383704901,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.515625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 476
},
{
"completion_length": 745.546875,
"epoch": 0.9909114515710206,
"grad_norm": 0.061035335063934326,
"kl": 0.44998469576239586,
"learning_rate": 5.123090723763606e-05,
"loss": 0.0002,
"reward": 0.7781250476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.578125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 477
},
{
"completion_length": 726.9375,
"epoch": 0.992988834069073,
"grad_norm": 0.05501917377114296,
"kl": 0.4711364693939686,
"learning_rate": 5.106681309830791e-05,
"loss": 0.0002,
"reward": 0.9312500469386578,
"reward_std": 0.1590990237891674,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.19687500223517418,
"step": 478
},
{
"completion_length": 731.265625,
"epoch": 0.9950662165671255,
"grad_norm": 0.06437938660383224,
"kl": 0.488413542509079,
"learning_rate": 5.090270746252802e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 479
},
{
"completion_length": 877.296875,
"epoch": 0.9971435990651779,
"grad_norm": 0.05059191957116127,
"kl": 0.3898215554654598,
"learning_rate": 5.073859209877168e-05,
"loss": 0.0002,
"reward": 0.9179687947034836,
"reward_std": 0.22207572311162949,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.19921875186264515,
"step": 480
},
{
"completion_length": 737.34375,
"epoch": 0.9992209815632304,
"grad_norm": 0.05600970238447189,
"kl": 0.4228545166552067,
"learning_rate": 5.057446877561884e-05,
"loss": 0.0002,
"reward": 0.9179687909781933,
"reward_std": 0.17788155004382133,
"rewards/argmax_reward_func": 0.71875,
"rewards/format_reward_func": 0.1992187537252903,
"step": 481
},
{
"completion_length": 867.2916666666666,
"epoch": 1.0,
"grad_norm": 0.0311344675719738,
"kl": 0.4165251553058624,
"learning_rate": 5.0410339261735384e-05,
"loss": 0.0001,
"reward": 0.9500000476837158,
"reward_std": 0.23570225636164346,
"rewards/argmax_reward_func": 0.75,
"rewards/format_reward_func": 0.20000000298023224,
"step": 482
},
{
"completion_length": 721.59375,
"epoch": 1.0020773824980524,
"grad_norm": 0.08058605343103409,
"kl": 0.4754480682313442,
"learning_rate": 5.0246205325853826e-05,
"loss": 0.0002,
"reward": 0.8250000476837158,
"reward_std": 0.30935921147465706,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 483
},
{
"completion_length": 749.40625,
"epoch": 1.004154764996105,
"grad_norm": 0.06808894872665405,
"kl": 0.4143032245337963,
"learning_rate": 5.008206873675433e-05,
"loss": 0.0002,
"reward": 0.9343750476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 484
},
{
"completion_length": 739.546875,
"epoch": 1.0062321474941573,
"grad_norm": 0.04413120448589325,
"kl": 0.4007079564034939,
"learning_rate": 4.991793126324568e-05,
"loss": 0.0002,
"reward": 0.9656250476837158,
"reward_std": 0.11048543266952038,
"rewards/argmax_reward_func": 0.765625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 485
},
{
"completion_length": 764.390625,
"epoch": 1.0083095299922098,
"grad_norm": 0.05544662848114967,
"kl": 0.40518468618392944,
"learning_rate": 4.9753794674146206e-05,
"loss": 0.0002,
"reward": 1.0125000476837158,
"reward_std": 0.17677669040858746,
"rewards/argmax_reward_func": 0.8125,
"rewards/format_reward_func": 0.20000000298023224,
"step": 486
},
{
"completion_length": 770.71875,
"epoch": 1.0103869124902622,
"grad_norm": 0.0641309842467308,
"kl": 0.41656066104769707,
"learning_rate": 4.9589660738264614e-05,
"loss": 0.0002,
"reward": 0.8718750476837158,
"reward_std": 0.19887377880513668,
"rewards/argmax_reward_func": 0.671875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 487
},
{
"completion_length": 774.984375,
"epoch": 1.0124642949883147,
"grad_norm": 0.06294507533311844,
"kl": 0.41369784995913506,
"learning_rate": 4.9425531224381163e-05,
"loss": 0.0002,
"reward": 0.7625000402331352,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 488
},
{
"completion_length": 725.171875,
"epoch": 1.0145416774863671,
"grad_norm": 0.058232299983501434,
"kl": 0.4683380052447319,
"learning_rate": 4.926140790122835e-05,
"loss": 0.0002,
"reward": 0.8406250476837158,
"reward_std": 0.19887377694249153,
"rewards/argmax_reward_func": 0.640625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 489
},
{
"completion_length": 782.046875,
"epoch": 1.0166190599844196,
"grad_norm": 0.053580548614263535,
"kl": 0.42908982560038567,
"learning_rate": 4.909729253747197e-05,
"loss": 0.0002,
"reward": 0.8093750476837158,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 490
},
{
"completion_length": 812.921875,
"epoch": 1.018696442482472,
"grad_norm": 0.07418368011713028,
"kl": 0.4282660707831383,
"learning_rate": 4.893318690169211e-05,
"loss": 0.0002,
"reward": 0.7625000476837158,
"reward_std": 0.30935920774936676,
"rewards/argmax_reward_func": 0.5625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 491
},
{
"completion_length": 772.4375,
"epoch": 1.0207738249805245,
"grad_norm": 0.053191013634204865,
"kl": 0.42502470314502716,
"learning_rate": 4.876909276236395e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 492
},
{
"completion_length": 752.390625,
"epoch": 1.022851207478577,
"grad_norm": 0.05855753272771835,
"kl": 0.42473678290843964,
"learning_rate": 4.8605011887838797e-05,
"loss": 0.0002,
"reward": 0.8554687947034836,
"reward_std": 0.22207572497427464,
"rewards/argmax_reward_func": 0.65625,
"rewards/format_reward_func": 0.19921875186264515,
"step": 493
},
{
"completion_length": 757.84375,
"epoch": 1.0249285899766294,
"grad_norm": 0.06828629225492477,
"kl": 0.409926887601614,
"learning_rate": 4.844094604632502e-05,
"loss": 0.0002,
"reward": 0.9968750476837158,
"reward_std": 0.24306795001029968,
"rewards/argmax_reward_func": 0.796875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 494
},
{
"completion_length": 781.3125,
"epoch": 1.0270059724746818,
"grad_norm": 0.07438631355762482,
"kl": 0.41192958503961563,
"learning_rate": 4.827689700586902e-05,
"loss": 0.0002,
"reward": 0.8093750439584255,
"reward_std": 0.331456296145916,
"rewards/argmax_reward_func": 0.609375,
"rewards/format_reward_func": 0.20000000298023224,
"step": 495
},
{
"completion_length": 793.234375,
"epoch": 1.0290833549727343,
"grad_norm": 0.05079561844468117,
"kl": 0.4043182320892811,
"learning_rate": 4.811286653433609e-05,
"loss": 0.0002,
"reward": 0.7468750476837158,
"reward_std": 0.15467960387468338,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 496
},
{
"completion_length": 781.796875,
"epoch": 1.0311607374707867,
"grad_norm": 0.05202874913811684,
"kl": 0.413474939763546,
"learning_rate": 4.794885639939142e-05,
"loss": 0.0002,
"reward": 0.7468750439584255,
"reward_std": 0.15467960573732853,
"rewards/argmax_reward_func": 0.546875,
"rewards/format_reward_func": 0.20000000298023224,
"step": 497
},
{
"completion_length": 810.25,
"epoch": 1.0332381199688392,
"grad_norm": 0.05652881786227226,
"kl": 0.40745414793491364,
"learning_rate": 4.7784868368481067e-05,
"loss": 0.0002,
"reward": 0.9335937947034836,
"reward_std": 0.1999786328524351,
"rewards/argmax_reward_func": 0.734375,
"rewards/format_reward_func": 0.1992187537252903,
"step": 498
},
{
"completion_length": 799.765625,
"epoch": 1.0353155024668916,
"grad_norm": 0.0548785924911499,
"kl": 0.39410270750522614,
"learning_rate": 4.762090420881289e-05,
"loss": 0.0002,
"reward": 0.9953125491738319,
"reward_std": 0.1568893175572157,
"rewards/argmax_reward_func": 0.796875,
"rewards/format_reward_func": 0.1984375026077032,
"step": 499
},
{
"completion_length": 777.75,
"epoch": 1.037392884964944,
"grad_norm": 0.06177806481719017,
"kl": 0.6858577094972134,
"learning_rate": 4.745696568733748e-05,
"loss": 0.0003,
"reward": 0.8250000439584255,
"reward_std": 0.22097086533904076,
"rewards/argmax_reward_func": 0.625,
"rewards/format_reward_func": 0.20000000298023224,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 962,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}