| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.037392884964944, | |
| "eval_steps": 1000, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 775.203125, | |
| "epoch": 0.002077382498052454, | |
| "grad_norm": 0.16490910947322845, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0, | |
| "reward": 0.2167968787252903, | |
| "reward_std": 0.11324757407419384, | |
| "rewards/argmax_reward_func": 0.0625, | |
| "rewards/format_reward_func": 0.154296875, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 820.609375, | |
| "epoch": 0.004154764996104908, | |
| "grad_norm": 0.15733271837234497, | |
| "kl": 0.0, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0, | |
| "reward": 0.1472656298428774, | |
| "reward_std": 0.020439805870410055, | |
| "rewards/argmax_reward_func": 0.0, | |
| "rewards/format_reward_func": 0.14726562798023224, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 901.25, | |
| "epoch": 0.006232147494157362, | |
| "grad_norm": 0.14691142737865448, | |
| "kl": 0.0010660013067536056, | |
| "learning_rate": 4e-05, | |
| "loss": 0.0, | |
| "reward": 0.20703125, | |
| "reward_std": 0.11269514623563737, | |
| "rewards/argmax_reward_func": 0.0625, | |
| "rewards/format_reward_func": 0.14453125, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 873.015625, | |
| "epoch": 0.008309529992209816, | |
| "grad_norm": 0.14998185634613037, | |
| "kl": 0.0019050340633839369, | |
| "learning_rate": 6e-05, | |
| "loss": 0.0, | |
| "reward": 0.2011718824505806, | |
| "reward_std": 0.09004563023336232, | |
| "rewards/argmax_reward_func": 0.046875, | |
| "rewards/format_reward_func": 0.1542968787252903, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 870.546875, | |
| "epoch": 0.01038691249026227, | |
| "grad_norm": 0.1567591279745102, | |
| "kl": 0.005349995743017644, | |
| "learning_rate": 8e-05, | |
| "loss": 0.0, | |
| "reward": 0.2285156361758709, | |
| "reward_std": 0.1110378596931696, | |
| "rewards/argmax_reward_func": 0.0625, | |
| "rewards/format_reward_func": 0.1660156287252903, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 849.125, | |
| "epoch": 0.012464294988314724, | |
| "grad_norm": 0.10938515514135361, | |
| "kl": 0.01296996301971376, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "reward": 0.24414063058793545, | |
| "reward_std": 0.0999893163680099, | |
| "rewards/argmax_reward_func": 0.0625, | |
| "rewards/format_reward_func": 0.1816406324505806, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 901.015625, | |
| "epoch": 0.014541677486367177, | |
| "grad_norm": 0.12581659853458405, | |
| "kl": 0.02171943092253059, | |
| "learning_rate": 9.999973058889791e-05, | |
| "loss": 0.0, | |
| "reward": 0.2585937548428774, | |
| "reward_std": 0.12816310487687588, | |
| "rewards/argmax_reward_func": 0.078125, | |
| "rewards/format_reward_func": 0.18046875298023224, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 916.671875, | |
| "epoch": 0.01661905998441963, | |
| "grad_norm": 0.12178487330675125, | |
| "kl": 0.04081101668998599, | |
| "learning_rate": 9.999892235849491e-05, | |
| "loss": 0.0, | |
| "reward": 0.3437500111758709, | |
| "reward_std": 0.1900349531788379, | |
| "rewards/argmax_reward_func": 0.15625, | |
| "rewards/format_reward_func": 0.1875, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 803.90625, | |
| "epoch": 0.018696442482472084, | |
| "grad_norm": 0.12499672174453735, | |
| "kl": 0.06826442573219538, | |
| "learning_rate": 9.999757531750085e-05, | |
| "loss": 0.0, | |
| "reward": 0.45625001564621925, | |
| "reward_std": 0.25411650398746133, | |
| "rewards/argmax_reward_func": 0.265625, | |
| "rewards/format_reward_func": 0.1906250026077032, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 953.875, | |
| "epoch": 0.02077382498052454, | |
| "grad_norm": 0.11061865091323853, | |
| "kl": 0.06516677932813764, | |
| "learning_rate": 9.999568948043205e-05, | |
| "loss": 0.0, | |
| "reward": 0.3804687615483999, | |
| "reward_std": 0.23091456340625882, | |
| "rewards/argmax_reward_func": 0.1875, | |
| "rewards/format_reward_func": 0.19296875037252903, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 824.546875, | |
| "epoch": 0.022851207478576992, | |
| "grad_norm": 0.10025237500667572, | |
| "kl": 0.10202133795246482, | |
| "learning_rate": 9.999326486761114e-05, | |
| "loss": 0.0001, | |
| "reward": 0.4562500212341547, | |
| "reward_std": 0.203293202444911, | |
| "rewards/argmax_reward_func": 0.265625, | |
| "rewards/format_reward_func": 0.1906250026077032, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 925.234375, | |
| "epoch": 0.02492858997662945, | |
| "grad_norm": 0.12423845380544662, | |
| "kl": 0.14641187246888876, | |
| "learning_rate": 9.99903015051668e-05, | |
| "loss": 0.0001, | |
| "reward": 0.6261719018220901, | |
| "reward_std": 0.22925727342953905, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.1886718738824129, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 810.546875, | |
| "epoch": 0.0270059724746819, | |
| "grad_norm": 0.1263190507888794, | |
| "kl": 0.23557536769658327, | |
| "learning_rate": 9.998679942503358e-05, | |
| "loss": 0.0001, | |
| "reward": 0.5953125320374966, | |
| "reward_std": 0.2717941626906395, | |
| "rewards/argmax_reward_func": 0.40625, | |
| "rewards/format_reward_func": 0.18906250409781933, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 738.953125, | |
| "epoch": 0.029083354972734354, | |
| "grad_norm": 0.09476204961538315, | |
| "kl": 0.29587008990347385, | |
| "learning_rate": 9.998275866495138e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7289062887430191, | |
| "reward_std": 0.18009126000106335, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.19765625335276127, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 699.109375, | |
| "epoch": 0.03116073747078681, | |
| "grad_norm": 0.15413929522037506, | |
| "kl": 0.2644388508051634, | |
| "learning_rate": 9.997817926846529e-05, | |
| "loss": 0.0001, | |
| "reward": 0.6968750357627869, | |
| "reward_std": 0.4021669775247574, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 716.4375, | |
| "epoch": 0.03323811996883926, | |
| "grad_norm": 0.13675570487976074, | |
| "kl": 0.41714945435523987, | |
| "learning_rate": 9.99730612849249e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6320312805473804, | |
| "reward_std": 0.27289901627227664, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.19453125074505806, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 688.609375, | |
| "epoch": 0.03531550246689172, | |
| "grad_norm": 0.14246560633182526, | |
| "kl": 0.35029047913849354, | |
| "learning_rate": 9.996740476948385e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6304687857627869, | |
| "reward_std": 0.31930290907621384, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.19296875223517418, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 630.828125, | |
| "epoch": 0.03739288496494417, | |
| "grad_norm": 2.1836376190185547, | |
| "kl": 10.275608837604523, | |
| "learning_rate": 9.996120978309931e-05, | |
| "loss": 0.0051, | |
| "reward": 0.5742187947034836, | |
| "reward_std": 0.39885240606963634, | |
| "rewards/argmax_reward_func": 0.375, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 647.890625, | |
| "epoch": 0.039470267462996624, | |
| "grad_norm": 0.1236676499247551, | |
| "kl": 0.366399560123682, | |
| "learning_rate": 9.995447639253115e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7765625417232513, | |
| "reward_std": 0.2894718423485756, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19843750074505806, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 567.578125, | |
| "epoch": 0.04154764996104908, | |
| "grad_norm": 0.12248539924621582, | |
| "kl": 0.28800770081579685, | |
| "learning_rate": 9.994720467034142e-05, | |
| "loss": 0.0001, | |
| "reward": 0.807812537997961, | |
| "reward_std": 0.24527766928076744, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 538.796875, | |
| "epoch": 0.04362503245910153, | |
| "grad_norm": 0.14135704934597015, | |
| "kl": 0.5838185884058475, | |
| "learning_rate": 9.993939469489342e-05, | |
| "loss": 0.0003, | |
| "reward": 0.6835937947034836, | |
| "reward_std": 0.24417280592024326, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 534.40625, | |
| "epoch": 0.045702414957153985, | |
| "grad_norm": 0.16315752267837524, | |
| "kl": 0.37346063926815987, | |
| "learning_rate": 9.993104655035088e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6835937909781933, | |
| "reward_std": 0.37675532698631287, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 583.4375, | |
| "epoch": 0.04777979745520644, | |
| "grad_norm": 0.13479600846767426, | |
| "kl": 0.5061899088323116, | |
| "learning_rate": 9.992216032667716e-05, | |
| "loss": 0.0003, | |
| "reward": 0.5878906548023224, | |
| "reward_std": 0.2911291141062975, | |
| "rewards/argmax_reward_func": 0.390625, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 532.203125, | |
| "epoch": 0.0498571799532589, | |
| "grad_norm": 0.12497097253799438, | |
| "kl": 0.5098075568675995, | |
| "learning_rate": 9.991273611963412e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 503.4375, | |
| "epoch": 0.051934562451311346, | |
| "grad_norm": 0.153394415974617, | |
| "kl": 0.35232703387737274, | |
| "learning_rate": 9.990277403078122e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7156250439584255, | |
| "reward_std": 0.3314562924206257, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 491.25, | |
| "epoch": 0.0540119449493638, | |
| "grad_norm": 0.15910868346691132, | |
| "kl": 0.41421468555927277, | |
| "learning_rate": 9.989227416747434e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000476837158, | |
| "reward_std": 0.35355337895452976, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 485.390625, | |
| "epoch": 0.05608932744741626, | |
| "grad_norm": 34.968101501464844, | |
| "kl": 504.1205723620951, | |
| "learning_rate": 9.988123664286469e-05, | |
| "loss": 0.2521, | |
| "reward": 0.6375000439584255, | |
| "reward_std": 0.39774755388498306, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 543.859375, | |
| "epoch": 0.05816670994546871, | |
| "grad_norm": 0.1266699880361557, | |
| "kl": 0.3936588950455189, | |
| "learning_rate": 9.98696615758975e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7296875417232513, | |
| "reward_std": 0.2673747483640909, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 520.84375, | |
| "epoch": 0.06024409244352116, | |
| "grad_norm": 0.1384185552597046, | |
| "kl": 0.41820336878299713, | |
| "learning_rate": 9.985754909131085e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6523437947034836, | |
| "reward_std": 0.2883669789880514, | |
| "rewards/argmax_reward_func": 0.453125, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 566.515625, | |
| "epoch": 0.06232147494157362, | |
| "grad_norm": 0.12411382049322128, | |
| "kl": 0.37708618491888046, | |
| "learning_rate": 9.984489931963428e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7304687909781933, | |
| "reward_std": 0.26626989617943764, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 557.71875, | |
| "epoch": 0.06439885743962608, | |
| "grad_norm": 0.1417299211025238, | |
| "kl": 0.5239567384123802, | |
| "learning_rate": 9.98317123971873e-05, | |
| "loss": 0.0003, | |
| "reward": 0.5867187865078449, | |
| "reward_std": 0.33698057383298874, | |
| "rewards/argmax_reward_func": 0.390625, | |
| "rewards/format_reward_func": 0.19609375298023224, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 571.46875, | |
| "epoch": 0.06647623993767852, | |
| "grad_norm": 0.12581190466880798, | |
| "kl": 0.3905966766178608, | |
| "learning_rate": 9.981798846607808e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8238281756639481, | |
| "reward_std": 0.31101649068295956, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19882812909781933, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 617.171875, | |
| "epoch": 0.06855362243573097, | |
| "grad_norm": 0.119756318628788, | |
| "kl": 0.37264879420399666, | |
| "learning_rate": 9.980372767420177e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6210937835276127, | |
| "reward_std": 0.2883669789880514, | |
| "rewards/argmax_reward_func": 0.421875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 604.265625, | |
| "epoch": 0.07063100493378344, | |
| "grad_norm": 0.12877410650253296, | |
| "kl": 0.4410099685192108, | |
| "learning_rate": 9.978893017523903e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6687500476837158, | |
| "reward_std": 0.3535533808171749, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 664.875, | |
| "epoch": 0.07270838743183589, | |
| "grad_norm": 0.10366171598434448, | |
| "kl": 0.5219907499849796, | |
| "learning_rate": 9.977359612865423e-05, | |
| "loss": 0.0003, | |
| "reward": 0.726562537252903, | |
| "reward_std": 0.26958445459604263, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.1953125037252903, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 658.515625, | |
| "epoch": 0.07478576992988833, | |
| "grad_norm": 0.1170380637049675, | |
| "kl": 0.46299856156110764, | |
| "learning_rate": 9.97577256996939e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8226562887430191, | |
| "reward_std": 0.3126737759448588, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19765625335276127, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 621.890625, | |
| "epoch": 0.0768631524279408, | |
| "grad_norm": 0.09539435803890228, | |
| "kl": 0.39572376012802124, | |
| "learning_rate": 9.974131905938483e-05, | |
| "loss": 0.0002, | |
| "reward": 0.851562537252903, | |
| "reward_std": 0.18119611439760774, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.1953125037252903, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 616.671875, | |
| "epoch": 0.07894053492599325, | |
| "grad_norm": 0.1066877692937851, | |
| "kl": 0.3970871977508068, | |
| "learning_rate": 9.972437638453227e-05, | |
| "loss": 0.0002, | |
| "reward": 0.5734375342726707, | |
| "reward_std": 0.2673747483640909, | |
| "rewards/argmax_reward_func": 0.375, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 620.515625, | |
| "epoch": 0.0810179174240457, | |
| "grad_norm": 0.09872303903102875, | |
| "kl": 0.4494887478649616, | |
| "learning_rate": 9.970689785771798e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8539062887430191, | |
| "reward_std": 0.22428543493151665, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19765625521540642, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 646.25, | |
| "epoch": 0.08309529992209816, | |
| "grad_norm": 0.10916193574666977, | |
| "kl": 0.3997967578470707, | |
| "learning_rate": 9.968888366729835e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7867187820374966, | |
| "reward_std": 0.31156892515718937, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19296875223517418, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 636.578125, | |
| "epoch": 0.08517268242015061, | |
| "grad_norm": 0.109690822660923, | |
| "kl": 0.4680747017264366, | |
| "learning_rate": 9.967033400740227e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7125000357627869, | |
| "reward_std": 0.2916815411299467, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.19687500409781933, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 633.8125, | |
| "epoch": 0.08725006491820306, | |
| "grad_norm": 0.11978733539581299, | |
| "kl": 0.42677244916558266, | |
| "learning_rate": 9.965124907792915e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6804687902331352, | |
| "reward_std": 0.3369805682450533, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.19609375298023224, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 641.328125, | |
| "epoch": 0.08932744741625552, | |
| "grad_norm": 0.10323718935251236, | |
| "kl": 0.4870793893933296, | |
| "learning_rate": 9.963162908454664e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6820312812924385, | |
| "reward_std": 0.2905766926705837, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.19765625335276127, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 672.90625, | |
| "epoch": 0.09140482991430797, | |
| "grad_norm": 0.10177203267812729, | |
| "kl": 0.4337821826338768, | |
| "learning_rate": 9.96114742386885e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7445312850177288, | |
| "reward_std": 0.3347708657383919, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.19765625335276127, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 639.203125, | |
| "epoch": 0.09348221241236043, | |
| "grad_norm": 0.07864588499069214, | |
| "kl": 0.6409419141709805, | |
| "learning_rate": 9.95907847575523e-05, | |
| "loss": 0.0003, | |
| "reward": 0.6367187835276127, | |
| "reward_std": 0.17788154818117619, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 662.859375, | |
| "epoch": 0.09555959491041288, | |
| "grad_norm": 0.1133044883608818, | |
| "kl": 0.49651604518294334, | |
| "learning_rate": 9.95695608640971e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6664062887430191, | |
| "reward_std": 0.31267377361655235, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.19765624962747097, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 615.1875, | |
| "epoch": 0.09763697740846533, | |
| "grad_norm": 0.09465198963880539, | |
| "kl": 0.42672090977430344, | |
| "learning_rate": 9.954780278704097e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 624.234375, | |
| "epoch": 0.0997143599065178, | |
| "grad_norm": 0.12003368884325027, | |
| "kl": 0.45439790561795235, | |
| "learning_rate": 9.952551076085864e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6531250402331352, | |
| "reward_std": 0.375650467351079, | |
| "rewards/argmax_reward_func": 0.453125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 657.6875, | |
| "epoch": 0.10179174240457024, | |
| "grad_norm": 0.10603732615709305, | |
| "kl": 0.4523283280432224, | |
| "learning_rate": 9.950268502577884e-05, | |
| "loss": 0.0002, | |
| "reward": 0.823437537997961, | |
| "reward_std": 0.3115689232945442, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19843750074505806, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 611.640625, | |
| "epoch": 0.10386912490262269, | |
| "grad_norm": 0.11261381953954697, | |
| "kl": 0.5920008532702923, | |
| "learning_rate": 9.947932582778188e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7765625417232513, | |
| "reward_std": 0.33366600796580315, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 662.359375, | |
| "epoch": 0.10594650740067516, | |
| "grad_norm": 0.11194069683551788, | |
| "kl": 0.40367136895656586, | |
| "learning_rate": 9.94554334185968e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6375000365078449, | |
| "reward_std": 0.35355338267982006, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 686.265625, | |
| "epoch": 0.1080238898987276, | |
| "grad_norm": 0.10016939043998718, | |
| "kl": 0.429857462644577, | |
| "learning_rate": 9.943100805569887e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750476837158, | |
| "reward_std": 0.33145629055798054, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 613.890625, | |
| "epoch": 0.11010127239678005, | |
| "grad_norm": 0.11434896290302277, | |
| "kl": 0.41631242260336876, | |
| "learning_rate": 9.94060500023066e-05, | |
| "loss": 0.0002, | |
| "reward": 0.620312537997961, | |
| "reward_std": 0.333666006103158, | |
| "rewards/argmax_reward_func": 0.421875, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 671.75, | |
| "epoch": 0.11217865489483252, | |
| "grad_norm": 0.0824907198548317, | |
| "kl": 0.4247642531991005, | |
| "learning_rate": 9.938055952737907e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7156250476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 711.078125, | |
| "epoch": 0.11425603739288497, | |
| "grad_norm": 0.09910566359758377, | |
| "kl": 0.46975456923246384, | |
| "learning_rate": 9.935453690561297e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7906250469386578, | |
| "reward_std": 0.3137786239385605, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 659.5625, | |
| "epoch": 0.11633341989093741, | |
| "grad_norm": 0.0940733402967453, | |
| "kl": 0.42034388333559036, | |
| "learning_rate": 9.932798241743961e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.2872621212154627, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 694.796875, | |
| "epoch": 0.11841080238898988, | |
| "grad_norm": 0.20298048853874207, | |
| "kl": 1.01119814068079, | |
| "learning_rate": 9.930089634902197e-05, | |
| "loss": 0.0005, | |
| "reward": 0.714062537997961, | |
| "reward_std": 0.28947182931005955, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.19843750447034836, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 679.421875, | |
| "epoch": 0.12048818488704233, | |
| "grad_norm": 0.10724397003650665, | |
| "kl": 0.45981432124972343, | |
| "learning_rate": 9.927327899225151e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7156250476837158, | |
| "reward_std": 0.375650467351079, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 666.390625, | |
| "epoch": 0.12256556738509478, | |
| "grad_norm": 0.09369952231645584, | |
| "kl": 0.5710588954389095, | |
| "learning_rate": 9.924513064474519e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8085937909781933, | |
| "reward_std": 0.24417280405759811, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 667.3125, | |
| "epoch": 0.12464294988314724, | |
| "grad_norm": 0.10410826653242111, | |
| "kl": 0.6697803623974323, | |
| "learning_rate": 9.921645160984206e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7625000476837158, | |
| "reward_std": 0.35355337895452976, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 694.453125, | |
| "epoch": 0.1267203323811997, | |
| "grad_norm": 0.10938889533281326, | |
| "kl": 0.42841707170009613, | |
| "learning_rate": 9.918724219660013e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.3756504636257887, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 767.796875, | |
| "epoch": 0.12879771487925215, | |
| "grad_norm": 0.08572812378406525, | |
| "kl": 0.42277197539806366, | |
| "learning_rate": 9.915750271979305e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6843750402331352, | |
| "reward_std": 0.28726212307810783, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 746.703125, | |
| "epoch": 0.1308750973773046, | |
| "grad_norm": 0.08672405034303665, | |
| "kl": 0.4743144288659096, | |
| "learning_rate": 9.91272334999066e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.2872621212154627, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 785.140625, | |
| "epoch": 0.13295247987535705, | |
| "grad_norm": 0.07892299443483353, | |
| "kl": 0.5303685143589973, | |
| "learning_rate": 9.909643486313533e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7312500402331352, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 816.015625, | |
| "epoch": 0.1350298623734095, | |
| "grad_norm": 0.071454256772995, | |
| "kl": 0.39743437245488167, | |
| "learning_rate": 9.906510714137905e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6218750402331352, | |
| "reward_std": 0.24306794628500938, | |
| "rewards/argmax_reward_func": 0.421875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 836.4375, | |
| "epoch": 0.13710724487146195, | |
| "grad_norm": 0.08313830941915512, | |
| "kl": 0.3903077654540539, | |
| "learning_rate": 9.903325067223919e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6367187909781933, | |
| "reward_std": 0.31046406738460064, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 787.484375, | |
| "epoch": 0.13918462736951442, | |
| "grad_norm": 0.08504212647676468, | |
| "kl": 0.5619952343404293, | |
| "learning_rate": 9.90008657990152e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7464844211935997, | |
| "reward_std": 0.28781455010175705, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.19960937649011612, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 807.921875, | |
| "epoch": 0.14126200986756687, | |
| "grad_norm": 0.08398205786943436, | |
| "kl": 0.47908810153603554, | |
| "learning_rate": 9.896795287070086e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750476837158, | |
| "reward_std": 0.331456296145916, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 837.234375, | |
| "epoch": 0.14333939236561932, | |
| "grad_norm": 0.054244451224803925, | |
| "kl": 0.39820099994540215, | |
| "learning_rate": 9.893451224198052e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 910.0625, | |
| "epoch": 0.14541677486367177, | |
| "grad_norm": 0.08078251034021378, | |
| "kl": 0.4756108485162258, | |
| "learning_rate": 9.890054427322521e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.331456296145916, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 867.84375, | |
| "epoch": 0.14749415736172422, | |
| "grad_norm": 0.08043571561574936, | |
| "kl": 0.3970469869673252, | |
| "learning_rate": 9.886604933048888e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6679687947034836, | |
| "reward_std": 0.3104640601668507, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 882.265625, | |
| "epoch": 0.14957153985977667, | |
| "grad_norm": 0.09208390861749649, | |
| "kl": 0.40190327540040016, | |
| "learning_rate": 9.883102778550434e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.3977475520223379, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 889.78125, | |
| "epoch": 0.15164892235782915, | |
| "grad_norm": 0.09202940762042999, | |
| "kl": 0.38338571041822433, | |
| "learning_rate": 9.879548001567931e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7000000476837158, | |
| "reward_std": 0.4419417232275009, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 942.609375, | |
| "epoch": 0.1537263048558816, | |
| "grad_norm": 0.06312800943851471, | |
| "kl": 0.4080694951117039, | |
| "learning_rate": 9.875940640409234e-05, | |
| "loss": 0.0002, | |
| "reward": 0.5750000402331352, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 948.859375, | |
| "epoch": 0.15580368735393405, | |
| "grad_norm": 0.0712570995092392, | |
| "kl": 0.4405221752822399, | |
| "learning_rate": 9.872280733948867e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8085937947034836, | |
| "reward_std": 0.2883669827133417, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 1053.21875, | |
| "epoch": 0.1578810698519865, | |
| "grad_norm": 0.05858299508690834, | |
| "kl": 0.4397047348320484, | |
| "learning_rate": 9.868568321627611e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7000000383704901, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 1019.96875, | |
| "epoch": 0.15995845235003894, | |
| "grad_norm": 0.07670939713716507, | |
| "kl": 0.40835118666291237, | |
| "learning_rate": 9.86480344345207e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.33145629800856113, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 1075.15625, | |
| "epoch": 0.1620358348480914, | |
| "grad_norm": 0.06651510298252106, | |
| "kl": 0.42486657947301865, | |
| "learning_rate": 9.860986139994239e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.28726211935281754, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 1096.828125, | |
| "epoch": 0.16411321734614387, | |
| "grad_norm": 0.06264790147542953, | |
| "kl": 0.3813174143433571, | |
| "learning_rate": 9.857116452391079e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 1159.390625, | |
| "epoch": 0.16619059984419632, | |
| "grad_norm": 0.06721258908510208, | |
| "kl": 0.41810835897922516, | |
| "learning_rate": 9.85319442234406e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7617187947034836, | |
| "reward_std": 0.3104640692472458, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 1207.40625, | |
| "epoch": 0.16826798234224877, | |
| "grad_norm": 0.07961631566286087, | |
| "kl": 0.353565227240324, | |
| "learning_rate": 9.84922009211872e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.4419417269527912, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 1267.5625, | |
| "epoch": 0.17034536484030122, | |
| "grad_norm": 0.06159353628754616, | |
| "kl": 0.3608316369354725, | |
| "learning_rate": 9.845193504544209e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6218750365078449, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.421875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 1271.1875, | |
| "epoch": 0.17242274733835367, | |
| "grad_norm": 0.0616268515586853, | |
| "kl": 0.3721548244357109, | |
| "learning_rate": 9.841114703012817e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7613281682133675, | |
| "reward_std": 0.26682231575250626, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.19882812723517418, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 1197.84375, | |
| "epoch": 0.17450012983640611, | |
| "grad_norm": 0.06743966042995453, | |
| "kl": 0.46105678752064705, | |
| "learning_rate": 9.836983731479525e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000476837158, | |
| "reward_std": 0.30935920774936676, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 1239.859375, | |
| "epoch": 0.1765775123344586, | |
| "grad_norm": 0.07362944632768631, | |
| "kl": 0.35114892572164536, | |
| "learning_rate": 9.832800634461518e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6828125417232513, | |
| "reward_std": 0.3336660098284483, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.19843750074505806, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 1253.21875, | |
| "epoch": 0.17865489483251104, | |
| "grad_norm": 0.060973405838012695, | |
| "kl": 0.3400215059518814, | |
| "learning_rate": 9.828565457037703e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7613281719386578, | |
| "reward_std": 0.2668223213404417, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.19882812723517418, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 1251.828125, | |
| "epoch": 0.1807322773305635, | |
| "grad_norm": 0.06071100011467934, | |
| "kl": 0.3388819098472595, | |
| "learning_rate": 9.824278244848235e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6843750402331352, | |
| "reward_std": 0.28726212307810783, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 1177.9375, | |
| "epoch": 0.18280965982861594, | |
| "grad_norm": 0.07785635441541672, | |
| "kl": 0.39376673474907875, | |
| "learning_rate": 9.819939044094016e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6687500476837158, | |
| "reward_std": 0.3977475520223379, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 1312.515625, | |
| "epoch": 0.1848870423266684, | |
| "grad_norm": 0.06982032209634781, | |
| "kl": 0.3353493846952915, | |
| "learning_rate": 9.815547901536201e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.33145629428327084, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 1360.75, | |
| "epoch": 0.18696442482472087, | |
| "grad_norm": 0.06107737869024277, | |
| "kl": 0.45528167858719826, | |
| "learning_rate": 9.811104864495691e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.24306794814765453, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 1353.296875, | |
| "epoch": 0.18904180732277331, | |
| "grad_norm": 0.06465540081262589, | |
| "kl": 0.353522464632988, | |
| "learning_rate": 9.806609980852628e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8046875409781933, | |
| "reward_std": 0.2938912510871887, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19531250186264515, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 1441.984375, | |
| "epoch": 0.19111918982082576, | |
| "grad_norm": 0.0610247403383255, | |
| "kl": 0.36326174437999725, | |
| "learning_rate": 9.802063299045873e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750402331352, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 1441.265625, | |
| "epoch": 0.1931965723188782, | |
| "grad_norm": 0.05115514621138573, | |
| "kl": 0.411540150642395, | |
| "learning_rate": 9.797464868072488e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6812500357627869, | |
| "reward_std": 0.2032931987196207, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 1397.03125, | |
| "epoch": 0.19527395481693066, | |
| "grad_norm": 0.053147751837968826, | |
| "kl": 0.4489905573427677, | |
| "learning_rate": 9.792814737487207e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 1407.515625, | |
| "epoch": 0.1973513373149831, | |
| "grad_norm": 0.0552426278591156, | |
| "kl": 0.3701773174107075, | |
| "learning_rate": 9.788112957401903e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 1505.546875, | |
| "epoch": 0.1994287198130356, | |
| "grad_norm": 0.05075477808713913, | |
| "kl": 0.39650479704141617, | |
| "learning_rate": 9.783359578485047e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8855469226837158, | |
| "reward_std": 0.17953883111476898, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19804687798023224, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 1542.90625, | |
| "epoch": 0.20150610231108804, | |
| "grad_norm": 0.053789589554071426, | |
| "kl": 0.35163769498467445, | |
| "learning_rate": 9.778554651961159e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 1533.46875, | |
| "epoch": 0.20358348480914049, | |
| "grad_norm": 0.05969106778502464, | |
| "kl": 0.40055200457572937, | |
| "learning_rate": 9.773698229610263e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8664062917232513, | |
| "reward_std": 0.29499610885977745, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.1945312526077032, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 1658.3125, | |
| "epoch": 0.20566086730719293, | |
| "grad_norm": 0.05904076248407364, | |
| "kl": 0.3737713471055031, | |
| "learning_rate": 9.768790363767322e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7132812924683094, | |
| "reward_std": 0.2905766908079386, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.19765625149011612, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 1522.9375, | |
| "epoch": 0.20773824980524538, | |
| "grad_norm": 0.04626452177762985, | |
| "kl": 0.3718419596552849, | |
| "learning_rate": 9.763831107321678e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6843750439584255, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 1437.34375, | |
| "epoch": 0.20981563230329783, | |
| "grad_norm": 0.0583551786839962, | |
| "kl": 0.3823527656495571, | |
| "learning_rate": 9.75882051371648e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7929687947034836, | |
| "reward_std": 0.26626989617943764, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 1665.859375, | |
| "epoch": 0.2118930148013503, | |
| "grad_norm": 0.071258544921875, | |
| "kl": 0.3524062894284725, | |
| "learning_rate": 9.753758636948111e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7121094167232513, | |
| "reward_std": 0.3806223217397928, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.19648437574505806, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 1570.484375, | |
| "epoch": 0.21397039729940276, | |
| "grad_norm": 0.06221286952495575, | |
| "kl": 0.4012618362903595, | |
| "learning_rate": 9.748645531565604e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8691406697034836, | |
| "reward_std": 0.2911291141062975, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 1470.140625, | |
| "epoch": 0.2160477797974552, | |
| "grad_norm": 0.05706779286265373, | |
| "kl": 0.37375468015670776, | |
| "learning_rate": 9.743481252670049e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7136719189584255, | |
| "reward_std": 0.24583008396439254, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.19804687798023224, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 1487.796875, | |
| "epoch": 0.21812516229550766, | |
| "grad_norm": 0.04427757114171982, | |
| "kl": 0.40517764165997505, | |
| "learning_rate": 9.738265855914013e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 1542.703125, | |
| "epoch": 0.2202025447935601, | |
| "grad_norm": 0.060884129256010056, | |
| "kl": 0.41977495700120926, | |
| "learning_rate": 9.732999397500926e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6503906659781933, | |
| "reward_std": 0.24693494127131999, | |
| "rewards/argmax_reward_func": 0.453125, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 1549.03125, | |
| "epoch": 0.22227992729161256, | |
| "grad_norm": 0.04595618322491646, | |
| "kl": 0.47253532335162163, | |
| "learning_rate": 9.727681934184481e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9000000506639481, | |
| "reward_std": 0.1590990237891674, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 1636.546875, | |
| "epoch": 0.22435730978966503, | |
| "grad_norm": 0.03207004442811012, | |
| "kl": 0.37253231182694435, | |
| "learning_rate": 9.722313523268028e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000439584255, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 1696.84375, | |
| "epoch": 0.22643469228771748, | |
| "grad_norm": 0.08920740336179733, | |
| "kl": 0.7168225161731243, | |
| "learning_rate": 9.716894222603942e-05, | |
| "loss": 0.0004, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 1443.65625, | |
| "epoch": 0.22851207478576993, | |
| "grad_norm": 0.06247260421514511, | |
| "kl": 0.3850158527493477, | |
| "learning_rate": 9.711424090593019e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7617187947034836, | |
| "reward_std": 0.2662698905915022, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 1509.8125, | |
| "epoch": 0.23058945728382238, | |
| "grad_norm": 0.06556280702352524, | |
| "kl": 0.3532305136322975, | |
| "learning_rate": 9.705903186183828e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7281250506639481, | |
| "reward_std": 0.3137786276638508, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 1467.875, | |
| "epoch": 0.23266683978187483, | |
| "grad_norm": 0.06337332725524902, | |
| "kl": 0.3515300862491131, | |
| "learning_rate": 9.700331568872086e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8054687976837158, | |
| "reward_std": 0.2905766889452934, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19609375298023224, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 1409.40625, | |
| "epoch": 0.23474422227992728, | |
| "grad_norm": 0.06349179893732071, | |
| "kl": 0.35185598209500313, | |
| "learning_rate": 9.694709298700016e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7750000469386578, | |
| "reward_std": 0.24748736945912242, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 1412.53125, | |
| "epoch": 0.23682160477797976, | |
| "grad_norm": 0.064293272793293, | |
| "kl": 0.506983544677496, | |
| "learning_rate": 9.689036436255699e-05, | |
| "loss": 0.0003, | |
| "reward": 0.724609412252903, | |
| "reward_std": 0.22483785497024655, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.19335937686264515, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 1372.859375, | |
| "epoch": 0.2388989872760322, | |
| "grad_norm": 0.06638536602258682, | |
| "kl": 0.35727328434586525, | |
| "learning_rate": 9.683313042672418e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.287262124940753, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 1373.09375, | |
| "epoch": 0.24097636977408465, | |
| "grad_norm": 0.06149492412805557, | |
| "kl": 0.3754408285021782, | |
| "learning_rate": 9.677539179628005e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 1313.734375, | |
| "epoch": 0.2430537522721371, | |
| "grad_norm": 0.062166426330804825, | |
| "kl": 0.40280015021562576, | |
| "learning_rate": 9.671714909344174e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8531250506639481, | |
| "reward_std": 0.2695844564586878, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 1132.78125, | |
| "epoch": 0.24513113477018955, | |
| "grad_norm": 0.06206024810671806, | |
| "kl": 0.4296950623393059, | |
| "learning_rate": 9.665840294585845e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 1270.625, | |
| "epoch": 0.24720851726824203, | |
| "grad_norm": 0.05055106431245804, | |
| "kl": 0.3436691351234913, | |
| "learning_rate": 9.659915398660477e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7742187902331352, | |
| "reward_std": 0.16020388156175613, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19609375298023224, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 1221.015625, | |
| "epoch": 0.24928589976629448, | |
| "grad_norm": 0.06833141297101974, | |
| "kl": 0.3341045156121254, | |
| "learning_rate": 9.65394028541738e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 1273.125, | |
| "epoch": 0.2513632822643469, | |
| "grad_norm": 0.06194274127483368, | |
| "kl": 0.3503304682672024, | |
| "learning_rate": 9.647915019247029e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6687500439584255, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 1169.4375, | |
| "epoch": 0.2534406647623994, | |
| "grad_norm": 0.05682160705327988, | |
| "kl": 0.4410577192902565, | |
| "learning_rate": 9.641839665080363e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 1201.921875, | |
| "epoch": 0.25551804726045185, | |
| "grad_norm": 0.061100929975509644, | |
| "kl": 0.3569498844444752, | |
| "learning_rate": 9.635714288388102e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 1296.234375, | |
| "epoch": 0.2575954297585043, | |
| "grad_norm": 0.0515943244099617, | |
| "kl": 0.34240079671144485, | |
| "learning_rate": 9.629538955180021e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6835937909781933, | |
| "reward_std": 0.15578446350991726, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 1237.078125, | |
| "epoch": 0.25967281225655675, | |
| "grad_norm": 0.07088616490364075, | |
| "kl": 0.34578079730272293, | |
| "learning_rate": 9.623313732004258e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6687500402331352, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 1213.71875, | |
| "epoch": 0.2617501947546092, | |
| "grad_norm": 0.05374123901128769, | |
| "kl": 0.3377624601125717, | |
| "learning_rate": 9.617038685946578e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750402331352, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 1325.421875, | |
| "epoch": 0.26382757725266165, | |
| "grad_norm": 0.05408313870429993, | |
| "kl": 0.3581954091787338, | |
| "learning_rate": 9.610713884629666e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 1305.421875, | |
| "epoch": 0.2659049597507141, | |
| "grad_norm": 0.05651029199361801, | |
| "kl": 0.3299425356090069, | |
| "learning_rate": 9.60433939621239e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6500000394880772, | |
| "reward_std": 0.15909902285784483, | |
| "rewards/argmax_reward_func": 0.453125, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 1394.34375, | |
| "epoch": 0.26798234224876655, | |
| "grad_norm": 0.05847406014800072, | |
| "kl": 0.3248457871377468, | |
| "learning_rate": 9.597915289389066e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8847656697034836, | |
| "reward_std": 0.22483785450458527, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 1361.125, | |
| "epoch": 0.270059724746819, | |
| "grad_norm": 0.03918185085058212, | |
| "kl": 0.29694442078471184, | |
| "learning_rate": 9.591441633388724e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8687500506639481, | |
| "reward_std": 0.11490485025569797, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 1294.34375, | |
| "epoch": 0.27213710724487145, | |
| "grad_norm": 0.06627894192934036, | |
| "kl": 0.317622110247612, | |
| "learning_rate": 9.584918497974354e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8031250387430191, | |
| "reward_std": 0.2519067842513323, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19375000149011612, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 1264.578125, | |
| "epoch": 0.2742144897429239, | |
| "grad_norm": 0.05716657266020775, | |
| "kl": 0.33274614438414574, | |
| "learning_rate": 9.578345953442162e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7093750424683094, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.19375000335276127, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 1101.5625, | |
| "epoch": 0.27629187224097634, | |
| "grad_norm": 0.06597350537776947, | |
| "kl": 0.3318898268043995, | |
| "learning_rate": 9.571724070620806e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 1340.328125, | |
| "epoch": 0.27836925473902885, | |
| "grad_norm": 0.06743122637271881, | |
| "kl": 0.2939135618507862, | |
| "learning_rate": 9.565052920870636e-05, | |
| "loss": 0.0001, | |
| "reward": 0.6312500461935997, | |
| "reward_std": 0.27400387404486537, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.19375000149011612, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 1385.546875, | |
| "epoch": 0.2804466372370813, | |
| "grad_norm": 0.05118987336754799, | |
| "kl": 0.27961407601833344, | |
| "learning_rate": 9.558332576082925e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8664062991738319, | |
| "reward_std": 0.20660776272416115, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19453125074505806, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 1284.90625, | |
| "epoch": 0.28252401973513375, | |
| "grad_norm": 0.060963716357946396, | |
| "kl": 0.310220867395401, | |
| "learning_rate": 9.551563108679091e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 1260.078125, | |
| "epoch": 0.2846014022331862, | |
| "grad_norm": 0.0460037924349308, | |
| "kl": 0.39314381033182144, | |
| "learning_rate": 9.544744591609922e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250402331352, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 1031.03125, | |
| "epoch": 0.28667878473123865, | |
| "grad_norm": 0.06592284142971039, | |
| "kl": 0.4330439232289791, | |
| "learning_rate": 9.537877098354786e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 1101.796875, | |
| "epoch": 0.2887561672292911, | |
| "grad_norm": 0.0644359141588211, | |
| "kl": 0.2887462917715311, | |
| "learning_rate": 9.53096070292084e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8218750432133675, | |
| "reward_std": 0.22539028152823448, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 1065.90625, | |
| "epoch": 0.29083354972734354, | |
| "grad_norm": 0.065298892557621, | |
| "kl": 0.30470659770071507, | |
| "learning_rate": 9.523995479842232e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6218750365078449, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.421875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 978.953125, | |
| "epoch": 0.292910932225396, | |
| "grad_norm": 0.05792571231722832, | |
| "kl": 0.4863986298441887, | |
| "learning_rate": 9.516981504179299e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 1087.9375, | |
| "epoch": 0.29498831472344844, | |
| "grad_norm": 0.06688184291124344, | |
| "kl": 0.29886077158153057, | |
| "learning_rate": 9.509918851517758e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 1019.140625, | |
| "epoch": 0.2970656972215009, | |
| "grad_norm": 0.06881757080554962, | |
| "kl": 0.3445068225264549, | |
| "learning_rate": 9.502807597967893e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 1186.5625, | |
| "epoch": 0.29914307971955334, | |
| "grad_norm": 0.05837235972285271, | |
| "kl": 0.32853276655077934, | |
| "learning_rate": 9.495647820163725e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8855469226837158, | |
| "reward_std": 0.17953882738947868, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19804687798023224, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 931.265625, | |
| "epoch": 0.30122046221760584, | |
| "grad_norm": 0.046699460595846176, | |
| "kl": 0.33741075173020363, | |
| "learning_rate": 9.488439595262204e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 944.25, | |
| "epoch": 0.3032978447156583, | |
| "grad_norm": 0.06217503920197487, | |
| "kl": 0.3317374251782894, | |
| "learning_rate": 9.48118300094236e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7312500476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 1001.078125, | |
| "epoch": 0.30537522721371074, | |
| "grad_norm": 0.06545262783765793, | |
| "kl": 0.3109145648777485, | |
| "learning_rate": 9.473878115404477e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 910.796875, | |
| "epoch": 0.3074526097117632, | |
| "grad_norm": 0.05757139250636101, | |
| "kl": 0.3004848547279835, | |
| "learning_rate": 9.466525017369243e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 1078.265625, | |
| "epoch": 0.30952999220981564, | |
| "grad_norm": 0.07616781443357468, | |
| "kl": 0.28462448343634605, | |
| "learning_rate": 9.459123786076912e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.33145629428327084, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 963.71875, | |
| "epoch": 0.3116073747078681, | |
| "grad_norm": 0.06607849150896072, | |
| "kl": 0.2945715934038162, | |
| "learning_rate": 9.451674501286436e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7468750402331352, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 925.484375, | |
| "epoch": 0.31368475720592054, | |
| "grad_norm": 0.08415860682725906, | |
| "kl": 0.326167568564415, | |
| "learning_rate": 9.444177243274618e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7000000439584255, | |
| "reward_std": 0.35355337895452976, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 814.296875, | |
| "epoch": 0.315762139703973, | |
| "grad_norm": 0.049171049147844315, | |
| "kl": 0.312137458473444, | |
| "learning_rate": 9.436632092835239e-05, | |
| "loss": 0.0002, | |
| "reward": 1.0281250476837158, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.828125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 783.484375, | |
| "epoch": 0.31783952220202544, | |
| "grad_norm": 0.06367822736501694, | |
| "kl": 0.33039499446749687, | |
| "learning_rate": 9.42903913127819e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7281250394880772, | |
| "reward_std": 0.18119611032307148, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 861.609375, | |
| "epoch": 0.3199169047000779, | |
| "grad_norm": 0.06421905755996704, | |
| "kl": 0.3065376691520214, | |
| "learning_rate": 9.421398440428597e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 902.390625, | |
| "epoch": 0.32199428719813034, | |
| "grad_norm": 0.07363509386777878, | |
| "kl": 0.33688198402523994, | |
| "learning_rate": 9.413710102625938e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.287262124940753, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 922.0625, | |
| "epoch": 0.3240716696961828, | |
| "grad_norm": 0.06810685992240906, | |
| "kl": 0.34481339529156685, | |
| "learning_rate": 9.405974200723155e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500476837158, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 884.640625, | |
| "epoch": 0.3261490521942353, | |
| "grad_norm": 0.06919455528259277, | |
| "kl": 0.3362896367907524, | |
| "learning_rate": 9.398190818085763e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8398437947034836, | |
| "reward_std": 0.2441728077828884, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 831.109375, | |
| "epoch": 0.32822643469228774, | |
| "grad_norm": 0.08263985067605972, | |
| "kl": 0.882828488945961, | |
| "learning_rate": 9.390360038590951e-05, | |
| "loss": 0.0004, | |
| "reward": 0.8531250506639481, | |
| "reward_std": 0.22539028339087963, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 879.40625, | |
| "epoch": 0.3303038171903402, | |
| "grad_norm": 0.0637197494506836, | |
| "kl": 0.31912703067064285, | |
| "learning_rate": 9.382481946626674e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 696.3125, | |
| "epoch": 0.33238119968839264, | |
| "grad_norm": 0.08041277527809143, | |
| "kl": 0.3748646304011345, | |
| "learning_rate": 9.374556627090749e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 891.265625, | |
| "epoch": 0.3344585821864451, | |
| "grad_norm": 0.06974095106124878, | |
| "kl": 0.3626530338078737, | |
| "learning_rate": 9.366584165389941e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250439584255, | |
| "reward_std": 0.24306794814765453, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 813.03125, | |
| "epoch": 0.33653596468449753, | |
| "grad_norm": 0.092588409781456, | |
| "kl": 0.3759094402194023, | |
| "learning_rate": 9.358564647439037e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7593750506639481, | |
| "reward_std": 0.35797279700636864, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 838.296875, | |
| "epoch": 0.33861334718255, | |
| "grad_norm": 0.06730964034795761, | |
| "kl": 0.3410007916390896, | |
| "learning_rate": 9.350498159659924e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250439584255, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 775.375, | |
| "epoch": 0.34069072968060243, | |
| "grad_norm": 0.06589485704898834, | |
| "kl": 0.3439077027142048, | |
| "learning_rate": 9.342384788980656e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7312500420957804, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 781.0, | |
| "epoch": 0.3427681121786549, | |
| "grad_norm": 0.07955412566661835, | |
| "kl": 0.359022606164217, | |
| "learning_rate": 9.33422462283452e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750402331352, | |
| "reward_std": 0.287262124940753, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 895.96875, | |
| "epoch": 0.34484549467670733, | |
| "grad_norm": 0.06293340772390366, | |
| "kl": 0.4317344203591347, | |
| "learning_rate": 9.326017749159087e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 884.25, | |
| "epoch": 0.3469228771747598, | |
| "grad_norm": 0.07700355350971222, | |
| "kl": 0.5744567923247814, | |
| "learning_rate": 9.317764256395275e-05, | |
| "loss": 0.0003, | |
| "reward": 0.6031250357627869, | |
| "reward_std": 0.26958445459604263, | |
| "rewards/argmax_reward_func": 0.40625, | |
| "rewards/format_reward_func": 0.19687500409781933, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 889.921875, | |
| "epoch": 0.34900025967281223, | |
| "grad_norm": 0.062210842967033386, | |
| "kl": 0.32679086178541183, | |
| "learning_rate": 9.309464233486387e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750402331352, | |
| "reward_std": 0.19887377507984638, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 788.84375, | |
| "epoch": 0.35107764217086473, | |
| "grad_norm": 0.0710466280579567, | |
| "kl": 0.35585347935557365, | |
| "learning_rate": 9.301117769877153e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8187500350177288, | |
| "reward_std": 0.22207572124898434, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19375000149011612, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 849.484375, | |
| "epoch": 0.3531550246689172, | |
| "grad_norm": 0.0648435726761818, | |
| "kl": 0.32205165177583694, | |
| "learning_rate": 9.292724955512774e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 856.984375, | |
| "epoch": 0.35523240716696963, | |
| "grad_norm": 0.06077580899000168, | |
| "kl": 0.34612051025032997, | |
| "learning_rate": 9.284285880837946e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7156250365078449, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 776.5, | |
| "epoch": 0.3573097896650221, | |
| "grad_norm": 0.07481009513139725, | |
| "kl": 0.3612271770834923, | |
| "learning_rate": 9.275800636795884e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7773437909781933, | |
| "reward_std": 0.28836698085069656, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 789.046875, | |
| "epoch": 0.35938717216307453, | |
| "grad_norm": 0.07435107976198196, | |
| "kl": 0.3340052030980587, | |
| "learning_rate": 9.267269314827345e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8398437947034836, | |
| "reward_std": 0.28836698085069656, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 768.390625, | |
| "epoch": 0.361464554661127, | |
| "grad_norm": 0.08201409131288528, | |
| "kl": 0.3197612836956978, | |
| "learning_rate": 9.258692006869643e-05, | |
| "loss": 0.0002, | |
| "reward": 0.621093787252903, | |
| "reward_std": 0.3325611485633999, | |
| "rewards/argmax_reward_func": 0.421875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 740.515625, | |
| "epoch": 0.36354193715917943, | |
| "grad_norm": 0.08215157687664032, | |
| "kl": 0.3205004744231701, | |
| "learning_rate": 9.250068805355658e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.2872621212154627, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 833.796875, | |
| "epoch": 0.3656193196572319, | |
| "grad_norm": 0.06702969968318939, | |
| "kl": 0.31005076318979263, | |
| "learning_rate": 9.24139980321284e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.28726212307810783, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 790.546875, | |
| "epoch": 0.3676967021552843, | |
| "grad_norm": 0.08229520171880722, | |
| "kl": 0.32232359051704407, | |
| "learning_rate": 9.232685093862204e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.33145629428327084, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 814.953125, | |
| "epoch": 0.3697740846533368, | |
| "grad_norm": 0.0782497227191925, | |
| "kl": 0.3153250627219677, | |
| "learning_rate": 9.22392477121733e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7750000506639481, | |
| "reward_std": 0.3358757123351097, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 972.390625, | |
| "epoch": 0.3718514671513892, | |
| "grad_norm": 0.07078168541193008, | |
| "kl": 0.3324251137673855, | |
| "learning_rate": 9.215118929683344e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7750000469386578, | |
| "reward_std": 0.29168154671788216, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 757.90625, | |
| "epoch": 0.37392884964944173, | |
| "grad_norm": 0.09468799084424973, | |
| "kl": 0.33874499425292015, | |
| "learning_rate": 9.206267664155907e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.4640388172119856, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 844.265625, | |
| "epoch": 0.3760062321474942, | |
| "grad_norm": 0.08337994664907455, | |
| "kl": 0.31716278567910194, | |
| "learning_rate": 9.197371070020184e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7906250506639481, | |
| "reward_std": 0.3579728025943041, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 864.234375, | |
| "epoch": 0.37808361464554663, | |
| "grad_norm": 0.0694384053349495, | |
| "kl": 0.32097451388835907, | |
| "learning_rate": 9.188429243149824e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6472656652331352, | |
| "reward_std": 0.24362037517130375, | |
| "rewards/argmax_reward_func": 0.453125, | |
| "rewards/format_reward_func": 0.19414062798023224, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 749.046875, | |
| "epoch": 0.3801609971435991, | |
| "grad_norm": 0.062498513609170914, | |
| "kl": 0.3336629420518875, | |
| "learning_rate": 9.179442279905928e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 923.640625, | |
| "epoch": 0.3822383796416515, | |
| "grad_norm": 0.07083828747272491, | |
| "kl": 0.32143479958176613, | |
| "learning_rate": 9.170410277135999e-05, | |
| "loss": 0.0002, | |
| "reward": 0.75625004991889, | |
| "reward_std": 0.31819804944097996, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.19375000149011612, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 884.015625, | |
| "epoch": 0.384315762139704, | |
| "grad_norm": 0.0693785548210144, | |
| "kl": 0.4987417571246624, | |
| "learning_rate": 9.161333332172912e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7062500454485416, | |
| "reward_std": 0.24748736806213856, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.1906250026077032, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 795.234375, | |
| "epoch": 0.3863931446377564, | |
| "grad_norm": 0.07442086935043335, | |
| "kl": 0.33293722197413445, | |
| "learning_rate": 9.152211542833857e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 929.25, | |
| "epoch": 0.3884705271358089, | |
| "grad_norm": 0.05656367912888527, | |
| "kl": 0.31210994347929955, | |
| "learning_rate": 9.143045007419284e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 902.828125, | |
| "epoch": 0.3905479096338613, | |
| "grad_norm": 0.06982313841581345, | |
| "kl": 0.3056885749101639, | |
| "learning_rate": 9.133833824711853e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7156250439584255, | |
| "reward_std": 0.28726212307810783, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 995.71875, | |
| "epoch": 0.3926252921319138, | |
| "grad_norm": 0.07361900061368942, | |
| "kl": 0.3039589188992977, | |
| "learning_rate": 9.124578093975358e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7300781719386578, | |
| "reward_std": 0.35521066188812256, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.19882812909781933, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 964.359375, | |
| "epoch": 0.3947026746299662, | |
| "grad_norm": 0.07159875333309174, | |
| "kl": 0.3399963229894638, | |
| "learning_rate": 9.115277914953662e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.33145629428327084, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 1034.03125, | |
| "epoch": 0.39678005712801867, | |
| "grad_norm": 0.07782501727342606, | |
| "kl": 0.3184865601360798, | |
| "learning_rate": 9.105933387869628e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6910156607627869, | |
| "reward_std": 0.4104533866047859, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.19101562723517418, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 1139.8125, | |
| "epoch": 0.3988574396260712, | |
| "grad_norm": 0.05474551394581795, | |
| "kl": 0.29825419560074806, | |
| "learning_rate": 9.096544613424025e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8804688006639481, | |
| "reward_std": 0.27510872669517994, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19296875223517418, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 1003.0, | |
| "epoch": 0.4009348221241236, | |
| "grad_norm": 0.0725637748837471, | |
| "kl": 0.3301442116498947, | |
| "learning_rate": 9.087111692794459e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7304687947034836, | |
| "reward_std": 0.31046406365931034, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 1139.734375, | |
| "epoch": 0.4030122046221761, | |
| "grad_norm": 0.057006120681762695, | |
| "kl": 0.31723184883594513, | |
| "learning_rate": 9.077634727634272e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8449219167232513, | |
| "reward_std": 0.23146697832271457, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.1886718776077032, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 1048.71875, | |
| "epoch": 0.4050895871202285, | |
| "grad_norm": 0.07205278426408768, | |
| "kl": 0.33233997970819473, | |
| "learning_rate": 9.068113820071447e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7875000387430191, | |
| "reward_std": 0.3181980513036251, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19375000335276127, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 955.6875, | |
| "epoch": 0.40716696961828097, | |
| "grad_norm": 0.057774197310209274, | |
| "kl": 0.3174768090248108, | |
| "learning_rate": 9.058549072707513e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8347656726837158, | |
| "reward_std": 0.1994262058287859, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.19414062798023224, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 1300.125, | |
| "epoch": 0.4092443521163334, | |
| "grad_norm": 0.05007508769631386, | |
| "kl": 0.30298993550240993, | |
| "learning_rate": 9.048940588616435e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7843750491738319, | |
| "reward_std": 0.22539028525352478, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.1906250026077032, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 1237.953125, | |
| "epoch": 0.41132173461438587, | |
| "grad_norm": 0.060646846890449524, | |
| "kl": 0.3001830168068409, | |
| "learning_rate": 9.039288471343504e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8812500461935997, | |
| "reward_std": 0.27400387451052666, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19375000335276127, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 1186.453125, | |
| "epoch": 0.4133991171124383, | |
| "grad_norm": 0.05342816561460495, | |
| "kl": 0.30144498124718666, | |
| "learning_rate": 9.029592824904225e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8074219226837158, | |
| "reward_std": 0.24583008885383606, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19804687798023224, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1274.375, | |
| "epoch": 0.41547649961049077, | |
| "grad_norm": 0.05866052210330963, | |
| "kl": 0.3122952822595835, | |
| "learning_rate": 9.019853753783185e-05, | |
| "loss": 0.0002, | |
| "reward": 0.652343787252903, | |
| "reward_std": 0.2264951393008232, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.1835937537252903, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 1241.5, | |
| "epoch": 0.4175538821085432, | |
| "grad_norm": 0.05399727076292038, | |
| "kl": 0.33987458795309067, | |
| "learning_rate": 9.010071362932944e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8687500432133675, | |
| "reward_std": 0.2032931987196207, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1336.140625, | |
| "epoch": 0.41963126460659567, | |
| "grad_norm": 0.06403433531522751, | |
| "kl": 0.28705168329179287, | |
| "learning_rate": 9.000245757772885e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8281250521540642, | |
| "reward_std": 0.29610096476972103, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.18750000558793545, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 1035.203125, | |
| "epoch": 0.42170864710464817, | |
| "grad_norm": 0.0628470629453659, | |
| "kl": 0.30243775993585587, | |
| "learning_rate": 8.990377044188098e-05, | |
| "loss": 0.0002, | |
| "reward": 0.85000004991889, | |
| "reward_std": 0.22980970283970237, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19375000149011612, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1175.90625, | |
| "epoch": 0.4237860296027006, | |
| "grad_norm": 0.05064735934138298, | |
| "kl": 0.3158372975885868, | |
| "learning_rate": 8.980465328528219e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7906250394880772, | |
| "reward_std": 0.18119611404836178, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 1109.28125, | |
| "epoch": 0.42586341210075307, | |
| "grad_norm": 0.060826126486063004, | |
| "kl": 0.2915416620671749, | |
| "learning_rate": 8.9705107176063e-05, | |
| "loss": 0.0001, | |
| "reward": 0.9437500461935997, | |
| "reward_std": 0.22980970703065395, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.19375000149011612, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 979.453125, | |
| "epoch": 0.4279407945988055, | |
| "grad_norm": 0.062372464686632156, | |
| "kl": 0.3590022251009941, | |
| "learning_rate": 8.960513318697647e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 1014.40625, | |
| "epoch": 0.43001817709685797, | |
| "grad_norm": 0.0675223246216774, | |
| "kl": 0.3203696608543396, | |
| "learning_rate": 8.950473239538673e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8835937976837158, | |
| "reward_std": 0.27068931609392166, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.1960937511175871, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 1077.65625, | |
| "epoch": 0.4320955595949104, | |
| "grad_norm": 0.07710019499063492, | |
| "kl": 0.28732946887612343, | |
| "learning_rate": 8.940390588325727e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8781250491738319, | |
| "reward_std": 0.4110058154910803, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19062500447034836, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 985.4375, | |
| "epoch": 0.43417294209296287, | |
| "grad_norm": 0.04350803792476654, | |
| "kl": 0.3246513232588768, | |
| "learning_rate": 8.930265473713938e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8531250469386578, | |
| "reward_std": 0.13700193725526333, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19687500409781933, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 984.140625, | |
| "epoch": 0.4362503245910153, | |
| "grad_norm": 0.06984654814004898, | |
| "kl": 0.32982902973890305, | |
| "learning_rate": 8.920098004816036e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8710937947034836, | |
| "reward_std": 0.24417280592024326, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 870.03125, | |
| "epoch": 0.43832770708906776, | |
| "grad_norm": 0.06809406727552414, | |
| "kl": 0.29028210788965225, | |
| "learning_rate": 8.909888291201182e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8718750439584255, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 906.40625, | |
| "epoch": 0.4404050895871202, | |
| "grad_norm": 0.08602919429540634, | |
| "kl": 0.2871505431830883, | |
| "learning_rate": 8.899636442893783e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8062500469386578, | |
| "reward_std": 0.3800698835402727, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 869.421875, | |
| "epoch": 0.44248247208517266, | |
| "grad_norm": 0.05844723433256149, | |
| "kl": 0.26848769187927246, | |
| "learning_rate": 8.88934257037231e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8710937947034836, | |
| "reward_std": 0.19997863844037056, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 864.75, | |
| "epoch": 0.4445598545832251, | |
| "grad_norm": 0.07575644552707672, | |
| "kl": 0.4048551693558693, | |
| "learning_rate": 8.879006784568104e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7613281682133675, | |
| "reward_std": 0.26682231947779655, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.19882812723517418, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 877.96875, | |
| "epoch": 0.4466372370812776, | |
| "grad_norm": 0.07090688496828079, | |
| "kl": 0.2992668803781271, | |
| "learning_rate": 8.868629196864182e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8035156689584255, | |
| "reward_std": 0.24362037889659405, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19414062798023224, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 779.8125, | |
| "epoch": 0.44871461957933007, | |
| "grad_norm": 0.069987952709198, | |
| "kl": 0.2883603498339653, | |
| "learning_rate": 8.858209919094039e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7156250402331352, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 748.59375, | |
| "epoch": 0.4507920020773825, | |
| "grad_norm": 0.07478881627321243, | |
| "kl": 0.2929369006305933, | |
| "learning_rate": 8.847749063540439e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8066406697034836, | |
| "reward_std": 0.24693494103848934, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 749.8125, | |
| "epoch": 0.45286938457543496, | |
| "grad_norm": 0.08688110113143921, | |
| "kl": 0.3713537007570267, | |
| "learning_rate": 8.837246742934207e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7765625454485416, | |
| "reward_std": 0.33366601169109344, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 697.546875, | |
| "epoch": 0.4549467670734874, | |
| "grad_norm": 0.08973264694213867, | |
| "kl": 0.36194442212581635, | |
| "learning_rate": 8.826703070453015e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8511719219386578, | |
| "reward_std": 0.316540764644742, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19492187909781933, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 736.953125, | |
| "epoch": 0.45702414957153986, | |
| "grad_norm": 0.06507878005504608, | |
| "kl": 0.32681479677557945, | |
| "learning_rate": 8.816118159720156e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 686.046875, | |
| "epoch": 0.4591015320695923, | |
| "grad_norm": 0.08443711698055267, | |
| "kl": 0.27842542715370655, | |
| "learning_rate": 8.805492124803331e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7750000506639481, | |
| "reward_std": 0.2474873699247837, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 636.28125, | |
| "epoch": 0.46117891456764476, | |
| "grad_norm": 0.08033400774002075, | |
| "kl": 0.2751711644232273, | |
| "learning_rate": 8.794825080213414e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 652.40625, | |
| "epoch": 0.4632562970656972, | |
| "grad_norm": 0.07973612844944, | |
| "kl": 0.28703486546874046, | |
| "learning_rate": 8.78411714090321e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 656.5, | |
| "epoch": 0.46533367956374966, | |
| "grad_norm": 0.0935521349310875, | |
| "kl": 0.28887104988098145, | |
| "learning_rate": 8.77336842226623e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.3535533845424652, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 601.90625, | |
| "epoch": 0.4674110620618021, | |
| "grad_norm": 0.08775703608989716, | |
| "kl": 0.2865128982812166, | |
| "learning_rate": 8.76257904013544e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 649.09375, | |
| "epoch": 0.46948844455985456, | |
| "grad_norm": 0.07471180707216263, | |
| "kl": 0.3112582378089428, | |
| "learning_rate": 8.751749110782012e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 651.375, | |
| "epoch": 0.47156582705790706, | |
| "grad_norm": 0.08434654772281647, | |
| "kl": 0.33592014387249947, | |
| "learning_rate": 8.740878750914076e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8390625491738319, | |
| "reward_std": 0.24527766555547714, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 592.15625, | |
| "epoch": 0.4736432095559595, | |
| "grad_norm": 0.10138159990310669, | |
| "kl": 0.3475854229182005, | |
| "learning_rate": 8.729968077675454e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.33145629428327084, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 618.953125, | |
| "epoch": 0.47572059205401196, | |
| "grad_norm": 0.08923006802797318, | |
| "kl": 0.32317574694752693, | |
| "learning_rate": 8.71901720864441e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8085937947034836, | |
| "reward_std": 0.28836698085069656, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 606.46875, | |
| "epoch": 0.4777979745520644, | |
| "grad_norm": 0.07547228038311005, | |
| "kl": 0.4202072508633137, | |
| "learning_rate": 8.70802626183237e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7757812924683094, | |
| "reward_std": 0.20218834839761257, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19765625149011612, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 567.375, | |
| "epoch": 0.47987535705011686, | |
| "grad_norm": 0.07534275949001312, | |
| "kl": 0.5509752966463566, | |
| "learning_rate": 8.696995355682656e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8250000439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 615.265625, | |
| "epoch": 0.4819527395481693, | |
| "grad_norm": 0.08038201183080673, | |
| "kl": 0.3771616071462631, | |
| "learning_rate": 8.685924609069214e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8695312887430191, | |
| "reward_std": 0.20218834280967712, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19765625149011612, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 602.453125, | |
| "epoch": 0.48403012204622176, | |
| "grad_norm": 0.07698789983987808, | |
| "kl": 0.6121297106146812, | |
| "learning_rate": 8.674814141295324e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8718750439584255, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 592.75, | |
| "epoch": 0.4861075045442742, | |
| "grad_norm": 0.09831973165273666, | |
| "kl": 0.31690799072384834, | |
| "learning_rate": 8.663664072092323e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8246094211935997, | |
| "reward_std": 0.3099116366356611, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19960937835276127, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 558.90625, | |
| "epoch": 0.48818488704232665, | |
| "grad_norm": 0.09684620797634125, | |
| "kl": 0.3237866424024105, | |
| "learning_rate": 8.652474521618306e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.3093592096120119, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 626.078125, | |
| "epoch": 0.4902622695403791, | |
| "grad_norm": 0.06933271139860153, | |
| "kl": 0.3663709722459316, | |
| "learning_rate": 8.641245610456838e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9812500476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.78125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 579.40625, | |
| "epoch": 0.49233965203843155, | |
| "grad_norm": 0.08088324964046478, | |
| "kl": 0.3435916490852833, | |
| "learning_rate": 8.629977459615655e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 607.125, | |
| "epoch": 0.49441703453648406, | |
| "grad_norm": 0.07071245461702347, | |
| "kl": 0.2806865181773901, | |
| "learning_rate": 8.618670190525352e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8250000439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 558.390625, | |
| "epoch": 0.4964944170345365, | |
| "grad_norm": 0.08282584697008133, | |
| "kl": 0.40584639832377434, | |
| "learning_rate": 8.607323925038082e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7156250439584255, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 667.71875, | |
| "epoch": 0.49857179953258896, | |
| "grad_norm": 0.08190900087356567, | |
| "kl": 0.35680179484188557, | |
| "learning_rate": 8.595938785426241e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.24306794814765453, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 670.8125, | |
| "epoch": 0.5006491820306413, | |
| "grad_norm": 0.08591850101947784, | |
| "kl": 0.3619570918381214, | |
| "learning_rate": 8.584514894381151e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 676.25, | |
| "epoch": 0.5027265645286938, | |
| "grad_norm": 0.08506251126527786, | |
| "kl": 0.3224334083497524, | |
| "learning_rate": 8.573052375011733e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8867187947034836, | |
| "reward_std": 0.2662698905915022, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 673.1875, | |
| "epoch": 0.5048039470267462, | |
| "grad_norm": 0.06150234118103981, | |
| "kl": 0.3478453829884529, | |
| "learning_rate": 8.561551350843186e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9656250476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.765625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 692.546875, | |
| "epoch": 0.5068813295247988, | |
| "grad_norm": 0.06618204712867737, | |
| "kl": 0.29330621659755707, | |
| "learning_rate": 8.550011945815655e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 687.03125, | |
| "epoch": 0.5089587120228513, | |
| "grad_norm": 0.07623764872550964, | |
| "kl": 0.3498356007039547, | |
| "learning_rate": 8.538434284282892e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 660.703125, | |
| "epoch": 0.5110360945209037, | |
| "grad_norm": 0.04135030135512352, | |
| "kl": 0.32844917103648186, | |
| "learning_rate": 8.526818491010922e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 716.609375, | |
| "epoch": 0.5131134770189562, | |
| "grad_norm": 0.0865129679441452, | |
| "kl": 0.3059841375797987, | |
| "learning_rate": 8.515164691176687e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7312500439584255, | |
| "reward_std": 0.3093592096120119, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 757.4375, | |
| "epoch": 0.5151908595170086, | |
| "grad_norm": 0.07157998532056808, | |
| "kl": 0.2929275669157505, | |
| "learning_rate": 8.503473010366713e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8867187909781933, | |
| "reward_std": 0.22207572311162949, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 662.859375, | |
| "epoch": 0.517268242015061, | |
| "grad_norm": 0.06820650398731232, | |
| "kl": 0.31902188807725906, | |
| "learning_rate": 8.491743574575743e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 721.625, | |
| "epoch": 0.5193456245131135, | |
| "grad_norm": 0.0756940096616745, | |
| "kl": 0.31403973512351513, | |
| "learning_rate": 8.479976510205387e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.22097086161375046, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 748.046875, | |
| "epoch": 0.521423007011166, | |
| "grad_norm": 0.07090619206428528, | |
| "kl": 0.2569838650524616, | |
| "learning_rate": 8.468171944062755e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7929687947034836, | |
| "reward_std": 0.22207572311162949, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 693.609375, | |
| "epoch": 0.5235003895092184, | |
| "grad_norm": 0.06538081914186478, | |
| "kl": 0.2928556613624096, | |
| "learning_rate": 8.456330003359093e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 728.953125, | |
| "epoch": 0.5255777720072708, | |
| "grad_norm": 0.09474781900644302, | |
| "kl": 0.27900537475943565, | |
| "learning_rate": 8.444450815708415e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.3977475557476282, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 737.625, | |
| "epoch": 0.5276551545053233, | |
| "grad_norm": 0.06914320588111877, | |
| "kl": 0.26191011257469654, | |
| "learning_rate": 8.432534509126122e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 743.609375, | |
| "epoch": 0.5297325370033757, | |
| "grad_norm": 0.05855982005596161, | |
| "kl": 0.26190576888620853, | |
| "learning_rate": 8.420581212027624e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 768.265625, | |
| "epoch": 0.5318099195014282, | |
| "grad_norm": 0.058118585497140884, | |
| "kl": 0.2930552177131176, | |
| "learning_rate": 8.408591053226964e-05, | |
| "loss": 0.0001, | |
| "reward": 0.9492187947034836, | |
| "reward_std": 0.13368737325072289, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 763.0, | |
| "epoch": 0.5338873019994806, | |
| "grad_norm": 0.07115372270345688, | |
| "kl": 0.35762836039066315, | |
| "learning_rate": 8.396564161935411e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8710937947034836, | |
| "reward_std": 0.1999786328524351, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 865.8125, | |
| "epoch": 0.5359646844975331, | |
| "grad_norm": 0.06384899467229843, | |
| "kl": 0.2847513500601053, | |
| "learning_rate": 8.38450066776009e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8375000506639481, | |
| "reward_std": 0.2032931987196207, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 691.734375, | |
| "epoch": 0.5380420669955855, | |
| "grad_norm": 0.08122014999389648, | |
| "kl": 0.2869179602712393, | |
| "learning_rate": 8.37240070070257e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.30935921147465706, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 759.203125, | |
| "epoch": 0.540119449493638, | |
| "grad_norm": 0.06432370841503143, | |
| "kl": 0.3190025221556425, | |
| "learning_rate": 8.360264391157471e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9500000476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 872.890625, | |
| "epoch": 0.5421968319916904, | |
| "grad_norm": 0.08087541162967682, | |
| "kl": 0.2903926521539688, | |
| "learning_rate": 8.348091869911054e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7554687969386578, | |
| "reward_std": 0.27510873042047024, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.19296875409781933, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 868.875, | |
| "epoch": 0.5442742144897429, | |
| "grad_norm": 0.06983164697885513, | |
| "kl": 0.25976957008242607, | |
| "learning_rate": 8.335883268139813e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8062500506639481, | |
| "reward_std": 0.247487373650074, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 781.53125, | |
| "epoch": 0.5463515969877953, | |
| "grad_norm": 0.07666690647602081, | |
| "kl": 0.286643173545599, | |
| "learning_rate": 8.323638717409061e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 768.796875, | |
| "epoch": 0.5484289794858478, | |
| "grad_norm": 0.06480922549962997, | |
| "kl": 0.30170151591300964, | |
| "learning_rate": 8.311358349671517e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 874.328125, | |
| "epoch": 0.5505063619839002, | |
| "grad_norm": 0.06416033208370209, | |
| "kl": 0.28673115372657776, | |
| "learning_rate": 8.299042297265876e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8843750506639481, | |
| "reward_std": 0.22539028525352478, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 790.890625, | |
| "epoch": 0.5525837444819527, | |
| "grad_norm": 0.06319725513458252, | |
| "kl": 0.3224434554576874, | |
| "learning_rate": 8.286690692915386e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 837.4375, | |
| "epoch": 0.5546611269800052, | |
| "grad_norm": 0.07317644357681274, | |
| "kl": 0.3563056066632271, | |
| "learning_rate": 8.274303669726426e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7906250432133675, | |
| "reward_std": 0.22539028525352478, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 718.125, | |
| "epoch": 0.5567385094780577, | |
| "grad_norm": 0.06230226531624794, | |
| "kl": 0.30831460282206535, | |
| "learning_rate": 8.261881361187054e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9500000476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 879.25, | |
| "epoch": 0.5588158919761101, | |
| "grad_norm": 0.07465776056051254, | |
| "kl": 0.47362302988767624, | |
| "learning_rate": 8.249423901165584e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8535156697034836, | |
| "reward_std": 0.22483785264194012, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 669.40625, | |
| "epoch": 0.5608932744741626, | |
| "grad_norm": 0.07374807447195053, | |
| "kl": 0.329727228730917, | |
| "learning_rate": 8.236931423909138e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7773437947034836, | |
| "reward_std": 0.19997863098978996, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 732.3125, | |
| "epoch": 0.562970656972215, | |
| "grad_norm": 0.0676698312163353, | |
| "kl": 0.3591331150382757, | |
| "learning_rate": 8.2244040640422e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 840.921875, | |
| "epoch": 0.5650480394702675, | |
| "grad_norm": 0.0682259052991867, | |
| "kl": 0.35003719478845596, | |
| "learning_rate": 8.21184195656516e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 722.375, | |
| "epoch": 0.5671254219683199, | |
| "grad_norm": 0.05751950666308403, | |
| "kl": 0.37356993556022644, | |
| "learning_rate": 8.199245236852871e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6843750476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 679.171875, | |
| "epoch": 0.5692028044663724, | |
| "grad_norm": 0.09351193159818649, | |
| "kl": 0.3799058124423027, | |
| "learning_rate": 8.186614040653176e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.33145629428327084, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 767.84375, | |
| "epoch": 0.5712801869644248, | |
| "grad_norm": 0.06785906106233597, | |
| "kl": 0.3164171427488327, | |
| "learning_rate": 8.173948504085454e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8242187947034836, | |
| "reward_std": 0.17788154631853104, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 700.25, | |
| "epoch": 0.5733575694624773, | |
| "grad_norm": 0.06914710998535156, | |
| "kl": 0.3422697074711323, | |
| "learning_rate": 8.161248763639153e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 662.9375, | |
| "epoch": 0.5754349519605297, | |
| "grad_norm": 0.05800582095980644, | |
| "kl": 0.37523847445845604, | |
| "learning_rate": 8.148514956172315e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.1325825173407793, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 834.5625, | |
| "epoch": 0.5775123344585822, | |
| "grad_norm": 0.06169675290584564, | |
| "kl": 0.31875982135534286, | |
| "learning_rate": 8.135747218910104e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8367187976837158, | |
| "reward_std": 0.20439805276691914, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.1960937511175871, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 766.546875, | |
| "epoch": 0.5795897169566346, | |
| "grad_norm": 0.08040869235992432, | |
| "kl": 1.0613461509346962, | |
| "learning_rate": 8.122945689443328e-05, | |
| "loss": 0.0005, | |
| "reward": 0.8703125417232513, | |
| "reward_std": 0.1568893175572157, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19843750074505806, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 737.5625, | |
| "epoch": 0.5816670994546871, | |
| "grad_norm": 0.0702415257692337, | |
| "kl": 0.34963829442858696, | |
| "learning_rate": 8.11011050572695e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8222656659781933, | |
| "reward_std": 0.22483785450458527, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 753.703125, | |
| "epoch": 0.5837444819527395, | |
| "grad_norm": 0.07661338895559311, | |
| "kl": 0.38233664259314537, | |
| "learning_rate": 8.097241806078615e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7929687909781933, | |
| "reward_std": 0.26626989245414734, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 880.609375, | |
| "epoch": 0.585821864450792, | |
| "grad_norm": 0.07432933151721954, | |
| "kl": 0.42199838161468506, | |
| "learning_rate": 8.084339729177142e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8500000461935997, | |
| "reward_std": 0.27400387451052666, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19375000335276127, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 778.453125, | |
| "epoch": 0.5878992469488444, | |
| "grad_norm": 0.07835783809423447, | |
| "kl": 0.36370869539678097, | |
| "learning_rate": 8.071404414061041e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8207031637430191, | |
| "reward_std": 0.2712417396251112, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19570313021540642, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 806.515625, | |
| "epoch": 0.5899766294468969, | |
| "grad_norm": 0.048540204763412476, | |
| "kl": 0.3912508450448513, | |
| "learning_rate": 8.058436000127014e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8679687865078449, | |
| "reward_std": 0.1602038759738207, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.1960937511175871, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 859.765625, | |
| "epoch": 0.5920540119449493, | |
| "grad_norm": 0.06207654997706413, | |
| "kl": 0.31765272468328476, | |
| "learning_rate": 8.045434627128446e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9312500506639481, | |
| "reward_std": 0.2032931987196207, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 710.453125, | |
| "epoch": 0.5941313944430018, | |
| "grad_norm": 0.08810100704431534, | |
| "kl": 0.40968091040849686, | |
| "learning_rate": 8.032400435173907e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8542969226837158, | |
| "reward_std": 0.31212134286761284, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19804687798023224, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 700.296875, | |
| "epoch": 0.5962087769410542, | |
| "grad_norm": 0.07407598942518234, | |
| "kl": 0.3017115257680416, | |
| "learning_rate": 8.019333564725639e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9476562887430191, | |
| "reward_std": 0.18009125301614404, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.19765625521540642, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 628.984375, | |
| "epoch": 0.5982861594391067, | |
| "grad_norm": 0.05131203308701515, | |
| "kl": 0.3888060562312603, | |
| "learning_rate": 8.006234156598042e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 648.28125, | |
| "epoch": 0.6003635419371591, | |
| "grad_norm": 0.07319964468479156, | |
| "kl": 0.3936074487864971, | |
| "learning_rate": 7.99310235195615e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 788.953125, | |
| "epoch": 0.6024409244352117, | |
| "grad_norm": 0.07722538709640503, | |
| "kl": 0.35653146356344223, | |
| "learning_rate": 7.979938292314129e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8386719189584255, | |
| "reward_std": 0.24583008512854576, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.19804687798023224, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 679.46875, | |
| "epoch": 0.6045183069332641, | |
| "grad_norm": 0.03349410742521286, | |
| "kl": 0.35145866870880127, | |
| "learning_rate": 7.966742119533723e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.04419417306780815, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 761.25, | |
| "epoch": 0.6065956894313166, | |
| "grad_norm": 0.06922980397939682, | |
| "kl": 0.33772632107138634, | |
| "learning_rate": 7.953513975822755e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8242187947034836, | |
| "reward_std": 0.2220757193863392, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 618.25, | |
| "epoch": 0.608673071929369, | |
| "grad_norm": 0.07786116003990173, | |
| "kl": 0.5136113204061985, | |
| "learning_rate": 7.940254003733578e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 704.921875, | |
| "epoch": 0.6107504544274215, | |
| "grad_norm": 0.0848776176571846, | |
| "kl": 0.4174853079020977, | |
| "learning_rate": 7.926962346161535e-05, | |
| "loss": 0.0002, | |
| "reward": 0.699218787252903, | |
| "reward_std": 0.22207571775652468, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 657.734375, | |
| "epoch": 0.6128278369254739, | |
| "grad_norm": 0.0675949826836586, | |
| "kl": 0.4570797383785248, | |
| "learning_rate": 7.913639146343435e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 689.328125, | |
| "epoch": 0.6149052194235264, | |
| "grad_norm": 0.07435144484043121, | |
| "kl": 0.3593181371688843, | |
| "learning_rate": 7.900284547855991e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8691406697034836, | |
| "reward_std": 0.2469349391758442, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 783.453125, | |
| "epoch": 0.6169826019215788, | |
| "grad_norm": 0.07517191022634506, | |
| "kl": 0.7363171242177486, | |
| "learning_rate": 7.886898694614291e-05, | |
| "loss": 0.0004, | |
| "reward": 0.8375000469386578, | |
| "reward_std": 0.20329319685697556, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 665.171875, | |
| "epoch": 0.6190599844196313, | |
| "grad_norm": 0.07602944225072861, | |
| "kl": 0.4283002242445946, | |
| "learning_rate": 7.873481730870232e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 741.75, | |
| "epoch": 0.6211373669176837, | |
| "grad_norm": 0.07438351958990097, | |
| "kl": 0.2955322675406933, | |
| "learning_rate": 7.860033801210976e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8250000439584255, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 719.9375, | |
| "epoch": 0.6232147494157362, | |
| "grad_norm": 0.08875050395727158, | |
| "kl": 0.34281647577881813, | |
| "learning_rate": 7.84655505055738e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7125000432133675, | |
| "reward_std": 0.38006988912820816, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.19687500409781933, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 755.453125, | |
| "epoch": 0.6252921319137886, | |
| "grad_norm": 0.07758081704378128, | |
| "kl": 0.29608317092061043, | |
| "learning_rate": 7.833045624162452e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 721.40625, | |
| "epoch": 0.6273695144118411, | |
| "grad_norm": 0.07114533334970474, | |
| "kl": 0.5064779743552208, | |
| "learning_rate": 7.819505667609767e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 804.59375, | |
| "epoch": 0.6294468969098935, | |
| "grad_norm": 0.06897041946649551, | |
| "kl": 0.3391858469694853, | |
| "learning_rate": 7.805935326811912e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9500000476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 723.96875, | |
| "epoch": 0.631524279407946, | |
| "grad_norm": 0.07760775089263916, | |
| "kl": 0.3714125622063875, | |
| "learning_rate": 7.792334748008905e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 694.328125, | |
| "epoch": 0.6336016619059984, | |
| "grad_norm": 0.08604968339204788, | |
| "kl": 0.3233291208744049, | |
| "learning_rate": 7.77870407776662e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7000000476837158, | |
| "reward_std": 0.3093592058867216, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 875.234375, | |
| "epoch": 0.6356790444040509, | |
| "grad_norm": 0.07271739840507507, | |
| "kl": 0.2942599691450596, | |
| "learning_rate": 7.765043462975217e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7464844100177288, | |
| "reward_std": 0.1817485373467207, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.18398437649011612, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 674.65625, | |
| "epoch": 0.6377564269021033, | |
| "grad_norm": 0.06499814242124557, | |
| "kl": 0.6551753357052803, | |
| "learning_rate": 7.751353050847545e-05, | |
| "loss": 0.0003, | |
| "reward": 0.6683594062924385, | |
| "reward_std": 0.13313494622707367, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.19960937649011612, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 794.671875, | |
| "epoch": 0.6398338094001558, | |
| "grad_norm": 0.07812398672103882, | |
| "kl": 0.29106237180531025, | |
| "learning_rate": 7.737632988917564e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8218750506639481, | |
| "reward_std": 0.3137786276638508, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 726.0, | |
| "epoch": 0.6419111918982082, | |
| "grad_norm": 0.08285919576883316, | |
| "kl": 0.3495354764163494, | |
| "learning_rate": 7.723883425038758e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 862.09375, | |
| "epoch": 0.6439885743962607, | |
| "grad_norm": 0.06892167776823044, | |
| "kl": 0.318182036280632, | |
| "learning_rate": 7.710104507382531e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7753906697034836, | |
| "reward_std": 0.24693494103848934, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19726562686264515, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 705.40625, | |
| "epoch": 0.6460659568943131, | |
| "grad_norm": 0.053015708923339844, | |
| "kl": 0.30275189504027367, | |
| "learning_rate": 7.696296384436619e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250402331352, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 689.0, | |
| "epoch": 0.6481433393923656, | |
| "grad_norm": 0.08785798400640488, | |
| "kl": 0.3134246002882719, | |
| "learning_rate": 7.682459205003483e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.2872621212154627, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 875.421875, | |
| "epoch": 0.650220721890418, | |
| "grad_norm": 0.07502438127994537, | |
| "kl": 0.33200008049607277, | |
| "learning_rate": 7.668593118198719e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8218750506639481, | |
| "reward_std": 0.26958445087075233, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19687500409781933, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 708.828125, | |
| "epoch": 0.6522981043884706, | |
| "grad_norm": 0.08758591115474701, | |
| "kl": 0.3114005923271179, | |
| "learning_rate": 7.654698273449435e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9179687947034836, | |
| "reward_std": 0.31046406738460064, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 843.875, | |
| "epoch": 0.654375486886523, | |
| "grad_norm": 0.06280484795570374, | |
| "kl": 0.2624143324792385, | |
| "learning_rate": 7.640774820492647e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 676.296875, | |
| "epoch": 0.6564528693845755, | |
| "grad_norm": 0.06573140621185303, | |
| "kl": 0.29568540304899216, | |
| "learning_rate": 7.626822909373667e-05, | |
| "loss": 0.0001, | |
| "reward": 0.7781250402331352, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 708.625, | |
| "epoch": 0.6585302518826279, | |
| "grad_norm": 0.07694177329540253, | |
| "kl": 0.2915249727666378, | |
| "learning_rate": 7.612842690444486e-05, | |
| "loss": 0.0001, | |
| "reward": 0.9648437947034836, | |
| "reward_std": 0.2441728077828884, | |
| "rewards/argmax_reward_func": 0.765625, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 656.375, | |
| "epoch": 0.6606076343806804, | |
| "grad_norm": 0.09348881989717484, | |
| "kl": 0.3169392794370651, | |
| "learning_rate": 7.598834314362151e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.3314562924206257, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 640.75, | |
| "epoch": 0.6626850168787328, | |
| "grad_norm": 0.07280497252941132, | |
| "kl": 0.2968177553266287, | |
| "learning_rate": 7.584797932087145e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8710937947034836, | |
| "reward_std": 0.19997863844037056, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 694.359375, | |
| "epoch": 0.6647623993767853, | |
| "grad_norm": 0.09892084449529648, | |
| "kl": 0.5367627218365669, | |
| "learning_rate": 7.570733694881755e-05, | |
| "loss": 0.0003, | |
| "reward": 0.9031250439584255, | |
| "reward_std": 0.28726212307810783, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 653.953125, | |
| "epoch": 0.6668397818748377, | |
| "grad_norm": 0.07763518393039703, | |
| "kl": 0.31165359169244766, | |
| "learning_rate": 7.556641754308447e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250439584255, | |
| "reward_std": 0.24306794814765453, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 738.703125, | |
| "epoch": 0.6689171643728902, | |
| "grad_norm": 0.0881708562374115, | |
| "kl": 0.3136756382882595, | |
| "learning_rate": 7.542522262228231e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8085937947034836, | |
| "reward_std": 0.33256115205585957, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 757.421875, | |
| "epoch": 0.6709945468709426, | |
| "grad_norm": 0.0727957934141159, | |
| "kl": 0.28189600445330143, | |
| "learning_rate": 7.528375370799024e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 660.125, | |
| "epoch": 0.6730719293689951, | |
| "grad_norm": 0.068515844643116, | |
| "kl": 0.29710386879742146, | |
| "learning_rate": 7.514201232474011e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8562500439584255, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 704.8125, | |
| "epoch": 0.6751493118670475, | |
| "grad_norm": 0.07097381353378296, | |
| "kl": 0.31055452302098274, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250439584255, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 680.921875, | |
| "epoch": 0.6772266943651, | |
| "grad_norm": 0.06986773759126663, | |
| "kl": 0.3125472627580166, | |
| "learning_rate": 7.48577182641578e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 744.046875, | |
| "epoch": 0.6793040768631524, | |
| "grad_norm": 0.06576069444417953, | |
| "kl": 0.3361051678657532, | |
| "learning_rate": 7.471516865050467e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 654.0, | |
| "epoch": 0.6813814593612049, | |
| "grad_norm": 0.07205154001712799, | |
| "kl": 0.30227479338645935, | |
| "learning_rate": 7.457235269521856e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7617187909781933, | |
| "reward_std": 0.17788154468871653, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 647.859375, | |
| "epoch": 0.6834588418592573, | |
| "grad_norm": 0.0794130265712738, | |
| "kl": 0.4595659039914608, | |
| "learning_rate": 7.44292719373476e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.24306794628500938, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 735.078125, | |
| "epoch": 0.6855362243573098, | |
| "grad_norm": 0.0897228941321373, | |
| "kl": 0.3426021710038185, | |
| "learning_rate": 7.428592791879361e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.33145629800856113, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 853.703125, | |
| "epoch": 0.6876136068553622, | |
| "grad_norm": 0.07609646022319794, | |
| "kl": 0.26740806736052036, | |
| "learning_rate": 7.414232218429537e-05, | |
| "loss": 0.0001, | |
| "reward": 0.9156250506639481, | |
| "reward_std": 0.26958445832133293, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 616.890625, | |
| "epoch": 0.6896909893534147, | |
| "grad_norm": 0.09115231037139893, | |
| "kl": 0.334526427090168, | |
| "learning_rate": 7.399845628141206e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750439584255, | |
| "reward_std": 0.2872621212154627, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 622.765625, | |
| "epoch": 0.6917683718514671, | |
| "grad_norm": 0.08646494895219803, | |
| "kl": 0.3055717647075653, | |
| "learning_rate": 7.385433176050653e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8710937909781933, | |
| "reward_std": 0.2883669827133417, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 686.921875, | |
| "epoch": 0.6938457543495196, | |
| "grad_norm": 0.0787225142121315, | |
| "kl": 0.3159499131143093, | |
| "learning_rate": 7.370995017472863e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8531250506639481, | |
| "reward_std": 0.26958445459604263, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 634.703125, | |
| "epoch": 0.695923136847572, | |
| "grad_norm": 0.09521856158971786, | |
| "kl": 0.3115619271993637, | |
| "learning_rate": 7.356531307999843e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750476837158, | |
| "reward_std": 0.375650467351079, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 753.703125, | |
| "epoch": 0.6980005193456245, | |
| "grad_norm": 0.09729248285293579, | |
| "kl": 0.3037104904651642, | |
| "learning_rate": 7.342042203498951e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.3977475520223379, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 674.0, | |
| "epoch": 0.700077901843677, | |
| "grad_norm": 0.09549879282712936, | |
| "kl": 0.3098057843744755, | |
| "learning_rate": 7.32752786011121e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.37565046921372414, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 673.1875, | |
| "epoch": 0.7021552843417295, | |
| "grad_norm": 0.08708694577217102, | |
| "kl": 0.29914069548249245, | |
| "learning_rate": 7.312988434249632e-05, | |
| "loss": 0.0001, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.33145629800856113, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 699.65625, | |
| "epoch": 0.7042326668397819, | |
| "grad_norm": 0.09121581166982651, | |
| "kl": 0.31991639360785484, | |
| "learning_rate": 7.298424082597526e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.3093592096120119, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 667.546875, | |
| "epoch": 0.7063100493378344, | |
| "grad_norm": 0.08295177668333054, | |
| "kl": 0.3119734339416027, | |
| "learning_rate": 7.283834962106811e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6656250394880772, | |
| "reward_std": 0.31377863325178623, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.19687500409781933, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 734.140625, | |
| "epoch": 0.7083874318358868, | |
| "grad_norm": 0.07721901684999466, | |
| "kl": 0.2920740433037281, | |
| "learning_rate": 7.269221229996331e-05, | |
| "loss": 0.0001, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.30935920774936676, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 723.78125, | |
| "epoch": 0.7104648143339393, | |
| "grad_norm": 0.07446262985467911, | |
| "kl": 0.31096627190709114, | |
| "learning_rate": 7.254583043750151e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 598.140625, | |
| "epoch": 0.7125421968319917, | |
| "grad_norm": 0.07494507730007172, | |
| "kl": 0.3281702548265457, | |
| "learning_rate": 7.239920561115867e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7000000476837158, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 669.484375, | |
| "epoch": 0.7146195793300442, | |
| "grad_norm": 0.06954500079154968, | |
| "kl": 0.29709911718964577, | |
| "learning_rate": 7.225233940102906e-05, | |
| "loss": 0.0001, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 738.015625, | |
| "epoch": 0.7166969618280966, | |
| "grad_norm": 0.08409620076417923, | |
| "kl": 0.3333327900618315, | |
| "learning_rate": 7.210523338980813e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8398437947034836, | |
| "reward_std": 0.2883669827133417, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 596.5, | |
| "epoch": 0.7187743443261491, | |
| "grad_norm": 0.07967247068881989, | |
| "kl": 0.3089658170938492, | |
| "learning_rate": 7.195788916277565e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7929687947034836, | |
| "reward_std": 0.22207571775652468, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 717.390625, | |
| "epoch": 0.7208517268242015, | |
| "grad_norm": 0.0864432230591774, | |
| "kl": 0.3028757870197296, | |
| "learning_rate": 7.181030830777837e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8843750506639481, | |
| "reward_std": 0.2695844564586878, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 716.515625, | |
| "epoch": 0.722929109322254, | |
| "grad_norm": 0.07595375925302505, | |
| "kl": 0.30636318400502205, | |
| "learning_rate": 7.166249241521318e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7898437976837158, | |
| "reward_std": 0.22649514116346836, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19609375298023224, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 584.3125, | |
| "epoch": 0.7250064918203064, | |
| "grad_norm": 0.10229937732219696, | |
| "kl": 0.32858528569340706, | |
| "learning_rate": 7.151444307800975e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.3756504710763693, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 590.1875, | |
| "epoch": 0.7270838743183589, | |
| "grad_norm": 0.07948501408100128, | |
| "kl": 0.3115417957305908, | |
| "learning_rate": 7.13661618916135e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.2651650346815586, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 618.46875, | |
| "epoch": 0.7291612568164113, | |
| "grad_norm": 0.06686828285455704, | |
| "kl": 0.32769910246133804, | |
| "learning_rate": 7.121765045396834e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8867187947034836, | |
| "reward_std": 0.17788154655136168, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 600.5625, | |
| "epoch": 0.7312386393144638, | |
| "grad_norm": 0.07947742938995361, | |
| "kl": 0.3473210446536541, | |
| "learning_rate": 7.106891036549945e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 551.78125, | |
| "epoch": 0.7333160218125162, | |
| "grad_norm": 0.04208023473620415, | |
| "kl": 0.35688477009534836, | |
| "learning_rate": 7.091994322909611e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9968750476837158, | |
| "reward_std": 0.06629125960171223, | |
| "rewards/argmax_reward_func": 0.796875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 576.609375, | |
| "epoch": 0.7353934043105687, | |
| "grad_norm": 0.06657633185386658, | |
| "kl": 0.32345687225461006, | |
| "learning_rate": 7.077075065009433e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7156250476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 552.921875, | |
| "epoch": 0.7374707868086211, | |
| "grad_norm": 0.0544576533138752, | |
| "kl": 0.3251136727631092, | |
| "learning_rate": 7.062133423625959e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250439584255, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 606.703125, | |
| "epoch": 0.7395481693066736, | |
| "grad_norm": 0.07147221267223358, | |
| "kl": 0.3404123783111572, | |
| "learning_rate": 7.04716955977695e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 605.28125, | |
| "epoch": 0.741625551804726, | |
| "grad_norm": 0.05363324284553528, | |
| "kl": 0.3204925172030926, | |
| "learning_rate": 7.03218363471965e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9500000439584255, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 563.65625, | |
| "epoch": 0.7437029343027785, | |
| "grad_norm": 0.041013430804014206, | |
| "kl": 0.3419278897345066, | |
| "learning_rate": 7.017175809949044e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500439584255, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 557.15625, | |
| "epoch": 0.7457803168008309, | |
| "grad_norm": 0.06874032318592072, | |
| "kl": 0.3553139455616474, | |
| "learning_rate": 7.002146247196113e-05, | |
| "loss": 0.0002, | |
| "reward": 0.776562537997961, | |
| "reward_std": 0.1568893138319254, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 567.15625, | |
| "epoch": 0.7478576992988835, | |
| "grad_norm": 0.0703793615102768, | |
| "kl": 0.33949872851371765, | |
| "learning_rate": 6.987095108426101e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7312500402331352, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 552.984375, | |
| "epoch": 0.7499350817969359, | |
| "grad_norm": 0.06514879316091537, | |
| "kl": 0.3468449302017689, | |
| "learning_rate": 6.972022555836764e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 540.453125, | |
| "epoch": 0.7520124642949884, | |
| "grad_norm": 0.08254203200340271, | |
| "kl": 0.36483363062143326, | |
| "learning_rate": 6.956928751856623e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 553.078125, | |
| "epoch": 0.7540898467930408, | |
| "grad_norm": 0.08209247887134552, | |
| "kl": 0.37783167138695717, | |
| "learning_rate": 6.94181385914321e-05, | |
| "loss": 0.0002, | |
| "reward": 0.667187537997961, | |
| "reward_std": 0.17898640409111977, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 610.234375, | |
| "epoch": 0.7561672292910933, | |
| "grad_norm": 0.062447499483823776, | |
| "kl": 0.34449223801493645, | |
| "learning_rate": 6.926678040581323e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 527.78125, | |
| "epoch": 0.7582446117891457, | |
| "grad_norm": 0.07009898126125336, | |
| "kl": 0.36786164715886116, | |
| "learning_rate": 6.911521459281265e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 587.296875, | |
| "epoch": 0.7603219942871982, | |
| "grad_norm": 0.06519950181245804, | |
| "kl": 0.35170425847172737, | |
| "learning_rate": 6.896344278577083e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 583.96875, | |
| "epoch": 0.7623993767852506, | |
| "grad_norm": 0.051437534391880035, | |
| "kl": 0.34650370851159096, | |
| "learning_rate": 6.881146662024822e-05, | |
| "loss": 0.0002, | |
| "reward": 1.0593750476837158, | |
| "reward_std": 0.11048543080687523, | |
| "rewards/argmax_reward_func": 0.859375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 538.484375, | |
| "epoch": 0.764476759283303, | |
| "grad_norm": 0.06923159956932068, | |
| "kl": 0.38169170916080475, | |
| "learning_rate": 6.865928773400743e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000439584255, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 580.03125, | |
| "epoch": 0.7665541417813555, | |
| "grad_norm": 0.0665920302271843, | |
| "kl": 0.3636031821370125, | |
| "learning_rate": 6.850690776699573e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7289062924683094, | |
| "reward_std": 0.13589708344079554, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.19765625149011612, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 562.015625, | |
| "epoch": 0.768631524279408, | |
| "grad_norm": 0.06946459412574768, | |
| "kl": 0.4947234131395817, | |
| "learning_rate": 6.835432836132731e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 575.0625, | |
| "epoch": 0.7707089067774604, | |
| "grad_norm": 0.0689174011349678, | |
| "kl": 0.3747940734028816, | |
| "learning_rate": 6.820155116126561e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 571.109375, | |
| "epoch": 0.7727862892755128, | |
| "grad_norm": 0.08710569888353348, | |
| "kl": 0.39623570069670677, | |
| "learning_rate": 6.804857781320558e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7464844174683094, | |
| "reward_std": 0.28670969791710377, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.19960937835276127, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 607.59375, | |
| "epoch": 0.7748636717735653, | |
| "grad_norm": 0.0731528028845787, | |
| "kl": 0.3582250289618969, | |
| "learning_rate": 6.789540996565593e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 579.28125, | |
| "epoch": 0.7769410542716177, | |
| "grad_norm": 0.0625411793589592, | |
| "kl": 0.3635551296174526, | |
| "learning_rate": 6.774204926922145e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000439584255, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 599.3125, | |
| "epoch": 0.7790184367696702, | |
| "grad_norm": 0.08092815428972244, | |
| "kl": 0.4265919253230095, | |
| "learning_rate": 6.758849737658509e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 547.796875, | |
| "epoch": 0.7810958192677226, | |
| "grad_norm": 0.07175435870885849, | |
| "kl": 0.3616880625486374, | |
| "learning_rate": 6.743475594249021e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8843750432133675, | |
| "reward_std": 0.18119611218571663, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 591.609375, | |
| "epoch": 0.7831732017657751, | |
| "grad_norm": 0.07784335315227509, | |
| "kl": 0.42067378014326096, | |
| "learning_rate": 6.728082662372282e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 531.859375, | |
| "epoch": 0.7852505842638275, | |
| "grad_norm": 0.07652134448289871, | |
| "kl": 0.3934118077158928, | |
| "learning_rate": 6.712671107909359e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 542.640625, | |
| "epoch": 0.78732796676188, | |
| "grad_norm": 0.054136764258146286, | |
| "kl": 0.4239979311823845, | |
| "learning_rate": 6.697241096942006e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500439584255, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 548.53125, | |
| "epoch": 0.7894053492599324, | |
| "grad_norm": 0.07836976647377014, | |
| "kl": 0.4376937076449394, | |
| "learning_rate": 6.681792795750875e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7308594211935997, | |
| "reward_std": 0.17732911929488182, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.19960937649011612, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 545.140625, | |
| "epoch": 0.7914827317579849, | |
| "grad_norm": 0.06161171570420265, | |
| "kl": 0.526831716299057, | |
| "learning_rate": 6.666326370813723e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 539.4375, | |
| "epoch": 0.7935601142560373, | |
| "grad_norm": 0.050724372267723083, | |
| "kl": 0.4309442602097988, | |
| "learning_rate": 6.650841988803606e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 515.109375, | |
| "epoch": 0.7956374967540899, | |
| "grad_norm": 0.08242635428905487, | |
| "kl": 0.4333142638206482, | |
| "learning_rate": 6.635339816587109e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 582.0, | |
| "epoch": 0.7977148792521424, | |
| "grad_norm": 0.07576624304056168, | |
| "kl": 0.40080199763178825, | |
| "learning_rate": 6.619820021222518e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250439584255, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 563.59375, | |
| "epoch": 0.7997922617501948, | |
| "grad_norm": 0.08377435803413391, | |
| "kl": 0.43326959386467934, | |
| "learning_rate": 6.604282769958044e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8089844211935997, | |
| "reward_std": 0.2436203770339489, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.19960937649011612, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 548.859375, | |
| "epoch": 0.8018696442482472, | |
| "grad_norm": 0.09505198895931244, | |
| "kl": 0.625109825283289, | |
| "learning_rate": 6.588728230230004e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7933594211935997, | |
| "reward_std": 0.3088067825883627, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19960937649011612, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 534.046875, | |
| "epoch": 0.8039470267462997, | |
| "grad_norm": 0.09738834947347641, | |
| "kl": 0.5551509782671928, | |
| "learning_rate": 6.573156569661025e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8703125491738319, | |
| "reward_std": 0.289471834897995, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 557.40625, | |
| "epoch": 0.8060244092443521, | |
| "grad_norm": 0.0654783695936203, | |
| "kl": 0.42286501079797745, | |
| "learning_rate": 6.557567956058239e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 509.921875, | |
| "epoch": 0.8081017917424046, | |
| "grad_norm": 0.07516364008188248, | |
| "kl": 0.5226034559309483, | |
| "learning_rate": 6.541962557411469e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 540.953125, | |
| "epoch": 0.810179174240457, | |
| "grad_norm": 0.08237718045711517, | |
| "kl": 0.49187011271715164, | |
| "learning_rate": 6.526340541891418e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500476837158, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 538.03125, | |
| "epoch": 0.8122565567385095, | |
| "grad_norm": 0.08174508810043335, | |
| "kl": 0.45481956005096436, | |
| "learning_rate": 6.510702077847863e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 552.703125, | |
| "epoch": 0.8143339392365619, | |
| "grad_norm": 0.09640171378850937, | |
| "kl": 0.45114999637007713, | |
| "learning_rate": 6.495047333807842e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7621094211935997, | |
| "reward_std": 0.309911634773016, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.19960937835276127, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 555.75, | |
| "epoch": 0.8164113217346144, | |
| "grad_norm": 0.07019418478012085, | |
| "kl": 0.44265756756067276, | |
| "learning_rate": 6.479376478473823e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9500000476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 588.09375, | |
| "epoch": 0.8184887042326668, | |
| "grad_norm": 0.05258520692586899, | |
| "kl": 0.4588502533733845, | |
| "learning_rate": 6.463689680721904e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750439584255, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 527.578125, | |
| "epoch": 0.8205660867307193, | |
| "grad_norm": 0.08728921413421631, | |
| "kl": 0.44911035895347595, | |
| "learning_rate": 6.447987109599986e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500476837158, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 635.40625, | |
| "epoch": 0.8226434692287717, | |
| "grad_norm": 0.06634779274463654, | |
| "kl": 0.38350560516119003, | |
| "learning_rate": 6.432268934325946e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 532.09375, | |
| "epoch": 0.8247208517268242, | |
| "grad_norm": 0.09231170266866684, | |
| "kl": 0.46880777925252914, | |
| "learning_rate": 6.416535324285824e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6843750402331352, | |
| "reward_std": 0.2872621212154627, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 588.875, | |
| "epoch": 0.8267982342248766, | |
| "grad_norm": 0.07496833801269531, | |
| "kl": 0.850627463310957, | |
| "learning_rate": 6.400786449031986e-05, | |
| "loss": 0.0004, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 511.953125, | |
| "epoch": 0.8288756167229291, | |
| "grad_norm": 0.06271515041589737, | |
| "kl": 0.4049219489097595, | |
| "learning_rate": 6.385022478281306e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 555.046875, | |
| "epoch": 0.8309529992209815, | |
| "grad_norm": 0.07291208207607269, | |
| "kl": 0.4224717430770397, | |
| "learning_rate": 6.369243581913336e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 573.515625, | |
| "epoch": 0.833030381719034, | |
| "grad_norm": 0.06133547052741051, | |
| "kl": 0.42930199950933456, | |
| "learning_rate": 6.353449929968465e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 522.8125, | |
| "epoch": 0.8351077642170864, | |
| "grad_norm": 0.06863158941268921, | |
| "kl": 0.4221891984343529, | |
| "learning_rate": 6.337641692646106e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9019531756639481, | |
| "reward_std": 0.15633688867092133, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.19882812723517418, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 544.0, | |
| "epoch": 0.8371851467151389, | |
| "grad_norm": 0.07222079485654831, | |
| "kl": 0.45428359508514404, | |
| "learning_rate": 6.321819040302839e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 527.671875, | |
| "epoch": 0.8392625292131913, | |
| "grad_norm": 0.08342251926660538, | |
| "kl": 0.4082505330443382, | |
| "learning_rate": 6.305982143450597e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8402344286441803, | |
| "reward_std": 0.24362037930404767, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.19960937649011612, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 544.171875, | |
| "epoch": 0.8413399117112438, | |
| "grad_norm": 0.064121775329113, | |
| "kl": 0.43093303963541985, | |
| "learning_rate": 6.290131172754811e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9949219226837158, | |
| "reward_std": 0.15744174271821976, | |
| "rewards/argmax_reward_func": 0.796875, | |
| "rewards/format_reward_func": 0.19804687798023224, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 517.578125, | |
| "epoch": 0.8434172942092963, | |
| "grad_norm": 0.08640465885400772, | |
| "kl": 0.44819287210702896, | |
| "learning_rate": 6.274266299032582e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 536.734375, | |
| "epoch": 0.8454946767073488, | |
| "grad_norm": 0.09182075411081314, | |
| "kl": 0.39375099167227745, | |
| "learning_rate": 6.25838769325083e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000476837158, | |
| "reward_std": 0.3093592096120119, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 537.265625, | |
| "epoch": 0.8475720592054012, | |
| "grad_norm": 0.05770527943968773, | |
| "kl": 0.48194558918476105, | |
| "learning_rate": 6.24249552652447e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7292969226837158, | |
| "reward_std": 0.13534465618431568, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.19804687798023224, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 536.84375, | |
| "epoch": 0.8496494417034537, | |
| "grad_norm": 0.07029449939727783, | |
| "kl": 0.4273468554019928, | |
| "learning_rate": 6.226589970114543e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250439584255, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 539.140625, | |
| "epoch": 0.8517268242015061, | |
| "grad_norm": 0.0789664089679718, | |
| "kl": 0.4228878915309906, | |
| "learning_rate": 6.210671195426387e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 582.265625, | |
| "epoch": 0.8538042066995586, | |
| "grad_norm": 0.05472075939178467, | |
| "kl": 0.39347052946686745, | |
| "learning_rate": 6.194739374007792e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500439584255, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 511.125, | |
| "epoch": 0.855881589197611, | |
| "grad_norm": 0.08022020757198334, | |
| "kl": 0.44926824048161507, | |
| "learning_rate": 6.178794677547137e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7312500439584255, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 502.40625, | |
| "epoch": 0.8579589716956635, | |
| "grad_norm": 0.08022835850715637, | |
| "kl": 0.4819503165781498, | |
| "learning_rate": 6.162837277871553e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.24306794814765453, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 537.1875, | |
| "epoch": 0.8600363541937159, | |
| "grad_norm": 0.06297382712364197, | |
| "kl": 0.49380555003881454, | |
| "learning_rate": 6.146867346945066e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 521.859375, | |
| "epoch": 0.8621137366917684, | |
| "grad_norm": 0.07238580286502838, | |
| "kl": 0.4831845983862877, | |
| "learning_rate": 6.130885056866742e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 538.84375, | |
| "epoch": 0.8641911191898208, | |
| "grad_norm": 0.070571668446064, | |
| "kl": 0.5007887817919254, | |
| "learning_rate": 6.114890579868837e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8250000439584255, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 573.25, | |
| "epoch": 0.8662685016878733, | |
| "grad_norm": 0.0768335610628128, | |
| "kl": 0.4633421525359154, | |
| "learning_rate": 6.098884088314938e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000439584255, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 627.6875, | |
| "epoch": 0.8683458841859257, | |
| "grad_norm": 0.07244177162647247, | |
| "kl": 0.45967796072363853, | |
| "learning_rate": 6.082865754698109e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 558.171875, | |
| "epoch": 0.8704232666839782, | |
| "grad_norm": 0.07271222770214081, | |
| "kl": 0.46447786316275597, | |
| "learning_rate": 6.066835751639022e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7925781682133675, | |
| "reward_std": 0.22152329608798027, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.19882812909781933, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 539.90625, | |
| "epoch": 0.8725006491820306, | |
| "grad_norm": 0.05372535437345505, | |
| "kl": 0.47306570410728455, | |
| "learning_rate": 6.050794251884112e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 579.59375, | |
| "epoch": 0.8745780316800831, | |
| "grad_norm": 0.06956978142261505, | |
| "kl": 0.4903941936790943, | |
| "learning_rate": 6.0347414283037004e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7937500476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 543.5625, | |
| "epoch": 0.8766554141781355, | |
| "grad_norm": 0.05839576572179794, | |
| "kl": 0.46378039941191673, | |
| "learning_rate": 6.018677453890149e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 571.265625, | |
| "epoch": 0.878732796676188, | |
| "grad_norm": 0.08274129778146744, | |
| "kl": 0.4939221628010273, | |
| "learning_rate": 6.002602501755974e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9656250476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.765625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 562.875, | |
| "epoch": 0.8808101791742404, | |
| "grad_norm": 0.052250444889068604, | |
| "kl": 0.4885864891111851, | |
| "learning_rate": 5.9865167451320005e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250439584255, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 564.171875, | |
| "epoch": 0.8828875616722929, | |
| "grad_norm": 0.07596340775489807, | |
| "kl": 0.4957350380718708, | |
| "learning_rate": 5.970420357365486e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6843750439584255, | |
| "reward_std": 0.24306794814765453, | |
| "rewards/argmax_reward_func": 0.484375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 563.375, | |
| "epoch": 0.8849649441703453, | |
| "grad_norm": 0.0591680072247982, | |
| "kl": 0.47624582052230835, | |
| "learning_rate": 5.9543135119182514e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7757812812924385, | |
| "reward_std": 0.11269513890147209, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19765625335276127, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 607.78125, | |
| "epoch": 0.8870423266683978, | |
| "grad_norm": 0.0683642029762268, | |
| "kl": 0.5224468521773815, | |
| "learning_rate": 5.938196382364818e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8718750439584255, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 584.8125, | |
| "epoch": 0.8891197091664502, | |
| "grad_norm": 0.05711337924003601, | |
| "kl": 0.4908281937241554, | |
| "learning_rate": 5.9220691423905305e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.1325825173407793, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 626.34375, | |
| "epoch": 0.8911970916645027, | |
| "grad_norm": 0.060382284224033356, | |
| "kl": 0.586872935295105, | |
| "learning_rate": 5.9059319657896884e-05, | |
| "loss": 0.0003, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 597.703125, | |
| "epoch": 0.8932744741625552, | |
| "grad_norm": 0.07129113376140594, | |
| "kl": 0.6612692400813103, | |
| "learning_rate": 5.889785026463672e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8554687947034836, | |
| "reward_std": 0.17788155190646648, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 568.046875, | |
| "epoch": 0.8953518566606077, | |
| "grad_norm": 0.07546839118003845, | |
| "kl": 0.5523902028799057, | |
| "learning_rate": 5.873628498419073e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 573.609375, | |
| "epoch": 0.8974292391586601, | |
| "grad_norm": 0.06730344146490097, | |
| "kl": 0.5382697433233261, | |
| "learning_rate": 5.8574625557658095e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7312500402331352, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.53125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 576.84375, | |
| "epoch": 0.8995066216567126, | |
| "grad_norm": 0.04371188208460808, | |
| "kl": 0.49328725039958954, | |
| "learning_rate": 5.8412873727152595e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000439584255, | |
| "reward_std": 0.0883883461356163, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 634.609375, | |
| "epoch": 0.901584004154765, | |
| "grad_norm": 0.05455589294433594, | |
| "kl": 0.4651510939002037, | |
| "learning_rate": 5.825103123578379e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 607.640625, | |
| "epoch": 0.9036613866528175, | |
| "grad_norm": 0.06638182699680328, | |
| "kl": 0.4994208887219429, | |
| "learning_rate": 5.808909982763825e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 577.171875, | |
| "epoch": 0.9057387691508699, | |
| "grad_norm": 0.05038674548268318, | |
| "kl": 0.4841819517314434, | |
| "learning_rate": 5.792708124776072e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 618.765625, | |
| "epoch": 0.9078161516489224, | |
| "grad_norm": 0.06103122606873512, | |
| "kl": 0.45625371113419533, | |
| "learning_rate": 5.776497724213536e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 577.34375, | |
| "epoch": 0.9098935341469748, | |
| "grad_norm": 0.06659764796495438, | |
| "kl": 0.5014519467949867, | |
| "learning_rate": 5.760278955766695e-05, | |
| "loss": 0.0003, | |
| "reward": 0.7937500439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.59375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 588.828125, | |
| "epoch": 0.9119709166450273, | |
| "grad_norm": 0.06549356877803802, | |
| "kl": 0.4932373948395252, | |
| "learning_rate": 5.744051994216201e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 587.828125, | |
| "epoch": 0.9140482991430797, | |
| "grad_norm": 0.08965161442756653, | |
| "kl": 0.4944054037332535, | |
| "learning_rate": 5.727817014430992e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7777344174683094, | |
| "reward_std": 0.28781455382704735, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19960937649011612, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 580.125, | |
| "epoch": 0.9161256816411322, | |
| "grad_norm": 0.07723158597946167, | |
| "kl": 0.4910140074789524, | |
| "learning_rate": 5.7115741913664264e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.24306795187294483, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 598.890625, | |
| "epoch": 0.9182030641391846, | |
| "grad_norm": 0.068883016705513, | |
| "kl": 0.481427326798439, | |
| "learning_rate": 5.695323700062375e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7000000439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.5, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 600.140625, | |
| "epoch": 0.9202804466372371, | |
| "grad_norm": 0.07069353759288788, | |
| "kl": 0.5068237520754337, | |
| "learning_rate": 5.6790657156413504e-05, | |
| "loss": 0.0003, | |
| "reward": 0.714843787252903, | |
| "reward_std": 0.1999786365777254, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 587.625, | |
| "epoch": 0.9223578291352895, | |
| "grad_norm": 1.0064716339111328, | |
| "kl": 9.217760100960732, | |
| "learning_rate": 5.66280041330661e-05, | |
| "loss": 0.0046, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 590.78125, | |
| "epoch": 0.924435211633342, | |
| "grad_norm": 0.2128666639328003, | |
| "kl": 3.3808604292571545, | |
| "learning_rate": 5.646527968340278e-05, | |
| "loss": 0.0017, | |
| "reward": 0.7625000476837158, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 633.46875, | |
| "epoch": 0.9265125941313944, | |
| "grad_norm": 0.05456709861755371, | |
| "kl": 0.49203020706772804, | |
| "learning_rate": 5.6302485561014475e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9500000439584255, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 584.015625, | |
| "epoch": 0.9285899766294469, | |
| "grad_norm": 0.06649811565876007, | |
| "kl": 0.47326431795954704, | |
| "learning_rate": 5.613962352024292e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 657.1875, | |
| "epoch": 0.9306673591274993, | |
| "grad_norm": 0.08579502999782562, | |
| "kl": 0.4674902521073818, | |
| "learning_rate": 5.597669531616181e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.331456296145916, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 664.71875, | |
| "epoch": 0.9327447416255518, | |
| "grad_norm": 0.056638821959495544, | |
| "kl": 0.470287274569273, | |
| "learning_rate": 5.5813702704557814e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 618.296875, | |
| "epoch": 0.9348221241236042, | |
| "grad_norm": 0.040137626230716705, | |
| "kl": 0.46139009296894073, | |
| "learning_rate": 5.5650647441911706e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9656250476837158, | |
| "reward_std": 0.06629125960171223, | |
| "rewards/argmax_reward_func": 0.765625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 632.96875, | |
| "epoch": 0.9368995066216567, | |
| "grad_norm": 0.06284568458795547, | |
| "kl": 0.517881490290165, | |
| "learning_rate": 5.548753128537939e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 677.46875, | |
| "epoch": 0.9389768891197091, | |
| "grad_norm": 0.07211080193519592, | |
| "kl": 0.46745334565639496, | |
| "learning_rate": 5.532435599277303e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250439584255, | |
| "reward_std": 0.24306794814765453, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 697.96875, | |
| "epoch": 0.9410542716177617, | |
| "grad_norm": 0.06623782962560654, | |
| "kl": 0.4283139891922474, | |
| "learning_rate": 5.516112332254203e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.2209708634763956, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 619.375, | |
| "epoch": 0.9431316541158141, | |
| "grad_norm": 0.08626007288694382, | |
| "kl": 0.5277771130204201, | |
| "learning_rate": 5.499783503375412e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.30935920774936676, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 674.0, | |
| "epoch": 0.9452090366138666, | |
| "grad_norm": 0.06324354559183121, | |
| "kl": 0.4568277336657047, | |
| "learning_rate": 5.4834492886076446e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8714844174683094, | |
| "reward_std": 0.19942620425717905, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.19960937835276127, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 701.0625, | |
| "epoch": 0.947286419111919, | |
| "grad_norm": 0.07262270897626877, | |
| "kl": 0.4528024010360241, | |
| "learning_rate": 5.4671098639756504e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 693.90625, | |
| "epoch": 0.9493638016099715, | |
| "grad_norm": 0.06315562129020691, | |
| "kl": 0.43744752556085587, | |
| "learning_rate": 5.4507654055603275e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 664.984375, | |
| "epoch": 0.9514411841080239, | |
| "grad_norm": 0.06270638853311539, | |
| "kl": 0.4934372082352638, | |
| "learning_rate": 5.4344160894968145e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 679.78125, | |
| "epoch": 0.9535185666060764, | |
| "grad_norm": 0.06320095807313919, | |
| "kl": 0.4636671505868435, | |
| "learning_rate": 5.418062091972604e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9019531756639481, | |
| "reward_std": 0.15633688890375197, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.19882812537252903, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 768.765625, | |
| "epoch": 0.9555959491041288, | |
| "grad_norm": 0.08169972896575928, | |
| "kl": 0.4431908018887043, | |
| "learning_rate": 5.4017035892256365e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7773437909781933, | |
| "reward_std": 0.33256115578114986, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 649.203125, | |
| "epoch": 0.9576733316021813, | |
| "grad_norm": 0.06060326099395752, | |
| "kl": 0.4794473238289356, | |
| "learning_rate": 5.385340757542402e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6375000365078449, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.4375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 752.859375, | |
| "epoch": 0.9597507141002337, | |
| "grad_norm": 0.07235154509544373, | |
| "kl": 0.43572117015719414, | |
| "learning_rate": 5.36897377325604e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9328125417232513, | |
| "reward_std": 0.2452776599675417, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 680.78125, | |
| "epoch": 0.9618280965982862, | |
| "grad_norm": 0.07361527532339096, | |
| "kl": 0.45767712593078613, | |
| "learning_rate": 5.352602812744441e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.26516503654420376, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 702.609375, | |
| "epoch": 0.9639054790963386, | |
| "grad_norm": 0.0740986093878746, | |
| "kl": 0.493575606495142, | |
| "learning_rate": 5.336228052428348e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9031250476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.703125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 733.359375, | |
| "epoch": 0.9659828615943911, | |
| "grad_norm": 0.07798778265714645, | |
| "kl": 0.45794639363884926, | |
| "learning_rate": 5.319849668769449e-05, | |
| "loss": 0.0002, | |
| "reward": 0.6675781644880772, | |
| "reward_std": 0.2668223176151514, | |
| "rewards/argmax_reward_func": 0.46875, | |
| "rewards/format_reward_func": 0.19882812723517418, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 673.03125, | |
| "epoch": 0.9680602440924435, | |
| "grad_norm": 0.06236180663108826, | |
| "kl": 0.4699827618896961, | |
| "learning_rate": 5.303467838268478e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750439584255, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 695.03125, | |
| "epoch": 0.970137626590496, | |
| "grad_norm": 0.06047491356730461, | |
| "kl": 0.42734822258353233, | |
| "learning_rate": 5.287082737463317e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000439584255, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 700.515625, | |
| "epoch": 0.9722150090885484, | |
| "grad_norm": 0.05264519900083542, | |
| "kl": 0.514576718211174, | |
| "learning_rate": 5.270694542927088e-05, | |
| "loss": 0.0003, | |
| "reward": 0.9500000476837158, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 685.015625, | |
| "epoch": 0.9742923915866009, | |
| "grad_norm": 0.07000822573900223, | |
| "kl": 0.47352610528469086, | |
| "learning_rate": 5.254303431266254e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8382812961935997, | |
| "reward_std": 0.24638251960277557, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.19765625149011612, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 778.0, | |
| "epoch": 0.9763697740846533, | |
| "grad_norm": 0.0691061019897461, | |
| "kl": 0.44779016450047493, | |
| "learning_rate": 5.2379095791187124e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8238281644880772, | |
| "reward_std": 0.2226281464099884, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.19882812723517418, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 721.953125, | |
| "epoch": 0.9784471565827058, | |
| "grad_norm": 0.056446801871061325, | |
| "kl": 0.4972013346850872, | |
| "learning_rate": 5.2215131631518945e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 734.296875, | |
| "epoch": 0.9805245390807582, | |
| "grad_norm": 0.04742085933685303, | |
| "kl": 0.4290156289935112, | |
| "learning_rate": 5.20511436006086e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500439584255, | |
| "reward_std": 0.13258251920342445, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 699.078125, | |
| "epoch": 0.9826019215788107, | |
| "grad_norm": 0.06520857661962509, | |
| "kl": 0.44061052426695824, | |
| "learning_rate": 5.188713346566393e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9187500476837158, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 816.515625, | |
| "epoch": 0.9846793040768631, | |
| "grad_norm": 0.06763774901628494, | |
| "kl": 0.462362315505743, | |
| "learning_rate": 5.172310299413099e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8875000476837158, | |
| "reward_std": 0.2651650384068489, | |
| "rewards/argmax_reward_func": 0.6875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 703.703125, | |
| "epoch": 0.9867566865749156, | |
| "grad_norm": 0.06497927010059357, | |
| "kl": 0.4288316182792187, | |
| "learning_rate": 5.1559053953674975e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8562500476837158, | |
| "reward_std": 0.1767766922712326, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 734.265625, | |
| "epoch": 0.9888340690729681, | |
| "grad_norm": 0.061615679413080215, | |
| "kl": 0.4229474924504757, | |
| "learning_rate": 5.139498811216122e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7156250383704901, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.515625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 745.546875, | |
| "epoch": 0.9909114515710206, | |
| "grad_norm": 0.061035335063934326, | |
| "kl": 0.44998469576239586, | |
| "learning_rate": 5.123090723763606e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7781250476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.578125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 726.9375, | |
| "epoch": 0.992988834069073, | |
| "grad_norm": 0.05501917377114296, | |
| "kl": 0.4711364693939686, | |
| "learning_rate": 5.106681309830791e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9312500469386578, | |
| "reward_std": 0.1590990237891674, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.19687500223517418, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 731.265625, | |
| "epoch": 0.9950662165671255, | |
| "grad_norm": 0.06437938660383224, | |
| "kl": 0.488413542509079, | |
| "learning_rate": 5.090270746252802e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 877.296875, | |
| "epoch": 0.9971435990651779, | |
| "grad_norm": 0.05059191957116127, | |
| "kl": 0.3898215554654598, | |
| "learning_rate": 5.073859209877168e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9179687947034836, | |
| "reward_std": 0.22207572311162949, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 737.34375, | |
| "epoch": 0.9992209815632304, | |
| "grad_norm": 0.05600970238447189, | |
| "kl": 0.4228545166552067, | |
| "learning_rate": 5.057446877561884e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9179687909781933, | |
| "reward_std": 0.17788155004382133, | |
| "rewards/argmax_reward_func": 0.71875, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 867.2916666666666, | |
| "epoch": 1.0, | |
| "grad_norm": 0.0311344675719738, | |
| "kl": 0.4165251553058624, | |
| "learning_rate": 5.0410339261735384e-05, | |
| "loss": 0.0001, | |
| "reward": 0.9500000476837158, | |
| "reward_std": 0.23570225636164346, | |
| "rewards/argmax_reward_func": 0.75, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 721.59375, | |
| "epoch": 1.0020773824980524, | |
| "grad_norm": 0.08058605343103409, | |
| "kl": 0.4754480682313442, | |
| "learning_rate": 5.0246205325853826e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8250000476837158, | |
| "reward_std": 0.30935921147465706, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 749.40625, | |
| "epoch": 1.004154764996105, | |
| "grad_norm": 0.06808894872665405, | |
| "kl": 0.4143032245337963, | |
| "learning_rate": 5.008206873675433e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9343750476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 739.546875, | |
| "epoch": 1.0062321474941573, | |
| "grad_norm": 0.04413120448589325, | |
| "kl": 0.4007079564034939, | |
| "learning_rate": 4.991793126324568e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9656250476837158, | |
| "reward_std": 0.11048543266952038, | |
| "rewards/argmax_reward_func": 0.765625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 764.390625, | |
| "epoch": 1.0083095299922098, | |
| "grad_norm": 0.05544662848114967, | |
| "kl": 0.40518468618392944, | |
| "learning_rate": 4.9753794674146206e-05, | |
| "loss": 0.0002, | |
| "reward": 1.0125000476837158, | |
| "reward_std": 0.17677669040858746, | |
| "rewards/argmax_reward_func": 0.8125, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 770.71875, | |
| "epoch": 1.0103869124902622, | |
| "grad_norm": 0.0641309842467308, | |
| "kl": 0.41656066104769707, | |
| "learning_rate": 4.9589660738264614e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8718750476837158, | |
| "reward_std": 0.19887377880513668, | |
| "rewards/argmax_reward_func": 0.671875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 774.984375, | |
| "epoch": 1.0124642949883147, | |
| "grad_norm": 0.06294507533311844, | |
| "kl": 0.41369784995913506, | |
| "learning_rate": 4.9425531224381163e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000402331352, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 725.171875, | |
| "epoch": 1.0145416774863671, | |
| "grad_norm": 0.058232299983501434, | |
| "kl": 0.4683380052447319, | |
| "learning_rate": 4.926140790122835e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8406250476837158, | |
| "reward_std": 0.19887377694249153, | |
| "rewards/argmax_reward_func": 0.640625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 782.046875, | |
| "epoch": 1.0166190599844196, | |
| "grad_norm": 0.053580548614263535, | |
| "kl": 0.42908982560038567, | |
| "learning_rate": 4.909729253747197e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750476837158, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 812.921875, | |
| "epoch": 1.018696442482472, | |
| "grad_norm": 0.07418368011713028, | |
| "kl": 0.4282660707831383, | |
| "learning_rate": 4.893318690169211e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7625000476837158, | |
| "reward_std": 0.30935920774936676, | |
| "rewards/argmax_reward_func": 0.5625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 772.4375, | |
| "epoch": 1.0207738249805245, | |
| "grad_norm": 0.053191013634204865, | |
| "kl": 0.42502470314502716, | |
| "learning_rate": 4.876909276236395e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 752.390625, | |
| "epoch": 1.022851207478577, | |
| "grad_norm": 0.05855753272771835, | |
| "kl": 0.42473678290843964, | |
| "learning_rate": 4.8605011887838797e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8554687947034836, | |
| "reward_std": 0.22207572497427464, | |
| "rewards/argmax_reward_func": 0.65625, | |
| "rewards/format_reward_func": 0.19921875186264515, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 757.84375, | |
| "epoch": 1.0249285899766294, | |
| "grad_norm": 0.06828629225492477, | |
| "kl": 0.409926887601614, | |
| "learning_rate": 4.844094604632502e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9968750476837158, | |
| "reward_std": 0.24306795001029968, | |
| "rewards/argmax_reward_func": 0.796875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 781.3125, | |
| "epoch": 1.0270059724746818, | |
| "grad_norm": 0.07438631355762482, | |
| "kl": 0.41192958503961563, | |
| "learning_rate": 4.827689700586902e-05, | |
| "loss": 0.0002, | |
| "reward": 0.8093750439584255, | |
| "reward_std": 0.331456296145916, | |
| "rewards/argmax_reward_func": 0.609375, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 793.234375, | |
| "epoch": 1.0290833549727343, | |
| "grad_norm": 0.05079561844468117, | |
| "kl": 0.4043182320892811, | |
| "learning_rate": 4.811286653433609e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750476837158, | |
| "reward_std": 0.15467960387468338, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 781.796875, | |
| "epoch": 1.0311607374707867, | |
| "grad_norm": 0.05202874913811684, | |
| "kl": 0.413474939763546, | |
| "learning_rate": 4.794885639939142e-05, | |
| "loss": 0.0002, | |
| "reward": 0.7468750439584255, | |
| "reward_std": 0.15467960573732853, | |
| "rewards/argmax_reward_func": 0.546875, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 810.25, | |
| "epoch": 1.0332381199688392, | |
| "grad_norm": 0.05652881786227226, | |
| "kl": 0.40745414793491364, | |
| "learning_rate": 4.7784868368481067e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9335937947034836, | |
| "reward_std": 0.1999786328524351, | |
| "rewards/argmax_reward_func": 0.734375, | |
| "rewards/format_reward_func": 0.1992187537252903, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 799.765625, | |
| "epoch": 1.0353155024668916, | |
| "grad_norm": 0.0548785924911499, | |
| "kl": 0.39410270750522614, | |
| "learning_rate": 4.762090420881289e-05, | |
| "loss": 0.0002, | |
| "reward": 0.9953125491738319, | |
| "reward_std": 0.1568893175572157, | |
| "rewards/argmax_reward_func": 0.796875, | |
| "rewards/format_reward_func": 0.1984375026077032, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 777.75, | |
| "epoch": 1.037392884964944, | |
| "grad_norm": 0.06177806481719017, | |
| "kl": 0.6858577094972134, | |
| "learning_rate": 4.745696568733748e-05, | |
| "loss": 0.0003, | |
| "reward": 0.8250000439584255, | |
| "reward_std": 0.22097086533904076, | |
| "rewards/argmax_reward_func": 0.625, | |
| "rewards/format_reward_func": 0.20000000298023224, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 962, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |