{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5197505197505198, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 167.609375, "epoch": 0.0010395010395010396, "grad_norm": 3.384965902533218, "kl": 0.423828125, "learning_rate": 9.989604989604989e-07, "loss": 0.017, "reward": 0.328125, "reward_std": 0.4426340162754059, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.1875, "step": 1 }, { "completion_length": 99.421875, "epoch": 0.002079002079002079, "grad_norm": 4.010451982192975, "kl": 0.232421875, "learning_rate": 9.97920997920998e-07, "loss": 0.0093, "reward": 0.828125, "reward_std": 0.5546915531158447, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.5, "step": 2 }, { "completion_length": 152.234375, "epoch": 0.0031185031185031187, "grad_norm": 2.50510324672354, "kl": 0.09716796875, "learning_rate": 9.968814968814967e-07, "loss": 0.0039, "reward": 0.765625, "reward_std": 0.5627470016479492, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.4375, "step": 3 }, { "completion_length": 159.671875, "epoch": 0.004158004158004158, "grad_norm": 2.685973626492788, "kl": 0.1826171875, "learning_rate": 9.958419958419959e-07, "loss": 0.0073, "reward": 0.765625, "reward_std": 0.5857534408569336, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.46875, "step": 4 }, { "completion_length": 145.78125, "epoch": 0.005197505197505198, "grad_norm": 4.083886116082862, "kl": 0.4140625, "learning_rate": 9.948024948024948e-07, "loss": 0.0166, "reward": 0.71875, "reward_std": 0.5261132717132568, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.640625, "step": 5 }, { "completion_length": 105.171875, "epoch": 0.006237006237006237, "grad_norm": 3.7990116330116583, "kl": 0.353515625, "learning_rate": 9.937629937629938e-07, "loss": 0.0141, "reward": 1.15625, "reward_std": 0.4218915104866028, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.734375, "step": 6 }, { "completion_length": 96.65625, "epoch": 0.007276507276507277, "grad_norm": 4.201704051454585, "kl": 0.30859375, "learning_rate": 9.927234927234927e-07, "loss": 0.0123, "reward": 1.359375, "reward_std": 0.508479118347168, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.84375, "step": 7 }, { "completion_length": 127.9375, "epoch": 0.008316008316008316, "grad_norm": 5.088551827269882, "kl": 0.341796875, "learning_rate": 9.916839916839916e-07, "loss": 0.0137, "reward": 1.015625, "reward_std": 0.532598614692688, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.703125, "step": 8 }, { "completion_length": 121.359375, "epoch": 0.009355509355509356, "grad_norm": 3.013078644977297, "kl": 0.263671875, "learning_rate": 9.906444906444906e-07, "loss": 0.0105, "reward": 1.171875, "reward_std": 0.5579804182052612, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.859375, "step": 9 }, { "completion_length": 107.984375, "epoch": 0.010395010395010396, "grad_norm": 13.14098090276698, "kl": 0.298828125, "learning_rate": 9.896049896049897e-07, "loss": 0.012, "reward": 1.078125, "reward_std": 0.39907702803611755, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.875, "step": 10 }, { "completion_length": 98.875, "epoch": 0.011434511434511435, "grad_norm": 3.959516469755692, "kl": 0.419921875, "learning_rate": 9.885654885654884e-07, "loss": 0.0168, "reward": 1.234375, "reward_std": 0.3804909586906433, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.953125, "step": 11 }, { "completion_length": 120.53125, "epoch": 0.012474012474012475, "grad_norm": 3.938285492512014, "kl": 0.2060546875, "learning_rate": 9.875259875259876e-07, "loss": 0.0083, "reward": 0.953125, "reward_std": 0.37298911809921265, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.890625, "step": 12 }, { "completion_length": 80.234375, "epoch": 0.013513513513513514, "grad_norm": 2.7510195000927107, "kl": 0.4296875, "learning_rate": 9.864864864864865e-07, "loss": 0.0172, "reward": 1.578125, "reward_std": 0.24039676785469055, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.96875, "step": 13 }, { "completion_length": 91.453125, "epoch": 0.014553014553014554, "grad_norm": 2.8921535294998364, "kl": 0.18359375, "learning_rate": 9.854469854469854e-07, "loss": 0.0074, "reward": 1.21875, "reward_std": 0.3208816647529602, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.984375, "step": 14 }, { "completion_length": 80.5, "epoch": 0.015592515592515593, "grad_norm": 3.7454869808999534, "kl": 0.234375, "learning_rate": 9.844074844074844e-07, "loss": 0.0094, "reward": 1.1875, "reward_std": 0.4659920632839203, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9375, "step": 15 }, { "completion_length": 85.046875, "epoch": 0.016632016632016633, "grad_norm": 19.522379255276192, "kl": 0.34765625, "learning_rate": 9.833679833679833e-07, "loss": 0.014, "reward": 1.109375, "reward_std": 0.49365806579589844, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.890625, "step": 16 }, { "completion_length": 93.390625, "epoch": 0.017671517671517672, "grad_norm": 4.298078954922317, "kl": 0.33984375, "learning_rate": 9.823284823284822e-07, "loss": 0.0136, "reward": 1.078125, "reward_std": 0.4204515814781189, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.90625, "step": 17 }, { "completion_length": 106.4375, "epoch": 0.018711018711018712, "grad_norm": 2.412950307153221, "kl": 0.212890625, "learning_rate": 9.812889812889814e-07, "loss": 0.0085, "reward": 1.125, "reward_std": 0.29176726937294006, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "step": 18 }, { "completion_length": 96.796875, "epoch": 0.01975051975051975, "grad_norm": 2.2611660961515616, "kl": 0.390625, "learning_rate": 9.8024948024948e-07, "loss": 0.0156, "reward": 1.140625, "reward_std": 0.28412121534347534, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "step": 19 }, { "completion_length": 77.390625, "epoch": 0.02079002079002079, "grad_norm": 3.90061596001628, "kl": 0.314453125, "learning_rate": 9.792099792099792e-07, "loss": 0.0126, "reward": 1.4375, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 20 }, { "completion_length": 87.265625, "epoch": 0.02182952182952183, "grad_norm": 5.857765767712962, "kl": 0.390625, "learning_rate": 9.781704781704782e-07, "loss": 0.0156, "reward": 1.328125, "reward_std": 0.38664889335632324, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.984375, "step": 21 }, { "completion_length": 100.4375, "epoch": 0.02286902286902287, "grad_norm": 2.676679829022191, "kl": 0.28515625, "learning_rate": 9.77130977130977e-07, "loss": 0.0114, "reward": 1.078125, "reward_std": 0.37909185886383057, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.890625, "step": 22 }, { "completion_length": 89.5625, "epoch": 0.02390852390852391, "grad_norm": 2.712931732211751, "kl": 0.287109375, "learning_rate": 9.76091476091476e-07, "loss": 0.0115, "reward": 1.21875, "reward_std": 0.4278908669948578, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9375, "step": 23 }, { "completion_length": 112.53125, "epoch": 0.02494802494802495, "grad_norm": 2.788638415909055, "kl": 0.30078125, "learning_rate": 9.75051975051975e-07, "loss": 0.012, "reward": 1.203125, "reward_std": 0.3813319802284241, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "step": 24 }, { "completion_length": 92.328125, "epoch": 0.02598752598752599, "grad_norm": 2.294914593938693, "kl": 0.267578125, "learning_rate": 9.74012474012474e-07, "loss": 0.0107, "reward": 1.265625, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 1.0, "step": 25 }, { "completion_length": 82.515625, "epoch": 0.02702702702702703, "grad_norm": 6.578560845833114, "kl": 0.546875, "learning_rate": 9.72972972972973e-07, "loss": 0.0219, "reward": 1.265625, "reward_std": 0.3762476146221161, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 26 }, { "completion_length": 103.84375, "epoch": 0.028066528066528068, "grad_norm": 2.963633005149672, "kl": 0.453125, "learning_rate": 9.71933471933472e-07, "loss": 0.0181, "reward": 1.375, "reward_std": 0.41610971093177795, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.953125, "step": 27 }, { "completion_length": 102.34375, "epoch": 0.029106029106029108, "grad_norm": 2.3610893431114945, "kl": 0.349609375, "learning_rate": 9.708939708939709e-07, "loss": 0.014, "reward": 1.359375, "reward_std": 0.30617380142211914, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.96875, "step": 28 }, { "completion_length": 101.859375, "epoch": 0.030145530145530147, "grad_norm": 3.3184214937295375, "kl": 0.248046875, "learning_rate": 9.698544698544698e-07, "loss": 0.0099, "reward": 1.203125, "reward_std": 0.4559571444988251, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.921875, "step": 29 }, { "completion_length": 90.703125, "epoch": 0.031185031185031187, "grad_norm": 5.762768283894928, "kl": 0.248046875, "learning_rate": 9.688149688149688e-07, "loss": 0.0099, "reward": 1.1875, "reward_std": 0.4003184735774994, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.96875, "step": 30 }, { "completion_length": 81.375, "epoch": 0.032224532224532226, "grad_norm": 3.3322341115231118, "kl": 0.302734375, "learning_rate": 9.677754677754677e-07, "loss": 0.0121, "reward": 1.21875, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "step": 31 }, { "completion_length": 92.53125, "epoch": 0.033264033264033266, "grad_norm": 4.7403464104013855, "kl": 0.41015625, "learning_rate": 9.667359667359666e-07, "loss": 0.0164, "reward": 1.3125, "reward_std": 0.42873120307922363, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.953125, "step": 32 }, { "completion_length": 84.796875, "epoch": 0.034303534303534305, "grad_norm": 3.363277684222119, "kl": 0.287109375, "learning_rate": 9.656964656964658e-07, "loss": 0.0115, "reward": 1.34375, "reward_std": 0.28247910737991333, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 33 }, { "completion_length": 87.484375, "epoch": 0.035343035343035345, "grad_norm": 2.9891514003852397, "kl": 0.322265625, "learning_rate": 9.646569646569647e-07, "loss": 0.0129, "reward": 1.328125, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.96875, "step": 34 }, { "completion_length": 86.71875, "epoch": 0.036382536382536385, "grad_norm": 2.8231753083001343, "kl": 0.298828125, "learning_rate": 9.636174636174636e-07, "loss": 0.0119, "reward": 1.15625, "reward_std": 0.3377464711666107, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.953125, "step": 35 }, { "completion_length": 85.96875, "epoch": 0.037422037422037424, "grad_norm": 3.8473202673864697, "kl": 0.447265625, "learning_rate": 9.625779625779626e-07, "loss": 0.0179, "reward": 1.328125, "reward_std": 0.46307316422462463, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.9375, "step": 36 }, { "completion_length": 111.28125, "epoch": 0.038461538461538464, "grad_norm": 2.2648971842321735, "kl": 0.310546875, "learning_rate": 9.615384615384615e-07, "loss": 0.0124, "reward": 1.15625, "reward_std": 0.3420785665512085, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "step": 37 }, { "completion_length": 76.046875, "epoch": 0.0395010395010395, "grad_norm": 3.084036494604344, "kl": 0.189453125, "learning_rate": 9.604989604989604e-07, "loss": 0.0076, "reward": 1.265625, "reward_std": 0.40890365839004517, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.96875, "step": 38 }, { "completion_length": 78.59375, "epoch": 0.04054054054054054, "grad_norm": 3.47905631150264, "kl": 0.314453125, "learning_rate": 9.594594594594594e-07, "loss": 0.0126, "reward": 1.375, "reward_std": 0.32474884390830994, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.953125, "step": 39 }, { "completion_length": 73.3125, "epoch": 0.04158004158004158, "grad_norm": 3.8602279426046575, "kl": 0.29296875, "learning_rate": 9.584199584199583e-07, "loss": 0.0117, "reward": 1.640625, "reward_std": 0.4024401307106018, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 40 }, { "completion_length": 98.296875, "epoch": 0.04261954261954262, "grad_norm": 12.185855347304553, "kl": 0.337890625, "learning_rate": 9.573804573804574e-07, "loss": 0.0135, "reward": 1.328125, "reward_std": 0.45242589712142944, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.96875, "step": 41 }, { "completion_length": 86.5625, "epoch": 0.04365904365904366, "grad_norm": 4.45323380584257, "kl": 0.58203125, "learning_rate": 9.563409563409564e-07, "loss": 0.0233, "reward": 1.421875, "reward_std": 0.41749441623687744, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 42 }, { "completion_length": 91.234375, "epoch": 0.0446985446985447, "grad_norm": 2.646987610793277, "kl": 0.2734375, "learning_rate": 9.553014553014553e-07, "loss": 0.0109, "reward": 1.375, "reward_std": 0.2756394147872925, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.984375, "step": 43 }, { "completion_length": 89.0, "epoch": 0.04573804573804574, "grad_norm": 2.9433826771733136, "kl": 0.24609375, "learning_rate": 9.542619542619542e-07, "loss": 0.0098, "reward": 1.390625, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 44 }, { "completion_length": 102.984375, "epoch": 0.04677754677754678, "grad_norm": 3.670730844456411, "kl": 0.330078125, "learning_rate": 9.532224532224532e-07, "loss": 0.0132, "reward": 1.328125, "reward_std": 0.4624157249927521, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.984375, "step": 45 }, { "completion_length": 95.40625, "epoch": 0.04781704781704782, "grad_norm": 3.3683953085657583, "kl": 0.29296875, "learning_rate": 9.521829521829522e-07, "loss": 0.0118, "reward": 1.421875, "reward_std": 0.338117778301239, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 46 }, { "completion_length": 79.71875, "epoch": 0.04885654885654886, "grad_norm": 3.1789712912565293, "kl": 0.310546875, "learning_rate": 9.511434511434511e-07, "loss": 0.0125, "reward": 1.234375, "reward_std": 0.3931124210357666, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "step": 47 }, { "completion_length": 95.578125, "epoch": 0.0498960498960499, "grad_norm": 2.4758505450209007, "kl": 0.4921875, "learning_rate": 9.501039501039501e-07, "loss": 0.0196, "reward": 1.078125, "reward_std": 0.24831004440784454, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.984375, "step": 48 }, { "completion_length": 74.390625, "epoch": 0.05093555093555094, "grad_norm": 3.029362594665336, "kl": 0.251953125, "learning_rate": 9.490644490644491e-07, "loss": 0.0101, "reward": 1.46875, "reward_std": 0.31300368905067444, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 49 }, { "completion_length": 97.390625, "epoch": 0.05197505197505198, "grad_norm": 2.9018083463764883, "kl": 0.28125, "learning_rate": 9.48024948024948e-07, "loss": 0.0112, "reward": 1.234375, "reward_std": 0.45346418023109436, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9375, "step": 50 }, { "completion_length": 102.859375, "epoch": 0.05301455301455302, "grad_norm": 2.9814050035921293, "kl": 0.3125, "learning_rate": 9.46985446985447e-07, "loss": 0.0125, "reward": 1.15625, "reward_std": 0.37944284081459045, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.921875, "step": 51 }, { "completion_length": 115.96875, "epoch": 0.05405405405405406, "grad_norm": 2.545086760197515, "kl": 0.263671875, "learning_rate": 9.459459459459459e-07, "loss": 0.0105, "reward": 1.265625, "reward_std": 0.32878512144088745, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 52 }, { "completion_length": 115.8125, "epoch": 0.0550935550935551, "grad_norm": 2.6627986532935357, "kl": 0.35546875, "learning_rate": 9.449064449064449e-07, "loss": 0.0143, "reward": 1.359375, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.953125, "step": 53 }, { "completion_length": 99.546875, "epoch": 0.056133056133056136, "grad_norm": 5.527757679704933, "kl": 0.55078125, "learning_rate": 9.438669438669439e-07, "loss": 0.022, "reward": 1.453125, "reward_std": 0.41398805379867554, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 54 }, { "completion_length": 137.171875, "epoch": 0.057172557172557176, "grad_norm": 1.8818465440799708, "kl": 0.408203125, "learning_rate": 9.428274428274428e-07, "loss": 0.0164, "reward": 1.078125, "reward_std": 0.35066378116607666, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "step": 55 }, { "completion_length": 113.34375, "epoch": 0.058212058212058215, "grad_norm": 3.2070622502548543, "kl": 0.380859375, "learning_rate": 9.417879417879417e-07, "loss": 0.0152, "reward": 1.28125, "reward_std": 0.2845909595489502, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.984375, "step": 56 }, { "completion_length": 110.890625, "epoch": 0.059251559251559255, "grad_norm": 3.959257619835642, "kl": 0.384765625, "learning_rate": 9.407484407484408e-07, "loss": 0.0154, "reward": 1.328125, "reward_std": 0.3751009702682495, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.96875, "step": 57 }, { "completion_length": 115.453125, "epoch": 0.060291060291060294, "grad_norm": 2.763319824032788, "kl": 0.369140625, "learning_rate": 9.397089397089397e-07, "loss": 0.0147, "reward": 1.265625, "reward_std": 0.4024401307106018, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.96875, "step": 58 }, { "completion_length": 115.296875, "epoch": 0.061330561330561334, "grad_norm": 2.8966205370851386, "kl": 0.296875, "learning_rate": 9.386694386694386e-07, "loss": 0.0119, "reward": 1.21875, "reward_std": 0.34891825914382935, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.890625, "step": 59 }, { "completion_length": 82.84375, "epoch": 0.062370062370062374, "grad_norm": 3.878835501099912, "kl": 0.302734375, "learning_rate": 9.376299376299376e-07, "loss": 0.0121, "reward": 1.28125, "reward_std": 0.2346404641866684, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 1.0, "step": 60 }, { "completion_length": 111.5625, "epoch": 0.06340956340956341, "grad_norm": 3.0582156540380345, "kl": 0.2578125, "learning_rate": 9.365904365904366e-07, "loss": 0.0103, "reward": 1.046875, "reward_std": 0.2777610421180725, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9375, "step": 61 }, { "completion_length": 121.9375, "epoch": 0.06444906444906445, "grad_norm": 2.5261151410620424, "kl": 0.228515625, "learning_rate": 9.355509355509355e-07, "loss": 0.0091, "reward": 1.09375, "reward_std": 0.40678203105926514, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.875, "step": 62 }, { "completion_length": 105.40625, "epoch": 0.06548856548856549, "grad_norm": 2.7497728824845264, "kl": 0.267578125, "learning_rate": 9.345114345114345e-07, "loss": 0.0107, "reward": 1.265625, "reward_std": 0.33417510986328125, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 63 }, { "completion_length": 117.234375, "epoch": 0.06652806652806653, "grad_norm": 2.3142965179660977, "kl": 0.2001953125, "learning_rate": 9.334719334719334e-07, "loss": 0.008, "reward": 1.15625, "reward_std": 0.39378440380096436, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.953125, "step": 64 }, { "completion_length": 135.65625, "epoch": 0.06756756756756757, "grad_norm": 2.409253542878613, "kl": 0.287109375, "learning_rate": 9.324324324324324e-07, "loss": 0.0115, "reward": 1.140625, "reward_std": 0.3931124210357666, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "step": 65 }, { "completion_length": 104.234375, "epoch": 0.06860706860706861, "grad_norm": 2.7748693099355592, "kl": 0.1943359375, "learning_rate": 9.313929313929314e-07, "loss": 0.0078, "reward": 1.328125, "reward_std": 0.3657732605934143, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.96875, "step": 66 }, { "completion_length": 112.578125, "epoch": 0.06964656964656965, "grad_norm": 2.111070798611204, "kl": 0.3046875, "learning_rate": 9.303534303534303e-07, "loss": 0.0122, "reward": 1.15625, "reward_std": 0.2346404492855072, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.953125, "step": 67 }, { "completion_length": 106.65625, "epoch": 0.07068607068607069, "grad_norm": 3.3113935279262456, "kl": 0.380859375, "learning_rate": 9.293139293139292e-07, "loss": 0.0152, "reward": 1.375, "reward_std": 0.4530978500843048, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.984375, "step": 68 }, { "completion_length": 127.625, "epoch": 0.07172557172557173, "grad_norm": 2.92525637448603, "kl": 0.248046875, "learning_rate": 9.282744282744283e-07, "loss": 0.0099, "reward": 1.171875, "reward_std": 0.5029704570770264, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.90625, "step": 69 }, { "completion_length": 114.828125, "epoch": 0.07276507276507277, "grad_norm": 2.000528265763137, "kl": 0.1533203125, "learning_rate": 9.272349272349273e-07, "loss": 0.0061, "reward": 1.34375, "reward_std": 0.22201897203922272, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.96875, "step": 70 }, { "completion_length": 103.84375, "epoch": 0.07380457380457381, "grad_norm": 3.163825220349421, "kl": 0.4375, "learning_rate": 9.261954261954261e-07, "loss": 0.0175, "reward": 1.21875, "reward_std": 0.3873208165168762, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "step": 71 }, { "completion_length": 113.5, "epoch": 0.07484407484407485, "grad_norm": 4.376461774285187, "kl": 0.455078125, "learning_rate": 9.251559251559252e-07, "loss": 0.0182, "reward": 1.28125, "reward_std": 0.3834536373615265, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.953125, "step": 72 }, { "completion_length": 112.046875, "epoch": 0.07588357588357589, "grad_norm": 2.303626338128328, "kl": 0.3046875, "learning_rate": 9.241164241164241e-07, "loss": 0.0122, "reward": 1.140625, "reward_std": 0.4128749668598175, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.921875, "step": 73 }, { "completion_length": 109.46875, "epoch": 0.07692307692307693, "grad_norm": 3.7745885873169462, "kl": 0.228515625, "learning_rate": 9.230769230769231e-07, "loss": 0.0091, "reward": 1.375, "reward_std": 0.34632188081741333, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.9375, "step": 74 }, { "completion_length": 100.109375, "epoch": 0.07796257796257797, "grad_norm": 2.2745584201622995, "kl": 0.3671875, "learning_rate": 9.22037422037422e-07, "loss": 0.0147, "reward": 1.46875, "reward_std": 0.3319803476333618, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.953125, "step": 75 }, { "completion_length": 111.828125, "epoch": 0.079002079002079, "grad_norm": 4.897879127935987, "kl": 0.453125, "learning_rate": 9.20997920997921e-07, "loss": 0.0181, "reward": 1.53125, "reward_std": 0.5561797022819519, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.96875, "step": 76 }, { "completion_length": 116.359375, "epoch": 0.08004158004158005, "grad_norm": 2.908458010027759, "kl": 0.2392578125, "learning_rate": 9.199584199584199e-07, "loss": 0.0096, "reward": 1.109375, "reward_std": 0.32878512144088745, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.90625, "step": 77 }, { "completion_length": 101.328125, "epoch": 0.08108108108108109, "grad_norm": 9.104110254047791, "kl": 0.7421875, "learning_rate": 9.18918918918919e-07, "loss": 0.0298, "reward": 1.3125, "reward_std": 0.342454731464386, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.984375, "step": 78 }, { "completion_length": 107.71875, "epoch": 0.08212058212058213, "grad_norm": 2.039274999402774, "kl": 0.1748046875, "learning_rate": 9.178794178794178e-07, "loss": 0.007, "reward": 1.484375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 79 }, { "completion_length": 110.3125, "epoch": 0.08316008316008316, "grad_norm": 2.5522421332959033, "kl": 0.3359375, "learning_rate": 9.168399168399168e-07, "loss": 0.0134, "reward": 1.296875, "reward_std": 0.379610151052475, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.96875, "step": 80 }, { "completion_length": 111.65625, "epoch": 0.0841995841995842, "grad_norm": 2.7510124545518746, "kl": 0.291015625, "learning_rate": 9.158004158004158e-07, "loss": 0.0117, "reward": 1.34375, "reward_std": 0.39347875118255615, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 81 }, { "completion_length": 120.5625, "epoch": 0.08523908523908524, "grad_norm": 2.600192149102474, "kl": 0.15625, "learning_rate": 9.147609147609148e-07, "loss": 0.0062, "reward": 1.296875, "reward_std": 0.3718760311603546, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.9375, "step": 82 }, { "completion_length": 105.90625, "epoch": 0.08627858627858628, "grad_norm": 2.854219950777657, "kl": 0.1982421875, "learning_rate": 9.137214137214136e-07, "loss": 0.0079, "reward": 1.421875, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 83 }, { "completion_length": 103.234375, "epoch": 0.08731808731808732, "grad_norm": 7.069266742884403, "kl": 0.306640625, "learning_rate": 9.126819126819127e-07, "loss": 0.0123, "reward": 1.265625, "reward_std": 0.3766237497329712, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 84 }, { "completion_length": 122.265625, "epoch": 0.08835758835758836, "grad_norm": 5.232802902595259, "kl": 0.373046875, "learning_rate": 9.116424116424116e-07, "loss": 0.0149, "reward": 1.203125, "reward_std": 0.38627272844314575, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "step": 85 }, { "completion_length": 128.09375, "epoch": 0.0893970893970894, "grad_norm": 2.3365914823291547, "kl": 0.474609375, "learning_rate": 9.106029106029106e-07, "loss": 0.0189, "reward": 1.296875, "reward_std": 0.31761831045150757, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.984375, "step": 86 }, { "completion_length": 109.078125, "epoch": 0.09043659043659044, "grad_norm": 2.677767787509197, "kl": 0.3125, "learning_rate": 9.095634095634094e-07, "loss": 0.0125, "reward": 1.28125, "reward_std": 0.42081791162490845, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.890625, "step": 87 }, { "completion_length": 120.59375, "epoch": 0.09147609147609148, "grad_norm": 3.178614192574966, "kl": 0.291015625, "learning_rate": 9.085239085239085e-07, "loss": 0.0116, "reward": 1.25, "reward_std": 0.46416622400283813, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9375, "step": 88 }, { "completion_length": 111.765625, "epoch": 0.09251559251559252, "grad_norm": 2.422132707646152, "kl": 0.1923828125, "learning_rate": 9.074844074844074e-07, "loss": 0.0077, "reward": 1.09375, "reward_std": 0.35247981548309326, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "step": 89 }, { "completion_length": 137.71875, "epoch": 0.09355509355509356, "grad_norm": 4.000979691308957, "kl": 0.12060546875, "learning_rate": 9.064449064449065e-07, "loss": 0.0048, "reward": 1.1875, "reward_std": 0.5443553328514099, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.78125, "step": 90 }, { "completion_length": 121.375, "epoch": 0.0945945945945946, "grad_norm": 2.165351772309914, "kl": 0.2255859375, "learning_rate": 9.054054054054053e-07, "loss": 0.009, "reward": 1.40625, "reward_std": 0.2756393849849701, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 91 }, { "completion_length": 107.484375, "epoch": 0.09563409563409564, "grad_norm": 5.3668012064331485, "kl": 0.298828125, "learning_rate": 9.043659043659043e-07, "loss": 0.012, "reward": 1.25, "reward_std": 0.38452720642089844, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "step": 92 }, { "completion_length": 129.875, "epoch": 0.09667359667359668, "grad_norm": 2.8176458437862024, "kl": 0.291015625, "learning_rate": 9.033264033264034e-07, "loss": 0.0116, "reward": 1.1875, "reward_std": 0.3777071237564087, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.96875, "step": 93 }, { "completion_length": 112.5625, "epoch": 0.09771309771309772, "grad_norm": 3.3935322694258407, "kl": 0.28515625, "learning_rate": 9.022869022869023e-07, "loss": 0.0114, "reward": 1.359375, "reward_std": 0.4987064599990845, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.953125, "step": 94 }, { "completion_length": 114.15625, "epoch": 0.09875259875259876, "grad_norm": 2.598586845093194, "kl": 0.29296875, "learning_rate": 9.012474012474012e-07, "loss": 0.0117, "reward": 1.234375, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "step": 95 }, { "completion_length": 109.578125, "epoch": 0.0997920997920998, "grad_norm": 4.80611439340846, "kl": 0.28515625, "learning_rate": 9.002079002079002e-07, "loss": 0.0115, "reward": 1.15625, "reward_std": 0.3119301199913025, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.953125, "step": 96 }, { "completion_length": 115.0625, "epoch": 0.10083160083160084, "grad_norm": 2.9251632543351986, "kl": 0.400390625, "learning_rate": 8.991683991683992e-07, "loss": 0.0161, "reward": 1.28125, "reward_std": 0.37086743116378784, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9375, "step": 97 }, { "completion_length": 126.140625, "epoch": 0.10187110187110188, "grad_norm": 3.196609595258664, "kl": 0.1435546875, "learning_rate": 8.981288981288981e-07, "loss": 0.0058, "reward": 1.28125, "reward_std": 0.42044180631637573, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9375, "step": 98 }, { "completion_length": 126.953125, "epoch": 0.10291060291060292, "grad_norm": 2.2030053954052558, "kl": 0.162109375, "learning_rate": 8.97089397089397e-07, "loss": 0.0065, "reward": 1.1875, "reward_std": 0.4196762442588806, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9375, "step": 99 }, { "completion_length": 108.46875, "epoch": 0.10395010395010396, "grad_norm": 2.5419167973263126, "kl": 0.259765625, "learning_rate": 8.96049896049896e-07, "loss": 0.0104, "reward": 1.328125, "reward_std": 0.3403330445289612, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 1.0, "step": 100 }, { "completion_length": 109.484375, "epoch": 0.104989604989605, "grad_norm": 3.007811716939622, "kl": 0.271484375, "learning_rate": 8.95010395010395e-07, "loss": 0.0109, "reward": 1.40625, "reward_std": 0.5542179942131042, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.953125, "step": 101 }, { "completion_length": 110.0625, "epoch": 0.10602910602910603, "grad_norm": 3.224264825300265, "kl": 0.283203125, "learning_rate": 8.93970893970894e-07, "loss": 0.0113, "reward": 1.265625, "reward_std": 0.4024401307106018, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 102 }, { "completion_length": 109.578125, "epoch": 0.10706860706860707, "grad_norm": 3.065490684046226, "kl": 0.240234375, "learning_rate": 8.929313929313929e-07, "loss": 0.0096, "reward": 1.234375, "reward_std": 0.4301159381866455, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9375, "step": 103 }, { "completion_length": 107.890625, "epoch": 0.10810810810810811, "grad_norm": 1.8617549882485043, "kl": 0.3359375, "learning_rate": 8.918918918918918e-07, "loss": 0.0134, "reward": 1.09375, "reward_std": 0.2709311544895172, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "step": 104 }, { "completion_length": 93.21875, "epoch": 0.10914760914760915, "grad_norm": 2.9237840712859406, "kl": 0.388671875, "learning_rate": 8.908523908523909e-07, "loss": 0.0156, "reward": 1.421875, "reward_std": 0.3571978509426117, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 105 }, { "completion_length": 117.46875, "epoch": 0.1101871101871102, "grad_norm": 1.8423829545194714, "kl": 0.1455078125, "learning_rate": 8.898128898128898e-07, "loss": 0.0058, "reward": 1.3125, "reward_std": 0.29143065214157104, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.96875, "step": 106 }, { "completion_length": 97.765625, "epoch": 0.11122661122661123, "grad_norm": 2.5106113978668647, "kl": 0.328125, "learning_rate": 8.887733887733887e-07, "loss": 0.0131, "reward": 1.203125, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.984375, "step": 107 }, { "completion_length": 101.671875, "epoch": 0.11226611226611227, "grad_norm": 3.0331559387148013, "kl": 0.3203125, "learning_rate": 8.877338877338876e-07, "loss": 0.0128, "reward": 1.265625, "reward_std": 0.41185659170150757, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 108 }, { "completion_length": 93.578125, "epoch": 0.11330561330561331, "grad_norm": 3.1364652248856544, "kl": 0.259765625, "learning_rate": 8.866943866943867e-07, "loss": 0.0103, "reward": 1.421875, "reward_std": 0.40961647033691406, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 109 }, { "completion_length": 118.640625, "epoch": 0.11434511434511435, "grad_norm": 2.9289438325934665, "kl": 0.31640625, "learning_rate": 8.856548856548856e-07, "loss": 0.0127, "reward": 1.21875, "reward_std": 0.4003184735774994, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.953125, "step": 110 }, { "completion_length": 92.4375, "epoch": 0.11538461538461539, "grad_norm": 2.913424400224021, "kl": 0.2392578125, "learning_rate": 8.846153846153846e-07, "loss": 0.0096, "reward": 1.3125, "reward_std": 0.42081791162490845, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 111 }, { "completion_length": 109.484375, "epoch": 0.11642411642411643, "grad_norm": 2.6453558070367436, "kl": 0.1962890625, "learning_rate": 8.835758835758835e-07, "loss": 0.0078, "reward": 1.203125, "reward_std": 0.35612428188323975, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9375, "step": 112 }, { "completion_length": 133.328125, "epoch": 0.11746361746361747, "grad_norm": 2.0359227108406595, "kl": 0.328125, "learning_rate": 8.825363825363825e-07, "loss": 0.0131, "reward": 1.28125, "reward_std": 0.4308430552482605, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.90625, "step": 113 }, { "completion_length": 117.25, "epoch": 0.11850311850311851, "grad_norm": 2.2801014402944833, "kl": 0.1298828125, "learning_rate": 8.814968814968816e-07, "loss": 0.0052, "reward": 1.1875, "reward_std": 0.45040658116340637, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9375, "step": 114 }, { "completion_length": 118.46875, "epoch": 0.11954261954261955, "grad_norm": 2.68873136773117, "kl": 0.375, "learning_rate": 8.804573804573804e-07, "loss": 0.0149, "reward": 1.28125, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.984375, "step": 115 }, { "completion_length": 112.703125, "epoch": 0.12058212058212059, "grad_norm": 3.2430281591556174, "kl": 0.30859375, "learning_rate": 8.794178794178794e-07, "loss": 0.0124, "reward": 1.296875, "reward_std": 0.32407689094543457, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.984375, "step": 116 }, { "completion_length": 110.6875, "epoch": 0.12162162162162163, "grad_norm": 2.806790180793269, "kl": 0.51171875, "learning_rate": 8.783783783783784e-07, "loss": 0.0204, "reward": 1.296875, "reward_std": 0.42553597688674927, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.9375, "step": 117 }, { "completion_length": 118.765625, "epoch": 0.12266112266112267, "grad_norm": 2.96437926272725, "kl": 0.267578125, "learning_rate": 8.773388773388774e-07, "loss": 0.0107, "reward": 1.265625, "reward_std": 0.4520144462585449, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.90625, "step": 118 }, { "completion_length": 88.234375, "epoch": 0.12370062370062371, "grad_norm": 2.7984516884691177, "kl": 0.1748046875, "learning_rate": 8.762993762993762e-07, "loss": 0.007, "reward": 1.34375, "reward_std": 0.3866586983203888, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.984375, "step": 119 }, { "completion_length": 144.875, "epoch": 0.12474012474012475, "grad_norm": 2.5468625594920775, "kl": 0.390625, "learning_rate": 8.752598752598753e-07, "loss": 0.0156, "reward": 1.21875, "reward_std": 0.45673251152038574, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9375, "step": 120 }, { "completion_length": 118.9375, "epoch": 0.1257796257796258, "grad_norm": 2.53545934638772, "kl": 0.23828125, "learning_rate": 8.742203742203742e-07, "loss": 0.0095, "reward": 1.25, "reward_std": 0.43872103095054626, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "step": 121 }, { "completion_length": 107.046875, "epoch": 0.12681912681912683, "grad_norm": 2.7183530219848517, "kl": 0.26171875, "learning_rate": 8.731808731808732e-07, "loss": 0.0105, "reward": 1.28125, "reward_std": 0.3754722774028778, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.984375, "step": 122 }, { "completion_length": 105.09375, "epoch": 0.12785862785862787, "grad_norm": 2.659104494218674, "kl": 0.251953125, "learning_rate": 8.72141372141372e-07, "loss": 0.0101, "reward": 1.171875, "reward_std": 0.28778618574142456, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.953125, "step": 123 }, { "completion_length": 107.875, "epoch": 0.1288981288981289, "grad_norm": 2.6320368990647047, "kl": 0.171875, "learning_rate": 8.711018711018711e-07, "loss": 0.0069, "reward": 1.4375, "reward_std": 0.364027738571167, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.9375, "step": 124 }, { "completion_length": 112.046875, "epoch": 0.12993762993762994, "grad_norm": 2.885160493320566, "kl": 0.306640625, "learning_rate": 8.7006237006237e-07, "loss": 0.0122, "reward": 1.359375, "reward_std": 0.3403330445289612, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.984375, "step": 125 }, { "completion_length": 148.203125, "epoch": 0.13097713097713098, "grad_norm": 2.0619166812231287, "kl": 0.333984375, "learning_rate": 8.690228690228691e-07, "loss": 0.0134, "reward": 1.09375, "reward_std": 0.39886873960494995, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.890625, "step": 126 }, { "completion_length": 131.0, "epoch": 0.13201663201663202, "grad_norm": 2.460534979254211, "kl": 0.33984375, "learning_rate": 8.679833679833679e-07, "loss": 0.0135, "reward": 1.265625, "reward_std": 0.46608567237854004, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.9375, "step": 127 }, { "completion_length": 105.8125, "epoch": 0.13305613305613306, "grad_norm": 2.6874393518215984, "kl": 0.474609375, "learning_rate": 8.669438669438669e-07, "loss": 0.019, "reward": 1.40625, "reward_std": 0.4092700183391571, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 128 }, { "completion_length": 94.09375, "epoch": 0.1340956340956341, "grad_norm": 2.571250750757306, "kl": 0.359375, "learning_rate": 8.659043659043659e-07, "loss": 0.0144, "reward": 1.421875, "reward_std": 0.2472364604473114, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 129 }, { "completion_length": 101.40625, "epoch": 0.13513513513513514, "grad_norm": 2.8469987361121496, "kl": 0.2255859375, "learning_rate": 8.648648648648649e-07, "loss": 0.009, "reward": 1.359375, "reward_std": 0.44451263546943665, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.96875, "step": 130 }, { "completion_length": 94.765625, "epoch": 0.13617463617463618, "grad_norm": 2.034649025246861, "kl": 0.193359375, "learning_rate": 8.638253638253637e-07, "loss": 0.0077, "reward": 1.109375, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "step": 131 }, { "completion_length": 119.859375, "epoch": 0.13721413721413722, "grad_norm": 2.2929245219262366, "kl": 0.369140625, "learning_rate": 8.627858627858628e-07, "loss": 0.0147, "reward": 1.203125, "reward_std": 0.2109457403421402, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.984375, "step": 132 }, { "completion_length": 104.171875, "epoch": 0.13825363825363826, "grad_norm": 3.802832244151579, "kl": 0.1357421875, "learning_rate": 8.617463617463617e-07, "loss": 0.0054, "reward": 1.203125, "reward_std": 0.3352486491203308, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9375, "step": 133 }, { "completion_length": 87.109375, "epoch": 0.1392931392931393, "grad_norm": 3.2186010406252565, "kl": 0.26171875, "learning_rate": 8.607068607068607e-07, "loss": 0.0105, "reward": 1.3125, "reward_std": 0.3743738532066345, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.984375, "step": 134 }, { "completion_length": 94.265625, "epoch": 0.14033264033264034, "grad_norm": 3.1252137378170723, "kl": 0.310546875, "learning_rate": 8.596673596673595e-07, "loss": 0.0124, "reward": 1.578125, "reward_std": 0.35612428188323975, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 135 }, { "completion_length": 92.578125, "epoch": 0.14137214137214138, "grad_norm": 3.1680567591207196, "kl": 0.271484375, "learning_rate": 8.586278586278586e-07, "loss": 0.0109, "reward": 1.453125, "reward_std": 0.4240131676197052, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 136 }, { "completion_length": 99.75, "epoch": 0.14241164241164242, "grad_norm": 2.415440709916101, "kl": 0.205078125, "learning_rate": 8.575883575883575e-07, "loss": 0.0082, "reward": 1.5, "reward_std": 0.3697938919067383, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 137 }, { "completion_length": 83.421875, "epoch": 0.14345114345114346, "grad_norm": 2.4888389212854194, "kl": 0.291015625, "learning_rate": 8.565488565488566e-07, "loss": 0.0116, "reward": 1.265625, "reward_std": 0.2688094973564148, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 138 }, { "completion_length": 88.96875, "epoch": 0.1444906444906445, "grad_norm": 3.419749430648012, "kl": 0.546875, "learning_rate": 8.555093555093555e-07, "loss": 0.0218, "reward": 1.25, "reward_std": 0.28883427381515503, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "step": 139 }, { "completion_length": 100.578125, "epoch": 0.14553014553014554, "grad_norm": 2.438206499063026, "kl": 0.2373046875, "learning_rate": 8.544698544698544e-07, "loss": 0.0095, "reward": 1.171875, "reward_std": 0.32878512144088745, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.96875, "step": 140 }, { "completion_length": 101.125, "epoch": 0.14656964656964658, "grad_norm": 2.7873808528874418, "kl": 0.291015625, "learning_rate": 8.534303534303535e-07, "loss": 0.0116, "reward": 1.375, "reward_std": 0.42081791162490845, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.984375, "step": 141 }, { "completion_length": 95.375, "epoch": 0.14760914760914762, "grad_norm": 2.4423008379047584, "kl": 0.1884765625, "learning_rate": 8.523908523908524e-07, "loss": 0.0075, "reward": 1.25, "reward_std": 0.25513994693756104, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "step": 142 }, { "completion_length": 87.328125, "epoch": 0.14864864864864866, "grad_norm": 2.1938349211084702, "kl": 0.25390625, "learning_rate": 8.513513513513513e-07, "loss": 0.0102, "reward": 1.453125, "reward_std": 0.3098084330558777, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 143 }, { "completion_length": 112.09375, "epoch": 0.1496881496881497, "grad_norm": 2.3003757459008436, "kl": 0.36328125, "learning_rate": 8.503118503118503e-07, "loss": 0.0146, "reward": 1.421875, "reward_std": 0.37298911809921265, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 144 }, { "completion_length": 88.4375, "epoch": 0.15072765072765074, "grad_norm": 1.7447748646232581, "kl": 0.294921875, "learning_rate": 8.492723492723493e-07, "loss": 0.0118, "reward": 1.421875, "reward_std": 0.2468603253364563, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 145 }, { "completion_length": 101.828125, "epoch": 0.15176715176715178, "grad_norm": 3.0543860139029158, "kl": 0.51171875, "learning_rate": 8.482328482328482e-07, "loss": 0.0204, "reward": 1.359375, "reward_std": 0.38664889335632324, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.984375, "step": 146 }, { "completion_length": 93.984375, "epoch": 0.15280665280665282, "grad_norm": 3.6199305167907596, "kl": 0.546875, "learning_rate": 8.471933471933472e-07, "loss": 0.0219, "reward": 1.234375, "reward_std": 0.35141605138778687, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.96875, "step": 147 }, { "completion_length": 94.765625, "epoch": 0.15384615384615385, "grad_norm": 2.557052235383183, "kl": 0.185546875, "learning_rate": 8.461538461538461e-07, "loss": 0.0074, "reward": 1.5625, "reward_std": 0.35824596881866455, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 148 }, { "completion_length": 92.71875, "epoch": 0.1548856548856549, "grad_norm": 2.945697348311385, "kl": 0.3984375, "learning_rate": 8.451143451143451e-07, "loss": 0.0159, "reward": 1.046875, "reward_std": 0.24831004440784454, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "step": 149 }, { "completion_length": 100.390625, "epoch": 0.15592515592515593, "grad_norm": 3.2023590511316407, "kl": 0.384765625, "learning_rate": 8.44074844074844e-07, "loss": 0.0154, "reward": 1.375, "reward_std": 0.3808925747871399, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.953125, "step": 150 }, { "completion_length": 117.4375, "epoch": 0.15696465696465697, "grad_norm": 2.1888605217557875, "kl": 0.279296875, "learning_rate": 8.43035343035343e-07, "loss": 0.0112, "reward": 1.375, "reward_std": 0.41573357582092285, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.96875, "step": 151 }, { "completion_length": 106.421875, "epoch": 0.158004158004158, "grad_norm": 2.8067233505725784, "kl": 0.2216796875, "learning_rate": 8.419958419958419e-07, "loss": 0.0089, "reward": 1.3125, "reward_std": 0.30038219690322876, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.96875, "step": 152 }, { "completion_length": 132.1875, "epoch": 0.15904365904365905, "grad_norm": 2.4725963476032122, "kl": 0.2421875, "learning_rate": 8.40956340956341e-07, "loss": 0.0097, "reward": 1.359375, "reward_std": 0.43657946586608887, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.9375, "step": 153 }, { "completion_length": 112.484375, "epoch": 0.1600831600831601, "grad_norm": 2.30649511642921, "kl": 0.267578125, "learning_rate": 8.399168399168399e-07, "loss": 0.0107, "reward": 1.046875, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.953125, "step": 154 }, { "completion_length": 84.375, "epoch": 0.16112266112266113, "grad_norm": 3.461718706555044, "kl": 0.1943359375, "learning_rate": 8.388773388773388e-07, "loss": 0.0078, "reward": 1.40625, "reward_std": 0.35247981548309326, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 155 }, { "completion_length": 92.1875, "epoch": 0.16216216216216217, "grad_norm": 5.241226536569336, "kl": 0.17578125, "learning_rate": 8.378378378378377e-07, "loss": 0.007, "reward": 1.5, "reward_std": 0.3039487302303314, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 156 }, { "completion_length": 109.859375, "epoch": 0.1632016632016632, "grad_norm": 1.7036834744062095, "kl": 0.33984375, "learning_rate": 8.367983367983368e-07, "loss": 0.0136, "reward": 1.34375, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.984375, "step": 157 }, { "completion_length": 100.5, "epoch": 0.16424116424116425, "grad_norm": 3.075935365211901, "kl": 0.298828125, "learning_rate": 8.357588357588357e-07, "loss": 0.012, "reward": 1.25, "reward_std": 0.3668213486671448, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "step": 158 }, { "completion_length": 93.421875, "epoch": 0.1652806652806653, "grad_norm": 2.4426663598286154, "kl": 0.1640625, "learning_rate": 8.347193347193346e-07, "loss": 0.0066, "reward": 1.265625, "reward_std": 0.3708576261997223, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.96875, "step": 159 }, { "completion_length": 94.65625, "epoch": 0.16632016632016633, "grad_norm": 3.311526941922045, "kl": 0.388671875, "learning_rate": 8.336798336798336e-07, "loss": 0.0155, "reward": 1.328125, "reward_std": 0.4325885772705078, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.96875, "step": 160 }, { "completion_length": 103.359375, "epoch": 0.16735966735966737, "grad_norm": 2.6908773955352663, "kl": 0.43359375, "learning_rate": 8.326403326403326e-07, "loss": 0.0174, "reward": 1.296875, "reward_std": 0.308285653591156, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.953125, "step": 161 }, { "completion_length": 106.828125, "epoch": 0.1683991683991684, "grad_norm": 3.415635957440033, "kl": 0.265625, "learning_rate": 8.316008316008317e-07, "loss": 0.0106, "reward": 1.265625, "reward_std": 0.4508727788925171, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.953125, "step": 162 }, { "completion_length": 106.640625, "epoch": 0.16943866943866945, "grad_norm": 1.9735358477846445, "kl": 0.353515625, "learning_rate": 8.305613305613305e-07, "loss": 0.0142, "reward": 1.296875, "reward_std": 0.36185091733932495, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.96875, "step": 163 }, { "completion_length": 93.953125, "epoch": 0.1704781704781705, "grad_norm": 3.1820923879405183, "kl": 0.21875, "learning_rate": 8.295218295218295e-07, "loss": 0.0088, "reward": 1.46875, "reward_std": 0.41610968112945557, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.984375, "step": 164 }, { "completion_length": 93.171875, "epoch": 0.17151767151767153, "grad_norm": 8.650291957858007, "kl": 0.361328125, "learning_rate": 8.284823284823285e-07, "loss": 0.0145, "reward": 1.25, "reward_std": 0.26409146189689636, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "step": 165 }, { "completion_length": 103.25, "epoch": 0.17255717255717257, "grad_norm": 2.5426066792660995, "kl": 0.39453125, "learning_rate": 8.274428274428275e-07, "loss": 0.0158, "reward": 1.21875, "reward_std": 0.3593195080757141, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "step": 166 }, { "completion_length": 111.859375, "epoch": 0.1735966735966736, "grad_norm": 2.343882723244004, "kl": 0.275390625, "learning_rate": 8.264033264033263e-07, "loss": 0.011, "reward": 1.421875, "reward_std": 0.3403330445289612, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 167 }, { "completion_length": 89.875, "epoch": 0.17463617463617465, "grad_norm": 1.9703908284642608, "kl": 0.3046875, "learning_rate": 8.253638253638254e-07, "loss": 0.0122, "reward": 1.328125, "reward_std": 0.32878512144088745, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.96875, "step": 168 }, { "completion_length": 101.484375, "epoch": 0.17567567567567569, "grad_norm": 2.4911339554644765, "kl": 0.30078125, "learning_rate": 8.243243243243243e-07, "loss": 0.0121, "reward": 1.421875, "reward_std": 0.23568853735923767, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 169 }, { "completion_length": 115.453125, "epoch": 0.17671517671517672, "grad_norm": 4.0006827326090155, "kl": 0.314453125, "learning_rate": 8.232848232848233e-07, "loss": 0.0126, "reward": 1.125, "reward_std": 0.34352827072143555, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.921875, "step": 170 }, { "completion_length": 110.515625, "epoch": 0.17775467775467776, "grad_norm": 2.2131801659247556, "kl": 0.2265625, "learning_rate": 8.222453222453221e-07, "loss": 0.0091, "reward": 1.25, "reward_std": 0.3678949177265167, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9375, "step": 171 }, { "completion_length": 74.75, "epoch": 0.1787941787941788, "grad_norm": 3.381995670114101, "kl": 0.28515625, "learning_rate": 8.212058212058212e-07, "loss": 0.0114, "reward": 1.5, "reward_std": 0.42506125569343567, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 172 }, { "completion_length": 118.8125, "epoch": 0.17983367983367984, "grad_norm": 2.439404317033616, "kl": 0.314453125, "learning_rate": 8.201663201663201e-07, "loss": 0.0126, "reward": 1.21875, "reward_std": 0.40456175804138184, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.953125, "step": 173 }, { "completion_length": 106.5625, "epoch": 0.18087318087318088, "grad_norm": 2.463923446684379, "kl": 0.349609375, "learning_rate": 8.191268191268192e-07, "loss": 0.014, "reward": 1.25, "reward_std": 0.34929442405700684, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "step": 174 }, { "completion_length": 108.234375, "epoch": 0.18191268191268192, "grad_norm": 2.6025268522312355, "kl": 0.255859375, "learning_rate": 8.18087318087318e-07, "loss": 0.0103, "reward": 1.25, "reward_std": 0.3913668990135193, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "step": 175 }, { "completion_length": 109.53125, "epoch": 0.18295218295218296, "grad_norm": 1.9370133222813268, "kl": 0.23828125, "learning_rate": 8.17047817047817e-07, "loss": 0.0095, "reward": 1.34375, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 176 }, { "completion_length": 109.015625, "epoch": 0.183991683991684, "grad_norm": 1.7186396811072, "kl": 0.3671875, "learning_rate": 8.16008316008316e-07, "loss": 0.0147, "reward": 1.34375, "reward_std": 0.29143065214157104, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.984375, "step": 177 }, { "completion_length": 99.21875, "epoch": 0.18503118503118504, "grad_norm": 2.860353086745535, "kl": 0.26171875, "learning_rate": 8.14968814968815e-07, "loss": 0.0105, "reward": 1.453125, "reward_std": 0.463489294052124, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 178 }, { "completion_length": 122.046875, "epoch": 0.18607068607068608, "grad_norm": 1.8937401710666846, "kl": 0.263671875, "learning_rate": 8.139293139293138e-07, "loss": 0.0106, "reward": 1.078125, "reward_std": 0.2597545385360718, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "step": 179 }, { "completion_length": 103.34375, "epoch": 0.18711018711018712, "grad_norm": 3.3739057606811023, "kl": 0.349609375, "learning_rate": 8.128898128898129e-07, "loss": 0.014, "reward": 1.390625, "reward_std": 0.4908284544944763, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.9375, "step": 180 }, { "completion_length": 88.3125, "epoch": 0.18814968814968816, "grad_norm": 2.5083496285458007, "kl": 0.2470703125, "learning_rate": 8.118503118503118e-07, "loss": 0.0099, "reward": 1.328125, "reward_std": 0.2777610719203949, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.984375, "step": 181 }, { "completion_length": 114.140625, "epoch": 0.1891891891891892, "grad_norm": 2.2289224380222654, "kl": 0.263671875, "learning_rate": 8.108108108108108e-07, "loss": 0.0105, "reward": 1.421875, "reward_std": 0.363300621509552, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 182 }, { "completion_length": 116.765625, "epoch": 0.19022869022869024, "grad_norm": 1.7727286482565525, "kl": 0.1640625, "learning_rate": 8.097713097713096e-07, "loss": 0.0065, "reward": 1.28125, "reward_std": 0.32805800437927246, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.96875, "step": 183 }, { "completion_length": 91.78125, "epoch": 0.19126819126819128, "grad_norm": 3.481159364974563, "kl": 0.259765625, "learning_rate": 8.087318087318087e-07, "loss": 0.0104, "reward": 1.390625, "reward_std": 0.44484928250312805, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.953125, "step": 184 }, { "completion_length": 102.375, "epoch": 0.19230769230769232, "grad_norm": 2.506758348081539, "kl": 0.2158203125, "learning_rate": 8.076923076923077e-07, "loss": 0.0086, "reward": 1.390625, "reward_std": 0.4341113567352295, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 185 }, { "completion_length": 85.84375, "epoch": 0.19334719334719336, "grad_norm": 4.63275407448868, "kl": 0.2451171875, "learning_rate": 8.066528066528067e-07, "loss": 0.0098, "reward": 1.203125, "reward_std": 0.28460076451301575, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.96875, "step": 186 }, { "completion_length": 102.234375, "epoch": 0.1943866943866944, "grad_norm": 2.9791046563409385, "kl": 0.2177734375, "learning_rate": 8.056133056133056e-07, "loss": 0.0087, "reward": 1.203125, "reward_std": 0.2993341088294983, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.984375, "step": 187 }, { "completion_length": 105.578125, "epoch": 0.19542619542619544, "grad_norm": 2.451834076733131, "kl": 0.13671875, "learning_rate": 8.045738045738045e-07, "loss": 0.0055, "reward": 1.28125, "reward_std": 0.4552096724510193, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.953125, "step": 188 }, { "completion_length": 101.34375, "epoch": 0.19646569646569648, "grad_norm": 2.127343534437706, "kl": 0.30078125, "learning_rate": 8.035343035343036e-07, "loss": 0.012, "reward": 1.484375, "reward_std": 0.34706932306289673, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.96875, "step": 189 }, { "completion_length": 106.421875, "epoch": 0.19750519750519752, "grad_norm": 4.164479003705187, "kl": 0.193359375, "learning_rate": 8.024948024948025e-07, "loss": 0.0077, "reward": 1.25, "reward_std": 0.37981897592544556, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "step": 190 }, { "completion_length": 100.46875, "epoch": 0.19854469854469856, "grad_norm": 2.6178864303174323, "kl": 0.2109375, "learning_rate": 8.014553014553014e-07, "loss": 0.0084, "reward": 1.484375, "reward_std": 0.3571978509426117, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.96875, "step": 191 }, { "completion_length": 93.453125, "epoch": 0.1995841995841996, "grad_norm": 2.594758254777971, "kl": 0.3515625, "learning_rate": 8.004158004158003e-07, "loss": 0.0141, "reward": 1.375, "reward_std": 0.3291315734386444, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.953125, "step": 192 }, { "completion_length": 78.921875, "epoch": 0.20062370062370063, "grad_norm": 4.413288481561682, "kl": 0.23046875, "learning_rate": 7.993762993762994e-07, "loss": 0.0092, "reward": 1.53125, "reward_std": 0.2756394147872925, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 193 }, { "completion_length": 105.765625, "epoch": 0.20166320166320167, "grad_norm": 1.9718222001034817, "kl": 0.205078125, "learning_rate": 7.983367983367983e-07, "loss": 0.0082, "reward": 1.234375, "reward_std": 0.32407689094543457, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.96875, "step": 194 }, { "completion_length": 85.921875, "epoch": 0.20270270270270271, "grad_norm": 3.3388619606365357, "kl": 0.353515625, "learning_rate": 7.972972972972972e-07, "loss": 0.0141, "reward": 1.390625, "reward_std": 0.24831002950668335, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 195 }, { "completion_length": 91.28125, "epoch": 0.20374220374220375, "grad_norm": 2.038320663756555, "kl": 0.3984375, "learning_rate": 7.962577962577962e-07, "loss": 0.0159, "reward": 1.390625, "reward_std": 0.23925508558750153, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 196 }, { "completion_length": 87.28125, "epoch": 0.2047817047817048, "grad_norm": 2.6028987063506395, "kl": 0.1806640625, "learning_rate": 7.952182952182952e-07, "loss": 0.0072, "reward": 1.46875, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 197 }, { "completion_length": 96.734375, "epoch": 0.20582120582120583, "grad_norm": 2.389369599386934, "kl": 0.271484375, "learning_rate": 7.941787941787942e-07, "loss": 0.0108, "reward": 1.21875, "reward_std": 0.2756394147872925, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.953125, "step": 198 }, { "completion_length": 99.9375, "epoch": 0.20686070686070687, "grad_norm": 2.062412524943575, "kl": 0.201171875, "learning_rate": 7.931392931392931e-07, "loss": 0.0081, "reward": 1.171875, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9375, "step": 199 }, { "completion_length": 100.203125, "epoch": 0.2079002079002079, "grad_norm": 2.4940427139773393, "kl": 0.26171875, "learning_rate": 7.92099792099792e-07, "loss": 0.0105, "reward": 1.34375, "reward_std": 0.35400262475013733, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.984375, "step": 200 }, { "completion_length": 85.953125, "epoch": 0.20893970893970895, "grad_norm": 2.015357865283533, "kl": 0.22265625, "learning_rate": 7.91060291060291e-07, "loss": 0.0089, "reward": 1.21875, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.984375, "step": 201 }, { "completion_length": 118.453125, "epoch": 0.20997920997921, "grad_norm": 2.1290558113911837, "kl": 0.26953125, "learning_rate": 7.9002079002079e-07, "loss": 0.0107, "reward": 1.25, "reward_std": 0.37405285239219666, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "step": 202 }, { "completion_length": 109.96875, "epoch": 0.21101871101871103, "grad_norm": 2.688679228008769, "kl": 0.201171875, "learning_rate": 7.889812889812889e-07, "loss": 0.0081, "reward": 1.28125, "reward_std": 0.44663429260253906, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.96875, "step": 203 }, { "completion_length": 108.53125, "epoch": 0.21205821205821207, "grad_norm": 2.2084088317237964, "kl": 0.33984375, "learning_rate": 7.879417879417878e-07, "loss": 0.0135, "reward": 1.296875, "reward_std": 0.36469969153404236, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.921875, "step": 204 }, { "completion_length": 109.234375, "epoch": 0.2130977130977131, "grad_norm": 2.490585523868161, "kl": 0.2431640625, "learning_rate": 7.869022869022869e-07, "loss": 0.0097, "reward": 1.5, "reward_std": 0.32666346430778503, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 205 }, { "completion_length": 98.765625, "epoch": 0.21413721413721415, "grad_norm": 13.141457080612374, "kl": 0.875, "learning_rate": 7.858627858627859e-07, "loss": 0.035, "reward": 1.484375, "reward_std": 0.4650121033191681, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 206 }, { "completion_length": 103.53125, "epoch": 0.2151767151767152, "grad_norm": 2.6834346241779707, "kl": 0.212890625, "learning_rate": 7.848232848232847e-07, "loss": 0.0085, "reward": 1.5, "reward_std": 0.47359731793403625, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 207 }, { "completion_length": 124.203125, "epoch": 0.21621621621621623, "grad_norm": 2.479989052431702, "kl": 0.1904296875, "learning_rate": 7.837837837837838e-07, "loss": 0.0076, "reward": 1.359375, "reward_std": 0.3845370411872864, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.984375, "step": 208 }, { "completion_length": 104.5625, "epoch": 0.21725571725571727, "grad_norm": 2.6463156472279636, "kl": 0.095703125, "learning_rate": 7.827442827442827e-07, "loss": 0.0038, "reward": 1.375, "reward_std": 0.37981897592544556, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 209 }, { "completion_length": 122.6875, "epoch": 0.2182952182952183, "grad_norm": 2.3928355609338743, "kl": 0.1455078125, "learning_rate": 7.817047817047818e-07, "loss": 0.0058, "reward": 1.328125, "reward_std": 0.30617380142211914, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.953125, "step": 210 }, { "completion_length": 123.03125, "epoch": 0.21933471933471935, "grad_norm": 3.319099111148213, "kl": 0.365234375, "learning_rate": 7.806652806652806e-07, "loss": 0.0146, "reward": 1.171875, "reward_std": 0.46850311756134033, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.90625, "step": 211 }, { "completion_length": 138.140625, "epoch": 0.2203742203742204, "grad_norm": 2.1203357894697414, "kl": 0.3203125, "learning_rate": 7.796257796257796e-07, "loss": 0.0128, "reward": 1.15625, "reward_std": 0.5000613927841187, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.921875, "step": 212 }, { "completion_length": 149.25, "epoch": 0.22141372141372143, "grad_norm": 1.764594728225052, "kl": 0.154296875, "learning_rate": 7.785862785862785e-07, "loss": 0.0062, "reward": 0.96875, "reward_std": 0.3424547016620636, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.859375, "step": 213 }, { "completion_length": 115.8125, "epoch": 0.22245322245322247, "grad_norm": 2.492957712507643, "kl": 0.255859375, "learning_rate": 7.775467775467776e-07, "loss": 0.0103, "reward": 1.46875, "reward_std": 0.4003184735774994, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 214 }, { "completion_length": 139.8125, "epoch": 0.2234927234927235, "grad_norm": 1.9416412568936112, "kl": 0.1689453125, "learning_rate": 7.765072765072764e-07, "loss": 0.0068, "reward": 1.28125, "reward_std": 0.5075247287750244, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.84375, "step": 215 }, { "completion_length": 132.015625, "epoch": 0.22453222453222454, "grad_norm": 2.5560980324956435, "kl": 0.2197265625, "learning_rate": 7.754677754677755e-07, "loss": 0.0088, "reward": 1.1875, "reward_std": 0.30371320247650146, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.921875, "step": 216 }, { "completion_length": 109.078125, "epoch": 0.22557172557172558, "grad_norm": 2.151673966108665, "kl": 0.1982421875, "learning_rate": 7.744282744282744e-07, "loss": 0.0079, "reward": 1.34375, "reward_std": 0.29143062233924866, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.984375, "step": 217 }, { "completion_length": 136.359375, "epoch": 0.22661122661122662, "grad_norm": 2.29435772468969, "kl": 0.15625, "learning_rate": 7.733887733887734e-07, "loss": 0.0063, "reward": 1.265625, "reward_std": 0.530933678150177, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.890625, "step": 218 }, { "completion_length": 134.296875, "epoch": 0.22765072765072766, "grad_norm": 2.717345439053142, "kl": 0.146484375, "learning_rate": 7.723492723492722e-07, "loss": 0.0058, "reward": 1.515625, "reward_std": 0.5091127157211304, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.96875, "step": 219 }, { "completion_length": 131.6875, "epoch": 0.2286902286902287, "grad_norm": 1.8519220333623425, "kl": 0.2236328125, "learning_rate": 7.713097713097713e-07, "loss": 0.0089, "reward": 1.203125, "reward_std": 0.37055522203445435, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9375, "step": 220 }, { "completion_length": 133.53125, "epoch": 0.22972972972972974, "grad_norm": 1.9504470687641822, "kl": 0.19921875, "learning_rate": 7.702702702702702e-07, "loss": 0.008, "reward": 1.234375, "reward_std": 0.28460076451301575, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.890625, "step": 221 }, { "completion_length": 130.09375, "epoch": 0.23076923076923078, "grad_norm": 2.3563437022672025, "kl": 0.1318359375, "learning_rate": 7.692307692307693e-07, "loss": 0.0053, "reward": 1.296875, "reward_std": 0.39995211362838745, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.9375, "step": 222 }, { "completion_length": 112.0625, "epoch": 0.23180873180873182, "grad_norm": 2.366963325999118, "kl": 0.19140625, "learning_rate": 7.681912681912681e-07, "loss": 0.0077, "reward": 1.390625, "reward_std": 0.4420618414878845, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.9375, "step": 223 }, { "completion_length": 128.265625, "epoch": 0.23284823284823286, "grad_norm": 2.89936642724053, "kl": 0.2197265625, "learning_rate": 7.671517671517671e-07, "loss": 0.0088, "reward": 1.390625, "reward_std": 0.5027130246162415, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.90625, "step": 224 }, { "completion_length": 127.3125, "epoch": 0.2338877338877339, "grad_norm": 2.403345736619868, "kl": 0.1591796875, "learning_rate": 7.66112266112266e-07, "loss": 0.0064, "reward": 1.40625, "reward_std": 0.49022960662841797, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.875, "step": 225 }, { "completion_length": 124.296875, "epoch": 0.23492723492723494, "grad_norm": 2.9889182131513126, "kl": 0.22265625, "learning_rate": 7.650727650727651e-07, "loss": 0.0089, "reward": 1.390625, "reward_std": 0.36507582664489746, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.9375, "step": 226 }, { "completion_length": 111.203125, "epoch": 0.23596673596673598, "grad_norm": 2.1803630110880157, "kl": 0.1767578125, "learning_rate": 7.640332640332639e-07, "loss": 0.0071, "reward": 1.359375, "reward_std": 0.3481428921222687, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.96875, "step": 227 }, { "completion_length": 124.03125, "epoch": 0.23700623700623702, "grad_norm": 2.1178274135281154, "kl": 0.2197265625, "learning_rate": 7.62993762993763e-07, "loss": 0.0088, "reward": 1.53125, "reward_std": 0.4846656918525696, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.90625, "step": 228 }, { "completion_length": 127.875, "epoch": 0.23804573804573806, "grad_norm": 2.346136680880863, "kl": 0.2294921875, "learning_rate": 7.61954261954262e-07, "loss": 0.0092, "reward": 1.125, "reward_std": 0.3650461435317993, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.890625, "step": 229 }, { "completion_length": 111.734375, "epoch": 0.2390852390852391, "grad_norm": 3.1254556385587717, "kl": 0.19140625, "learning_rate": 7.609147609147609e-07, "loss": 0.0077, "reward": 1.5625, "reward_std": 0.4366091787815094, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.96875, "step": 230 }, { "completion_length": 96.890625, "epoch": 0.24012474012474014, "grad_norm": 2.724553391463907, "kl": 0.26171875, "learning_rate": 7.598752598752599e-07, "loss": 0.0105, "reward": 1.296875, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.953125, "step": 231 }, { "completion_length": 101.5, "epoch": 0.24116424116424118, "grad_norm": 3.1063175484346854, "kl": 0.255859375, "learning_rate": 7.588357588357588e-07, "loss": 0.0102, "reward": 1.484375, "reward_std": 0.4071483612060547, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.953125, "step": 232 }, { "completion_length": 107.578125, "epoch": 0.24220374220374222, "grad_norm": 1.3010919951427018, "kl": 0.453125, "learning_rate": 7.577962577962578e-07, "loss": 0.0181, "reward": 1.09375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9375, "step": 233 }, { "completion_length": 129.015625, "epoch": 0.24324324324324326, "grad_norm": 2.2650566790016535, "kl": 0.2041015625, "learning_rate": 7.567567567567568e-07, "loss": 0.0082, "reward": 1.203125, "reward_std": 0.2867125868797302, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 1.0, "step": 234 }, { "completion_length": 93.796875, "epoch": 0.2442827442827443, "grad_norm": 3.439341173936557, "kl": 0.265625, "learning_rate": 7.557172557172557e-07, "loss": 0.0106, "reward": 1.5625, "reward_std": 0.37981900572776794, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.96875, "step": 235 }, { "completion_length": 92.625, "epoch": 0.24532224532224534, "grad_norm": 2.526643849221635, "kl": 0.28125, "learning_rate": 7.546777546777546e-07, "loss": 0.0112, "reward": 1.1875, "reward_std": 0.3319803476333618, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9375, "step": 236 }, { "completion_length": 114.53125, "epoch": 0.24636174636174638, "grad_norm": 2.415414400936711, "kl": 0.2099609375, "learning_rate": 7.536382536382537e-07, "loss": 0.0084, "reward": 1.515625, "reward_std": 0.4071483612060547, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.953125, "step": 237 }, { "completion_length": 113.1875, "epoch": 0.24740124740124741, "grad_norm": 2.72039835648138, "kl": 0.177734375, "learning_rate": 7.525987525987526e-07, "loss": 0.0071, "reward": 1.296875, "reward_std": 0.4329647123813629, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.953125, "step": 238 }, { "completion_length": 94.296875, "epoch": 0.24844074844074845, "grad_norm": 2.8033871389666896, "kl": 0.359375, "learning_rate": 7.515592515592515e-07, "loss": 0.0144, "reward": 1.328125, "reward_std": 0.5321329832077026, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.890625, "step": 239 }, { "completion_length": 94.359375, "epoch": 0.2494802494802495, "grad_norm": 2.842754154750858, "kl": 0.177734375, "learning_rate": 7.505197505197504e-07, "loss": 0.0071, "reward": 1.34375, "reward_std": 0.2756394147872925, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 240 }, { "completion_length": 109.25, "epoch": 0.2505197505197505, "grad_norm": 1.879708141108578, "kl": 0.265625, "learning_rate": 7.494802494802495e-07, "loss": 0.0106, "reward": 0.984375, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "step": 241 }, { "completion_length": 99.9375, "epoch": 0.2515592515592516, "grad_norm": 2.204281131329277, "kl": 0.2109375, "learning_rate": 7.484407484407484e-07, "loss": 0.0084, "reward": 1.21875, "reward_std": 0.2709311842918396, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.984375, "step": 242 }, { "completion_length": 101.296875, "epoch": 0.2525987525987526, "grad_norm": 2.541043388402867, "kl": 0.123046875, "learning_rate": 7.474012474012473e-07, "loss": 0.0049, "reward": 1.46875, "reward_std": 0.400318443775177, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.9375, "step": 243 }, { "completion_length": 103.984375, "epoch": 0.25363825363825365, "grad_norm": 2.5100542315827603, "kl": 0.337890625, "learning_rate": 7.463617463617463e-07, "loss": 0.0135, "reward": 1.46875, "reward_std": 0.31300368905067444, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.9375, "step": 244 }, { "completion_length": 102.796875, "epoch": 0.25467775467775466, "grad_norm": 2.6750847404186526, "kl": 0.2119140625, "learning_rate": 7.453222453222453e-07, "loss": 0.0085, "reward": 1.28125, "reward_std": 0.38925278186798096, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.90625, "step": 245 }, { "completion_length": 81.1875, "epoch": 0.25571725571725573, "grad_norm": 2.628256692264164, "kl": 0.1689453125, "learning_rate": 7.442827442827442e-07, "loss": 0.0067, "reward": 1.359375, "reward_std": 0.2867125868797302, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.984375, "step": 246 }, { "completion_length": 95.609375, "epoch": 0.25675675675675674, "grad_norm": 3.050770535954829, "kl": 0.259765625, "learning_rate": 7.432432432432432e-07, "loss": 0.0104, "reward": 1.25, "reward_std": 0.5536434650421143, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.890625, "step": 247 }, { "completion_length": 109.921875, "epoch": 0.2577962577962578, "grad_norm": 2.904431233133768, "kl": 0.2451171875, "learning_rate": 7.422037422037421e-07, "loss": 0.0098, "reward": 1.375, "reward_std": 0.42674916982650757, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.890625, "step": 248 }, { "completion_length": 85.53125, "epoch": 0.2588357588357588, "grad_norm": 2.5577061579653484, "kl": 0.30078125, "learning_rate": 7.411642411642412e-07, "loss": 0.0121, "reward": 1.25, "reward_std": 0.3161734342575073, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "step": 249 }, { "completion_length": 120.40625, "epoch": 0.2598752598752599, "grad_norm": 2.214123961069078, "kl": 0.19921875, "learning_rate": 7.401247401247401e-07, "loss": 0.008, "reward": 1.328125, "reward_std": 0.4764317572116852, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.9375, "step": 250 }, { "completion_length": 101.703125, "epoch": 0.2609147609147609, "grad_norm": 2.5257813914058187, "kl": 0.28125, "learning_rate": 7.39085239085239e-07, "loss": 0.0112, "reward": 1.328125, "reward_std": 0.3352486789226532, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.953125, "step": 251 }, { "completion_length": 108.0, "epoch": 0.26195426195426197, "grad_norm": 3.6853026845098245, "kl": 0.234375, "learning_rate": 7.380457380457379e-07, "loss": 0.0094, "reward": 1.296875, "reward_std": 0.3661494255065918, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.953125, "step": 252 }, { "completion_length": 95.625, "epoch": 0.262993762993763, "grad_norm": 2.6597583633631374, "kl": 0.25390625, "learning_rate": 7.37006237006237e-07, "loss": 0.0101, "reward": 1.421875, "reward_std": 0.3981967866420746, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 253 }, { "completion_length": 107.34375, "epoch": 0.26403326403326405, "grad_norm": 2.2787436287223217, "kl": 0.494140625, "learning_rate": 7.35966735966736e-07, "loss": 0.0197, "reward": 1.296875, "reward_std": 0.33669838309288025, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.953125, "step": 254 }, { "completion_length": 119.828125, "epoch": 0.26507276507276506, "grad_norm": 14.085434648060476, "kl": 0.1767578125, "learning_rate": 7.349272349272348e-07, "loss": 0.0071, "reward": 1.203125, "reward_std": 0.34280115365982056, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9375, "step": 255 }, { "completion_length": 107.921875, "epoch": 0.2661122661122661, "grad_norm": 2.2496434467284963, "kl": 0.349609375, "learning_rate": 7.338877338877339e-07, "loss": 0.014, "reward": 1.328125, "reward_std": 0.30617380142211914, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 1.0, "step": 256 }, { "completion_length": 112.5625, "epoch": 0.26715176715176714, "grad_norm": 2.9459491689372808, "kl": 0.220703125, "learning_rate": 7.328482328482328e-07, "loss": 0.0089, "reward": 1.21875, "reward_std": 0.37944284081459045, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "step": 257 }, { "completion_length": 104.890625, "epoch": 0.2681912681912682, "grad_norm": 3.8247022050660804, "kl": 0.33203125, "learning_rate": 7.318087318087319e-07, "loss": 0.0133, "reward": 1.28125, "reward_std": 0.4581822156906128, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.90625, "step": 258 }, { "completion_length": 82.796875, "epoch": 0.2692307692307692, "grad_norm": 2.207243758759684, "kl": 0.11474609375, "learning_rate": 7.307692307692307e-07, "loss": 0.0046, "reward": 1.5, "reward_std": 0.3671974837779999, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 259 }, { "completion_length": 101.796875, "epoch": 0.2702702702702703, "grad_norm": 2.199587468578523, "kl": 0.2041015625, "learning_rate": 7.297297297297297e-07, "loss": 0.0082, "reward": 1.515625, "reward_std": 0.29826050996780396, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 260 }, { "completion_length": 94.09375, "epoch": 0.2713097713097713, "grad_norm": 2.9001298306387424, "kl": 0.380859375, "learning_rate": 7.286902286902286e-07, "loss": 0.0152, "reward": 1.46875, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 261 }, { "completion_length": 104.25, "epoch": 0.27234927234927236, "grad_norm": 2.31490114378449, "kl": 0.2333984375, "learning_rate": 7.276507276507277e-07, "loss": 0.0094, "reward": 1.390625, "reward_std": 0.35035818815231323, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 262 }, { "completion_length": 90.15625, "epoch": 0.2733887733887734, "grad_norm": 2.5048320759407994, "kl": 0.328125, "learning_rate": 7.266112266112265e-07, "loss": 0.0131, "reward": 1.421875, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 263 }, { "completion_length": 125.265625, "epoch": 0.27442827442827444, "grad_norm": 2.6850432562076034, "kl": 0.3984375, "learning_rate": 7.255717255717255e-07, "loss": 0.016, "reward": 1.28125, "reward_std": 0.408893883228302, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.953125, "step": 264 }, { "completion_length": 105.859375, "epoch": 0.27546777546777546, "grad_norm": 2.179335718695676, "kl": 0.130859375, "learning_rate": 7.245322245322245e-07, "loss": 0.0052, "reward": 1.328125, "reward_std": 0.3548792004585266, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.96875, "step": 265 }, { "completion_length": 106.75, "epoch": 0.2765072765072765, "grad_norm": 2.2156372090069354, "kl": 0.234375, "learning_rate": 7.234927234927235e-07, "loss": 0.0094, "reward": 1.515625, "reward_std": 0.3442002236843109, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 266 }, { "completion_length": 117.46875, "epoch": 0.27754677754677753, "grad_norm": 1.855196986690691, "kl": 0.2451171875, "learning_rate": 7.224532224532223e-07, "loss": 0.0098, "reward": 1.25, "reward_std": 0.2986069917678833, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9375, "step": 267 }, { "completion_length": 94.984375, "epoch": 0.2785862785862786, "grad_norm": 2.160834159824737, "kl": 0.22265625, "learning_rate": 7.214137214137214e-07, "loss": 0.0089, "reward": 1.5, "reward_std": 0.2961388826370239, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 268 }, { "completion_length": 98.359375, "epoch": 0.2796257796257796, "grad_norm": 3.2050747726495215, "kl": 0.2216796875, "learning_rate": 7.203742203742203e-07, "loss": 0.0089, "reward": 1.28125, "reward_std": 0.37722259759902954, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9375, "step": 269 }, { "completion_length": 102.5, "epoch": 0.2806652806652807, "grad_norm": 2.7137349832775355, "kl": 0.140625, "learning_rate": 7.193347193347194e-07, "loss": 0.0056, "reward": 1.5, "reward_std": 0.5224853754043579, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.9375, "step": 270 }, { "completion_length": 103.578125, "epoch": 0.2817047817047817, "grad_norm": 2.5504212913965425, "kl": 0.193359375, "learning_rate": 7.182952182952182e-07, "loss": 0.0077, "reward": 1.609375, "reward_std": 0.43189114332199097, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.96875, "step": 271 }, { "completion_length": 109.140625, "epoch": 0.28274428274428276, "grad_norm": 1.9934498180056277, "kl": 0.1953125, "learning_rate": 7.172557172557172e-07, "loss": 0.0078, "reward": 1.453125, "reward_std": 0.24039676785469055, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.953125, "step": 272 }, { "completion_length": 106.4375, "epoch": 0.28378378378378377, "grad_norm": 2.4352344151350187, "kl": 0.2412109375, "learning_rate": 7.162162162162161e-07, "loss": 0.0097, "reward": 1.4375, "reward_std": 0.4081283211708069, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.953125, "step": 273 }, { "completion_length": 107.1875, "epoch": 0.28482328482328484, "grad_norm": 3.0268433172409734, "kl": 0.2470703125, "learning_rate": 7.151767151767152e-07, "loss": 0.0099, "reward": 1.171875, "reward_std": 0.3877224624156952, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.96875, "step": 274 }, { "completion_length": 112.453125, "epoch": 0.28586278586278585, "grad_norm": 2.4589777380163556, "kl": 0.26953125, "learning_rate": 7.14137214137214e-07, "loss": 0.0107, "reward": 1.234375, "reward_std": 0.38627272844314575, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.953125, "step": 275 }, { "completion_length": 105.734375, "epoch": 0.2869022869022869, "grad_norm": 2.480927014617341, "kl": 0.16015625, "learning_rate": 7.13097713097713e-07, "loss": 0.0064, "reward": 1.59375, "reward_std": 0.4139782190322876, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.96875, "step": 276 }, { "completion_length": 136.609375, "epoch": 0.28794178794178793, "grad_norm": 2.0065515332519213, "kl": 0.154296875, "learning_rate": 7.120582120582121e-07, "loss": 0.0062, "reward": 1.359375, "reward_std": 0.3298586905002594, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.96875, "step": 277 }, { "completion_length": 121.75, "epoch": 0.288981288981289, "grad_norm": 1.8518688275887052, "kl": 0.189453125, "learning_rate": 7.11018711018711e-07, "loss": 0.0076, "reward": 1.140625, "reward_std": 0.3352486789226532, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9375, "step": 278 }, { "completion_length": 129.4375, "epoch": 0.29002079002079, "grad_norm": 1.8230160105928799, "kl": 0.12890625, "learning_rate": 7.0997920997921e-07, "loss": 0.0052, "reward": 1.265625, "reward_std": 0.3949218690395355, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.921875, "step": 279 }, { "completion_length": 97.640625, "epoch": 0.2910602910602911, "grad_norm": 2.9173788612933853, "kl": 0.2197265625, "learning_rate": 7.089397089397089e-07, "loss": 0.0088, "reward": 1.390625, "reward_std": 0.3908922076225281, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.96875, "step": 280 }, { "completion_length": 149.25, "epoch": 0.2920997920997921, "grad_norm": 2.5851136090482636, "kl": 0.208984375, "learning_rate": 7.079002079002079e-07, "loss": 0.0083, "reward": 1.109375, "reward_std": 0.6387466192245483, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.796875, "step": 281 }, { "completion_length": 137.171875, "epoch": 0.29313929313929316, "grad_norm": 2.2395492234661414, "kl": 0.12353515625, "learning_rate": 7.068607068607068e-07, "loss": 0.0049, "reward": 1.234375, "reward_std": 0.5919406414031982, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.828125, "step": 282 }, { "completion_length": 115.5, "epoch": 0.29417879417879417, "grad_norm": 2.793225196680643, "kl": 0.10595703125, "learning_rate": 7.058212058212058e-07, "loss": 0.0042, "reward": 1.515625, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 283 }, { "completion_length": 125.421875, "epoch": 0.29521829521829523, "grad_norm": 2.127426489661907, "kl": 0.25390625, "learning_rate": 7.047817047817047e-07, "loss": 0.0102, "reward": 1.3125, "reward_std": 0.3197399973869324, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.921875, "step": 284 }, { "completion_length": 130.890625, "epoch": 0.29625779625779625, "grad_norm": 2.151123057037707, "kl": 0.17578125, "learning_rate": 7.037422037422038e-07, "loss": 0.007, "reward": 1.40625, "reward_std": 0.42044180631637573, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.984375, "step": 285 }, { "completion_length": 126.0625, "epoch": 0.2972972972972973, "grad_norm": 1.7920670956311737, "kl": 0.3046875, "learning_rate": 7.027027027027027e-07, "loss": 0.0122, "reward": 1.171875, "reward_std": 0.2777610421180725, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.90625, "step": 286 }, { "completion_length": 111.875, "epoch": 0.2983367983367983, "grad_norm": 2.3055748284808986, "kl": 0.1025390625, "learning_rate": 7.016632016632016e-07, "loss": 0.0041, "reward": 1.375, "reward_std": 0.3654222786426544, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.9375, "step": 287 }, { "completion_length": 128.171875, "epoch": 0.2993762993762994, "grad_norm": 2.5775130360637473, "kl": 0.234375, "learning_rate": 7.006237006237005e-07, "loss": 0.0094, "reward": 1.078125, "reward_std": 0.41246524453163147, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.890625, "step": 288 }, { "completion_length": 117.859375, "epoch": 0.3004158004158004, "grad_norm": 2.666070840917175, "kl": 0.1376953125, "learning_rate": 6.995841995841996e-07, "loss": 0.0055, "reward": 1.453125, "reward_std": 0.38664889335632324, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 289 }, { "completion_length": 108.015625, "epoch": 0.30145530145530147, "grad_norm": 2.451996984174324, "kl": 0.1884765625, "learning_rate": 6.985446985446985e-07, "loss": 0.0075, "reward": 1.25, "reward_std": 0.3697938919067383, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "step": 290 }, { "completion_length": 137.421875, "epoch": 0.3024948024948025, "grad_norm": 2.0608355017631617, "kl": 0.2138671875, "learning_rate": 6.975051975051974e-07, "loss": 0.0086, "reward": 1.3125, "reward_std": 0.35262539982795715, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.90625, "step": 291 }, { "completion_length": 120.34375, "epoch": 0.30353430353430355, "grad_norm": 2.586337972351937, "kl": 0.34765625, "learning_rate": 6.964656964656964e-07, "loss": 0.0139, "reward": 1.25, "reward_std": 0.4322375953197479, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.890625, "step": 292 }, { "completion_length": 120.953125, "epoch": 0.30457380457380456, "grad_norm": 4.81457252198039, "kl": 0.4765625, "learning_rate": 6.954261954261954e-07, "loss": 0.019, "reward": 1.21875, "reward_std": 0.34632188081741333, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9375, "step": 293 }, { "completion_length": 109.46875, "epoch": 0.30561330561330563, "grad_norm": 1.8211643339700867, "kl": 0.2177734375, "learning_rate": 6.943866943866943e-07, "loss": 0.0087, "reward": 1.296875, "reward_std": 0.2956691384315491, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.953125, "step": 294 }, { "completion_length": 126.046875, "epoch": 0.30665280665280664, "grad_norm": 1.943909455338696, "kl": 0.11083984375, "learning_rate": 6.933471933471933e-07, "loss": 0.0044, "reward": 1.0, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.921875, "step": 295 }, { "completion_length": 110.46875, "epoch": 0.3076923076923077, "grad_norm": 3.5211483867519933, "kl": 0.55859375, "learning_rate": 6.923076923076922e-07, "loss": 0.0224, "reward": 1.21875, "reward_std": 0.40996742248535156, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.90625, "step": 296 }, { "completion_length": 105.59375, "epoch": 0.3087318087318087, "grad_norm": 2.476135435805383, "kl": 0.220703125, "learning_rate": 6.912681912681912e-07, "loss": 0.0088, "reward": 1.515625, "reward_std": 0.38664889335632324, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.96875, "step": 297 }, { "completion_length": 125.625, "epoch": 0.3097713097713098, "grad_norm": 2.1772871695428497, "kl": 0.388671875, "learning_rate": 6.902286902286903e-07, "loss": 0.0155, "reward": 1.109375, "reward_std": 0.45831799507141113, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.875, "step": 298 }, { "completion_length": 105.96875, "epoch": 0.3108108108108108, "grad_norm": 2.4132129647630878, "kl": 0.205078125, "learning_rate": 6.891891891891891e-07, "loss": 0.0082, "reward": 1.34375, "reward_std": 0.3898440897464752, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.96875, "step": 299 }, { "completion_length": 110.359375, "epoch": 0.31185031185031187, "grad_norm": 2.682268995895765, "kl": 0.1083984375, "learning_rate": 6.881496881496881e-07, "loss": 0.0043, "reward": 1.34375, "reward_std": 0.43768274784088135, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.953125, "step": 300 }, { "completion_length": 110.0625, "epoch": 0.3128898128898129, "grad_norm": 2.9458466604035034, "kl": 0.3046875, "learning_rate": 6.871101871101871e-07, "loss": 0.0122, "reward": 1.46875, "reward_std": 0.5682458877563477, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.921875, "step": 301 }, { "completion_length": 117.75, "epoch": 0.31392931392931395, "grad_norm": 2.248917968335701, "kl": 0.1396484375, "learning_rate": 6.860706860706861e-07, "loss": 0.0056, "reward": 1.25, "reward_std": 0.4088938534259796, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.90625, "step": 302 }, { "completion_length": 115.625, "epoch": 0.31496881496881496, "grad_norm": 4.609128063710832, "kl": 0.3046875, "learning_rate": 6.850311850311849e-07, "loss": 0.0122, "reward": 1.390625, "reward_std": 0.49294528365135193, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.875, "step": 303 }, { "completion_length": 125.8125, "epoch": 0.316008316008316, "grad_norm": 2.084629369217826, "kl": 0.2314453125, "learning_rate": 6.83991683991684e-07, "loss": 0.0093, "reward": 1.296875, "reward_std": 0.40887951850891113, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.921875, "step": 304 }, { "completion_length": 109.875, "epoch": 0.31704781704781704, "grad_norm": 2.6334408798567295, "kl": 0.1748046875, "learning_rate": 6.829521829521829e-07, "loss": 0.007, "reward": 1.40625, "reward_std": 0.4648738503456116, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.875, "step": 305 }, { "completion_length": 109.015625, "epoch": 0.3180873180873181, "grad_norm": 1.927711771720593, "kl": 0.26171875, "learning_rate": 6.81912681912682e-07, "loss": 0.0105, "reward": 1.0625, "reward_std": 0.240030437707901, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "step": 306 }, { "completion_length": 99.703125, "epoch": 0.3191268191268191, "grad_norm": 2.485128814848062, "kl": 0.19140625, "learning_rate": 6.808731808731808e-07, "loss": 0.0077, "reward": 1.15625, "reward_std": 0.29143065214157104, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9375, "step": 307 }, { "completion_length": 108.0625, "epoch": 0.3201663201663202, "grad_norm": 2.0810330221837043, "kl": 0.208984375, "learning_rate": 6.798336798336798e-07, "loss": 0.0083, "reward": 1.265625, "reward_std": 0.39809340238571167, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.875, "step": 308 }, { "completion_length": 87.328125, "epoch": 0.3212058212058212, "grad_norm": 3.0346002527876696, "kl": 0.177734375, "learning_rate": 6.787941787941787e-07, "loss": 0.0071, "reward": 1.359375, "reward_std": 0.41717347502708435, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.921875, "step": 309 }, { "completion_length": 105.984375, "epoch": 0.32224532224532226, "grad_norm": 2.5490611242949273, "kl": 0.404296875, "learning_rate": 6.777546777546778e-07, "loss": 0.0162, "reward": 1.171875, "reward_std": 0.3931124210357666, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9375, "step": 310 }, { "completion_length": 102.53125, "epoch": 0.3232848232848233, "grad_norm": 2.568032112050349, "kl": 0.33203125, "learning_rate": 6.767151767151766e-07, "loss": 0.0133, "reward": 1.1875, "reward_std": 0.44430381059646606, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.953125, "step": 311 }, { "completion_length": 112.375, "epoch": 0.32432432432432434, "grad_norm": 2.5804592509805, "kl": 0.296875, "learning_rate": 6.756756756756756e-07, "loss": 0.0118, "reward": 1.328125, "reward_std": 0.42179790139198303, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.953125, "step": 312 }, { "completion_length": 99.078125, "epoch": 0.32536382536382535, "grad_norm": 3.252863355266766, "kl": 0.283203125, "learning_rate": 6.746361746361746e-07, "loss": 0.0114, "reward": 1.34375, "reward_std": 0.5106008052825928, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.921875, "step": 313 }, { "completion_length": 109.734375, "epoch": 0.3264033264033264, "grad_norm": 2.179285852899224, "kl": 0.40625, "learning_rate": 6.735966735966736e-07, "loss": 0.0162, "reward": 1.140625, "reward_std": 0.3356248140335083, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "step": 314 }, { "completion_length": 95.96875, "epoch": 0.32744282744282743, "grad_norm": 2.623641752273634, "kl": 0.1767578125, "learning_rate": 6.725571725571724e-07, "loss": 0.0071, "reward": 1.46875, "reward_std": 0.4488917291164398, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.953125, "step": 315 }, { "completion_length": 104.796875, "epoch": 0.3284823284823285, "grad_norm": 2.9798182566901694, "kl": 0.50390625, "learning_rate": 6.715176715176715e-07, "loss": 0.0202, "reward": 1.28125, "reward_std": 0.48725706338882446, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9375, "step": 316 }, { "completion_length": 109.265625, "epoch": 0.3295218295218295, "grad_norm": 2.37685245941197, "kl": 0.185546875, "learning_rate": 6.704781704781704e-07, "loss": 0.0074, "reward": 1.421875, "reward_std": 0.3625878393650055, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.9375, "step": 317 }, { "completion_length": 94.578125, "epoch": 0.3305613305613306, "grad_norm": 2.824934764816288, "kl": 0.3984375, "learning_rate": 6.694386694386694e-07, "loss": 0.0159, "reward": 1.609375, "reward_std": 0.4150615930557251, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.953125, "step": 318 }, { "completion_length": 104.171875, "epoch": 0.3316008316008316, "grad_norm": 2.9055487898618853, "kl": 0.4765625, "learning_rate": 6.683991683991683e-07, "loss": 0.0191, "reward": 1.359375, "reward_std": 0.547139048576355, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.90625, "step": 319 }, { "completion_length": 109.890625, "epoch": 0.33264033264033266, "grad_norm": 2.340316558515373, "kl": 0.216796875, "learning_rate": 6.673596673596673e-07, "loss": 0.0087, "reward": 1.21875, "reward_std": 0.2619796395301819, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.953125, "step": 320 }, { "completion_length": 110.359375, "epoch": 0.33367983367983367, "grad_norm": 2.784253329990895, "kl": 0.345703125, "learning_rate": 6.663201663201664e-07, "loss": 0.0138, "reward": 1.4375, "reward_std": 0.37086743116378784, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.953125, "step": 321 }, { "completion_length": 109.171875, "epoch": 0.33471933471933474, "grad_norm": 1.9712693706403304, "kl": 0.2138671875, "learning_rate": 6.652806652806653e-07, "loss": 0.0085, "reward": 1.34375, "reward_std": 0.2709311544895172, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.984375, "step": 322 }, { "completion_length": 96.828125, "epoch": 0.33575883575883575, "grad_norm": 2.712907330552726, "kl": 0.2109375, "learning_rate": 6.642411642411642e-07, "loss": 0.0084, "reward": 1.21875, "reward_std": 0.3284187912940979, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "step": 323 }, { "completion_length": 105.03125, "epoch": 0.3367983367983368, "grad_norm": 2.8660143178455604, "kl": 0.35546875, "learning_rate": 6.632016632016631e-07, "loss": 0.0142, "reward": 1.484375, "reward_std": 0.46737682819366455, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.9375, "step": 324 }, { "completion_length": 115.375, "epoch": 0.33783783783783783, "grad_norm": 2.473390312865356, "kl": 0.2294921875, "learning_rate": 6.621621621621622e-07, "loss": 0.0092, "reward": 1.28125, "reward_std": 0.44663429260253906, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.96875, "step": 325 }, { "completion_length": 120.359375, "epoch": 0.3388773388773389, "grad_norm": 2.2010501052206743, "kl": 0.279296875, "learning_rate": 6.611226611226611e-07, "loss": 0.0111, "reward": 1.046875, "reward_std": 0.2777610421180725, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "step": 326 }, { "completion_length": 119.15625, "epoch": 0.3399168399168399, "grad_norm": 2.2680748832179902, "kl": 0.1630859375, "learning_rate": 6.6008316008316e-07, "loss": 0.0065, "reward": 1.59375, "reward_std": 0.36257803440093994, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.921875, "step": 327 }, { "completion_length": 112.65625, "epoch": 0.340956340956341, "grad_norm": 2.835786579898458, "kl": 0.3515625, "learning_rate": 6.59043659043659e-07, "loss": 0.014, "reward": 1.453125, "reward_std": 0.4908284544944763, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.953125, "step": 328 }, { "completion_length": 90.609375, "epoch": 0.341995841995842, "grad_norm": 3.1758212380134796, "kl": 0.2236328125, "learning_rate": 6.58004158004158e-07, "loss": 0.009, "reward": 1.3125, "reward_std": 0.37190571427345276, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.96875, "step": 329 }, { "completion_length": 112.515625, "epoch": 0.34303534303534305, "grad_norm": 2.0473085648055935, "kl": 0.1357421875, "learning_rate": 6.56964656964657e-07, "loss": 0.0054, "reward": 1.390625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 330 }, { "completion_length": 131.296875, "epoch": 0.34407484407484407, "grad_norm": 3.7298267042246165, "kl": 0.40234375, "learning_rate": 6.559251559251559e-07, "loss": 0.0161, "reward": 1.140625, "reward_std": 0.2688094973564148, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.953125, "step": 331 }, { "completion_length": 112.59375, "epoch": 0.34511434511434513, "grad_norm": 2.5459192608819916, "kl": 0.18359375, "learning_rate": 6.548856548856548e-07, "loss": 0.0073, "reward": 1.203125, "reward_std": 0.4365312457084656, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.921875, "step": 332 }, { "completion_length": 119.015625, "epoch": 0.34615384615384615, "grad_norm": 1.9274415388764092, "kl": 0.126953125, "learning_rate": 6.538461538461538e-07, "loss": 0.0051, "reward": 1.234375, "reward_std": 0.3403330445289612, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.953125, "step": 333 }, { "completion_length": 100.515625, "epoch": 0.3471933471933472, "grad_norm": 2.6121929790977383, "kl": 0.298828125, "learning_rate": 6.528066528066528e-07, "loss": 0.012, "reward": 1.421875, "reward_std": 0.4211643934249878, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.9375, "step": 334 }, { "completion_length": 112.875, "epoch": 0.3482328482328482, "grad_norm": 36.19357591876375, "kl": 0.13671875, "learning_rate": 6.517671517671517e-07, "loss": 0.0055, "reward": 1.390625, "reward_std": 0.39560043811798096, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.953125, "step": 335 }, { "completion_length": 110.625, "epoch": 0.3492723492723493, "grad_norm": 2.361236336612006, "kl": 0.50390625, "learning_rate": 6.507276507276506e-07, "loss": 0.0201, "reward": 1.40625, "reward_std": 0.38231194019317627, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.921875, "step": 336 }, { "completion_length": 112.765625, "epoch": 0.3503118503118503, "grad_norm": 4.937696627588436, "kl": 0.44140625, "learning_rate": 6.496881496881497e-07, "loss": 0.0177, "reward": 1.40625, "reward_std": 0.41505181789398193, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.921875, "step": 337 }, { "completion_length": 113.140625, "epoch": 0.35135135135135137, "grad_norm": 2.752883958690459, "kl": 0.29296875, "learning_rate": 6.486486486486486e-07, "loss": 0.0117, "reward": 1.28125, "reward_std": 0.4609758257865906, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.96875, "step": 338 }, { "completion_length": 93.078125, "epoch": 0.3523908523908524, "grad_norm": 2.5402055033214803, "kl": 0.4140625, "learning_rate": 6.476091476091475e-07, "loss": 0.0165, "reward": 1.46875, "reward_std": 0.37971559166908264, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.96875, "step": 339 }, { "completion_length": 98.359375, "epoch": 0.35343035343035345, "grad_norm": 2.4676102641747897, "kl": 0.25390625, "learning_rate": 6.465696465696465e-07, "loss": 0.0101, "reward": 1.25, "reward_std": 0.29176726937294006, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "step": 340 }, { "completion_length": 110.0625, "epoch": 0.35446985446985446, "grad_norm": 2.681662924711787, "kl": 0.20703125, "learning_rate": 6.455301455301455e-07, "loss": 0.0083, "reward": 1.28125, "reward_std": 0.4253978729248047, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.96875, "step": 341 }, { "completion_length": 103.4375, "epoch": 0.35550935550935553, "grad_norm": 2.5460561977008664, "kl": 0.29296875, "learning_rate": 6.444906444906444e-07, "loss": 0.0117, "reward": 1.359375, "reward_std": 0.363300621509552, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.921875, "step": 342 }, { "completion_length": 101.71875, "epoch": 0.35654885654885654, "grad_norm": 1.863713985839502, "kl": 0.255859375, "learning_rate": 6.434511434511434e-07, "loss": 0.0102, "reward": 1.4375, "reward_std": 0.2798827290534973, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.9375, "step": 343 }, { "completion_length": 111.875, "epoch": 0.3575883575883576, "grad_norm": 2.9131287285501455, "kl": 0.287109375, "learning_rate": 6.424116424116424e-07, "loss": 0.0115, "reward": 1.3125, "reward_std": 0.38877055048942566, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.953125, "step": 344 }, { "completion_length": 112.3125, "epoch": 0.3586278586278586, "grad_norm": 2.507881439565081, "kl": 0.26171875, "learning_rate": 6.413721413721413e-07, "loss": 0.0105, "reward": 1.390625, "reward_std": 0.4123668074607849, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.953125, "step": 345 }, { "completion_length": 114.6875, "epoch": 0.3596673596673597, "grad_norm": 2.2252975178568275, "kl": 0.279296875, "learning_rate": 6.403326403326404e-07, "loss": 0.0112, "reward": 1.328125, "reward_std": 0.38664886355400085, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.9375, "step": 346 }, { "completion_length": 106.875, "epoch": 0.3607068607068607, "grad_norm": 2.382913043350713, "kl": 0.26171875, "learning_rate": 6.392931392931392e-07, "loss": 0.0105, "reward": 1.3125, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.984375, "step": 347 }, { "completion_length": 116.234375, "epoch": 0.36174636174636177, "grad_norm": 2.8479994701272178, "kl": 0.31640625, "learning_rate": 6.382536382536382e-07, "loss": 0.0127, "reward": 1.15625, "reward_std": 0.3808925747871399, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.921875, "step": 348 }, { "completion_length": 96.046875, "epoch": 0.3627858627858628, "grad_norm": 2.892872161226189, "kl": 0.3359375, "learning_rate": 6.372141372141372e-07, "loss": 0.0134, "reward": 1.40625, "reward_std": 0.4615132212638855, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.921875, "step": 349 }, { "completion_length": 94.59375, "epoch": 0.36382536382536385, "grad_norm": 2.478624723665551, "kl": 0.265625, "learning_rate": 6.361746361746362e-07, "loss": 0.0106, "reward": 1.203125, "reward_std": 0.37298911809921265, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "step": 350 }, { "completion_length": 101.296875, "epoch": 0.36486486486486486, "grad_norm": 2.0036022909735047, "kl": 0.251953125, "learning_rate": 6.35135135135135e-07, "loss": 0.0101, "reward": 1.359375, "reward_std": 0.2777610421180725, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.9375, "step": 351 }, { "completion_length": 102.75, "epoch": 0.3659043659043659, "grad_norm": 2.8222623988882445, "kl": 0.265625, "learning_rate": 6.340956340956341e-07, "loss": 0.0106, "reward": 1.421875, "reward_std": 0.38664889335632324, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 352 }, { "completion_length": 85.765625, "epoch": 0.36694386694386694, "grad_norm": 2.5025877596913477, "kl": 0.369140625, "learning_rate": 6.33056133056133e-07, "loss": 0.0148, "reward": 1.484375, "reward_std": 0.29826050996780396, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.96875, "step": 353 }, { "completion_length": 94.90625, "epoch": 0.367983367983368, "grad_norm": 2.5303631727788787, "kl": 0.30859375, "learning_rate": 6.32016632016632e-07, "loss": 0.0123, "reward": 1.296875, "reward_std": 0.26196980476379395, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.984375, "step": 354 }, { "completion_length": 88.859375, "epoch": 0.369022869022869, "grad_norm": 2.638236710366981, "kl": 0.197265625, "learning_rate": 6.309771309771309e-07, "loss": 0.0079, "reward": 1.125, "reward_std": 0.3736610412597656, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9375, "step": 355 }, { "completion_length": 110.234375, "epoch": 0.3700623700623701, "grad_norm": 2.129702221317355, "kl": 0.2314453125, "learning_rate": 6.299376299376299e-07, "loss": 0.0093, "reward": 1.109375, "reward_std": 0.3352486491203308, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "step": 356 }, { "completion_length": 92.375, "epoch": 0.3711018711018711, "grad_norm": 2.560578045859425, "kl": 0.365234375, "learning_rate": 6.288981288981288e-07, "loss": 0.0146, "reward": 1.15625, "reward_std": 0.39849260449409485, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9375, "step": 357 }, { "completion_length": 94.625, "epoch": 0.37214137214137216, "grad_norm": 2.1838043908756894, "kl": 0.2041015625, "learning_rate": 6.278586278586279e-07, "loss": 0.0082, "reward": 1.171875, "reward_std": 0.3226271867752075, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9375, "step": 358 }, { "completion_length": 90.140625, "epoch": 0.3731808731808732, "grad_norm": 3.1392214462205223, "kl": 0.212890625, "learning_rate": 6.268191268191267e-07, "loss": 0.0085, "reward": 1.421875, "reward_std": 0.4560605585575104, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 359 }, { "completion_length": 91.03125, "epoch": 0.37422037422037424, "grad_norm": 2.360602355963987, "kl": 0.193359375, "learning_rate": 6.257796257796257e-07, "loss": 0.0077, "reward": 1.328125, "reward_std": 0.2902791500091553, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.96875, "step": 360 }, { "completion_length": 112.015625, "epoch": 0.37525987525987525, "grad_norm": 2.2896938920598062, "kl": 0.318359375, "learning_rate": 6.247401247401247e-07, "loss": 0.0128, "reward": 1.171875, "reward_std": 0.3761745095252991, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.921875, "step": 361 }, { "completion_length": 92.21875, "epoch": 0.3762993762993763, "grad_norm": 2.7696743108716135, "kl": 0.36328125, "learning_rate": 6.237006237006237e-07, "loss": 0.0145, "reward": 1.171875, "reward_std": 0.4483798146247864, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9375, "step": 362 }, { "completion_length": 118.640625, "epoch": 0.37733887733887733, "grad_norm": 2.199903653123164, "kl": 0.2177734375, "learning_rate": 6.226611226611225e-07, "loss": 0.0087, "reward": 1.34375, "reward_std": 0.36295416951179504, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.96875, "step": 363 }, { "completion_length": 100.46875, "epoch": 0.3783783783783784, "grad_norm": 2.337532070877166, "kl": 0.3984375, "learning_rate": 6.216216216216216e-07, "loss": 0.016, "reward": 1.359375, "reward_std": 0.308285653591156, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.953125, "step": 364 }, { "completion_length": 102.9375, "epoch": 0.3794178794178794, "grad_norm": 2.641206488507948, "kl": 0.263671875, "learning_rate": 6.205821205821205e-07, "loss": 0.0106, "reward": 1.28125, "reward_std": 0.45134252309799194, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.921875, "step": 365 }, { "completion_length": 94.09375, "epoch": 0.3804573804573805, "grad_norm": 2.248496957327473, "kl": 0.236328125, "learning_rate": 6.195426195426195e-07, "loss": 0.0095, "reward": 1.203125, "reward_std": 0.3356248140335083, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.96875, "step": 366 }, { "completion_length": 107.9375, "epoch": 0.3814968814968815, "grad_norm": 2.0798948116547926, "kl": 0.15234375, "learning_rate": 6.185031185031184e-07, "loss": 0.0061, "reward": 1.25, "reward_std": 0.40782028436660767, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "step": 367 }, { "completion_length": 100.875, "epoch": 0.38253638253638256, "grad_norm": 2.6494065792911847, "kl": 0.138671875, "learning_rate": 6.174636174636174e-07, "loss": 0.0056, "reward": 1.21875, "reward_std": 0.4041856527328491, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.921875, "step": 368 }, { "completion_length": 95.515625, "epoch": 0.38357588357588357, "grad_norm": 2.5558225520048183, "kl": 0.2197265625, "learning_rate": 6.164241164241164e-07, "loss": 0.0088, "reward": 1.265625, "reward_std": 0.4348241686820984, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.953125, "step": 369 }, { "completion_length": 94.59375, "epoch": 0.38461538461538464, "grad_norm": 2.6661581000773436, "kl": 0.28515625, "learning_rate": 6.153846153846154e-07, "loss": 0.0114, "reward": 1.34375, "reward_std": 0.3618125021457672, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.953125, "step": 370 }, { "completion_length": 97.65625, "epoch": 0.38565488565488565, "grad_norm": 3.3182426229541346, "kl": 0.474609375, "learning_rate": 6.143451143451143e-07, "loss": 0.019, "reward": 1.34375, "reward_std": 0.3956102132797241, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.953125, "step": 371 }, { "completion_length": 95.90625, "epoch": 0.3866943866943867, "grad_norm": 2.8816108690756397, "kl": 0.2734375, "learning_rate": 6.133056133056132e-07, "loss": 0.011, "reward": 1.3125, "reward_std": 0.41186636686325073, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.984375, "step": 372 }, { "completion_length": 91.65625, "epoch": 0.3877338877338877, "grad_norm": 1.7417678310015405, "kl": 0.287109375, "learning_rate": 6.122661122661123e-07, "loss": 0.0115, "reward": 1.40625, "reward_std": 0.1962026059627533, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.984375, "step": 373 }, { "completion_length": 107.40625, "epoch": 0.3887733887733888, "grad_norm": 2.3465229570168744, "kl": 0.205078125, "learning_rate": 6.112266112266112e-07, "loss": 0.0082, "reward": 1.234375, "reward_std": 0.3298586905002594, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.953125, "step": 374 }, { "completion_length": 93.5, "epoch": 0.3898128898128898, "grad_norm": 2.0806519707291273, "kl": 0.37890625, "learning_rate": 6.101871101871101e-07, "loss": 0.0152, "reward": 1.21875, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.984375, "step": 375 }, { "completion_length": 117.390625, "epoch": 0.3908523908523909, "grad_norm": 2.4973757348060106, "kl": 0.33984375, "learning_rate": 6.091476091476091e-07, "loss": 0.0136, "reward": 1.078125, "reward_std": 0.49718862771987915, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.890625, "step": 376 }, { "completion_length": 95.046875, "epoch": 0.3918918918918919, "grad_norm": 3.071811608203381, "kl": 0.369140625, "learning_rate": 6.081081081081081e-07, "loss": 0.0148, "reward": 1.40625, "reward_std": 0.3913668990135193, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 377 }, { "completion_length": 99.484375, "epoch": 0.39293139293139295, "grad_norm": 4.765385181994979, "kl": 0.150390625, "learning_rate": 6.07068607068607e-07, "loss": 0.006, "reward": 1.40625, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.984375, "step": 378 }, { "completion_length": 99.828125, "epoch": 0.39397089397089397, "grad_norm": 2.6564123879185155, "kl": 0.26171875, "learning_rate": 6.06029106029106e-07, "loss": 0.0105, "reward": 1.25, "reward_std": 0.3413130044937134, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "step": 379 }, { "completion_length": 102.015625, "epoch": 0.39501039501039503, "grad_norm": 4.244785196940944, "kl": 0.232421875, "learning_rate": 6.049896049896049e-07, "loss": 0.0093, "reward": 1.234375, "reward_std": 0.4240131676197052, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9375, "step": 380 }, { "completion_length": 97.125, "epoch": 0.39604989604989604, "grad_norm": 2.182167294536953, "kl": 0.3046875, "learning_rate": 6.03950103950104e-07, "loss": 0.0122, "reward": 1.28125, "reward_std": 0.2756394147872925, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.96875, "step": 381 }, { "completion_length": 86.71875, "epoch": 0.3970893970893971, "grad_norm": 2.511767277186771, "kl": 0.1962890625, "learning_rate": 6.029106029106029e-07, "loss": 0.0078, "reward": 1.25, "reward_std": 0.28247910737991333, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9375, "step": 382 }, { "completion_length": 91.640625, "epoch": 0.3981288981288981, "grad_norm": 2.349646104517906, "kl": 0.2451171875, "learning_rate": 6.018711018711018e-07, "loss": 0.0098, "reward": 1.40625, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 383 }, { "completion_length": 81.703125, "epoch": 0.3991683991683992, "grad_norm": 3.2198305088344763, "kl": 0.3203125, "learning_rate": 6.008316008316007e-07, "loss": 0.0128, "reward": 1.625, "reward_std": 0.3377464711666107, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 384 }, { "completion_length": 93.5, "epoch": 0.4002079002079002, "grad_norm": 3.4315345602661913, "kl": 0.1396484375, "learning_rate": 5.997920997920998e-07, "loss": 0.0056, "reward": 1.265625, "reward_std": 0.34717273712158203, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 1.0, "step": 385 }, { "completion_length": 93.625, "epoch": 0.40124740124740127, "grad_norm": 2.122245117738818, "kl": 0.140625, "learning_rate": 5.987525987525987e-07, "loss": 0.0056, "reward": 1.390625, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "step": 386 }, { "completion_length": 74.125, "epoch": 0.4022869022869023, "grad_norm": 3.181679978658109, "kl": 0.216796875, "learning_rate": 5.977130977130976e-07, "loss": 0.0087, "reward": 1.390625, "reward_std": 0.31512534618377686, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 387 }, { "completion_length": 95.625, "epoch": 0.40332640332640335, "grad_norm": 2.4345005370612713, "kl": 0.3359375, "learning_rate": 5.966735966735966e-07, "loss": 0.0134, "reward": 1.546875, "reward_std": 0.380867063999176, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 388 }, { "completion_length": 83.9375, "epoch": 0.40436590436590436, "grad_norm": 2.6934782666891994, "kl": 0.259765625, "learning_rate": 5.956340956340956e-07, "loss": 0.0104, "reward": 1.4375, "reward_std": 0.354972779750824, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.96875, "step": 389 }, { "completion_length": 86.359375, "epoch": 0.40540540540540543, "grad_norm": 2.5930234903300757, "kl": 0.283203125, "learning_rate": 5.945945945945947e-07, "loss": 0.0113, "reward": 1.484375, "reward_std": 0.35141605138778687, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 390 }, { "completion_length": 97.484375, "epoch": 0.40644490644490644, "grad_norm": 2.6103971071309244, "kl": 0.2158203125, "learning_rate": 5.935550935550935e-07, "loss": 0.0086, "reward": 1.375, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 391 }, { "completion_length": 96.90625, "epoch": 0.4074844074844075, "grad_norm": 2.3391475350873803, "kl": 0.22265625, "learning_rate": 5.925155925155925e-07, "loss": 0.0089, "reward": 1.3125, "reward_std": 0.29105448722839355, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.984375, "step": 392 }, { "completion_length": 100.53125, "epoch": 0.4085239085239085, "grad_norm": 2.101989793611535, "kl": 0.2021484375, "learning_rate": 5.914760914760914e-07, "loss": 0.0081, "reward": 1.546875, "reward_std": 0.2777610421180725, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 393 }, { "completion_length": 93.53125, "epoch": 0.4095634095634096, "grad_norm": 2.4042084723120896, "kl": 0.2294921875, "learning_rate": 5.904365904365905e-07, "loss": 0.0092, "reward": 1.375, "reward_std": 0.37938767671585083, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.921875, "step": 394 }, { "completion_length": 115.5625, "epoch": 0.4106029106029106, "grad_norm": 1.6786179647108355, "kl": 0.2431640625, "learning_rate": 5.893970893970893e-07, "loss": 0.0097, "reward": 1.453125, "reward_std": 0.23531240224838257, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 395 }, { "completion_length": 119.359375, "epoch": 0.41164241164241167, "grad_norm": 2.2489326661021893, "kl": 0.236328125, "learning_rate": 5.883575883575883e-07, "loss": 0.0094, "reward": 1.296875, "reward_std": 0.39314964413642883, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.921875, "step": 396 }, { "completion_length": 98.359375, "epoch": 0.4126819126819127, "grad_norm": 2.845606897651187, "kl": 0.26171875, "learning_rate": 5.873180873180873e-07, "loss": 0.0105, "reward": 1.40625, "reward_std": 0.3564707636833191, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.953125, "step": 397 }, { "completion_length": 100.390625, "epoch": 0.41372141372141374, "grad_norm": 2.739630836537845, "kl": 0.427734375, "learning_rate": 5.862785862785863e-07, "loss": 0.0171, "reward": 1.265625, "reward_std": 0.31512534618377686, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.96875, "step": 398 }, { "completion_length": 104.84375, "epoch": 0.41476091476091476, "grad_norm": 2.091207997470682, "kl": 0.41796875, "learning_rate": 5.852390852390851e-07, "loss": 0.0167, "reward": 1.1875, "reward_std": 0.26409149169921875, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.984375, "step": 399 }, { "completion_length": 111.0625, "epoch": 0.4158004158004158, "grad_norm": 2.0916709118178636, "kl": 0.373046875, "learning_rate": 5.841995841995842e-07, "loss": 0.015, "reward": 1.265625, "reward_std": 0.25441280007362366, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.96875, "step": 400 }, { "completion_length": 93.125, "epoch": 0.41683991683991684, "grad_norm": 3.591578553618257, "kl": 0.4296875, "learning_rate": 5.831600831600831e-07, "loss": 0.0172, "reward": 1.171875, "reward_std": 0.3571978509426117, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9375, "step": 401 }, { "completion_length": 109.859375, "epoch": 0.4178794178794179, "grad_norm": 2.2879318936927184, "kl": 0.1494140625, "learning_rate": 5.821205821205821e-07, "loss": 0.006, "reward": 1.390625, "reward_std": 0.23531240224838257, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.96875, "step": 402 }, { "completion_length": 100.375, "epoch": 0.4189189189189189, "grad_norm": 2.634846058637475, "kl": 0.2451171875, "learning_rate": 5.81081081081081e-07, "loss": 0.0098, "reward": 1.40625, "reward_std": 0.3999423384666443, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.984375, "step": 403 }, { "completion_length": 108.796875, "epoch": 0.41995841995842, "grad_norm": 2.666016300456794, "kl": 0.29296875, "learning_rate": 5.8004158004158e-07, "loss": 0.0117, "reward": 1.640625, "reward_std": 0.3661494255065918, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 404 }, { "completion_length": 110.234375, "epoch": 0.420997920997921, "grad_norm": 2.5448579474186688, "kl": 0.2109375, "learning_rate": 5.790020790020789e-07, "loss": 0.0084, "reward": 1.5, "reward_std": 0.34352827072143555, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 405 }, { "completion_length": 115.984375, "epoch": 0.42203742203742206, "grad_norm": 2.005647856601246, "kl": 0.1572265625, "learning_rate": 5.77962577962578e-07, "loss": 0.0063, "reward": 1.375, "reward_std": 0.31300365924835205, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.96875, "step": 406 }, { "completion_length": 140.75, "epoch": 0.4230769230769231, "grad_norm": 2.0943162651124125, "kl": 0.208984375, "learning_rate": 5.769230769230768e-07, "loss": 0.0084, "reward": 1.1875, "reward_std": 0.45957428216934204, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.875, "step": 407 }, { "completion_length": 114.0, "epoch": 0.42411642411642414, "grad_norm": 2.599545812005521, "kl": 0.197265625, "learning_rate": 5.758835758835758e-07, "loss": 0.0079, "reward": 1.296875, "reward_std": 0.35574814677238464, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.96875, "step": 408 }, { "completion_length": 122.625, "epoch": 0.42515592515592515, "grad_norm": 1.8600193074103264, "kl": 0.259765625, "learning_rate": 5.748440748440748e-07, "loss": 0.0104, "reward": 1.328125, "reward_std": 0.2777610421180725, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.921875, "step": 409 }, { "completion_length": 112.09375, "epoch": 0.4261954261954262, "grad_norm": 2.2088945031782443, "kl": 0.265625, "learning_rate": 5.738045738045738e-07, "loss": 0.0107, "reward": 1.34375, "reward_std": 0.4060450792312622, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.953125, "step": 410 }, { "completion_length": 106.859375, "epoch": 0.42723492723492723, "grad_norm": 2.8941837504816634, "kl": 0.2353515625, "learning_rate": 5.727650727650726e-07, "loss": 0.0094, "reward": 1.546875, "reward_std": 0.2777610719203949, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 411 }, { "completion_length": 136.484375, "epoch": 0.4282744282744283, "grad_norm": 2.3682959693257386, "kl": 0.408203125, "learning_rate": 5.717255717255717e-07, "loss": 0.0163, "reward": 1.296875, "reward_std": 0.4493703246116638, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.859375, "step": 412 }, { "completion_length": 128.984375, "epoch": 0.4293139293139293, "grad_norm": 2.8124326739224044, "kl": 0.4453125, "learning_rate": 5.706860706860707e-07, "loss": 0.0178, "reward": 1.21875, "reward_std": 0.4585188627243042, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.921875, "step": 413 }, { "completion_length": 136.875, "epoch": 0.4303534303534304, "grad_norm": 2.2545653166702277, "kl": 0.361328125, "learning_rate": 5.696465696465696e-07, "loss": 0.0145, "reward": 1.375, "reward_std": 0.46502190828323364, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.953125, "step": 414 }, { "completion_length": 111.078125, "epoch": 0.4313929313929314, "grad_norm": 2.929102588945167, "kl": 0.259765625, "learning_rate": 5.686070686070686e-07, "loss": 0.0104, "reward": 1.28125, "reward_std": 0.3564707338809967, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.953125, "step": 415 }, { "completion_length": 142.921875, "epoch": 0.43243243243243246, "grad_norm": 2.3066077157438616, "kl": 0.1953125, "learning_rate": 5.675675675675675e-07, "loss": 0.0078, "reward": 1.234375, "reward_std": 0.4603038728237152, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.890625, "step": 416 }, { "completion_length": 115.0625, "epoch": 0.43347193347193347, "grad_norm": 2.3353758816249877, "kl": 0.33984375, "learning_rate": 5.665280665280665e-07, "loss": 0.0136, "reward": 1.109375, "reward_std": 0.38382193446159363, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.90625, "step": 417 }, { "completion_length": 100.3125, "epoch": 0.43451143451143454, "grad_norm": 2.0317249651419123, "kl": 0.46875, "learning_rate": 5.654885654885655e-07, "loss": 0.0187, "reward": 1.53125, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.984375, "step": 418 }, { "completion_length": 137.6875, "epoch": 0.43555093555093555, "grad_norm": 2.122083168918025, "kl": 0.2041015625, "learning_rate": 5.644490644490644e-07, "loss": 0.0082, "reward": 1.40625, "reward_std": 0.38877052068710327, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.921875, "step": 419 }, { "completion_length": 103.828125, "epoch": 0.4365904365904366, "grad_norm": 2.66582828657333, "kl": 0.19921875, "learning_rate": 5.634095634095633e-07, "loss": 0.008, "reward": 1.265625, "reward_std": 0.2414703369140625, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.96875, "step": 420 }, { "completion_length": 124.515625, "epoch": 0.4376299376299376, "grad_norm": 3.4153697790028645, "kl": 0.1904296875, "learning_rate": 5.623700623700624e-07, "loss": 0.0076, "reward": 1.140625, "reward_std": 0.24464009702205658, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.953125, "step": 421 }, { "completion_length": 108.625, "epoch": 0.4386694386694387, "grad_norm": 2.2863689887839604, "kl": 0.337890625, "learning_rate": 5.613305613305613e-07, "loss": 0.0135, "reward": 1.296875, "reward_std": 0.2971188426017761, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.96875, "step": 422 }, { "completion_length": 108.734375, "epoch": 0.4397089397089397, "grad_norm": 2.3818509572474, "kl": 0.3125, "learning_rate": 5.602910602910602e-07, "loss": 0.0125, "reward": 1.109375, "reward_std": 0.28460076451301575, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "step": 423 }, { "completion_length": 115.5, "epoch": 0.4407484407484408, "grad_norm": 2.8349463984378334, "kl": 0.27734375, "learning_rate": 5.592515592515592e-07, "loss": 0.0111, "reward": 1.375, "reward_std": 0.4441463053226471, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.953125, "step": 424 }, { "completion_length": 119.90625, "epoch": 0.4417879417879418, "grad_norm": 2.323576716197602, "kl": 0.345703125, "learning_rate": 5.582120582120582e-07, "loss": 0.0138, "reward": 1.25, "reward_std": 0.47720107436180115, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.921875, "step": 425 }, { "completion_length": 100.859375, "epoch": 0.44282744282744285, "grad_norm": 2.906177116895176, "kl": 0.32421875, "learning_rate": 5.571725571725571e-07, "loss": 0.013, "reward": 1.421875, "reward_std": 0.37769731879234314, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 426 }, { "completion_length": 114.125, "epoch": 0.44386694386694386, "grad_norm": 2.437777981237857, "kl": 0.2490234375, "learning_rate": 5.561330561330561e-07, "loss": 0.01, "reward": 1.34375, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.96875, "step": 427 }, { "completion_length": 108.03125, "epoch": 0.44490644490644493, "grad_norm": 2.195903624797964, "kl": 0.255859375, "learning_rate": 5.55093555093555e-07, "loss": 0.0102, "reward": 1.375, "reward_std": 0.24969476461410522, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.96875, "step": 428 }, { "completion_length": 118.921875, "epoch": 0.44594594594594594, "grad_norm": 2.103796751100833, "kl": 0.236328125, "learning_rate": 5.54054054054054e-07, "loss": 0.0094, "reward": 1.4375, "reward_std": 0.33090677857398987, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.96875, "step": 429 }, { "completion_length": 125.0625, "epoch": 0.446985446985447, "grad_norm": 2.4119436846360105, "kl": 0.1875, "learning_rate": 5.53014553014553e-07, "loss": 0.0075, "reward": 1.40625, "reward_std": 0.45744526386260986, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.953125, "step": 430 }, { "completion_length": 106.171875, "epoch": 0.448024948024948, "grad_norm": 2.346056877000993, "kl": 0.279296875, "learning_rate": 5.519750519750519e-07, "loss": 0.0112, "reward": 1.359375, "reward_std": 0.29826050996780396, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.953125, "step": 431 }, { "completion_length": 109.40625, "epoch": 0.4490644490644491, "grad_norm": 3.7965134300874963, "kl": 0.1650390625, "learning_rate": 5.509355509355508e-07, "loss": 0.0066, "reward": 1.234375, "reward_std": 0.4341113567352295, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9375, "step": 432 }, { "completion_length": 113.40625, "epoch": 0.4501039501039501, "grad_norm": 2.869209166637567, "kl": 0.279296875, "learning_rate": 5.498960498960499e-07, "loss": 0.0112, "reward": 1.40625, "reward_std": 0.42959797382354736, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.9375, "step": 433 }, { "completion_length": 94.25, "epoch": 0.45114345114345117, "grad_norm": 4.083213386609918, "kl": 0.294921875, "learning_rate": 5.488565488565488e-07, "loss": 0.0118, "reward": 1.28125, "reward_std": 0.43191659450531006, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.953125, "step": 434 }, { "completion_length": 129.875, "epoch": 0.4521829521829522, "grad_norm": 2.3950732300129522, "kl": 0.263671875, "learning_rate": 5.478170478170477e-07, "loss": 0.0106, "reward": 1.1875, "reward_std": 0.39917677640914917, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9375, "step": 435 }, { "completion_length": 113.0, "epoch": 0.45322245322245325, "grad_norm": 2.2035870884545203, "kl": 0.46875, "learning_rate": 5.467775467775468e-07, "loss": 0.0187, "reward": 1.0625, "reward_std": 0.2618762254714966, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "step": 436 }, { "completion_length": 122.65625, "epoch": 0.45426195426195426, "grad_norm": 2.2922900706173204, "kl": 0.1572265625, "learning_rate": 5.457380457380457e-07, "loss": 0.0063, "reward": 1.25, "reward_std": 0.3284187912940979, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "step": 437 }, { "completion_length": 91.78125, "epoch": 0.4553014553014553, "grad_norm": 3.164029699881162, "kl": 0.28125, "learning_rate": 5.446985446985447e-07, "loss": 0.0112, "reward": 1.46875, "reward_std": 0.33090680837631226, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.984375, "step": 438 }, { "completion_length": 118.75, "epoch": 0.45634095634095634, "grad_norm": 2.822462089548917, "kl": 0.3828125, "learning_rate": 5.436590436590436e-07, "loss": 0.0153, "reward": 1.46875, "reward_std": 0.4728603959083557, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 0.921875, "step": 439 }, { "completion_length": 105.0, "epoch": 0.4573804573804574, "grad_norm": 2.1246453104133, "kl": 0.28125, "learning_rate": 5.426195426195426e-07, "loss": 0.0112, "reward": 1.203125, "reward_std": 0.2993341088294983, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.96875, "step": 440 }, { "completion_length": 104.96875, "epoch": 0.4584199584199584, "grad_norm": 2.1860721759286665, "kl": 0.2490234375, "learning_rate": 5.415800415800415e-07, "loss": 0.01, "reward": 1.171875, "reward_std": 0.23568856716156006, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 1.0, "step": 441 }, { "completion_length": 105.171875, "epoch": 0.4594594594594595, "grad_norm": 2.407885678548303, "kl": 0.470703125, "learning_rate": 5.405405405405406e-07, "loss": 0.0188, "reward": 1.234375, "reward_std": 0.32370075583457947, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9375, "step": 442 }, { "completion_length": 97.546875, "epoch": 0.4604989604989605, "grad_norm": 2.6160635611149554, "kl": 0.515625, "learning_rate": 5.395010395010394e-07, "loss": 0.0206, "reward": 1.34375, "reward_std": 0.31973996758461, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.96875, "step": 443 }, { "completion_length": 113.171875, "epoch": 0.46153846153846156, "grad_norm": 2.087765660756347, "kl": 0.1845703125, "learning_rate": 5.384615384615384e-07, "loss": 0.0074, "reward": 1.078125, "reward_std": 0.28422462940216064, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "step": 444 }, { "completion_length": 103.59375, "epoch": 0.4625779625779626, "grad_norm": 3.184282120650955, "kl": 0.498046875, "learning_rate": 5.374220374220374e-07, "loss": 0.0199, "reward": 1.1875, "reward_std": 0.33447331190109253, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.921875, "step": 445 }, { "completion_length": 106.15625, "epoch": 0.46361746361746364, "grad_norm": 1.9603162569144337, "kl": 0.25390625, "learning_rate": 5.363825363825364e-07, "loss": 0.0102, "reward": 1.140625, "reward_std": 0.2687061131000519, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "step": 446 }, { "completion_length": 86.40625, "epoch": 0.46465696465696466, "grad_norm": 3.0321411232612445, "kl": 0.2890625, "learning_rate": 5.353430353430352e-07, "loss": 0.0116, "reward": 1.390625, "reward_std": 0.4149582087993622, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.96875, "step": 447 }, { "completion_length": 113.859375, "epoch": 0.4656964656964657, "grad_norm": 2.4711666557504803, "kl": 0.57421875, "learning_rate": 5.343035343035343e-07, "loss": 0.0229, "reward": 1.4375, "reward_std": 0.47420668601989746, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.921875, "step": 448 }, { "completion_length": 115.5625, "epoch": 0.46673596673596673, "grad_norm": 2.492332476119015, "kl": 0.251953125, "learning_rate": 5.332640332640332e-07, "loss": 0.0101, "reward": 1.1875, "reward_std": 0.4168071150779724, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.90625, "step": 449 }, { "completion_length": 103.984375, "epoch": 0.4677754677754678, "grad_norm": 2.653714037809222, "kl": 0.2421875, "learning_rate": 5.322245322245322e-07, "loss": 0.0097, "reward": 1.375, "reward_std": 0.4696267247200012, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.96875, "step": 450 }, { "completion_length": 104.359375, "epoch": 0.4688149688149688, "grad_norm": 3.1513574662012074, "kl": 0.4140625, "learning_rate": 5.311850311850311e-07, "loss": 0.0166, "reward": 1.46875, "reward_std": 0.44625815749168396, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.96875, "step": 451 }, { "completion_length": 96.53125, "epoch": 0.4698544698544699, "grad_norm": 2.5270115055909486, "kl": 0.33984375, "learning_rate": 5.301455301455301e-07, "loss": 0.0136, "reward": 1.328125, "reward_std": 0.49362704157829285, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.875, "step": 452 }, { "completion_length": 94.3125, "epoch": 0.4708939708939709, "grad_norm": 3.1374173786243817, "kl": 0.53125, "learning_rate": 5.29106029106029e-07, "loss": 0.0212, "reward": 1.125, "reward_std": 0.36594057083129883, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9375, "step": 453 }, { "completion_length": 87.359375, "epoch": 0.47193347193347196, "grad_norm": 3.0488631323367392, "kl": 0.234375, "learning_rate": 5.280665280665281e-07, "loss": 0.0094, "reward": 1.265625, "reward_std": 0.3549826145172119, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.96875, "step": 454 }, { "completion_length": 97.875, "epoch": 0.47297297297297297, "grad_norm": 3.6220969471653124, "kl": 0.296875, "learning_rate": 5.270270270270269e-07, "loss": 0.0119, "reward": 1.109375, "reward_std": 0.3359614610671997, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.921875, "step": 455 }, { "completion_length": 95.6875, "epoch": 0.47401247401247404, "grad_norm": 2.8563568978773093, "kl": 0.443359375, "learning_rate": 5.259875259875259e-07, "loss": 0.0177, "reward": 1.25, "reward_std": 0.34352827072143555, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9375, "step": 456 }, { "completion_length": 92.15625, "epoch": 0.47505197505197505, "grad_norm": 2.6384556309936165, "kl": 0.349609375, "learning_rate": 5.249480249480249e-07, "loss": 0.014, "reward": 1.296875, "reward_std": 0.3403330445289612, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.921875, "step": 457 }, { "completion_length": 88.390625, "epoch": 0.4760914760914761, "grad_norm": 2.623030904942531, "kl": 0.275390625, "learning_rate": 5.239085239085239e-07, "loss": 0.011, "reward": 1.46875, "reward_std": 0.35400262475013733, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.984375, "step": 458 }, { "completion_length": 79.34375, "epoch": 0.47713097713097713, "grad_norm": 2.310826849041565, "kl": 0.2451171875, "learning_rate": 5.228690228690227e-07, "loss": 0.0098, "reward": 1.078125, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.96875, "step": 459 }, { "completion_length": 90.828125, "epoch": 0.4781704781704782, "grad_norm": 2.3689249732215454, "kl": 0.2001953125, "learning_rate": 5.218295218295218e-07, "loss": 0.008, "reward": 1.40625, "reward_std": 0.2619796097278595, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.984375, "step": 460 }, { "completion_length": 93.84375, "epoch": 0.4792099792099792, "grad_norm": 2.778386056815754, "kl": 0.23828125, "learning_rate": 5.207900207900208e-07, "loss": 0.0095, "reward": 1.40625, "reward_std": 0.3754722774028778, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 461 }, { "completion_length": 87.734375, "epoch": 0.4802494802494803, "grad_norm": 2.621158327511639, "kl": 0.302734375, "learning_rate": 5.197505197505197e-07, "loss": 0.0121, "reward": 1.171875, "reward_std": 0.308285653591156, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.984375, "step": 462 }, { "completion_length": 86.78125, "epoch": 0.4812889812889813, "grad_norm": 2.375079033858394, "kl": 0.28125, "learning_rate": 5.187110187110187e-07, "loss": 0.0112, "reward": 1.25, "reward_std": 0.3093337416648865, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "step": 463 }, { "completion_length": 96.125, "epoch": 0.48232848232848236, "grad_norm": 3.5339746320132837, "kl": 0.25, "learning_rate": 5.176715176715176e-07, "loss": 0.01, "reward": 1.59375, "reward_std": 0.35247981548309326, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 0.984375, "step": 464 }, { "completion_length": 93.203125, "epoch": 0.48336798336798337, "grad_norm": 2.385570672360844, "kl": 0.34375, "learning_rate": 5.166320166320166e-07, "loss": 0.0137, "reward": 1.3125, "reward_std": 0.34436753392219543, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.9375, "step": 465 }, { "completion_length": 83.984375, "epoch": 0.48440748440748443, "grad_norm": 2.648103730611235, "kl": 0.294921875, "learning_rate": 5.155925155925156e-07, "loss": 0.0118, "reward": 1.515625, "reward_std": 0.3625878393650055, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 466 }, { "completion_length": 97.28125, "epoch": 0.48544698544698545, "grad_norm": 2.4851370985704455, "kl": 0.3359375, "learning_rate": 5.145530145530145e-07, "loss": 0.0134, "reward": 1.21875, "reward_std": 0.4253978729248047, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.90625, "step": 467 }, { "completion_length": 96.953125, "epoch": 0.4864864864864865, "grad_norm": 2.8964368585635203, "kl": 0.30078125, "learning_rate": 5.135135135135134e-07, "loss": 0.0121, "reward": 1.234375, "reward_std": 0.42256343364715576, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9375, "step": 468 }, { "completion_length": 80.359375, "epoch": 0.4875259875259875, "grad_norm": 2.261788348494901, "kl": 0.498046875, "learning_rate": 5.124740124740125e-07, "loss": 0.0199, "reward": 1.203125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 1.0, "step": 469 }, { "completion_length": 78.0, "epoch": 0.4885654885654886, "grad_norm": 3.6075827452782496, "kl": 0.376953125, "learning_rate": 5.114345114345114e-07, "loss": 0.0151, "reward": 1.234375, "reward_std": 0.3492845892906189, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.96875, "step": 470 }, { "completion_length": 96.28125, "epoch": 0.4896049896049896, "grad_norm": 1.8495777069870327, "kl": 0.2275390625, "learning_rate": 5.103950103950103e-07, "loss": 0.0091, "reward": 1.3125, "reward_std": 0.19727617502212524, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 471 }, { "completion_length": 94.65625, "epoch": 0.49064449064449067, "grad_norm": 3.126827409885967, "kl": 0.30859375, "learning_rate": 5.093555093555093e-07, "loss": 0.0123, "reward": 1.296875, "reward_std": 0.36469969153404236, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.96875, "step": 472 }, { "completion_length": 91.015625, "epoch": 0.4916839916839917, "grad_norm": 2.7068994418592007, "kl": 0.2578125, "learning_rate": 5.083160083160083e-07, "loss": 0.0103, "reward": 1.25, "reward_std": 0.2435920089483261, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "step": 473 }, { "completion_length": 75.234375, "epoch": 0.49272349272349275, "grad_norm": 3.690807450494717, "kl": 0.392578125, "learning_rate": 5.072765072765072e-07, "loss": 0.0157, "reward": 1.359375, "reward_std": 0.5318273901939392, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.953125, "step": 474 }, { "completion_length": 79.234375, "epoch": 0.49376299376299376, "grad_norm": 2.9852234657786534, "kl": 0.201171875, "learning_rate": 5.062370062370062e-07, "loss": 0.0081, "reward": 1.484375, "reward_std": 0.3776973485946655, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 475 }, { "completion_length": 91.671875, "epoch": 0.49480249480249483, "grad_norm": 3.254822964296011, "kl": 0.33203125, "learning_rate": 5.051975051975051e-07, "loss": 0.0133, "reward": 1.3125, "reward_std": 0.3335031569004059, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.96875, "step": 476 }, { "completion_length": 115.8125, "epoch": 0.49584199584199584, "grad_norm": 2.482414970059199, "kl": 0.310546875, "learning_rate": 5.041580041580041e-07, "loss": 0.0124, "reward": 1.15625, "reward_std": 0.38877052068710327, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9375, "step": 477 }, { "completion_length": 85.09375, "epoch": 0.4968814968814969, "grad_norm": 2.06400763332481, "kl": 0.462890625, "learning_rate": 5.031185031185031e-07, "loss": 0.0185, "reward": 1.296875, "reward_std": 0.18139132857322693, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.953125, "step": 478 }, { "completion_length": 96.796875, "epoch": 0.4979209979209979, "grad_norm": 2.560267335188356, "kl": 0.333984375, "learning_rate": 5.02079002079002e-07, "loss": 0.0134, "reward": 1.3125, "reward_std": 0.2834492325782776, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.953125, "step": 479 }, { "completion_length": 76.34375, "epoch": 0.498960498960499, "grad_norm": 2.6385468443630535, "kl": 0.25, "learning_rate": 5.010395010395009e-07, "loss": 0.01, "reward": 1.1875, "reward_std": 0.29143065214157104, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.984375, "step": 480 }, { "completion_length": 81.75, "epoch": 0.5, "grad_norm": 3.1516343456163156, "kl": 0.373046875, "learning_rate": 5e-07, "loss": 0.0149, "reward": 1.484375, "reward_std": 0.33669841289520264, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 481 }, { "completion_length": 90.6875, "epoch": 0.501039501039501, "grad_norm": 2.128953957337744, "kl": 0.337890625, "learning_rate": 4.98960498960499e-07, "loss": 0.0136, "reward": 1.265625, "reward_std": 0.24039676785469055, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 482 }, { "completion_length": 94.53125, "epoch": 0.502079002079002, "grad_norm": 3.7974425504982268, "kl": 0.287109375, "learning_rate": 4.979209979209979e-07, "loss": 0.0115, "reward": 1.203125, "reward_std": 0.4050365090370178, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.984375, "step": 483 }, { "completion_length": 86.5, "epoch": 0.5031185031185031, "grad_norm": 3.2586201347911317, "kl": 0.44140625, "learning_rate": 4.968814968814969e-07, "loss": 0.0177, "reward": 1.453125, "reward_std": 0.28930896520614624, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.96875, "step": 484 }, { "completion_length": 93.5, "epoch": 0.5041580041580042, "grad_norm": 2.4003917159149206, "kl": 0.1650390625, "learning_rate": 4.958419958419958e-07, "loss": 0.0066, "reward": 1.5, "reward_std": 0.2346404641866684, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.96875, "step": 485 }, { "completion_length": 93.546875, "epoch": 0.5051975051975052, "grad_norm": 2.7610527291032985, "kl": 0.388671875, "learning_rate": 4.948024948024948e-07, "loss": 0.0156, "reward": 1.34375, "reward_std": 0.32805800437927246, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.890625, "step": 486 }, { "completion_length": 94.6875, "epoch": 0.5062370062370062, "grad_norm": 2.8750273006197253, "kl": 0.328125, "learning_rate": 4.937629937629938e-07, "loss": 0.0131, "reward": 1.328125, "reward_std": 0.32407689094543457, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.953125, "step": 487 }, { "completion_length": 80.59375, "epoch": 0.5072765072765073, "grad_norm": 3.2641362910819307, "kl": 0.29296875, "learning_rate": 4.927234927234927e-07, "loss": 0.0118, "reward": 1.40625, "reward_std": 0.3593195378780365, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 488 }, { "completion_length": 91.015625, "epoch": 0.5083160083160083, "grad_norm": 2.738467165143027, "kl": 0.314453125, "learning_rate": 4.916839916839916e-07, "loss": 0.0126, "reward": 1.40625, "reward_std": 0.3370095491409302, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.96875, "step": 489 }, { "completion_length": 89.140625, "epoch": 0.5093555093555093, "grad_norm": 2.4532223007133855, "kl": 0.34765625, "learning_rate": 4.906444906444907e-07, "loss": 0.0139, "reward": 1.203125, "reward_std": 0.2472364604473114, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.921875, "step": 490 }, { "completion_length": 87.90625, "epoch": 0.5103950103950103, "grad_norm": 3.2691402369394837, "kl": 0.3203125, "learning_rate": 4.896049896049896e-07, "loss": 0.0128, "reward": 1.328125, "reward_std": 0.4926879107952118, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.921875, "step": 491 }, { "completion_length": 80.515625, "epoch": 0.5114345114345115, "grad_norm": 2.5629407452852226, "kl": 0.296875, "learning_rate": 4.885654885654885e-07, "loss": 0.0119, "reward": 1.359375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 1.0, "step": 492 }, { "completion_length": 81.515625, "epoch": 0.5124740124740125, "grad_norm": 3.371024498137097, "kl": 0.255859375, "learning_rate": 4.875259875259875e-07, "loss": 0.0103, "reward": 1.453125, "reward_std": 0.4071483612060547, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 0.984375, "step": 493 }, { "completion_length": 101.34375, "epoch": 0.5135135135135135, "grad_norm": 2.3568217662693627, "kl": 0.26171875, "learning_rate": 4.864864864864865e-07, "loss": 0.0105, "reward": 1.375, "reward_std": 0.3335031569004059, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.953125, "step": 494 }, { "completion_length": 84.984375, "epoch": 0.5145530145530145, "grad_norm": 2.6792763797695187, "kl": 0.287109375, "learning_rate": 4.854469854469854e-07, "loss": 0.0115, "reward": 1.421875, "reward_std": 0.2777610719203949, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.96875, "step": 495 }, { "completion_length": 89.390625, "epoch": 0.5155925155925156, "grad_norm": 2.913629954308741, "kl": 0.5234375, "learning_rate": 4.844074844074844e-07, "loss": 0.021, "reward": 1.328125, "reward_std": 0.37298911809921265, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.984375, "step": 496 }, { "completion_length": 81.59375, "epoch": 0.5166320166320166, "grad_norm": 2.526678124230585, "kl": 0.37890625, "learning_rate": 4.833679833679833e-07, "loss": 0.0152, "reward": 1.59375, "reward_std": 0.2540663480758667, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 497 }, { "completion_length": 71.546875, "epoch": 0.5176715176715176, "grad_norm": 3.9520347407545153, "kl": 0.34765625, "learning_rate": 4.823284823284823e-07, "loss": 0.0139, "reward": 1.3125, "reward_std": 0.39820659160614014, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.984375, "step": 498 }, { "completion_length": 102.984375, "epoch": 0.5187110187110187, "grad_norm": 2.3228819463883505, "kl": 0.46484375, "learning_rate": 4.812889812889813e-07, "loss": 0.0187, "reward": 1.25, "reward_std": 0.2540663480758667, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "step": 499 }, { "completion_length": 90.203125, "epoch": 0.5197505197505198, "grad_norm": 2.5314415175145304, "kl": 0.31640625, "learning_rate": 4.802494802494802e-07, "loss": 0.0127, "reward": 1.578125, "reward_std": 0.43556109070777893, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.953125, "step": 500 } ], "logging_steps": 1.0, "max_steps": 962, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }