| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2571.2083587646484, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.15054196119308472, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0802, | |
| "reward": 0.4897647276520729, | |
| "reward_std": 0.8290339335799217, | |
| "rewards/cosine_scaled_reward": -0.015534311532974243, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2804.395881652832, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.06441052258014679, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0258, | |
| "reward": 0.27539755403995514, | |
| "reward_std": 0.42092563211917877, | |
| "rewards/cosine_scaled_reward": -0.04980122856795788, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3410.2083435058594, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.06412151455879211, | |
| "kl": 3.890693187713623e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0187, | |
| "reward": -0.3642676221206784, | |
| "reward_std": 0.3840297982096672, | |
| "rewards/cosine_scaled_reward": -0.23421715013682842, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 2242.2916870117188, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.163053959608078, | |
| "kl": 4.735589027404785e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0732, | |
| "reward": 0.5906303785741329, | |
| "reward_std": 0.9497254565358162, | |
| "rewards/cosine_scaled_reward": -0.027601496782153845, | |
| "rewards/format_reward": 0.645833333954215, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3097.9584045410156, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.18420223891735077, | |
| "kl": 4.886835813522339e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.044, | |
| "reward": 0.26460691541433334, | |
| "reward_std": 0.82295261323452, | |
| "rewards/cosine_scaled_reward": -0.08644656091928482, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 3196.479232788086, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.16443675756454468, | |
| "kl": 4.808604717254639e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0233, | |
| "reward": 0.023200208321213722, | |
| "reward_std": 0.8427017889916897, | |
| "rewards/cosine_scaled_reward": -0.16548323072493076, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 2956.916748046875, | |
| "epoch": 0.008, | |
| "grad_norm": 0.12215663492679596, | |
| "kl": 2.5171786546707153e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0395, | |
| "reward": 0.3668882008641958, | |
| "reward_std": 0.6829603910446167, | |
| "rewards/cosine_scaled_reward": -0.09780590422451496, | |
| "rewards/format_reward": 0.5625000093132257, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2590.791702270508, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.1336221843957901, | |
| "kl": 1.7095357179641724e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0327, | |
| "reward": 0.6794745922088623, | |
| "reward_std": 0.7160088084638119, | |
| "rewards/cosine_scaled_reward": 0.10015395213849843, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 2873.104248046875, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.23665077984333038, | |
| "kl": 5.224347114562988e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.1654, | |
| "reward": 0.2612099088728428, | |
| "reward_std": 0.9552066549658775, | |
| "rewards/cosine_scaled_reward": -0.07772838324308395, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2782.5000534057617, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.12416314333677292, | |
| "kl": 3.0666589736938477e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0529, | |
| "reward": 0.20608006697148085, | |
| "reward_std": 0.8456610515713692, | |
| "rewards/cosine_scaled_reward": -0.08445997314993292, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3373.1041870117188, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.09285606443881989, | |
| "kl": 3.6150217056274414e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0318, | |
| "reward": -0.3522867253050208, | |
| "reward_std": 0.49222037196159363, | |
| "rewards/cosine_scaled_reward": -0.22822669660672545, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2749.6459350585938, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.14122599363327026, | |
| "kl": 4.419684410095215e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.04, | |
| "reward": 0.4197032814845443, | |
| "reward_std": 0.8322520144283772, | |
| "rewards/cosine_scaled_reward": -0.06098170578479767, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 3078.3333587646484, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.10787954181432724, | |
| "kl": 3.547407686710358e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0205, | |
| "reward": 0.20099145034328103, | |
| "reward_std": 0.6062487177550793, | |
| "rewards/cosine_scaled_reward": -0.1078376192599535, | |
| "rewards/format_reward": 0.41666668467223644, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2948.229232788086, | |
| "epoch": 0.016, | |
| "grad_norm": 0.15190070867538452, | |
| "kl": 3.463774919509888e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0807, | |
| "reward": 0.04745530150830746, | |
| "reward_std": 0.8162283934652805, | |
| "rewards/cosine_scaled_reward": -0.1429390194825828, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2853.750026702881, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.08899455517530441, | |
| "kl": 2.139061689376831e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0106, | |
| "reward": 0.4579934204230085, | |
| "reward_std": 0.6455020261928439, | |
| "rewards/cosine_scaled_reward": 0.04149670549668372, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3580.6666870117188, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.1100376769900322, | |
| "kl": 3.6090612411499023e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0022, | |
| "reward": -0.2719184570014477, | |
| "reward_std": 0.6123667694628239, | |
| "rewards/cosine_scaled_reward": -0.167209230363369, | |
| "rewards/format_reward": 0.06250000186264515, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 2259.93754196167, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.1220458522439003, | |
| "kl": 4.4733285903930664e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": -0.0002, | |
| "reward": 0.6442119255661964, | |
| "reward_std": 0.7075937427580357, | |
| "rewards/cosine_scaled_reward": 0.02002262556925416, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2983.4167098999023, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.1742384135723114, | |
| "kl": 2.3670494556427002e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.049, | |
| "reward": 0.05130944773554802, | |
| "reward_std": 0.7180995307862759, | |
| "rewards/cosine_scaled_reward": -0.14101194869726896, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 2910.166717529297, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.17305254936218262, | |
| "kl": 2.6881694793701172e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0581, | |
| "reward": 0.49342919746413827, | |
| "reward_std": 0.8945518247783184, | |
| "rewards/cosine_scaled_reward": 0.038381271064281464, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2444.6875534057617, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.11907866597175598, | |
| "kl": 1.6576610505580902e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0516, | |
| "reward": 0.5244868360459805, | |
| "reward_std": 0.6065918765962124, | |
| "rewards/cosine_scaled_reward": -0.06067326734773815, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2736.583366394043, | |
| "epoch": 0.024, | |
| "grad_norm": 0.13774950802326202, | |
| "kl": 3.9674341678619385e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0754, | |
| "reward": 0.366255359724164, | |
| "reward_std": 0.786641189828515, | |
| "rewards/cosine_scaled_reward": -0.014788982225582004, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 2147.479202270508, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.1683036983013153, | |
| "kl": 2.2970139980316162e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0518, | |
| "reward": 0.5381300672888756, | |
| "reward_std": 0.6436120271682739, | |
| "rewards/cosine_scaled_reward": -0.04343497380614281, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2571.854232788086, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.11163962632417679, | |
| "kl": 3.069639205932617e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0962, | |
| "reward": 0.03858701325953007, | |
| "reward_std": 0.5964515460655093, | |
| "rewards/cosine_scaled_reward": -0.1890398357063532, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2990.0625915527344, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.15436813235282898, | |
| "kl": 2.1535903215408325e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0679, | |
| "reward": 0.5357824601233006, | |
| "reward_std": 0.9760596714913845, | |
| "rewards/cosine_scaled_reward": 0.02830787282437086, | |
| "rewards/format_reward": 0.47916668467223644, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2695.937545776367, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.09400183707475662, | |
| "kl": 2.316199243068695e-05, | |
| "learning_rate": 5e-07, | |
| "loss": -0.0193, | |
| "reward": 0.24818142130970955, | |
| "reward_std": 0.7111284770071507, | |
| "rewards/cosine_scaled_reward": -0.11549262329936028, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 2949.6041870117188, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.062154464423656464, | |
| "kl": 2.1763145923614502e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0279, | |
| "reward": 0.5066877827048302, | |
| "reward_std": 0.5474561750888824, | |
| "rewards/cosine_scaled_reward": 0.003343891352415085, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 2975.3750610351562, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.15270784497261047, | |
| "kl": 1.735985279083252e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0572, | |
| "reward": 0.3573550535365939, | |
| "reward_std": 0.7755850367248058, | |
| "rewards/cosine_scaled_reward": -0.060905810445547104, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 2745.7083435058594, | |
| "epoch": 0.032, | |
| "grad_norm": 0.14373046159744263, | |
| "kl": 2.94586643576622e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0143, | |
| "reward": 0.6891938149929047, | |
| "reward_std": 0.9697537384927273, | |
| "rewards/cosine_scaled_reward": 0.06334691727533937, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3157.666732788086, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.21617551147937775, | |
| "kl": 2.562999725341797e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0941, | |
| "reward": -0.19576303288340569, | |
| "reward_std": 0.6964554395526648, | |
| "rewards/cosine_scaled_reward": -0.222881518304348, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 3055.3751220703125, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.21643611788749695, | |
| "kl": 2.6524066925048828e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.1293, | |
| "reward": 0.2991267549805343, | |
| "reward_std": 1.1622500345110893, | |
| "rewards/cosine_scaled_reward": -0.05876995751168579, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 3113.7708587646484, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.10899780690670013, | |
| "kl": 1.920759677886963e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0359, | |
| "reward": -0.09752624668180943, | |
| "reward_std": 0.6536959744989872, | |
| "rewards/cosine_scaled_reward": -0.18417980521917343, | |
| "rewards/format_reward": 0.27083334513008595, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3302.000030517578, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.13032931089401245, | |
| "kl": 2.7488917112350464e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0245, | |
| "reward": 0.240218386054039, | |
| "reward_std": 0.6280976049602032, | |
| "rewards/cosine_scaled_reward": -0.036140820011496544, | |
| "rewards/format_reward": 0.31250001303851604, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3349.3333740234375, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.10642867535352707, | |
| "kl": 4.164688289165497e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0126, | |
| "reward": 0.12095527164638042, | |
| "reward_std": 0.7586401477456093, | |
| "rewards/cosine_scaled_reward": -0.08535570465028286, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2513.687515258789, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.32349422574043274, | |
| "kl": 0.00012987852096557617, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0582, | |
| "reward": 0.5865504257380962, | |
| "reward_std": 0.9055213071405888, | |
| "rewards/cosine_scaled_reward": 0.04327519703656435, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3105.625045776367, | |
| "epoch": 0.04, | |
| "grad_norm": 0.15311317145824432, | |
| "kl": 9.425729513168335e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0384, | |
| "reward": 0.030300017446279526, | |
| "reward_std": 0.9541528224945068, | |
| "rewards/cosine_scaled_reward": -0.13068332930561155, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 3456.0208740234375, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.06633453816175461, | |
| "kl": 9.938329458236694e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.032, | |
| "reward": -0.4127392489463091, | |
| "reward_std": 0.373018104583025, | |
| "rewards/cosine_scaled_reward": -0.2480362793430686, | |
| "rewards/format_reward": 0.0833333358168602, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3322.0833740234375, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.06093796342611313, | |
| "kl": 5.175359547138214e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0138, | |
| "reward": -0.3342415885999799, | |
| "reward_std": 0.39543480053544044, | |
| "rewards/cosine_scaled_reward": -0.2712874598801136, | |
| "rewards/format_reward": 0.20833334140479565, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3248.187530517578, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.10319728404283524, | |
| "kl": 9.037554264068604e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0155, | |
| "reward": -0.1099930020282045, | |
| "reward_std": 0.6748690903186798, | |
| "rewards/cosine_scaled_reward": -0.14874650537967682, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2824.7916870117188, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.11912386864423752, | |
| "kl": 9.545683860778809e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0342, | |
| "reward": 0.3973412849009037, | |
| "reward_std": 0.5866867541335523, | |
| "rewards/cosine_scaled_reward": -0.0409126803278923, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2677.250015258789, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.1012558713555336, | |
| "kl": 0.00043101049959659576, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0274, | |
| "reward": 0.27511681243777275, | |
| "reward_std": 0.5241224151104689, | |
| "rewards/cosine_scaled_reward": -0.08119158074259758, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 2805.875045776367, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.15722188353538513, | |
| "kl": 0.00016229506582021713, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0704, | |
| "reward": 0.07507334044203162, | |
| "reward_std": 0.706162091344595, | |
| "rewards/cosine_scaled_reward": -0.19162999838590622, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2796.5000038146973, | |
| "epoch": 0.048, | |
| "grad_norm": 0.07421410083770752, | |
| "kl": 7.737800478935242e-05, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0009, | |
| "reward": -0.17676172463689, | |
| "reward_std": 0.4381077494472265, | |
| "rewards/cosine_scaled_reward": -0.2654642015695572, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 3021.3958587646484, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.15121984481811523, | |
| "kl": 0.00015115737915039062, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0418, | |
| "reward": 0.11950396373867989, | |
| "reward_std": 0.7922810819000006, | |
| "rewards/cosine_scaled_reward": -0.07566470839083195, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2888.979179382324, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.1331688016653061, | |
| "kl": 0.00041909515857696533, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0234, | |
| "reward": 0.3374018808826804, | |
| "reward_std": 0.7487543746829033, | |
| "rewards/cosine_scaled_reward": -0.029215732589364052, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3375.562530517578, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.1241455152630806, | |
| "kl": 0.00021854229271411896, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0205, | |
| "reward": 0.1915082884952426, | |
| "reward_std": 0.745365809649229, | |
| "rewards/cosine_scaled_reward": -0.02924584597349167, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3235.4791717529297, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.08959861844778061, | |
| "kl": 0.0004153698682785034, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0007, | |
| "reward": -0.27040275279432535, | |
| "reward_std": 0.5466338861733675, | |
| "rewards/cosine_scaled_reward": -0.20811804989352822, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 3032.6875915527344, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.1654578596353531, | |
| "kl": 0.0002987794578075409, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0742, | |
| "reward": 0.2661336697638035, | |
| "reward_std": 0.9970342293381691, | |
| "rewards/cosine_scaled_reward": -0.06484984699636698, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2870.8541946411133, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.08487487584352493, | |
| "kl": 0.0008619073778390884, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0052, | |
| "reward": 0.021469716913998127, | |
| "reward_std": 0.5047157257795334, | |
| "rewards/cosine_scaled_reward": -0.14551514480262995, | |
| "rewards/format_reward": 0.3125, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2296.166717529297, | |
| "epoch": 0.056, | |
| "grad_norm": 0.16738910973072052, | |
| "kl": 0.0007393211126327515, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0039, | |
| "reward": 0.7408803049474955, | |
| "reward_std": 0.8585928715765476, | |
| "rewards/cosine_scaled_reward": 0.047523480374366045, | |
| "rewards/format_reward": 0.645833345130086, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 3003.729179382324, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.12555623054504395, | |
| "kl": 0.000688605010509491, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0619, | |
| "reward": 0.36810495844110847, | |
| "reward_std": 0.6655636876821518, | |
| "rewards/cosine_scaled_reward": 0.00696912594139576, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2403.270851135254, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.096031554043293, | |
| "kl": 0.0029038935899734497, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0161, | |
| "reward": 0.41045723855495453, | |
| "reward_std": 0.5240885466337204, | |
| "rewards/cosine_scaled_reward": -0.04477138537913561, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 3050.3542289733887, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.13251036405563354, | |
| "kl": 0.00112837553024292, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0229, | |
| "reward": 0.4021341912448406, | |
| "reward_std": 0.8952255919575691, | |
| "rewards/cosine_scaled_reward": 0.02398374956101179, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2802.5000610351562, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.16859768331050873, | |
| "kl": 0.012136131525039673, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0314, | |
| "reward": 0.4030042998492718, | |
| "reward_std": 0.9372089132666588, | |
| "rewards/cosine_scaled_reward": -0.04849785100668669, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2765.104232788086, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.12892229855060577, | |
| "kl": 0.0006947778165340424, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0521, | |
| "reward": 0.7572112157940865, | |
| "reward_std": 0.8469797167927027, | |
| "rewards/cosine_scaled_reward": 0.11818893579766154, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 3193.9584045410156, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.13522343337535858, | |
| "kl": 0.002847835421562195, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0282, | |
| "reward": 0.3383600414963439, | |
| "reward_std": 0.9050974175333977, | |
| "rewards/cosine_scaled_reward": 0.0025133611634373665, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2973.625030517578, | |
| "epoch": 0.064, | |
| "grad_norm": 0.10659506171941757, | |
| "kl": 0.0008548498153686523, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.057, | |
| "reward": 0.15444774832576513, | |
| "reward_std": 0.6975248530507088, | |
| "rewards/cosine_scaled_reward": -0.09985947422683239, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3409.3959045410156, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.109820656478405, | |
| "kl": 0.000378340482711792, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0353, | |
| "reward": -0.1370320685673505, | |
| "reward_std": 0.6953500900417566, | |
| "rewards/cosine_scaled_reward": -0.183099371381104, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 2256.270866394043, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.14203794300556183, | |
| "kl": 0.011088848114013672, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0796, | |
| "reward": 0.6106004565954208, | |
| "reward_std": 0.8679536152631044, | |
| "rewards/cosine_scaled_reward": -0.007199776358902454, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 3019.0625610351562, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.17194640636444092, | |
| "kl": 0.0011185333132743835, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0533, | |
| "reward": 0.3943455405533314, | |
| "reward_std": 0.8865922130644321, | |
| "rewards/cosine_scaled_reward": 0.00967277493327856, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 2972.3333892822266, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.14659450948238373, | |
| "kl": 0.001021549105644226, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0618, | |
| "reward": 0.3112136572599411, | |
| "reward_std": 0.991888377815485, | |
| "rewards/cosine_scaled_reward": -0.0527265154523775, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 3116.5208740234375, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.14671690762043, | |
| "kl": 0.001110263168811798, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0815, | |
| "reward": 0.27738364040851593, | |
| "reward_std": 0.800424050539732, | |
| "rewards/cosine_scaled_reward": -0.06964152306318283, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2748.4583740234375, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.12405635416507721, | |
| "kl": 0.004500046372413635, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0233, | |
| "reward": 0.5339981620199978, | |
| "reward_std": 0.8481226451694965, | |
| "rewards/cosine_scaled_reward": 0.016999082639813423, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2458.0625610351562, | |
| "epoch": 0.072, | |
| "grad_norm": 0.2264307290315628, | |
| "kl": 0.0018923580646514893, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.1231, | |
| "reward": 0.7813117974437773, | |
| "reward_std": 1.1678522191941738, | |
| "rewards/cosine_scaled_reward": 0.06773922173306346, | |
| "rewards/format_reward": 0.645833345130086, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2938.3541870117188, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.18781885504722595, | |
| "kl": 0.0028982162475585938, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0713, | |
| "reward": 0.4028865471482277, | |
| "reward_std": 1.0138535592705011, | |
| "rewards/cosine_scaled_reward": -0.006890069227665663, | |
| "rewards/format_reward": 0.41666668094694614, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2938.270854949951, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.11171157658100128, | |
| "kl": 0.0012230873107910156, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0103, | |
| "reward": 0.3659288566559553, | |
| "reward_std": 0.6721893213689327, | |
| "rewards/cosine_scaled_reward": -0.02536890748888254, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2261.8958473205566, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.07533268630504608, | |
| "kl": 0.004939556121826172, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0008, | |
| "reward": 0.44019365310668945, | |
| "reward_std": 0.5285264812409878, | |
| "rewards/cosine_scaled_reward": -0.01948651857674122, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3491.125030517578, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.08121506124734879, | |
| "kl": 0.001986861228942871, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0262, | |
| "reward": -0.5231649484485388, | |
| "reward_std": 0.4413683470338583, | |
| "rewards/cosine_scaled_reward": -0.3032491412013769, | |
| "rewards/format_reward": 0.0833333358168602, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 2320.104202270508, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.13665203750133514, | |
| "kl": 0.009753227233886719, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.103, | |
| "reward": 0.41882045567035675, | |
| "reward_std": 0.8789598569273949, | |
| "rewards/cosine_scaled_reward": -0.0614231089130044, | |
| "rewards/format_reward": 0.5416666697710752, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2683.520866394043, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.12382709234952927, | |
| "kl": 0.005780220031738281, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.029, | |
| "reward": 0.05852051172405481, | |
| "reward_std": 0.7762247212231159, | |
| "rewards/cosine_scaled_reward": -0.17907308926805854, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3093.500030517578, | |
| "epoch": 0.08, | |
| "grad_norm": 0.0833624005317688, | |
| "kl": 0.002095341682434082, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0049, | |
| "reward": 0.1458067416679114, | |
| "reward_std": 0.6252293065190315, | |
| "rewards/cosine_scaled_reward": -0.11459663603454828, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2787.000030517578, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.10387802124023438, | |
| "kl": 0.00287473201751709, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0334, | |
| "reward": 0.3234255127608776, | |
| "reward_std": 0.6192433759570122, | |
| "rewards/cosine_scaled_reward": -0.00495389848947525, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 3198.1458892822266, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.16788320243358612, | |
| "kl": 0.0061588287353515625, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0888, | |
| "reward": -0.16125928983092308, | |
| "reward_std": 0.6868584919720888, | |
| "rewards/cosine_scaled_reward": -0.18479630933143198, | |
| "rewards/format_reward": 0.20833334140479565, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3534.8125, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.1018107533454895, | |
| "kl": 0.0011434555053710938, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0053, | |
| "reward": -0.12906201742589474, | |
| "reward_std": 0.6659317016601562, | |
| "rewards/cosine_scaled_reward": -0.1478643520968035, | |
| "rewards/format_reward": 0.16666666977107525, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 3263.2708435058594, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.08958863466978073, | |
| "kl": 0.0028752684593200684, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0343, | |
| "reward": -0.09894978068768978, | |
| "reward_std": 0.5172175895422697, | |
| "rewards/cosine_scaled_reward": -0.15364155592396855, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 3089.437530517578, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.08952262997627258, | |
| "kl": 0.004845738410949707, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0376, | |
| "reward": 0.3134958976879716, | |
| "reward_std": 0.6024076864123344, | |
| "rewards/cosine_scaled_reward": -0.009918726980686188, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2827.937545776367, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.14939860999584198, | |
| "kl": 0.0015323758125305176, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0686, | |
| "reward": 0.09713686350733042, | |
| "reward_std": 0.6034146882593632, | |
| "rewards/cosine_scaled_reward": -0.18059824593365192, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3079.0208892822266, | |
| "epoch": 0.088, | |
| "grad_norm": 0.06953620910644531, | |
| "kl": 0.00173109769821167, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": -0.0009, | |
| "reward": 0.11226777359843254, | |
| "reward_std": 0.44494798220694065, | |
| "rewards/cosine_scaled_reward": -0.12094944715499878, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 3188.3958892822266, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.1326863169670105, | |
| "kl": 0.0019420385360717773, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0581, | |
| "reward": 0.1064312756061554, | |
| "reward_std": 0.6707769315689802, | |
| "rewards/cosine_scaled_reward": -0.0822010301053524, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2369.7291831970215, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.0872310996055603, | |
| "kl": 0.0035691261291503906, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0099, | |
| "reward": 0.20641303062438965, | |
| "reward_std": 0.6631737053394318, | |
| "rewards/cosine_scaled_reward": -0.15721015818417072, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3369.7708435058594, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.14515474438667297, | |
| "kl": 0.0033063888549804688, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0147, | |
| "reward": -0.007383164018392563, | |
| "reward_std": 0.648245201446116, | |
| "rewards/cosine_scaled_reward": -0.12869158759713173, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3120.166679382324, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.12892520427703857, | |
| "kl": 0.0076847076416015625, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0038, | |
| "reward": -0.025072289630770683, | |
| "reward_std": 0.6095977611839771, | |
| "rewards/cosine_scaled_reward": -0.15836948156356812, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2897.0208740234375, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.22404946386814117, | |
| "kl": 0.0032821297645568848, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0577, | |
| "reward": 0.3680579289793968, | |
| "reward_std": 0.6173720192164183, | |
| "rewards/cosine_scaled_reward": -0.013887699693441391, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2878.812515258789, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.07131865620613098, | |
| "kl": 0.0026471614837646484, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": -0.0005, | |
| "reward": 0.055370867252349854, | |
| "reward_std": 0.513982892036438, | |
| "rewards/cosine_scaled_reward": -0.11814789846539497, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 3116.8958740234375, | |
| "epoch": 0.096, | |
| "grad_norm": 0.11975695192813873, | |
| "kl": 0.0038411617279052734, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0425, | |
| "reward": 0.42375171184539795, | |
| "reward_std": 0.7906983122229576, | |
| "rewards/cosine_scaled_reward": 0.02437583915889263, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 3274.5208435058594, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.15465512871742249, | |
| "kl": 0.0028481483459472656, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0083, | |
| "reward": 0.08125680079683661, | |
| "reward_std": 0.9014002867043018, | |
| "rewards/cosine_scaled_reward": -0.13645494263619184, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2986.687530517578, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.10517808794975281, | |
| "kl": 0.004029273986816406, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0243, | |
| "reward": 0.2641464173793793, | |
| "reward_std": 0.558088131248951, | |
| "rewards/cosine_scaled_reward": -0.05542680807411671, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 3038.0833740234375, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.14221909642219543, | |
| "kl": 0.00909280776977539, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0763, | |
| "reward": 0.03238655626773834, | |
| "reward_std": 0.7338373847305775, | |
| "rewards/cosine_scaled_reward": -0.16089007258415222, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 3097.604217529297, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.18620258569717407, | |
| "kl": 0.010651111602783203, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.094, | |
| "reward": 0.319759342353791, | |
| "reward_std": 0.9601002521812916, | |
| "rewards/cosine_scaled_reward": -0.027620327193289995, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3288.8125610351562, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.15396399796009064, | |
| "kl": 0.004711151123046875, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0457, | |
| "reward": 0.19393361918628216, | |
| "reward_std": 0.8315184041857719, | |
| "rewards/cosine_scaled_reward": -0.048866515047848225, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2598.083351135254, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.0603555403649807, | |
| "kl": 0.010183334350585938, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0085, | |
| "reward": -0.08365453220903873, | |
| "reward_std": 0.4098529051989317, | |
| "rewards/cosine_scaled_reward": -0.27099394612014294, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 3258.8541870117188, | |
| "epoch": 0.104, | |
| "grad_norm": 0.18056602776050568, | |
| "kl": 0.005778074264526367, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0472, | |
| "reward": 0.1628702199086547, | |
| "reward_std": 0.9603168293833733, | |
| "rewards/cosine_scaled_reward": -0.07481488771736622, | |
| "rewards/format_reward": 0.31250001303851604, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 3047.166748046875, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.12348229438066483, | |
| "kl": 0.008596420288085938, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0333, | |
| "reward": 0.3812000777106732, | |
| "reward_std": 0.7860049642622471, | |
| "rewards/cosine_scaled_reward": -0.028149965219199657, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3517.0416870117188, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.08583661168813705, | |
| "kl": 0.0059413909912109375, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0176, | |
| "reward": -0.5209241807460785, | |
| "reward_std": 0.38511228561401367, | |
| "rewards/cosine_scaled_reward": -0.281295420601964, | |
| "rewards/format_reward": 0.0416666679084301, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 3197.7291870117188, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.10561229288578033, | |
| "kl": 0.010333061218261719, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.039, | |
| "reward": 0.015513140708208084, | |
| "reward_std": 0.7152232564985752, | |
| "rewards/cosine_scaled_reward": -0.11724343802779913, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3515.3333435058594, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.10697196424007416, | |
| "kl": 0.002697467803955078, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0037, | |
| "reward": -0.2317026201635599, | |
| "reward_std": 0.646982979029417, | |
| "rewards/cosine_scaled_reward": -0.1783513177651912, | |
| "rewards/format_reward": 0.1250000037252903, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 3197.041732788086, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.11685776710510254, | |
| "kl": 0.008313179016113281, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0622, | |
| "reward": 0.04121094010770321, | |
| "reward_std": 0.7906505540013313, | |
| "rewards/cosine_scaled_reward": -0.10439453413709998, | |
| "rewards/format_reward": 0.25000000931322575, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 3248.1666870117188, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.10923011600971222, | |
| "kl": 0.004992485046386719, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0214, | |
| "reward": 0.23345047235488892, | |
| "reward_std": 0.6134117320179939, | |
| "rewards/cosine_scaled_reward": -0.04994143359363079, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 3227.6458740234375, | |
| "epoch": 0.112, | |
| "grad_norm": 0.09419604390859604, | |
| "kl": 0.003627777099609375, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0441, | |
| "reward": 0.07032760046422482, | |
| "reward_std": 0.5467477329075336, | |
| "rewards/cosine_scaled_reward": -0.12108622305095196, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2894.3541717529297, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.09645549207925797, | |
| "kl": 0.00807952880859375, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0198, | |
| "reward": 0.19703226536512375, | |
| "reward_std": 0.5642017982900143, | |
| "rewards/cosine_scaled_reward": -0.06815055944025517, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2980.4583740234375, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.10959184169769287, | |
| "kl": 0.009540557861328125, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0112, | |
| "reward": 0.3093057796359062, | |
| "reward_std": 0.7769787572324276, | |
| "rewards/cosine_scaled_reward": -0.03284713625907898, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2883.354217529297, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.12877003848552704, | |
| "kl": 0.005811214447021484, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0381, | |
| "reward": 0.14440507721155882, | |
| "reward_std": 0.5162308318540454, | |
| "rewards/cosine_scaled_reward": -0.09446411859244108, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 3023.1875610351562, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.1877162903547287, | |
| "kl": 0.011671066284179688, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0889, | |
| "reward": 0.22887461259961128, | |
| "reward_std": 0.8127965480089188, | |
| "rewards/cosine_scaled_reward": -0.11472935602068901, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 3156.812545776367, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.15089279413223267, | |
| "kl": 0.010381698608398438, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0498, | |
| "reward": -0.00840279646217823, | |
| "reward_std": 0.8644813783466816, | |
| "rewards/cosine_scaled_reward": -0.16045140800997615, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2804.000015258789, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.1974973976612091, | |
| "kl": 0.006961822509765625, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0379, | |
| "reward": 0.023546243086457253, | |
| "reward_std": 0.5169991590082645, | |
| "rewards/cosine_scaled_reward": -0.17572688311338425, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 3190.7500610351562, | |
| "epoch": 0.12, | |
| "grad_norm": 0.2654437720775604, | |
| "kl": 0.006871223449707031, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.1131, | |
| "reward": 0.33211813122034073, | |
| "reward_std": 1.1573103182017803, | |
| "rewards/cosine_scaled_reward": 0.009809067007154226, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2616.2708435058594, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.13210490345954895, | |
| "kl": 0.006643772125244141, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0729, | |
| "reward": 1.0535442419350147, | |
| "reward_std": 0.7898675352334976, | |
| "rewards/cosine_scaled_reward": 0.21427209861576557, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2912.875030517578, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.12955093383789062, | |
| "kl": 0.0073909759521484375, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0618, | |
| "reward": 0.21932815946638584, | |
| "reward_std": 0.6689837593585253, | |
| "rewards/cosine_scaled_reward": -0.10908591747283936, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 3047.9583740234375, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 14.177762985229492, | |
| "kl": 0.9636068344116211, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.1109, | |
| "reward": 0.3542330916970968, | |
| "reward_std": 0.7520873434841633, | |
| "rewards/cosine_scaled_reward": -0.010383456945419312, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 3088.916717529297, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.10591775923967361, | |
| "kl": 0.0058727264404296875, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0157, | |
| "reward": 0.28438636660575867, | |
| "reward_std": 0.5916559211909771, | |
| "rewards/cosine_scaled_reward": -0.07655682414770126, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 3114.958366394043, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.13596315681934357, | |
| "kl": 0.008614540100097656, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0252, | |
| "reward": 0.21716876979917288, | |
| "reward_std": 0.8383330777287483, | |
| "rewards/cosine_scaled_reward": -0.08933228440582752, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 3459.812530517578, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.13085001707077026, | |
| "kl": 0.009960174560546875, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0291, | |
| "reward": 0.025424662977457047, | |
| "reward_std": 0.7261426411569118, | |
| "rewards/cosine_scaled_reward": -0.0706210074131377, | |
| "rewards/format_reward": 0.16666667349636555, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3242.541717529297, | |
| "epoch": 0.128, | |
| "grad_norm": 0.13919313251972198, | |
| "kl": 0.00591278076171875, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0053, | |
| "reward": 0.6084912680089474, | |
| "reward_std": 0.7924976646900177, | |
| "rewards/cosine_scaled_reward": 0.08549563866108656, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2981.916748046875, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.24504096806049347, | |
| "kl": 0.01073455810546875, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0422, | |
| "reward": 0.21685536485165358, | |
| "reward_std": 0.8932247292250395, | |
| "rewards/cosine_scaled_reward": -0.13115566316992044, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2645.479202270508, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.07428579032421112, | |
| "kl": 0.007167816162109375, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0088, | |
| "reward": 0.17610520124435425, | |
| "reward_std": 0.4693563599139452, | |
| "rewards/cosine_scaled_reward": -0.20361408591270447, | |
| "rewards/format_reward": 0.583333333954215, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2916.500030517578, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.11420946568250656, | |
| "kl": 0.0070934295654296875, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0296, | |
| "reward": 0.20668567204847932, | |
| "reward_std": 0.7353272885084152, | |
| "rewards/cosine_scaled_reward": -0.06332382163964212, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3320.6041870117188, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.10533823817968369, | |
| "kl": 0.00713348388671875, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": -0.0252, | |
| "reward": -0.1739243706688285, | |
| "reward_std": 0.6250900998711586, | |
| "rewards/cosine_scaled_reward": -0.18071219464764, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 3231.625030517578, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.12218490988016129, | |
| "kl": 0.010065078735351562, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.003, | |
| "reward": 0.0012212982401251793, | |
| "reward_std": 0.693468015640974, | |
| "rewards/cosine_scaled_reward": -0.1452226904220879, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 3066.125015258789, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.1486603021621704, | |
| "kl": 0.0053539276123046875, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0267, | |
| "reward": 0.2887073950842023, | |
| "reward_std": 0.8192379102110863, | |
| "rewards/cosine_scaled_reward": -0.03272963920608163, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2546.9375610351562, | |
| "epoch": 0.136, | |
| "grad_norm": 0.4716709852218628, | |
| "kl": 0.21265125274658203, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0025, | |
| "reward": 0.5194442104548216, | |
| "reward_std": 0.623758127912879, | |
| "rewards/cosine_scaled_reward": -0.03194458410143852, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2618.1458740234375, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.1533748060464859, | |
| "kl": 0.010034561157226562, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0074, | |
| "reward": 0.47723614796996117, | |
| "reward_std": 0.7403046824038029, | |
| "rewards/cosine_scaled_reward": -0.011381933465600014, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 2108.979232788086, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.20188815891742706, | |
| "kl": 0.015041351318359375, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0589, | |
| "reward": 0.5848385840654373, | |
| "reward_std": 0.7289820089936256, | |
| "rewards/cosine_scaled_reward": -0.030497390776872635, | |
| "rewards/format_reward": 0.6458333376795053, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2961.6459197998047, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.1945829838514328, | |
| "kl": 0.008840560913085938, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0574, | |
| "reward": 0.4458494456484914, | |
| "reward_std": 0.867495059967041, | |
| "rewards/cosine_scaled_reward": 0.01459137536585331, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2952.000030517578, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.12263938039541245, | |
| "kl": 0.0066699981689453125, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0451, | |
| "reward": 0.305552801117301, | |
| "reward_std": 0.7307926155626774, | |
| "rewards/cosine_scaled_reward": -0.07639027573168278, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2579.687545776367, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.15065783262252808, | |
| "kl": 0.010354995727539062, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0237, | |
| "reward": 0.6200504712760448, | |
| "reward_std": 0.9035943485796452, | |
| "rewards/cosine_scaled_reward": 0.00794189516454935, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2819.000030517578, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.10753726214170456, | |
| "kl": 0.0061092376708984375, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0258, | |
| "reward": 0.5180913750082254, | |
| "reward_std": 0.7182744760066271, | |
| "rewards/cosine_scaled_reward": 0.081962333410047, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2857.291717529297, | |
| "epoch": 0.144, | |
| "grad_norm": 0.19327324628829956, | |
| "kl": 0.0054225921630859375, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0597, | |
| "reward": 0.4767994333524257, | |
| "reward_std": 1.0140015110373497, | |
| "rewards/cosine_scaled_reward": -0.0011836281046271324, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 3123.0208435058594, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.14542251825332642, | |
| "kl": 0.008556365966796875, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0721, | |
| "reward": -0.07007080456241965, | |
| "reward_std": 0.7998057566583157, | |
| "rewards/cosine_scaled_reward": -0.201702069491148, | |
| "rewards/format_reward": 0.33333334513008595, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2893.812530517578, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.12933015823364258, | |
| "kl": 0.00739288330078125, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0413, | |
| "reward": 0.6880392283201218, | |
| "reward_std": 0.7831969410181046, | |
| "rewards/cosine_scaled_reward": 0.11485293135046959, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3500.562530517578, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.1216391995549202, | |
| "kl": 0.009105682373046875, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": -0.0089, | |
| "reward": -0.261786799877882, | |
| "reward_std": 0.6217585429549217, | |
| "rewards/cosine_scaled_reward": -0.20381007622927427, | |
| "rewards/format_reward": 0.14583333767950535, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 3168.0208740234375, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.09015782177448273, | |
| "kl": 0.008625030517578125, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0264, | |
| "reward": -0.012573342770338058, | |
| "reward_std": 0.45624612644314766, | |
| "rewards/cosine_scaled_reward": -0.10003667138516903, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2846.687530517578, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.14023305475711823, | |
| "kl": 0.00933074951171875, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0644, | |
| "reward": 0.45178989693522453, | |
| "reward_std": 0.7442049328237772, | |
| "rewards/cosine_scaled_reward": 0.027978284284472466, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2890.3333587646484, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.13710667192935944, | |
| "kl": 0.0076751708984375, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.016, | |
| "reward": 0.3612702414393425, | |
| "reward_std": 0.8515727780759335, | |
| "rewards/cosine_scaled_reward": -0.02769822347909212, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 3069.729202270508, | |
| "epoch": 0.152, | |
| "grad_norm": 0.10156344622373581, | |
| "kl": 0.009157180786132812, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0382, | |
| "reward": -0.0937919020652771, | |
| "reward_std": 0.5027989856898785, | |
| "rewards/cosine_scaled_reward": -0.20314595522359014, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2549.937530517578, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.077970951795578, | |
| "kl": 0.011308670043945312, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0088, | |
| "reward": 0.2696135453879833, | |
| "reward_std": 0.43714287504553795, | |
| "rewards/cosine_scaled_reward": -0.10477657988667488, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 1877.0833625793457, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.10776728391647339, | |
| "kl": 0.007760047912597656, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0044, | |
| "reward": 1.0087775029242039, | |
| "reward_std": 0.766286326572299, | |
| "rewards/cosine_scaled_reward": 0.17105539632029831, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2871.8750610351562, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.22153440117835999, | |
| "kl": 0.011474609375, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0905, | |
| "reward": 0.6536878123879433, | |
| "reward_std": 1.1243817768990993, | |
| "rewards/cosine_scaled_reward": 0.08726056106388569, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 3189.4791870117188, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.11029347777366638, | |
| "kl": 0.010852813720703125, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0475, | |
| "reward": -0.21665774658322334, | |
| "reward_std": 0.5276653412729502, | |
| "rewards/cosine_scaled_reward": -0.22291221655905247, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2698.1875610351562, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.19467265903949738, | |
| "kl": 0.010044097900390625, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0883, | |
| "reward": 0.38313706149347126, | |
| "reward_std": 0.9247510060667992, | |
| "rewards/cosine_scaled_reward": -0.07926480891183019, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 3213.479217529297, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.15390190482139587, | |
| "kl": 0.011737823486328125, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0208, | |
| "reward": 0.11885028099641204, | |
| "reward_std": 0.8315641395747662, | |
| "rewards/cosine_scaled_reward": -0.12807486671954393, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 3079.7083740234375, | |
| "epoch": 0.16, | |
| "grad_norm": 0.2192172110080719, | |
| "kl": 0.015716552734375, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0322, | |
| "reward": 0.0838099829852581, | |
| "reward_std": 0.4756584819406271, | |
| "rewards/cosine_scaled_reward": -0.14559502340853214, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2925.416748046875, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.16468165814876556, | |
| "kl": 0.015735626220703125, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0516, | |
| "reward": 0.3168928772211075, | |
| "reward_std": 0.8896390050649643, | |
| "rewards/cosine_scaled_reward": -0.08113690535537899, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2902.8334045410156, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.11325091868638992, | |
| "kl": 0.014360427856445312, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0589, | |
| "reward": 0.6967496890574694, | |
| "reward_std": 0.6483085379004478, | |
| "rewards/cosine_scaled_reward": 0.025458187563344836, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2774.2292251586914, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.26006487011909485, | |
| "kl": 0.01424407958984375, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0847, | |
| "reward": 0.2764196931384504, | |
| "reward_std": 0.7496595717966557, | |
| "rewards/cosine_scaled_reward": -0.09095682273618877, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 3044.0833892822266, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.1781724989414215, | |
| "kl": 0.011707305908203125, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0491, | |
| "reward": 0.19056477712001652, | |
| "reward_std": 0.9319424778223038, | |
| "rewards/cosine_scaled_reward": -0.06096761766821146, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 2377.4583740234375, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.13194841146469116, | |
| "kl": 0.013391494750976562, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0296, | |
| "reward": 0.6011020904406905, | |
| "reward_std": 0.5612340047955513, | |
| "rewards/cosine_scaled_reward": 0.01930104335770011, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2360.9792098999023, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.14858490228652954, | |
| "kl": 0.008525848388671875, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0943, | |
| "reward": 0.22767078503966331, | |
| "reward_std": 0.5497826747596264, | |
| "rewards/cosine_scaled_reward": -0.20908129028975964, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 2831.4583740234375, | |
| "epoch": 0.168, | |
| "grad_norm": 0.13822530210018158, | |
| "kl": 0.015819549560546875, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0489, | |
| "reward": 0.17515791207551956, | |
| "reward_std": 0.7267673751339316, | |
| "rewards/cosine_scaled_reward": -0.0999210444279015, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 2381.062530517578, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.09740369766950607, | |
| "kl": 0.013698577880859375, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0063, | |
| "reward": 0.39699011482298374, | |
| "reward_std": 0.515554535202682, | |
| "rewards/cosine_scaled_reward": -0.0931716226041317, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2793.291702270508, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.09077835828065872, | |
| "kl": 0.011531829833984375, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0011, | |
| "reward": 0.5288912262767553, | |
| "reward_std": 0.5327219031751156, | |
| "rewards/cosine_scaled_reward": 0.004028939350973815, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2927.9583587646484, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.200521320104599, | |
| "kl": 0.016937255859375, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.068, | |
| "reward": 0.37648333609104156, | |
| "reward_std": 0.8985544182360172, | |
| "rewards/cosine_scaled_reward": 0.0007416550070047379, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2586.041732788086, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.30554497241973877, | |
| "kl": 0.01647186279296875, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.1073, | |
| "reward": 0.6497351740254089, | |
| "reward_std": 0.7574196644127369, | |
| "rewards/cosine_scaled_reward": 0.054034238681197166, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2981.250030517578, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.21656572818756104, | |
| "kl": 0.019748687744140625, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0443, | |
| "reward": -0.05579106882214546, | |
| "reward_std": 0.7100877314805984, | |
| "rewards/cosine_scaled_reward": -0.17372886929661036, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2944.3750228881836, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.11329537630081177, | |
| "kl": 0.02336883544921875, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0242, | |
| "reward": 0.03104301728308201, | |
| "reward_std": 0.6042890883982182, | |
| "rewards/cosine_scaled_reward": -0.16156182251870632, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 3291.2500610351562, | |
| "epoch": 0.176, | |
| "grad_norm": 0.19572459161281586, | |
| "kl": 0.01239013671875, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0356, | |
| "reward": 0.9391661360859871, | |
| "reward_std": 0.8916139230132103, | |
| "rewards/cosine_scaled_reward": 0.20916638150811195, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2646.916702270508, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.11613517999649048, | |
| "kl": 0.0165252685546875, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0234, | |
| "reward": 0.5908447969704866, | |
| "reward_std": 0.7097595669329166, | |
| "rewards/cosine_scaled_reward": 0.024589055217802525, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 3025.1666870117188, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.23642632365226746, | |
| "kl": 0.014795303344726562, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0964, | |
| "reward": 0.13831177074462175, | |
| "reward_std": 0.793521448969841, | |
| "rewards/cosine_scaled_reward": -0.08709411323070526, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 3077.2291870117188, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.08657457679510117, | |
| "kl": 0.017913818359375, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0329, | |
| "reward": 0.01709558442234993, | |
| "reward_std": 0.39399333111941814, | |
| "rewards/cosine_scaled_reward": -0.1685355380177498, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 3292.8958740234375, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.19324812293052673, | |
| "kl": 0.01824951171875, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0586, | |
| "reward": 0.7785975768638309, | |
| "reward_std": 0.9201798811554909, | |
| "rewards/cosine_scaled_reward": 0.18096543662250042, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 3100.854202270508, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.15799157321453094, | |
| "kl": 0.02082061767578125, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.035, | |
| "reward": 0.008442229591310024, | |
| "reward_std": 0.7840565517544746, | |
| "rewards/cosine_scaled_reward": -0.15202889824286103, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2739.5416946411133, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.18316112458705902, | |
| "kl": 0.020570755004882812, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0374, | |
| "reward": 0.2813178598880768, | |
| "reward_std": 0.678614292293787, | |
| "rewards/cosine_scaled_reward": -0.04684108844958246, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2786.0208892822266, | |
| "epoch": 0.184, | |
| "grad_norm": 0.1603342592716217, | |
| "kl": 0.02384185791015625, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0217, | |
| "reward": 0.6455709072761238, | |
| "reward_std": 0.6690337508916855, | |
| "rewards/cosine_scaled_reward": 0.0623687906190753, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 3222.5000610351562, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.25506702065467834, | |
| "kl": 0.0278472900390625, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0432, | |
| "reward": 0.13122543692588806, | |
| "reward_std": 0.88908408023417, | |
| "rewards/cosine_scaled_reward": -0.10105395689606667, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2518.604179382324, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.09915700554847717, | |
| "kl": 0.019123077392578125, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0145, | |
| "reward": 0.8429984450340271, | |
| "reward_std": 0.6572048924863338, | |
| "rewards/cosine_scaled_reward": 0.10899921134114265, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2556.354202270508, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.1429044008255005, | |
| "kl": 0.021711349487304688, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0501, | |
| "reward": 0.4997831657528877, | |
| "reward_std": 0.6965379063040018, | |
| "rewards/cosine_scaled_reward": -0.03135842923074961, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 3219.229248046875, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.18279628455638885, | |
| "kl": 0.033294677734375, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0497, | |
| "reward": 0.08890796452760696, | |
| "reward_std": 0.8442841582000256, | |
| "rewards/cosine_scaled_reward": -0.10137936053797603, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 2898.8333587646484, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.1255594938993454, | |
| "kl": 0.02011871337890625, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0265, | |
| "reward": 0.3105367962270975, | |
| "reward_std": 0.62895817309618, | |
| "rewards/cosine_scaled_reward": -0.06348160747438669, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2437.8959197998047, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.25301864743232727, | |
| "kl": 0.01929473876953125, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0756, | |
| "reward": 0.41188543289899826, | |
| "reward_std": 0.8582058474421501, | |
| "rewards/cosine_scaled_reward": -0.1482239617034793, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 3033.0209045410156, | |
| "epoch": 0.192, | |
| "grad_norm": 0.24609790742397308, | |
| "kl": 0.02398681640625, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0643, | |
| "reward": 0.3896508193574846, | |
| "reward_std": 0.9152091555297375, | |
| "rewards/cosine_scaled_reward": -0.05517459171824157, | |
| "rewards/format_reward": 0.5000000186264515, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 2549.3750610351562, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.24488316476345062, | |
| "kl": 0.02884674072265625, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0667, | |
| "reward": 1.2588793262839317, | |
| "reward_std": 1.0218086428940296, | |
| "rewards/cosine_scaled_reward": 0.31693966779857874, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2387.8958740234375, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.15666845440864563, | |
| "kl": 0.021160125732421875, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0736, | |
| "reward": 0.5572759381029755, | |
| "reward_std": 0.7070234641432762, | |
| "rewards/cosine_scaled_reward": -0.013028699904680252, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2764.2083892822266, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.2903493046760559, | |
| "kl": 0.023040771484375, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0756, | |
| "reward": 0.3716934472322464, | |
| "reward_std": 0.7128049619495869, | |
| "rewards/cosine_scaled_reward": -0.03290329407900572, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2922.833366394043, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.32527434825897217, | |
| "kl": 0.03302001953125, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0821, | |
| "reward": 0.5163949467241764, | |
| "reward_std": 0.914849640801549, | |
| "rewards/cosine_scaled_reward": 0.06028079940006137, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 2277.3541870117188, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.28431299328804016, | |
| "kl": 0.0292205810546875, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0899, | |
| "reward": 0.19020776241086423, | |
| "reward_std": 0.8620494175702333, | |
| "rewards/cosine_scaled_reward": -0.18614612240344286, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 2744.687545776367, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.2881910800933838, | |
| "kl": 0.03791046142578125, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0575, | |
| "reward": 0.31613041274249554, | |
| "reward_std": 0.8530451729893684, | |
| "rewards/cosine_scaled_reward": -0.08151813084259629, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2789.6875534057617, | |
| "epoch": 0.2, | |
| "grad_norm": 0.08940012753009796, | |
| "kl": 0.037750244140625, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0053, | |
| "reward": 0.3539888858795166, | |
| "reward_std": 0.473992221057415, | |
| "rewards/cosine_scaled_reward": -0.06258890964090824, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 2489.5625610351562, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.18962816894054413, | |
| "kl": 0.03296661376953125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0326, | |
| "reward": 0.5119255073368549, | |
| "reward_std": 0.8859294652938843, | |
| "rewards/cosine_scaled_reward": -0.014870589919155464, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 3037.5209045410156, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.2424456924200058, | |
| "kl": 0.043304443359375, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0484, | |
| "reward": 0.2696651890873909, | |
| "reward_std": 0.9238467961549759, | |
| "rewards/cosine_scaled_reward": -0.12558407767210156, | |
| "rewards/format_reward": 0.5208333544433117, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2764.645839691162, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.22290733456611633, | |
| "kl": 0.05438232421875, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0283, | |
| "reward": 0.1138812736608088, | |
| "reward_std": 0.8126712590456009, | |
| "rewards/cosine_scaled_reward": -0.13055937364697456, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2984.416702270508, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.1466417908668518, | |
| "kl": 0.047088623046875, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.015, | |
| "reward": 0.0321359746158123, | |
| "reward_std": 0.5167017672210932, | |
| "rewards/cosine_scaled_reward": -0.17143201641738415, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 2136.145881652832, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.09843747317790985, | |
| "kl": 0.0455169677734375, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0107, | |
| "reward": 0.910092594102025, | |
| "reward_std": 0.585525143891573, | |
| "rewards/cosine_scaled_reward": 0.1425462868064642, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 3006.2500228881836, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.13610567152500153, | |
| "kl": 0.055999755859375, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0132, | |
| "reward": -0.03030078485608101, | |
| "reward_std": 0.600213073194027, | |
| "rewards/cosine_scaled_reward": -0.18181706592440605, | |
| "rewards/format_reward": 0.3333333469927311, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 2419.437530517578, | |
| "epoch": 0.208, | |
| "grad_norm": 0.18283116817474365, | |
| "kl": 0.0460205078125, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0463, | |
| "reward": 0.5305005796253681, | |
| "reward_std": 0.7933950982987881, | |
| "rewards/cosine_scaled_reward": -0.005583042744547129, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 2477.1875915527344, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.6196571588516235, | |
| "kl": 0.0609283447265625, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.1294, | |
| "reward": 0.5540785677731037, | |
| "reward_std": 0.9478080496191978, | |
| "rewards/cosine_scaled_reward": 0.006205941084772348, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 2701.562530517578, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.19450385868549347, | |
| "kl": 0.063079833984375, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0082, | |
| "reward": 0.25770498625934124, | |
| "reward_std": 0.8395693749189377, | |
| "rewards/cosine_scaled_reward": -0.07948085246607661, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2466.125068664551, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.1757989078760147, | |
| "kl": 0.08551025390625, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0215, | |
| "reward": 0.06042790925130248, | |
| "reward_std": 0.5447751199826598, | |
| "rewards/cosine_scaled_reward": -0.18853605829644948, | |
| "rewards/format_reward": 0.4375000186264515, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 2972.1458740234375, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.2645973861217499, | |
| "kl": 0.077728271484375, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": -0.0252, | |
| "reward": 0.47168839909136295, | |
| "reward_std": 0.5297163799405098, | |
| "rewards/cosine_scaled_reward": 0.03792751580476761, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 2462.250045776367, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.23852241039276123, | |
| "kl": 0.07574462890625, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0462, | |
| "reward": 0.22391528385924175, | |
| "reward_std": 0.6513072997331619, | |
| "rewards/cosine_scaled_reward": -0.16929237358272076, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 3234.854202270508, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.2536059319972992, | |
| "kl": 0.084686279296875, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.006, | |
| "reward": -0.11121096089482307, | |
| "reward_std": 0.8675953522324562, | |
| "rewards/cosine_scaled_reward": -0.14935547299683094, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 2777.7084045410156, | |
| "epoch": 0.216, | |
| "grad_norm": 0.2072547823190689, | |
| "kl": 0.0953369140625, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0194, | |
| "reward": 0.494580146856606, | |
| "reward_std": 0.7433264572173357, | |
| "rewards/cosine_scaled_reward": -0.07562660798430443, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 2382.875045776367, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.2012052983045578, | |
| "kl": 0.08172607421875, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0391, | |
| "reward": 0.3381795147433877, | |
| "reward_std": 0.5929664559662342, | |
| "rewards/cosine_scaled_reward": -0.07049360126256943, | |
| "rewards/format_reward": 0.47916668094694614, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 2421.979217529297, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.2843816578388214, | |
| "kl": 0.09423828125, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0108, | |
| "reward": 0.6503109214827418, | |
| "reward_std": 1.1196836531162262, | |
| "rewards/cosine_scaled_reward": 0.08557211281731725, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 3207.229217529297, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.3324408233165741, | |
| "kl": 0.10272216796875, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0665, | |
| "reward": -0.08572058263234794, | |
| "reward_std": 0.6475069168955088, | |
| "rewards/cosine_scaled_reward": -0.19911029934883118, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 3053.5834045410156, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.29358112812042236, | |
| "kl": 0.0987548828125, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0517, | |
| "reward": 0.2612769678235054, | |
| "reward_std": 0.8455968722701073, | |
| "rewards/cosine_scaled_reward": -0.07769485469907522, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 2582.645927429199, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.42259493470191956, | |
| "kl": 0.0806884765625, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0436, | |
| "reward": 0.9180384781211615, | |
| "reward_std": 0.7587432824075222, | |
| "rewards/cosine_scaled_reward": 0.2090192511677742, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 2819.6667098999023, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.49130040407180786, | |
| "kl": 0.105621337890625, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0463, | |
| "reward": 0.2741181245073676, | |
| "reward_std": 0.762896966189146, | |
| "rewards/cosine_scaled_reward": -0.040024266578257084, | |
| "rewards/format_reward": 0.35416668094694614, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 3187.7708740234375, | |
| "epoch": 0.224, | |
| "grad_norm": 0.22648778557777405, | |
| "kl": 0.13623046875, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0205, | |
| "reward": 0.16096001025289297, | |
| "reward_std": 0.6957874298095703, | |
| "rewards/cosine_scaled_reward": -0.06535333674401045, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 2287.1875610351562, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.6928053498268127, | |
| "kl": 0.13812255859375, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": -0.0182, | |
| "reward": 0.7687155855819583, | |
| "reward_std": 1.2102737426757812, | |
| "rewards/cosine_scaled_reward": 0.12394111696630716, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 2939.187530517578, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.4229676425457001, | |
| "kl": 0.155517578125, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0503, | |
| "reward": 0.317043187096715, | |
| "reward_std": 0.6642983369529247, | |
| "rewards/cosine_scaled_reward": -0.008145075291395187, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 2638.979217529297, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.4884941875934601, | |
| "kl": 0.116912841796875, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0449, | |
| "reward": -0.08919784612953663, | |
| "reward_std": 0.5895257294178009, | |
| "rewards/cosine_scaled_reward": -0.1383489231520798, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 2367.125068664551, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.6410492062568665, | |
| "kl": 0.13128662109375, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0794, | |
| "reward": 0.7019957322627306, | |
| "reward_std": 0.9874038621783257, | |
| "rewards/cosine_scaled_reward": 0.04891452379524708, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 2166.708366394043, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.7525650858879089, | |
| "kl": 0.12518310546875, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0497, | |
| "reward": 0.9610625859349966, | |
| "reward_std": 1.213215809315443, | |
| "rewards/cosine_scaled_reward": 0.24094795435667038, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 2405.8958892822266, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.3488696217536926, | |
| "kl": 0.199951171875, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": -0.0162, | |
| "reward": 0.9840776808559895, | |
| "reward_std": 0.6163855046033859, | |
| "rewards/cosine_scaled_reward": 0.20037216693162918, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 3139.791702270508, | |
| "epoch": 0.232, | |
| "grad_norm": 0.4035970866680145, | |
| "kl": 0.2327880859375, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0518, | |
| "reward": 0.11016486119478941, | |
| "reward_std": 0.6740043368190527, | |
| "rewards/cosine_scaled_reward": -0.08033423824235797, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 2195.8125610351562, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.45050907135009766, | |
| "kl": 0.18145751953125, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0271, | |
| "reward": 0.4769836826599203, | |
| "reward_std": 0.7626992892473936, | |
| "rewards/cosine_scaled_reward": -0.11567484028637409, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 2807.3333892822266, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.40741464495658875, | |
| "kl": 0.202392578125, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.02, | |
| "reward": 0.6580097423866391, | |
| "reward_std": 0.8570194113999605, | |
| "rewards/cosine_scaled_reward": 0.13108818820910528, | |
| "rewards/format_reward": 0.3958333507180214, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 3126.8541870117188, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.3323240876197815, | |
| "kl": 0.2293701171875, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.025, | |
| "reward": -0.058580704033374786, | |
| "reward_std": 0.7412764355540276, | |
| "rewards/cosine_scaled_reward": -0.16470702644437551, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 3054.875045776367, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.4214930534362793, | |
| "kl": 0.2855224609375, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0122, | |
| "reward": -0.014476008713245392, | |
| "reward_std": 0.5796043276786804, | |
| "rewards/cosine_scaled_reward": -0.22598800901323557, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 2381.5417251586914, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.284451961517334, | |
| "kl": 0.1992034912109375, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0347, | |
| "reward": 0.6913385018706322, | |
| "reward_std": 0.6349469162523746, | |
| "rewards/cosine_scaled_reward": 0.05400256626307964, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 2797.312545776367, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.9943209290504456, | |
| "kl": 0.2471923828125, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0679, | |
| "reward": 0.6385658853687346, | |
| "reward_std": 0.8412618339061737, | |
| "rewards/cosine_scaled_reward": 0.038032938959077, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 2810.729202270508, | |
| "epoch": 0.24, | |
| "grad_norm": 0.27606576681137085, | |
| "kl": 0.216064453125, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0424, | |
| "reward": 0.17591036087833345, | |
| "reward_std": 0.5839308425784111, | |
| "rewards/cosine_scaled_reward": -0.10996149946004152, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 2546.8333892822266, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.5562529563903809, | |
| "kl": 0.2337646484375, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0024, | |
| "reward": 0.3355960976332426, | |
| "reward_std": 0.8199643902480602, | |
| "rewards/cosine_scaled_reward": -0.061368612572550774, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 2440.375045776367, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.6957542300224304, | |
| "kl": 0.21954345703125, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0798, | |
| "reward": 0.315935923717916, | |
| "reward_std": 0.7707805298268795, | |
| "rewards/cosine_scaled_reward": -0.09203205385711044, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 2497.416702270508, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.6777252554893494, | |
| "kl": 0.28155517578125, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0231, | |
| "reward": 0.4181370511651039, | |
| "reward_std": 0.701967678964138, | |
| "rewards/cosine_scaled_reward": -0.051348146982491016, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 2721.791702270508, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.44339874386787415, | |
| "kl": 0.2493896484375, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0185, | |
| "reward": 0.5002706460654736, | |
| "reward_std": 0.8209729120135307, | |
| "rewards/cosine_scaled_reward": -0.020698013715445995, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 2568.8958740234375, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.3160378336906433, | |
| "kl": 0.20928955078125, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0097, | |
| "reward": 0.19143693707883358, | |
| "reward_std": 0.5702317543327808, | |
| "rewards/cosine_scaled_reward": -0.14386487286537886, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 2325.666702270508, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.42989280819892883, | |
| "kl": 0.224609375, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0334, | |
| "reward": 0.6585849169641733, | |
| "reward_std": 0.9609077526256442, | |
| "rewards/cosine_scaled_reward": -0.014457540586590767, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 2915.604232788086, | |
| "epoch": 0.248, | |
| "grad_norm": 1.269454836845398, | |
| "kl": 0.2520751953125, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.1032, | |
| "reward": 0.1808436312712729, | |
| "reward_std": 0.8456562999635935, | |
| "rewards/cosine_scaled_reward": -0.08666152018122375, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 2640.4583740234375, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.4037168323993683, | |
| "kl": 0.21539306640625, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0083, | |
| "reward": 0.4553571194410324, | |
| "reward_std": 0.7446375414729118, | |
| "rewards/cosine_scaled_reward": -0.04315477702766657, | |
| "rewards/format_reward": 0.5416666697710752, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 2454.8958740234375, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.6872981190681458, | |
| "kl": 0.2410888671875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0494, | |
| "reward": 0.7339769629761577, | |
| "reward_std": 0.8893967866897583, | |
| "rewards/cosine_scaled_reward": 0.03365514148026705, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 2835.3750534057617, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.4908996820449829, | |
| "kl": 0.2960205078125, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0663, | |
| "reward": -0.27113689854741096, | |
| "reward_std": 0.4867183081805706, | |
| "rewards/cosine_scaled_reward": -0.2918184567242861, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 2121.1250610351562, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.9558163285255432, | |
| "kl": 0.21978759765625, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": -0.0463, | |
| "reward": 0.7585705481469631, | |
| "reward_std": 0.8061345554888248, | |
| "rewards/cosine_scaled_reward": 0.06678527034819126, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 2521.4584197998047, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.2961377203464508, | |
| "kl": 0.2503662109375, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0342, | |
| "reward": 0.4850865611806512, | |
| "reward_std": 0.5646930634975433, | |
| "rewards/cosine_scaled_reward": -0.08037340734153986, | |
| "rewards/format_reward": 0.645833345130086, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 2613.5208892822266, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.3459699749946594, | |
| "kl": 0.2802734375, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0119, | |
| "reward": 0.4122865653480403, | |
| "reward_std": 0.6126667521893978, | |
| "rewards/cosine_scaled_reward": -0.09594006463885307, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 2961.7709350585938, | |
| "epoch": 0.256, | |
| "grad_norm": 0.6146445870399475, | |
| "kl": 0.28375244140625, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0476, | |
| "reward": 0.2019269121810794, | |
| "reward_std": 0.7791545800864697, | |
| "rewards/cosine_scaled_reward": -0.05528653599321842, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 3000.687545776367, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.6927861571311951, | |
| "kl": 0.2901611328125, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0702, | |
| "reward": 0.3877083119004965, | |
| "reward_std": 0.7308296859264374, | |
| "rewards/cosine_scaled_reward": -0.05614587618038058, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 2560.979248046875, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.47879165410995483, | |
| "kl": 0.26910400390625, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0242, | |
| "reward": 0.7968797162175179, | |
| "reward_std": 0.7786244936287403, | |
| "rewards/cosine_scaled_reward": 0.14843985438346863, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 2564.1458740234375, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.469950795173645, | |
| "kl": 0.324462890625, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0308, | |
| "reward": 0.5330985919572413, | |
| "reward_std": 0.9147845208644867, | |
| "rewards/cosine_scaled_reward": -0.014700718224048615, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 2534.604202270508, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.3445420265197754, | |
| "kl": 0.29400634765625, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0282, | |
| "reward": 0.46225254982709885, | |
| "reward_std": 0.6663805656135082, | |
| "rewards/cosine_scaled_reward": 0.012376293540000916, | |
| "rewards/format_reward": 0.4375, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 3104.7500610351562, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.5392424464225769, | |
| "kl": 0.446044921875, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0297, | |
| "reward": 0.41551550291478634, | |
| "reward_std": 0.5293188579380512, | |
| "rewards/cosine_scaled_reward": -0.021408939734101295, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 3205.3959045410156, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.3657205104827881, | |
| "kl": 0.4345703125, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0674, | |
| "reward": -0.06970879365690053, | |
| "reward_std": 0.5970296040177345, | |
| "rewards/cosine_scaled_reward": -0.19110439904034138, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 2905.166717529297, | |
| "epoch": 0.264, | |
| "grad_norm": 0.6562315821647644, | |
| "kl": 0.39349365234375, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0594, | |
| "reward": 0.326082413084805, | |
| "reward_std": 0.7788660638034344, | |
| "rewards/cosine_scaled_reward": -0.003625463228672743, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 3106.604248046875, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.3778403103351593, | |
| "kl": 0.42138671875, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.062, | |
| "reward": -0.2184969331137836, | |
| "reward_std": 0.5246814601123333, | |
| "rewards/cosine_scaled_reward": -0.21341513469815254, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 2835.93758392334, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.723120927810669, | |
| "kl": 0.4295654296875, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0837, | |
| "reward": 0.21882726065814495, | |
| "reward_std": 0.8319694129750133, | |
| "rewards/cosine_scaled_reward": -0.10933638806454837, | |
| "rewards/format_reward": 0.43750001676380634, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 2814.5417137145996, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.5316165685653687, | |
| "kl": 0.404998779296875, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0213, | |
| "reward": -0.02029888378456235, | |
| "reward_std": 0.5957829728722572, | |
| "rewards/cosine_scaled_reward": -0.17681611701846123, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 2171.729217529297, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 1.2359085083007812, | |
| "kl": 0.30389404296875, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0713, | |
| "reward": 0.5996768039185554, | |
| "reward_std": 0.8905858621001244, | |
| "rewards/cosine_scaled_reward": -0.02307826466858387, | |
| "rewards/format_reward": 0.645833333954215, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 2596.937545776367, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.44107890129089355, | |
| "kl": 0.310302734375, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.005, | |
| "reward": 0.6644928082823753, | |
| "reward_std": 0.8656329847872257, | |
| "rewards/cosine_scaled_reward": -0.00108695263043046, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 2504.2083892822266, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.6235182285308838, | |
| "kl": 0.3521728515625, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0334, | |
| "reward": 0.5841595754027367, | |
| "reward_std": 0.7832061983644962, | |
| "rewards/cosine_scaled_reward": 0.00041312072426080704, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 2374.1458892822266, | |
| "epoch": 0.272, | |
| "grad_norm": 0.5051608085632324, | |
| "kl": 0.3377685546875, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0394, | |
| "reward": 1.067820217460394, | |
| "reward_std": 0.7289628759026527, | |
| "rewards/cosine_scaled_reward": 0.23182673379778862, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 2220.125030517578, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.33481940627098083, | |
| "kl": 0.332763671875, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0246, | |
| "reward": 0.8250223018694669, | |
| "reward_std": 0.5633459165692329, | |
| "rewards/cosine_scaled_reward": 0.15209448896348476, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 2807.2709045410156, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.47511982917785645, | |
| "kl": 0.609130859375, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.063, | |
| "reward": 0.18580850027501583, | |
| "reward_std": 0.5817486252635717, | |
| "rewards/cosine_scaled_reward": -0.16751242137979716, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 2705.0833892822266, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.6473789811134338, | |
| "kl": 0.42578125, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.025, | |
| "reward": 0.17192419804632664, | |
| "reward_std": 0.6516960971057415, | |
| "rewards/cosine_scaled_reward": -0.20570457633584738, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 2445.0834045410156, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.5017490983009338, | |
| "kl": 0.50115966796875, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0511, | |
| "reward": 0.5459022335708141, | |
| "reward_std": 0.6534602418541908, | |
| "rewards/cosine_scaled_reward": -0.10204888670705259, | |
| "rewards/format_reward": 0.7500000167638063, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 2775.3125915527344, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.9957566261291504, | |
| "kl": 0.471435546875, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.009, | |
| "reward": 0.7295572739094496, | |
| "reward_std": 0.8050657417625189, | |
| "rewards/cosine_scaled_reward": 0.09394530206918716, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 2788.8959197998047, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.5656623244285583, | |
| "kl": 0.4091796875, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0419, | |
| "reward": 0.5535422116518021, | |
| "reward_std": 0.7251980789005756, | |
| "rewards/cosine_scaled_reward": 0.05802109342766926, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 3191.8334045410156, | |
| "epoch": 0.28, | |
| "grad_norm": 1.2059617042541504, | |
| "kl": 0.46240234375, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0945, | |
| "reward": 0.15825808281078935, | |
| "reward_std": 0.9811517968773842, | |
| "rewards/cosine_scaled_reward": -0.13962095836177468, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 2959.666748046875, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.74327552318573, | |
| "kl": 0.458740234375, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0224, | |
| "reward": 0.4606718048453331, | |
| "reward_std": 0.9427804127335548, | |
| "rewards/cosine_scaled_reward": -0.009247444570064545, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 3009.2083435058594, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.6833385229110718, | |
| "kl": 0.4307861328125, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0303, | |
| "reward": 0.013721236027777195, | |
| "reward_std": 0.6590881273150444, | |
| "rewards/cosine_scaled_reward": -0.19105605548247695, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 2684.1875762939453, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.5844886302947998, | |
| "kl": 0.35345458984375, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0537, | |
| "reward": 0.45633104629814625, | |
| "reward_std": 0.7913403064012527, | |
| "rewards/cosine_scaled_reward": -0.032251136377453804, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 2176.4792251586914, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.9092757105827332, | |
| "kl": 0.264862060546875, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0562, | |
| "reward": 0.8251112774014473, | |
| "reward_std": 0.7513875584118068, | |
| "rewards/cosine_scaled_reward": 0.08963895868510008, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 2420.500030517578, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 1.0886207818984985, | |
| "kl": 0.3681640625, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0552, | |
| "reward": 0.2780859973281622, | |
| "reward_std": 0.6804126389324665, | |
| "rewards/cosine_scaled_reward": -0.183873676112853, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 1995.895896911621, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.5505285263061523, | |
| "kl": 0.30914306640625, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.045, | |
| "reward": 0.45957393012940884, | |
| "reward_std": 0.8468098007142544, | |
| "rewards/cosine_scaled_reward": -0.07229639682918787, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 2733.312545776367, | |
| "epoch": 0.288, | |
| "grad_norm": 0.4072404205799103, | |
| "kl": 0.435791015625, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0364, | |
| "reward": 0.22467345744371414, | |
| "reward_std": 0.5785614065825939, | |
| "rewards/cosine_scaled_reward": -0.08557994151487947, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 2344.9792098999023, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.5800250768661499, | |
| "kl": 0.36541748046875, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0011, | |
| "reward": 0.6350781377404928, | |
| "reward_std": 0.6775294542312622, | |
| "rewards/cosine_scaled_reward": -0.03662758320569992, | |
| "rewards/format_reward": 0.708333345130086, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 3104.7709045410156, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.5420771241188049, | |
| "kl": 0.61962890625, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0671, | |
| "reward": 0.0348883168771863, | |
| "reward_std": 0.6898283958435059, | |
| "rewards/cosine_scaled_reward": -0.17005583579884842, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 2963.145896911621, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 1.1052091121673584, | |
| "kl": 0.5419921875, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0088, | |
| "reward": 0.11596883228048682, | |
| "reward_std": 0.6711905375123024, | |
| "rewards/cosine_scaled_reward": -0.20243225805461407, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 2662.437530517578, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.6445716619491577, | |
| "kl": 0.4893798828125, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0404, | |
| "reward": 0.3911870224401355, | |
| "reward_std": 0.6272533088922501, | |
| "rewards/cosine_scaled_reward": -0.012739824131131172, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 3104.31258392334, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.8529700040817261, | |
| "kl": 0.52423095703125, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0561, | |
| "reward": 0.34972720965743065, | |
| "reward_std": 0.8778971843421459, | |
| "rewards/cosine_scaled_reward": -0.033469736110419035, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 3088.354278564453, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 1.0896036624908447, | |
| "kl": 0.4991455078125, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0909, | |
| "reward": 0.26875314209610224, | |
| "reward_std": 0.7737620323896408, | |
| "rewards/cosine_scaled_reward": -0.12604010291397572, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 2515.1459045410156, | |
| "epoch": 0.296, | |
| "grad_norm": 0.5050967335700989, | |
| "kl": 0.449066162109375, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0397, | |
| "reward": 0.7024204786866903, | |
| "reward_std": 0.7655514609068632, | |
| "rewards/cosine_scaled_reward": -0.002956441603600979, | |
| "rewards/format_reward": 0.7083333525806665, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 2217.250015258789, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.4907468855381012, | |
| "kl": 0.36773681640625, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0207, | |
| "reward": 0.9972690381109715, | |
| "reward_std": 0.8888046778738499, | |
| "rewards/cosine_scaled_reward": 0.18613450415432453, | |
| "rewards/format_reward": 0.6250000018626451, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2846.3958740234375, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.5651382207870483, | |
| "kl": 0.461669921875, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0341, | |
| "reward": 0.17303861770778894, | |
| "reward_std": 0.6959999911487103, | |
| "rewards/cosine_scaled_reward": -0.14264736231416464, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 2430.916717529297, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.7266518473625183, | |
| "kl": 0.408447265625, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0076, | |
| "reward": 0.26563636353239417, | |
| "reward_std": 0.5362856052815914, | |
| "rewards/cosine_scaled_reward": -0.2213484961539507, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 2708.4583740234375, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.6144644618034363, | |
| "kl": 0.48321533203125, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0328, | |
| "reward": 0.16829278273507953, | |
| "reward_std": 0.7771525047719479, | |
| "rewards/cosine_scaled_reward": -0.2179369544610381, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 2807.541717529297, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.6735979914665222, | |
| "kl": 0.43017578125, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0176, | |
| "reward": 0.5329833417199552, | |
| "reward_std": 0.7325766794383526, | |
| "rewards/cosine_scaled_reward": -0.0772583307698369, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 2439.3334197998047, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 1.1831626892089844, | |
| "kl": 0.361572265625, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0469, | |
| "reward": 0.6903195958584547, | |
| "reward_std": 0.82445028424263, | |
| "rewards/cosine_scaled_reward": -0.009006870910525322, | |
| "rewards/format_reward": 0.708333345130086, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 2822.1250915527344, | |
| "epoch": 0.304, | |
| "grad_norm": 0.8838703036308289, | |
| "kl": 0.39208984375, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.005, | |
| "reward": 0.3137262724339962, | |
| "reward_std": 0.7213578186929226, | |
| "rewards/cosine_scaled_reward": -0.14522020891308784, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 3202.937530517578, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.977644145488739, | |
| "kl": 0.5537109375, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0299, | |
| "reward": 0.054544554091989994, | |
| "reward_std": 0.6957491394132376, | |
| "rewards/cosine_scaled_reward": -0.128977719694376, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 2406.0208740234375, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.6916462182998657, | |
| "kl": 0.3612060546875, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.031, | |
| "reward": 0.06796193681657314, | |
| "reward_std": 0.5696085840463638, | |
| "rewards/cosine_scaled_reward": -0.20560237113386393, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 2778.479217529297, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.8052944540977478, | |
| "kl": 0.3636474609375, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0335, | |
| "reward": 0.3834271663799882, | |
| "reward_std": 0.865951232612133, | |
| "rewards/cosine_scaled_reward": -0.11036975774914026, | |
| "rewards/format_reward": 0.6041666809469461, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 2532.1458892822266, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 1.3412328958511353, | |
| "kl": 0.26837158203125, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0278, | |
| "reward": 0.8955399505794048, | |
| "reward_std": 0.9300484023988247, | |
| "rewards/cosine_scaled_reward": 0.09360331390053034, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 2416.395950317383, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 1.7048732042312622, | |
| "kl": 0.30621337890625, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0786, | |
| "reward": 0.9949164763092995, | |
| "reward_std": 0.953071303665638, | |
| "rewards/cosine_scaled_reward": 0.14329158514738083, | |
| "rewards/format_reward": 0.708333345130086, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 3282.5834350585938, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.832673966884613, | |
| "kl": 0.585693359375, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0292, | |
| "reward": 0.19685243256390095, | |
| "reward_std": 0.7315353937447071, | |
| "rewards/cosine_scaled_reward": -0.0994904637336731, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 2979.0625915527344, | |
| "epoch": 0.312, | |
| "grad_norm": 0.6270153522491455, | |
| "kl": 0.535888671875, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0332, | |
| "reward": 0.3002479658462107, | |
| "reward_std": 0.8212124370038509, | |
| "rewards/cosine_scaled_reward": -0.09987602103501558, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1942.5000457763672, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.7007995247840881, | |
| "kl": 0.320556640625, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": -0.0018, | |
| "reward": 1.0117733776569366, | |
| "reward_std": 0.7953921295702457, | |
| "rewards/cosine_scaled_reward": 0.14130336791276932, | |
| "rewards/format_reward": 0.7291666753590107, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 2129.750045776367, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.5516811609268188, | |
| "kl": 0.302520751953125, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0104, | |
| "reward": 0.5132089601829648, | |
| "reward_std": 0.9499057307839394, | |
| "rewards/cosine_scaled_reward": 0.027437805198132992, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 2182.4166984558105, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.8734866380691528, | |
| "kl": 0.3303680419921875, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0497, | |
| "reward": 0.6296617463231087, | |
| "reward_std": 0.8634731210768223, | |
| "rewards/cosine_scaled_reward": 0.033580862917006016, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 2209.4792098999023, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 1.0455769300460815, | |
| "kl": 0.2725830078125, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0508, | |
| "reward": 0.6334413010627031, | |
| "reward_std": 0.9880629740655422, | |
| "rewards/cosine_scaled_reward": -0.016612697392702103, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 2201.250030517578, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.5012733340263367, | |
| "kl": 0.374755859375, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0199, | |
| "reward": 0.843031425960362, | |
| "reward_std": 0.5963149815797806, | |
| "rewards/cosine_scaled_reward": 0.04651568736881018, | |
| "rewards/format_reward": 0.7500000167638063, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 2702.8333892822266, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 1.3764408826828003, | |
| "kl": 0.383636474609375, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": -0.018, | |
| "reward": 0.4093172252178192, | |
| "reward_std": 0.7577077932655811, | |
| "rewards/cosine_scaled_reward": -0.0765913873910904, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 2229.104232788086, | |
| "epoch": 0.32, | |
| "grad_norm": 1.4205951690673828, | |
| "kl": 0.393707275390625, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": -0.0021, | |
| "reward": 1.024111781269312, | |
| "reward_std": 1.1223380044102669, | |
| "rewards/cosine_scaled_reward": 0.12663922272622585, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 3130.2084045410156, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.6159874200820923, | |
| "kl": 0.49462890625, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0566, | |
| "reward": -0.1348068374209106, | |
| "reward_std": 0.7328717596828938, | |
| "rewards/cosine_scaled_reward": -0.2132367566227913, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 2341.541717529297, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 1.7661807537078857, | |
| "kl": 0.31353759765625, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": -0.0445, | |
| "reward": 0.6573122011031955, | |
| "reward_std": 0.7708069607615471, | |
| "rewards/cosine_scaled_reward": 0.04740610299631953, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2914.729217529297, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 1.0965200662612915, | |
| "kl": 0.47216796875, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0109, | |
| "reward": 0.5034051882103086, | |
| "reward_std": 0.7967246808111668, | |
| "rewards/cosine_scaled_reward": 0.012119237333536148, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 2276.291732788086, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.4287702441215515, | |
| "kl": 0.3963623046875, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0495, | |
| "reward": 0.3963469974696636, | |
| "reward_std": 0.6148848831653595, | |
| "rewards/cosine_scaled_reward": -0.12474317848682404, | |
| "rewards/format_reward": 0.6458333414047956, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 2141.0833740234375, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.2775723934173584, | |
| "kl": 0.219696044921875, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0093, | |
| "reward": 0.5369033999741077, | |
| "reward_std": 0.5743259247392416, | |
| "rewards/cosine_scaled_reward": -0.11696498841047287, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 2264.7709350585938, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 2.459235668182373, | |
| "kl": 0.26708984375, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.1085, | |
| "reward": 0.6158911837264895, | |
| "reward_std": 1.0042807385325432, | |
| "rewards/cosine_scaled_reward": 0.01627892069518566, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1770.3750343322754, | |
| "epoch": 0.328, | |
| "grad_norm": 0.3129092752933502, | |
| "kl": 0.2149658203125, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0042, | |
| "reward": 0.5954279694706202, | |
| "reward_std": 0.46624701749533415, | |
| "rewards/cosine_scaled_reward": -0.05645267991349101, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 2374.270881652832, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.4308589696884155, | |
| "kl": 0.286712646484375, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0262, | |
| "reward": 0.4366315193474293, | |
| "reward_std": 0.8237803354859352, | |
| "rewards/cosine_scaled_reward": -0.052517580799758434, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 2094.7292289733887, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.5887665748596191, | |
| "kl": 0.2393341064453125, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0238, | |
| "reward": 0.634770268574357, | |
| "reward_std": 0.559820894151926, | |
| "rewards/cosine_scaled_reward": -0.047198209911584854, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 2255.979263305664, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.6363411545753479, | |
| "kl": 0.2767333984375, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": -0.0051, | |
| "reward": 0.5892711323685944, | |
| "reward_std": 0.595702001824975, | |
| "rewards/cosine_scaled_reward": -0.09078110568225384, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 2604.604263305664, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.6978944540023804, | |
| "kl": 0.255279541015625, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0281, | |
| "reward": 0.49167120084166527, | |
| "reward_std": 0.848762433975935, | |
| "rewards/cosine_scaled_reward": -0.1187477596104145, | |
| "rewards/format_reward": 0.7291666846722364, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 2789.7500610351562, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.8494940400123596, | |
| "kl": 0.31634521484375, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": -0.003, | |
| "reward": 0.5868554009357467, | |
| "reward_std": 0.5472784880548716, | |
| "rewards/cosine_scaled_reward": -0.08157230284996331, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 2231.8333740234375, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.8752176761627197, | |
| "kl": 0.223968505859375, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": -0.0123, | |
| "reward": 0.7991173285990953, | |
| "reward_std": 0.4973907843232155, | |
| "rewards/cosine_scaled_reward": -0.006691355258226395, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 2741.604232788086, | |
| "epoch": 0.336, | |
| "grad_norm": 0.7732493877410889, | |
| "kl": 0.2772216796875, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0615, | |
| "reward": 0.5131175501737744, | |
| "reward_std": 0.6472405269742012, | |
| "rewards/cosine_scaled_reward": 0.01697544753551483, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 2742.9166870117188, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.7361288666725159, | |
| "kl": 0.2744140625, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0561, | |
| "reward": 0.6931683626025915, | |
| "reward_std": 0.7358212843537331, | |
| "rewards/cosine_scaled_reward": 0.044500820338726044, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 3106.4584350585938, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.4049779176712036, | |
| "kl": 0.263427734375, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0199, | |
| "reward": 0.3909061732701957, | |
| "reward_std": 0.6087947525084019, | |
| "rewards/cosine_scaled_reward": -0.13788025826215744, | |
| "rewards/format_reward": 0.6666666902601719, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 3155.0208892822266, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.36298710107803345, | |
| "kl": 0.312744140625, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0074, | |
| "reward": 0.08901350852102041, | |
| "reward_std": 0.6059992350637913, | |
| "rewards/cosine_scaled_reward": -0.13257658202201128, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 2160.562545776367, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.6580337882041931, | |
| "kl": 0.15863037109375, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0368, | |
| "reward": 0.4065987435169518, | |
| "reward_std": 0.6639986764639616, | |
| "rewards/cosine_scaled_reward": -0.06753396429121494, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 2909.5208587646484, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.5796670913696289, | |
| "kl": 0.21197509765625, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0596, | |
| "reward": 0.6528121263254434, | |
| "reward_std": 0.7620734348893166, | |
| "rewards/cosine_scaled_reward": 0.01390604767948389, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 2671.916748046875, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.509225606918335, | |
| "kl": 0.208465576171875, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.006, | |
| "reward": 0.49726424319669604, | |
| "reward_std": 0.5585810020565987, | |
| "rewards/cosine_scaled_reward": -0.053451212123036385, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 2402.916748046875, | |
| "epoch": 0.344, | |
| "grad_norm": 0.4044334292411804, | |
| "kl": 0.20062255859375, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0287, | |
| "reward": 0.5725528532639146, | |
| "reward_std": 0.642670027911663, | |
| "rewards/cosine_scaled_reward": -0.047056916169822216, | |
| "rewards/format_reward": 0.6666666697710752, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 2237.020896911621, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 1.1649314165115356, | |
| "kl": 0.206268310546875, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0401, | |
| "reward": 0.5389626898686402, | |
| "reward_std": 1.0156296789646149, | |
| "rewards/cosine_scaled_reward": 0.019481339491903782, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 2634.166748046875, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.5552562475204468, | |
| "kl": 0.272735595703125, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0404, | |
| "reward": 0.4783714674413204, | |
| "reward_std": 0.7024741154164076, | |
| "rewards/cosine_scaled_reward": -0.14623095095157623, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 2843.104263305664, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.3334513306617737, | |
| "kl": 0.30706787109375, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0326, | |
| "reward": 0.3916517546167597, | |
| "reward_std": 0.644689092412591, | |
| "rewards/cosine_scaled_reward": -0.14792412985116243, | |
| "rewards/format_reward": 0.6875000167638063, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 3143.8125610351562, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.46253135800361633, | |
| "kl": 0.371826171875, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0503, | |
| "reward": 0.2803553529083729, | |
| "reward_std": 0.6069540809839964, | |
| "rewards/cosine_scaled_reward": -0.1619056654162705, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 2349.5833892822266, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.9043461680412292, | |
| "kl": 0.2369384765625, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0383, | |
| "reward": 1.2639683187007904, | |
| "reward_std": 0.8216119185090065, | |
| "rewards/cosine_scaled_reward": 0.21531746815890074, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 2729.500045776367, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.7317904829978943, | |
| "kl": 0.35693359375, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": -0.0004, | |
| "reward": 0.35641809552907944, | |
| "reward_std": 0.8619059510529041, | |
| "rewards/cosine_scaled_reward": -0.07179095968604088, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 3229.0209045410156, | |
| "epoch": 0.352, | |
| "grad_norm": 0.7616815567016602, | |
| "kl": 0.499267578125, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0444, | |
| "reward": 0.3788898056373, | |
| "reward_std": 0.7400763519108295, | |
| "rewards/cosine_scaled_reward": -0.12305509229190648, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 2903.291748046875, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 1.0771965980529785, | |
| "kl": 0.3709716796875, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0667, | |
| "reward": 0.2175411656498909, | |
| "reward_std": 0.7625751979649067, | |
| "rewards/cosine_scaled_reward": -0.17247941810637712, | |
| "rewards/format_reward": 0.5625000167638063, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 2517.5209045410156, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.8801992535591125, | |
| "kl": 0.4547119140625, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0203, | |
| "reward": 0.3958463743329048, | |
| "reward_std": 0.7051512375473976, | |
| "rewards/cosine_scaled_reward": -0.13541015330702066, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 2367.395881652832, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.4860677421092987, | |
| "kl": 0.287261962890625, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0301, | |
| "reward": 0.8140461907387362, | |
| "reward_std": 0.729220163077116, | |
| "rewards/cosine_scaled_reward": 0.032023081090301275, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 2170.0000381469727, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 1.8704103231430054, | |
| "kl": 0.35418701171875, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": -0.0452, | |
| "reward": 0.9425890631973743, | |
| "reward_std": 0.5832888670265675, | |
| "rewards/cosine_scaled_reward": 0.13796119578182697, | |
| "rewards/format_reward": 0.6666666697710752, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 2891.3334045410156, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.5072900652885437, | |
| "kl": 0.40447998046875, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0527, | |
| "reward": 0.9016526590567082, | |
| "reward_std": 0.901871845126152, | |
| "rewards/cosine_scaled_reward": 0.10707633011043072, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 2322.729248046875, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.8013210296630859, | |
| "kl": 0.294403076171875, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": -0.0126, | |
| "reward": 1.0439924612874165, | |
| "reward_std": 0.8895706832408905, | |
| "rewards/cosine_scaled_reward": 0.16782954335212708, | |
| "rewards/format_reward": 0.7083333525806665, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 2793.3333892822266, | |
| "epoch": 0.36, | |
| "grad_norm": 1.2426762580871582, | |
| "kl": 0.43634033203125, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0831, | |
| "reward": 0.48491090000607073, | |
| "reward_std": 0.70790103264153, | |
| "rewards/cosine_scaled_reward": -0.049211218021810055, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 3277.3750915527344, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 1.8028209209442139, | |
| "kl": 0.505859375, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": -0.0052, | |
| "reward": 0.30412304773926735, | |
| "reward_std": 0.6635380201041698, | |
| "rewards/cosine_scaled_reward": -0.19168847613036633, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 2602.041717529297, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 1.0472040176391602, | |
| "kl": 0.38592529296875, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0476, | |
| "reward": 0.5747585827484727, | |
| "reward_std": 0.9246655404567719, | |
| "rewards/cosine_scaled_reward": -0.004287379328161478, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 2433.229232788086, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.5204926133155823, | |
| "kl": 0.3592529296875, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0436, | |
| "reward": 0.6435124352574348, | |
| "reward_std": 0.7634010724723339, | |
| "rewards/cosine_scaled_reward": -0.08449378702789545, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 2784.708396911621, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.5020449161529541, | |
| "kl": 0.378021240234375, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0635, | |
| "reward": 0.4931874219328165, | |
| "reward_std": 0.7557184733450413, | |
| "rewards/cosine_scaled_reward": -0.149239641148597, | |
| "rewards/format_reward": 0.7916666846722364, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 2341.791717529297, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 1.0193767547607422, | |
| "kl": 0.27911376953125, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0637, | |
| "reward": 0.6816958046983927, | |
| "reward_std": 0.7552312985062599, | |
| "rewards/cosine_scaled_reward": -0.08623544871807098, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 2105.729202270508, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.3755457103252411, | |
| "kl": 0.26214599609375, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0043, | |
| "reward": 1.0699648894369602, | |
| "reward_std": 0.8030014112591743, | |
| "rewards/cosine_scaled_reward": 0.13914906978607178, | |
| "rewards/format_reward": 0.7916666902601719, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 2367.6875610351562, | |
| "epoch": 0.368, | |
| "grad_norm": 0.4942283034324646, | |
| "kl": 0.30426025390625, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0148, | |
| "reward": 0.7899385392665863, | |
| "reward_std": 0.743595227599144, | |
| "rewards/cosine_scaled_reward": -0.03211408853530884, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 2852.3958892822266, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 1.1540416479110718, | |
| "kl": 0.41192626953125, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0112, | |
| "reward": 0.5696731898933649, | |
| "reward_std": 0.6305563114583492, | |
| "rewards/cosine_scaled_reward": -0.09016341622918844, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 3199.541748046875, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.42631351947784424, | |
| "kl": 0.4495849609375, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0534, | |
| "reward": 0.3698822483420372, | |
| "reward_std": 0.7412667665630579, | |
| "rewards/cosine_scaled_reward": -0.09630887286039069, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 3050.104217529297, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.44686606526374817, | |
| "kl": 0.448394775390625, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0349, | |
| "reward": 0.28104627318680286, | |
| "reward_std": 0.8294301778078079, | |
| "rewards/cosine_scaled_reward": -0.09906021226197481, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 2556.916732788086, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.713642954826355, | |
| "kl": 0.252593994140625, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0419, | |
| "reward": 0.625084163621068, | |
| "reward_std": 0.7249293215572834, | |
| "rewards/cosine_scaled_reward": -0.09370792843401432, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 2512.229232788086, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.6753360033035278, | |
| "kl": 0.29766845703125, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0522, | |
| "reward": 0.9986484311521053, | |
| "reward_std": 0.7536502368748188, | |
| "rewards/cosine_scaled_reward": 0.08265753649175167, | |
| "rewards/format_reward": 0.8333333488553762, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 3129.541717529297, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.6392536163330078, | |
| "kl": 0.433837890625, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0176, | |
| "reward": 0.41261669318191707, | |
| "reward_std": 0.6660363562405109, | |
| "rewards/cosine_scaled_reward": -0.08535831584595144, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 2198.8542404174805, | |
| "epoch": 0.376, | |
| "grad_norm": 0.7879998087882996, | |
| "kl": 0.2938232421875, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0376, | |
| "reward": 0.8884001635015011, | |
| "reward_std": 0.8436827287077904, | |
| "rewards/cosine_scaled_reward": 0.06920007988810539, | |
| "rewards/format_reward": 0.7500000335276127, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 2306.812530517578, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.3462686538696289, | |
| "kl": 0.298553466796875, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0274, | |
| "reward": 0.38892703130841255, | |
| "reward_std": 0.7339756563305855, | |
| "rewards/cosine_scaled_reward": -0.18053649738430977, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 2696.229248046875, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.5001699924468994, | |
| "kl": 0.39886474609375, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0252, | |
| "reward": 0.17869803123176098, | |
| "reward_std": 0.6411689594388008, | |
| "rewards/cosine_scaled_reward": -0.19190098624676466, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 2362.645896911621, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 0.9171366691589355, | |
| "kl": 0.287750244140625, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": -0.0042, | |
| "reward": 0.7273948602378368, | |
| "reward_std": 0.644059307873249, | |
| "rewards/cosine_scaled_reward": -0.06338590569794178, | |
| "rewards/format_reward": 0.8541666828095913, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 2217.7500610351562, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.4733556807041168, | |
| "kl": 0.19439697265625, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0278, | |
| "reward": 0.6465493626892567, | |
| "reward_std": 0.7646395452320576, | |
| "rewards/cosine_scaled_reward": -0.11422532517462969, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 2848.5208740234375, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.6546262502670288, | |
| "kl": 0.42724609375, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0575, | |
| "reward": 0.2325472510419786, | |
| "reward_std": 0.8501592725515366, | |
| "rewards/cosine_scaled_reward": -0.19622638076543808, | |
| "rewards/format_reward": 0.6250000204890966, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 2426.750068664551, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.459773987531662, | |
| "kl": 0.3013916015625, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.002, | |
| "reward": 0.7257073321379721, | |
| "reward_std": 0.8327980078756809, | |
| "rewards/cosine_scaled_reward": 0.01910366490483284, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 2826.104232788086, | |
| "epoch": 0.384, | |
| "grad_norm": 0.9565772414207458, | |
| "kl": 0.369140625, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0762, | |
| "reward": 0.6304492875933647, | |
| "reward_std": 0.9015435054898262, | |
| "rewards/cosine_scaled_reward": -0.0076920222491025925, | |
| "rewards/format_reward": 0.6458333600312471, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 2443.3958587646484, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.9018928408622742, | |
| "kl": 0.2510223388671875, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0516, | |
| "reward": 0.4559049401432276, | |
| "reward_std": 0.6961762681603432, | |
| "rewards/cosine_scaled_reward": -0.18871420342475176, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 2281.166717529297, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.41089001297950745, | |
| "kl": 0.32318115234375, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0283, | |
| "reward": 0.664438179373974, | |
| "reward_std": 0.6754178777337074, | |
| "rewards/cosine_scaled_reward": -0.04278091713786125, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 2934.458427429199, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.45900437235832214, | |
| "kl": 0.371612548828125, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0265, | |
| "reward": 0.5773372187613859, | |
| "reward_std": 0.5042719468474388, | |
| "rewards/cosine_scaled_reward": -0.0759147321805358, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 2755.4375610351562, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.4454108476638794, | |
| "kl": 0.3587646484375, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0325, | |
| "reward": 0.4514606408774853, | |
| "reward_std": 0.6363845467567444, | |
| "rewards/cosine_scaled_reward": -0.1805196925997734, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 2544.6250610351562, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 1.1304974555969238, | |
| "kl": 0.305999755859375, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0396, | |
| "reward": 1.0055190767161548, | |
| "reward_std": 0.7852493021637201, | |
| "rewards/cosine_scaled_reward": 0.14859285950660706, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 2673.812530517578, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 1.1692603826522827, | |
| "kl": 0.439208984375, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0718, | |
| "reward": 0.4766305387020111, | |
| "reward_std": 0.8266436979174614, | |
| "rewards/cosine_scaled_reward": -0.11585140600800514, | |
| "rewards/format_reward": 0.7083333525806665, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 3054.729217529297, | |
| "epoch": 0.392, | |
| "grad_norm": 0.7233371138572693, | |
| "kl": 0.43121337890625, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0223, | |
| "reward": 0.8223184086382389, | |
| "reward_std": 0.8432090878486633, | |
| "rewards/cosine_scaled_reward": 0.09865919034928083, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 2295.4584350585938, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.29200509190559387, | |
| "kl": 0.244537353515625, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0264, | |
| "reward": 1.348364820703864, | |
| "reward_std": 0.6582886949181557, | |
| "rewards/cosine_scaled_reward": 0.20543239824473858, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 2679.2084197998047, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.8350253105163574, | |
| "kl": 0.42327880859375, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0278, | |
| "reward": 1.0699175857007504, | |
| "reward_std": 0.5837405696511269, | |
| "rewards/cosine_scaled_reward": 0.12870877236127853, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 2560.666748046875, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.6276114583015442, | |
| "kl": 0.397705078125, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0694, | |
| "reward": 0.46554950438439846, | |
| "reward_std": 0.5874128863215446, | |
| "rewards/cosine_scaled_reward": -0.17347524780780077, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 2896.354278564453, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 1.1534738540649414, | |
| "kl": 0.4324951171875, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0784, | |
| "reward": 0.5726351104676723, | |
| "reward_std": 0.8148692063987255, | |
| "rewards/cosine_scaled_reward": -0.09909912198781967, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 2368.6251068115234, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.5168960094451904, | |
| "kl": 0.35772705078125, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0494, | |
| "reward": 0.7037911647930741, | |
| "reward_std": 0.7166893742978573, | |
| "rewards/cosine_scaled_reward": -0.07518776506185532, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 2686.5834045410156, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 0.9646963477134705, | |
| "kl": 0.477783203125, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.07, | |
| "reward": 0.7249975223094225, | |
| "reward_std": 0.8503868244588375, | |
| "rewards/cosine_scaled_reward": -0.07500123046338558, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 2476.979202270508, | |
| "epoch": 0.4, | |
| "grad_norm": 1.1782015562057495, | |
| "kl": 0.65362548828125, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0431, | |
| "reward": 0.44952827505767345, | |
| "reward_std": 0.6957900896668434, | |
| "rewards/cosine_scaled_reward": -0.1398192130291136, | |
| "rewards/format_reward": 0.7291666902601719, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 2507.9375762939453, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.49656009674072266, | |
| "kl": 0.3612060546875, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0399, | |
| "reward": 0.9078153409063816, | |
| "reward_std": 0.8314991928637028, | |
| "rewards/cosine_scaled_reward": 0.005991012789309025, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 2370.604217529297, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 1.5748661756515503, | |
| "kl": 0.4361572265625, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0125, | |
| "reward": 0.6327532059513032, | |
| "reward_std": 0.7839572783559561, | |
| "rewards/cosine_scaled_reward": 0.01429326320067048, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 1954.3125686645508, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.35592421889305115, | |
| "kl": 0.27490234375, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0218, | |
| "reward": 0.941289596259594, | |
| "reward_std": 0.8161356300115585, | |
| "rewards/cosine_scaled_reward": 0.04356144741177559, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 2080.3959045410156, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.3516908288002014, | |
| "kl": 0.274932861328125, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0192, | |
| "reward": 0.9610122065059841, | |
| "reward_std": 0.6026036366820335, | |
| "rewards/cosine_scaled_reward": 0.043006100691854954, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 2038.5209197998047, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.6201685667037964, | |
| "kl": 0.34228515625, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0472, | |
| "reward": 0.974739572731778, | |
| "reward_std": 0.9850451275706291, | |
| "rewards/cosine_scaled_reward": 0.11236979370005429, | |
| "rewards/format_reward": 0.7500000093132257, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 2371.541748046875, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.4849975109100342, | |
| "kl": 0.395751953125, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0326, | |
| "reward": 0.623257277533412, | |
| "reward_std": 0.7509507350623608, | |
| "rewards/cosine_scaled_reward": -0.0946213798597455, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 2560.7709045410156, | |
| "epoch": 0.408, | |
| "grad_norm": 0.7741249799728394, | |
| "kl": 0.390869140625, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.012, | |
| "reward": 0.5958885028958321, | |
| "reward_std": 0.7125682160258293, | |
| "rewards/cosine_scaled_reward": -0.11872242018580437, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 2100.729202270508, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 2.192291021347046, | |
| "kl": 0.4425048828125, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.1018, | |
| "reward": 0.8158981092274189, | |
| "reward_std": 0.7913046702742577, | |
| "rewards/cosine_scaled_reward": 0.04336569644510746, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 2156.3333892822266, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.9902085065841675, | |
| "kl": 0.37115478515625, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0701, | |
| "reward": 0.6149628674611449, | |
| "reward_std": 0.7031097002327442, | |
| "rewards/cosine_scaled_reward": -0.10918523697182536, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 2550.729232788086, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.8987974524497986, | |
| "kl": 0.4498291015625, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0107, | |
| "reward": 0.7415556833148003, | |
| "reward_std": 0.8184454329311848, | |
| "rewards/cosine_scaled_reward": -0.05630549229681492, | |
| "rewards/format_reward": 0.8541666939854622, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 2538.6875610351562, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 1.1241296529769897, | |
| "kl": 0.4794921875, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0314, | |
| "reward": 0.8051093611866236, | |
| "reward_std": 0.836889635771513, | |
| "rewards/cosine_scaled_reward": -0.014111996628344059, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 1696.6042442321777, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.36879751086235046, | |
| "kl": 0.3428955078125, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0423, | |
| "reward": 0.807725053280592, | |
| "reward_std": 0.5215695351362228, | |
| "rewards/cosine_scaled_reward": -0.033637505024671555, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 1807.4792098999023, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.5585716366767883, | |
| "kl": 0.396026611328125, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0461, | |
| "reward": 0.49642418074654415, | |
| "reward_std": 0.7605064287781715, | |
| "rewards/cosine_scaled_reward": -0.09553791396319866, | |
| "rewards/format_reward": 0.687500013038516, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 2447.354217529297, | |
| "epoch": 0.416, | |
| "grad_norm": 0.512050211429596, | |
| "kl": 0.5145263671875, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0591, | |
| "reward": 0.1905357912182808, | |
| "reward_std": 0.6694304198026657, | |
| "rewards/cosine_scaled_reward": -0.2588987797498703, | |
| "rewards/format_reward": 0.7083333544433117, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 2767.666778564453, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 1.2084037065505981, | |
| "kl": 0.5614013671875, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0845, | |
| "reward": 0.4251478109508753, | |
| "reward_std": 0.8055129833519459, | |
| "rewards/cosine_scaled_reward": -0.07909277267754078, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 1953.7500305175781, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 1.8075039386749268, | |
| "kl": 0.295562744140625, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0725, | |
| "reward": 1.0165046447655186, | |
| "reward_std": 0.8075561951845884, | |
| "rewards/cosine_scaled_reward": 0.09158563800156116, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 2626.354263305664, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 1.0496845245361328, | |
| "kl": 0.48699951171875, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0661, | |
| "reward": 0.7226119879633188, | |
| "reward_std": 0.9606124758720398, | |
| "rewards/cosine_scaled_reward": 0.02797265024855733, | |
| "rewards/format_reward": 0.666666692122817, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 2698.1459197998047, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 1.6566721200942993, | |
| "kl": 0.4873046875, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0166, | |
| "reward": 0.5656480398029089, | |
| "reward_std": 0.6528111733496189, | |
| "rewards/cosine_scaled_reward": -0.0713426498696208, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 2418.041748046875, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.9178156852722168, | |
| "kl": 0.41973876953125, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0271, | |
| "reward": 0.8303387649357319, | |
| "reward_std": 1.1390304267406464, | |
| "rewards/cosine_scaled_reward": 0.04016935685649514, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 2595.7709350585938, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.9787195324897766, | |
| "kl": 0.5360107421875, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0475, | |
| "reward": 0.6089789252728224, | |
| "reward_std": 0.49529892206192017, | |
| "rewards/cosine_scaled_reward": -0.06009387783706188, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 1669.125057220459, | |
| "epoch": 0.424, | |
| "grad_norm": 0.6832586526870728, | |
| "kl": 0.305511474609375, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0156, | |
| "reward": 1.334755502641201, | |
| "reward_std": 0.7108926326036453, | |
| "rewards/cosine_scaled_reward": 0.2298777117393911, | |
| "rewards/format_reward": 0.8750000037252903, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 2707.750045776367, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.7063215970993042, | |
| "kl": 0.460296630859375, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0567, | |
| "reward": 0.8714766772463918, | |
| "reward_std": 0.7350384518504143, | |
| "rewards/cosine_scaled_reward": 0.0503216665238142, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 1592.6459121704102, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.9033951163291931, | |
| "kl": 0.2605133056640625, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0191, | |
| "reward": 0.7105765882879496, | |
| "reward_std": 0.6581134386360645, | |
| "rewards/cosine_scaled_reward": -0.07179503422230482, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 2403.375030517578, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.9998828768730164, | |
| "kl": 0.48974609375, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0693, | |
| "reward": 0.4763774862512946, | |
| "reward_std": 0.7915669940412045, | |
| "rewards/cosine_scaled_reward": -0.06389461923390627, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 2501.8750762939453, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.610641598701477, | |
| "kl": 0.4688720703125, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0509, | |
| "reward": 0.8910817317664623, | |
| "reward_std": 0.7950854599475861, | |
| "rewards/cosine_scaled_reward": 0.09137419052422047, | |
| "rewards/format_reward": 0.7083333488553762, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 2130.5625610351562, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 0.6212278008460999, | |
| "kl": 0.380096435546875, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.016, | |
| "reward": 0.4537246283143759, | |
| "reward_std": 0.6907018758356571, | |
| "rewards/cosine_scaled_reward": -0.15855437144637108, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 2739.0209197998047, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.9390625357627869, | |
| "kl": 0.5252685546875, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0252, | |
| "reward": 0.7201773710548878, | |
| "reward_std": 0.9287758991122246, | |
| "rewards/cosine_scaled_reward": 0.005922011099755764, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 2193.7709159851074, | |
| "epoch": 0.432, | |
| "grad_norm": 0.5597609877586365, | |
| "kl": 0.43035888671875, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0432, | |
| "reward": 1.151801437139511, | |
| "reward_std": 0.9676465280354023, | |
| "rewards/cosine_scaled_reward": 0.16965069761499763, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 2664.916732788086, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 1.6749874353408813, | |
| "kl": 0.76043701171875, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.1177, | |
| "reward": 0.40762139530852437, | |
| "reward_std": 0.8776891604065895, | |
| "rewards/cosine_scaled_reward": -0.11910597886890173, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 2202.104232788086, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 1.1534351110458374, | |
| "kl": 0.31915283203125, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0704, | |
| "reward": 0.6592778088524938, | |
| "reward_std": 0.7035505771636963, | |
| "rewards/cosine_scaled_reward": -0.0974444393068552, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 2642.104232788086, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.5377724170684814, | |
| "kl": 0.52935791015625, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0518, | |
| "reward": 0.31143795792013407, | |
| "reward_std": 0.7106641083955765, | |
| "rewards/cosine_scaled_reward": -0.17761437874287367, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 2292.2500610351562, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.722557008266449, | |
| "kl": 0.40679931640625, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0297, | |
| "reward": 0.3869906556792557, | |
| "reward_std": 0.6342262029647827, | |
| "rewards/cosine_scaled_reward": -0.23358802066650242, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 2579.416702270508, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.5246860980987549, | |
| "kl": 0.52972412109375, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0596, | |
| "reward": 0.7596514848992229, | |
| "reward_std": 0.8228101618587971, | |
| "rewards/cosine_scaled_reward": 0.02565906196832657, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 1909.2292098999023, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.30958041548728943, | |
| "kl": 0.29815673828125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0282, | |
| "reward": 1.4319515749812126, | |
| "reward_std": 0.7947760932147503, | |
| "rewards/cosine_scaled_reward": 0.2576424600556493, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 2197.312545776367, | |
| "epoch": 0.44, | |
| "grad_norm": 0.6841936111450195, | |
| "kl": 0.4190673828125, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0334, | |
| "reward": 0.49799082055687904, | |
| "reward_std": 0.5665691867470741, | |
| "rewards/cosine_scaled_reward": -0.13642127346247435, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 2120.0834045410156, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.7352175116539001, | |
| "kl": 0.3243408203125, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0477, | |
| "reward": 0.9959990195930004, | |
| "reward_std": 0.7146658357232809, | |
| "rewards/cosine_scaled_reward": 0.06049949396401644, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 2542.3334197998047, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 0.9107005596160889, | |
| "kl": 0.56768798828125, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0838, | |
| "reward": 0.5603130697272718, | |
| "reward_std": 0.7675389684736729, | |
| "rewards/cosine_scaled_reward": -0.03234346956014633, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 2395.937545776367, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.914421796798706, | |
| "kl": 0.484466552734375, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0322, | |
| "reward": 0.9246965646743774, | |
| "reward_std": 0.576560951769352, | |
| "rewards/cosine_scaled_reward": 0.10818159952759743, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 2336.645896911621, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.6898525953292847, | |
| "kl": 0.455474853515625, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0534, | |
| "reward": 0.6510482397861779, | |
| "reward_std": 1.0318073891103268, | |
| "rewards/cosine_scaled_reward": -0.049475882202386856, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 2531.291763305664, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.9229927062988281, | |
| "kl": 0.6007080078125, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.052, | |
| "reward": 0.2972820373252034, | |
| "reward_std": 0.7185437642037868, | |
| "rewards/cosine_scaled_reward": -0.1846923204138875, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 2353.4167404174805, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.569179117679596, | |
| "kl": 0.51171875, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0774, | |
| "reward": 0.6255490938201547, | |
| "reward_std": 0.8703299462795258, | |
| "rewards/cosine_scaled_reward": -0.04139212518930435, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 2231.208366394043, | |
| "epoch": 0.448, | |
| "grad_norm": 1.0076370239257812, | |
| "kl": 0.39593505859375, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0718, | |
| "reward": 0.6922257398255169, | |
| "reward_std": 0.7815711013972759, | |
| "rewards/cosine_scaled_reward": -0.049720464274287224, | |
| "rewards/format_reward": 0.7916666828095913, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 2371.604248046875, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 0.6043879985809326, | |
| "kl": 0.383544921875, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0414, | |
| "reward": 0.8758026575669646, | |
| "reward_std": 0.8594116196036339, | |
| "rewards/cosine_scaled_reward": 0.10456799250096083, | |
| "rewards/format_reward": 0.6666666846722364, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 2847.354248046875, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 0.900895357131958, | |
| "kl": 0.63330078125, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0595, | |
| "reward": 0.09159071743488312, | |
| "reward_std": 0.7040717899799347, | |
| "rewards/cosine_scaled_reward": -0.25628798035904765, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 2110.3125381469727, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 1.2333768606185913, | |
| "kl": 0.345977783203125, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0419, | |
| "reward": 0.7058122660964727, | |
| "reward_std": 0.9388692416250706, | |
| "rewards/cosine_scaled_reward": -0.011677220463752747, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 2366.916717529297, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 1.082507610321045, | |
| "kl": 0.560791015625, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.1017, | |
| "reward": 0.5549185280688107, | |
| "reward_std": 0.8698355071246624, | |
| "rewards/cosine_scaled_reward": -0.09754075668752193, | |
| "rewards/format_reward": 0.7500000186264515, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 2053.937545776367, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 1.2703036069869995, | |
| "kl": 0.31591796875, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": -0.0161, | |
| "reward": 0.7015451728366315, | |
| "reward_std": 0.6856296453624964, | |
| "rewards/cosine_scaled_reward": -0.03464407101273537, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 2001.6458892822266, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.640113890171051, | |
| "kl": 0.2894744873046875, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": -0.0049, | |
| "reward": 0.46910549892345443, | |
| "reward_std": 0.7106455899775028, | |
| "rewards/cosine_scaled_reward": -0.1404472654685378, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 2174.875068664551, | |
| "epoch": 0.456, | |
| "grad_norm": 0.405164510011673, | |
| "kl": 0.3242950439453125, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0278, | |
| "reward": 0.7996035069227219, | |
| "reward_std": 0.7646129336208105, | |
| "rewards/cosine_scaled_reward": -0.027281596325337887, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 1563.2083892822266, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.38448086380958557, | |
| "kl": 0.147735595703125, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0137, | |
| "reward": 1.3166161552071571, | |
| "reward_std": 0.7907492704689503, | |
| "rewards/cosine_scaled_reward": 0.23122474236879498, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 2731.1875762939453, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 0.966650128364563, | |
| "kl": 0.475494384765625, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0993, | |
| "reward": 0.7280457699671388, | |
| "reward_std": 0.8715083934366703, | |
| "rewards/cosine_scaled_reward": 0.041106189135462046, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 2064.7292404174805, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 0.3579236567020416, | |
| "kl": 0.318389892578125, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.057, | |
| "reward": 0.6332701966166496, | |
| "reward_std": 0.6316654235124588, | |
| "rewards/cosine_scaled_reward": -0.05836488865315914, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 1992.0625610351562, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 0.3985031843185425, | |
| "kl": 0.314453125, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0231, | |
| "reward": 0.7511493074707687, | |
| "reward_std": 0.7014381438493729, | |
| "rewards/cosine_scaled_reward": -0.030675357207655907, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 1876.354232788086, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 0.5146257281303406, | |
| "kl": 0.3045654296875, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0257, | |
| "reward": 0.6732284021563828, | |
| "reward_std": 0.5320135578513145, | |
| "rewards/cosine_scaled_reward": -0.038385817781090736, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 1674.3750381469727, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 0.8119452595710754, | |
| "kl": 0.293060302734375, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0567, | |
| "reward": 0.8279850594699383, | |
| "reward_std": 0.8649867400527, | |
| "rewards/cosine_scaled_reward": 0.08065919205546379, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 1803.916732788086, | |
| "epoch": 0.464, | |
| "grad_norm": 0.8215560913085938, | |
| "kl": 0.1740570068359375, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0547, | |
| "reward": 1.2124288752675056, | |
| "reward_std": 0.8886886425316334, | |
| "rewards/cosine_scaled_reward": 0.1687144124880433, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 2204.1875610351562, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 0.8370880484580994, | |
| "kl": 0.3338470458984375, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0397, | |
| "reward": 0.8222223985940218, | |
| "reward_std": 0.762837752699852, | |
| "rewards/cosine_scaled_reward": -0.005555473268032074, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 2199.437545776367, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.44091248512268066, | |
| "kl": 0.265228271484375, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0208, | |
| "reward": 1.1622834838926792, | |
| "reward_std": 0.6865539737045765, | |
| "rewards/cosine_scaled_reward": 0.16447505727410316, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 2955.916778564453, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.9645739197731018, | |
| "kl": 0.5897216796875, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0554, | |
| "reward": 0.31450588814914227, | |
| "reward_std": 0.8947923108935356, | |
| "rewards/cosine_scaled_reward": -0.13441372802481055, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 2469.7084159851074, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 1.2290736436843872, | |
| "kl": 0.5966949462890625, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0567, | |
| "reward": 0.38221518974751234, | |
| "reward_std": 0.8091713823378086, | |
| "rewards/cosine_scaled_reward": -0.10055907690548338, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 2658.666763305664, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 0.5205665826797485, | |
| "kl": 0.40289306640625, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0389, | |
| "reward": 1.0305472910404205, | |
| "reward_std": 0.8048725798726082, | |
| "rewards/cosine_scaled_reward": 0.11944028595462441, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 2483.604263305664, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 0.48482832312583923, | |
| "kl": 0.34637451171875, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0136, | |
| "reward": 1.0568090807646513, | |
| "reward_std": 0.7529363892972469, | |
| "rewards/cosine_scaled_reward": 0.10132121946662664, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 1959.5833892822266, | |
| "epoch": 0.472, | |
| "grad_norm": 0.5290913581848145, | |
| "kl": 0.23431396484375, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0159, | |
| "reward": 0.9711576336994767, | |
| "reward_std": 0.8100597076117992, | |
| "rewards/cosine_scaled_reward": 0.0689121619798243, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 2976.6459350585938, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 1.0799856185913086, | |
| "kl": 0.786224365234375, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0644, | |
| "reward": -0.04989051632583141, | |
| "reward_std": 0.6900010071694851, | |
| "rewards/cosine_scaled_reward": -0.24369526095688343, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 3083.2084045410156, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 1.4723012447357178, | |
| "kl": 0.7890625, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0774, | |
| "reward": 0.03524960530921817, | |
| "reward_std": 0.6955935060977936, | |
| "rewards/cosine_scaled_reward": -0.16987520130351186, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 2211.6458740234375, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 0.8043403029441833, | |
| "kl": 0.3631591796875, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.032, | |
| "reward": 0.8334652222692966, | |
| "reward_std": 0.855853334069252, | |
| "rewards/cosine_scaled_reward": 0.02089927066117525, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 2856.541748046875, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 0.6729423403739929, | |
| "kl": 0.6011962890625, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0811, | |
| "reward": 0.2203914044657722, | |
| "reward_std": 0.8685822859406471, | |
| "rewards/cosine_scaled_reward": -0.18147097853943706, | |
| "rewards/format_reward": 0.5833333563059568, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 1945.8125381469727, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 1.0605844259262085, | |
| "kl": 0.321624755859375, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": -0.004, | |
| "reward": 1.1576719619333744, | |
| "reward_std": 0.8231003619730473, | |
| "rewards/cosine_scaled_reward": 0.17258594185113907, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 2345.2500762939453, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 1.400498628616333, | |
| "kl": 0.31787109375, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0464, | |
| "reward": 1.1115051358938217, | |
| "reward_std": 0.885998897254467, | |
| "rewards/cosine_scaled_reward": 0.13908587209880352, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 1978.8542098999023, | |
| "epoch": 0.48, | |
| "grad_norm": 0.7775018215179443, | |
| "kl": 0.303009033203125, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0281, | |
| "reward": 0.667570760473609, | |
| "reward_std": 0.8073213696479797, | |
| "rewards/cosine_scaled_reward": -0.062047986313700676, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 2967.6459350585938, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 0.9923365116119385, | |
| "kl": 0.5174560546875, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0545, | |
| "reward": 0.1129405153915286, | |
| "reward_std": 0.6916285455226898, | |
| "rewards/cosine_scaled_reward": -0.22477975487709045, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 2569.8542404174805, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 0.7899558544158936, | |
| "kl": 0.3731689453125, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0784, | |
| "reward": 0.3494804035872221, | |
| "reward_std": 0.9125073701143265, | |
| "rewards/cosine_scaled_reward": -0.12734314193949103, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 2343.1458892822266, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 0.6041327714920044, | |
| "kl": 0.34786224365234375, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0094, | |
| "reward": 0.37775486428290606, | |
| "reward_std": 0.7306395750492811, | |
| "rewards/cosine_scaled_reward": -0.1652892343699932, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 2692.2709045410156, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 0.5803365707397461, | |
| "kl": 0.385528564453125, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0425, | |
| "reward": 0.3899629784282297, | |
| "reward_std": 0.7263697199523449, | |
| "rewards/cosine_scaled_reward": -0.15918518672697246, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 1992.9791870117188, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.4711278975009918, | |
| "kl": 0.20758056640625, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.045, | |
| "reward": 1.7557712569832802, | |
| "reward_std": 0.7609523758292198, | |
| "rewards/cosine_scaled_reward": 0.49246894381940365, | |
| "rewards/format_reward": 0.7708333488553762, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 1818.3333740234375, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 0.29636478424072266, | |
| "kl": 0.1734161376953125, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0137, | |
| "reward": 0.7233628639951348, | |
| "reward_std": 0.8688660450279713, | |
| "rewards/cosine_scaled_reward": -0.023735247552394867, | |
| "rewards/format_reward": 0.7708333618938923, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 2321.791748046875, | |
| "epoch": 0.488, | |
| "grad_norm": 0.9740093946456909, | |
| "kl": 0.3040771484375, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0749, | |
| "reward": 0.5350995054468513, | |
| "reward_std": 0.9547824487090111, | |
| "rewards/cosine_scaled_reward": 0.01754974015057087, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 2653.5208892822266, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 0.9272693991661072, | |
| "kl": 0.474517822265625, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0771, | |
| "reward": 0.6082139114532765, | |
| "reward_std": 0.9143152646720409, | |
| "rewards/cosine_scaled_reward": -0.06047638365998864, | |
| "rewards/format_reward": 0.7291666902601719, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 1800.8958892822266, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 1.1067618131637573, | |
| "kl": 0.1895751953125, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0414, | |
| "reward": 0.6690875124186277, | |
| "reward_std": 0.7954706754535437, | |
| "rewards/cosine_scaled_reward": -0.050872914493083954, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 2331.1875534057617, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 1.0719366073608398, | |
| "kl": 0.45489501953125, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0482, | |
| "reward": 0.6795799904502928, | |
| "reward_std": 0.6970538776367903, | |
| "rewards/cosine_scaled_reward": -0.03520998451858759, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 2329.9792251586914, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 0.6106032133102417, | |
| "kl": 0.31787109375, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0357, | |
| "reward": 0.5369092933833599, | |
| "reward_std": 0.6611959636211395, | |
| "rewards/cosine_scaled_reward": -0.11696202587336302, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 2501.7709197998047, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 0.4645460546016693, | |
| "kl": 0.380035400390625, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.045, | |
| "reward": 0.5862938910722733, | |
| "reward_std": 0.6654918566346169, | |
| "rewards/cosine_scaled_reward": -0.07143638655543327, | |
| "rewards/format_reward": 0.7291666734963655, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 2829.645896911621, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.5816503167152405, | |
| "kl": 0.444244384765625, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.083, | |
| "reward": 0.5509648718871176, | |
| "reward_std": 0.7200754433870316, | |
| "rewards/cosine_scaled_reward": -0.00576755590736866, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 2463.083427429199, | |
| "epoch": 0.496, | |
| "grad_norm": 0.8237125873565674, | |
| "kl": 0.395416259765625, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0314, | |
| "reward": 0.13537092343904078, | |
| "reward_std": 0.5622758902609348, | |
| "rewards/cosine_scaled_reward": -0.2448145542293787, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 1650.604232788086, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 0.9680685997009277, | |
| "kl": 0.1461639404296875, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": -0.0372, | |
| "reward": 0.49817070248536766, | |
| "reward_std": 0.6576566733419895, | |
| "rewards/cosine_scaled_reward": -0.15716465492732823, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 1983.520881652832, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 0.5045260787010193, | |
| "kl": 0.3553466796875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": -0.0021, | |
| "reward": 0.8208566140383482, | |
| "reward_std": 0.9373398050665855, | |
| "rewards/cosine_scaled_reward": 0.035428304225206375, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 2593.8125610351562, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 0.5763217806816101, | |
| "kl": 0.2955322265625, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0435, | |
| "reward": 0.4606641661375761, | |
| "reward_std": 0.6995394416153431, | |
| "rewards/cosine_scaled_reward": -0.12383459135890007, | |
| "rewards/format_reward": 0.7083333544433117, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 2628.041702270508, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 0.4055953919887543, | |
| "kl": 0.2898712158203125, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0451, | |
| "reward": 0.3238631319254637, | |
| "reward_std": 0.6967433281242847, | |
| "rewards/cosine_scaled_reward": -0.1714017689228058, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 2041.5417442321777, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 0.6679253578186035, | |
| "kl": 0.2696380615234375, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0546, | |
| "reward": 0.5020741457119584, | |
| "reward_std": 0.7230929285287857, | |
| "rewards/cosine_scaled_reward": -0.11354626249521971, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 2653.0625762939453, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 0.9918028712272644, | |
| "kl": 0.390869140625, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0122, | |
| "reward": 0.26619721701717936, | |
| "reward_std": 0.6938467286527157, | |
| "rewards/cosine_scaled_reward": -0.16898473422043025, | |
| "rewards/format_reward": 0.6041666772216558, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 2321.5209045410156, | |
| "epoch": 0.504, | |
| "grad_norm": 1.487559199333191, | |
| "kl": 0.350006103515625, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": -0.0064, | |
| "reward": 0.5201191268861294, | |
| "reward_std": 0.6544113270938396, | |
| "rewards/cosine_scaled_reward": -0.05244044866412878, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 2166.58341217041, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 0.2839236259460449, | |
| "kl": 0.2300567626953125, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0157, | |
| "reward": 0.8514236286282539, | |
| "reward_std": 0.6945049427449703, | |
| "rewards/cosine_scaled_reward": 0.05071181803941727, | |
| "rewards/format_reward": 0.7500000186264515, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 2637.0833892822266, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 0.9685612320899963, | |
| "kl": 0.45916748046875, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0937, | |
| "reward": 0.4695184410084039, | |
| "reward_std": 0.8600027747452259, | |
| "rewards/cosine_scaled_reward": -0.04649078845977783, | |
| "rewards/format_reward": 0.5625000037252903, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 2139.729217529297, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 0.4402044117450714, | |
| "kl": 0.3176116943359375, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0325, | |
| "reward": 0.6444471004651859, | |
| "reward_std": 0.6833576895296574, | |
| "rewards/cosine_scaled_reward": -0.011109774932265282, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 1914.6667175292969, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 0.5042048692703247, | |
| "kl": 0.251068115234375, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.046, | |
| "reward": 0.5679828058928251, | |
| "reward_std": 0.8287508636713028, | |
| "rewards/cosine_scaled_reward": -0.0493419524282217, | |
| "rewards/format_reward": 0.6666666809469461, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 2167.0625381469727, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 0.7239652872085571, | |
| "kl": 0.242706298828125, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0355, | |
| "reward": 0.7304300144314766, | |
| "reward_std": 0.6240549013018608, | |
| "rewards/cosine_scaled_reward": -0.020201677456498146, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 2175.4375915527344, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 0.4598706364631653, | |
| "kl": 0.25213623046875, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0393, | |
| "reward": 0.7061773156747222, | |
| "reward_std": 0.6147476173937321, | |
| "rewards/cosine_scaled_reward": 0.009338663425296545, | |
| "rewards/format_reward": 0.6875000093132257, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 1812.0000457763672, | |
| "epoch": 0.512, | |
| "grad_norm": 0.75312340259552, | |
| "kl": 0.1580963134765625, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0441, | |
| "reward": 0.590579898096621, | |
| "reward_std": 0.6532803736627102, | |
| "rewards/cosine_scaled_reward": -0.11096005886793137, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 1596.2917022705078, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 0.4844936728477478, | |
| "kl": 0.22760009765625, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0554, | |
| "reward": 0.5192018076777458, | |
| "reward_std": 0.6534673273563385, | |
| "rewards/cosine_scaled_reward": -0.09456577710807323, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 2333.8958892822266, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.694821298122406, | |
| "kl": 0.3708038330078125, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0758, | |
| "reward": 0.48664434999227524, | |
| "reward_std": 0.7445627897977829, | |
| "rewards/cosine_scaled_reward": -0.09001116827130318, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 1962.1875610351562, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 1.1499308347702026, | |
| "kl": 0.2388916015625, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0803, | |
| "reward": 0.9750413531437516, | |
| "reward_std": 0.841393306851387, | |
| "rewards/cosine_scaled_reward": 0.08127065747976303, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 2639.0625610351562, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 2.796175241470337, | |
| "kl": 0.3663330078125, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.1874, | |
| "reward": 0.5532118980772793, | |
| "reward_std": 1.0444200411438942, | |
| "rewards/cosine_scaled_reward": -0.004644063068553805, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 2044.625099182129, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 0.9541345238685608, | |
| "kl": 0.255096435546875, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0042, | |
| "reward": 0.8220235072076321, | |
| "reward_std": 0.8156450688838959, | |
| "rewards/cosine_scaled_reward": 0.015178397297859192, | |
| "rewards/format_reward": 0.7916666902601719, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 2003.0833892822266, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 0.8397437334060669, | |
| "kl": 0.21966552734375, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0811, | |
| "reward": 0.449910047929734, | |
| "reward_std": 0.6436374075710773, | |
| "rewards/cosine_scaled_reward": -0.139628317207098, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 2385.166732788086, | |
| "epoch": 0.52, | |
| "grad_norm": 1.0180670022964478, | |
| "kl": 0.420745849609375, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.036, | |
| "reward": 0.3301239423453808, | |
| "reward_std": 0.7285679541528225, | |
| "rewards/cosine_scaled_reward": -0.1682713646441698, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 2575.208396911621, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 0.5599580407142639, | |
| "kl": 0.373779296875, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0236, | |
| "reward": 0.5938101289793849, | |
| "reward_std": 0.8681718483567238, | |
| "rewards/cosine_scaled_reward": -0.05726161040365696, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 2281.354248046875, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 0.6153411865234375, | |
| "kl": 0.3210906982421875, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0206, | |
| "reward": 0.6159039521589875, | |
| "reward_std": 0.680855069309473, | |
| "rewards/cosine_scaled_reward": -0.05663137254305184, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 2125.375068664551, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 1.2423961162567139, | |
| "kl": 0.3084259033203125, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": -0.0183, | |
| "reward": 0.3741978630423546, | |
| "reward_std": 0.8636242747306824, | |
| "rewards/cosine_scaled_reward": -0.1983177432557568, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 2127.812545776367, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 1.718005657196045, | |
| "kl": 0.253997802734375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0943, | |
| "reward": 0.8951551232021302, | |
| "reward_std": 0.9340690299868584, | |
| "rewards/cosine_scaled_reward": 0.062160891480743885, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 2741.1875610351562, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 1.218464970588684, | |
| "kl": 0.42919921875, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.084, | |
| "reward": 0.36291674245148897, | |
| "reward_std": 0.9569993317127228, | |
| "rewards/cosine_scaled_reward": -0.12062497227452695, | |
| "rewards/format_reward": 0.6041666772216558, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 2222.4584159851074, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 1.2166295051574707, | |
| "kl": 0.32196044921875, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0231, | |
| "reward": 0.7657130546867847, | |
| "reward_std": 0.9577609188854694, | |
| "rewards/cosine_scaled_reward": 0.03910651011392474, | |
| "rewards/format_reward": 0.6875000186264515, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 2043.1250686645508, | |
| "epoch": 0.528, | |
| "grad_norm": 0.40705692768096924, | |
| "kl": 0.3497314453125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0577, | |
| "reward": 0.31157703790813684, | |
| "reward_std": 0.6110754385590553, | |
| "rewards/cosine_scaled_reward": -0.20879483548924327, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 2590.8334045410156, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 0.5638855695724487, | |
| "kl": 0.440826416015625, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0413, | |
| "reward": 0.5640892055816948, | |
| "reward_std": 1.1282568126916885, | |
| "rewards/cosine_scaled_reward": -0.02003874396905303, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 1697.8958740234375, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 0.3173580467700958, | |
| "kl": 0.2566986083984375, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0373, | |
| "reward": 0.9147527795284986, | |
| "reward_std": 0.6120780212804675, | |
| "rewards/cosine_scaled_reward": 0.01987638510763645, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 2732.604217529297, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 0.9214864373207092, | |
| "kl": 0.56097412109375, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0832, | |
| "reward": 0.5037600143114105, | |
| "reward_std": 0.827365554869175, | |
| "rewards/cosine_scaled_reward": -0.05020333209540695, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 2255.895881652832, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 0.7400510311126709, | |
| "kl": 0.2809600830078125, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0503, | |
| "reward": 1.0706525277346373, | |
| "reward_std": 0.877636231482029, | |
| "rewards/cosine_scaled_reward": 0.1394929286034312, | |
| "rewards/format_reward": 0.791666692122817, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 2365.375068664551, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 0.9116939306259155, | |
| "kl": 0.427398681640625, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0076, | |
| "reward": 0.40383276902139187, | |
| "reward_std": 0.6409453861415386, | |
| "rewards/cosine_scaled_reward": -0.12100029923021793, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 2636.5417251586914, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 0.706851601600647, | |
| "kl": 0.528076171875, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0899, | |
| "reward": 0.5270813233219087, | |
| "reward_std": 0.9826765283942223, | |
| "rewards/cosine_scaled_reward": -0.0697926718275994, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 2235.6042098999023, | |
| "epoch": 0.536, | |
| "grad_norm": 0.9927799105644226, | |
| "kl": 0.5670166015625, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.045, | |
| "reward": 0.6296205222606659, | |
| "reward_std": 0.7105157673358917, | |
| "rewards/cosine_scaled_reward": -0.01852307841181755, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 2740.979248046875, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 1.1925156116485596, | |
| "kl": 0.44580078125, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0244, | |
| "reward": 0.3472237199312076, | |
| "reward_std": 0.8571870625019073, | |
| "rewards/cosine_scaled_reward": -0.13888813648372889, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 2298.8958892822266, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 0.5537897348403931, | |
| "kl": 0.326873779296875, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0462, | |
| "reward": 0.6785652227699757, | |
| "reward_std": 0.6474947556853294, | |
| "rewards/cosine_scaled_reward": 0.02678260626271367, | |
| "rewards/format_reward": 0.625000013038516, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 2207.2083740234375, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 0.6610918045043945, | |
| "kl": 0.354949951171875, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0261, | |
| "reward": 0.3990835212171078, | |
| "reward_std": 0.659260107204318, | |
| "rewards/cosine_scaled_reward": -0.14420824265107512, | |
| "rewards/format_reward": 0.6875000074505806, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 2732.604248046875, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 0.9713963270187378, | |
| "kl": 0.40972900390625, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.038, | |
| "reward": 0.536033843178302, | |
| "reward_std": 0.7053390368819237, | |
| "rewards/cosine_scaled_reward": -0.10698308888822794, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 2258.875068664551, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 0.7563430666923523, | |
| "kl": 0.446258544921875, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0445, | |
| "reward": 0.9616024261340499, | |
| "reward_std": 0.9263377264142036, | |
| "rewards/cosine_scaled_reward": 0.1370511914137751, | |
| "rewards/format_reward": 0.6875000260770321, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 2066.3750534057617, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.874462366104126, | |
| "kl": 0.272369384765625, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0436, | |
| "reward": 1.0160637801745906, | |
| "reward_std": 0.9121544472873211, | |
| "rewards/cosine_scaled_reward": 0.0809485362842679, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 2530.3333892822266, | |
| "epoch": 0.544, | |
| "grad_norm": 1.8674249649047852, | |
| "kl": 0.4962158203125, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0869, | |
| "reward": 0.6834689327515662, | |
| "reward_std": 1.0683976262807846, | |
| "rewards/cosine_scaled_reward": -0.043682204210199416, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 2057.562557220459, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 1.2835708856582642, | |
| "kl": 0.296966552734375, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0562, | |
| "reward": 1.2522786986082792, | |
| "reward_std": 0.9867831543087959, | |
| "rewards/cosine_scaled_reward": 0.23030602023936808, | |
| "rewards/format_reward": 0.7916666828095913, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 2334.1250534057617, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 0.5521568655967712, | |
| "kl": 0.382965087890625, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0722, | |
| "reward": 0.4701565820723772, | |
| "reward_std": 0.7251895070075989, | |
| "rewards/cosine_scaled_reward": -0.108671719674021, | |
| "rewards/format_reward": 0.6875000074505806, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 2686.3959045410156, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 1.0934919118881226, | |
| "kl": 0.466461181640625, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0255, | |
| "reward": 0.41961941798217595, | |
| "reward_std": 0.8614507652819157, | |
| "rewards/cosine_scaled_reward": -0.13394030090421438, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 2260.020927429199, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 1.4672926664352417, | |
| "kl": 0.4683380126953125, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0376, | |
| "reward": 0.3727748654782772, | |
| "reward_std": 0.5491989254951477, | |
| "rewards/cosine_scaled_reward": -0.18861256520904135, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 2530.729217529297, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 1.4569040536880493, | |
| "kl": 0.45361328125, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0921, | |
| "reward": 0.36487692515947856, | |
| "reward_std": 0.7810634449124336, | |
| "rewards/cosine_scaled_reward": -0.1613115556538105, | |
| "rewards/format_reward": 0.6875000093132257, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 2573.8750915527344, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 1.0434833765029907, | |
| "kl": 0.42730712890625, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0425, | |
| "reward": 0.9190967418253422, | |
| "reward_std": 0.8362308479845524, | |
| "rewards/cosine_scaled_reward": 0.11579836346209049, | |
| "rewards/format_reward": 0.6875000167638063, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 2527.229202270508, | |
| "epoch": 0.552, | |
| "grad_norm": 1.4459831714630127, | |
| "kl": 0.57708740234375, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0301, | |
| "reward": 0.20397535432130098, | |
| "reward_std": 0.5737987570464611, | |
| "rewards/cosine_scaled_reward": -0.22092900332063437, | |
| "rewards/format_reward": 0.6458333488553762, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 2179.6875534057617, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 1.0252877473831177, | |
| "kl": 0.31011962890625, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0744, | |
| "reward": 0.8415414169430733, | |
| "reward_std": 0.6265546232461929, | |
| "rewards/cosine_scaled_reward": 0.014520692639052868, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 2130.3958740234375, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 2.06632924079895, | |
| "kl": 0.285888671875, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": -0.0265, | |
| "reward": 0.9047174965962768, | |
| "reward_std": 0.7558673620223999, | |
| "rewards/cosine_scaled_reward": 0.014858738519251347, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 1874.6458702087402, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 0.5376132130622864, | |
| "kl": 0.2660408020019531, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0535, | |
| "reward": 0.7522924607619643, | |
| "reward_std": 0.5480217132717371, | |
| "rewards/cosine_scaled_reward": -0.01968712778761983, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 1944.1041870117188, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 0.6722615361213684, | |
| "kl": 0.39788818359375, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0286, | |
| "reward": 1.0850566178560257, | |
| "reward_std": 0.8383910320699215, | |
| "rewards/cosine_scaled_reward": 0.15711162728257477, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 2214.08341217041, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 0.5369753241539001, | |
| "kl": 0.4105224609375, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0299, | |
| "reward": 0.42558533302508295, | |
| "reward_std": 0.6608752533793449, | |
| "rewards/cosine_scaled_reward": -0.15179067384451628, | |
| "rewards/format_reward": 0.7291666902601719, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 3057.2084045410156, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 0.41021808981895447, | |
| "kl": 0.56591796875, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0893, | |
| "reward": -0.06845853175036609, | |
| "reward_std": 0.5956653170287609, | |
| "rewards/cosine_scaled_reward": -0.24256260506808758, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 2016.6667098999023, | |
| "epoch": 0.56, | |
| "grad_norm": 1.8614702224731445, | |
| "kl": 0.3179168701171875, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": -0.0125, | |
| "reward": 0.6355590866878629, | |
| "reward_std": 0.6039589680731297, | |
| "rewards/cosine_scaled_reward": -0.09888713248074055, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 2568.8333740234375, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 0.7798163294792175, | |
| "kl": 0.378570556640625, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0627, | |
| "reward": 0.9301005639135838, | |
| "reward_std": 1.0789660066366196, | |
| "rewards/cosine_scaled_reward": 0.1108836093917489, | |
| "rewards/format_reward": 0.7083333544433117, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 2040.7500839233398, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 0.8784842491149902, | |
| "kl": 0.300018310546875, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": -0.0041, | |
| "reward": 1.00854002404958, | |
| "reward_std": 0.7161477841436863, | |
| "rewards/cosine_scaled_reward": 0.11885332595556974, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 2196.104232788086, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 0.9897964000701904, | |
| "kl": 0.3065032958984375, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0473, | |
| "reward": 1.1381681943312287, | |
| "reward_std": 0.8435879498720169, | |
| "rewards/cosine_scaled_reward": 0.15241740830242634, | |
| "rewards/format_reward": 0.8333333469927311, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 1807.5208740234375, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 1.8549937009811401, | |
| "kl": 0.2334747314453125, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0891, | |
| "reward": 0.9310838505625725, | |
| "reward_std": 0.9970204196870327, | |
| "rewards/cosine_scaled_reward": 0.059291912242770195, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 2438.5209045410156, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 1.1024754047393799, | |
| "kl": 0.4427490234375, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0862, | |
| "reward": 0.40903129265643656, | |
| "reward_std": 0.855750784277916, | |
| "rewards/cosine_scaled_reward": -0.08715102472342551, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 1854.2500686645508, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 1.7624000310897827, | |
| "kl": 0.282470703125, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": -0.0104, | |
| "reward": 0.7299946136772633, | |
| "reward_std": 0.6787788085639477, | |
| "rewards/cosine_scaled_reward": -0.020419366657733917, | |
| "rewards/format_reward": 0.7708333376795053, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 1980.7708587646484, | |
| "epoch": 0.568, | |
| "grad_norm": 1.077419638633728, | |
| "kl": 0.1779632568359375, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0434, | |
| "reward": 1.1050571464002132, | |
| "reward_std": 0.8892782032489777, | |
| "rewards/cosine_scaled_reward": 0.13586187362670898, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 2287.5834045410156, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 0.9310129284858704, | |
| "kl": 0.35736083984375, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0329, | |
| "reward": 0.47706713899970055, | |
| "reward_std": 0.7550015151500702, | |
| "rewards/cosine_scaled_reward": -0.12604976166039705, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 2113.479248046875, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 1.2932276725769043, | |
| "kl": 0.3569793701171875, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0673, | |
| "reward": 0.7009947504848242, | |
| "reward_std": 0.9477885626256466, | |
| "rewards/cosine_scaled_reward": -0.0036692821886390448, | |
| "rewards/format_reward": 0.7083333544433117, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 2337.5833740234375, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.5998778939247131, | |
| "kl": 0.4022216796875, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0495, | |
| "reward": 0.46617759205400944, | |
| "reward_std": 0.6925142146646976, | |
| "rewards/cosine_scaled_reward": -0.06899452651850879, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.04010716926510213, | |
| "train_runtime": 69296.9973, | |
| "train_samples_per_second": 0.346, | |
| "train_steps_per_second": 0.007 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |