Open-RS-DRGRPO / trainer_state.json
kangdawei's picture
Model save
ba4272f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5714285714285714,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 2571.2083587646484,
"epoch": 0.001142857142857143,
"grad_norm": 0.15054196119308472,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": 0.0802,
"reward": 0.4897647276520729,
"reward_std": 0.8290339335799217,
"rewards/cosine_scaled_reward": -0.015534311532974243,
"rewards/format_reward": 0.5208333488553762,
"step": 1
},
{
"completion_length": 2804.395881652832,
"epoch": 0.002285714285714286,
"grad_norm": 0.06441052258014679,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": 0.0258,
"reward": 0.27539755403995514,
"reward_std": 0.42092563211917877,
"rewards/cosine_scaled_reward": -0.04980122856795788,
"rewards/format_reward": 0.37500000558793545,
"step": 2
},
{
"completion_length": 3410.2083435058594,
"epoch": 0.0034285714285714284,
"grad_norm": 0.06412151455879211,
"kl": 3.890693187713623e-05,
"learning_rate": 6e-08,
"loss": 0.0187,
"reward": -0.3642676221206784,
"reward_std": 0.3840297982096672,
"rewards/cosine_scaled_reward": -0.23421715013682842,
"rewards/format_reward": 0.10416666977107525,
"step": 3
},
{
"completion_length": 2242.2916870117188,
"epoch": 0.004571428571428572,
"grad_norm": 0.163053959608078,
"kl": 4.735589027404785e-05,
"learning_rate": 8e-08,
"loss": 0.0732,
"reward": 0.5906303785741329,
"reward_std": 0.9497254565358162,
"rewards/cosine_scaled_reward": -0.027601496782153845,
"rewards/format_reward": 0.645833333954215,
"step": 4
},
{
"completion_length": 3097.9584045410156,
"epoch": 0.005714285714285714,
"grad_norm": 0.18420223891735077,
"kl": 4.886835813522339e-05,
"learning_rate": 1e-07,
"loss": 0.044,
"reward": 0.26460691541433334,
"reward_std": 0.82295261323452,
"rewards/cosine_scaled_reward": -0.08644656091928482,
"rewards/format_reward": 0.4375000074505806,
"step": 5
},
{
"completion_length": 3196.479232788086,
"epoch": 0.006857142857142857,
"grad_norm": 0.16443675756454468,
"kl": 4.808604717254639e-05,
"learning_rate": 1.2e-07,
"loss": 0.0233,
"reward": 0.023200208321213722,
"reward_std": 0.8427017889916897,
"rewards/cosine_scaled_reward": -0.16548323072493076,
"rewards/format_reward": 0.3541666753590107,
"step": 6
},
{
"completion_length": 2956.916748046875,
"epoch": 0.008,
"grad_norm": 0.12215663492679596,
"kl": 2.5171786546707153e-05,
"learning_rate": 1.4e-07,
"loss": 0.0395,
"reward": 0.3668882008641958,
"reward_std": 0.6829603910446167,
"rewards/cosine_scaled_reward": -0.09780590422451496,
"rewards/format_reward": 0.5625000093132257,
"step": 7
},
{
"completion_length": 2590.791702270508,
"epoch": 0.009142857142857144,
"grad_norm": 0.1336221843957901,
"kl": 1.7095357179641724e-05,
"learning_rate": 1.6e-07,
"loss": 0.0327,
"reward": 0.6794745922088623,
"reward_std": 0.7160088084638119,
"rewards/cosine_scaled_reward": 0.10015395213849843,
"rewards/format_reward": 0.47916667722165585,
"step": 8
},
{
"completion_length": 2873.104248046875,
"epoch": 0.010285714285714285,
"grad_norm": 0.23665077984333038,
"kl": 5.224347114562988e-05,
"learning_rate": 1.8e-07,
"loss": 0.1654,
"reward": 0.2612099088728428,
"reward_std": 0.9552066549658775,
"rewards/cosine_scaled_reward": -0.07772838324308395,
"rewards/format_reward": 0.41666667722165585,
"step": 9
},
{
"completion_length": 2782.5000534057617,
"epoch": 0.011428571428571429,
"grad_norm": 0.12416314333677292,
"kl": 3.0666589736938477e-05,
"learning_rate": 2e-07,
"loss": 0.0529,
"reward": 0.20608006697148085,
"reward_std": 0.8456610515713692,
"rewards/cosine_scaled_reward": -0.08445997314993292,
"rewards/format_reward": 0.37500000186264515,
"step": 10
},
{
"completion_length": 3373.1041870117188,
"epoch": 0.012571428571428572,
"grad_norm": 0.09285606443881989,
"kl": 3.6150217056274414e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0318,
"reward": -0.3522867253050208,
"reward_std": 0.49222037196159363,
"rewards/cosine_scaled_reward": -0.22822669660672545,
"rewards/format_reward": 0.10416666977107525,
"step": 11
},
{
"completion_length": 2749.6459350585938,
"epoch": 0.013714285714285714,
"grad_norm": 0.14122599363327026,
"kl": 4.419684410095215e-05,
"learning_rate": 2.4e-07,
"loss": 0.04,
"reward": 0.4197032814845443,
"reward_std": 0.8322520144283772,
"rewards/cosine_scaled_reward": -0.06098170578479767,
"rewards/format_reward": 0.5416666753590107,
"step": 12
},
{
"completion_length": 3078.3333587646484,
"epoch": 0.014857142857142857,
"grad_norm": 0.10787954181432724,
"kl": 3.547407686710358e-05,
"learning_rate": 2.6e-07,
"loss": 0.0205,
"reward": 0.20099145034328103,
"reward_std": 0.6062487177550793,
"rewards/cosine_scaled_reward": -0.1078376192599535,
"rewards/format_reward": 0.41666668467223644,
"step": 13
},
{
"completion_length": 2948.229232788086,
"epoch": 0.016,
"grad_norm": 0.15190070867538452,
"kl": 3.463774919509888e-05,
"learning_rate": 2.8e-07,
"loss": 0.0807,
"reward": 0.04745530150830746,
"reward_std": 0.8162283934652805,
"rewards/cosine_scaled_reward": -0.1429390194825828,
"rewards/format_reward": 0.33333333767950535,
"step": 14
},
{
"completion_length": 2853.750026702881,
"epoch": 0.017142857142857144,
"grad_norm": 0.08899455517530441,
"kl": 2.139061689376831e-05,
"learning_rate": 3e-07,
"loss": 0.0106,
"reward": 0.4579934204230085,
"reward_std": 0.6455020261928439,
"rewards/cosine_scaled_reward": 0.04149670549668372,
"rewards/format_reward": 0.37500000186264515,
"step": 15
},
{
"completion_length": 3580.6666870117188,
"epoch": 0.018285714285714287,
"grad_norm": 0.1100376769900322,
"kl": 3.6090612411499023e-05,
"learning_rate": 3.2e-07,
"loss": 0.0022,
"reward": -0.2719184570014477,
"reward_std": 0.6123667694628239,
"rewards/cosine_scaled_reward": -0.167209230363369,
"rewards/format_reward": 0.06250000186264515,
"step": 16
},
{
"completion_length": 2259.93754196167,
"epoch": 0.019428571428571427,
"grad_norm": 0.1220458522439003,
"kl": 4.4733285903930664e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": -0.0002,
"reward": 0.6442119255661964,
"reward_std": 0.7075937427580357,
"rewards/cosine_scaled_reward": 0.02002262556925416,
"rewards/format_reward": 0.6041666697710752,
"step": 17
},
{
"completion_length": 2983.4167098999023,
"epoch": 0.02057142857142857,
"grad_norm": 0.1742384135723114,
"kl": 2.3670494556427002e-05,
"learning_rate": 3.6e-07,
"loss": 0.049,
"reward": 0.05130944773554802,
"reward_std": 0.7180995307862759,
"rewards/cosine_scaled_reward": -0.14101194869726896,
"rewards/format_reward": 0.33333333767950535,
"step": 18
},
{
"completion_length": 2910.166717529297,
"epoch": 0.021714285714285714,
"grad_norm": 0.17305254936218262,
"kl": 2.6881694793701172e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0581,
"reward": 0.49342919746413827,
"reward_std": 0.8945518247783184,
"rewards/cosine_scaled_reward": 0.038381271064281464,
"rewards/format_reward": 0.41666667722165585,
"step": 19
},
{
"completion_length": 2444.6875534057617,
"epoch": 0.022857142857142857,
"grad_norm": 0.11907866597175598,
"kl": 1.6576610505580902e-05,
"learning_rate": 4e-07,
"loss": 0.0516,
"reward": 0.5244868360459805,
"reward_std": 0.6065918765962124,
"rewards/cosine_scaled_reward": -0.06067326734773815,
"rewards/format_reward": 0.6458333432674408,
"step": 20
},
{
"completion_length": 2736.583366394043,
"epoch": 0.024,
"grad_norm": 0.13774950802326202,
"kl": 3.9674341678619385e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0754,
"reward": 0.366255359724164,
"reward_std": 0.786641189828515,
"rewards/cosine_scaled_reward": -0.014788982225582004,
"rewards/format_reward": 0.39583333767950535,
"step": 21
},
{
"completion_length": 2147.479202270508,
"epoch": 0.025142857142857144,
"grad_norm": 0.1683036983013153,
"kl": 2.2970139980316162e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0518,
"reward": 0.5381300672888756,
"reward_std": 0.6436120271682739,
"rewards/cosine_scaled_reward": -0.04343497380614281,
"rewards/format_reward": 0.6250000093132257,
"step": 22
},
{
"completion_length": 2571.854232788086,
"epoch": 0.026285714285714287,
"grad_norm": 0.11163962632417679,
"kl": 3.069639205932617e-05,
"learning_rate": 4.6e-07,
"loss": 0.0962,
"reward": 0.03858701325953007,
"reward_std": 0.5964515460655093,
"rewards/cosine_scaled_reward": -0.1890398357063532,
"rewards/format_reward": 0.416666679084301,
"step": 23
},
{
"completion_length": 2990.0625915527344,
"epoch": 0.027428571428571427,
"grad_norm": 0.15436813235282898,
"kl": 2.1535903215408325e-05,
"learning_rate": 4.8e-07,
"loss": 0.0679,
"reward": 0.5357824601233006,
"reward_std": 0.9760596714913845,
"rewards/cosine_scaled_reward": 0.02830787282437086,
"rewards/format_reward": 0.47916668467223644,
"step": 24
},
{
"completion_length": 2695.937545776367,
"epoch": 0.02857142857142857,
"grad_norm": 0.09400183707475662,
"kl": 2.316199243068695e-05,
"learning_rate": 5e-07,
"loss": -0.0193,
"reward": 0.24818142130970955,
"reward_std": 0.7111284770071507,
"rewards/cosine_scaled_reward": -0.11549262329936028,
"rewards/format_reward": 0.47916666977107525,
"step": 25
},
{
"completion_length": 2949.6041870117188,
"epoch": 0.029714285714285714,
"grad_norm": 0.062154464423656464,
"kl": 2.1763145923614502e-05,
"learning_rate": 5.2e-07,
"loss": 0.0279,
"reward": 0.5066877827048302,
"reward_std": 0.5474561750888824,
"rewards/cosine_scaled_reward": 0.003343891352415085,
"rewards/format_reward": 0.5000000055879354,
"step": 26
},
{
"completion_length": 2975.3750610351562,
"epoch": 0.030857142857142857,
"grad_norm": 0.15270784497261047,
"kl": 1.735985279083252e-05,
"learning_rate": 5.4e-07,
"loss": 0.0572,
"reward": 0.3573550535365939,
"reward_std": 0.7755850367248058,
"rewards/cosine_scaled_reward": -0.060905810445547104,
"rewards/format_reward": 0.47916667722165585,
"step": 27
},
{
"completion_length": 2745.7083435058594,
"epoch": 0.032,
"grad_norm": 0.14373046159744263,
"kl": 2.94586643576622e-05,
"learning_rate": 5.6e-07,
"loss": 0.0143,
"reward": 0.6891938149929047,
"reward_std": 0.9697537384927273,
"rewards/cosine_scaled_reward": 0.06334691727533937,
"rewards/format_reward": 0.5625000055879354,
"step": 28
},
{
"completion_length": 3157.666732788086,
"epoch": 0.03314285714285714,
"grad_norm": 0.21617551147937775,
"kl": 2.562999725341797e-05,
"learning_rate": 5.8e-07,
"loss": 0.0941,
"reward": -0.19576303288340569,
"reward_std": 0.6964554395526648,
"rewards/cosine_scaled_reward": -0.222881518304348,
"rewards/format_reward": 0.25000000558793545,
"step": 29
},
{
"completion_length": 3055.3751220703125,
"epoch": 0.03428571428571429,
"grad_norm": 0.21643611788749695,
"kl": 2.6524066925048828e-05,
"learning_rate": 6e-07,
"loss": 0.1293,
"reward": 0.2991267549805343,
"reward_std": 1.1622500345110893,
"rewards/cosine_scaled_reward": -0.05876995751168579,
"rewards/format_reward": 0.41666667349636555,
"step": 30
},
{
"completion_length": 3113.7708587646484,
"epoch": 0.03542857142857143,
"grad_norm": 0.10899780690670013,
"kl": 1.920759677886963e-05,
"learning_rate": 6.2e-07,
"loss": 0.0359,
"reward": -0.09752624668180943,
"reward_std": 0.6536959744989872,
"rewards/cosine_scaled_reward": -0.18417980521917343,
"rewards/format_reward": 0.27083334513008595,
"step": 31
},
{
"completion_length": 3302.000030517578,
"epoch": 0.036571428571428574,
"grad_norm": 0.13032931089401245,
"kl": 2.7488917112350464e-05,
"learning_rate": 6.4e-07,
"loss": 0.0245,
"reward": 0.240218386054039,
"reward_std": 0.6280976049602032,
"rewards/cosine_scaled_reward": -0.036140820011496544,
"rewards/format_reward": 0.31250001303851604,
"step": 32
},
{
"completion_length": 3349.3333740234375,
"epoch": 0.037714285714285714,
"grad_norm": 0.10642867535352707,
"kl": 4.164688289165497e-05,
"learning_rate": 6.6e-07,
"loss": 0.0126,
"reward": 0.12095527164638042,
"reward_std": 0.7586401477456093,
"rewards/cosine_scaled_reward": -0.08535570465028286,
"rewards/format_reward": 0.2916666716337204,
"step": 33
},
{
"completion_length": 2513.687515258789,
"epoch": 0.038857142857142854,
"grad_norm": 0.32349422574043274,
"kl": 0.00012987852096557617,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0582,
"reward": 0.5865504257380962,
"reward_std": 0.9055213071405888,
"rewards/cosine_scaled_reward": 0.04327519703656435,
"rewards/format_reward": 0.5000000037252903,
"step": 34
},
{
"completion_length": 3105.625045776367,
"epoch": 0.04,
"grad_norm": 0.15311317145824432,
"kl": 9.425729513168335e-05,
"learning_rate": 7e-07,
"loss": 0.0384,
"reward": 0.030300017446279526,
"reward_std": 0.9541528224945068,
"rewards/cosine_scaled_reward": -0.13068332930561155,
"rewards/format_reward": 0.29166666977107525,
"step": 35
},
{
"completion_length": 3456.0208740234375,
"epoch": 0.04114285714285714,
"grad_norm": 0.06633453816175461,
"kl": 9.938329458236694e-05,
"learning_rate": 7.2e-07,
"loss": 0.032,
"reward": -0.4127392489463091,
"reward_std": 0.373018104583025,
"rewards/cosine_scaled_reward": -0.2480362793430686,
"rewards/format_reward": 0.0833333358168602,
"step": 36
},
{
"completion_length": 3322.0833740234375,
"epoch": 0.04228571428571429,
"grad_norm": 0.06093796342611313,
"kl": 5.175359547138214e-05,
"learning_rate": 7.4e-07,
"loss": 0.0138,
"reward": -0.3342415885999799,
"reward_std": 0.39543480053544044,
"rewards/cosine_scaled_reward": -0.2712874598801136,
"rewards/format_reward": 0.20833334140479565,
"step": 37
},
{
"completion_length": 3248.187530517578,
"epoch": 0.04342857142857143,
"grad_norm": 0.10319728404283524,
"kl": 9.037554264068604e-05,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0155,
"reward": -0.1099930020282045,
"reward_std": 0.6748690903186798,
"rewards/cosine_scaled_reward": -0.14874650537967682,
"rewards/format_reward": 0.18750000186264515,
"step": 38
},
{
"completion_length": 2824.7916870117188,
"epoch": 0.044571428571428574,
"grad_norm": 0.11912386864423752,
"kl": 9.545683860778809e-05,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0342,
"reward": 0.3973412849009037,
"reward_std": 0.5866867541335523,
"rewards/cosine_scaled_reward": -0.0409126803278923,
"rewards/format_reward": 0.479166679084301,
"step": 39
},
{
"completion_length": 2677.250015258789,
"epoch": 0.045714285714285714,
"grad_norm": 0.1012558713555336,
"kl": 0.00043101049959659576,
"learning_rate": 8e-07,
"loss": 0.0274,
"reward": 0.27511681243777275,
"reward_std": 0.5241224151104689,
"rewards/cosine_scaled_reward": -0.08119158074259758,
"rewards/format_reward": 0.43750000558793545,
"step": 40
},
{
"completion_length": 2805.875045776367,
"epoch": 0.046857142857142854,
"grad_norm": 0.15722188353538513,
"kl": 0.00016229506582021713,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0704,
"reward": 0.07507334044203162,
"reward_std": 0.706162091344595,
"rewards/cosine_scaled_reward": -0.19162999838590622,
"rewards/format_reward": 0.45833334140479565,
"step": 41
},
{
"completion_length": 2796.5000038146973,
"epoch": 0.048,
"grad_norm": 0.07421410083770752,
"kl": 7.737800478935242e-05,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0009,
"reward": -0.17676172463689,
"reward_std": 0.4381077494472265,
"rewards/cosine_scaled_reward": -0.2654642015695572,
"rewards/format_reward": 0.35416666977107525,
"step": 42
},
{
"completion_length": 3021.3958587646484,
"epoch": 0.04914285714285714,
"grad_norm": 0.15121984481811523,
"kl": 0.00015115737915039062,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0418,
"reward": 0.11950396373867989,
"reward_std": 0.7922810819000006,
"rewards/cosine_scaled_reward": -0.07566470839083195,
"rewards/format_reward": 0.27083333767950535,
"step": 43
},
{
"completion_length": 2888.979179382324,
"epoch": 0.05028571428571429,
"grad_norm": 0.1331688016653061,
"kl": 0.00041909515857696533,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0234,
"reward": 0.3374018808826804,
"reward_std": 0.7487543746829033,
"rewards/cosine_scaled_reward": -0.029215732589364052,
"rewards/format_reward": 0.3958333395421505,
"step": 44
},
{
"completion_length": 3375.562530517578,
"epoch": 0.05142857142857143,
"grad_norm": 0.1241455152630806,
"kl": 0.00021854229271411896,
"learning_rate": 9e-07,
"loss": 0.0205,
"reward": 0.1915082884952426,
"reward_std": 0.745365809649229,
"rewards/cosine_scaled_reward": -0.02924584597349167,
"rewards/format_reward": 0.2500000074505806,
"step": 45
},
{
"completion_length": 3235.4791717529297,
"epoch": 0.052571428571428575,
"grad_norm": 0.08959861844778061,
"kl": 0.0004153698682785034,
"learning_rate": 9.2e-07,
"loss": 0.0007,
"reward": -0.27040275279432535,
"reward_std": 0.5466338861733675,
"rewards/cosine_scaled_reward": -0.20811804989352822,
"rewards/format_reward": 0.14583333395421505,
"step": 46
},
{
"completion_length": 3032.6875915527344,
"epoch": 0.053714285714285714,
"grad_norm": 0.1654578596353531,
"kl": 0.0002987794578075409,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0742,
"reward": 0.2661336697638035,
"reward_std": 0.9970342293381691,
"rewards/cosine_scaled_reward": -0.06484984699636698,
"rewards/format_reward": 0.3958333432674408,
"step": 47
},
{
"completion_length": 2870.8541946411133,
"epoch": 0.054857142857142854,
"grad_norm": 0.08487487584352493,
"kl": 0.0008619073778390884,
"learning_rate": 9.6e-07,
"loss": 0.0052,
"reward": 0.021469716913998127,
"reward_std": 0.5047157257795334,
"rewards/cosine_scaled_reward": -0.14551514480262995,
"rewards/format_reward": 0.3125,
"step": 48
},
{
"completion_length": 2296.166717529297,
"epoch": 0.056,
"grad_norm": 0.16738910973072052,
"kl": 0.0007393211126327515,
"learning_rate": 9.8e-07,
"loss": 0.0039,
"reward": 0.7408803049474955,
"reward_std": 0.8585928715765476,
"rewards/cosine_scaled_reward": 0.047523480374366045,
"rewards/format_reward": 0.645833345130086,
"step": 49
},
{
"completion_length": 3003.729179382324,
"epoch": 0.05714285714285714,
"grad_norm": 0.12555623054504395,
"kl": 0.000688605010509491,
"learning_rate": 1e-06,
"loss": 0.0619,
"reward": 0.36810495844110847,
"reward_std": 0.6655636876821518,
"rewards/cosine_scaled_reward": 0.00696912594139576,
"rewards/format_reward": 0.3541666753590107,
"step": 50
},
{
"completion_length": 2403.270851135254,
"epoch": 0.05828571428571429,
"grad_norm": 0.096031554043293,
"kl": 0.0029038935899734497,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0161,
"reward": 0.41045723855495453,
"reward_std": 0.5240885466337204,
"rewards/cosine_scaled_reward": -0.04477138537913561,
"rewards/format_reward": 0.5000000055879354,
"step": 51
},
{
"completion_length": 3050.3542289733887,
"epoch": 0.05942857142857143,
"grad_norm": 0.13251036405563354,
"kl": 0.00112837553024292,
"learning_rate": 9.999561358041868e-07,
"loss": 0.0229,
"reward": 0.4021341912448406,
"reward_std": 0.8952255919575691,
"rewards/cosine_scaled_reward": 0.02398374956101179,
"rewards/format_reward": 0.3541666716337204,
"step": 52
},
{
"completion_length": 2802.5000610351562,
"epoch": 0.060571428571428575,
"grad_norm": 0.16859768331050873,
"kl": 0.012136131525039673,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0314,
"reward": 0.4030042998492718,
"reward_std": 0.9372089132666588,
"rewards/cosine_scaled_reward": -0.04849785100668669,
"rewards/format_reward": 0.5000000055879354,
"step": 53
},
{
"completion_length": 2765.104232788086,
"epoch": 0.061714285714285715,
"grad_norm": 0.12892229855060577,
"kl": 0.0006947778165340424,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0521,
"reward": 0.7572112157940865,
"reward_std": 0.8469797167927027,
"rewards/cosine_scaled_reward": 0.11818893579766154,
"rewards/format_reward": 0.5208333432674408,
"step": 54
},
{
"completion_length": 3193.9584045410156,
"epoch": 0.06285714285714286,
"grad_norm": 0.13522343337535858,
"kl": 0.002847835421562195,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0282,
"reward": 0.3383600414963439,
"reward_std": 0.9050974175333977,
"rewards/cosine_scaled_reward": 0.0025133611634373665,
"rewards/format_reward": 0.33333333767950535,
"step": 55
},
{
"completion_length": 2973.625030517578,
"epoch": 0.064,
"grad_norm": 0.10659506171941757,
"kl": 0.0008548498153686523,
"learning_rate": 9.996052735444862e-07,
"loss": 0.057,
"reward": 0.15444774832576513,
"reward_std": 0.6975248530507088,
"rewards/cosine_scaled_reward": -0.09985947422683239,
"rewards/format_reward": 0.35416666977107525,
"step": 56
},
{
"completion_length": 3409.3959045410156,
"epoch": 0.06514285714285714,
"grad_norm": 0.109820656478405,
"kl": 0.000378340482711792,
"learning_rate": 9.994627618036452e-07,
"loss": 0.0353,
"reward": -0.1370320685673505,
"reward_std": 0.6953500900417566,
"rewards/cosine_scaled_reward": -0.183099371381104,
"rewards/format_reward": 0.22916666977107525,
"step": 57
},
{
"completion_length": 2256.270866394043,
"epoch": 0.06628571428571428,
"grad_norm": 0.14203794300556183,
"kl": 0.011088848114013672,
"learning_rate": 9.992983438818915e-07,
"loss": 0.0796,
"reward": 0.6106004565954208,
"reward_std": 0.8679536152631044,
"rewards/cosine_scaled_reward": -0.007199776358902454,
"rewards/format_reward": 0.6250000093132257,
"step": 58
},
{
"completion_length": 3019.0625610351562,
"epoch": 0.06742857142857143,
"grad_norm": 0.17194640636444092,
"kl": 0.0011185333132743835,
"learning_rate": 9.991120277927223e-07,
"loss": 0.0533,
"reward": 0.3943455405533314,
"reward_std": 0.8865922130644321,
"rewards/cosine_scaled_reward": 0.00967277493327856,
"rewards/format_reward": 0.3750000037252903,
"step": 59
},
{
"completion_length": 2972.3333892822266,
"epoch": 0.06857142857142857,
"grad_norm": 0.14659450948238373,
"kl": 0.001021549105644226,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0618,
"reward": 0.3112136572599411,
"reward_std": 0.991888377815485,
"rewards/cosine_scaled_reward": -0.0527265154523775,
"rewards/format_reward": 0.4166666753590107,
"step": 60
},
{
"completion_length": 3116.5208740234375,
"epoch": 0.06971428571428571,
"grad_norm": 0.14671690762043,
"kl": 0.001110263168811798,
"learning_rate": 9.98673738502114e-07,
"loss": 0.0815,
"reward": 0.27738364040851593,
"reward_std": 0.800424050539732,
"rewards/cosine_scaled_reward": -0.06964152306318283,
"rewards/format_reward": 0.4166666753590107,
"step": 61
},
{
"completion_length": 2748.4583740234375,
"epoch": 0.07085714285714285,
"grad_norm": 0.12405635416507721,
"kl": 0.004500046372413635,
"learning_rate": 9.98421786662277e-07,
"loss": 0.0233,
"reward": 0.5339981620199978,
"reward_std": 0.8481226451694965,
"rewards/cosine_scaled_reward": 0.016999082639813423,
"rewards/format_reward": 0.5000000055879354,
"step": 62
},
{
"completion_length": 2458.0625610351562,
"epoch": 0.072,
"grad_norm": 0.2264307290315628,
"kl": 0.0018923580646514893,
"learning_rate": 9.981479793771866e-07,
"loss": 0.1231,
"reward": 0.7813117974437773,
"reward_std": 1.1678522191941738,
"rewards/cosine_scaled_reward": 0.06773922173306346,
"rewards/format_reward": 0.645833345130086,
"step": 63
},
{
"completion_length": 2938.3541870117188,
"epoch": 0.07314285714285715,
"grad_norm": 0.18781885504722595,
"kl": 0.0028982162475585938,
"learning_rate": 9.97852329991824e-07,
"loss": 0.0713,
"reward": 0.4028865471482277,
"reward_std": 1.0138535592705011,
"rewards/cosine_scaled_reward": -0.006890069227665663,
"rewards/format_reward": 0.41666668094694614,
"step": 64
},
{
"completion_length": 2938.270854949951,
"epoch": 0.07428571428571429,
"grad_norm": 0.11171157658100128,
"kl": 0.0012230873107910156,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0103,
"reward": 0.3659288566559553,
"reward_std": 0.6721893213689327,
"rewards/cosine_scaled_reward": -0.02536890748888254,
"rewards/format_reward": 0.4166666679084301,
"step": 65
},
{
"completion_length": 2261.8958473205566,
"epoch": 0.07542857142857143,
"grad_norm": 0.07533268630504608,
"kl": 0.004939556121826172,
"learning_rate": 9.971955636222684e-07,
"loss": 0.0008,
"reward": 0.44019365310668945,
"reward_std": 0.5285264812409878,
"rewards/cosine_scaled_reward": -0.01948651857674122,
"rewards/format_reward": 0.4791666716337204,
"step": 66
},
{
"completion_length": 3491.125030517578,
"epoch": 0.07657142857142857,
"grad_norm": 0.08121506124734879,
"kl": 0.001986861228942871,
"learning_rate": 9.968344786479415e-07,
"loss": 0.0262,
"reward": -0.5231649484485388,
"reward_std": 0.4413683470338583,
"rewards/cosine_scaled_reward": -0.3032491412013769,
"rewards/format_reward": 0.0833333358168602,
"step": 67
},
{
"completion_length": 2320.104202270508,
"epoch": 0.07771428571428571,
"grad_norm": 0.13665203750133514,
"kl": 0.009753227233886719,
"learning_rate": 9.964516155915151e-07,
"loss": 0.103,
"reward": 0.41882045567035675,
"reward_std": 0.8789598569273949,
"rewards/cosine_scaled_reward": -0.0614231089130044,
"rewards/format_reward": 0.5416666697710752,
"step": 68
},
{
"completion_length": 2683.520866394043,
"epoch": 0.07885714285714286,
"grad_norm": 0.12382709234952927,
"kl": 0.005780220031738281,
"learning_rate": 9.960469931131936e-07,
"loss": 0.029,
"reward": 0.05852051172405481,
"reward_std": 0.7762247212231159,
"rewards/cosine_scaled_reward": -0.17907308926805854,
"rewards/format_reward": 0.4166666679084301,
"step": 69
},
{
"completion_length": 3093.500030517578,
"epoch": 0.08,
"grad_norm": 0.0833624005317688,
"kl": 0.002095341682434082,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0049,
"reward": 0.1458067416679114,
"reward_std": 0.6252293065190315,
"rewards/cosine_scaled_reward": -0.11459663603454828,
"rewards/format_reward": 0.37500000186264515,
"step": 70
},
{
"completion_length": 2787.000030517578,
"epoch": 0.08114285714285714,
"grad_norm": 0.10387802124023438,
"kl": 0.00287473201751709,
"learning_rate": 9.951725498333448e-07,
"loss": 0.0334,
"reward": 0.3234255127608776,
"reward_std": 0.6192433759570122,
"rewards/cosine_scaled_reward": -0.00495389848947525,
"rewards/format_reward": 0.33333333395421505,
"step": 71
},
{
"completion_length": 3198.1458892822266,
"epoch": 0.08228571428571428,
"grad_norm": 0.16788320243358612,
"kl": 0.0061588287353515625,
"learning_rate": 9.947027716509488e-07,
"loss": 0.0888,
"reward": -0.16125928983092308,
"reward_std": 0.6868584919720888,
"rewards/cosine_scaled_reward": -0.18479630933143198,
"rewards/format_reward": 0.20833334140479565,
"step": 72
},
{
"completion_length": 3534.8125,
"epoch": 0.08342857142857144,
"grad_norm": 0.1018107533454895,
"kl": 0.0011434555053710938,
"learning_rate": 9.942113192828444e-07,
"loss": 0.0053,
"reward": -0.12906201742589474,
"reward_std": 0.6659317016601562,
"rewards/cosine_scaled_reward": -0.1478643520968035,
"rewards/format_reward": 0.16666666977107525,
"step": 73
},
{
"completion_length": 3263.2708435058594,
"epoch": 0.08457142857142858,
"grad_norm": 0.08958863466978073,
"kl": 0.0028752684593200684,
"learning_rate": 9.93698216681727e-07,
"loss": 0.0343,
"reward": -0.09894978068768978,
"reward_std": 0.5172175895422697,
"rewards/cosine_scaled_reward": -0.15364155592396855,
"rewards/format_reward": 0.2083333358168602,
"step": 74
},
{
"completion_length": 3089.437530517578,
"epoch": 0.08571428571428572,
"grad_norm": 0.08952262997627258,
"kl": 0.004845738410949707,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0376,
"reward": 0.3134958976879716,
"reward_std": 0.6024076864123344,
"rewards/cosine_scaled_reward": -0.009918726980686188,
"rewards/format_reward": 0.33333334140479565,
"step": 75
},
{
"completion_length": 2827.937545776367,
"epoch": 0.08685714285714285,
"grad_norm": 0.14939860999584198,
"kl": 0.0015323758125305176,
"learning_rate": 9.926071618660237e-07,
"loss": 0.0686,
"reward": 0.09713686350733042,
"reward_std": 0.6034146882593632,
"rewards/cosine_scaled_reward": -0.18059824593365192,
"rewards/format_reward": 0.4583333432674408,
"step": 76
},
{
"completion_length": 3079.0208892822266,
"epoch": 0.088,
"grad_norm": 0.06953620910644531,
"kl": 0.00173109769821167,
"learning_rate": 9.9202926282791e-07,
"loss": -0.0009,
"reward": 0.11226777359843254,
"reward_std": 0.44494798220694065,
"rewards/cosine_scaled_reward": -0.12094944715499878,
"rewards/format_reward": 0.35416667722165585,
"step": 77
},
{
"completion_length": 3188.3958892822266,
"epoch": 0.08914285714285715,
"grad_norm": 0.1326863169670105,
"kl": 0.0019420385360717773,
"learning_rate": 9.91429819907136e-07,
"loss": 0.0581,
"reward": 0.1064312756061554,
"reward_std": 0.6707769315689802,
"rewards/cosine_scaled_reward": -0.0822010301053524,
"rewards/format_reward": 0.2708333432674408,
"step": 78
},
{
"completion_length": 2369.7291831970215,
"epoch": 0.09028571428571429,
"grad_norm": 0.0872310996055603,
"kl": 0.0035691261291503906,
"learning_rate": 9.908088623197048e-07,
"loss": 0.0099,
"reward": 0.20641303062438965,
"reward_std": 0.6631737053394318,
"rewards/cosine_scaled_reward": -0.15721015818417072,
"rewards/format_reward": 0.5208333395421505,
"step": 79
},
{
"completion_length": 3369.7708435058594,
"epoch": 0.09142857142857143,
"grad_norm": 0.14515474438667297,
"kl": 0.0033063888549804688,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0147,
"reward": -0.007383164018392563,
"reward_std": 0.648245201446116,
"rewards/cosine_scaled_reward": -0.12869158759713173,
"rewards/format_reward": 0.2500000037252903,
"step": 80
},
{
"completion_length": 3120.166679382324,
"epoch": 0.09257142857142857,
"grad_norm": 0.12892520427703857,
"kl": 0.0076847076416015625,
"learning_rate": 9.895025252503755e-07,
"loss": 0.0038,
"reward": -0.025072289630770683,
"reward_std": 0.6095977611839771,
"rewards/cosine_scaled_reward": -0.15836948156356812,
"rewards/format_reward": 0.29166667349636555,
"step": 81
},
{
"completion_length": 2897.0208740234375,
"epoch": 0.09371428571428571,
"grad_norm": 0.22404946386814117,
"kl": 0.0032821297645568848,
"learning_rate": 9.888172094375033e-07,
"loss": 0.0577,
"reward": 0.3680579289793968,
"reward_std": 0.6173720192164183,
"rewards/cosine_scaled_reward": -0.013887699693441391,
"rewards/format_reward": 0.39583333395421505,
"step": 82
},
{
"completion_length": 2878.812515258789,
"epoch": 0.09485714285714286,
"grad_norm": 0.07131865620613098,
"kl": 0.0026471614837646484,
"learning_rate": 9.881105062929221e-07,
"loss": -0.0005,
"reward": 0.055370867252349854,
"reward_std": 0.513982892036438,
"rewards/cosine_scaled_reward": -0.11814789846539497,
"rewards/format_reward": 0.2916666679084301,
"step": 83
},
{
"completion_length": 3116.8958740234375,
"epoch": 0.096,
"grad_norm": 0.11975695192813873,
"kl": 0.0038411617279052734,
"learning_rate": 9.873824502603459e-07,
"loss": 0.0425,
"reward": 0.42375171184539795,
"reward_std": 0.7906983122229576,
"rewards/cosine_scaled_reward": 0.02437583915889263,
"rewards/format_reward": 0.37500000558793545,
"step": 84
},
{
"completion_length": 3274.5208435058594,
"epoch": 0.09714285714285714,
"grad_norm": 0.15465512871742249,
"kl": 0.0028481483459472656,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0083,
"reward": 0.08125680079683661,
"reward_std": 0.9014002867043018,
"rewards/cosine_scaled_reward": -0.13645494263619184,
"rewards/format_reward": 0.3541666753590107,
"step": 85
},
{
"completion_length": 2986.687530517578,
"epoch": 0.09828571428571428,
"grad_norm": 0.10517808794975281,
"kl": 0.004029273986816406,
"learning_rate": 9.85862422507884e-07,
"loss": 0.0243,
"reward": 0.2641464173793793,
"reward_std": 0.558088131248951,
"rewards/cosine_scaled_reward": -0.05542680807411671,
"rewards/format_reward": 0.3750000074505806,
"step": 86
},
{
"completion_length": 3038.0833740234375,
"epoch": 0.09942857142857142,
"grad_norm": 0.14221909642219543,
"kl": 0.00909280776977539,
"learning_rate": 9.850705248720068e-07,
"loss": 0.0763,
"reward": 0.03238655626773834,
"reward_std": 0.7338373847305775,
"rewards/cosine_scaled_reward": -0.16089007258415222,
"rewards/format_reward": 0.3541666716337204,
"step": 87
},
{
"completion_length": 3097.604217529297,
"epoch": 0.10057142857142858,
"grad_norm": 0.18620258569717407,
"kl": 0.010651111602783203,
"learning_rate": 9.8425742251254e-07,
"loss": 0.094,
"reward": 0.319759342353791,
"reward_std": 0.9601002521812916,
"rewards/cosine_scaled_reward": -0.027620327193289995,
"rewards/format_reward": 0.37500000558793545,
"step": 88
},
{
"completion_length": 3288.8125610351562,
"epoch": 0.10171428571428572,
"grad_norm": 0.15396399796009064,
"kl": 0.004711151123046875,
"learning_rate": 9.83423155058946e-07,
"loss": 0.0457,
"reward": 0.19393361918628216,
"reward_std": 0.8315184041857719,
"rewards/cosine_scaled_reward": -0.048866515047848225,
"rewards/format_reward": 0.2916666753590107,
"step": 89
},
{
"completion_length": 2598.083351135254,
"epoch": 0.10285714285714286,
"grad_norm": 0.0603555403649807,
"kl": 0.010183334350585938,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0085,
"reward": -0.08365453220903873,
"reward_std": 0.4098529051989317,
"rewards/cosine_scaled_reward": -0.27099394612014294,
"rewards/format_reward": 0.45833333395421505,
"step": 90
},
{
"completion_length": 3258.8541870117188,
"epoch": 0.104,
"grad_norm": 0.18056602776050568,
"kl": 0.005778074264526367,
"learning_rate": 9.816912885430258e-07,
"loss": 0.0472,
"reward": 0.1628702199086547,
"reward_std": 0.9603168293833733,
"rewards/cosine_scaled_reward": -0.07481488771736622,
"rewards/format_reward": 0.31250001303851604,
"step": 91
},
{
"completion_length": 3047.166748046875,
"epoch": 0.10514285714285715,
"grad_norm": 0.12348229438066483,
"kl": 0.008596420288085938,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0333,
"reward": 0.3812000777106732,
"reward_std": 0.7860049642622471,
"rewards/cosine_scaled_reward": -0.028149965219199657,
"rewards/format_reward": 0.4375000149011612,
"step": 92
},
{
"completion_length": 3517.0416870117188,
"epoch": 0.10628571428571429,
"grad_norm": 0.08583661168813705,
"kl": 0.0059413909912109375,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0176,
"reward": -0.5209241807460785,
"reward_std": 0.38511228561401367,
"rewards/cosine_scaled_reward": -0.281295420601964,
"rewards/format_reward": 0.0416666679084301,
"step": 93
},
{
"completion_length": 3197.7291870117188,
"epoch": 0.10742857142857143,
"grad_norm": 0.10561229288578033,
"kl": 0.010333061218261719,
"learning_rate": 9.78935800506826e-07,
"loss": 0.039,
"reward": 0.015513140708208084,
"reward_std": 0.7152232564985752,
"rewards/cosine_scaled_reward": -0.11724343802779913,
"rewards/format_reward": 0.25000000186264515,
"step": 94
},
{
"completion_length": 3515.3333435058594,
"epoch": 0.10857142857142857,
"grad_norm": 0.10697196424007416,
"kl": 0.002697467803955078,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0037,
"reward": -0.2317026201635599,
"reward_std": 0.646982979029417,
"rewards/cosine_scaled_reward": -0.1783513177651912,
"rewards/format_reward": 0.1250000037252903,
"step": 95
},
{
"completion_length": 3197.041732788086,
"epoch": 0.10971428571428571,
"grad_norm": 0.11685776710510254,
"kl": 0.008313179016113281,
"learning_rate": 9.769942052400235e-07,
"loss": 0.0622,
"reward": 0.04121094010770321,
"reward_std": 0.7906505540013313,
"rewards/cosine_scaled_reward": -0.10439453413709998,
"rewards/format_reward": 0.25000000931322575,
"step": 96
},
{
"completion_length": 3248.1666870117188,
"epoch": 0.11085714285714286,
"grad_norm": 0.10923011600971222,
"kl": 0.004992485046386719,
"learning_rate": 9.759921670520634e-07,
"loss": 0.0214,
"reward": 0.23345047235488892,
"reward_std": 0.6134117320179939,
"rewards/cosine_scaled_reward": -0.04994143359363079,
"rewards/format_reward": 0.3333333432674408,
"step": 97
},
{
"completion_length": 3227.6458740234375,
"epoch": 0.112,
"grad_norm": 0.09419604390859604,
"kl": 0.003627777099609375,
"learning_rate": 9.749693666068663e-07,
"loss": 0.0441,
"reward": 0.07032760046422482,
"reward_std": 0.5467477329075336,
"rewards/cosine_scaled_reward": -0.12108622305095196,
"rewards/format_reward": 0.3125000111758709,
"step": 98
},
{
"completion_length": 2894.3541717529297,
"epoch": 0.11314285714285714,
"grad_norm": 0.09645549207925797,
"kl": 0.00807952880859375,
"learning_rate": 9.739258537542835e-07,
"loss": 0.0198,
"reward": 0.19703226536512375,
"reward_std": 0.5642017982900143,
"rewards/cosine_scaled_reward": -0.06815055944025517,
"rewards/format_reward": 0.3333333432674408,
"step": 99
},
{
"completion_length": 2980.4583740234375,
"epoch": 0.11428571428571428,
"grad_norm": 0.10959184169769287,
"kl": 0.009540557861328125,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0112,
"reward": 0.3093057796359062,
"reward_std": 0.7769787572324276,
"rewards/cosine_scaled_reward": -0.03284713625907898,
"rewards/format_reward": 0.37500000186264515,
"step": 100
},
{
"completion_length": 2883.354217529297,
"epoch": 0.11542857142857142,
"grad_norm": 0.12877003848552704,
"kl": 0.005811214447021484,
"learning_rate": 9.717768952713511e-07,
"loss": 0.0381,
"reward": 0.14440507721155882,
"reward_std": 0.5162308318540454,
"rewards/cosine_scaled_reward": -0.09446411859244108,
"rewards/format_reward": 0.33333334140479565,
"step": 101
},
{
"completion_length": 3023.1875610351562,
"epoch": 0.11657142857142858,
"grad_norm": 0.1877162903547287,
"kl": 0.011671066284179688,
"learning_rate": 9.706715543782064e-07,
"loss": 0.0889,
"reward": 0.22887461259961128,
"reward_std": 0.8127965480089188,
"rewards/cosine_scaled_reward": -0.11472935602068901,
"rewards/format_reward": 0.45833334885537624,
"step": 102
},
{
"completion_length": 3156.812545776367,
"epoch": 0.11771428571428572,
"grad_norm": 0.15089279413223267,
"kl": 0.010381698608398438,
"learning_rate": 9.695457105469804e-07,
"loss": 0.0498,
"reward": -0.00840279646217823,
"reward_std": 0.8644813783466816,
"rewards/cosine_scaled_reward": -0.16045140800997615,
"rewards/format_reward": 0.31250000931322575,
"step": 103
},
{
"completion_length": 2804.000015258789,
"epoch": 0.11885714285714286,
"grad_norm": 0.1974973976612091,
"kl": 0.006961822509765625,
"learning_rate": 9.683994186497132e-07,
"loss": 0.0379,
"reward": 0.023546243086457253,
"reward_std": 0.5169991590082645,
"rewards/cosine_scaled_reward": -0.17572688311338425,
"rewards/format_reward": 0.3750000037252903,
"step": 104
},
{
"completion_length": 3190.7500610351562,
"epoch": 0.12,
"grad_norm": 0.2654437720775604,
"kl": 0.006871223449707031,
"learning_rate": 9.672327345550543e-07,
"loss": 0.1131,
"reward": 0.33211813122034073,
"reward_std": 1.1573103182017803,
"rewards/cosine_scaled_reward": 0.009809067007154226,
"rewards/format_reward": 0.3125000074505806,
"step": 105
},
{
"completion_length": 2616.2708435058594,
"epoch": 0.12114285714285715,
"grad_norm": 0.13210490345954895,
"kl": 0.006643772125244141,
"learning_rate": 9.66045715125541e-07,
"loss": 0.0729,
"reward": 1.0535442419350147,
"reward_std": 0.7898675352334976,
"rewards/cosine_scaled_reward": 0.21427209861576557,
"rewards/format_reward": 0.6250000055879354,
"step": 106
},
{
"completion_length": 2912.875030517578,
"epoch": 0.12228571428571429,
"grad_norm": 0.12955093383789062,
"kl": 0.0073909759521484375,
"learning_rate": 9.648384182148252e-07,
"loss": 0.0618,
"reward": 0.21932815946638584,
"reward_std": 0.6689837593585253,
"rewards/cosine_scaled_reward": -0.10908591747283936,
"rewards/format_reward": 0.4375000111758709,
"step": 107
},
{
"completion_length": 3047.9583740234375,
"epoch": 0.12342857142857143,
"grad_norm": 14.177762985229492,
"kl": 0.9636068344116211,
"learning_rate": 9.636109026648554e-07,
"loss": 0.1109,
"reward": 0.3542330916970968,
"reward_std": 0.7520873434841633,
"rewards/cosine_scaled_reward": -0.010383456945419312,
"rewards/format_reward": 0.37500000558793545,
"step": 108
},
{
"completion_length": 3088.916717529297,
"epoch": 0.12457142857142857,
"grad_norm": 0.10591775923967361,
"kl": 0.0058727264404296875,
"learning_rate": 9.623632283030077e-07,
"loss": 0.0157,
"reward": 0.28438636660575867,
"reward_std": 0.5916559211909771,
"rewards/cosine_scaled_reward": -0.07655682414770126,
"rewards/format_reward": 0.4375000037252903,
"step": 109
},
{
"completion_length": 3114.958366394043,
"epoch": 0.12571428571428572,
"grad_norm": 0.13596315681934357,
"kl": 0.008614540100097656,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0252,
"reward": 0.21716876979917288,
"reward_std": 0.8383330777287483,
"rewards/cosine_scaled_reward": -0.08933228440582752,
"rewards/format_reward": 0.39583333767950535,
"step": 110
},
{
"completion_length": 3459.812530517578,
"epoch": 0.12685714285714286,
"grad_norm": 0.13085001707077026,
"kl": 0.009960174560546875,
"learning_rate": 9.598076473627796e-07,
"loss": 0.0291,
"reward": 0.025424662977457047,
"reward_std": 0.7261426411569118,
"rewards/cosine_scaled_reward": -0.0706210074131377,
"rewards/format_reward": 0.16666667349636555,
"step": 111
},
{
"completion_length": 3242.541717529297,
"epoch": 0.128,
"grad_norm": 0.13919313251972198,
"kl": 0.00591278076171875,
"learning_rate": 9.58499865339809e-07,
"loss": 0.0053,
"reward": 0.6084912680089474,
"reward_std": 0.7924976646900177,
"rewards/cosine_scaled_reward": 0.08549563866108656,
"rewards/format_reward": 0.43750000558793545,
"step": 112
},
{
"completion_length": 2981.916748046875,
"epoch": 0.12914285714285714,
"grad_norm": 0.24504096806049347,
"kl": 0.01073455810546875,
"learning_rate": 9.571721736097088e-07,
"loss": 0.0422,
"reward": 0.21685536485165358,
"reward_std": 0.8932247292250395,
"rewards/cosine_scaled_reward": -0.13115566316992044,
"rewards/format_reward": 0.4791666753590107,
"step": 113
},
{
"completion_length": 2645.479202270508,
"epoch": 0.13028571428571428,
"grad_norm": 0.07428579032421112,
"kl": 0.007167816162109375,
"learning_rate": 9.55824636882301e-07,
"loss": 0.0088,
"reward": 0.17610520124435425,
"reward_std": 0.4693563599139452,
"rewards/cosine_scaled_reward": -0.20361408591270447,
"rewards/format_reward": 0.583333333954215,
"step": 114
},
{
"completion_length": 2916.500030517578,
"epoch": 0.13142857142857142,
"grad_norm": 0.11420946568250656,
"kl": 0.0070934295654296875,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0296,
"reward": 0.20668567204847932,
"reward_std": 0.7353272885084152,
"rewards/cosine_scaled_reward": -0.06332382163964212,
"rewards/format_reward": 0.33333334140479565,
"step": 115
},
{
"completion_length": 3320.6041870117188,
"epoch": 0.13257142857142856,
"grad_norm": 0.10533823817968369,
"kl": 0.00713348388671875,
"learning_rate": 9.530702921077358e-07,
"loss": -0.0252,
"reward": -0.1739243706688285,
"reward_std": 0.6250900998711586,
"rewards/cosine_scaled_reward": -0.18071219464764,
"rewards/format_reward": 0.18750000186264515,
"step": 116
},
{
"completion_length": 3231.625030517578,
"epoch": 0.1337142857142857,
"grad_norm": 0.12218490988016129,
"kl": 0.010065078735351562,
"learning_rate": 9.516636183034564e-07,
"loss": 0.003,
"reward": 0.0012212982401251793,
"reward_std": 0.693468015640974,
"rewards/cosine_scaled_reward": -0.1452226904220879,
"rewards/format_reward": 0.29166666977107525,
"step": 117
},
{
"completion_length": 3066.125015258789,
"epoch": 0.13485714285714287,
"grad_norm": 0.1486603021621704,
"kl": 0.0053539276123046875,
"learning_rate": 9.502373679810839e-07,
"loss": 0.0267,
"reward": 0.2887073950842023,
"reward_std": 0.8192379102110863,
"rewards/cosine_scaled_reward": -0.03272963920608163,
"rewards/format_reward": 0.35416667349636555,
"step": 118
},
{
"completion_length": 2546.9375610351562,
"epoch": 0.136,
"grad_norm": 0.4716709852218628,
"kl": 0.21265125274658203,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0025,
"reward": 0.5194442104548216,
"reward_std": 0.623758127912879,
"rewards/cosine_scaled_reward": -0.03194458410143852,
"rewards/format_reward": 0.5833333414047956,
"step": 119
},
{
"completion_length": 2618.1458740234375,
"epoch": 0.13714285714285715,
"grad_norm": 0.1533748060464859,
"kl": 0.010034561157226562,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0074,
"reward": 0.47723614796996117,
"reward_std": 0.7403046824038029,
"rewards/cosine_scaled_reward": -0.011381933465600014,
"rewards/format_reward": 0.5000000055879354,
"step": 120
},
{
"completion_length": 2108.979232788086,
"epoch": 0.1382857142857143,
"grad_norm": 0.20188815891742706,
"kl": 0.015041351318359375,
"learning_rate": 9.458418577899774e-07,
"loss": 0.0589,
"reward": 0.5848385840654373,
"reward_std": 0.7289820089936256,
"rewards/cosine_scaled_reward": -0.030497390776872635,
"rewards/format_reward": 0.6458333376795053,
"step": 121
},
{
"completion_length": 2961.6459197998047,
"epoch": 0.13942857142857143,
"grad_norm": 0.1945829838514328,
"kl": 0.008840560913085938,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0574,
"reward": 0.4458494456484914,
"reward_std": 0.867495059967041,
"rewards/cosine_scaled_reward": 0.01459137536585331,
"rewards/format_reward": 0.4166666753590107,
"step": 122
},
{
"completion_length": 2952.000030517578,
"epoch": 0.14057142857142857,
"grad_norm": 0.12263938039541245,
"kl": 0.0066699981689453125,
"learning_rate": 9.428149347714143e-07,
"loss": 0.0451,
"reward": 0.305552801117301,
"reward_std": 0.7307926155626774,
"rewards/cosine_scaled_reward": -0.07639027573168278,
"rewards/format_reward": 0.4583333469927311,
"step": 123
},
{
"completion_length": 2579.687545776367,
"epoch": 0.1417142857142857,
"grad_norm": 0.15065783262252808,
"kl": 0.010354995727539062,
"learning_rate": 9.412727182773486e-07,
"loss": 0.0237,
"reward": 0.6200504712760448,
"reward_std": 0.9035943485796452,
"rewards/cosine_scaled_reward": 0.00794189516454935,
"rewards/format_reward": 0.6041666734963655,
"step": 124
},
{
"completion_length": 2819.000030517578,
"epoch": 0.14285714285714285,
"grad_norm": 0.10753726214170456,
"kl": 0.0061092376708984375,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0258,
"reward": 0.5180913750082254,
"reward_std": 0.7182744760066271,
"rewards/cosine_scaled_reward": 0.081962333410047,
"rewards/format_reward": 0.35416666977107525,
"step": 125
},
{
"completion_length": 2857.291717529297,
"epoch": 0.144,
"grad_norm": 0.19327324628829956,
"kl": 0.0054225921630859375,
"learning_rate": 9.381311511432658e-07,
"loss": 0.0597,
"reward": 0.4767994333524257,
"reward_std": 1.0140015110373497,
"rewards/cosine_scaled_reward": -0.0011836281046271324,
"rewards/format_reward": 0.47916667349636555,
"step": 126
},
{
"completion_length": 3123.0208435058594,
"epoch": 0.14514285714285713,
"grad_norm": 0.14542251825332642,
"kl": 0.008556365966796875,
"learning_rate": 9.36531953618799e-07,
"loss": 0.0721,
"reward": -0.07007080456241965,
"reward_std": 0.7998057566583157,
"rewards/cosine_scaled_reward": -0.201702069491148,
"rewards/format_reward": 0.33333334513008595,
"step": 127
},
{
"completion_length": 2893.812530517578,
"epoch": 0.1462857142857143,
"grad_norm": 0.12933015823364258,
"kl": 0.00739288330078125,
"learning_rate": 9.34913917072228e-07,
"loss": 0.0413,
"reward": 0.6880392283201218,
"reward_std": 0.7831969410181046,
"rewards/cosine_scaled_reward": 0.11485293135046959,
"rewards/format_reward": 0.4583333432674408,
"step": 128
},
{
"completion_length": 3500.562530517578,
"epoch": 0.14742857142857144,
"grad_norm": 0.1216391995549202,
"kl": 0.009105682373046875,
"learning_rate": 9.332771203643714e-07,
"loss": -0.0089,
"reward": -0.261786799877882,
"reward_std": 0.6217585429549217,
"rewards/cosine_scaled_reward": -0.20381007622927427,
"rewards/format_reward": 0.14583333767950535,
"step": 129
},
{
"completion_length": 3168.0208740234375,
"epoch": 0.14857142857142858,
"grad_norm": 0.09015782177448273,
"kl": 0.008625030517578125,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0264,
"reward": -0.012573342770338058,
"reward_std": 0.45624612644314766,
"rewards/cosine_scaled_reward": -0.10003667138516903,
"rewards/format_reward": 0.18750000186264515,
"step": 130
},
{
"completion_length": 2846.687530517578,
"epoch": 0.14971428571428572,
"grad_norm": 0.14023305475711823,
"kl": 0.00933074951171875,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0644,
"reward": 0.45178989693522453,
"reward_std": 0.7442049328237772,
"rewards/cosine_scaled_reward": 0.027978284284472466,
"rewards/format_reward": 0.39583334140479565,
"step": 131
},
{
"completion_length": 2890.3333587646484,
"epoch": 0.15085714285714286,
"grad_norm": 0.13710667192935944,
"kl": 0.0076751708984375,
"learning_rate": 9.282549715730579e-07,
"loss": 0.016,
"reward": 0.3612702414393425,
"reward_std": 0.8515727780759335,
"rewards/cosine_scaled_reward": -0.02769822347909212,
"rewards/format_reward": 0.41666667349636555,
"step": 132
},
{
"completion_length": 3069.729202270508,
"epoch": 0.152,
"grad_norm": 0.10156344622373581,
"kl": 0.009157180786132812,
"learning_rate": 9.265439410565328e-07,
"loss": 0.0382,
"reward": -0.0937919020652771,
"reward_std": 0.5027989856898785,
"rewards/cosine_scaled_reward": -0.20314595522359014,
"rewards/format_reward": 0.3125000074505806,
"step": 133
},
{
"completion_length": 2549.937530517578,
"epoch": 0.15314285714285714,
"grad_norm": 0.077970951795578,
"kl": 0.011308670043945312,
"learning_rate": 9.248145583195447e-07,
"loss": 0.0088,
"reward": 0.2696135453879833,
"reward_std": 0.43714287504553795,
"rewards/cosine_scaled_reward": -0.10477657988667488,
"rewards/format_reward": 0.47916666977107525,
"step": 134
},
{
"completion_length": 1877.0833625793457,
"epoch": 0.15428571428571428,
"grad_norm": 0.10776728391647339,
"kl": 0.007760047912597656,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0044,
"reward": 1.0087775029242039,
"reward_std": 0.766286326572299,
"rewards/cosine_scaled_reward": 0.17105539632029831,
"rewards/format_reward": 0.6666666679084301,
"step": 135
},
{
"completion_length": 2871.8750610351562,
"epoch": 0.15542857142857142,
"grad_norm": 0.22153440117835999,
"kl": 0.011474609375,
"learning_rate": 9.213010742252327e-07,
"loss": 0.0905,
"reward": 0.6536878123879433,
"reward_std": 1.1243817768990993,
"rewards/cosine_scaled_reward": 0.08726056106388569,
"rewards/format_reward": 0.4791666753590107,
"step": 136
},
{
"completion_length": 3189.4791870117188,
"epoch": 0.15657142857142858,
"grad_norm": 0.11029347777366638,
"kl": 0.010852813720703125,
"learning_rate": 9.195171441101668e-07,
"loss": 0.0475,
"reward": -0.21665774658322334,
"reward_std": 0.5276653412729502,
"rewards/cosine_scaled_reward": -0.22291221655905247,
"rewards/format_reward": 0.2291666716337204,
"step": 137
},
{
"completion_length": 2698.1875610351562,
"epoch": 0.15771428571428572,
"grad_norm": 0.19467265903949738,
"kl": 0.010044097900390625,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0883,
"reward": 0.38313706149347126,
"reward_std": 0.9247510060667992,
"rewards/cosine_scaled_reward": -0.07926480891183019,
"rewards/format_reward": 0.5416666716337204,
"step": 138
},
{
"completion_length": 3213.479217529297,
"epoch": 0.15885714285714286,
"grad_norm": 0.15390190482139587,
"kl": 0.011737823486328125,
"learning_rate": 9.158953424711624e-07,
"loss": 0.0208,
"reward": 0.11885028099641204,
"reward_std": 0.8315641395747662,
"rewards/cosine_scaled_reward": -0.12807486671954393,
"rewards/format_reward": 0.37500000931322575,
"step": 139
},
{
"completion_length": 3079.7083740234375,
"epoch": 0.16,
"grad_norm": 0.2192172110080719,
"kl": 0.015716552734375,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0322,
"reward": 0.0838099829852581,
"reward_std": 0.4756584819406271,
"rewards/cosine_scaled_reward": -0.14559502340853214,
"rewards/format_reward": 0.37500000931322575,
"step": 140
},
{
"completion_length": 2925.416748046875,
"epoch": 0.16114285714285714,
"grad_norm": 0.16468165814876556,
"kl": 0.015735626220703125,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0516,
"reward": 0.3168928772211075,
"reward_std": 0.8896390050649643,
"rewards/cosine_scaled_reward": -0.08113690535537899,
"rewards/format_reward": 0.47916667349636555,
"step": 141
},
{
"completion_length": 2902.8334045410156,
"epoch": 0.16228571428571428,
"grad_norm": 0.11325091868638992,
"kl": 0.014360427856445312,
"learning_rate": 9.103291169269299e-07,
"loss": 0.0589,
"reward": 0.6967496890574694,
"reward_std": 0.6483085379004478,
"rewards/cosine_scaled_reward": 0.025458187563344836,
"rewards/format_reward": 0.6458333507180214,
"step": 142
},
{
"completion_length": 2774.2292251586914,
"epoch": 0.16342857142857142,
"grad_norm": 0.26006487011909485,
"kl": 0.01424407958984375,
"learning_rate": 9.084384631108882e-07,
"loss": 0.0847,
"reward": 0.2764196931384504,
"reward_std": 0.7496595717966557,
"rewards/cosine_scaled_reward": -0.09095682273618877,
"rewards/format_reward": 0.4583333507180214,
"step": 143
},
{
"completion_length": 3044.0833892822266,
"epoch": 0.16457142857142856,
"grad_norm": 0.1781724989414215,
"kl": 0.011707305908203125,
"learning_rate": 9.065303395098358e-07,
"loss": 0.0491,
"reward": 0.19056477712001652,
"reward_std": 0.9319424778223038,
"rewards/cosine_scaled_reward": -0.06096761766821146,
"rewards/format_reward": 0.3125000037252903,
"step": 144
},
{
"completion_length": 2377.4583740234375,
"epoch": 0.1657142857142857,
"grad_norm": 0.13194841146469116,
"kl": 0.013391494750976562,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0296,
"reward": 0.6011020904406905,
"reward_std": 0.5612340047955513,
"rewards/cosine_scaled_reward": 0.01930104335770011,
"rewards/format_reward": 0.5625000055879354,
"step": 145
},
{
"completion_length": 2360.9792098999023,
"epoch": 0.16685714285714287,
"grad_norm": 0.14858490228652954,
"kl": 0.008525848388671875,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0943,
"reward": 0.22767078503966331,
"reward_std": 0.5497826747596264,
"rewards/cosine_scaled_reward": -0.20908129028975964,
"rewards/format_reward": 0.6458333469927311,
"step": 146
},
{
"completion_length": 2831.4583740234375,
"epoch": 0.168,
"grad_norm": 0.13822530210018158,
"kl": 0.015819549560546875,
"learning_rate": 9.007020842191634e-07,
"loss": 0.0489,
"reward": 0.17515791207551956,
"reward_std": 0.7267673751339316,
"rewards/cosine_scaled_reward": -0.0999210444279015,
"rewards/format_reward": 0.3750000074505806,
"step": 147
},
{
"completion_length": 2381.062530517578,
"epoch": 0.16914285714285715,
"grad_norm": 0.09740369766950607,
"kl": 0.013698577880859375,
"learning_rate": 8.987250199168808e-07,
"loss": 0.0063,
"reward": 0.39699011482298374,
"reward_std": 0.515554535202682,
"rewards/cosine_scaled_reward": -0.0931716226041317,
"rewards/format_reward": 0.5833333358168602,
"step": 148
},
{
"completion_length": 2793.291702270508,
"epoch": 0.1702857142857143,
"grad_norm": 0.09077835828065872,
"kl": 0.011531829833984375,
"learning_rate": 8.967309592491052e-07,
"loss": 0.0011,
"reward": 0.5288912262767553,
"reward_std": 0.5327219031751156,
"rewards/cosine_scaled_reward": 0.004028939350973815,
"rewards/format_reward": 0.520833333954215,
"step": 149
},
{
"completion_length": 2927.9583587646484,
"epoch": 0.17142857142857143,
"grad_norm": 0.200521320104599,
"kl": 0.016937255859375,
"learning_rate": 8.9471999940354e-07,
"loss": 0.068,
"reward": 0.37648333609104156,
"reward_std": 0.8985544182360172,
"rewards/cosine_scaled_reward": 0.0007416550070047379,
"rewards/format_reward": 0.37500000558793545,
"step": 150
},
{
"completion_length": 2586.041732788086,
"epoch": 0.17257142857142857,
"grad_norm": 0.30554497241973877,
"kl": 0.01647186279296875,
"learning_rate": 8.926922383915315e-07,
"loss": 0.1073,
"reward": 0.6497351740254089,
"reward_std": 0.7574196644127369,
"rewards/cosine_scaled_reward": 0.054034238681197166,
"rewards/format_reward": 0.5416666753590107,
"step": 151
},
{
"completion_length": 2981.250030517578,
"epoch": 0.1737142857142857,
"grad_norm": 0.21656572818756104,
"kl": 0.019748687744140625,
"learning_rate": 8.906477750432903e-07,
"loss": 0.0443,
"reward": -0.05579106882214546,
"reward_std": 0.7100877314805984,
"rewards/cosine_scaled_reward": -0.17372886929661036,
"rewards/format_reward": 0.2916666679084301,
"step": 152
},
{
"completion_length": 2944.3750228881836,
"epoch": 0.17485714285714285,
"grad_norm": 0.11329537630081177,
"kl": 0.02336883544921875,
"learning_rate": 8.88586709003076e-07,
"loss": 0.0242,
"reward": 0.03104301728308201,
"reward_std": 0.6042890883982182,
"rewards/cosine_scaled_reward": -0.16156182251870632,
"rewards/format_reward": 0.35416667349636555,
"step": 153
},
{
"completion_length": 3291.2500610351562,
"epoch": 0.176,
"grad_norm": 0.19572459161281586,
"kl": 0.01239013671875,
"learning_rate": 8.865091407243394e-07,
"loss": 0.0356,
"reward": 0.9391661360859871,
"reward_std": 0.8916139230132103,
"rewards/cosine_scaled_reward": 0.20916638150811195,
"rewards/format_reward": 0.5208333414047956,
"step": 154
},
{
"completion_length": 2646.916702270508,
"epoch": 0.17714285714285713,
"grad_norm": 0.11613517999649048,
"kl": 0.0165252685546875,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0234,
"reward": 0.5908447969704866,
"reward_std": 0.7097595669329166,
"rewards/cosine_scaled_reward": 0.024589055217802525,
"rewards/format_reward": 0.5416666753590107,
"step": 155
},
{
"completion_length": 3025.1666870117188,
"epoch": 0.1782857142857143,
"grad_norm": 0.23642632365226746,
"kl": 0.014795303344726562,
"learning_rate": 8.823049032816478e-07,
"loss": 0.0964,
"reward": 0.13831177074462175,
"reward_std": 0.793521448969841,
"rewards/cosine_scaled_reward": -0.08709411323070526,
"rewards/format_reward": 0.31250000558793545,
"step": 156
},
{
"completion_length": 3077.2291870117188,
"epoch": 0.17942857142857144,
"grad_norm": 0.08657457679510117,
"kl": 0.017913818359375,
"learning_rate": 8.801784390262943e-07,
"loss": 0.0329,
"reward": 0.01709558442234993,
"reward_std": 0.39399333111941814,
"rewards/cosine_scaled_reward": -0.1685355380177498,
"rewards/format_reward": 0.3541666753590107,
"step": 157
},
{
"completion_length": 3292.8958740234375,
"epoch": 0.18057142857142858,
"grad_norm": 0.19324812293052673,
"kl": 0.01824951171875,
"learning_rate": 8.780358823396352e-07,
"loss": 0.0586,
"reward": 0.7785975768638309,
"reward_std": 0.9201798811554909,
"rewards/cosine_scaled_reward": 0.18096543662250042,
"rewards/format_reward": 0.41666667722165585,
"step": 158
},
{
"completion_length": 3100.854202270508,
"epoch": 0.18171428571428572,
"grad_norm": 0.15799157321453094,
"kl": 0.02082061767578125,
"learning_rate": 8.758773376468604e-07,
"loss": 0.035,
"reward": 0.008442229591310024,
"reward_std": 0.7840565517544746,
"rewards/cosine_scaled_reward": -0.15202889824286103,
"rewards/format_reward": 0.3125000111758709,
"step": 159
},
{
"completion_length": 2739.5416946411133,
"epoch": 0.18285714285714286,
"grad_norm": 0.18316112458705902,
"kl": 0.020570755004882812,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0374,
"reward": 0.2813178598880768,
"reward_std": 0.678614292293787,
"rewards/cosine_scaled_reward": -0.04684108844958246,
"rewards/format_reward": 0.3750000111758709,
"step": 160
},
{
"completion_length": 2786.0208892822266,
"epoch": 0.184,
"grad_norm": 0.1603342592716217,
"kl": 0.02384185791015625,
"learning_rate": 8.715127058347614e-07,
"loss": 0.0217,
"reward": 0.6455709072761238,
"reward_std": 0.6690337508916855,
"rewards/cosine_scaled_reward": 0.0623687906190753,
"rewards/format_reward": 0.5208333432674408,
"step": 161
},
{
"completion_length": 3222.5000610351562,
"epoch": 0.18514285714285714,
"grad_norm": 0.25506702065467834,
"kl": 0.0278472900390625,
"learning_rate": 8.693068314414344e-07,
"loss": 0.0432,
"reward": 0.13122543692588806,
"reward_std": 0.88908408023417,
"rewards/cosine_scaled_reward": -0.10105395689606667,
"rewards/format_reward": 0.3333333358168602,
"step": 162
},
{
"completion_length": 2518.604179382324,
"epoch": 0.18628571428571428,
"grad_norm": 0.09915700554847717,
"kl": 0.019123077392578125,
"learning_rate": 8.670853944836176e-07,
"loss": 0.0145,
"reward": 0.8429984450340271,
"reward_std": 0.6572048924863338,
"rewards/cosine_scaled_reward": 0.10899921134114265,
"rewards/format_reward": 0.6250000055879354,
"step": 163
},
{
"completion_length": 2556.354202270508,
"epoch": 0.18742857142857142,
"grad_norm": 0.1429044008255005,
"kl": 0.021711349487304688,
"learning_rate": 8.648485032310144e-07,
"loss": 0.0501,
"reward": 0.4997831657528877,
"reward_std": 0.6965379063040018,
"rewards/cosine_scaled_reward": -0.03135842923074961,
"rewards/format_reward": 0.5625000055879354,
"step": 164
},
{
"completion_length": 3219.229248046875,
"epoch": 0.18857142857142858,
"grad_norm": 0.18279628455638885,
"kl": 0.033294677734375,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0497,
"reward": 0.08890796452760696,
"reward_std": 0.8442841582000256,
"rewards/cosine_scaled_reward": -0.10137936053797603,
"rewards/format_reward": 0.2916666753590107,
"step": 165
},
{
"completion_length": 2898.8333587646484,
"epoch": 0.18971428571428572,
"grad_norm": 0.1255594938993454,
"kl": 0.02011871337890625,
"learning_rate": 8.603287946810513e-07,
"loss": 0.0265,
"reward": 0.3105367962270975,
"reward_std": 0.62895817309618,
"rewards/cosine_scaled_reward": -0.06348160747438669,
"rewards/format_reward": 0.43750001303851604,
"step": 166
},
{
"completion_length": 2437.8959197998047,
"epoch": 0.19085714285714286,
"grad_norm": 0.25301864743232727,
"kl": 0.01929473876953125,
"learning_rate": 8.580461976679099e-07,
"loss": 0.0756,
"reward": 0.41188543289899826,
"reward_std": 0.8582058474421501,
"rewards/cosine_scaled_reward": -0.1482239617034793,
"rewards/format_reward": 0.7083333469927311,
"step": 167
},
{
"completion_length": 3033.0209045410156,
"epoch": 0.192,
"grad_norm": 0.24609790742397308,
"kl": 0.02398681640625,
"learning_rate": 8.557485869176825e-07,
"loss": 0.0643,
"reward": 0.3896508193574846,
"reward_std": 0.9152091555297375,
"rewards/cosine_scaled_reward": -0.05517459171824157,
"rewards/format_reward": 0.5000000186264515,
"step": 168
},
{
"completion_length": 2549.3750610351562,
"epoch": 0.19314285714285714,
"grad_norm": 0.24488316476345062,
"kl": 0.02884674072265625,
"learning_rate": 8.534360744126753e-07,
"loss": 0.0667,
"reward": 1.2588793262839317,
"reward_std": 1.0218086428940296,
"rewards/cosine_scaled_reward": 0.31693966779857874,
"rewards/format_reward": 0.6250000074505806,
"step": 169
},
{
"completion_length": 2387.8958740234375,
"epoch": 0.19428571428571428,
"grad_norm": 0.15666845440864563,
"kl": 0.021160125732421875,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0736,
"reward": 0.5572759381029755,
"reward_std": 0.7070234641432762,
"rewards/cosine_scaled_reward": -0.013028699904680252,
"rewards/format_reward": 0.5833333395421505,
"step": 170
},
{
"completion_length": 2764.2083892822266,
"epoch": 0.19542857142857142,
"grad_norm": 0.2903493046760559,
"kl": 0.023040771484375,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0756,
"reward": 0.3716934472322464,
"reward_std": 0.7128049619495869,
"rewards/cosine_scaled_reward": -0.03290329407900572,
"rewards/format_reward": 0.43750001303851604,
"step": 171
},
{
"completion_length": 2922.833366394043,
"epoch": 0.19657142857142856,
"grad_norm": 0.32527434825897217,
"kl": 0.03302001953125,
"learning_rate": 8.464102570534061e-07,
"loss": 0.0821,
"reward": 0.5163949467241764,
"reward_std": 0.914849640801549,
"rewards/cosine_scaled_reward": 0.06028079940006137,
"rewards/format_reward": 0.39583333767950535,
"step": 172
},
{
"completion_length": 2277.3541870117188,
"epoch": 0.1977142857142857,
"grad_norm": 0.28431299328804016,
"kl": 0.0292205810546875,
"learning_rate": 8.440392717955475e-07,
"loss": 0.0899,
"reward": 0.19020776241086423,
"reward_std": 0.8620494175702333,
"rewards/cosine_scaled_reward": -0.18614612240344286,
"rewards/format_reward": 0.5625000074505806,
"step": 173
},
{
"completion_length": 2744.687545776367,
"epoch": 0.19885714285714284,
"grad_norm": 0.2881910800933838,
"kl": 0.03791046142578125,
"learning_rate": 8.416539554784089e-07,
"loss": 0.0575,
"reward": 0.31613041274249554,
"reward_std": 0.8530451729893684,
"rewards/cosine_scaled_reward": -0.08151813084259629,
"rewards/format_reward": 0.47916666977107525,
"step": 174
},
{
"completion_length": 2789.6875534057617,
"epoch": 0.2,
"grad_norm": 0.08940012753009796,
"kl": 0.037750244140625,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0053,
"reward": 0.3539888858795166,
"reward_std": 0.473992221057415,
"rewards/cosine_scaled_reward": -0.06258890964090824,
"rewards/format_reward": 0.4791666716337204,
"step": 175
},
{
"completion_length": 2489.5625610351562,
"epoch": 0.20114285714285715,
"grad_norm": 0.18962816894054413,
"kl": 0.03296661376953125,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0326,
"reward": 0.5119255073368549,
"reward_std": 0.8859294652938843,
"rewards/cosine_scaled_reward": -0.014870589919155464,
"rewards/format_reward": 0.5416666772216558,
"step": 176
},
{
"completion_length": 3037.5209045410156,
"epoch": 0.2022857142857143,
"grad_norm": 0.2424456924200058,
"kl": 0.043304443359375,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0484,
"reward": 0.2696651890873909,
"reward_std": 0.9238467961549759,
"rewards/cosine_scaled_reward": -0.12558407767210156,
"rewards/format_reward": 0.5208333544433117,
"step": 177
},
{
"completion_length": 2764.645839691162,
"epoch": 0.20342857142857143,
"grad_norm": 0.22290733456611633,
"kl": 0.05438232421875,
"learning_rate": 8.319717151140072e-07,
"loss": 0.0283,
"reward": 0.1138812736608088,
"reward_std": 0.8126712590456009,
"rewards/cosine_scaled_reward": -0.13055937364697456,
"rewards/format_reward": 0.3750000074505806,
"step": 178
},
{
"completion_length": 2984.416702270508,
"epoch": 0.20457142857142857,
"grad_norm": 0.1466417908668518,
"kl": 0.047088623046875,
"learning_rate": 8.295165011252396e-07,
"loss": 0.015,
"reward": 0.0321359746158123,
"reward_std": 0.5167017672210932,
"rewards/cosine_scaled_reward": -0.17143201641738415,
"rewards/format_reward": 0.3750000074505806,
"step": 179
},
{
"completion_length": 2136.145881652832,
"epoch": 0.2057142857142857,
"grad_norm": 0.09843747317790985,
"kl": 0.0455169677734375,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0107,
"reward": 0.910092594102025,
"reward_std": 0.585525143891573,
"rewards/cosine_scaled_reward": 0.1425462868064642,
"rewards/format_reward": 0.6250000055879354,
"step": 180
},
{
"completion_length": 3006.2500228881836,
"epoch": 0.20685714285714285,
"grad_norm": 0.13610567152500153,
"kl": 0.055999755859375,
"learning_rate": 8.245653237555705e-07,
"loss": 0.0132,
"reward": -0.03030078485608101,
"reward_std": 0.600213073194027,
"rewards/cosine_scaled_reward": -0.18181706592440605,
"rewards/format_reward": 0.3333333469927311,
"step": 181
},
{
"completion_length": 2419.437530517578,
"epoch": 0.208,
"grad_norm": 0.18283116817474365,
"kl": 0.0460205078125,
"learning_rate": 8.220696016880687e-07,
"loss": 0.0463,
"reward": 0.5305005796253681,
"reward_std": 0.7933950982987881,
"rewards/cosine_scaled_reward": -0.005583042744547129,
"rewards/format_reward": 0.5416666753590107,
"step": 182
},
{
"completion_length": 2477.1875915527344,
"epoch": 0.20914285714285713,
"grad_norm": 0.6196571588516235,
"kl": 0.0609283447265625,
"learning_rate": 8.195606193320136e-07,
"loss": 0.1294,
"reward": 0.5540785677731037,
"reward_std": 0.9478080496191978,
"rewards/cosine_scaled_reward": 0.006205941084772348,
"rewards/format_reward": 0.5416666828095913,
"step": 183
},
{
"completion_length": 2701.562530517578,
"epoch": 0.2102857142857143,
"grad_norm": 0.19450385868549347,
"kl": 0.063079833984375,
"learning_rate": 8.170384989716657e-07,
"loss": 0.0082,
"reward": 0.25770498625934124,
"reward_std": 0.8395693749189377,
"rewards/cosine_scaled_reward": -0.07948085246607661,
"rewards/format_reward": 0.4166666753590107,
"step": 184
},
{
"completion_length": 2466.125068664551,
"epoch": 0.21142857142857144,
"grad_norm": 0.1757989078760147,
"kl": 0.08551025390625,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0215,
"reward": 0.06042790925130248,
"reward_std": 0.5447751199826598,
"rewards/cosine_scaled_reward": -0.18853605829644948,
"rewards/format_reward": 0.4375000186264515,
"step": 185
},
{
"completion_length": 2972.1458740234375,
"epoch": 0.21257142857142858,
"grad_norm": 0.2645973861217499,
"kl": 0.077728271484375,
"learning_rate": 8.119553365707802e-07,
"loss": -0.0252,
"reward": 0.47168839909136295,
"reward_std": 0.5297163799405098,
"rewards/cosine_scaled_reward": 0.03792751580476761,
"rewards/format_reward": 0.3958333395421505,
"step": 186
},
{
"completion_length": 2462.250045776367,
"epoch": 0.21371428571428572,
"grad_norm": 0.23852241039276123,
"kl": 0.07574462890625,
"learning_rate": 8.093945422764069e-07,
"loss": 0.0462,
"reward": 0.22391528385924175,
"reward_std": 0.6513072997331619,
"rewards/cosine_scaled_reward": -0.16929237358272076,
"rewards/format_reward": 0.5625000111758709,
"step": 187
},
{
"completion_length": 3234.854202270508,
"epoch": 0.21485714285714286,
"grad_norm": 0.2536059319972992,
"kl": 0.084686279296875,
"learning_rate": 8.068211054579943e-07,
"loss": 0.006,
"reward": -0.11121096089482307,
"reward_std": 0.8675953522324562,
"rewards/cosine_scaled_reward": -0.14935547299683094,
"rewards/format_reward": 0.18750000186264515,
"step": 188
},
{
"completion_length": 2777.7084045410156,
"epoch": 0.216,
"grad_norm": 0.2072547823190689,
"kl": 0.0953369140625,
"learning_rate": 8.04235151541222e-07,
"loss": 0.0194,
"reward": 0.494580146856606,
"reward_std": 0.7433264572173357,
"rewards/cosine_scaled_reward": -0.07562660798430443,
"rewards/format_reward": 0.6458333395421505,
"step": 189
},
{
"completion_length": 2382.875045776367,
"epoch": 0.21714285714285714,
"grad_norm": 0.2012052983045578,
"kl": 0.08172607421875,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0391,
"reward": 0.3381795147433877,
"reward_std": 0.5929664559662342,
"rewards/cosine_scaled_reward": -0.07049360126256943,
"rewards/format_reward": 0.47916668094694614,
"step": 190
},
{
"completion_length": 2421.979217529297,
"epoch": 0.21828571428571428,
"grad_norm": 0.2843816578388214,
"kl": 0.09423828125,
"learning_rate": 7.990261971595048e-07,
"loss": 0.0108,
"reward": 0.6503109214827418,
"reward_std": 1.1196836531162262,
"rewards/cosine_scaled_reward": 0.08557211281731725,
"rewards/format_reward": 0.47916667722165585,
"step": 191
},
{
"completion_length": 3207.229217529297,
"epoch": 0.21942857142857142,
"grad_norm": 0.3324408233165741,
"kl": 0.10272216796875,
"learning_rate": 7.964034505716476e-07,
"loss": 0.0665,
"reward": -0.08572058263234794,
"reward_std": 0.6475069168955088,
"rewards/cosine_scaled_reward": -0.19911029934883118,
"rewards/format_reward": 0.31250000931322575,
"step": 192
},
{
"completion_length": 3053.5834045410156,
"epoch": 0.22057142857142858,
"grad_norm": 0.29358112812042236,
"kl": 0.0987548828125,
"learning_rate": 7.93768694627233e-07,
"loss": 0.0517,
"reward": 0.2612769678235054,
"reward_std": 0.8455968722701073,
"rewards/cosine_scaled_reward": -0.07769485469907522,
"rewards/format_reward": 0.4166666679084301,
"step": 193
},
{
"completion_length": 2582.645927429199,
"epoch": 0.22171428571428572,
"grad_norm": 0.42259493470191956,
"kl": 0.0806884765625,
"learning_rate": 7.911220577405484e-07,
"loss": 0.0436,
"reward": 0.9180384781211615,
"reward_std": 0.7587432824075222,
"rewards/cosine_scaled_reward": 0.2090192511677742,
"rewards/format_reward": 0.5000000055879354,
"step": 194
},
{
"completion_length": 2819.6667098999023,
"epoch": 0.22285714285714286,
"grad_norm": 0.49130040407180786,
"kl": 0.105621337890625,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0463,
"reward": 0.2741181245073676,
"reward_std": 0.762896966189146,
"rewards/cosine_scaled_reward": -0.040024266578257084,
"rewards/format_reward": 0.35416668094694614,
"step": 195
},
{
"completion_length": 3187.7708740234375,
"epoch": 0.224,
"grad_norm": 0.22648778557777405,
"kl": 0.13623046875,
"learning_rate": 7.857936576865356e-07,
"loss": 0.0205,
"reward": 0.16096001025289297,
"reward_std": 0.6957874298095703,
"rewards/cosine_scaled_reward": -0.06535333674401045,
"rewards/format_reward": 0.29166666977107525,
"step": 196
},
{
"completion_length": 2287.1875610351562,
"epoch": 0.22514285714285714,
"grad_norm": 0.6928053498268127,
"kl": 0.13812255859375,
"learning_rate": 7.831121542179086e-07,
"loss": -0.0182,
"reward": 0.7687155855819583,
"reward_std": 1.2102737426757812,
"rewards/cosine_scaled_reward": 0.12394111696630716,
"rewards/format_reward": 0.5208333469927311,
"step": 197
},
{
"completion_length": 2939.187530517578,
"epoch": 0.22628571428571428,
"grad_norm": 0.4229676425457001,
"kl": 0.155517578125,
"learning_rate": 7.804192891917571e-07,
"loss": 0.0503,
"reward": 0.317043187096715,
"reward_std": 0.6642983369529247,
"rewards/cosine_scaled_reward": -0.008145075291395187,
"rewards/format_reward": 0.3333333395421505,
"step": 198
},
{
"completion_length": 2638.979217529297,
"epoch": 0.22742857142857142,
"grad_norm": 0.4884941875934601,
"kl": 0.116912841796875,
"learning_rate": 7.777151938545235e-07,
"loss": 0.0449,
"reward": -0.08919784612953663,
"reward_std": 0.5895257294178009,
"rewards/cosine_scaled_reward": -0.1383489231520798,
"rewards/format_reward": 0.18750000186264515,
"step": 199
},
{
"completion_length": 2367.125068664551,
"epoch": 0.22857142857142856,
"grad_norm": 0.6410492062568665,
"kl": 0.13128662109375,
"learning_rate": 7.75e-07,
"loss": 0.0794,
"reward": 0.7019957322627306,
"reward_std": 0.9874038621783257,
"rewards/cosine_scaled_reward": 0.04891452379524708,
"rewards/format_reward": 0.604166679084301,
"step": 200
},
{
"completion_length": 2166.708366394043,
"epoch": 0.2297142857142857,
"grad_norm": 0.7525650858879089,
"kl": 0.12518310546875,
"learning_rate": 7.72273839962904e-07,
"loss": 0.0497,
"reward": 0.9610625859349966,
"reward_std": 1.213215809315443,
"rewards/cosine_scaled_reward": 0.24094795435667038,
"rewards/format_reward": 0.4791666753590107,
"step": 201
},
{
"completion_length": 2405.8958892822266,
"epoch": 0.23085714285714284,
"grad_norm": 0.3488696217536926,
"kl": 0.199951171875,
"learning_rate": 7.695368466124296e-07,
"loss": -0.0162,
"reward": 0.9840776808559895,
"reward_std": 0.6163855046033859,
"rewards/cosine_scaled_reward": 0.20037216693162918,
"rewards/format_reward": 0.5833333358168602,
"step": 202
},
{
"completion_length": 3139.791702270508,
"epoch": 0.232,
"grad_norm": 0.4035970866680145,
"kl": 0.2327880859375,
"learning_rate": 7.667891533457718e-07,
"loss": 0.0518,
"reward": 0.11016486119478941,
"reward_std": 0.6740043368190527,
"rewards/cosine_scaled_reward": -0.08033423824235797,
"rewards/format_reward": 0.2708333395421505,
"step": 203
},
{
"completion_length": 2195.8125610351562,
"epoch": 0.23314285714285715,
"grad_norm": 0.45050907135009766,
"kl": 0.18145751953125,
"learning_rate": 7.640308940816239e-07,
"loss": 0.0271,
"reward": 0.4769836826599203,
"reward_std": 0.7626992892473936,
"rewards/cosine_scaled_reward": -0.11567484028637409,
"rewards/format_reward": 0.7083333507180214,
"step": 204
},
{
"completion_length": 2807.3333892822266,
"epoch": 0.2342857142857143,
"grad_norm": 0.40741464495658875,
"kl": 0.202392578125,
"learning_rate": 7.612622032536507e-07,
"loss": 0.02,
"reward": 0.6580097423866391,
"reward_std": 0.8570194113999605,
"rewards/cosine_scaled_reward": 0.13108818820910528,
"rewards/format_reward": 0.3958333507180214,
"step": 205
},
{
"completion_length": 3126.8541870117188,
"epoch": 0.23542857142857143,
"grad_norm": 0.3323240876197815,
"kl": 0.2293701171875,
"learning_rate": 7.584832158039378e-07,
"loss": 0.025,
"reward": -0.058580704033374786,
"reward_std": 0.7412764355540276,
"rewards/cosine_scaled_reward": -0.16470702644437551,
"rewards/format_reward": 0.2708333395421505,
"step": 206
},
{
"completion_length": 3054.875045776367,
"epoch": 0.23657142857142857,
"grad_norm": 0.4214930534362793,
"kl": 0.2855224609375,
"learning_rate": 7.556940671764124e-07,
"loss": 0.0122,
"reward": -0.014476008713245392,
"reward_std": 0.5796043276786804,
"rewards/cosine_scaled_reward": -0.22598800901323557,
"rewards/format_reward": 0.4375000149011612,
"step": 207
},
{
"completion_length": 2381.5417251586914,
"epoch": 0.2377142857142857,
"grad_norm": 0.284451961517334,
"kl": 0.1992034912109375,
"learning_rate": 7.528948933102438e-07,
"loss": 0.0347,
"reward": 0.6913385018706322,
"reward_std": 0.6349469162523746,
"rewards/cosine_scaled_reward": 0.05400256626307964,
"rewards/format_reward": 0.5833333358168602,
"step": 208
},
{
"completion_length": 2797.312545776367,
"epoch": 0.23885714285714285,
"grad_norm": 0.9943209290504456,
"kl": 0.2471923828125,
"learning_rate": 7.500858306332172e-07,
"loss": 0.0679,
"reward": 0.6385658853687346,
"reward_std": 0.8412618339061737,
"rewards/cosine_scaled_reward": 0.038032938959077,
"rewards/format_reward": 0.5625000074505806,
"step": 209
},
{
"completion_length": 2810.729202270508,
"epoch": 0.24,
"grad_norm": 0.27606576681137085,
"kl": 0.216064453125,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0424,
"reward": 0.17591036087833345,
"reward_std": 0.5839308425784111,
"rewards/cosine_scaled_reward": -0.10996149946004152,
"rewards/format_reward": 0.39583334140479565,
"step": 210
},
{
"completion_length": 2546.8333892822266,
"epoch": 0.24114285714285713,
"grad_norm": 0.5562529563903809,
"kl": 0.2337646484375,
"learning_rate": 7.444385869608921e-07,
"loss": 0.0024,
"reward": 0.3355960976332426,
"reward_std": 0.8199643902480602,
"rewards/cosine_scaled_reward": -0.061368612572550774,
"rewards/format_reward": 0.4583333395421505,
"step": 211
},
{
"completion_length": 2440.375045776367,
"epoch": 0.2422857142857143,
"grad_norm": 0.6957542300224304,
"kl": 0.21954345703125,
"learning_rate": 7.416006812042827e-07,
"loss": 0.0798,
"reward": 0.315935923717916,
"reward_std": 0.7707805298268795,
"rewards/cosine_scaled_reward": -0.09203205385711044,
"rewards/format_reward": 0.5000000074505806,
"step": 212
},
{
"completion_length": 2497.416702270508,
"epoch": 0.24342857142857144,
"grad_norm": 0.6777252554893494,
"kl": 0.28155517578125,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0231,
"reward": 0.4181370511651039,
"reward_std": 0.701967678964138,
"rewards/cosine_scaled_reward": -0.051348146982491016,
"rewards/format_reward": 0.5208333432674408,
"step": 213
},
{
"completion_length": 2721.791702270508,
"epoch": 0.24457142857142858,
"grad_norm": 0.44339874386787415,
"kl": 0.2493896484375,
"learning_rate": 7.358969934210438e-07,
"loss": 0.0185,
"reward": 0.5002706460654736,
"reward_std": 0.8209729120135307,
"rewards/cosine_scaled_reward": -0.020698013715445995,
"rewards/format_reward": 0.5416666734963655,
"step": 214
},
{
"completion_length": 2568.8958740234375,
"epoch": 0.24571428571428572,
"grad_norm": 0.3160378336906433,
"kl": 0.20928955078125,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0097,
"reward": 0.19143693707883358,
"reward_std": 0.5702317543327808,
"rewards/cosine_scaled_reward": -0.14386487286537886,
"rewards/format_reward": 0.47916667349636555,
"step": 215
},
{
"completion_length": 2325.666702270508,
"epoch": 0.24685714285714286,
"grad_norm": 0.42989280819892883,
"kl": 0.224609375,
"learning_rate": 7.301570646506027e-07,
"loss": 0.0334,
"reward": 0.6585849169641733,
"reward_std": 0.9609077526256442,
"rewards/cosine_scaled_reward": -0.014457540586590767,
"rewards/format_reward": 0.6875000223517418,
"step": 216
},
{
"completion_length": 2915.604232788086,
"epoch": 0.248,
"grad_norm": 1.269454836845398,
"kl": 0.2520751953125,
"learning_rate": 7.27273859315928e-07,
"loss": 0.1032,
"reward": 0.1808436312712729,
"reward_std": 0.8456562999635935,
"rewards/cosine_scaled_reward": -0.08666152018122375,
"rewards/format_reward": 0.3541666753590107,
"step": 217
},
{
"completion_length": 2640.4583740234375,
"epoch": 0.24914285714285714,
"grad_norm": 0.4037168323993683,
"kl": 0.21539306640625,
"learning_rate": 7.243820139034464e-07,
"loss": 0.0083,
"reward": 0.4553571194410324,
"reward_std": 0.7446375414729118,
"rewards/cosine_scaled_reward": -0.04315477702766657,
"rewards/format_reward": 0.5416666697710752,
"step": 218
},
{
"completion_length": 2454.8958740234375,
"epoch": 0.2502857142857143,
"grad_norm": 0.6872981190681458,
"kl": 0.2410888671875,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0494,
"reward": 0.7339769629761577,
"reward_std": 0.8893967866897583,
"rewards/cosine_scaled_reward": 0.03365514148026705,
"rewards/format_reward": 0.6666666753590107,
"step": 219
},
{
"completion_length": 2835.3750534057617,
"epoch": 0.25142857142857145,
"grad_norm": 0.4908996820449829,
"kl": 0.2960205078125,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0663,
"reward": -0.27113689854741096,
"reward_std": 0.4867183081805706,
"rewards/cosine_scaled_reward": -0.2918184567242861,
"rewards/format_reward": 0.3125000074505806,
"step": 220
},
{
"completion_length": 2121.1250610351562,
"epoch": 0.25257142857142856,
"grad_norm": 0.9558163285255432,
"kl": 0.21978759765625,
"learning_rate": 7.156560487081051e-07,
"loss": -0.0463,
"reward": 0.7585705481469631,
"reward_std": 0.8061345554888248,
"rewards/cosine_scaled_reward": 0.06678527034819126,
"rewards/format_reward": 0.6250000055879354,
"step": 221
},
{
"completion_length": 2521.4584197998047,
"epoch": 0.2537142857142857,
"grad_norm": 0.2961377203464508,
"kl": 0.2503662109375,
"learning_rate": 7.127310565369415e-07,
"loss": 0.0342,
"reward": 0.4850865611806512,
"reward_std": 0.5646930634975433,
"rewards/cosine_scaled_reward": -0.08037340734153986,
"rewards/format_reward": 0.645833345130086,
"step": 222
},
{
"completion_length": 2613.5208892822266,
"epoch": 0.25485714285714284,
"grad_norm": 0.3459699749946594,
"kl": 0.2802734375,
"learning_rate": 7.097981330836616e-07,
"loss": 0.0119,
"reward": 0.4122865653480403,
"reward_std": 0.6126667521893978,
"rewards/cosine_scaled_reward": -0.09594006463885307,
"rewards/format_reward": 0.6041666716337204,
"step": 223
},
{
"completion_length": 2961.7709350585938,
"epoch": 0.256,
"grad_norm": 0.6146445870399475,
"kl": 0.28375244140625,
"learning_rate": 7.068574212948169e-07,
"loss": 0.0476,
"reward": 0.2019269121810794,
"reward_std": 0.7791545800864697,
"rewards/cosine_scaled_reward": -0.05528653599321842,
"rewards/format_reward": 0.3125000074505806,
"step": 224
},
{
"completion_length": 3000.687545776367,
"epoch": 0.2571428571428571,
"grad_norm": 0.6927861571311951,
"kl": 0.2901611328125,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0702,
"reward": 0.3877083119004965,
"reward_std": 0.7308296859264374,
"rewards/cosine_scaled_reward": -0.05614587618038058,
"rewards/format_reward": 0.5000000111758709,
"step": 225
},
{
"completion_length": 2560.979248046875,
"epoch": 0.2582857142857143,
"grad_norm": 0.47879165410995483,
"kl": 0.26910400390625,
"learning_rate": 7.009532063876148e-07,
"loss": 0.0242,
"reward": 0.7968797162175179,
"reward_std": 0.7786244936287403,
"rewards/cosine_scaled_reward": 0.14843985438346863,
"rewards/format_reward": 0.5000000037252903,
"step": 226
},
{
"completion_length": 2564.1458740234375,
"epoch": 0.25942857142857145,
"grad_norm": 0.469950795173645,
"kl": 0.324462890625,
"learning_rate": 6.979899910323624e-07,
"loss": 0.0308,
"reward": 0.5330985919572413,
"reward_std": 0.9147845208644867,
"rewards/cosine_scaled_reward": -0.014700718224048615,
"rewards/format_reward": 0.5625000149011612,
"step": 227
},
{
"completion_length": 2534.604202270508,
"epoch": 0.26057142857142856,
"grad_norm": 0.3445420265197754,
"kl": 0.29400634765625,
"learning_rate": 6.950195628537299e-07,
"loss": 0.0282,
"reward": 0.46225254982709885,
"reward_std": 0.6663805656135082,
"rewards/cosine_scaled_reward": 0.012376293540000916,
"rewards/format_reward": 0.4375,
"step": 228
},
{
"completion_length": 3104.7500610351562,
"epoch": 0.26171428571428573,
"grad_norm": 0.5392424464225769,
"kl": 0.446044921875,
"learning_rate": 6.920420666261961e-07,
"loss": 0.0297,
"reward": 0.41551550291478634,
"reward_std": 0.5293188579380512,
"rewards/cosine_scaled_reward": -0.021408939734101295,
"rewards/format_reward": 0.4583333469927311,
"step": 229
},
{
"completion_length": 3205.3959045410156,
"epoch": 0.26285714285714284,
"grad_norm": 0.3657205104827881,
"kl": 0.4345703125,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0674,
"reward": -0.06970879365690053,
"reward_std": 0.5970296040177345,
"rewards/cosine_scaled_reward": -0.19110439904034138,
"rewards/format_reward": 0.31250000558793545,
"step": 230
},
{
"completion_length": 2905.166717529297,
"epoch": 0.264,
"grad_norm": 0.6562315821647644,
"kl": 0.39349365234375,
"learning_rate": 6.860664508377001e-07,
"loss": 0.0594,
"reward": 0.326082413084805,
"reward_std": 0.7788660638034344,
"rewards/cosine_scaled_reward": -0.003625463228672743,
"rewards/format_reward": 0.3333333395421505,
"step": 231
},
{
"completion_length": 3106.604248046875,
"epoch": 0.2651428571428571,
"grad_norm": 0.3778403103351593,
"kl": 0.42138671875,
"learning_rate": 6.83068622519821e-07,
"loss": 0.062,
"reward": -0.2184969331137836,
"reward_std": 0.5246814601123333,
"rewards/cosine_scaled_reward": -0.21341513469815254,
"rewards/format_reward": 0.2083333358168602,
"step": 232
},
{
"completion_length": 2835.93758392334,
"epoch": 0.2662857142857143,
"grad_norm": 0.723120927810669,
"kl": 0.4295654296875,
"learning_rate": 6.800643086250121e-07,
"loss": 0.0837,
"reward": 0.21882726065814495,
"reward_std": 0.8319694129750133,
"rewards/cosine_scaled_reward": -0.10933638806454837,
"rewards/format_reward": 0.43750001676380634,
"step": 233
},
{
"completion_length": 2814.5417137145996,
"epoch": 0.2674285714285714,
"grad_norm": 0.5316165685653687,
"kl": 0.404998779296875,
"learning_rate": 6.770536555792944e-07,
"loss": 0.0213,
"reward": -0.02029888378456235,
"reward_std": 0.5957829728722572,
"rewards/cosine_scaled_reward": -0.17681611701846123,
"rewards/format_reward": 0.3333333358168602,
"step": 234
},
{
"completion_length": 2171.729217529297,
"epoch": 0.26857142857142857,
"grad_norm": 1.2359085083007812,
"kl": 0.30389404296875,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0713,
"reward": 0.5996768039185554,
"reward_std": 0.8905858621001244,
"rewards/cosine_scaled_reward": -0.02307826466858387,
"rewards/format_reward": 0.645833333954215,
"step": 235
},
{
"completion_length": 2596.937545776367,
"epoch": 0.26971428571428574,
"grad_norm": 0.44107890129089355,
"kl": 0.310302734375,
"learning_rate": 6.710139192768694e-07,
"loss": 0.005,
"reward": 0.6644928082823753,
"reward_std": 0.8656329847872257,
"rewards/cosine_scaled_reward": -0.00108695263043046,
"rewards/format_reward": 0.6666666734963655,
"step": 236
},
{
"completion_length": 2504.2083892822266,
"epoch": 0.27085714285714285,
"grad_norm": 0.6235182285308838,
"kl": 0.3521728515625,
"learning_rate": 6.679851303883891e-07,
"loss": 0.0334,
"reward": 0.5841595754027367,
"reward_std": 0.7832061983644962,
"rewards/cosine_scaled_reward": 0.00041312072426080704,
"rewards/format_reward": 0.5833333395421505,
"step": 237
},
{
"completion_length": 2374.1458892822266,
"epoch": 0.272,
"grad_norm": 0.5051608085632324,
"kl": 0.3377685546875,
"learning_rate": 6.649505910711058e-07,
"loss": 0.0394,
"reward": 1.067820217460394,
"reward_std": 0.7289628759026527,
"rewards/cosine_scaled_reward": 0.23182673379778862,
"rewards/format_reward": 0.604166679084301,
"step": 238
},
{
"completion_length": 2220.125030517578,
"epoch": 0.27314285714285713,
"grad_norm": 0.33481940627098083,
"kl": 0.332763671875,
"learning_rate": 6.619104492241847e-07,
"loss": 0.0246,
"reward": 0.8250223018694669,
"reward_std": 0.5633459165692329,
"rewards/cosine_scaled_reward": 0.15209448896348476,
"rewards/format_reward": 0.520833333954215,
"step": 239
},
{
"completion_length": 2807.2709045410156,
"epoch": 0.2742857142857143,
"grad_norm": 0.47511982917785645,
"kl": 0.609130859375,
"learning_rate": 6.588648530198504e-07,
"loss": 0.063,
"reward": 0.18580850027501583,
"reward_std": 0.5817486252635717,
"rewards/cosine_scaled_reward": -0.16751242137979716,
"rewards/format_reward": 0.520833345130086,
"step": 240
},
{
"completion_length": 2705.0833892822266,
"epoch": 0.2754285714285714,
"grad_norm": 0.6473789811134338,
"kl": 0.42578125,
"learning_rate": 6.558139508961654e-07,
"loss": 0.025,
"reward": 0.17192419804632664,
"reward_std": 0.6516960971057415,
"rewards/cosine_scaled_reward": -0.20570457633584738,
"rewards/format_reward": 0.5833333395421505,
"step": 241
},
{
"completion_length": 2445.0834045410156,
"epoch": 0.2765714285714286,
"grad_norm": 0.5017490983009338,
"kl": 0.50115966796875,
"learning_rate": 6.527578915497951e-07,
"loss": 0.0511,
"reward": 0.5459022335708141,
"reward_std": 0.6534602418541908,
"rewards/cosine_scaled_reward": -0.10204888670705259,
"rewards/format_reward": 0.7500000167638063,
"step": 242
},
{
"completion_length": 2775.3125915527344,
"epoch": 0.2777142857142857,
"grad_norm": 0.9957566261291504,
"kl": 0.471435546875,
"learning_rate": 6.496968239287603e-07,
"loss": 0.009,
"reward": 0.7295572739094496,
"reward_std": 0.8050657417625189,
"rewards/cosine_scaled_reward": 0.09394530206918716,
"rewards/format_reward": 0.5416666716337204,
"step": 243
},
{
"completion_length": 2788.8959197998047,
"epoch": 0.27885714285714286,
"grad_norm": 0.5656623244285583,
"kl": 0.4091796875,
"learning_rate": 6.466308972251785e-07,
"loss": 0.0419,
"reward": 0.5535422116518021,
"reward_std": 0.7251980789005756,
"rewards/cosine_scaled_reward": 0.05802109342766926,
"rewards/format_reward": 0.43750000931322575,
"step": 244
},
{
"completion_length": 3191.8334045410156,
"epoch": 0.28,
"grad_norm": 1.2059617042541504,
"kl": 0.46240234375,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0945,
"reward": 0.15825808281078935,
"reward_std": 0.9811517968773842,
"rewards/cosine_scaled_reward": -0.13962095836177468,
"rewards/format_reward": 0.4375000111758709,
"step": 245
},
{
"completion_length": 2959.666748046875,
"epoch": 0.28114285714285714,
"grad_norm": 0.74327552318573,
"kl": 0.458740234375,
"learning_rate": 6.404850645156841e-07,
"loss": 0.0224,
"reward": 0.4606718048453331,
"reward_std": 0.9427804127335548,
"rewards/cosine_scaled_reward": -0.009247444570064545,
"rewards/format_reward": 0.47916667722165585,
"step": 246
},
{
"completion_length": 3009.2083435058594,
"epoch": 0.2822857142857143,
"grad_norm": 0.6833385229110718,
"kl": 0.4307861328125,
"learning_rate": 6.374054580489873e-07,
"loss": 0.0303,
"reward": 0.013721236027777195,
"reward_std": 0.6590881273150444,
"rewards/cosine_scaled_reward": -0.19105605548247695,
"rewards/format_reward": 0.39583333767950535,
"step": 247
},
{
"completion_length": 2684.1875762939453,
"epoch": 0.2834285714285714,
"grad_norm": 0.5844886302947998,
"kl": 0.35345458984375,
"learning_rate": 6.343215915635761e-07,
"loss": 0.0537,
"reward": 0.45633104629814625,
"reward_std": 0.7913403064012527,
"rewards/cosine_scaled_reward": -0.032251136377453804,
"rewards/format_reward": 0.5208333414047956,
"step": 248
},
{
"completion_length": 2176.4792251586914,
"epoch": 0.2845714285714286,
"grad_norm": 0.9092757105827332,
"kl": 0.264862060546875,
"learning_rate": 6.31233615362752e-07,
"loss": 0.0562,
"reward": 0.8251112774014473,
"reward_std": 0.7513875584118068,
"rewards/cosine_scaled_reward": 0.08963895868510008,
"rewards/format_reward": 0.6458333395421505,
"step": 249
},
{
"completion_length": 2420.500030517578,
"epoch": 0.2857142857142857,
"grad_norm": 1.0886207818984985,
"kl": 0.3681640625,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0552,
"reward": 0.2780859973281622,
"reward_std": 0.6804126389324665,
"rewards/cosine_scaled_reward": -0.183873676112853,
"rewards/format_reward": 0.6458333432674408,
"step": 250
},
{
"completion_length": 1995.895896911621,
"epoch": 0.28685714285714287,
"grad_norm": 0.5505285263061523,
"kl": 0.30914306640625,
"learning_rate": 6.25045936022246e-07,
"loss": 0.045,
"reward": 0.45957393012940884,
"reward_std": 0.8468098007142544,
"rewards/cosine_scaled_reward": -0.07229639682918787,
"rewards/format_reward": 0.604166679084301,
"step": 251
},
{
"completion_length": 2733.312545776367,
"epoch": 0.288,
"grad_norm": 0.4072404205799103,
"kl": 0.435791015625,
"learning_rate": 6.219465344613258e-07,
"loss": 0.0364,
"reward": 0.22467345744371414,
"reward_std": 0.5785614065825939,
"rewards/cosine_scaled_reward": -0.08557994151487947,
"rewards/format_reward": 0.3958333432674408,
"step": 252
},
{
"completion_length": 2344.9792098999023,
"epoch": 0.28914285714285715,
"grad_norm": 0.5800250768661499,
"kl": 0.36541748046875,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0011,
"reward": 0.6350781377404928,
"reward_std": 0.6775294542312622,
"rewards/cosine_scaled_reward": -0.03662758320569992,
"rewards/format_reward": 0.708333345130086,
"step": 253
},
{
"completion_length": 3104.7709045410156,
"epoch": 0.29028571428571426,
"grad_norm": 0.5420771241188049,
"kl": 0.61962890625,
"learning_rate": 6.157373628530852e-07,
"loss": 0.0671,
"reward": 0.0348883168771863,
"reward_std": 0.6898283958435059,
"rewards/cosine_scaled_reward": -0.17005583579884842,
"rewards/format_reward": 0.37500000186264515,
"step": 254
},
{
"completion_length": 2963.145896911621,
"epoch": 0.2914285714285714,
"grad_norm": 1.1052091121673584,
"kl": 0.5419921875,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0088,
"reward": 0.11596883228048682,
"reward_std": 0.6711905375123024,
"rewards/cosine_scaled_reward": -0.20243225805461407,
"rewards/format_reward": 0.5208333469927311,
"step": 255
},
{
"completion_length": 2662.437530517578,
"epoch": 0.2925714285714286,
"grad_norm": 0.6445716619491577,
"kl": 0.4893798828125,
"learning_rate": 6.095153756157051e-07,
"loss": 0.0404,
"reward": 0.3911870224401355,
"reward_std": 0.6272533088922501,
"rewards/cosine_scaled_reward": -0.012739824131131172,
"rewards/format_reward": 0.416666679084301,
"step": 256
},
{
"completion_length": 3104.31258392334,
"epoch": 0.2937142857142857,
"grad_norm": 0.8529700040817261,
"kl": 0.52423095703125,
"learning_rate": 6.06399955103937e-07,
"loss": 0.0561,
"reward": 0.34972720965743065,
"reward_std": 0.8778971843421459,
"rewards/cosine_scaled_reward": -0.033469736110419035,
"rewards/format_reward": 0.41666667349636555,
"step": 257
},
{
"completion_length": 3088.354278564453,
"epoch": 0.2948571428571429,
"grad_norm": 1.0896036624908447,
"kl": 0.4991455078125,
"learning_rate": 6.032817857379256e-07,
"loss": 0.0909,
"reward": 0.26875314209610224,
"reward_std": 0.7737620323896408,
"rewards/cosine_scaled_reward": -0.12604010291397572,
"rewards/format_reward": 0.5208333376795053,
"step": 258
},
{
"completion_length": 2515.1459045410156,
"epoch": 0.296,
"grad_norm": 0.5050967335700989,
"kl": 0.449066162109375,
"learning_rate": 6.001610194928464e-07,
"loss": 0.0397,
"reward": 0.7024204786866903,
"reward_std": 0.7655514609068632,
"rewards/cosine_scaled_reward": -0.002956441603600979,
"rewards/format_reward": 0.7083333525806665,
"step": 259
},
{
"completion_length": 2217.250015258789,
"epoch": 0.29714285714285715,
"grad_norm": 0.4907468855381012,
"kl": 0.36773681640625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0207,
"reward": 0.9972690381109715,
"reward_std": 0.8888046778738499,
"rewards/cosine_scaled_reward": 0.18613450415432453,
"rewards/format_reward": 0.6250000018626451,
"step": 260
},
{
"completion_length": 2846.3958740234375,
"epoch": 0.29828571428571427,
"grad_norm": 0.5651382207870483,
"kl": 0.461669921875,
"learning_rate": 5.939123048916173e-07,
"loss": 0.0341,
"reward": 0.17303861770778894,
"reward_std": 0.6959999911487103,
"rewards/cosine_scaled_reward": -0.14264736231416464,
"rewards/format_reward": 0.45833334140479565,
"step": 261
},
{
"completion_length": 2430.916717529297,
"epoch": 0.29942857142857143,
"grad_norm": 0.7266518473625183,
"kl": 0.408447265625,
"learning_rate": 5.907846610890011e-07,
"loss": 0.0076,
"reward": 0.26563636353239417,
"reward_std": 0.5362856052815914,
"rewards/cosine_scaled_reward": -0.2213484961539507,
"rewards/format_reward": 0.7083333469927311,
"step": 262
},
{
"completion_length": 2708.4583740234375,
"epoch": 0.30057142857142854,
"grad_norm": 0.6144644618034363,
"kl": 0.48321533203125,
"learning_rate": 5.87655029499542e-07,
"loss": 0.0328,
"reward": 0.16829278273507953,
"reward_std": 0.7771525047719479,
"rewards/cosine_scaled_reward": -0.2179369544610381,
"rewards/format_reward": 0.6041666828095913,
"step": 263
},
{
"completion_length": 2807.541717529297,
"epoch": 0.3017142857142857,
"grad_norm": 0.6735979914665222,
"kl": 0.43017578125,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0176,
"reward": 0.5329833417199552,
"reward_std": 0.7325766794383526,
"rewards/cosine_scaled_reward": -0.0772583307698369,
"rewards/format_reward": 0.6875000111758709,
"step": 264
},
{
"completion_length": 2439.3334197998047,
"epoch": 0.3028571428571429,
"grad_norm": 1.1831626892089844,
"kl": 0.361572265625,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0469,
"reward": 0.6903195958584547,
"reward_std": 0.82445028424263,
"rewards/cosine_scaled_reward": -0.009006870910525322,
"rewards/format_reward": 0.708333345130086,
"step": 265
},
{
"completion_length": 2822.1250915527344,
"epoch": 0.304,
"grad_norm": 0.8838703036308289,
"kl": 0.39208984375,
"learning_rate": 5.78255733788191e-07,
"loss": 0.005,
"reward": 0.3137262724339962,
"reward_std": 0.7213578186929226,
"rewards/cosine_scaled_reward": -0.14522020891308784,
"rewards/format_reward": 0.6041666753590107,
"step": 266
},
{
"completion_length": 3202.937530517578,
"epoch": 0.30514285714285716,
"grad_norm": 0.977644145488739,
"kl": 0.5537109375,
"learning_rate": 5.751196772469237e-07,
"loss": 0.0299,
"reward": 0.054544554091989994,
"reward_std": 0.6957491394132376,
"rewards/cosine_scaled_reward": -0.128977719694376,
"rewards/format_reward": 0.3125000074505806,
"step": 267
},
{
"completion_length": 2406.0208740234375,
"epoch": 0.3062857142857143,
"grad_norm": 0.6916462182998657,
"kl": 0.3612060546875,
"learning_rate": 5.71982396408026e-07,
"loss": 0.031,
"reward": 0.06796193681657314,
"reward_std": 0.5696085840463638,
"rewards/cosine_scaled_reward": -0.20560237113386393,
"rewards/format_reward": 0.47916667722165585,
"step": 268
},
{
"completion_length": 2778.479217529297,
"epoch": 0.30742857142857144,
"grad_norm": 0.8052944540977478,
"kl": 0.3636474609375,
"learning_rate": 5.688440441781398e-07,
"loss": 0.0335,
"reward": 0.3834271663799882,
"reward_std": 0.865951232612133,
"rewards/cosine_scaled_reward": -0.11036975774914026,
"rewards/format_reward": 0.6041666809469461,
"step": 269
},
{
"completion_length": 2532.1458892822266,
"epoch": 0.30857142857142855,
"grad_norm": 1.3412328958511353,
"kl": 0.26837158203125,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0278,
"reward": 0.8955399505794048,
"reward_std": 0.9300484023988247,
"rewards/cosine_scaled_reward": 0.09360331390053034,
"rewards/format_reward": 0.7083333507180214,
"step": 270
},
{
"completion_length": 2416.395950317383,
"epoch": 0.3097142857142857,
"grad_norm": 1.7048732042312622,
"kl": 0.30621337890625,
"learning_rate": 5.625647374256061e-07,
"loss": 0.0786,
"reward": 0.9949164763092995,
"reward_std": 0.953071303665638,
"rewards/cosine_scaled_reward": 0.14329158514738083,
"rewards/format_reward": 0.708333345130086,
"step": 271
},
{
"completion_length": 3282.5834350585938,
"epoch": 0.31085714285714283,
"grad_norm": 0.832673966884613,
"kl": 0.585693359375,
"learning_rate": 5.594240889475106e-07,
"loss": 0.0292,
"reward": 0.19685243256390095,
"reward_std": 0.7315353937447071,
"rewards/cosine_scaled_reward": -0.0994904637336731,
"rewards/format_reward": 0.39583334140479565,
"step": 272
},
{
"completion_length": 2979.0625915527344,
"epoch": 0.312,
"grad_norm": 0.6270153522491455,
"kl": 0.535888671875,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0332,
"reward": 0.3002479658462107,
"reward_std": 0.8212124370038509,
"rewards/cosine_scaled_reward": -0.09987602103501558,
"rewards/format_reward": 0.5000000055879354,
"step": 273
},
{
"completion_length": 1942.5000457763672,
"epoch": 0.31314285714285717,
"grad_norm": 0.7007995247840881,
"kl": 0.320556640625,
"learning_rate": 5.531415671340826e-07,
"loss": -0.0018,
"reward": 1.0117733776569366,
"reward_std": 0.7953921295702457,
"rewards/cosine_scaled_reward": 0.14130336791276932,
"rewards/format_reward": 0.7291666753590107,
"step": 274
},
{
"completion_length": 2129.750045776367,
"epoch": 0.3142857142857143,
"grad_norm": 0.5516811609268188,
"kl": 0.302520751953125,
"learning_rate": 5.5e-07,
"loss": 0.0104,
"reward": 0.5132089601829648,
"reward_std": 0.9499057307839394,
"rewards/cosine_scaled_reward": 0.027437805198132992,
"rewards/format_reward": 0.45833334140479565,
"step": 275
},
{
"completion_length": 2182.4166984558105,
"epoch": 0.31542857142857145,
"grad_norm": 0.8734866380691528,
"kl": 0.3303680419921875,
"learning_rate": 5.468584328659172e-07,
"loss": 0.0497,
"reward": 0.6296617463231087,
"reward_std": 0.8634731210768223,
"rewards/cosine_scaled_reward": 0.033580862917006016,
"rewards/format_reward": 0.5625000074505806,
"step": 276
},
{
"completion_length": 2209.4792098999023,
"epoch": 0.31657142857142856,
"grad_norm": 1.0455769300460815,
"kl": 0.2725830078125,
"learning_rate": 5.437170188473847e-07,
"loss": 0.0508,
"reward": 0.6334413010627031,
"reward_std": 0.9880629740655422,
"rewards/cosine_scaled_reward": -0.016612697392702103,
"rewards/format_reward": 0.6666666772216558,
"step": 277
},
{
"completion_length": 2201.250030517578,
"epoch": 0.3177142857142857,
"grad_norm": 0.5012733340263367,
"kl": 0.374755859375,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0199,
"reward": 0.843031425960362,
"reward_std": 0.5963149815797806,
"rewards/cosine_scaled_reward": 0.04651568736881018,
"rewards/format_reward": 0.7500000167638063,
"step": 278
},
{
"completion_length": 2702.8333892822266,
"epoch": 0.31885714285714284,
"grad_norm": 1.3764408826828003,
"kl": 0.383636474609375,
"learning_rate": 5.37435262574394e-07,
"loss": -0.018,
"reward": 0.4093172252178192,
"reward_std": 0.7577077932655811,
"rewards/cosine_scaled_reward": -0.0765913873910904,
"rewards/format_reward": 0.5625000055879354,
"step": 279
},
{
"completion_length": 2229.104232788086,
"epoch": 0.32,
"grad_norm": 1.4205951690673828,
"kl": 0.393707275390625,
"learning_rate": 5.342952264838747e-07,
"loss": -0.0021,
"reward": 1.024111781269312,
"reward_std": 1.1223380044102669,
"rewards/cosine_scaled_reward": 0.12663922272622585,
"rewards/format_reward": 0.7708333469927311,
"step": 280
},
{
"completion_length": 3130.2084045410156,
"epoch": 0.3211428571428571,
"grad_norm": 0.6159874200820923,
"kl": 0.49462890625,
"learning_rate": 5.311559558218603e-07,
"loss": 0.0566,
"reward": -0.1348068374209106,
"reward_std": 0.7328717596828938,
"rewards/cosine_scaled_reward": -0.2132367566227913,
"rewards/format_reward": 0.2916666753590107,
"step": 281
},
{
"completion_length": 2341.541717529297,
"epoch": 0.3222857142857143,
"grad_norm": 1.7661807537078857,
"kl": 0.31353759765625,
"learning_rate": 5.28017603591974e-07,
"loss": -0.0445,
"reward": 0.6573122011031955,
"reward_std": 0.7708069607615471,
"rewards/cosine_scaled_reward": 0.04740610299631953,
"rewards/format_reward": 0.5625000111758709,
"step": 282
},
{
"completion_length": 2914.729217529297,
"epoch": 0.32342857142857145,
"grad_norm": 1.0965200662612915,
"kl": 0.47216796875,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0109,
"reward": 0.5034051882103086,
"reward_std": 0.7967246808111668,
"rewards/cosine_scaled_reward": 0.012119237333536148,
"rewards/format_reward": 0.479166679084301,
"step": 283
},
{
"completion_length": 2276.291732788086,
"epoch": 0.32457142857142857,
"grad_norm": 0.4287702441215515,
"kl": 0.3963623046875,
"learning_rate": 5.21744266211809e-07,
"loss": 0.0495,
"reward": 0.3963469974696636,
"reward_std": 0.6148848831653595,
"rewards/cosine_scaled_reward": -0.12474317848682404,
"rewards/format_reward": 0.6458333414047956,
"step": 284
},
{
"completion_length": 2141.0833740234375,
"epoch": 0.32571428571428573,
"grad_norm": 0.2775723934173584,
"kl": 0.219696044921875,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0093,
"reward": 0.5369033999741077,
"reward_std": 0.5743259247392416,
"rewards/cosine_scaled_reward": -0.11696498841047287,
"rewards/format_reward": 0.7708333395421505,
"step": 285
},
{
"completion_length": 2264.7709350585938,
"epoch": 0.32685714285714285,
"grad_norm": 2.459235668182373,
"kl": 0.26708984375,
"learning_rate": 5.154764373429315e-07,
"loss": 0.1085,
"reward": 0.6158911837264895,
"reward_std": 1.0042807385325432,
"rewards/cosine_scaled_reward": 0.01627892069518566,
"rewards/format_reward": 0.583333345130086,
"step": 286
},
{
"completion_length": 1770.3750343322754,
"epoch": 0.328,
"grad_norm": 0.3129092752933502,
"kl": 0.2149658203125,
"learning_rate": 5.123449705004581e-07,
"loss": 0.0042,
"reward": 0.5954279694706202,
"reward_std": 0.46624701749533415,
"rewards/cosine_scaled_reward": -0.05645267991349101,
"rewards/format_reward": 0.7083333469927311,
"step": 287
},
{
"completion_length": 2374.270881652832,
"epoch": 0.3291428571428571,
"grad_norm": 0.4308589696884155,
"kl": 0.286712646484375,
"learning_rate": 5.09215338910999e-07,
"loss": 0.0262,
"reward": 0.4366315193474293,
"reward_std": 0.8237803354859352,
"rewards/cosine_scaled_reward": -0.052517580799758434,
"rewards/format_reward": 0.5416666828095913,
"step": 288
},
{
"completion_length": 2094.7292289733887,
"epoch": 0.3302857142857143,
"grad_norm": 0.5887665748596191,
"kl": 0.2393341064453125,
"learning_rate": 5.060876951083828e-07,
"loss": 0.0238,
"reward": 0.634770268574357,
"reward_std": 0.559820894151926,
"rewards/cosine_scaled_reward": -0.047198209911584854,
"rewards/format_reward": 0.7291666716337204,
"step": 289
},
{
"completion_length": 2255.979263305664,
"epoch": 0.3314285714285714,
"grad_norm": 0.6363411545753479,
"kl": 0.2767333984375,
"learning_rate": 5.02962191529556e-07,
"loss": -0.0051,
"reward": 0.5892711323685944,
"reward_std": 0.595702001824975,
"rewards/cosine_scaled_reward": -0.09078110568225384,
"rewards/format_reward": 0.7708333414047956,
"step": 290
},
{
"completion_length": 2604.604263305664,
"epoch": 0.3325714285714286,
"grad_norm": 0.6978944540023804,
"kl": 0.255279541015625,
"learning_rate": 4.998389805071536e-07,
"loss": 0.0281,
"reward": 0.49167120084166527,
"reward_std": 0.848762433975935,
"rewards/cosine_scaled_reward": -0.1187477596104145,
"rewards/format_reward": 0.7291666846722364,
"step": 291
},
{
"completion_length": 2789.7500610351562,
"epoch": 0.33371428571428574,
"grad_norm": 0.8494940400123596,
"kl": 0.31634521484375,
"learning_rate": 4.967182142620745e-07,
"loss": -0.003,
"reward": 0.5868554009357467,
"reward_std": 0.5472784880548716,
"rewards/cosine_scaled_reward": -0.08157230284996331,
"rewards/format_reward": 0.7500000055879354,
"step": 292
},
{
"completion_length": 2231.8333740234375,
"epoch": 0.33485714285714285,
"grad_norm": 0.8752176761627197,
"kl": 0.223968505859375,
"learning_rate": 4.93600044896063e-07,
"loss": -0.0123,
"reward": 0.7991173285990953,
"reward_std": 0.4973907843232155,
"rewards/cosine_scaled_reward": -0.006691355258226395,
"rewards/format_reward": 0.8125000149011612,
"step": 293
},
{
"completion_length": 2741.604232788086,
"epoch": 0.336,
"grad_norm": 0.7732493877410889,
"kl": 0.2772216796875,
"learning_rate": 4.904846243842949e-07,
"loss": 0.0615,
"reward": 0.5131175501737744,
"reward_std": 0.6472405269742012,
"rewards/cosine_scaled_reward": 0.01697544753551483,
"rewards/format_reward": 0.47916666977107525,
"step": 294
},
{
"completion_length": 2742.9166870117188,
"epoch": 0.33714285714285713,
"grad_norm": 0.7361288666725159,
"kl": 0.2744140625,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0561,
"reward": 0.6931683626025915,
"reward_std": 0.7358212843537331,
"rewards/cosine_scaled_reward": 0.044500820338726044,
"rewards/format_reward": 0.604166679084301,
"step": 295
},
{
"completion_length": 3106.4584350585938,
"epoch": 0.3382857142857143,
"grad_norm": 0.4049779176712036,
"kl": 0.263427734375,
"learning_rate": 4.842626371469149e-07,
"loss": 0.0199,
"reward": 0.3909061732701957,
"reward_std": 0.6087947525084019,
"rewards/cosine_scaled_reward": -0.13788025826215744,
"rewards/format_reward": 0.6666666902601719,
"step": 296
},
{
"completion_length": 3155.0208892822266,
"epoch": 0.3394285714285714,
"grad_norm": 0.36298710107803345,
"kl": 0.312744140625,
"learning_rate": 4.811563736721829e-07,
"loss": 0.0074,
"reward": 0.08901350852102041,
"reward_std": 0.6059992350637913,
"rewards/cosine_scaled_reward": -0.13257658202201128,
"rewards/format_reward": 0.35416667722165585,
"step": 297
},
{
"completion_length": 2160.562545776367,
"epoch": 0.3405714285714286,
"grad_norm": 0.6580337882041931,
"kl": 0.15863037109375,
"learning_rate": 4.780534655386743e-07,
"loss": 0.0368,
"reward": 0.4065987435169518,
"reward_std": 0.6639986764639616,
"rewards/cosine_scaled_reward": -0.06753396429121494,
"rewards/format_reward": 0.5416666753590107,
"step": 298
},
{
"completion_length": 2909.5208587646484,
"epoch": 0.3417142857142857,
"grad_norm": 0.5796670913696289,
"kl": 0.21197509765625,
"learning_rate": 4.749540639777539e-07,
"loss": 0.0596,
"reward": 0.6528121263254434,
"reward_std": 0.7620734348893166,
"rewards/cosine_scaled_reward": 0.01390604767948389,
"rewards/format_reward": 0.6250000111758709,
"step": 299
},
{
"completion_length": 2671.916748046875,
"epoch": 0.34285714285714286,
"grad_norm": 0.509225606918335,
"kl": 0.208465576171875,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.006,
"reward": 0.49726424319669604,
"reward_std": 0.5585810020565987,
"rewards/cosine_scaled_reward": -0.053451212123036385,
"rewards/format_reward": 0.604166679084301,
"step": 300
},
{
"completion_length": 2402.916748046875,
"epoch": 0.344,
"grad_norm": 0.4044334292411804,
"kl": 0.20062255859375,
"learning_rate": 4.68766384637248e-07,
"loss": 0.0287,
"reward": 0.5725528532639146,
"reward_std": 0.642670027911663,
"rewards/cosine_scaled_reward": -0.047056916169822216,
"rewards/format_reward": 0.6666666697710752,
"step": 301
},
{
"completion_length": 2237.020896911621,
"epoch": 0.34514285714285714,
"grad_norm": 1.1649314165115356,
"kl": 0.206268310546875,
"learning_rate": 4.656784084364238e-07,
"loss": 0.0401,
"reward": 0.5389626898686402,
"reward_std": 1.0156296789646149,
"rewards/cosine_scaled_reward": 0.019481339491903782,
"rewards/format_reward": 0.5000000111758709,
"step": 302
},
{
"completion_length": 2634.166748046875,
"epoch": 0.3462857142857143,
"grad_norm": 0.5552562475204468,
"kl": 0.272735595703125,
"learning_rate": 4.6259454195101267e-07,
"loss": 0.0404,
"reward": 0.4783714674413204,
"reward_std": 0.7024741154164076,
"rewards/cosine_scaled_reward": -0.14623095095157623,
"rewards/format_reward": 0.7708333469927311,
"step": 303
},
{
"completion_length": 2843.104263305664,
"epoch": 0.3474285714285714,
"grad_norm": 0.3334513306617737,
"kl": 0.30706787109375,
"learning_rate": 4.59514935484316e-07,
"loss": 0.0326,
"reward": 0.3916517546167597,
"reward_std": 0.644689092412591,
"rewards/cosine_scaled_reward": -0.14792412985116243,
"rewards/format_reward": 0.6875000167638063,
"step": 304
},
{
"completion_length": 3143.8125610351562,
"epoch": 0.3485714285714286,
"grad_norm": 0.46253135800361633,
"kl": 0.371826171875,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0503,
"reward": 0.2803553529083729,
"reward_std": 0.6069540809839964,
"rewards/cosine_scaled_reward": -0.1619056654162705,
"rewards/format_reward": 0.604166679084301,
"step": 305
},
{
"completion_length": 2349.5833892822266,
"epoch": 0.3497142857142857,
"grad_norm": 0.9043461680412292,
"kl": 0.2369384765625,
"learning_rate": 4.5336910277482155e-07,
"loss": 0.0383,
"reward": 1.2639683187007904,
"reward_std": 0.8216119185090065,
"rewards/cosine_scaled_reward": 0.21531746815890074,
"rewards/format_reward": 0.8333333507180214,
"step": 306
},
{
"completion_length": 2729.500045776367,
"epoch": 0.35085714285714287,
"grad_norm": 0.7317904829978943,
"kl": 0.35693359375,
"learning_rate": 4.503031760712397e-07,
"loss": -0.0004,
"reward": 0.35641809552907944,
"reward_std": 0.8619059510529041,
"rewards/cosine_scaled_reward": -0.07179095968604088,
"rewards/format_reward": 0.5000000074505806,
"step": 307
},
{
"completion_length": 3229.0209045410156,
"epoch": 0.352,
"grad_norm": 0.7616815567016602,
"kl": 0.499267578125,
"learning_rate": 4.4724210845020494e-07,
"loss": 0.0444,
"reward": 0.3788898056373,
"reward_std": 0.7400763519108295,
"rewards/cosine_scaled_reward": -0.12305509229190648,
"rewards/format_reward": 0.6250000111758709,
"step": 308
},
{
"completion_length": 2903.291748046875,
"epoch": 0.35314285714285715,
"grad_norm": 1.0771965980529785,
"kl": 0.3709716796875,
"learning_rate": 4.441860491038345e-07,
"loss": 0.0667,
"reward": 0.2175411656498909,
"reward_std": 0.7625751979649067,
"rewards/cosine_scaled_reward": -0.17247941810637712,
"rewards/format_reward": 0.5625000167638063,
"step": 309
},
{
"completion_length": 2517.5209045410156,
"epoch": 0.35428571428571426,
"grad_norm": 0.8801992535591125,
"kl": 0.4547119140625,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0203,
"reward": 0.3958463743329048,
"reward_std": 0.7051512375473976,
"rewards/cosine_scaled_reward": -0.13541015330702066,
"rewards/format_reward": 0.6666666753590107,
"step": 310
},
{
"completion_length": 2367.395881652832,
"epoch": 0.3554285714285714,
"grad_norm": 0.4860677421092987,
"kl": 0.287261962890625,
"learning_rate": 4.3808955077581546e-07,
"loss": 0.0301,
"reward": 0.8140461907387362,
"reward_std": 0.729220163077116,
"rewards/cosine_scaled_reward": 0.032023081090301275,
"rewards/format_reward": 0.7500000074505806,
"step": 311
},
{
"completion_length": 2170.0000381469727,
"epoch": 0.3565714285714286,
"grad_norm": 1.8704103231430054,
"kl": 0.35418701171875,
"learning_rate": 4.350494089288943e-07,
"loss": -0.0452,
"reward": 0.9425890631973743,
"reward_std": 0.5832888670265675,
"rewards/cosine_scaled_reward": 0.13796119578182697,
"rewards/format_reward": 0.6666666697710752,
"step": 312
},
{
"completion_length": 2891.3334045410156,
"epoch": 0.3577142857142857,
"grad_norm": 0.5072900652885437,
"kl": 0.40447998046875,
"learning_rate": 4.3201486961161093e-07,
"loss": 0.0527,
"reward": 0.9016526590567082,
"reward_std": 0.901871845126152,
"rewards/cosine_scaled_reward": 0.10707633011043072,
"rewards/format_reward": 0.6875000149011612,
"step": 313
},
{
"completion_length": 2322.729248046875,
"epoch": 0.3588571428571429,
"grad_norm": 0.8013210296630859,
"kl": 0.294403076171875,
"learning_rate": 4.2898608072313045e-07,
"loss": -0.0126,
"reward": 1.0439924612874165,
"reward_std": 0.8895706832408905,
"rewards/cosine_scaled_reward": 0.16782954335212708,
"rewards/format_reward": 0.7083333525806665,
"step": 314
},
{
"completion_length": 2793.3333892822266,
"epoch": 0.36,
"grad_norm": 1.2426762580871582,
"kl": 0.43634033203125,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.0831,
"reward": 0.48491090000607073,
"reward_std": 0.70790103264153,
"rewards/cosine_scaled_reward": -0.049211218021810055,
"rewards/format_reward": 0.5833333395421505,
"step": 315
},
{
"completion_length": 3277.3750915527344,
"epoch": 0.36114285714285715,
"grad_norm": 1.8028209209442139,
"kl": 0.505859375,
"learning_rate": 4.2294634442070553e-07,
"loss": -0.0052,
"reward": 0.30412304773926735,
"reward_std": 0.6635380201041698,
"rewards/cosine_scaled_reward": -0.19168847613036633,
"rewards/format_reward": 0.6875000149011612,
"step": 316
},
{
"completion_length": 2602.041717529297,
"epoch": 0.36228571428571427,
"grad_norm": 1.0472040176391602,
"kl": 0.38592529296875,
"learning_rate": 4.1993569137498776e-07,
"loss": 0.0476,
"reward": 0.5747585827484727,
"reward_std": 0.9246655404567719,
"rewards/cosine_scaled_reward": -0.004287379328161478,
"rewards/format_reward": 0.5833333432674408,
"step": 317
},
{
"completion_length": 2433.229232788086,
"epoch": 0.36342857142857143,
"grad_norm": 0.5204926133155823,
"kl": 0.3592529296875,
"learning_rate": 4.1693137748017915e-07,
"loss": 0.0436,
"reward": 0.6435124352574348,
"reward_std": 0.7634010724723339,
"rewards/cosine_scaled_reward": -0.08449378702789545,
"rewards/format_reward": 0.8125000111758709,
"step": 318
},
{
"completion_length": 2784.708396911621,
"epoch": 0.36457142857142855,
"grad_norm": 0.5020449161529541,
"kl": 0.378021240234375,
"learning_rate": 4.1393354916230005e-07,
"loss": 0.0635,
"reward": 0.4931874219328165,
"reward_std": 0.7557184733450413,
"rewards/cosine_scaled_reward": -0.149239641148597,
"rewards/format_reward": 0.7916666846722364,
"step": 319
},
{
"completion_length": 2341.791717529297,
"epoch": 0.3657142857142857,
"grad_norm": 1.0193767547607422,
"kl": 0.27911376953125,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0637,
"reward": 0.6816958046983927,
"reward_std": 0.7552312985062599,
"rewards/cosine_scaled_reward": -0.08623544871807098,
"rewards/format_reward": 0.854166679084301,
"step": 320
},
{
"completion_length": 2105.729202270508,
"epoch": 0.3668571428571429,
"grad_norm": 0.3755457103252411,
"kl": 0.26214599609375,
"learning_rate": 4.079579333738039e-07,
"loss": 0.0043,
"reward": 1.0699648894369602,
"reward_std": 0.8030014112591743,
"rewards/cosine_scaled_reward": 0.13914906978607178,
"rewards/format_reward": 0.7916666902601719,
"step": 321
},
{
"completion_length": 2367.6875610351562,
"epoch": 0.368,
"grad_norm": 0.4942283034324646,
"kl": 0.30426025390625,
"learning_rate": 4.0498043714627006e-07,
"loss": 0.0148,
"reward": 0.7899385392665863,
"reward_std": 0.743595227599144,
"rewards/cosine_scaled_reward": -0.03211408853530884,
"rewards/format_reward": 0.854166679084301,
"step": 322
},
{
"completion_length": 2852.3958892822266,
"epoch": 0.36914285714285716,
"grad_norm": 1.1540416479110718,
"kl": 0.41192626953125,
"learning_rate": 4.020100089676376e-07,
"loss": 0.0112,
"reward": 0.5696731898933649,
"reward_std": 0.6305563114583492,
"rewards/cosine_scaled_reward": -0.09016341622918844,
"rewards/format_reward": 0.750000013038516,
"step": 323
},
{
"completion_length": 3199.541748046875,
"epoch": 0.3702857142857143,
"grad_norm": 0.42631351947784424,
"kl": 0.4495849609375,
"learning_rate": 3.9904679361238526e-07,
"loss": 0.0534,
"reward": 0.3698822483420372,
"reward_std": 0.7412667665630579,
"rewards/cosine_scaled_reward": -0.09630887286039069,
"rewards/format_reward": 0.5625000111758709,
"step": 324
},
{
"completion_length": 3050.104217529297,
"epoch": 0.37142857142857144,
"grad_norm": 0.44686606526374817,
"kl": 0.448394775390625,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0349,
"reward": 0.28104627318680286,
"reward_std": 0.8294301778078079,
"rewards/cosine_scaled_reward": -0.09906021226197481,
"rewards/format_reward": 0.479166679084301,
"step": 325
},
{
"completion_length": 2556.916732788086,
"epoch": 0.37257142857142855,
"grad_norm": 0.713642954826355,
"kl": 0.252593994140625,
"learning_rate": 3.931425787051832e-07,
"loss": 0.0419,
"reward": 0.625084163621068,
"reward_std": 0.7249293215572834,
"rewards/cosine_scaled_reward": -0.09370792843401432,
"rewards/format_reward": 0.8125000074505806,
"step": 326
},
{
"completion_length": 2512.229232788086,
"epoch": 0.3737142857142857,
"grad_norm": 0.6753360033035278,
"kl": 0.29766845703125,
"learning_rate": 3.902018669163384e-07,
"loss": 0.0522,
"reward": 0.9986484311521053,
"reward_std": 0.7536502368748188,
"rewards/cosine_scaled_reward": 0.08265753649175167,
"rewards/format_reward": 0.8333333488553762,
"step": 327
},
{
"completion_length": 3129.541717529297,
"epoch": 0.37485714285714283,
"grad_norm": 0.6392536163330078,
"kl": 0.433837890625,
"learning_rate": 3.872689434630585e-07,
"loss": 0.0176,
"reward": 0.41261669318191707,
"reward_std": 0.6660363562405109,
"rewards/cosine_scaled_reward": -0.08535831584595144,
"rewards/format_reward": 0.5833333432674408,
"step": 328
},
{
"completion_length": 2198.8542404174805,
"epoch": 0.376,
"grad_norm": 0.7879998087882996,
"kl": 0.2938232421875,
"learning_rate": 3.843439512918949e-07,
"loss": 0.0376,
"reward": 0.8884001635015011,
"reward_std": 0.8436827287077904,
"rewards/cosine_scaled_reward": 0.06920007988810539,
"rewards/format_reward": 0.7500000335276127,
"step": 329
},
{
"completion_length": 2306.812530517578,
"epoch": 0.37714285714285717,
"grad_norm": 0.3462686538696289,
"kl": 0.298553466796875,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0274,
"reward": 0.38892703130841255,
"reward_std": 0.7339756563305855,
"rewards/cosine_scaled_reward": -0.18053649738430977,
"rewards/format_reward": 0.750000013038516,
"step": 330
},
{
"completion_length": 2696.229248046875,
"epoch": 0.3782857142857143,
"grad_norm": 0.5001699924468994,
"kl": 0.39886474609375,
"learning_rate": 3.785183306423767e-07,
"loss": 0.0252,
"reward": 0.17869803123176098,
"reward_std": 0.6411689594388008,
"rewards/cosine_scaled_reward": -0.19190098624676466,
"rewards/format_reward": 0.5625000111758709,
"step": 331
},
{
"completion_length": 2362.645896911621,
"epoch": 0.37942857142857145,
"grad_norm": 0.9171366691589355,
"kl": 0.287750244140625,
"learning_rate": 3.7561798609655373e-07,
"loss": -0.0042,
"reward": 0.7273948602378368,
"reward_std": 0.644059307873249,
"rewards/cosine_scaled_reward": -0.06338590569794178,
"rewards/format_reward": 0.8541666828095913,
"step": 332
},
{
"completion_length": 2217.7500610351562,
"epoch": 0.38057142857142856,
"grad_norm": 0.4733556807041168,
"kl": 0.19439697265625,
"learning_rate": 3.72726140684072e-07,
"loss": 0.0278,
"reward": 0.6465493626892567,
"reward_std": 0.7646395452320576,
"rewards/cosine_scaled_reward": -0.11422532517462969,
"rewards/format_reward": 0.8750000111758709,
"step": 333
},
{
"completion_length": 2848.5208740234375,
"epoch": 0.38171428571428573,
"grad_norm": 0.6546262502670288,
"kl": 0.42724609375,
"learning_rate": 3.6984293534939737e-07,
"loss": 0.0575,
"reward": 0.2325472510419786,
"reward_std": 0.8501592725515366,
"rewards/cosine_scaled_reward": -0.19622638076543808,
"rewards/format_reward": 0.6250000204890966,
"step": 334
},
{
"completion_length": 2426.750068664551,
"epoch": 0.38285714285714284,
"grad_norm": 0.459773987531662,
"kl": 0.3013916015625,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.002,
"reward": 0.7257073321379721,
"reward_std": 0.8327980078756809,
"rewards/cosine_scaled_reward": 0.01910366490483284,
"rewards/format_reward": 0.6875000149011612,
"step": 335
},
{
"completion_length": 2826.104232788086,
"epoch": 0.384,
"grad_norm": 0.9565772414207458,
"kl": 0.369140625,
"learning_rate": 3.641030065789562e-07,
"loss": 0.0762,
"reward": 0.6304492875933647,
"reward_std": 0.9015435054898262,
"rewards/cosine_scaled_reward": -0.0076920222491025925,
"rewards/format_reward": 0.6458333600312471,
"step": 336
},
{
"completion_length": 2443.3958587646484,
"epoch": 0.3851428571428571,
"grad_norm": 0.9018928408622742,
"kl": 0.2510223388671875,
"learning_rate": 3.612465628992203e-07,
"loss": 0.0516,
"reward": 0.4559049401432276,
"reward_std": 0.6961762681603432,
"rewards/cosine_scaled_reward": -0.18871420342475176,
"rewards/format_reward": 0.833333358168602,
"step": 337
},
{
"completion_length": 2281.166717529297,
"epoch": 0.3862857142857143,
"grad_norm": 0.41089001297950745,
"kl": 0.32318115234375,
"learning_rate": 3.5839931879571725e-07,
"loss": 0.0283,
"reward": 0.664438179373974,
"reward_std": 0.6754178777337074,
"rewards/cosine_scaled_reward": -0.04278091713786125,
"rewards/format_reward": 0.750000013038516,
"step": 338
},
{
"completion_length": 2934.458427429199,
"epoch": 0.38742857142857146,
"grad_norm": 0.45900437235832214,
"kl": 0.371612548828125,
"learning_rate": 3.555614130391079e-07,
"loss": 0.0265,
"reward": 0.5773372187613859,
"reward_std": 0.5042719468474388,
"rewards/cosine_scaled_reward": -0.0759147321805358,
"rewards/format_reward": 0.729166679084301,
"step": 339
},
{
"completion_length": 2755.4375610351562,
"epoch": 0.38857142857142857,
"grad_norm": 0.4454108476638794,
"kl": 0.3587646484375,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0325,
"reward": 0.4514606408774853,
"reward_std": 0.6363845467567444,
"rewards/cosine_scaled_reward": -0.1805196925997734,
"rewards/format_reward": 0.8125000149011612,
"step": 340
},
{
"completion_length": 2544.6250610351562,
"epoch": 0.38971428571428574,
"grad_norm": 1.1304974555969238,
"kl": 0.305999755859375,
"learning_rate": 3.4991416936678276e-07,
"loss": 0.0396,
"reward": 1.0055190767161548,
"reward_std": 0.7852493021637201,
"rewards/cosine_scaled_reward": 0.14859285950660706,
"rewards/format_reward": 0.7083333469927311,
"step": 341
},
{
"completion_length": 2673.812530517578,
"epoch": 0.39085714285714285,
"grad_norm": 1.1692603826522827,
"kl": 0.439208984375,
"learning_rate": 3.471051066897562e-07,
"loss": 0.0718,
"reward": 0.4766305387020111,
"reward_std": 0.8266436979174614,
"rewards/cosine_scaled_reward": -0.11585140600800514,
"rewards/format_reward": 0.7083333525806665,
"step": 342
},
{
"completion_length": 3054.729217529297,
"epoch": 0.392,
"grad_norm": 0.7233371138572693,
"kl": 0.43121337890625,
"learning_rate": 3.4430593282358777e-07,
"loss": 0.0223,
"reward": 0.8223184086382389,
"reward_std": 0.8432090878486633,
"rewards/cosine_scaled_reward": 0.09865919034928083,
"rewards/format_reward": 0.6250000055879354,
"step": 343
},
{
"completion_length": 2295.4584350585938,
"epoch": 0.3931428571428571,
"grad_norm": 0.29200509190559387,
"kl": 0.244537353515625,
"learning_rate": 3.4151678419606233e-07,
"loss": 0.0264,
"reward": 1.348364820703864,
"reward_std": 0.6582886949181557,
"rewards/cosine_scaled_reward": 0.20543239824473858,
"rewards/format_reward": 0.9375000074505806,
"step": 344
},
{
"completion_length": 2679.2084197998047,
"epoch": 0.3942857142857143,
"grad_norm": 0.8350253105163574,
"kl": 0.42327880859375,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0278,
"reward": 1.0699175857007504,
"reward_std": 0.5837405696511269,
"rewards/cosine_scaled_reward": 0.12870877236127853,
"rewards/format_reward": 0.8125000111758709,
"step": 345
},
{
"completion_length": 2560.666748046875,
"epoch": 0.3954285714285714,
"grad_norm": 0.6276114583015442,
"kl": 0.397705078125,
"learning_rate": 3.359691059183761e-07,
"loss": 0.0694,
"reward": 0.46554950438439846,
"reward_std": 0.5874128863215446,
"rewards/cosine_scaled_reward": -0.17347524780780077,
"rewards/format_reward": 0.8125000186264515,
"step": 346
},
{
"completion_length": 2896.354278564453,
"epoch": 0.3965714285714286,
"grad_norm": 1.1534738540649414,
"kl": 0.4324951171875,
"learning_rate": 3.3321084665422803e-07,
"loss": 0.0784,
"reward": 0.5726351104676723,
"reward_std": 0.8148692063987255,
"rewards/cosine_scaled_reward": -0.09909912198781967,
"rewards/format_reward": 0.7708333507180214,
"step": 347
},
{
"completion_length": 2368.6251068115234,
"epoch": 0.3977142857142857,
"grad_norm": 0.5168960094451904,
"kl": 0.35772705078125,
"learning_rate": 3.3046315338757026e-07,
"loss": 0.0494,
"reward": 0.7037911647930741,
"reward_std": 0.7166893742978573,
"rewards/cosine_scaled_reward": -0.07518776506185532,
"rewards/format_reward": 0.8541666753590107,
"step": 348
},
{
"completion_length": 2686.5834045410156,
"epoch": 0.39885714285714285,
"grad_norm": 0.9646963477134705,
"kl": 0.477783203125,
"learning_rate": 3.2772616003709616e-07,
"loss": 0.07,
"reward": 0.7249975223094225,
"reward_std": 0.8503868244588375,
"rewards/cosine_scaled_reward": -0.07500123046338558,
"rewards/format_reward": 0.8750000074505806,
"step": 349
},
{
"completion_length": 2476.979202270508,
"epoch": 0.4,
"grad_norm": 1.1782015562057495,
"kl": 0.65362548828125,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0431,
"reward": 0.44952827505767345,
"reward_std": 0.6957900896668434,
"rewards/cosine_scaled_reward": -0.1398192130291136,
"rewards/format_reward": 0.7291666902601719,
"step": 350
},
{
"completion_length": 2507.9375762939453,
"epoch": 0.40114285714285713,
"grad_norm": 0.49656009674072266,
"kl": 0.3612060546875,
"learning_rate": 3.222848061454764e-07,
"loss": 0.0399,
"reward": 0.9078153409063816,
"reward_std": 0.8314991928637028,
"rewards/cosine_scaled_reward": 0.005991012789309025,
"rewards/format_reward": 0.8958333432674408,
"step": 351
},
{
"completion_length": 2370.604217529297,
"epoch": 0.4022857142857143,
"grad_norm": 1.5748661756515503,
"kl": 0.4361572265625,
"learning_rate": 3.195807108082429e-07,
"loss": 0.0125,
"reward": 0.6327532059513032,
"reward_std": 0.7839572783559561,
"rewards/cosine_scaled_reward": 0.01429326320067048,
"rewards/format_reward": 0.604166679084301,
"step": 352
},
{
"completion_length": 1954.3125686645508,
"epoch": 0.4034285714285714,
"grad_norm": 0.35592421889305115,
"kl": 0.27490234375,
"learning_rate": 3.168878457820915e-07,
"loss": 0.0218,
"reward": 0.941289596259594,
"reward_std": 0.8161356300115585,
"rewards/cosine_scaled_reward": 0.04356144741177559,
"rewards/format_reward": 0.8541666865348816,
"step": 353
},
{
"completion_length": 2080.3959045410156,
"epoch": 0.4045714285714286,
"grad_norm": 0.3516908288002014,
"kl": 0.274932861328125,
"learning_rate": 3.142063423134644e-07,
"loss": 0.0192,
"reward": 0.9610122065059841,
"reward_std": 0.6026036366820335,
"rewards/cosine_scaled_reward": 0.043006100691854954,
"rewards/format_reward": 0.8750000149011612,
"step": 354
},
{
"completion_length": 2038.5209197998047,
"epoch": 0.4057142857142857,
"grad_norm": 0.6201685667037964,
"kl": 0.34228515625,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0472,
"reward": 0.974739572731778,
"reward_std": 0.9850451275706291,
"rewards/cosine_scaled_reward": 0.11236979370005429,
"rewards/format_reward": 0.7500000093132257,
"step": 355
},
{
"completion_length": 2371.541748046875,
"epoch": 0.40685714285714286,
"grad_norm": 0.4849975109100342,
"kl": 0.395751953125,
"learning_rate": 3.0887794225945143e-07,
"loss": 0.0326,
"reward": 0.623257277533412,
"reward_std": 0.7509507350623608,
"rewards/cosine_scaled_reward": -0.0946213798597455,
"rewards/format_reward": 0.8125000111758709,
"step": 356
},
{
"completion_length": 2560.7709045410156,
"epoch": 0.408,
"grad_norm": 0.7741249799728394,
"kl": 0.390869140625,
"learning_rate": 3.062313053727671e-07,
"loss": 0.012,
"reward": 0.5958885028958321,
"reward_std": 0.7125682160258293,
"rewards/cosine_scaled_reward": -0.11872242018580437,
"rewards/format_reward": 0.8333333507180214,
"step": 357
},
{
"completion_length": 2100.729202270508,
"epoch": 0.40914285714285714,
"grad_norm": 2.192291021347046,
"kl": 0.4425048828125,
"learning_rate": 3.0359654942835247e-07,
"loss": 0.1018,
"reward": 0.8158981092274189,
"reward_std": 0.7913046702742577,
"rewards/cosine_scaled_reward": 0.04336569644510746,
"rewards/format_reward": 0.7291666828095913,
"step": 358
},
{
"completion_length": 2156.3333892822266,
"epoch": 0.4102857142857143,
"grad_norm": 0.9902085065841675,
"kl": 0.37115478515625,
"learning_rate": 3.0097380284049523e-07,
"loss": 0.0701,
"reward": 0.6149628674611449,
"reward_std": 0.7031097002327442,
"rewards/cosine_scaled_reward": -0.10918523697182536,
"rewards/format_reward": 0.8333333507180214,
"step": 359
},
{
"completion_length": 2550.729232788086,
"epoch": 0.4114285714285714,
"grad_norm": 0.8987974524497986,
"kl": 0.4498291015625,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0107,
"reward": 0.7415556833148003,
"reward_std": 0.8184454329311848,
"rewards/cosine_scaled_reward": -0.05630549229681492,
"rewards/format_reward": 0.8541666939854622,
"step": 360
},
{
"completion_length": 2538.6875610351562,
"epoch": 0.4125714285714286,
"grad_norm": 1.1241296529769897,
"kl": 0.4794921875,
"learning_rate": 2.9576484845877793e-07,
"loss": 0.0314,
"reward": 0.8051093611866236,
"reward_std": 0.836889635771513,
"rewards/cosine_scaled_reward": -0.014111996628344059,
"rewards/format_reward": 0.8333333507180214,
"step": 361
},
{
"completion_length": 1696.6042442321777,
"epoch": 0.4137142857142857,
"grad_norm": 0.36879751086235046,
"kl": 0.3428955078125,
"learning_rate": 2.931788945420058e-07,
"loss": 0.0423,
"reward": 0.807725053280592,
"reward_std": 0.5215695351362228,
"rewards/cosine_scaled_reward": -0.033637505024671555,
"rewards/format_reward": 0.8750000149011612,
"step": 362
},
{
"completion_length": 1807.4792098999023,
"epoch": 0.41485714285714287,
"grad_norm": 0.5585716366767883,
"kl": 0.396026611328125,
"learning_rate": 2.9060545772359305e-07,
"loss": 0.0461,
"reward": 0.49642418074654415,
"reward_std": 0.7605064287781715,
"rewards/cosine_scaled_reward": -0.09553791396319866,
"rewards/format_reward": 0.687500013038516,
"step": 363
},
{
"completion_length": 2447.354217529297,
"epoch": 0.416,
"grad_norm": 0.512050211429596,
"kl": 0.5145263671875,
"learning_rate": 2.8804466342921987e-07,
"loss": 0.0591,
"reward": 0.1905357912182808,
"reward_std": 0.6694304198026657,
"rewards/cosine_scaled_reward": -0.2588987797498703,
"rewards/format_reward": 0.7083333544433117,
"step": 364
},
{
"completion_length": 2767.666778564453,
"epoch": 0.41714285714285715,
"grad_norm": 1.2084037065505981,
"kl": 0.5614013671875,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0845,
"reward": 0.4251478109508753,
"reward_std": 0.8055129833519459,
"rewards/cosine_scaled_reward": -0.07909277267754078,
"rewards/format_reward": 0.583333345130086,
"step": 365
},
{
"completion_length": 1953.7500305175781,
"epoch": 0.41828571428571426,
"grad_norm": 1.8075039386749268,
"kl": 0.295562744140625,
"learning_rate": 2.829615010283344e-07,
"loss": 0.0725,
"reward": 1.0165046447655186,
"reward_std": 0.8075561951845884,
"rewards/cosine_scaled_reward": 0.09158563800156116,
"rewards/format_reward": 0.8333333432674408,
"step": 366
},
{
"completion_length": 2626.354263305664,
"epoch": 0.41942857142857143,
"grad_norm": 1.0496845245361328,
"kl": 0.48699951171875,
"learning_rate": 2.8043938066798645e-07,
"loss": 0.0661,
"reward": 0.7226119879633188,
"reward_std": 0.9606124758720398,
"rewards/cosine_scaled_reward": 0.02797265024855733,
"rewards/format_reward": 0.666666692122817,
"step": 367
},
{
"completion_length": 2698.1459197998047,
"epoch": 0.4205714285714286,
"grad_norm": 1.6566721200942993,
"kl": 0.4873046875,
"learning_rate": 2.7793039831193133e-07,
"loss": 0.0166,
"reward": 0.5656480398029089,
"reward_std": 0.6528111733496189,
"rewards/cosine_scaled_reward": -0.0713426498696208,
"rewards/format_reward": 0.7083333432674408,
"step": 368
},
{
"completion_length": 2418.041748046875,
"epoch": 0.4217142857142857,
"grad_norm": 0.9178156852722168,
"kl": 0.41973876953125,
"learning_rate": 2.7543467624442956e-07,
"loss": 0.0271,
"reward": 0.8303387649357319,
"reward_std": 1.1390304267406464,
"rewards/cosine_scaled_reward": 0.04016935685649514,
"rewards/format_reward": 0.7500000149011612,
"step": 369
},
{
"completion_length": 2595.7709350585938,
"epoch": 0.4228571428571429,
"grad_norm": 0.9787195324897766,
"kl": 0.5360107421875,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0475,
"reward": 0.6089789252728224,
"reward_std": 0.49529892206192017,
"rewards/cosine_scaled_reward": -0.06009387783706188,
"rewards/format_reward": 0.7291666828095913,
"step": 370
},
{
"completion_length": 1669.125057220459,
"epoch": 0.424,
"grad_norm": 0.6832586526870728,
"kl": 0.305511474609375,
"learning_rate": 2.7048349887476037e-07,
"loss": 0.0156,
"reward": 1.334755502641201,
"reward_std": 0.7108926326036453,
"rewards/cosine_scaled_reward": 0.2298777117393911,
"rewards/format_reward": 0.8750000037252903,
"step": 371
},
{
"completion_length": 2707.750045776367,
"epoch": 0.42514285714285716,
"grad_norm": 0.7063215970993042,
"kl": 0.460296630859375,
"learning_rate": 2.6802828488599294e-07,
"loss": 0.0567,
"reward": 0.8714766772463918,
"reward_std": 0.7350384518504143,
"rewards/cosine_scaled_reward": 0.0503216665238142,
"rewards/format_reward": 0.7708333469927311,
"step": 372
},
{
"completion_length": 1592.6459121704102,
"epoch": 0.42628571428571427,
"grad_norm": 0.9033951163291931,
"kl": 0.2605133056640625,
"learning_rate": 2.655868138008171e-07,
"loss": 0.0191,
"reward": 0.7105765882879496,
"reward_std": 0.6581134386360645,
"rewards/cosine_scaled_reward": -0.07179503422230482,
"rewards/format_reward": 0.8541666753590107,
"step": 373
},
{
"completion_length": 2403.375030517578,
"epoch": 0.42742857142857144,
"grad_norm": 0.9998828768730164,
"kl": 0.48974609375,
"learning_rate": 2.631592046130896e-07,
"loss": 0.0693,
"reward": 0.4763774862512946,
"reward_std": 0.7915669940412045,
"rewards/cosine_scaled_reward": -0.06389461923390627,
"rewards/format_reward": 0.6041666865348816,
"step": 374
},
{
"completion_length": 2501.8750762939453,
"epoch": 0.42857142857142855,
"grad_norm": 0.610641598701477,
"kl": 0.4688720703125,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0509,
"reward": 0.8910817317664623,
"reward_std": 0.7950854599475861,
"rewards/cosine_scaled_reward": 0.09137419052422047,
"rewards/format_reward": 0.7083333488553762,
"step": 375
},
{
"completion_length": 2130.5625610351562,
"epoch": 0.4297142857142857,
"grad_norm": 0.6212278008460999,
"kl": 0.380096435546875,
"learning_rate": 2.583460445215911e-07,
"loss": 0.016,
"reward": 0.4537246283143759,
"reward_std": 0.6907018758356571,
"rewards/cosine_scaled_reward": -0.15855437144637108,
"rewards/format_reward": 0.770833358168602,
"step": 376
},
{
"completion_length": 2739.0209197998047,
"epoch": 0.4308571428571429,
"grad_norm": 0.9390625357627869,
"kl": 0.5252685546875,
"learning_rate": 2.5596072820445254e-07,
"loss": 0.0252,
"reward": 0.7201773710548878,
"reward_std": 0.9287758991122246,
"rewards/cosine_scaled_reward": 0.005922011099755764,
"rewards/format_reward": 0.708333358168602,
"step": 377
},
{
"completion_length": 2193.7709159851074,
"epoch": 0.432,
"grad_norm": 0.5597609877586365,
"kl": 0.43035888671875,
"learning_rate": 2.5358974294659373e-07,
"loss": 0.0432,
"reward": 1.151801437139511,
"reward_std": 0.9676465280354023,
"rewards/cosine_scaled_reward": 0.16965069761499763,
"rewards/format_reward": 0.8125000298023224,
"step": 378
},
{
"completion_length": 2664.916732788086,
"epoch": 0.43314285714285716,
"grad_norm": 1.6749874353408813,
"kl": 0.76043701171875,
"learning_rate": 2.512332043064913e-07,
"loss": 0.1177,
"reward": 0.40762139530852437,
"reward_std": 0.8776891604065895,
"rewards/cosine_scaled_reward": -0.11910597886890173,
"rewards/format_reward": 0.6458333469927311,
"step": 379
},
{
"completion_length": 2202.104232788086,
"epoch": 0.4342857142857143,
"grad_norm": 1.1534351110458374,
"kl": 0.31915283203125,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0704,
"reward": 0.6592778088524938,
"reward_std": 0.7035505771636963,
"rewards/cosine_scaled_reward": -0.0974444393068552,
"rewards/format_reward": 0.854166679084301,
"step": 380
},
{
"completion_length": 2642.104232788086,
"epoch": 0.43542857142857144,
"grad_norm": 0.5377724170684814,
"kl": 0.52935791015625,
"learning_rate": 2.465639255873246e-07,
"loss": 0.0518,
"reward": 0.31143795792013407,
"reward_std": 0.7106641083955765,
"rewards/cosine_scaled_reward": -0.17761437874287367,
"rewards/format_reward": 0.6666666753590107,
"step": 381
},
{
"completion_length": 2292.2500610351562,
"epoch": 0.43657142857142855,
"grad_norm": 0.722557008266449,
"kl": 0.40679931640625,
"learning_rate": 2.4425141308231765e-07,
"loss": 0.0297,
"reward": 0.3869906556792557,
"reward_std": 0.6342262029647827,
"rewards/cosine_scaled_reward": -0.23358802066650242,
"rewards/format_reward": 0.8541666865348816,
"step": 382
},
{
"completion_length": 2579.416702270508,
"epoch": 0.4377142857142857,
"grad_norm": 0.5246860980987549,
"kl": 0.52972412109375,
"learning_rate": 2.4195380233209006e-07,
"loss": 0.0596,
"reward": 0.7596514848992229,
"reward_std": 0.8228101618587971,
"rewards/cosine_scaled_reward": 0.02565906196832657,
"rewards/format_reward": 0.7083333432674408,
"step": 383
},
{
"completion_length": 1909.2292098999023,
"epoch": 0.43885714285714283,
"grad_norm": 0.30958041548728943,
"kl": 0.29815673828125,
"learning_rate": 2.3967120531894857e-07,
"loss": 0.0282,
"reward": 1.4319515749812126,
"reward_std": 0.7947760932147503,
"rewards/cosine_scaled_reward": 0.2576424600556493,
"rewards/format_reward": 0.9166666716337204,
"step": 384
},
{
"completion_length": 2197.312545776367,
"epoch": 0.44,
"grad_norm": 0.6841936111450195,
"kl": 0.4190673828125,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0334,
"reward": 0.49799082055687904,
"reward_std": 0.5665691867470741,
"rewards/cosine_scaled_reward": -0.13642127346247435,
"rewards/format_reward": 0.7708333507180214,
"step": 385
},
{
"completion_length": 2120.0834045410156,
"epoch": 0.44114285714285717,
"grad_norm": 0.7352175116539001,
"kl": 0.3243408203125,
"learning_rate": 2.3515149676898552e-07,
"loss": 0.0477,
"reward": 0.9959990195930004,
"reward_std": 0.7146658357232809,
"rewards/cosine_scaled_reward": 0.06049949396401644,
"rewards/format_reward": 0.8750000111758709,
"step": 386
},
{
"completion_length": 2542.3334197998047,
"epoch": 0.4422857142857143,
"grad_norm": 0.9107005596160889,
"kl": 0.56768798828125,
"learning_rate": 2.3291460551638237e-07,
"loss": 0.0838,
"reward": 0.5603130697272718,
"reward_std": 0.7675389684736729,
"rewards/cosine_scaled_reward": -0.03234346956014633,
"rewards/format_reward": 0.6250000149011612,
"step": 387
},
{
"completion_length": 2395.937545776367,
"epoch": 0.44342857142857145,
"grad_norm": 0.914421796798706,
"kl": 0.484466552734375,
"learning_rate": 2.306931685585657e-07,
"loss": 0.0322,
"reward": 0.9246965646743774,
"reward_std": 0.576560951769352,
"rewards/cosine_scaled_reward": 0.10818159952759743,
"rewards/format_reward": 0.7083333395421505,
"step": 388
},
{
"completion_length": 2336.645896911621,
"epoch": 0.44457142857142856,
"grad_norm": 0.6898525953292847,
"kl": 0.455474853515625,
"learning_rate": 2.2848729416523859e-07,
"loss": 0.0534,
"reward": 0.6510482397861779,
"reward_std": 1.0318073891103268,
"rewards/cosine_scaled_reward": -0.049475882202386856,
"rewards/format_reward": 0.7500000149011612,
"step": 389
},
{
"completion_length": 2531.291763305664,
"epoch": 0.44571428571428573,
"grad_norm": 0.9229927062988281,
"kl": 0.6007080078125,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.052,
"reward": 0.2972820373252034,
"reward_std": 0.7185437642037868,
"rewards/cosine_scaled_reward": -0.1846923204138875,
"rewards/format_reward": 0.6666666772216558,
"step": 390
},
{
"completion_length": 2353.4167404174805,
"epoch": 0.44685714285714284,
"grad_norm": 0.569179117679596,
"kl": 0.51171875,
"learning_rate": 2.2412266235313973e-07,
"loss": 0.0774,
"reward": 0.6255490938201547,
"reward_std": 0.8703299462795258,
"rewards/cosine_scaled_reward": -0.04139212518930435,
"rewards/format_reward": 0.7083333432674408,
"step": 391
},
{
"completion_length": 2231.208366394043,
"epoch": 0.448,
"grad_norm": 1.0076370239257812,
"kl": 0.39593505859375,
"learning_rate": 2.2196411766036487e-07,
"loss": 0.0718,
"reward": 0.6922257398255169,
"reward_std": 0.7815711013972759,
"rewards/cosine_scaled_reward": -0.049720464274287224,
"rewards/format_reward": 0.7916666828095913,
"step": 392
},
{
"completion_length": 2371.604248046875,
"epoch": 0.4491428571428571,
"grad_norm": 0.6043879985809326,
"kl": 0.383544921875,
"learning_rate": 2.1982156097370557e-07,
"loss": 0.0414,
"reward": 0.8758026575669646,
"reward_std": 0.8594116196036339,
"rewards/cosine_scaled_reward": 0.10456799250096083,
"rewards/format_reward": 0.6666666846722364,
"step": 393
},
{
"completion_length": 2847.354248046875,
"epoch": 0.4502857142857143,
"grad_norm": 0.900895357131958,
"kl": 0.63330078125,
"learning_rate": 2.1769509671835223e-07,
"loss": 0.0595,
"reward": 0.09159071743488312,
"reward_std": 0.7040717899799347,
"rewards/cosine_scaled_reward": -0.25628798035904765,
"rewards/format_reward": 0.6041666828095913,
"step": 394
},
{
"completion_length": 2110.3125381469727,
"epoch": 0.4514285714285714,
"grad_norm": 1.2333768606185913,
"kl": 0.345977783203125,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0419,
"reward": 0.7058122660964727,
"reward_std": 0.9388692416250706,
"rewards/cosine_scaled_reward": -0.011677220463752747,
"rewards/format_reward": 0.7291666828095913,
"step": 395
},
{
"completion_length": 2366.916717529297,
"epoch": 0.45257142857142857,
"grad_norm": 1.082507610321045,
"kl": 0.560791015625,
"learning_rate": 2.134908592756607e-07,
"loss": 0.1017,
"reward": 0.5549185280688107,
"reward_std": 0.8698355071246624,
"rewards/cosine_scaled_reward": -0.09754075668752193,
"rewards/format_reward": 0.7500000186264515,
"step": 396
},
{
"completion_length": 2053.937545776367,
"epoch": 0.45371428571428574,
"grad_norm": 1.2703036069869995,
"kl": 0.31591796875,
"learning_rate": 2.1141329099692406e-07,
"loss": -0.0161,
"reward": 0.7015451728366315,
"reward_std": 0.6856296453624964,
"rewards/cosine_scaled_reward": -0.03464407101273537,
"rewards/format_reward": 0.7708333507180214,
"step": 397
},
{
"completion_length": 2001.6458892822266,
"epoch": 0.45485714285714285,
"grad_norm": 0.640113890171051,
"kl": 0.2894744873046875,
"learning_rate": 2.0935222495670968e-07,
"loss": -0.0049,
"reward": 0.46910549892345443,
"reward_std": 0.7106455899775028,
"rewards/cosine_scaled_reward": -0.1404472654685378,
"rewards/format_reward": 0.7500000149011612,
"step": 398
},
{
"completion_length": 2174.875068664551,
"epoch": 0.456,
"grad_norm": 0.405164510011673,
"kl": 0.3242950439453125,
"learning_rate": 2.0730776160846853e-07,
"loss": 0.0278,
"reward": 0.7996035069227219,
"reward_std": 0.7646129336208105,
"rewards/cosine_scaled_reward": -0.027281596325337887,
"rewards/format_reward": 0.8541666716337204,
"step": 399
},
{
"completion_length": 1563.2083892822266,
"epoch": 0.45714285714285713,
"grad_norm": 0.38448086380958557,
"kl": 0.147735595703125,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0137,
"reward": 1.3166161552071571,
"reward_std": 0.7907492704689503,
"rewards/cosine_scaled_reward": 0.23122474236879498,
"rewards/format_reward": 0.8541666753590107,
"step": 400
},
{
"completion_length": 2731.1875762939453,
"epoch": 0.4582857142857143,
"grad_norm": 0.966650128364563,
"kl": 0.475494384765625,
"learning_rate": 2.032690407508949e-07,
"loss": 0.0993,
"reward": 0.7280457699671388,
"reward_std": 0.8715083934366703,
"rewards/cosine_scaled_reward": 0.041106189135462046,
"rewards/format_reward": 0.6458333507180214,
"step": 401
},
{
"completion_length": 2064.7292404174805,
"epoch": 0.4594285714285714,
"grad_norm": 0.3579236567020416,
"kl": 0.318389892578125,
"learning_rate": 2.0127498008311922e-07,
"loss": 0.057,
"reward": 0.6332701966166496,
"reward_std": 0.6316654235124588,
"rewards/cosine_scaled_reward": -0.05836488865315914,
"rewards/format_reward": 0.7500000111758709,
"step": 402
},
{
"completion_length": 1992.0625610351562,
"epoch": 0.4605714285714286,
"grad_norm": 0.3985031843185425,
"kl": 0.314453125,
"learning_rate": 1.9929791578083655e-07,
"loss": 0.0231,
"reward": 0.7511493074707687,
"reward_std": 0.7014381438493729,
"rewards/cosine_scaled_reward": -0.030675357207655907,
"rewards/format_reward": 0.8125000055879354,
"step": 403
},
{
"completion_length": 1876.354232788086,
"epoch": 0.4617142857142857,
"grad_norm": 0.5146257281303406,
"kl": 0.3045654296875,
"learning_rate": 1.9733794420337213e-07,
"loss": 0.0257,
"reward": 0.6732284021563828,
"reward_std": 0.5320135578513145,
"rewards/cosine_scaled_reward": -0.038385817781090736,
"rewards/format_reward": 0.7500000055879354,
"step": 404
},
{
"completion_length": 1674.3750381469727,
"epoch": 0.46285714285714286,
"grad_norm": 0.8119452595710754,
"kl": 0.293060302734375,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0567,
"reward": 0.8279850594699383,
"reward_std": 0.8649867400527,
"rewards/cosine_scaled_reward": 0.08065919205546379,
"rewards/format_reward": 0.6666666679084301,
"step": 405
},
{
"completion_length": 1803.916732788086,
"epoch": 0.464,
"grad_norm": 0.8215560913085938,
"kl": 0.1740570068359375,
"learning_rate": 1.934696604901642e-07,
"loss": 0.0547,
"reward": 1.2124288752675056,
"reward_std": 0.8886886425316334,
"rewards/cosine_scaled_reward": 0.1687144124880433,
"rewards/format_reward": 0.8750000149011612,
"step": 406
},
{
"completion_length": 2204.1875610351562,
"epoch": 0.46514285714285714,
"grad_norm": 0.8370880484580994,
"kl": 0.3338470458984375,
"learning_rate": 1.915615368891117e-07,
"loss": 0.0397,
"reward": 0.8222223985940218,
"reward_std": 0.762837752699852,
"rewards/cosine_scaled_reward": -0.005555473268032074,
"rewards/format_reward": 0.8333333432674408,
"step": 407
},
{
"completion_length": 2199.437545776367,
"epoch": 0.4662857142857143,
"grad_norm": 0.44091248512268066,
"kl": 0.265228271484375,
"learning_rate": 1.8967088307307e-07,
"loss": 0.0208,
"reward": 1.1622834838926792,
"reward_std": 0.6865539737045765,
"rewards/cosine_scaled_reward": 0.16447505727410316,
"rewards/format_reward": 0.833333358168602,
"step": 408
},
{
"completion_length": 2955.916778564453,
"epoch": 0.4674285714285714,
"grad_norm": 0.9645739197731018,
"kl": 0.5897216796875,
"learning_rate": 1.8779779118983867e-07,
"loss": 0.0554,
"reward": 0.31450588814914227,
"reward_std": 0.8947923108935356,
"rewards/cosine_scaled_reward": -0.13441372802481055,
"rewards/format_reward": 0.5833333507180214,
"step": 409
},
{
"completion_length": 2469.7084159851074,
"epoch": 0.4685714285714286,
"grad_norm": 1.2290736436843872,
"kl": 0.5966949462890625,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0567,
"reward": 0.38221518974751234,
"reward_std": 0.8091713823378086,
"rewards/cosine_scaled_reward": -0.10055907690548338,
"rewards/format_reward": 0.5833333432674408,
"step": 410
},
{
"completion_length": 2658.666763305664,
"epoch": 0.4697142857142857,
"grad_norm": 0.5205665826797485,
"kl": 0.40289306640625,
"learning_rate": 1.8410465752883758e-07,
"loss": 0.0389,
"reward": 1.0305472910404205,
"reward_std": 0.8048725798726082,
"rewards/cosine_scaled_reward": 0.11944028595462441,
"rewards/format_reward": 0.7916666753590107,
"step": 411
},
{
"completion_length": 2483.604263305664,
"epoch": 0.47085714285714286,
"grad_norm": 0.48482832312583923,
"kl": 0.34637451171875,
"learning_rate": 1.822847957491922e-07,
"loss": 0.0136,
"reward": 1.0568090807646513,
"reward_std": 0.7529363892972469,
"rewards/cosine_scaled_reward": 0.10132121946662664,
"rewards/format_reward": 0.8541666753590107,
"step": 412
},
{
"completion_length": 1959.5833892822266,
"epoch": 0.472,
"grad_norm": 0.5290913581848145,
"kl": 0.23431396484375,
"learning_rate": 1.804828558898332e-07,
"loss": 0.0159,
"reward": 0.9711576336994767,
"reward_std": 0.8100597076117992,
"rewards/cosine_scaled_reward": 0.0689121619798243,
"rewards/format_reward": 0.8333333395421505,
"step": 413
},
{
"completion_length": 2976.6459350585938,
"epoch": 0.47314285714285714,
"grad_norm": 1.0799856185913086,
"kl": 0.786224365234375,
"learning_rate": 1.7869892577476722e-07,
"loss": 0.0644,
"reward": -0.04989051632583141,
"reward_std": 0.6900010071694851,
"rewards/cosine_scaled_reward": -0.24369526095688343,
"rewards/format_reward": 0.43750000931322575,
"step": 414
},
{
"completion_length": 3083.2084045410156,
"epoch": 0.4742857142857143,
"grad_norm": 1.4723012447357178,
"kl": 0.7890625,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0774,
"reward": 0.03524960530921817,
"reward_std": 0.6955935060977936,
"rewards/cosine_scaled_reward": -0.16987520130351186,
"rewards/format_reward": 0.37500000558793545,
"step": 415
},
{
"completion_length": 2211.6458740234375,
"epoch": 0.4754285714285714,
"grad_norm": 0.8043403029441833,
"kl": 0.3631591796875,
"learning_rate": 1.7518544168045524e-07,
"loss": 0.032,
"reward": 0.8334652222692966,
"reward_std": 0.855853334069252,
"rewards/cosine_scaled_reward": 0.02089927066117525,
"rewards/format_reward": 0.7916666865348816,
"step": 416
},
{
"completion_length": 2856.541748046875,
"epoch": 0.4765714285714286,
"grad_norm": 0.6729423403739929,
"kl": 0.6011962890625,
"learning_rate": 1.7345605894346726e-07,
"loss": 0.0811,
"reward": 0.2203914044657722,
"reward_std": 0.8685822859406471,
"rewards/cosine_scaled_reward": -0.18147097853943706,
"rewards/format_reward": 0.5833333563059568,
"step": 417
},
{
"completion_length": 1945.8125381469727,
"epoch": 0.4777142857142857,
"grad_norm": 1.0605844259262085,
"kl": 0.321624755859375,
"learning_rate": 1.7174502842694212e-07,
"loss": -0.004,
"reward": 1.1576719619333744,
"reward_std": 0.8231003619730473,
"rewards/cosine_scaled_reward": 0.17258594185113907,
"rewards/format_reward": 0.812500013038516,
"step": 418
},
{
"completion_length": 2345.2500762939453,
"epoch": 0.47885714285714287,
"grad_norm": 1.400498628616333,
"kl": 0.31787109375,
"learning_rate": 1.7005243352409333e-07,
"loss": 0.0464,
"reward": 1.1115051358938217,
"reward_std": 0.885998897254467,
"rewards/cosine_scaled_reward": 0.13908587209880352,
"rewards/format_reward": 0.8333333507180214,
"step": 419
},
{
"completion_length": 1978.8542098999023,
"epoch": 0.48,
"grad_norm": 0.7775018215179443,
"kl": 0.303009033203125,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0281,
"reward": 0.667570760473609,
"reward_std": 0.8073213696479797,
"rewards/cosine_scaled_reward": -0.062047986313700676,
"rewards/format_reward": 0.791666679084301,
"step": 420
},
{
"completion_length": 2967.6459350585938,
"epoch": 0.48114285714285715,
"grad_norm": 0.9923365116119385,
"kl": 0.5174560546875,
"learning_rate": 1.6672287963562852e-07,
"loss": 0.0545,
"reward": 0.1129405153915286,
"reward_std": 0.6916285455226898,
"rewards/cosine_scaled_reward": -0.22477975487709045,
"rewards/format_reward": 0.5625000055879354,
"step": 421
},
{
"completion_length": 2569.8542404174805,
"epoch": 0.48228571428571426,
"grad_norm": 0.7899558544158936,
"kl": 0.3731689453125,
"learning_rate": 1.6508608292777203e-07,
"loss": 0.0784,
"reward": 0.3494804035872221,
"reward_std": 0.9125073701143265,
"rewards/cosine_scaled_reward": -0.12734314193949103,
"rewards/format_reward": 0.6041666828095913,
"step": 422
},
{
"completion_length": 2343.1458892822266,
"epoch": 0.48342857142857143,
"grad_norm": 0.6041327714920044,
"kl": 0.34786224365234375,
"learning_rate": 1.6346804638120098e-07,
"loss": 0.0094,
"reward": 0.37775486428290606,
"reward_std": 0.7306395750492811,
"rewards/cosine_scaled_reward": -0.1652892343699932,
"rewards/format_reward": 0.7083333432674408,
"step": 423
},
{
"completion_length": 2692.2709045410156,
"epoch": 0.4845714285714286,
"grad_norm": 0.5803365707397461,
"kl": 0.385528564453125,
"learning_rate": 1.6186884885673413e-07,
"loss": 0.0425,
"reward": 0.3899629784282297,
"reward_std": 0.7263697199523449,
"rewards/cosine_scaled_reward": -0.15918518672697246,
"rewards/format_reward": 0.7083333507180214,
"step": 424
},
{
"completion_length": 1992.9791870117188,
"epoch": 0.4857142857142857,
"grad_norm": 0.4711278975009918,
"kl": 0.20758056640625,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.045,
"reward": 1.7557712569832802,
"reward_std": 0.7609523758292198,
"rewards/cosine_scaled_reward": 0.49246894381940365,
"rewards/format_reward": 0.7708333488553762,
"step": 425
},
{
"completion_length": 1818.3333740234375,
"epoch": 0.4868571428571429,
"grad_norm": 0.29636478424072266,
"kl": 0.1734161376953125,
"learning_rate": 1.5872728172265146e-07,
"loss": 0.0137,
"reward": 0.7233628639951348,
"reward_std": 0.8688660450279713,
"rewards/cosine_scaled_reward": -0.023735247552394867,
"rewards/format_reward": 0.7708333618938923,
"step": 426
},
{
"completion_length": 2321.791748046875,
"epoch": 0.488,
"grad_norm": 0.9740093946456909,
"kl": 0.3040771484375,
"learning_rate": 1.5718506522858572e-07,
"loss": 0.0749,
"reward": 0.5350995054468513,
"reward_std": 0.9547824487090111,
"rewards/cosine_scaled_reward": 0.01754974015057087,
"rewards/format_reward": 0.5000000111758709,
"step": 427
},
{
"completion_length": 2653.5208892822266,
"epoch": 0.48914285714285716,
"grad_norm": 0.9272693991661072,
"kl": 0.474517822265625,
"learning_rate": 1.5566199398026147e-07,
"loss": 0.0771,
"reward": 0.6082139114532765,
"reward_std": 0.9143152646720409,
"rewards/cosine_scaled_reward": -0.06047638365998864,
"rewards/format_reward": 0.7291666902601719,
"step": 428
},
{
"completion_length": 1800.8958892822266,
"epoch": 0.49028571428571427,
"grad_norm": 1.1067618131637573,
"kl": 0.1895751953125,
"learning_rate": 1.5415814221002265e-07,
"loss": 0.0414,
"reward": 0.6690875124186277,
"reward_std": 0.7954706754535437,
"rewards/cosine_scaled_reward": -0.050872914493083954,
"rewards/format_reward": 0.7708333469927311,
"step": 429
},
{
"completion_length": 2331.1875534057617,
"epoch": 0.49142857142857144,
"grad_norm": 1.0719366073608398,
"kl": 0.45489501953125,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0482,
"reward": 0.6795799904502928,
"reward_std": 0.6970538776367903,
"rewards/cosine_scaled_reward": -0.03520998451858759,
"rewards/format_reward": 0.750000013038516,
"step": 430
},
{
"completion_length": 2329.9792251586914,
"epoch": 0.49257142857142855,
"grad_norm": 0.6106032133102417,
"kl": 0.31787109375,
"learning_rate": 1.5120838934595337e-07,
"loss": 0.0357,
"reward": 0.5369092933833599,
"reward_std": 0.6611959636211395,
"rewards/cosine_scaled_reward": -0.11696202587336302,
"rewards/format_reward": 0.7708333395421505,
"step": 431
},
{
"completion_length": 2501.7709197998047,
"epoch": 0.4937142857142857,
"grad_norm": 0.4645460546016693,
"kl": 0.380035400390625,
"learning_rate": 1.4976263201891613e-07,
"loss": 0.045,
"reward": 0.5862938910722733,
"reward_std": 0.6654918566346169,
"rewards/cosine_scaled_reward": -0.07143638655543327,
"rewards/format_reward": 0.7291666734963655,
"step": 432
},
{
"completion_length": 2829.645896911621,
"epoch": 0.4948571428571429,
"grad_norm": 0.5816503167152405,
"kl": 0.444244384765625,
"learning_rate": 1.483363816965435e-07,
"loss": 0.083,
"reward": 0.5509648718871176,
"reward_std": 0.7200754433870316,
"rewards/cosine_scaled_reward": -0.00576755590736866,
"rewards/format_reward": 0.5625000111758709,
"step": 433
},
{
"completion_length": 2463.083427429199,
"epoch": 0.496,
"grad_norm": 0.8237125873565674,
"kl": 0.395416259765625,
"learning_rate": 1.469297078922642e-07,
"loss": 0.0314,
"reward": 0.13537092343904078,
"reward_std": 0.5622758902609348,
"rewards/cosine_scaled_reward": -0.2448145542293787,
"rewards/format_reward": 0.6250000074505806,
"step": 434
},
{
"completion_length": 1650.604232788086,
"epoch": 0.49714285714285716,
"grad_norm": 0.9680685997009277,
"kl": 0.1461639404296875,
"learning_rate": 1.4554267916537495e-07,
"loss": -0.0372,
"reward": 0.49817070248536766,
"reward_std": 0.6576566733419895,
"rewards/cosine_scaled_reward": -0.15716465492732823,
"rewards/format_reward": 0.8125000111758709,
"step": 435
},
{
"completion_length": 1983.520881652832,
"epoch": 0.4982857142857143,
"grad_norm": 0.5045260787010193,
"kl": 0.3553466796875,
"learning_rate": 1.4417536311769885e-07,
"loss": -0.0021,
"reward": 0.8208566140383482,
"reward_std": 0.9373398050665855,
"rewards/cosine_scaled_reward": 0.035428304225206375,
"rewards/format_reward": 0.7500000149011612,
"step": 436
},
{
"completion_length": 2593.8125610351562,
"epoch": 0.49942857142857144,
"grad_norm": 0.5763217806816101,
"kl": 0.2955322265625,
"learning_rate": 1.4282782639029128e-07,
"loss": 0.0435,
"reward": 0.4606641661375761,
"reward_std": 0.6995394416153431,
"rewards/cosine_scaled_reward": -0.12383459135890007,
"rewards/format_reward": 0.7083333544433117,
"step": 437
},
{
"completion_length": 2628.041702270508,
"epoch": 0.5005714285714286,
"grad_norm": 0.4055953919887543,
"kl": 0.2898712158203125,
"learning_rate": 1.4150013466019114e-07,
"loss": 0.0451,
"reward": 0.3238631319254637,
"reward_std": 0.6967433281242847,
"rewards/cosine_scaled_reward": -0.1714017689228058,
"rewards/format_reward": 0.6666666753590107,
"step": 438
},
{
"completion_length": 2041.5417442321777,
"epoch": 0.5017142857142857,
"grad_norm": 0.6679253578186035,
"kl": 0.2696380615234375,
"learning_rate": 1.4019235263722034e-07,
"loss": 0.0546,
"reward": 0.5020741457119584,
"reward_std": 0.7230929285287857,
"rewards/cosine_scaled_reward": -0.11354626249521971,
"rewards/format_reward": 0.729166679084301,
"step": 439
},
{
"completion_length": 2653.0625762939453,
"epoch": 0.5028571428571429,
"grad_norm": 0.9918028712272644,
"kl": 0.390869140625,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0122,
"reward": 0.26619721701717936,
"reward_std": 0.6938467286527157,
"rewards/cosine_scaled_reward": -0.16898473422043025,
"rewards/format_reward": 0.6041666772216558,
"step": 440
},
{
"completion_length": 2321.5209045410156,
"epoch": 0.504,
"grad_norm": 1.487559199333191,
"kl": 0.350006103515625,
"learning_rate": 1.3763677169699217e-07,
"loss": -0.0064,
"reward": 0.5201191268861294,
"reward_std": 0.6544113270938396,
"rewards/cosine_scaled_reward": -0.05244044866412878,
"rewards/format_reward": 0.6250000093132257,
"step": 441
},
{
"completion_length": 2166.58341217041,
"epoch": 0.5051428571428571,
"grad_norm": 0.2839236259460449,
"kl": 0.2300567626953125,
"learning_rate": 1.3638909733514452e-07,
"loss": 0.0157,
"reward": 0.8514236286282539,
"reward_std": 0.6945049427449703,
"rewards/cosine_scaled_reward": 0.05071181803941727,
"rewards/format_reward": 0.7500000186264515,
"step": 442
},
{
"completion_length": 2637.0833892822266,
"epoch": 0.5062857142857143,
"grad_norm": 0.9685612320899963,
"kl": 0.45916748046875,
"learning_rate": 1.351615817851748e-07,
"loss": 0.0937,
"reward": 0.4695184410084039,
"reward_std": 0.8600027747452259,
"rewards/cosine_scaled_reward": -0.04649078845977783,
"rewards/format_reward": 0.5625000037252903,
"step": 443
},
{
"completion_length": 2139.729217529297,
"epoch": 0.5074285714285715,
"grad_norm": 0.4402044117450714,
"kl": 0.3176116943359375,
"learning_rate": 1.3395428487445914e-07,
"loss": 0.0325,
"reward": 0.6444471004651859,
"reward_std": 0.6833576895296574,
"rewards/cosine_scaled_reward": -0.011109774932265282,
"rewards/format_reward": 0.666666679084301,
"step": 444
},
{
"completion_length": 1914.6667175292969,
"epoch": 0.5085714285714286,
"grad_norm": 0.5042048692703247,
"kl": 0.251068115234375,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.046,
"reward": 0.5679828058928251,
"reward_std": 0.8287508636713028,
"rewards/cosine_scaled_reward": -0.0493419524282217,
"rewards/format_reward": 0.6666666809469461,
"step": 445
},
{
"completion_length": 2167.0625381469727,
"epoch": 0.5097142857142857,
"grad_norm": 0.7239652872085571,
"kl": 0.242706298828125,
"learning_rate": 1.316005813502869e-07,
"loss": 0.0355,
"reward": 0.7304300144314766,
"reward_std": 0.6240549013018608,
"rewards/cosine_scaled_reward": -0.020201677456498146,
"rewards/format_reward": 0.7708333432674408,
"step": 446
},
{
"completion_length": 2175.4375915527344,
"epoch": 0.5108571428571429,
"grad_norm": 0.4598706364631653,
"kl": 0.25213623046875,
"learning_rate": 1.3045428945301953e-07,
"loss": 0.0393,
"reward": 0.7061773156747222,
"reward_std": 0.6147476173937321,
"rewards/cosine_scaled_reward": 0.009338663425296545,
"rewards/format_reward": 0.6875000093132257,
"step": 447
},
{
"completion_length": 1812.0000457763672,
"epoch": 0.512,
"grad_norm": 0.75312340259552,
"kl": 0.1580963134765625,
"learning_rate": 1.2932844562179352e-07,
"loss": 0.0441,
"reward": 0.590579898096621,
"reward_std": 0.6532803736627102,
"rewards/cosine_scaled_reward": -0.11096005886793137,
"rewards/format_reward": 0.8125000149011612,
"step": 448
},
{
"completion_length": 1596.2917022705078,
"epoch": 0.5131428571428571,
"grad_norm": 0.4844936728477478,
"kl": 0.22760009765625,
"learning_rate": 1.2822310472864885e-07,
"loss": 0.0554,
"reward": 0.5192018076777458,
"reward_std": 0.6534673273563385,
"rewards/cosine_scaled_reward": -0.09456577710807323,
"rewards/format_reward": 0.7083333469927311,
"step": 449
},
{
"completion_length": 2333.8958892822266,
"epoch": 0.5142857142857142,
"grad_norm": 0.694821298122406,
"kl": 0.3708038330078125,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0758,
"reward": 0.48664434999227524,
"reward_std": 0.7445627897977829,
"rewards/cosine_scaled_reward": -0.09001116827130318,
"rewards/format_reward": 0.6666666753590107,
"step": 450
},
{
"completion_length": 1962.1875610351562,
"epoch": 0.5154285714285715,
"grad_norm": 1.1499308347702026,
"kl": 0.2388916015625,
"learning_rate": 1.260741462457165e-07,
"loss": 0.0803,
"reward": 0.9750413531437516,
"reward_std": 0.841393306851387,
"rewards/cosine_scaled_reward": 0.08127065747976303,
"rewards/format_reward": 0.8125000186264515,
"step": 451
},
{
"completion_length": 2639.0625610351562,
"epoch": 0.5165714285714286,
"grad_norm": 2.796175241470337,
"kl": 0.3663330078125,
"learning_rate": 1.2503063339313356e-07,
"loss": 0.1874,
"reward": 0.5532118980772793,
"reward_std": 1.0444200411438942,
"rewards/cosine_scaled_reward": -0.004644063068553805,
"rewards/format_reward": 0.5625000149011612,
"step": 452
},
{
"completion_length": 2044.625099182129,
"epoch": 0.5177142857142857,
"grad_norm": 0.9541345238685608,
"kl": 0.255096435546875,
"learning_rate": 1.2400783294793668e-07,
"loss": 0.0042,
"reward": 0.8220235072076321,
"reward_std": 0.8156450688838959,
"rewards/cosine_scaled_reward": 0.015178397297859192,
"rewards/format_reward": 0.7916666902601719,
"step": 453
},
{
"completion_length": 2003.0833892822266,
"epoch": 0.5188571428571429,
"grad_norm": 0.8397437334060669,
"kl": 0.21966552734375,
"learning_rate": 1.2300579475997657e-07,
"loss": 0.0811,
"reward": 0.449910047929734,
"reward_std": 0.6436374075710773,
"rewards/cosine_scaled_reward": -0.139628317207098,
"rewards/format_reward": 0.7291666865348816,
"step": 454
},
{
"completion_length": 2385.166732788086,
"epoch": 0.52,
"grad_norm": 1.0180670022964478,
"kl": 0.420745849609375,
"learning_rate": 1.220245676671809e-07,
"loss": 0.036,
"reward": 0.3301239423453808,
"reward_std": 0.7285679541528225,
"rewards/cosine_scaled_reward": -0.1682713646441698,
"rewards/format_reward": 0.6666666828095913,
"step": 455
},
{
"completion_length": 2575.208396911621,
"epoch": 0.5211428571428571,
"grad_norm": 0.5599580407142639,
"kl": 0.373779296875,
"learning_rate": 1.2106419949317388e-07,
"loss": 0.0236,
"reward": 0.5938101289793849,
"reward_std": 0.8681718483567238,
"rewards/cosine_scaled_reward": -0.05726161040365696,
"rewards/format_reward": 0.7083333507180214,
"step": 456
},
{
"completion_length": 2281.354248046875,
"epoch": 0.5222857142857142,
"grad_norm": 0.6153411865234375,
"kl": 0.3210906982421875,
"learning_rate": 1.2012473704494537e-07,
"loss": 0.0206,
"reward": 0.6159039521589875,
"reward_std": 0.680855069309473,
"rewards/cosine_scaled_reward": -0.05663137254305184,
"rewards/format_reward": 0.7291666772216558,
"step": 457
},
{
"completion_length": 2125.375068664551,
"epoch": 0.5234285714285715,
"grad_norm": 1.2423961162567139,
"kl": 0.3084259033203125,
"learning_rate": 1.1920622611056974e-07,
"loss": -0.0183,
"reward": 0.3741978630423546,
"reward_std": 0.8636242747306824,
"rewards/cosine_scaled_reward": -0.1983177432557568,
"rewards/format_reward": 0.7708333358168602,
"step": 458
},
{
"completion_length": 2127.812545776367,
"epoch": 0.5245714285714286,
"grad_norm": 1.718005657196045,
"kl": 0.253997802734375,
"learning_rate": 1.1830871145697412e-07,
"loss": 0.0943,
"reward": 0.8951551232021302,
"reward_std": 0.9340690299868584,
"rewards/cosine_scaled_reward": 0.062160891480743885,
"rewards/format_reward": 0.7708333395421505,
"step": 459
},
{
"completion_length": 2741.1875610351562,
"epoch": 0.5257142857142857,
"grad_norm": 1.218464970588684,
"kl": 0.42919921875,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.084,
"reward": 0.36291674245148897,
"reward_std": 0.9569993317127228,
"rewards/cosine_scaled_reward": -0.12062497227452695,
"rewards/format_reward": 0.6041666772216558,
"step": 460
},
{
"completion_length": 2222.4584159851074,
"epoch": 0.5268571428571428,
"grad_norm": 1.2166295051574707,
"kl": 0.32196044921875,
"learning_rate": 1.1657684494105386e-07,
"loss": 0.0231,
"reward": 0.7657130546867847,
"reward_std": 0.9577609188854694,
"rewards/cosine_scaled_reward": 0.03910651011392474,
"rewards/format_reward": 0.6875000186264515,
"step": 461
},
{
"completion_length": 2043.1250686645508,
"epoch": 0.528,
"grad_norm": 0.40705692768096924,
"kl": 0.3497314453125,
"learning_rate": 1.1574257748745986e-07,
"loss": 0.0577,
"reward": 0.31157703790813684,
"reward_std": 0.6110754385590553,
"rewards/cosine_scaled_reward": -0.20879483548924327,
"rewards/format_reward": 0.729166679084301,
"step": 462
},
{
"completion_length": 2590.8334045410156,
"epoch": 0.5291428571428571,
"grad_norm": 0.5638855695724487,
"kl": 0.440826416015625,
"learning_rate": 1.1492947512799328e-07,
"loss": 0.0413,
"reward": 0.5640892055816948,
"reward_std": 1.1282568126916885,
"rewards/cosine_scaled_reward": -0.02003874396905303,
"rewards/format_reward": 0.6041666828095913,
"step": 463
},
{
"completion_length": 1697.8958740234375,
"epoch": 0.5302857142857142,
"grad_norm": 0.3173580467700958,
"kl": 0.2566986083984375,
"learning_rate": 1.1413757749211602e-07,
"loss": 0.0373,
"reward": 0.9147527795284986,
"reward_std": 0.6120780212804675,
"rewards/cosine_scaled_reward": 0.01987638510763645,
"rewards/format_reward": 0.8750000149011612,
"step": 464
},
{
"completion_length": 2732.604217529297,
"epoch": 0.5314285714285715,
"grad_norm": 0.9214864373207092,
"kl": 0.56097412109375,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0832,
"reward": 0.5037600143114105,
"reward_std": 0.827365554869175,
"rewards/cosine_scaled_reward": -0.05020333209540695,
"rewards/format_reward": 0.604166679084301,
"step": 465
},
{
"completion_length": 2255.895881652832,
"epoch": 0.5325714285714286,
"grad_norm": 0.7400510311126709,
"kl": 0.2809600830078125,
"learning_rate": 1.1261754973965422e-07,
"loss": 0.0503,
"reward": 1.0706525277346373,
"reward_std": 0.877636231482029,
"rewards/cosine_scaled_reward": 0.1394929286034312,
"rewards/format_reward": 0.791666692122817,
"step": 466
},
{
"completion_length": 2365.375068664551,
"epoch": 0.5337142857142857,
"grad_norm": 0.9116939306259155,
"kl": 0.427398681640625,
"learning_rate": 1.1188949370707787e-07,
"loss": 0.0076,
"reward": 0.40383276902139187,
"reward_std": 0.6409453861415386,
"rewards/cosine_scaled_reward": -0.12100029923021793,
"rewards/format_reward": 0.6458333432674408,
"step": 467
},
{
"completion_length": 2636.5417251586914,
"epoch": 0.5348571428571428,
"grad_norm": 0.706851601600647,
"kl": 0.528076171875,
"learning_rate": 1.1118279056249653e-07,
"loss": 0.0899,
"reward": 0.5270813233219087,
"reward_std": 0.9826765283942223,
"rewards/cosine_scaled_reward": -0.0697926718275994,
"rewards/format_reward": 0.6666666772216558,
"step": 468
},
{
"completion_length": 2235.6042098999023,
"epoch": 0.536,
"grad_norm": 0.9927799105644226,
"kl": 0.5670166015625,
"learning_rate": 1.1049747474962444e-07,
"loss": 0.045,
"reward": 0.6296205222606659,
"reward_std": 0.7105157673358917,
"rewards/cosine_scaled_reward": -0.01852307841181755,
"rewards/format_reward": 0.6666666828095913,
"step": 469
},
{
"completion_length": 2740.979248046875,
"epoch": 0.5371428571428571,
"grad_norm": 1.1925156116485596,
"kl": 0.44580078125,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0244,
"reward": 0.3472237199312076,
"reward_std": 0.8571870625019073,
"rewards/cosine_scaled_reward": -0.13888813648372889,
"rewards/format_reward": 0.6250000223517418,
"step": 470
},
{
"completion_length": 2298.8958892822266,
"epoch": 0.5382857142857143,
"grad_norm": 0.5537897348403931,
"kl": 0.326873779296875,
"learning_rate": 1.0919113768029517e-07,
"loss": 0.0462,
"reward": 0.6785652227699757,
"reward_std": 0.6474947556853294,
"rewards/cosine_scaled_reward": 0.02678260626271367,
"rewards/format_reward": 0.625000013038516,
"step": 471
},
{
"completion_length": 2207.2083740234375,
"epoch": 0.5394285714285715,
"grad_norm": 0.6610918045043945,
"kl": 0.354949951171875,
"learning_rate": 1.0857018009286381e-07,
"loss": 0.0261,
"reward": 0.3990835212171078,
"reward_std": 0.659260107204318,
"rewards/cosine_scaled_reward": -0.14420824265107512,
"rewards/format_reward": 0.6875000074505806,
"step": 472
},
{
"completion_length": 2732.604248046875,
"epoch": 0.5405714285714286,
"grad_norm": 0.9713963270187378,
"kl": 0.40972900390625,
"learning_rate": 1.0797073717209013e-07,
"loss": 0.038,
"reward": 0.536033843178302,
"reward_std": 0.7053390368819237,
"rewards/cosine_scaled_reward": -0.10698308888822794,
"rewards/format_reward": 0.7500000074505806,
"step": 473
},
{
"completion_length": 2258.875068664551,
"epoch": 0.5417142857142857,
"grad_norm": 0.7563430666923523,
"kl": 0.446258544921875,
"learning_rate": 1.0739283813397639e-07,
"loss": 0.0445,
"reward": 0.9616024261340499,
"reward_std": 0.9263377264142036,
"rewards/cosine_scaled_reward": 0.1370511914137751,
"rewards/format_reward": 0.6875000260770321,
"step": 474
},
{
"completion_length": 2066.3750534057617,
"epoch": 0.5428571428571428,
"grad_norm": 0.874462366104126,
"kl": 0.272369384765625,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0436,
"reward": 1.0160637801745906,
"reward_std": 0.9121544472873211,
"rewards/cosine_scaled_reward": 0.0809485362842679,
"rewards/format_reward": 0.8541666753590107,
"step": 475
},
{
"completion_length": 2530.3333892822266,
"epoch": 0.544,
"grad_norm": 1.8674249649047852,
"kl": 0.4962158203125,
"learning_rate": 1.063017833182728e-07,
"loss": 0.0869,
"reward": 0.6834689327515662,
"reward_std": 1.0683976262807846,
"rewards/cosine_scaled_reward": -0.043682204210199416,
"rewards/format_reward": 0.7708333469927311,
"step": 476
},
{
"completion_length": 2057.562557220459,
"epoch": 0.5451428571428572,
"grad_norm": 1.2835708856582642,
"kl": 0.296966552734375,
"learning_rate": 1.0578868071715544e-07,
"loss": 0.0562,
"reward": 1.2522786986082792,
"reward_std": 0.9867831543087959,
"rewards/cosine_scaled_reward": 0.23030602023936808,
"rewards/format_reward": 0.7916666828095913,
"step": 477
},
{
"completion_length": 2334.1250534057617,
"epoch": 0.5462857142857143,
"grad_norm": 0.5521568655967712,
"kl": 0.382965087890625,
"learning_rate": 1.0529722834905125e-07,
"loss": 0.0722,
"reward": 0.4701565820723772,
"reward_std": 0.7251895070075989,
"rewards/cosine_scaled_reward": -0.108671719674021,
"rewards/format_reward": 0.6875000074505806,
"step": 478
},
{
"completion_length": 2686.3959045410156,
"epoch": 0.5474285714285714,
"grad_norm": 1.0934919118881226,
"kl": 0.466461181640625,
"learning_rate": 1.0482745016665526e-07,
"loss": 0.0255,
"reward": 0.41961941798217595,
"reward_std": 0.8614507652819157,
"rewards/cosine_scaled_reward": -0.13394030090421438,
"rewards/format_reward": 0.6875000149011612,
"step": 479
},
{
"completion_length": 2260.020927429199,
"epoch": 0.5485714285714286,
"grad_norm": 1.4672926664352417,
"kl": 0.4683380126953125,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0376,
"reward": 0.3727748654782772,
"reward_std": 0.5491989254951477,
"rewards/cosine_scaled_reward": -0.18861256520904135,
"rewards/format_reward": 0.7500000111758709,
"step": 480
},
{
"completion_length": 2530.729217529297,
"epoch": 0.5497142857142857,
"grad_norm": 1.4569040536880493,
"kl": 0.45361328125,
"learning_rate": 1.0395300688680625e-07,
"loss": 0.0921,
"reward": 0.36487692515947856,
"reward_std": 0.7810634449124336,
"rewards/cosine_scaled_reward": -0.1613115556538105,
"rewards/format_reward": 0.6875000093132257,
"step": 481
},
{
"completion_length": 2573.8750915527344,
"epoch": 0.5508571428571428,
"grad_norm": 1.0434833765029907,
"kl": 0.42730712890625,
"learning_rate": 1.0354838440848501e-07,
"loss": 0.0425,
"reward": 0.9190967418253422,
"reward_std": 0.8362308479845524,
"rewards/cosine_scaled_reward": 0.11579836346209049,
"rewards/format_reward": 0.6875000167638063,
"step": 482
},
{
"completion_length": 2527.229202270508,
"epoch": 0.552,
"grad_norm": 1.4459831714630127,
"kl": 0.57708740234375,
"learning_rate": 1.0316552135205837e-07,
"loss": 0.0301,
"reward": 0.20397535432130098,
"reward_std": 0.5737987570464611,
"rewards/cosine_scaled_reward": -0.22092900332063437,
"rewards/format_reward": 0.6458333488553762,
"step": 483
},
{
"completion_length": 2179.6875534057617,
"epoch": 0.5531428571428572,
"grad_norm": 1.0252877473831177,
"kl": 0.31011962890625,
"learning_rate": 1.0280443637773163e-07,
"loss": 0.0744,
"reward": 0.8415414169430733,
"reward_std": 0.6265546232461929,
"rewards/cosine_scaled_reward": 0.014520692639052868,
"rewards/format_reward": 0.8125000149011612,
"step": 484
},
{
"completion_length": 2130.3958740234375,
"epoch": 0.5542857142857143,
"grad_norm": 2.06632924079895,
"kl": 0.285888671875,
"learning_rate": 1.0246514708427701e-07,
"loss": -0.0265,
"reward": 0.9047174965962768,
"reward_std": 0.7558673620223999,
"rewards/cosine_scaled_reward": 0.014858738519251347,
"rewards/format_reward": 0.8750000149011612,
"step": 485
},
{
"completion_length": 1874.6458702087402,
"epoch": 0.5554285714285714,
"grad_norm": 0.5376132130622864,
"kl": 0.2660408020019531,
"learning_rate": 1.0214767000817596e-07,
"loss": 0.0535,
"reward": 0.7522924607619643,
"reward_std": 0.5480217132717371,
"rewards/cosine_scaled_reward": -0.01968712778761983,
"rewards/format_reward": 0.7916666772216558,
"step": 486
},
{
"completion_length": 1944.1041870117188,
"epoch": 0.5565714285714286,
"grad_norm": 0.6722615361213684,
"kl": 0.39788818359375,
"learning_rate": 1.0185202062281336e-07,
"loss": 0.0286,
"reward": 1.0850566178560257,
"reward_std": 0.8383910320699215,
"rewards/cosine_scaled_reward": 0.15711162728257477,
"rewards/format_reward": 0.7708333432674408,
"step": 487
},
{
"completion_length": 2214.08341217041,
"epoch": 0.5577142857142857,
"grad_norm": 0.5369753241539001,
"kl": 0.4105224609375,
"learning_rate": 1.0157821333772304e-07,
"loss": 0.0299,
"reward": 0.42558533302508295,
"reward_std": 0.6608752533793449,
"rewards/cosine_scaled_reward": -0.15179067384451628,
"rewards/format_reward": 0.7291666902601719,
"step": 488
},
{
"completion_length": 3057.2084045410156,
"epoch": 0.5588571428571428,
"grad_norm": 0.41021808981895447,
"kl": 0.56591796875,
"learning_rate": 1.013262614978859e-07,
"loss": 0.0893,
"reward": -0.06845853175036609,
"reward_std": 0.5956653170287609,
"rewards/cosine_scaled_reward": -0.24256260506808758,
"rewards/format_reward": 0.41666667349636555,
"step": 489
},
{
"completion_length": 2016.6667098999023,
"epoch": 0.56,
"grad_norm": 1.8614702224731445,
"kl": 0.3179168701171875,
"learning_rate": 1.0109617738307911e-07,
"loss": -0.0125,
"reward": 0.6355590866878629,
"reward_std": 0.6039589680731297,
"rewards/cosine_scaled_reward": -0.09888713248074055,
"rewards/format_reward": 0.8333333395421505,
"step": 490
},
{
"completion_length": 2568.8333740234375,
"epoch": 0.5611428571428572,
"grad_norm": 0.7798163294792175,
"kl": 0.378570556640625,
"learning_rate": 1.0088797220727779e-07,
"loss": 0.0627,
"reward": 0.9301005639135838,
"reward_std": 1.0789660066366196,
"rewards/cosine_scaled_reward": 0.1108836093917489,
"rewards/format_reward": 0.7083333544433117,
"step": 491
},
{
"completion_length": 2040.7500839233398,
"epoch": 0.5622857142857143,
"grad_norm": 0.8784842491149902,
"kl": 0.300018310546875,
"learning_rate": 1.0070165611810855e-07,
"loss": -0.0041,
"reward": 1.00854002404958,
"reward_std": 0.7161477841436863,
"rewards/cosine_scaled_reward": 0.11885332595556974,
"rewards/format_reward": 0.7708333414047956,
"step": 492
},
{
"completion_length": 2196.104232788086,
"epoch": 0.5634285714285714,
"grad_norm": 0.9897964000701904,
"kl": 0.3065032958984375,
"learning_rate": 1.005372381963547e-07,
"loss": 0.0473,
"reward": 1.1381681943312287,
"reward_std": 0.8435879498720169,
"rewards/cosine_scaled_reward": 0.15241740830242634,
"rewards/format_reward": 0.8333333469927311,
"step": 493
},
{
"completion_length": 1807.5208740234375,
"epoch": 0.5645714285714286,
"grad_norm": 1.8549937009811401,
"kl": 0.2334747314453125,
"learning_rate": 1.0039472645551372e-07,
"loss": 0.0891,
"reward": 0.9310838505625725,
"reward_std": 0.9970204196870327,
"rewards/cosine_scaled_reward": 0.059291912242770195,
"rewards/format_reward": 0.8125000186264515,
"step": 494
},
{
"completion_length": 2438.5209045410156,
"epoch": 0.5657142857142857,
"grad_norm": 1.1024754047393799,
"kl": 0.4427490234375,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0862,
"reward": 0.40903129265643656,
"reward_std": 0.855750784277916,
"rewards/cosine_scaled_reward": -0.08715102472342551,
"rewards/format_reward": 0.583333345130086,
"step": 495
},
{
"completion_length": 1854.2500686645508,
"epoch": 0.5668571428571428,
"grad_norm": 1.7624000310897827,
"kl": 0.282470703125,
"learning_rate": 1.0017544823184055e-07,
"loss": -0.0104,
"reward": 0.7299946136772633,
"reward_std": 0.6787788085639477,
"rewards/cosine_scaled_reward": -0.020419366657733917,
"rewards/format_reward": 0.7708333376795053,
"step": 496
},
{
"completion_length": 1980.7708587646484,
"epoch": 0.568,
"grad_norm": 1.077419638633728,
"kl": 0.1779632568359375,
"learning_rate": 1.0009869243631952e-07,
"loss": 0.0434,
"reward": 1.1050571464002132,
"reward_std": 0.8892782032489777,
"rewards/cosine_scaled_reward": 0.13586187362670898,
"rewards/format_reward": 0.8333333395421505,
"step": 497
},
{
"completion_length": 2287.5834045410156,
"epoch": 0.5691428571428572,
"grad_norm": 0.9310129284858704,
"kl": 0.35736083984375,
"learning_rate": 1.000438641958131e-07,
"loss": 0.0329,
"reward": 0.47706713899970055,
"reward_std": 0.7550015151500702,
"rewards/cosine_scaled_reward": -0.12604976166039705,
"rewards/format_reward": 0.7291666828095913,
"step": 498
},
{
"completion_length": 2113.479248046875,
"epoch": 0.5702857142857143,
"grad_norm": 1.2932276725769043,
"kl": 0.3569793701171875,
"learning_rate": 1.0001096618257236e-07,
"loss": 0.0673,
"reward": 0.7009947504848242,
"reward_std": 0.9477885626256466,
"rewards/cosine_scaled_reward": -0.0036692821886390448,
"rewards/format_reward": 0.7083333544433117,
"step": 499
},
{
"completion_length": 2337.5833740234375,
"epoch": 0.5714285714285714,
"grad_norm": 0.5998778939247131,
"kl": 0.4022216796875,
"learning_rate": 1e-07,
"loss": 0.0495,
"reward": 0.46617759205400944,
"reward_std": 0.6925142146646976,
"rewards/cosine_scaled_reward": -0.06899452651850879,
"rewards/format_reward": 0.604166679084301,
"step": 500
},
{
"epoch": 0.5714285714285714,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.04010716926510213,
"train_runtime": 69296.9973,
"train_samples_per_second": 0.346,
"train_steps_per_second": 0.007
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}