{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21488047273704003, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 0.7574486248195171, "advantage_mean": 3.1044091186593903e-09, "advantage_min": -0.35297736525535583, "advantage_std": 0.4102070014923811, "completion_length": 3253.0, "epoch": 0.00042976094547408005, "grad_norm": 0.05558167025446892, "kl": 0.0, "lambda_div_used": 0.7000000000000001, "learning_rate": 2e-08, "loss": 0.0052, "reward": -0.4359366223216057, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4359366223216057, "reward_after_std": 0.41020700335502625, "reward_before_mean": -0.2934060515835881, "reward_before_std": 0.3901851810514927, "reward_change_max": 0.00023843348026275635, "reward_change_mean": -0.14253058983013034, "reward_change_min": -0.25587797723710537, "reward_change_std": 0.09850339544937015, "reward_std": 0.41020701453089714, "rewards/cosine_scaled_reward": -0.21961969044059515, "rewards/format_reward": 0.14583333395421505, "step": 1 }, { "advantage_max": 1.4106307551264763, "advantage_mean": -6.208817904251873e-09, "advantage_min": -0.676025751978159, "advantage_std": 0.8005286566913128, "completion_length": 3107.5625610351562, "epoch": 0.0008595218909481601, "grad_norm": 0.12468738853931427, "kl": 0.0, "lambda_div_used": 0.7000000000000001, "learning_rate": 4e-08, "loss": 0.0347, "reward": -0.16470268240664154, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16470268240664154, "reward_after_std": 0.8005286566913128, "reward_before_mean": 0.02774460567161441, "reward_before_std": 0.8293319866061211, "reward_change_max": 0.0011471733450889587, "reward_change_mean": -0.19244728982448578, "reward_change_min": -0.4903235826641321, "reward_change_std": 0.18530499562621117, "reward_std": 0.8005287051200867, "rewards/cosine_scaled_reward": -0.15279437159188092, "rewards/format_reward": 0.33333333767950535, "step": 2 }, { "advantage_max": 0.7690494544804096, "advantage_mean": 1.893689316467828e-08, "advantage_min": -0.40768903121352196, "advantage_std": 0.4341406896710396, "completion_length": 3255.3541870117188, "epoch": 0.0012892828364222402, "grad_norm": 0.08482886850833893, "kl": 6.273388862609863e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6e-08, "loss": 0.0207, "reward": -0.4453692873939872, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4453692873939872, "reward_after_std": 0.4341406933963299, "reward_before_mean": -0.3086634774808772, "reward_before_std": 0.43394630029797554, "reward_change_max": 0.0007981359958648682, "reward_change_mean": -0.1367058209143579, "reward_change_min": -0.2732925359159708, "reward_change_std": 0.11301766941323876, "reward_std": 0.4341407082974911, "rewards/cosine_scaled_reward": -0.27933173812925816, "rewards/format_reward": 0.25000000558793545, "step": 3 }, { "advantage_max": 0.6063321307301521, "advantage_mean": 3.166496842510469e-08, "advantage_min": -0.38830479234457016, "advantage_std": 0.35908698476850986, "completion_length": 3582.8333435058594, "epoch": 0.0017190437818963202, "grad_norm": 0.0665796622633934, "kl": 5.093961954116821e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8e-08, "loss": 0.0, "reward": -0.5265545975416899, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5265545975416899, "reward_after_std": 0.359086986631155, "reward_before_mean": -0.4067907880526036, "reward_before_std": 0.3715064935386181, "reward_change_max": 0.0, "reward_change_mean": -0.11976379062980413, "reward_change_min": -0.25006793811917305, "reward_change_std": 0.10275549255311489, "reward_std": 0.3590869978070259, "rewards/cosine_scaled_reward": -0.22422873228788376, "rewards/format_reward": 0.0416666679084301, "step": 4 }, { "advantage_max": 0.9123956672847271, "advantage_mean": -3.104408841103634e-09, "advantage_min": -0.5983563214540482, "advantage_std": 0.5378642901778221, "completion_length": 2683.125015258789, "epoch": 0.0021488047273704003, "grad_norm": 0.0683504268527031, "kl": 3.984197974205017e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-07, "loss": 0.0194, "reward": 0.11995275318622589, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11995275318622589, "reward_after_std": 0.5378642939031124, "reward_before_mean": 0.4483791949460283, "reward_before_std": 0.506984006613493, "reward_change_max": 0.0, "reward_change_mean": -0.328426456078887, "reward_change_min": -0.5342350415885448, "reward_change_std": 0.2112575932405889, "reward_std": 0.5378643125295639, "rewards/cosine_scaled_reward": -0.025810403749346733, "rewards/format_reward": 0.5, "step": 5 }, { "advantage_max": 1.0520896948873997, "advantage_mean": 1.0554989438027462e-08, "advantage_min": -0.8828505203127861, "advantage_std": 0.6766230743378401, "completion_length": 3079.1666870117188, "epoch": 0.0025785656728444803, "grad_norm": 0.12584571540355682, "kl": 4.413723945617676e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2e-07, "loss": 0.0642, "reward": 0.03475750982761383, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03475750982761383, "reward_after_std": 0.6766230780631304, "reward_before_mean": 0.3235095057170838, "reward_before_std": 0.7230437900871038, "reward_change_max": 8.161365985870361e-05, "reward_change_mean": -0.28875201754271984, "reward_change_min": -0.5174895022064447, "reward_change_std": 0.22692573769018054, "reward_std": 0.6766230892390013, "rewards/cosine_scaled_reward": -0.015328572771977633, "rewards/format_reward": 0.354166679084301, "step": 6 }, { "advantage_max": 0.6782091520726681, "advantage_mean": 2.3283064559675992e-08, "advantage_min": -0.4198353700339794, "advantage_std": 0.3897525705397129, "completion_length": 3219.5833740234375, "epoch": 0.0030083266183185604, "grad_norm": 0.07295465469360352, "kl": 2.533942461013794e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4e-07, "loss": 0.0402, "reward": -0.3215319234877825, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3215319234877825, "reward_after_std": 0.3897525705397129, "reward_before_mean": -0.13357361126691103, "reward_before_std": 0.37388426065444946, "reward_change_max": 0.0, "reward_change_mean": -0.18795829452574253, "reward_change_min": -0.33705861680209637, "reward_change_std": 0.12576818279922009, "reward_std": 0.38975257985293865, "rewards/cosine_scaled_reward": -0.18137014657258987, "rewards/format_reward": 0.2291666716337204, "step": 7 }, { "advantage_max": 0.7355631701648235, "advantage_mean": 1.8005570368018198e-08, "advantage_min": -0.5340957194566727, "advantage_std": 0.48499002680182457, "completion_length": 3569.2916870117188, "epoch": 0.0034380875637926404, "grad_norm": 0.08784355223178864, "kl": 3.435462713241577e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6e-07, "loss": 0.0094, "reward": -0.3904084414243698, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3904084414243698, "reward_after_std": 0.48499003052711487, "reward_before_mean": -0.2309352532029152, "reward_before_std": 0.5427466109395027, "reward_change_max": 0.00025159120559692383, "reward_change_mean": -0.15947317611426115, "reward_change_min": -0.3716627322137356, "reward_change_std": 0.15931534208357334, "reward_std": 0.48499004915356636, "rewards/cosine_scaled_reward": -0.1467176303267479, "rewards/format_reward": 0.0625, "step": 8 }, { "advantage_max": 0.8696979656815529, "advantage_mean": -1.3038516710750514e-08, "advantage_min": -0.5371990613639355, "advantage_std": 0.5132522564381361, "completion_length": 2767.062515258789, "epoch": 0.0038678485092667205, "grad_norm": 0.08149543404579163, "kl": 2.8088688850402832e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8e-07, "loss": 0.0094, "reward": 0.001597665250301361, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.001597665250301361, "reward_after_std": 0.5132522620260715, "reward_before_mean": 0.29100372828543186, "reward_before_std": 0.4940206576138735, "reward_change_max": 0.0009395778179168701, "reward_change_mean": -0.2894060639664531, "reward_change_min": -0.4880948457866907, "reward_change_std": 0.1963454270735383, "reward_std": 0.5132522694766521, "rewards/cosine_scaled_reward": -0.02116481587290764, "rewards/format_reward": 0.33333333395421505, "step": 9 }, { "advantage_max": 1.1381889209151268, "advantage_mean": 5.5879357807597785e-09, "advantage_min": -0.5834753960371017, "advantage_std": 0.622060913592577, "completion_length": 3227.000045776367, "epoch": 0.0042976094547408005, "grad_norm": 0.09776145219802856, "kl": 3.33636999130249e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2e-07, "loss": 0.0212, "reward": -0.32379621523432434, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32379621523432434, "reward_after_std": 0.622060913592577, "reward_before_mean": -0.17128651589155197, "reward_before_std": 0.6215133480727673, "reward_change_max": 0.0006311312317848206, "reward_change_mean": -0.15250970982015133, "reward_change_min": -0.2990625947713852, "reward_change_std": 0.12602092139422894, "reward_std": 0.6220609359443188, "rewards/cosine_scaled_reward": -0.20022658235393465, "rewards/format_reward": 0.22916667349636555, "step": 10 }, { "advantage_max": 0.995155330747366, "advantage_mean": 2.4214387939203164e-08, "advantage_min": -0.5060127899050713, "advantage_std": 0.5420315228402615, "completion_length": 3058.3125762939453, "epoch": 0.00472737040021488, "grad_norm": 0.12377487868070602, "kl": 5.3495168685913086e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1999999999999998e-07, "loss": 0.0767, "reward": -0.37695033056661487, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.37695033056661487, "reward_after_std": 0.5420315228402615, "reward_before_mean": -0.23300920613110065, "reward_before_std": 0.5371159501373768, "reward_change_max": 0.001499362289905548, "reward_change_mean": -0.14394111139699817, "reward_change_min": -0.29964184388518333, "reward_change_std": 0.12228569574654102, "reward_std": 0.5420315451920033, "rewards/cosine_scaled_reward": -0.24150461703538895, "rewards/format_reward": 0.25000000558793545, "step": 11 }, { "advantage_max": 1.07436428591609, "advantage_mean": 8.6923440667519e-09, "advantage_min": -0.5780129581689835, "advantage_std": 0.6174972727894783, "completion_length": 3343.7291870117188, "epoch": 0.005157131345688961, "grad_norm": 0.13393187522888184, "kl": 4.7847628593444824e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4e-07, "loss": 0.0433, "reward": -0.364647981710732, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.364647981710732, "reward_after_std": 0.6174972988665104, "reward_before_mean": -0.22011579759418964, "reward_before_std": 0.644727323204279, "reward_change_max": 0.0012020617723464966, "reward_change_mean": -0.1445321929641068, "reward_change_min": -0.35754341073334217, "reward_change_std": 0.14418305456638336, "reward_std": 0.6174973249435425, "rewards/cosine_scaled_reward": -0.19339124485850334, "rewards/format_reward": 0.16666667349636555, "step": 12 }, { "advantage_max": 0.9926727637648582, "advantage_mean": 6.208814018471287e-10, "advantage_min": -0.5157452747225761, "advantage_std": 0.551893811672926, "completion_length": 3349.229217529297, "epoch": 0.00558689229116304, "grad_norm": 0.09721053391695023, "kl": 3.769993782043457e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6e-07, "loss": 0.0181, "reward": -0.15828805044293404, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15828805044293404, "reward_after_std": 0.551893800497055, "reward_before_mean": 0.06458748131990433, "reward_before_std": 0.5266094282269478, "reward_change_max": 0.0006849393248558044, "reward_change_mean": -0.22287553269416094, "reward_change_min": -0.4139387719333172, "reward_change_std": 0.1647694082930684, "reward_std": 0.5518938079476357, "rewards/cosine_scaled_reward": -0.11353961005806923, "rewards/format_reward": 0.29166666977107525, "step": 13 }, { "advantage_max": 0.7466056384146214, "advantage_mean": 3.104408341503273e-09, "advantage_min": -0.48210836946964264, "advantage_std": 0.4744464196264744, "completion_length": 3512.5416870117188, "epoch": 0.006016653236637121, "grad_norm": 0.09139310568571091, "kl": 5.5089592933654785e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8e-07, "loss": 0.0362, "reward": -0.47086888551712036, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.47086888551712036, "reward_after_std": 0.474446427077055, "reward_before_mean": -0.3409730903804302, "reward_before_std": 0.5245035365223885, "reward_change_max": 0.0008458793163299561, "reward_change_mean": -0.129895796533674, "reward_change_min": -0.3196791186928749, "reward_change_std": 0.14185051806271076, "reward_std": 0.4744464345276356, "rewards/cosine_scaled_reward": -0.21215321496129036, "rewards/format_reward": 0.08333333395421505, "step": 14 }, { "advantage_max": 0.8289473205804825, "advantage_mean": -5.551115123125783e-17, "advantage_min": -0.6448510959744453, "advantage_std": 0.5116193145513535, "completion_length": 3409.4583740234375, "epoch": 0.0064464141821112, "grad_norm": 0.10181403160095215, "kl": 3.750622272491455e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3e-07, "loss": 0.024, "reward": -0.26276327297091484, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.26276327297091484, "reward_after_std": 0.5116192996501923, "reward_before_mean": -0.06355440150946379, "reward_before_std": 0.5410188250243664, "reward_change_max": 0.0006737709045410156, "reward_change_mean": -0.19920885050669312, "reward_change_min": -0.380127627402544, "reward_change_std": 0.15808111242949963, "reward_std": 0.5116193071007729, "rewards/cosine_scaled_reward": -0.13594387844204903, "rewards/format_reward": 0.2083333395421505, "step": 15 }, { "advantage_max": 0.9558442495763302, "advantage_mean": 2.1730860444435507e-08, "advantage_min": -0.610591858625412, "advantage_std": 0.6121120825409889, "completion_length": 3517.8958740234375, "epoch": 0.006876175127585281, "grad_norm": 0.132332906126976, "kl": 4.704296588897705e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.2e-07, "loss": 0.0094, "reward": -0.3299556262791157, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3299556262791157, "reward_after_std": 0.6121120825409889, "reward_before_mean": -0.16464661434292793, "reward_before_std": 0.6840539453551173, "reward_change_max": 0.0007035061717033386, "reward_change_mean": -0.1653090133331716, "reward_change_min": -0.43854134529829025, "reward_change_std": 0.18664807290770113, "reward_std": 0.6121120862662792, "rewards/cosine_scaled_reward": -0.15523997228592634, "rewards/format_reward": 0.1458333358168602, "step": 16 }, { "advantage_max": 0.6369807757437229, "advantage_mean": 1.9247333282734758e-08, "advantage_min": -0.39003145322203636, "advantage_std": 0.377358241006732, "completion_length": 2917.1041946411133, "epoch": 0.0073059360730593605, "grad_norm": 0.06940352916717529, "kl": 4.088878631591797e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4000000000000003e-07, "loss": 0.0293, "reward": -0.22959079654538073, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22959079654538073, "reward_after_std": 0.377358241006732, "reward_before_mean": -0.005572929047048092, "reward_before_std": 0.3452690658159554, "reward_change_max": 0.0, "reward_change_mean": -0.22401784686371684, "reward_change_min": -0.3686513267457485, "reward_change_std": 0.14894617535173893, "reward_std": 0.3773582596331835, "rewards/cosine_scaled_reward": -0.11736980685964227, "rewards/format_reward": 0.22916666977107525, "step": 17 }, { "advantage_max": 0.7173232361674309, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.37687674164772034, "advantage_std": 0.4082351066172123, "completion_length": 3417.1041870117188, "epoch": 0.007735697018533441, "grad_norm": 0.11753091961145401, "kl": 5.3554773330688477e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6e-07, "loss": 0.0368, "reward": -0.46231960505247116, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.46231960505247116, "reward_after_std": 0.40823510475456715, "reward_before_mean": -0.32891321554780006, "reward_before_std": 0.40890152007341385, "reward_change_max": 0.0006503984332084656, "reward_change_mean": -0.13340640487149358, "reward_change_min": -0.2734087072312832, "reward_change_std": 0.10482003539800644, "reward_std": 0.40823512710630894, "rewards/cosine_scaled_reward": -0.20612327102571726, "rewards/format_reward": 0.0833333358168602, "step": 18 }, { "advantage_max": 0.7875667475163937, "advantage_mean": 3.1044081749698194e-09, "advantage_min": -0.42941123247146606, "advantage_std": 0.4503257982432842, "completion_length": 3514.5833740234375, "epoch": 0.00816545796400752, "grad_norm": 0.1084132269024849, "kl": 5.62034547328949e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.7999999999999996e-07, "loss": 0.0089, "reward": -0.5272988877259195, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5272988877259195, "reward_after_std": 0.4503257963806391, "reward_before_mean": -0.4207569342106581, "reward_before_std": 0.471115505322814, "reward_change_max": 0.0019148662686347961, "reward_change_mean": -0.10654198052361608, "reward_change_min": -0.2554769217967987, "reward_change_std": 0.10843606432899833, "reward_std": 0.4503258019685745, "rewards/cosine_scaled_reward": -0.25204512663185596, "rewards/format_reward": 0.0833333358168602, "step": 19 }, { "advantage_max": 1.1536703146994114, "advantage_mean": 2.980232322036258e-08, "advantage_min": -0.7997937351465225, "advantage_std": 0.7067597024142742, "completion_length": 3003.104202270508, "epoch": 0.008595218909481601, "grad_norm": 0.12452682852745056, "kl": 3.5081058740615845e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4e-07, "loss": 0.0563, "reward": -0.030954900197684765, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.030954900197684765, "reward_after_std": 0.7067596893757582, "reward_before_mean": 0.2262389063835144, "reward_before_std": 0.7488967813551426, "reward_change_max": 0.0011496245861053467, "reward_change_mean": -0.2571937805041671, "reward_change_min": -0.49643432907760143, "reward_change_std": 0.20574933383613825, "reward_std": 0.7067597080022097, "rewards/cosine_scaled_reward": -0.032713882625103, "rewards/format_reward": 0.29166667349636555, "step": 20 }, { "advantage_max": 0.5693751126527786, "advantage_mean": -6.829698695476338e-09, "advantage_min": -0.4755325987935066, "advantage_std": 0.3604725245386362, "completion_length": 3232.0625, "epoch": 0.009024979854955682, "grad_norm": 0.05251428112387657, "kl": 3.382936120033264e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1999999999999995e-07, "loss": 0.0105, "reward": -0.32258956134319305, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32258956134319305, "reward_after_std": 0.3604725282639265, "reward_before_mean": -0.126388905569911, "reward_before_std": 0.37894536554813385, "reward_change_max": 0.0, "reward_change_mean": -0.19620067533105612, "reward_change_min": -0.33256699331104755, "reward_change_std": 0.13785515492781997, "reward_std": 0.36047253385186195, "rewards/cosine_scaled_reward": -0.13611112721264362, "rewards/format_reward": 0.14583333395421505, "step": 21 }, { "advantage_max": 1.1929019466042519, "advantage_mean": 1.3038516377683607e-08, "advantage_min": -0.5564581416547298, "advantage_std": 0.6388566717505455, "completion_length": 3359.437530517578, "epoch": 0.00945474080042976, "grad_norm": 0.1303045004606247, "kl": 3.71783971786499e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3999999999999997e-07, "loss": 0.048, "reward": -0.36082927137613297, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.36082927137613297, "reward_after_std": 0.6388566717505455, "reward_before_mean": -0.22351342299953103, "reward_before_std": 0.6339729223400354, "reward_change_max": 0.0010377168655395508, "reward_change_mean": -0.13731584697961807, "reward_change_min": -0.2884688079357147, "reward_change_std": 0.11948961112648249, "reward_std": 0.6388566941022873, "rewards/cosine_scaled_reward": -0.18467338103801012, "rewards/format_reward": 0.1458333395421505, "step": 22 }, { "advantage_max": 0.7958515286445618, "advantage_mean": 3.352761412944716e-08, "advantage_min": -0.37989144027233124, "advantage_std": 0.4433410055935383, "completion_length": 3292.5833435058594, "epoch": 0.00988450174590384, "grad_norm": 0.07386847585439682, "kl": 4.3258070945739746e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.6e-07, "loss": -0.0099, "reward": -0.5137841608375311, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5137841608375311, "reward_after_std": 0.443341001868248, "reward_before_mean": -0.40300147235393524, "reward_before_std": 0.4493147246539593, "reward_change_max": 0.0009110346436500549, "reward_change_mean": -0.11078267311677337, "reward_change_min": -0.2510601822286844, "reward_change_std": 0.09955019829794765, "reward_std": 0.4433410167694092, "rewards/cosine_scaled_reward": -0.27441740967333317, "rewards/format_reward": 0.14583333395421505, "step": 23 }, { "advantage_max": 0.5248563438653946, "advantage_mean": 1.5522043095295146e-08, "advantage_min": -0.38495224714279175, "advantage_std": 0.3390180254355073, "completion_length": 3515.2708435058594, "epoch": 0.010314262691377921, "grad_norm": 0.07022113353013992, "kl": 3.9011240005493164e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.8e-07, "loss": 0.0019, "reward": -0.5779488757252693, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5779488757252693, "reward_after_std": 0.3390180291607976, "reward_before_mean": -0.4703486163634807, "reward_before_std": 0.3764043478295207, "reward_change_max": 0.0013789311051368713, "reward_change_mean": -0.10760025191120803, "reward_change_min": -0.26089750975370407, "reward_change_std": 0.10835923533886671, "reward_std": 0.33901803381741047, "rewards/cosine_scaled_reward": -0.2768409736454487, "rewards/format_reward": 0.0833333358168602, "step": 24 }, { "advantage_max": 1.204544223845005, "advantage_mean": -6.208817127095756e-09, "advantage_min": -0.5962263382971287, "advantage_std": 0.6774736233055592, "completion_length": 3199.8333740234375, "epoch": 0.010744023636852002, "grad_norm": 0.1414061188697815, "kl": 2.3894011974334717e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5e-07, "loss": 0.042, "reward": -0.11183955147862434, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11183955147862434, "reward_after_std": 0.6774736233055592, "reward_before_mean": 0.1130799688398838, "reward_before_std": 0.672430120408535, "reward_change_max": 0.0007963404059410095, "reward_change_mean": -0.22491954546421766, "reward_change_min": -0.4662415347993374, "reward_change_std": 0.18719838093966246, "reward_std": 0.6774736419320107, "rewards/cosine_scaled_reward": -0.0684600081294775, "rewards/format_reward": 0.25000000186264515, "step": 25 }, { "advantage_max": 1.45574264600873, "advantage_mean": 2.048909719665204e-08, "advantage_min": -0.6033074408769608, "advantage_std": 0.765930850058794, "completion_length": 3139.3958740234375, "epoch": 0.01117378458232608, "grad_norm": 0.11832817643880844, "kl": 1.938454806804657e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.2e-07, "loss": 0.0828, "reward": -0.2817384535446763, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2817384535446763, "reward_after_std": 0.765930887311697, "reward_before_mean": -0.13595028594136238, "reward_before_std": 0.7537354081869125, "reward_change_max": 0.0010430961847305298, "reward_change_mean": -0.14578815456479788, "reward_change_min": -0.32745685055851936, "reward_change_std": 0.1265777386724949, "reward_std": 0.7659309245646, "rewards/cosine_scaled_reward": -0.19297514762729406, "rewards/format_reward": 0.25000000186264515, "step": 26 }, { "advantage_max": 0.6409856155514717, "advantage_mean": 7.450580874479584e-09, "advantage_min": -0.438970223069191, "advantage_std": 0.3909902423620224, "completion_length": 3516.875, "epoch": 0.011603545527800161, "grad_norm": 0.0657980814576149, "kl": 3.580749034881592e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.4e-07, "loss": 0.0077, "reward": -0.4962577372789383, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4962577372789383, "reward_after_std": 0.3909902460873127, "reward_before_mean": -0.3675841437652707, "reward_before_std": 0.4152248688042164, "reward_change_max": 0.0011112913489341736, "reward_change_mean": -0.12867358326911926, "reward_change_min": -0.26869202591478825, "reward_change_std": 0.11499698692932725, "reward_std": 0.3909902572631836, "rewards/cosine_scaled_reward": -0.22545874118804932, "rewards/format_reward": 0.08333333395421505, "step": 27 }, { "advantage_max": 0.7554237805306911, "advantage_mean": 2.2972624802442e-08, "advantage_min": -0.41956276446580887, "advantage_std": 0.42861715145409107, "completion_length": 3369.9166870117188, "epoch": 0.012033306473274242, "grad_norm": 0.08383121341466904, "kl": 4.254281520843506e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.6e-07, "loss": 0.0506, "reward": -0.5621975660324097, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5621975660324097, "reward_after_std": 0.4286171440035105, "reward_before_mean": -0.46600986272096634, "reward_before_std": 0.4460342675447464, "reward_change_max": 0.0016005784273147583, "reward_change_mean": -0.09618769865483046, "reward_change_min": -0.222119377925992, "reward_change_std": 0.0954738324508071, "reward_std": 0.4286171570420265, "rewards/cosine_scaled_reward": -0.2850882653146982, "rewards/format_reward": 0.10416666977107525, "step": 28 }, { "advantage_max": 0.7828796319663525, "advantage_mean": 4.967053768289986e-09, "advantage_min": -0.5586818270385265, "advantage_std": 0.5024765618145466, "completion_length": 3154.1666870117188, "epoch": 0.012463067418748322, "grad_norm": 0.07388980686664581, "kl": 3.124400973320007e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.8e-07, "loss": 0.041, "reward": -0.26803396129980683, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.26803396129980683, "reward_after_std": 0.5024765580892563, "reward_before_mean": -0.06967536732554436, "reward_before_std": 0.5417081043124199, "reward_change_max": 0.0011145249009132385, "reward_change_mean": -0.19835859863087535, "reward_change_min": -0.40712250396609306, "reward_change_std": 0.17049869988113642, "reward_std": 0.5024765841662884, "rewards/cosine_scaled_reward": -0.14942101668566465, "rewards/format_reward": 0.2291666716337204, "step": 29 }, { "advantage_max": 0.623421486467123, "advantage_mean": 2.7318796169684134e-08, "advantage_min": -0.3189329728484154, "advantage_std": 0.3679187763482332, "completion_length": 3315.8333435058594, "epoch": 0.0128928283642224, "grad_norm": 0.06645527482032776, "kl": 5.459785461425781e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6e-07, "loss": 0.009, "reward": -0.5430757235735655, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5430757235735655, "reward_after_std": 0.3679187800735235, "reward_before_mean": -0.429974976927042, "reward_before_std": 0.3803410939872265, "reward_change_max": 0.0008812397718429565, "reward_change_mean": -0.11310075176879764, "reward_change_min": -0.27332986146211624, "reward_change_std": 0.10510111972689629, "reward_std": 0.3679187912493944, "rewards/cosine_scaled_reward": -0.2879041489213705, "rewards/format_reward": 0.1458333395421505, "step": 30 }, { "advantage_max": 1.0361644551157951, "advantage_mean": 3.1044083970144243e-09, "advantage_min": -0.48268650099635124, "advantage_std": 0.584163736552, "completion_length": 3164.1041717529297, "epoch": 0.013322589309696481, "grad_norm": 0.09330141544342041, "kl": 3.378093242645264e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.2e-07, "loss": 0.0497, "reward": -0.3489931561052799, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3489931561052799, "reward_after_std": 0.5841637291014194, "reward_before_mean": -0.19693255051970482, "reward_before_std": 0.6010555103421211, "reward_change_max": 0.0007572099566459656, "reward_change_mean": -0.15206060372292995, "reward_change_min": -0.33258076570928097, "reward_change_std": 0.1285634245723486, "reward_std": 0.5841637328267097, "rewards/cosine_scaled_reward": -0.19221627712249756, "rewards/format_reward": 0.18750000186264515, "step": 31 }, { "advantage_max": 1.0226392820477486, "advantage_mean": -1.6653345369377348e-16, "advantage_min": -0.4906293451786041, "advantage_std": 0.5639783944934607, "completion_length": 3485.750030517578, "epoch": 0.013752350255170562, "grad_norm": 0.1287228912115097, "kl": 3.248453140258789e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.4e-07, "loss": 0.0255, "reward": -0.35614329017698765, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35614329017698765, "reward_after_std": 0.5639783926308155, "reward_before_mean": -0.20582007244229317, "reward_before_std": 0.5658728964626789, "reward_change_max": 0.0009469762444496155, "reward_change_mean": -0.15032322611659765, "reward_change_min": -0.3254125975072384, "reward_change_std": 0.13112161355093122, "reward_std": 0.5639783926308155, "rewards/cosine_scaled_reward": -0.16541003715246916, "rewards/format_reward": 0.12500000186264515, "step": 32 }, { "advantage_max": 1.2033557668328285, "advantage_mean": -8.071462942460528e-09, "advantage_min": -0.7096223160624504, "advantage_std": 0.6830128151923418, "completion_length": 3186.9583740234375, "epoch": 0.014182111200644642, "grad_norm": 0.12961259484291077, "kl": 3.152340650558472e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.6e-07, "loss": 0.0528, "reward": -0.09680512579507194, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09680512579507194, "reward_after_std": 0.6830128226429224, "reward_before_mean": 0.1336783617734909, "reward_before_std": 0.6882087625563145, "reward_change_max": 0.0009331479668617249, "reward_change_mean": -0.23048346117138863, "reward_change_min": -0.4332106411457062, "reward_change_std": 0.17555851209908724, "reward_std": 0.6830128245055676, "rewards/cosine_scaled_reward": -0.09982750163180754, "rewards/format_reward": 0.33333333767950535, "step": 33 }, { "advantage_max": 1.2890900522470474, "advantage_mean": 6.208815683805824e-10, "advantage_min": -0.6820579394698143, "advantage_std": 0.7361489869654179, "completion_length": 3070.312545776367, "epoch": 0.014611872146118721, "grad_norm": 0.11987338215112686, "kl": 3.113411366939545e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.800000000000001e-07, "loss": 0.0621, "reward": -0.16021249815821648, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16021249815821648, "reward_after_std": 0.7361490018665791, "reward_before_mean": 0.041148992255330086, "reward_before_std": 0.7578000836074352, "reward_change_max": 0.0, "reward_change_mean": -0.2013614997267723, "reward_change_min": -0.4576392397284508, "reward_change_std": 0.18230012245476246, "reward_std": 0.7361490055918694, "rewards/cosine_scaled_reward": -0.12525883968919516, "rewards/format_reward": 0.29166667349636555, "step": 34 }, { "advantage_max": 1.1760406605899334, "advantage_mean": -5.587935447692871e-09, "advantage_min": -0.5484105348587036, "advantage_std": 0.6375007666647434, "completion_length": 3316.3541870117188, "epoch": 0.015041633091592801, "grad_norm": 0.0912495106458664, "kl": 3.7550926208496094e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7e-07, "loss": 0.0287, "reward": -0.324324581772089, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.324324581772089, "reward_after_std": 0.6375007666647434, "reward_before_mean": -0.1730878222733736, "reward_before_std": 0.6367185302078724, "reward_change_max": 0.0017997249960899353, "reward_change_mean": -0.151236770208925, "reward_change_min": -0.3389398083090782, "reward_change_std": 0.13052857108414173, "reward_std": 0.6375007703900337, "rewards/cosine_scaled_reward": -0.19071058183908463, "rewards/format_reward": 0.20833334140479565, "step": 35 }, { "advantage_max": 0.8663501963019371, "advantage_mean": 2.2351742123838392e-08, "advantage_min": -0.4297560378909111, "advantage_std": 0.47614113613963127, "completion_length": 3378.9166870117188, "epoch": 0.015471394037066882, "grad_norm": 0.11749003827571869, "kl": 8.411705493927002e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.2e-07, "loss": 0.033, "reward": -0.5068996499758214, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5068996499758214, "reward_after_std": 0.47614115476608276, "reward_before_mean": -0.3981145448051393, "reward_before_std": 0.48368828371167183, "reward_change_max": 0.0005510523915290833, "reward_change_mean": -0.10878509283065796, "reward_change_min": -0.23767412453889847, "reward_change_std": 0.09782943641766906, "reward_std": 0.47614115849137306, "rewards/cosine_scaled_reward": -0.29280728101730347, "rewards/format_reward": 0.18750000558793545, "step": 36 }, { "advantage_max": 1.2931434661149979, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.5958400443196297, "advantage_std": 0.6976716183125973, "completion_length": 3322.8958435058594, "epoch": 0.015901154982540962, "grad_norm": 0.11367114633321762, "kl": 0.00011722790077328682, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.4e-07, "loss": 0.0218, "reward": -0.3415362127125263, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3415362127125263, "reward_after_std": 0.6976716332137585, "reward_before_mean": -0.20584865659475327, "reward_before_std": 0.7006500288844109, "reward_change_max": 0.0013926699757575989, "reward_change_mean": -0.13568756263703108, "reward_change_min": -0.305011423304677, "reward_change_std": 0.1276150057092309, "reward_std": 0.6976716406643391, "rewards/cosine_scaled_reward": -0.18625766286277212, "rewards/format_reward": 0.16666667349636555, "step": 37 }, { "advantage_max": 0.7593724019825459, "advantage_mean": 6.208821234920947e-10, "advantage_min": -0.4482211396098137, "advantage_std": 0.44886738434433937, "completion_length": 2895.437511444092, "epoch": 0.01633091592801504, "grad_norm": 0.06101670116186142, "kl": 9.188055992126465e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.599999999999999e-07, "loss": 0.0125, "reward": -0.3825923502445221, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3825923502445221, "reward_after_std": 0.44886739552021027, "reward_before_mean": -0.22192330658435822, "reward_before_std": 0.466464851051569, "reward_change_max": 0.0014516115188598633, "reward_change_mean": -0.16066907439380884, "reward_change_min": -0.34917990677058697, "reward_change_std": 0.1354043777100742, "reward_std": 0.44886742159724236, "rewards/cosine_scaled_reward": -0.22554498445242643, "rewards/format_reward": 0.2291666716337204, "step": 38 }, { "advantage_max": 1.1070952340960503, "advantage_mean": 3.1044087300813317e-09, "advantage_min": -0.663080383092165, "advantage_std": 0.6673881653696299, "completion_length": 3136.541702270508, "epoch": 0.01676067687348912, "grad_norm": 0.14436458051204681, "kl": 0.00011908076703548431, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.799999999999999e-07, "loss": 0.0607, "reward": -0.19684172049164772, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19684172049164772, "reward_after_std": 0.6673881784081459, "reward_before_mean": 0.0038967072032392025, "reward_before_std": 0.708386916667223, "reward_change_max": 0.0008318871259689331, "reward_change_mean": -0.20073845144361258, "reward_change_min": -0.4534219093620777, "reward_change_std": 0.1835086210630834, "reward_std": 0.6673881858587265, "rewards/cosine_scaled_reward": -0.13346831314265728, "rewards/format_reward": 0.27083333767950535, "step": 39 }, { "advantage_max": 0.8875597789883614, "advantage_mean": -1.8626450382086546e-09, "advantage_min": -0.3722330071032047, "advantage_std": 0.4764030687510967, "completion_length": 3528.6458435058594, "epoch": 0.017190437818963202, "grad_norm": 0.11511418223381042, "kl": 9.122490882873535e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8e-07, "loss": 0.0209, "reward": -0.5411857590079308, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5411857590079308, "reward_after_std": 0.4764030687510967, "reward_before_mean": -0.4462329354137182, "reward_before_std": 0.47696023248136044, "reward_change_max": 0.0007775053381919861, "reward_change_mean": -0.09495283849537373, "reward_change_min": -0.21666884794831276, "reward_change_std": 0.08561577135697007, "reward_std": 0.4764030687510967, "rewards/cosine_scaled_reward": -0.24394980631768703, "rewards/format_reward": 0.0416666679084301, "step": 40 }, { "advantage_max": 0.7697790078818798, "advantage_mean": 1.2417635142369932e-08, "advantage_min": -0.4808138310909271, "advantage_std": 0.4799698516726494, "completion_length": 3521.2291870117188, "epoch": 0.01762019876443728, "grad_norm": 0.09071311354637146, "kl": 0.00017404090613126755, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.199999999999999e-07, "loss": 0.02, "reward": -0.4482840746641159, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4482840746641159, "reward_after_std": 0.4799698479473591, "reward_before_mean": -0.31189268454909325, "reward_before_std": 0.5262452885508537, "reward_change_max": 0.001023031771183014, "reward_change_mean": -0.13639137987047434, "reward_change_min": -0.3571072407066822, "reward_change_std": 0.14451579190790653, "reward_std": 0.4799698702991009, "rewards/cosine_scaled_reward": -0.1871963464654982, "rewards/format_reward": 0.06250000186264515, "step": 41 }, { "advantage_max": 0.9414204210042953, "advantage_mean": -2.607703258883376e-08, "advantage_min": -0.866959273815155, "advantage_std": 0.6225488074123859, "completion_length": 2877.5417098999023, "epoch": 0.018049959709911363, "grad_norm": 0.09929980337619781, "kl": 7.925182580947876e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.399999999999999e-07, "loss": 0.0355, "reward": 0.4646645858883858, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4646645858883858, "reward_after_std": 0.6225487906485796, "reward_before_mean": 0.9152079857885838, "reward_before_std": 0.6252114064991474, "reward_change_max": 0.0009936243295669556, "reward_change_mean": -0.45054343715310097, "reward_change_min": -0.6904154606163502, "reward_change_std": 0.2915335167199373, "reward_std": 0.6225488092750311, "rewards/cosine_scaled_reward": 0.22843733243644238, "rewards/format_reward": 0.4583333432674408, "step": 42 }, { "advantage_max": 0.6760650016367435, "advantage_mean": 2.421438738409165e-08, "advantage_min": -0.3766053691506386, "advantage_std": 0.39602072164416313, "completion_length": 3523.3333740234375, "epoch": 0.018479720655385442, "grad_norm": 0.0634540393948555, "kl": 8.285604417324066e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.599999999999999e-07, "loss": 0.0018, "reward": -0.6100408025085926, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.6100408025085926, "reward_after_std": 0.39602071419358253, "reward_before_mean": -0.5248913671821356, "reward_before_std": 0.42384987510740757, "reward_change_max": 0.002795107662677765, "reward_change_mean": -0.0851494213566184, "reward_change_min": -0.24643570557236671, "reward_change_std": 0.10080668283626437, "reward_std": 0.3960207272320986, "rewards/cosine_scaled_reward": -0.31452902033925056, "rewards/format_reward": 0.1041666679084301, "step": 43 }, { "advantage_max": 0.8805547580122948, "advantage_mean": 1.9868215961338365e-08, "advantage_min": -0.573288805782795, "advantage_std": 0.5421565938740969, "completion_length": 3384.8958435058594, "epoch": 0.01890948160085952, "grad_norm": 0.10884402692317963, "kl": 0.00017033517360687256, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.799999999999999e-07, "loss": 0.0204, "reward": -0.22253214567899704, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.22253214567899704, "reward_after_std": 0.5421565975993872, "reward_before_mean": -0.013217732310295105, "reward_before_std": 0.5633676610887051, "reward_change_max": 0.0025639981031417847, "reward_change_mean": -0.20931439334526658, "reward_change_min": -0.41567404940724373, "reward_change_std": 0.1773575865663588, "reward_std": 0.542156632989645, "rewards/cosine_scaled_reward": -0.10035887360572815, "rewards/format_reward": 0.1875000074505806, "step": 44 }, { "advantage_max": 1.061951208859682, "advantage_mean": 8.071462498371318e-09, "advantage_min": -0.682291891425848, "advantage_std": 0.6715851351618767, "completion_length": 3493.9791870117188, "epoch": 0.019339242546333603, "grad_norm": 0.10313650220632553, "kl": 3.740284591913223e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9e-07, "loss": 0.0296, "reward": -0.2209322745911777, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2209322745911777, "reward_after_std": 0.6715851351618767, "reward_before_mean": -0.02393370494246483, "reward_before_std": 0.7380391471087933, "reward_change_max": 0.0004358366131782532, "reward_change_mean": -0.19699857477098703, "reward_change_min": -0.49178143963217735, "reward_change_std": 0.203695354051888, "reward_std": 0.6715851612389088, "rewards/cosine_scaled_reward": -0.11613352037966251, "rewards/format_reward": 0.20833333767950535, "step": 45 }, { "advantage_max": 1.3110761158168316, "advantage_mean": 1.7384688300037254e-08, "advantage_min": -0.5931250527501106, "advantage_std": 0.7154916040599346, "completion_length": 3429.2083740234375, "epoch": 0.01976900349180768, "grad_norm": 0.12837538123130798, "kl": 3.78552358597517e-05, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.2e-07, "loss": 0.0257, "reward": -0.3574729685788043, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3574729685788043, "reward_after_std": 0.715491633862257, "reward_before_mean": -0.22764497064054012, "reward_before_std": 0.7340891137719154, "reward_change_max": 0.0007813572883605957, "reward_change_mean": -0.12982799299061298, "reward_change_min": -0.3038519024848938, "reward_change_std": 0.1273790579289198, "reward_std": 0.7154916599392891, "rewards/cosine_scaled_reward": -0.19715582148637623, "rewards/format_reward": 0.16666666977107525, "step": 46 }, { "advantage_max": 0.9200848415493965, "advantage_mean": 9.934107647602275e-09, "advantage_min": -0.6299811191856861, "advantage_std": 0.5796599294990301, "completion_length": 3423.0416870117188, "epoch": 0.02019876443728176, "grad_norm": 0.11290145665407181, "kl": 0.0002300366759300232, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.399999999999999e-07, "loss": 0.0133, "reward": -0.23545695981010795, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23545695981010795, "reward_after_std": 0.5796599294990301, "reward_before_mean": -0.03300930839031935, "reward_before_std": 0.6244792677462101, "reward_change_max": 0.0005015283823013306, "reward_change_mean": -0.20244765654206276, "reward_change_min": -0.44455963373184204, "reward_change_std": 0.18726156931370497, "reward_std": 0.5796599332243204, "rewards/cosine_scaled_reward": -0.17275465093553066, "rewards/format_reward": 0.3125000074505806, "step": 47 }, { "advantage_max": 1.0629437901079655, "advantage_mean": -5.587935614226325e-09, "advantage_min": -0.5088880062103271, "advantage_std": 0.593526903539896, "completion_length": 3305.7084045410156, "epoch": 0.020628525382755843, "grad_norm": 0.10822011530399323, "kl": 0.00025949627161026, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.6e-07, "loss": 0.0392, "reward": -0.4392389543354511, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4392389543354511, "reward_after_std": 0.593526903539896, "reward_before_mean": -0.3200397063046694, "reward_before_std": 0.6147603616118431, "reward_change_max": 0.000492587685585022, "reward_change_mean": -0.11919927597045898, "reward_change_min": -0.29686538875102997, "reward_change_std": 0.12495697382837534, "reward_std": 0.5935269184410572, "rewards/cosine_scaled_reward": -0.27460318338125944, "rewards/format_reward": 0.2291666716337204, "step": 48 }, { "advantage_max": 1.0348233133554459, "advantage_mean": 6.208817071584605e-09, "advantage_min": -0.5345447920262814, "advantage_std": 0.5911475047469139, "completion_length": 3522.1666870117188, "epoch": 0.02105828632822992, "grad_norm": 0.10515246540307999, "kl": 0.00021766126155853271, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.8e-07, "loss": 0.0165, "reward": -0.4048044327646494, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4048044327646494, "reward_after_std": 0.5911475047469139, "reward_before_mean": -0.2719508111476898, "reward_before_std": 0.6193715762346983, "reward_change_max": 0.00020141899585723877, "reward_change_mean": -0.13285361579619348, "reward_change_min": -0.32379806600511074, "reward_change_std": 0.13374649826437235, "reward_std": 0.5911475196480751, "rewards/cosine_scaled_reward": -0.1880587488412857, "rewards/format_reward": 0.1041666679084301, "step": 49 }, { "advantage_max": 0.979575838893652, "advantage_mean": 1.4280279514444771e-08, "advantage_min": -0.5027905330061913, "advantage_std": 0.5617313664406538, "completion_length": 3196.375015258789, "epoch": 0.021488047273704004, "grad_norm": 0.09323639422655106, "kl": 0.0006543025374412537, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-06, "loss": 0.0132, "reward": -0.33916434785351157, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.33916434785351157, "reward_after_std": 0.5617313701659441, "reward_before_mean": -0.17794132977724075, "reward_before_std": 0.5731094125658274, "reward_change_max": 0.0010968074202537537, "reward_change_mean": -0.1612229987513274, "reward_change_min": -0.37989285960793495, "reward_change_std": 0.1447381847538054, "reward_std": 0.5617313776165247, "rewards/cosine_scaled_reward": -0.2139706751331687, "rewards/format_reward": 0.25, "step": 50 }, { "advantage_max": 1.0646364465355873, "advantage_mean": 1.3038516155639002e-08, "advantage_min": -0.4602276161313057, "advantage_std": 0.5711296675726771, "completion_length": 3520.3334045410156, "epoch": 0.021917808219178082, "grad_norm": 0.09898613393306732, "kl": 0.00012963078916072845, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999890338174275e-07, "loss": 0.0379, "reward": -0.42180849611759186, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.42180849611759186, "reward_after_std": 0.571129665710032, "reward_before_mean": -0.2965225577354431, "reward_before_std": 0.5680376547388732, "reward_change_max": 0.00031816214323043823, "reward_change_mean": -0.12528593931347132, "reward_change_min": -0.24348186701536179, "reward_change_std": 0.10121515719220042, "reward_std": 0.571129685267806, "rewards/cosine_scaled_reward": -0.18992794398218393, "rewards/format_reward": 0.0833333358168602, "step": 51 }, { "advantage_max": 1.142410833388567, "advantage_mean": 7.450580596923828e-09, "advantage_min": -0.5407504960894585, "advantage_std": 0.6236628964543343, "completion_length": 3053.6250610351562, "epoch": 0.02234756916465216, "grad_norm": 0.14361868798732758, "kl": 0.00046241283416748047, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999561358041868e-07, "loss": 0.0777, "reward": -0.2570524848997593, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2570524848997593, "reward_after_std": 0.6236629039049149, "reward_before_mean": -0.07985424902290106, "reward_before_std": 0.614604577422142, "reward_change_max": 0.00041443854570388794, "reward_change_mean": -0.1771982330828905, "reward_change_min": -0.38107147067785263, "reward_change_std": 0.14288352942094207, "reward_std": 0.6236629281193018, "rewards/cosine_scaled_reward": -0.1857604654505849, "rewards/format_reward": 0.29166667349636555, "step": 52 }, { "advantage_max": 0.976898729801178, "advantage_mean": -2.4835271617007493e-09, "advantage_min": -0.43036219477653503, "advantage_std": 0.5190170416608453, "completion_length": 3166.375030517578, "epoch": 0.022777330110126243, "grad_norm": 0.09648339450359344, "kl": 0.0002802610397338867, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.999013075636804e-07, "loss": 0.0433, "reward": -0.3949314132332802, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3949314132332802, "reward_after_std": 0.519017037935555, "reward_before_mean": -0.2554573090746999, "reward_before_std": 0.4992683785967529, "reward_change_max": 0.0011106356978416443, "reward_change_mean": -0.13947410648688674, "reward_change_min": -0.25785068795084953, "reward_change_std": 0.10727057664189488, "reward_std": 0.5190170644782484, "rewards/cosine_scaled_reward": -0.2214786563999951, "rewards/format_reward": 0.18750000186264515, "step": 53 }, { "advantage_max": 1.4017380811274052, "advantage_mean": -6.208815683805824e-10, "advantage_min": -0.8832301311194897, "advantage_std": 0.8333938643336296, "completion_length": 3198.354217529297, "epoch": 0.023207091055600322, "grad_norm": 0.14914840459823608, "kl": 0.0004200937692075968, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.998245517681593e-07, "loss": -0.0004, "reward": 0.17324183136224747, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17324183136224747, "reward_after_std": 0.8333938531577587, "reward_before_mean": 0.48688042163848877, "reward_before_std": 0.8648860827088356, "reward_change_max": 0.0005889683961868286, "reward_change_mean": -0.31363858468830585, "reward_change_min": -0.6035315953195095, "reward_change_std": 0.25070911832153797, "reward_std": 0.8333938531577587, "rewards/cosine_scaled_reward": 0.03510686231311411, "rewards/format_reward": 0.41666667349636555, "step": 54 }, { "advantage_max": 1.1244895234704018, "advantage_mean": 1.1175870895385742e-08, "advantage_min": -0.5521738156676292, "advantage_std": 0.6349265649914742, "completion_length": 2895.3333435058594, "epoch": 0.0236368520010744, "grad_norm": 0.10603523254394531, "kl": 0.0009374618530273438, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.997258721585931e-07, "loss": 0.0419, "reward": -0.19849698804318905, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19849698804318905, "reward_after_std": 0.6349265463650227, "reward_before_mean": 0.0015930309891700745, "reward_before_std": 0.6384835094213486, "reward_change_max": 0.0006782263517379761, "reward_change_mean": -0.2000899976119399, "reward_change_min": -0.41361572220921516, "reward_change_std": 0.1665561399422586, "reward_std": 0.6349265649914742, "rewards/cosine_scaled_reward": -0.1762868259102106, "rewards/format_reward": 0.3541666679084301, "step": 55 }, { "advantage_max": 0.9523991905152798, "advantage_mean": 1.8626452602532595e-09, "advantage_min": -0.6075415499508381, "advantage_std": 0.5921418834477663, "completion_length": 3497.000030517578, "epoch": 0.024066612946548483, "grad_norm": 0.10025162994861603, "kl": 0.001046299934387207, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.996052735444862e-07, "loss": 0.0191, "reward": -0.34861547127366066, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.34861547127366066, "reward_after_std": 0.59214186668396, "reward_before_mean": -0.1904515065252781, "reward_before_std": 0.6486162189394236, "reward_change_max": 0.001343272626399994, "reward_change_mean": -0.15816396102309227, "reward_change_min": -0.41353959776461124, "reward_change_std": 0.17091365531086922, "reward_std": 0.5921418890357018, "rewards/cosine_scaled_reward": -0.1785590942017734, "rewards/format_reward": 0.16666666977107525, "step": 56 }, { "advantage_max": 1.0490206964313984, "advantage_mean": 1.8626452047421083e-09, "advantage_min": -0.5839561298489571, "advantage_std": 0.5993405655026436, "completion_length": 3488.6041870117188, "epoch": 0.024496373892022562, "grad_norm": 0.12256208062171936, "kl": 0.0011476650834083557, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.994627618036452e-07, "loss": -0.0002, "reward": -0.32453581877052784, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32453581877052784, "reward_after_std": 0.5993405543267727, "reward_before_mean": -0.16365431668236852, "reward_before_std": 0.6202043741941452, "reward_change_max": 0.0, "reward_change_mean": -0.16088151466101408, "reward_change_min": -0.33611369505524635, "reward_change_std": 0.145022327080369, "reward_std": 0.5993405878543854, "rewards/cosine_scaled_reward": -0.14432716462761164, "rewards/format_reward": 0.1250000037252903, "step": 57 }, { "advantage_max": 0.9272233359515667, "advantage_mean": -6.208814018471287e-10, "advantage_min": -0.5611174069344997, "advantage_std": 0.5522931329905987, "completion_length": 2651.4167251586914, "epoch": 0.024926134837496644, "grad_norm": 0.06667205691337585, "kl": 0.003368619829416275, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.992983438818915e-07, "loss": -0.009, "reward": 0.1917843222618103, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1917843222618103, "reward_after_std": 0.5522931255400181, "reward_before_mean": 0.5447516441345215, "reward_before_std": 0.5185897685587406, "reward_change_max": 0.0, "reward_change_mean": -0.3529673032462597, "reward_change_min": -0.5914964880794287, "reward_change_std": 0.22492309287190437, "reward_std": 0.5522931590676308, "rewards/cosine_scaled_reward": 0.0536258090287447, "rewards/format_reward": 0.43750000186264515, "step": 58 }, { "advantage_max": 0.5185256265103817, "advantage_mean": 1.0554989882116672e-08, "advantage_min": -0.36473191902041435, "advantage_std": 0.33480220660567284, "completion_length": 3584.0, "epoch": 0.025355895782970723, "grad_norm": 0.05752472206950188, "kl": 0.0006178617477416992, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": -0.5592984966933727, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5592984966933727, "reward_after_std": 0.33480220660567284, "reward_before_mean": -0.44459547474980354, "reward_before_std": 0.3676304556429386, "reward_change_max": 0.0010232478380203247, "reward_change_mean": -0.11470302660018206, "reward_change_min": -0.26190848276019096, "reward_change_std": 0.10856694914400578, "reward_std": 0.33480222523212433, "rewards/cosine_scaled_reward": -0.22229773364961147, "rewards/format_reward": 0.0, "step": 59 }, { "advantage_max": 1.0113364905118942, "advantage_mean": -6.208814573582799e-10, "advantage_min": -0.46363168209791183, "advantage_std": 0.5531458184123039, "completion_length": 3560.3541870117188, "epoch": 0.0257856567284448, "grad_norm": 0.11161834746599197, "kl": 0.00031003355979919434, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.989038226169207e-07, "loss": 0.0101, "reward": -0.5052008010679856, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5052008010679856, "reward_after_std": 0.5531458146870136, "reward_before_mean": -0.4055161736905575, "reward_before_std": 0.5659666620194912, "reward_change_max": 0.0004497617483139038, "reward_change_mean": -0.09968465520069003, "reward_change_min": -0.2652219757437706, "reward_change_std": 0.10727592324838042, "reward_std": 0.5531458184123039, "rewards/cosine_scaled_reward": -0.23400807566940784, "rewards/format_reward": 0.06250000186264515, "step": 60 }, { "advantage_max": 0.7338531874120235, "advantage_mean": 3.290673195044391e-08, "advantage_min": -0.4587595984339714, "advantage_std": 0.44532119296491146, "completion_length": 3582.5208435058594, "epoch": 0.026215417673918884, "grad_norm": 0.09000901877880096, "kl": 0.001034565269947052, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.98673738502114e-07, "loss": 0.0005, "reward": -0.4494348429143429, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4494348429143429, "reward_after_std": 0.4453211985528469, "reward_before_mean": -0.31043073907494545, "reward_before_std": 0.47398094832897186, "reward_change_max": 0.0002864822745323181, "reward_change_mean": -0.13900407776236534, "reward_change_min": -0.33759179152548313, "reward_change_std": 0.13227981515228748, "reward_std": 0.4453212171792984, "rewards/cosine_scaled_reward": -0.17604870349168777, "rewards/format_reward": 0.0416666679084301, "step": 61 }, { "advantage_max": 1.0768824554979801, "advantage_mean": 1.7384688799637615e-08, "advantage_min": -0.6898246333003044, "advantage_std": 0.6572067588567734, "completion_length": 3128.0833740234375, "epoch": 0.026645178619392963, "grad_norm": 0.10973693430423737, "kl": 0.00024312734603881836, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.98421786662277e-07, "loss": 0.0092, "reward": -0.008700018748641014, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.008700018748641014, "reward_after_std": 0.6572067476809025, "reward_before_mean": 0.26289848051965237, "reward_before_std": 0.6902702413499355, "reward_change_max": 0.0006353259086608887, "reward_change_mean": -0.27159848157316446, "reward_change_min": -0.5149102509021759, "reward_change_std": 0.2076151706278324, "reward_std": 0.657206766307354, "rewards/cosine_scaled_reward": -0.024800771847367287, "rewards/format_reward": 0.31250000558793545, "step": 62 }, { "advantage_max": 0.6160202585160732, "advantage_mean": 1.1796752852344383e-08, "advantage_min": -0.4154182970523834, "advantage_std": 0.38310002628713846, "completion_length": 3285.8333435058594, "epoch": 0.02707493956486704, "grad_norm": 0.057609327137470245, "kl": 0.001836538314819336, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.981479793771866e-07, "loss": 0.0028, "reward": -0.44364625960588455, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.44364625960588455, "reward_after_std": 0.3831000318750739, "reward_before_mean": -0.29481642320752144, "reward_before_std": 0.405568927526474, "reward_change_max": 0.0004621073603630066, "reward_change_mean": -0.14882983453571796, "reward_change_min": -0.30552570335567, "reward_change_std": 0.1229396362323314, "reward_std": 0.3831000318750739, "rewards/cosine_scaled_reward": -0.20990821532905102, "rewards/format_reward": 0.125, "step": 63 }, { "advantage_max": 0.6489166170358658, "advantage_mean": 2.1109978931566076e-08, "advantage_min": -0.475914791226387, "advantage_std": 0.42533592134714127, "completion_length": 3390.3333435058594, "epoch": 0.027504700510341124, "grad_norm": 0.07548660039901733, "kl": 0.0012118220329284668, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.97852329991824e-07, "loss": 0.0298, "reward": -0.49255532398819923, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.49255532398819923, "reward_after_std": 0.42533592879772186, "reward_before_mean": -0.36459886794909835, "reward_before_std": 0.4747042637318373, "reward_change_max": 0.0007499232888221741, "reward_change_mean": -0.12795646023005247, "reward_change_min": -0.3067194353789091, "reward_change_std": 0.13339131139218807, "reward_std": 0.4253359381109476, "rewards/cosine_scaled_reward": -0.24479943700134754, "rewards/format_reward": 0.12500000186264515, "step": 64 }, { "advantage_max": 0.6024645566940308, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.36312612146139145, "advantage_std": 0.34816169925034046, "completion_length": 3356.5416717529297, "epoch": 0.027934461455815202, "grad_norm": 0.052782803773880005, "kl": 0.0007527768611907959, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.975348529157229e-07, "loss": 0.0023, "reward": -0.3258682116866112, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3258682116866112, "reward_after_std": 0.34816169179975986, "reward_before_mean": -0.13329238072037697, "reward_before_std": 0.32404848746955395, "reward_change_max": 0.000697597861289978, "reward_change_mean": -0.19257583376020193, "reward_change_min": -0.3325822986662388, "reward_change_std": 0.12960705161094666, "reward_std": 0.34816169179975986, "rewards/cosine_scaled_reward": -0.12914619036018848, "rewards/format_reward": 0.125, "step": 65 }, { "advantage_max": 0.783698033541441, "advantage_mean": 1.8626452213954536e-08, "advantage_min": -0.4754526987671852, "advantage_std": 0.48099360801279545, "completion_length": 3559.3125, "epoch": 0.028364222401289284, "grad_norm": 0.10543376952409744, "kl": 0.000553131103515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.971955636222684e-07, "loss": 0.0067, "reward": -0.4978593084961176, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4978593084961176, "reward_after_std": 0.48099359311163425, "reward_before_mean": -0.37943920120596886, "reward_before_std": 0.5280016735196114, "reward_change_max": 0.0012401863932609558, "reward_change_mean": -0.11842010822147131, "reward_change_min": -0.3338940292596817, "reward_change_std": 0.1365427323617041, "reward_std": 0.48099360242486, "rewards/cosine_scaled_reward": -0.21055293548852205, "rewards/format_reward": 0.0416666679084301, "step": 66 }, { "advantage_max": 1.087129384279251, "advantage_mean": -1.924733428193548e-08, "advantage_min": -0.7149610742926598, "advantage_std": 0.6335822902619839, "completion_length": 2616.604202270508, "epoch": 0.028793983346763363, "grad_norm": 0.06779853254556656, "kl": 0.002802550792694092, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.968344786479415e-07, "loss": 0.0037, "reward": 0.06216258555650711, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06216258555650711, "reward_after_std": 0.6335822902619839, "reward_before_mean": 0.3565434757620096, "reward_before_std": 0.6236644685268402, "reward_change_max": 0.0005370453000068665, "reward_change_mean": -0.2943809209391475, "reward_change_min": -0.5025293100625277, "reward_change_std": 0.19753529597073793, "reward_std": 0.6335823088884354, "rewards/cosine_scaled_reward": -0.03006160445511341, "rewards/format_reward": 0.4166666679084301, "step": 67 }, { "advantage_max": 1.0618442930281162, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.569156602025032, "advantage_std": 0.6074351221323013, "completion_length": 2926.083335876465, "epoch": 0.029223744292237442, "grad_norm": 0.10366114974021912, "kl": 0.0016489028930664062, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.964516155915151e-07, "loss": 0.0233, "reward": -0.11333970446139574, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11333970446139574, "reward_after_std": 0.607435118407011, "reward_before_mean": 0.1203654371201992, "reward_before_std": 0.5902032870799303, "reward_change_max": 0.0008795708417892456, "reward_change_mean": -0.23370513645932078, "reward_change_min": -0.4569368865340948, "reward_change_std": 0.18548431433737278, "reward_std": 0.6074351295828819, "rewards/cosine_scaled_reward": -0.064817288890481, "rewards/format_reward": 0.25000000558793545, "step": 68 }, { "advantage_max": 1.2105547711253166, "advantage_mean": 6.208819014474898e-10, "advantage_min": -0.6963208541274071, "advantage_std": 0.713896993547678, "completion_length": 3397.4375, "epoch": 0.029653505237711524, "grad_norm": 0.12487664818763733, "kl": 0.0008016526699066162, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.960469931131936e-07, "loss": 0.0141, "reward": -0.17890601605176926, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17890601605176926, "reward_after_std": 0.7138970047235489, "reward_before_mean": 0.023239322006702423, "reward_before_std": 0.7564393673092127, "reward_change_max": 0.00018440186977386475, "reward_change_mean": -0.202145351562649, "reward_change_min": -0.4125629961490631, "reward_change_std": 0.17683170642703772, "reward_std": 0.7138970345258713, "rewards/cosine_scaled_reward": -0.09254700131714344, "rewards/format_reward": 0.20833334140479565, "step": 69 }, { "advantage_max": 1.3531583361327648, "advantage_mean": 3.4148499583608327e-09, "advantage_min": -0.8613870292901993, "advantage_std": 0.7757849097251892, "completion_length": 3007.7291870117188, "epoch": 0.030083266183185603, "grad_norm": 0.15636110305786133, "kl": 0.0029556751251220703, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.956206309337066e-07, "loss": 0.0476, "reward": -0.05706272320821881, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05706272320821881, "reward_after_std": 0.7757849022746086, "reward_before_mean": 0.17682876624166965, "reward_before_std": 0.7966156378388405, "reward_change_max": 0.0012296438217163086, "reward_change_mean": -0.23389147967100143, "reward_change_min": -0.42563481628894806, "reward_change_std": 0.18985693342983723, "reward_std": 0.7757849097251892, "rewards/cosine_scaled_reward": -0.0990856271237135, "rewards/format_reward": 0.37500001676380634, "step": 70 }, { "advantage_max": 0.7877018973231316, "advantage_mean": 9.934107647602275e-09, "advantage_min": -0.45891378074884415, "advantage_std": 0.46137798205018044, "completion_length": 3449.8541870117188, "epoch": 0.03051302712865968, "grad_norm": 0.09266134351491928, "kl": 0.003154754638671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.951725498333448e-07, "loss": 0.0218, "reward": -0.4374130815267563, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4374130815267563, "reward_after_std": 0.46137796342372894, "reward_before_mean": -0.2996031027287245, "reward_before_std": 0.48241357132792473, "reward_change_max": 0.0006419941782951355, "reward_change_mean": -0.13780998706351966, "reward_change_min": -0.28036942705512047, "reward_change_std": 0.11825473792850971, "reward_std": 0.46137796342372894, "rewards/cosine_scaled_reward": -0.21230154857039452, "rewards/format_reward": 0.1250000037252903, "step": 71 }, { "advantage_max": 0.8047559261322021, "advantage_mean": -1.2417644690287943e-09, "advantage_min": -0.5307071059942245, "advantage_std": 0.49777455627918243, "completion_length": 3051.8541870117188, "epoch": 0.030942788074133764, "grad_norm": 0.07476215809583664, "kl": 0.0013123154640197754, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.947027716509488e-07, "loss": 0.0297, "reward": -0.08685511350631714, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08685511350631714, "reward_after_std": 0.49777456372976303, "reward_before_mean": 0.1764240376651287, "reward_before_std": 0.49765095859766006, "reward_change_max": 0.0005834922194480896, "reward_change_mean": -0.26327915489673615, "reward_change_min": -0.45576081424951553, "reward_change_std": 0.18889683345332742, "reward_std": 0.49777457118034363, "rewards/cosine_scaled_reward": -0.0367879718542099, "rewards/format_reward": 0.25000000558793545, "step": 72 }, { "advantage_max": 1.075548905879259, "advantage_mean": 1.3659398556686853e-08, "advantage_min": -0.6061269715428352, "advantage_std": 0.59186259098351, "completion_length": 2981.8333740234375, "epoch": 0.03137254901960784, "grad_norm": 0.11844071000814438, "kl": 0.002924680709838867, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.942113192828444e-07, "loss": 0.0301, "reward": -0.24150283355265856, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24150283355265856, "reward_after_std": 0.59186259098351, "reward_before_mean": -0.056053802371025085, "reward_before_std": 0.5812375675886869, "reward_change_max": 0.0014141350984573364, "reward_change_mean": -0.18544901721179485, "reward_change_min": -0.3683231994509697, "reward_change_std": 0.14343121368438005, "reward_std": 0.5918625947088003, "rewards/cosine_scaled_reward": -0.18427690863609314, "rewards/format_reward": 0.31250001303851604, "step": 73 }, { "advantage_max": 0.7128509841859341, "advantage_mean": 1.4280279514444771e-08, "advantage_min": -0.4495863616466522, "advantage_std": 0.44957539066672325, "completion_length": 3365.3333435058594, "epoch": 0.031802309965081925, "grad_norm": 0.09631112217903137, "kl": 0.001154780387878418, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.93698216681727e-07, "loss": 0.0471, "reward": -0.46448274701833725, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.46448274701833725, "reward_after_std": 0.44957539066672325, "reward_before_mean": -0.3298438750207424, "reward_before_std": 0.49301424250006676, "reward_change_max": 0.0011412426829338074, "reward_change_mean": -0.1346388653619215, "reward_change_min": -0.3163988031446934, "reward_change_std": 0.13437246182002127, "reward_std": 0.44957539439201355, "rewards/cosine_scaled_reward": -0.206588601693511, "rewards/format_reward": 0.0833333358168602, "step": 74 }, { "advantage_max": 1.1982688196003437, "advantage_mean": -1.241763691872677e-09, "advantage_min": -0.7570008710026741, "advantage_std": 0.7434534449130297, "completion_length": 2975.729202270508, "epoch": 0.032232070910556, "grad_norm": 0.13756830990314484, "kl": 0.0014859437942504883, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.931634888554935e-07, "loss": 0.005, "reward": -0.013792794197797775, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.013792794197797775, "reward_after_std": 0.7434534542262554, "reward_before_mean": 0.24603205174207687, "reward_before_std": 0.7943346351385117, "reward_change_max": 0.0006025210022926331, "reward_change_mean": -0.25982486084103584, "reward_change_min": -0.5653695464134216, "reward_change_std": 0.23263592226430774, "reward_std": 0.7434534803032875, "rewards/cosine_scaled_reward": -0.07490064110606909, "rewards/format_reward": 0.39583333767950535, "step": 75 }, { "advantage_max": 0.9519620202481747, "advantage_mean": 2.4835271617007493e-09, "advantage_min": -0.6556081250309944, "advantage_std": 0.5908295009285212, "completion_length": 3251.8333435058594, "epoch": 0.03266183185603008, "grad_norm": 0.09772420674562454, "kl": 0.0022117942571640015, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.926071618660237e-07, "loss": 0.0121, "reward": -0.14480791240930557, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14480791240930557, "reward_after_std": 0.590829499065876, "reward_before_mean": 0.0876416340470314, "reward_before_std": 0.6272661369293928, "reward_change_max": 0.0021880343556404114, "reward_change_mean": -0.2324495743960142, "reward_change_min": -0.45025860145688057, "reward_change_std": 0.19491465855389833, "reward_std": 0.5908295046538115, "rewards/cosine_scaled_reward": -0.11242918483912945, "rewards/format_reward": 0.31250000558793545, "step": 76 }, { "advantage_max": 1.3450633361935616, "advantage_mean": -9.313226356777449e-09, "advantage_min": -0.6204375252127647, "advantage_std": 0.7431157119572163, "completion_length": 3310.6041870117188, "epoch": 0.033091592801504165, "grad_norm": 0.11613147705793381, "kl": 0.001922607421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.9202926282791e-07, "loss": 0.0107, "reward": -0.20864915568381548, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20864915568381548, "reward_after_std": 0.7431156970560551, "reward_before_mean": -0.027927943505346775, "reward_before_std": 0.7443163171410561, "reward_change_max": 0.0017031282186508179, "reward_change_mean": -0.18072122987359762, "reward_change_min": -0.376650782302022, "reward_change_std": 0.15685666911303997, "reward_std": 0.7431157305836678, "rewards/cosine_scaled_reward": -0.12854730524122715, "rewards/format_reward": 0.2291666679084301, "step": 77 }, { "advantage_max": 0.8634050823748112, "advantage_mean": 2.1730860666480112e-08, "advantage_min": -0.4731779918074608, "advantage_std": 0.480957567691803, "completion_length": 3581.5208435058594, "epoch": 0.03352135374697824, "grad_norm": 0.11537489295005798, "kl": 0.0018619298934936523, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.91429819907136e-07, "loss": 0.0007, "reward": -0.48017206601798534, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.48017206601798534, "reward_after_std": 0.4809575751423836, "reward_before_mean": -0.36163159320130944, "reward_before_std": 0.4892412982881069, "reward_change_max": 0.0012174323201179504, "reward_change_mean": -0.11854046303778887, "reward_change_min": -0.23841693811118603, "reward_change_std": 0.10337638854980469, "reward_std": 0.48095759749412537, "rewards/cosine_scaled_reward": -0.20164913078770041, "rewards/format_reward": 0.0416666679084301, "step": 78 }, { "advantage_max": 0.7346944697201252, "advantage_mean": 1.8005570590062803e-08, "advantage_min": -0.40547844395041466, "advantage_std": 0.43019791319966316, "completion_length": 3584.0, "epoch": 0.03395111469245232, "grad_norm": 0.09404244273900986, "kl": 0.0024056434631347656, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.908088623197048e-07, "loss": 0.0001, "reward": -0.4991060486063361, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4991060486063361, "reward_after_std": 0.4301979187875986, "reward_before_mean": -0.37774336338043213, "reward_before_std": 0.44768408313393593, "reward_change_max": 0.0009080022573471069, "reward_change_mean": -0.12136267591267824, "reward_change_min": -0.25858513452112675, "reward_change_std": 0.10581270419061184, "reward_std": 0.43019793182611465, "rewards/cosine_scaled_reward": -0.1992883514612913, "rewards/format_reward": 0.02083333395421505, "step": 79 }, { "advantage_max": 0.9889465495944023, "advantage_mean": 6.208817404651512e-09, "advantage_min": -0.6419688388705254, "advantage_std": 0.5894065890461206, "completion_length": 3367.5625610351562, "epoch": 0.034380875637926404, "grad_norm": 0.1108761802315712, "kl": 0.002140045166015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.901664203302124e-07, "loss": 0.0366, "reward": -0.2429164806380868, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2429164806380868, "reward_after_std": 0.5894065778702497, "reward_before_mean": -0.049463117495179176, "reward_before_std": 0.6158843375742435, "reward_change_max": 0.00134962797164917, "reward_change_mean": -0.19345338735729456, "reward_change_min": -0.4004933312535286, "reward_change_std": 0.16743275616317987, "reward_std": 0.5894065927714109, "rewards/cosine_scaled_reward": -0.13931488059461117, "rewards/format_reward": 0.22916666977107525, "step": 80 }, { "advantage_max": 0.5897867120802402, "advantage_mean": 1.6142924996742636e-08, "advantage_min": -0.3822917975485325, "advantage_std": 0.3630852960050106, "completion_length": 3195.583335876465, "epoch": 0.03481063658340049, "grad_norm": 0.056312356144189835, "kl": 0.002270936965942383, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.895025252503755e-07, "loss": -0.0004, "reward": -0.23495609685778618, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.23495609685778618, "reward_after_std": 0.3630852960050106, "reward_before_mean": -0.00928628258407116, "reward_before_std": 0.3375773187726736, "reward_change_max": 0.00016789138317108154, "reward_change_mean": -0.22566980589181185, "reward_change_min": -0.3746103513985872, "reward_change_std": 0.1555609842762351, "reward_std": 0.3630852997303009, "rewards/cosine_scaled_reward": -0.067143140360713, "rewards/format_reward": 0.125, "step": 81 }, { "advantage_max": 1.5101319625973701, "advantage_mean": 5.587935947293232e-09, "advantage_min": -0.5173898264765739, "advantage_std": 0.7820956893265247, "completion_length": 3227.0833892822266, "epoch": 0.03524039752887456, "grad_norm": 0.1486523449420929, "kl": 0.002370119094848633, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.888172094375033e-07, "loss": 0.0414, "reward": -0.061504267854616046, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.061504267854616046, "reward_after_std": 0.7820956818759441, "reward_before_mean": 0.15933408122509718, "reward_before_std": 0.7287064343690872, "reward_change_max": 0.002685330808162689, "reward_change_mean": -0.2208383409306407, "reward_change_min": -0.3759304117411375, "reward_change_std": 0.1504957154393196, "reward_std": 0.7820957116782665, "rewards/cosine_scaled_reward": -0.024499624967575073, "rewards/format_reward": 0.2083333358168602, "step": 82 }, { "advantage_max": 0.8638816736638546, "advantage_mean": 2.7318795892128378e-08, "advantage_min": -0.45508793741464615, "advantage_std": 0.49914186634123325, "completion_length": 3388.9583740234375, "epoch": 0.035670158474348644, "grad_norm": 0.09634040296077728, "kl": 0.0027495622634887695, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.881105062929221e-07, "loss": 0.0261, "reward": -0.4610670078545809, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4610670078545809, "reward_after_std": 0.49914187379181385, "reward_before_mean": -0.33818054012954235, "reward_before_std": 0.5207065120339394, "reward_change_max": 0.0004860013723373413, "reward_change_mean": -0.12288645934313536, "reward_change_min": -0.2674871888011694, "reward_change_std": 0.11428458523005247, "reward_std": 0.49914190731942654, "rewards/cosine_scaled_reward": -0.2315902765840292, "rewards/format_reward": 0.12500000186264515, "step": 83 }, { "advantage_max": 0.67760119587183, "advantage_mean": 1.6763806731656672e-08, "advantage_min": -0.5341562069952488, "advantage_std": 0.43436816707253456, "completion_length": 3469.5625, "epoch": 0.036099919419822726, "grad_norm": 0.07665356993675232, "kl": 0.0010115280747413635, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.873824502603459e-07, "loss": 0.0073, "reward": -0.3482162468135357, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3482162468135357, "reward_after_std": 0.43436817079782486, "reward_before_mean": -0.16947033256292343, "reward_before_std": 0.46964727342128754, "reward_change_max": 0.0018496736884117126, "reward_change_mean": -0.1787459277547896, "reward_change_min": -0.35342732444405556, "reward_change_std": 0.14810111792758107, "reward_std": 0.43436818197369576, "rewards/cosine_scaled_reward": -0.13681849837303162, "rewards/format_reward": 0.1041666716337204, "step": 84 }, { "advantage_max": 1.004574753344059, "advantage_mean": 1.241763458725842e-08, "advantage_min": -0.6185353174805641, "advantage_std": 0.628087654709816, "completion_length": 3261.2708740234375, "epoch": 0.0365296803652968, "grad_norm": 0.2711460292339325, "kl": 0.03903117775917053, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.866330768241983e-07, "loss": -0.0015, "reward": -0.01060779020190239, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.01060779020190239, "reward_after_std": 0.6280876845121384, "reward_before_mean": 0.2663103900849819, "reward_before_std": 0.6458847709000111, "reward_change_max": 0.0020218193531036377, "reward_change_mean": -0.2769181942567229, "reward_change_min": -0.551235556602478, "reward_change_std": 0.22579412907361984, "reward_std": 0.6280876994132996, "rewards/cosine_scaled_reward": -0.012678128201514482, "rewards/format_reward": 0.2916666716337204, "step": 85 }, { "advantage_max": 1.2530723325908184, "advantage_mean": -1.8626447051417472e-09, "advantage_min": -0.7563226148486137, "advantage_std": 0.7341955509036779, "completion_length": 3331.4166870117188, "epoch": 0.036959441310770884, "grad_norm": 0.14710231125354767, "kl": 0.0020885467529296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.85862422507884e-07, "loss": 0.0276, "reward": -0.1649502506479621, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1649502506479621, "reward_after_std": 0.734195563942194, "reward_before_mean": 0.037638706737197936, "reward_before_std": 0.7727732248604298, "reward_change_max": 0.00026404112577438354, "reward_change_mean": -0.2025889791548252, "reward_change_min": -0.40829845145344734, "reward_change_std": 0.1769371759146452, "reward_std": 0.7341956179589033, "rewards/cosine_scaled_reward": -0.0853473087772727, "rewards/format_reward": 0.2083333395421505, "step": 86 }, { "advantage_max": 1.1191134750843048, "advantage_mean": 8.692344455329959e-09, "advantage_min": -0.6699051596224308, "advantage_std": 0.6759327910840511, "completion_length": 2987.291717529297, "epoch": 0.037389202256244966, "grad_norm": 0.2217923104763031, "kl": 0.003255128860473633, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.850705248720068e-07, "loss": 0.0904, "reward": -0.19250718597322702, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19250718597322702, "reward_after_std": 0.6759327799081802, "reward_before_mean": 0.00909726694226265, "reward_before_std": 0.7209260798990726, "reward_change_max": 0.0010297894477844238, "reward_change_mean": -0.20160441705957055, "reward_change_min": -0.48480752669274807, "reward_change_std": 0.19618520885705948, "reward_std": 0.6759327836334705, "rewards/cosine_scaled_reward": -0.15170137956738472, "rewards/format_reward": 0.31250000931322575, "step": 87 }, { "advantage_max": 1.0284177735447884, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.5964771695435047, "advantage_std": 0.6148467808961868, "completion_length": 3431.1041870117188, "epoch": 0.03781896320171904, "grad_norm": 0.1120285764336586, "kl": 0.0016106478869915009, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.8425742251254e-07, "loss": 0.0153, "reward": -0.27137863636016846, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.27137863636016846, "reward_after_std": 0.6148467995226383, "reward_before_mean": -0.09168392047286034, "reward_before_std": 0.6534042283892632, "reward_change_max": 0.0, "reward_change_mean": -0.17969474382698536, "reward_change_min": -0.4623326286673546, "reward_change_std": 0.1751184817403555, "reward_std": 0.6148468069732189, "rewards/cosine_scaled_reward": -0.12917529046535492, "rewards/format_reward": 0.1666666679084301, "step": 88 }, { "advantage_max": 0.6792150661349297, "advantage_mean": 1.924733428193548e-08, "advantage_min": -0.30098605528473854, "advantage_std": 0.3748701773583889, "completion_length": 3498.2291870117188, "epoch": 0.038248724147193124, "grad_norm": 0.06137798726558685, "kl": 0.0026760101318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.83423155058946e-07, "loss": 0.0029, "reward": -0.5058371741324663, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5058371741324663, "reward_after_std": 0.3748701810836792, "reward_before_mean": -0.3823966421186924, "reward_before_std": 0.36087556183338165, "reward_change_max": 0.0008312985301017761, "reward_change_mean": -0.12344052828848362, "reward_change_min": -0.23908154480159283, "reward_change_std": 0.09375854209065437, "reward_std": 0.3748701922595501, "rewards/cosine_scaled_reward": -0.26411499083042145, "rewards/format_reward": 0.1458333358168602, "step": 89 }, { "advantage_max": 0.8820087499916553, "advantage_mean": 4.346172144398253e-09, "advantage_min": -0.476658396422863, "advantage_std": 0.5245650261640549, "completion_length": 3560.9791870117188, "epoch": 0.038678485092667206, "grad_norm": 0.13809970021247864, "kl": 0.003906607627868652, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.825677631722435e-07, "loss": 0.0106, "reward": -0.482957623898983, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.482957623898983, "reward_after_std": 0.524565014988184, "reward_before_mean": -0.3672230467200279, "reward_before_std": 0.5626931227743626, "reward_change_max": 0.002255283296108246, "reward_change_mean": -0.11573457717895508, "reward_change_min": -0.31228984519839287, "reward_change_std": 0.13188880030065775, "reward_std": 0.5245650410652161, "rewards/cosine_scaled_reward": -0.21486153453588486, "rewards/format_reward": 0.06250000186264515, "step": 90 }, { "advantage_max": 1.004807773977518, "advantage_mean": 1.6453366169510986e-08, "advantage_min": -0.47694606333971024, "advantage_std": 0.5567643530666828, "completion_length": 3507.000030517578, "epoch": 0.03910824603814128, "grad_norm": 0.10015463083982468, "kl": 0.0030349791049957275, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.816912885430258e-07, "loss": 0.0167, "reward": -0.41716958954930305, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.41716958954930305, "reward_after_std": 0.5567643437534571, "reward_before_mean": -0.2859762394800782, "reward_before_std": 0.5664056334644556, "reward_change_max": 0.0, "reward_change_mean": -0.13119335379451513, "reward_change_min": -0.27869343757629395, "reward_change_std": 0.11123088700696826, "reward_std": 0.5567643791437149, "rewards/cosine_scaled_reward": -0.20548812113702297, "rewards/format_reward": 0.1250000037252903, "step": 91 }, { "advantage_max": 0.7804395742714405, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.36228614300489426, "advantage_std": 0.42865980602800846, "completion_length": 3499.4166870117188, "epoch": 0.03953800698361536, "grad_norm": 0.07240769267082214, "kl": 0.005733788013458252, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.807937738894303e-07, "loss": 0.0279, "reward": -0.5952946897596121, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5952946897596121, "reward_after_std": 0.4286598041653633, "reward_before_mean": -0.5125366747379303, "reward_before_std": 0.4384130146354437, "reward_change_max": 0.0005381479859352112, "reward_change_mean": -0.08275801967829466, "reward_change_min": -0.18670807778835297, "reward_change_std": 0.08080129115842283, "reward_std": 0.4286598227918148, "rewards/cosine_scaled_reward": -0.27710168063640594, "rewards/format_reward": 0.0416666679084301, "step": 92 }, { "advantage_max": 0.9178200103342533, "advantage_mean": 1.6142925052253787e-08, "advantage_min": -0.51988185942173, "advantage_std": 0.5390436160378158, "completion_length": 3526.6041870117188, "epoch": 0.039967767929089446, "grad_norm": 0.09631077200174332, "kl": 0.003081798553466797, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.798752629550546e-07, "loss": 0.0089, "reward": -0.4239487461745739, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4239487461745739, "reward_after_std": 0.5390436169691384, "reward_before_mean": -0.2901838943362236, "reward_before_std": 0.5713146729394794, "reward_change_max": 0.00035165995359420776, "reward_change_mean": -0.13376484345644712, "reward_change_min": -0.3075775373727083, "reward_change_std": 0.13356224622111768, "reward_std": 0.5390436397865415, "rewards/cosine_scaled_reward": -0.197175282984972, "rewards/format_reward": 0.10416666977107525, "step": 93 }, { "advantage_max": 0.5984636135399342, "advantage_mean": 2.9181441596470847e-08, "advantage_min": -0.3486388474702835, "advantage_std": 0.35203731805086136, "completion_length": 3537.7916870117188, "epoch": 0.04039752887456352, "grad_norm": 0.05249588564038277, "kl": 0.0034914016723632812, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.78935800506826e-07, "loss": 0.0188, "reward": -0.5955394580960274, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5955394580960274, "reward_after_std": 0.35203731805086136, "reward_before_mean": -0.4999112971127033, "reward_before_std": 0.37140160240232944, "reward_change_max": 0.0007591173052787781, "reward_change_mean": -0.09562815143726766, "reward_change_min": -0.2064863070845604, "reward_change_std": 0.0881137396208942, "reward_std": 0.35203733295202255, "rewards/cosine_scaled_reward": -0.2603723108768463, "rewards/format_reward": 0.02083333395421505, "step": 94 }, { "advantage_max": 1.2463648654520512, "advantage_mean": 1.179675268581093e-08, "advantage_min": -0.6432937495410442, "advantage_std": 0.7092402279376984, "completion_length": 3498.9791870117188, "epoch": 0.0408272898200376, "grad_norm": 0.1408698409795761, "kl": 0.0032587051391601562, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.779754323328192e-07, "loss": 0.0157, "reward": -0.2897273050621152, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2897273050621152, "reward_after_std": 0.7092402242124081, "reward_before_mean": -0.13089681649580598, "reward_before_std": 0.7410722561180592, "reward_change_max": 0.0025123506784439087, "reward_change_mean": -0.15883046993985772, "reward_change_min": -0.3506867475807667, "reward_change_std": 0.1540903588756919, "reward_std": 0.7092402577400208, "rewards/cosine_scaled_reward": -0.13836508244276047, "rewards/format_reward": 0.1458333395421505, "step": 95 }, { "advantage_max": 0.6827729940414429, "advantage_mean": 6.2088175156738146e-09, "advantage_min": -0.44478175044059753, "advantage_std": 0.4010522700846195, "completion_length": 3320.187530517578, "epoch": 0.041257050765511685, "grad_norm": 0.061541855335235596, "kl": 0.0034122467041015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.769942052400235e-07, "loss": 0.0193, "reward": -0.20009618741460145, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20009618741460145, "reward_after_std": 0.4010522775352001, "reward_before_mean": 0.03102658875286579, "reward_before_std": 0.3822368886321783, "reward_change_max": 0.00014751404523849487, "reward_change_mean": -0.23112275870516896, "reward_change_min": -0.3827540911734104, "reward_change_std": 0.15361567679792643, "reward_std": 0.4010522849857807, "rewards/cosine_scaled_reward": -0.08865337073802948, "rewards/format_reward": 0.2083333358168602, "step": 96 }, { "advantage_max": 1.0262844860553741, "advantage_mean": 1.6763806731656672e-08, "advantage_min": -0.5179901346564293, "advantage_std": 0.57785814255476, "completion_length": 3351.1875, "epoch": 0.04168681171098577, "grad_norm": 0.11202176660299301, "kl": 0.003623485565185547, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.759921670520634e-07, "loss": 0.0237, "reward": -0.42936178483068943, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.42936178483068943, "reward_after_std": 0.5778581313788891, "reward_before_mean": -0.30425630882382393, "reward_before_std": 0.5991627685725689, "reward_change_max": 0.0015925392508506775, "reward_change_mean": -0.12510547740384936, "reward_change_min": -0.3441330697387457, "reward_change_std": 0.13356181979179382, "reward_std": 0.5778581462800503, "rewards/cosine_scaled_reward": -0.2250448283739388, "rewards/format_reward": 0.1458333395421505, "step": 97 }, { "advantage_max": 0.48923345282673836, "advantage_mean": 2.5456151298008933e-08, "advantage_min": -0.32482245191931725, "advantage_std": 0.2987217754125595, "completion_length": 3584.0, "epoch": 0.04211657265645984, "grad_norm": 0.05485212430357933, "kl": 0.0030007362365722656, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.749693666068663e-07, "loss": 0.0001, "reward": -0.6113384552299976, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.6113384552299976, "reward_after_std": 0.29872178100049496, "reward_before_mean": -0.5143393557518721, "reward_before_std": 0.31780483573675156, "reward_change_max": 0.0019604116678237915, "reward_change_mean": -0.09699909761548042, "reward_change_min": -0.2016468085348606, "reward_change_std": 0.08585811033844948, "reward_std": 0.29872178845107555, "rewards/cosine_scaled_reward": -0.25716967787593603, "rewards/format_reward": 0.0, "step": 98 }, { "advantage_max": 1.211990024894476, "advantage_mean": -6.208814573582799e-10, "advantage_min": -0.6712222769856453, "advantage_std": 0.7105681858956814, "completion_length": 3431.25, "epoch": 0.042546333601933925, "grad_norm": 0.13730773329734802, "kl": 0.004390716552734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.739258537542835e-07, "loss": 0.0015, "reward": -0.207722385879606, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.207722385879606, "reward_after_std": 0.710568156093359, "reward_before_mean": -0.016314515843987465, "reward_before_std": 0.750538595020771, "reward_change_max": 7.884949445724487e-05, "reward_change_mean": -0.19140788167715073, "reward_change_min": -0.43128723837435246, "reward_change_std": 0.1798848919570446, "reward_std": 0.710568156093359, "rewards/cosine_scaled_reward": -0.10190726071596146, "rewards/format_reward": 0.1875000074505806, "step": 99 }, { "advantage_max": 1.3523523174226284, "advantage_mean": 2.918144215158236e-08, "advantage_min": -0.5370749533176422, "advantage_std": 0.698965035378933, "completion_length": 3335.4166870117188, "epoch": 0.04297609454740801, "grad_norm": 0.1258583664894104, "kl": 0.0042934417724609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.728616793536587e-07, "loss": 0.0092, "reward": -0.11018030112609267, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11018030112609267, "reward_after_std": 0.6989650577306747, "reward_before_mean": 0.10673996806144714, "reward_before_std": 0.6466423962265253, "reward_change_max": 8.732825517654419e-05, "reward_change_mean": -0.21692020120099187, "reward_change_min": -0.3789345324039459, "reward_change_std": 0.14862715220078826, "reward_std": 0.6989650838077068, "rewards/cosine_scaled_reward": -0.08204669645056129, "rewards/format_reward": 0.27083333395421505, "step": 100 }, { "advantage_max": 0.7998537644743919, "advantage_mean": 2.918144120789279e-08, "advantage_min": -0.40668024867773056, "advantage_std": 0.4712050184607506, "completion_length": 3584.0, "epoch": 0.04340585549288208, "grad_norm": 0.08554729074239731, "kl": 0.002620220184326172, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.717768952713511e-07, "loss": 0.0001, "reward": -0.5510246753692627, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5510246753692627, "reward_after_std": 0.4712050072848797, "reward_before_mean": -0.45386848598718643, "reward_before_std": 0.5063041485846043, "reward_change_max": 0.0013203620910644531, "reward_change_mean": -0.09715617774054408, "reward_change_min": -0.23990466073155403, "reward_change_std": 0.10478799603879452, "reward_std": 0.4712050296366215, "rewards/cosine_scaled_reward": -0.23735091090202332, "rewards/format_reward": 0.02083333395421505, "step": 101 }, { "advantage_max": 0.9792653508484364, "advantage_mean": -1.8626452602532595e-09, "advantage_min": -0.47813913226127625, "advantage_std": 0.5334391184151173, "completion_length": 3519.5833435058594, "epoch": 0.043835616438356165, "grad_norm": 0.1106763556599617, "kl": 0.00519561767578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.706715543782064e-07, "loss": 0.009, "reward": -0.45675253495574, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.45675253495574, "reward_after_std": 0.5334391072392464, "reward_before_mean": -0.33850160241127014, "reward_before_std": 0.5388844087719917, "reward_change_max": 0.001609519124031067, "reward_change_mean": -0.11825094651430845, "reward_change_min": -0.2586698364466429, "reward_change_std": 0.10317747481167316, "reward_std": 0.5334391221404076, "rewards/cosine_scaled_reward": -0.20050079934298992, "rewards/format_reward": 0.06250000186264515, "step": 102 }, { "advantage_max": 1.1254120208323002, "advantage_mean": -2.220446049250313e-16, "advantage_min": -0.7579033225774765, "advantage_std": 0.7055772431194782, "completion_length": 3448.1666870117188, "epoch": 0.04426537738383025, "grad_norm": 0.1904595047235489, "kl": 0.004627704620361328, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.695457105469804e-07, "loss": 0.042, "reward": -0.1594617497175932, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1594617497175932, "reward_after_std": 0.7055772542953491, "reward_before_mean": 0.05498909763991833, "reward_before_std": 0.7698470167815685, "reward_change_max": 0.0011663958430290222, "reward_change_mean": -0.21445086784660816, "reward_change_min": -0.47822500206530094, "reward_change_std": 0.20545574370771646, "reward_std": 0.7055772729218006, "rewards/cosine_scaled_reward": -0.05583879258483648, "rewards/format_reward": 0.16666666977107525, "step": 103 }, { "advantage_max": 0.6345180086791515, "advantage_mean": 1.4280279736489376e-08, "advantage_min": -0.37014294788241386, "advantage_std": 0.3598313704133034, "completion_length": 3551.312530517578, "epoch": 0.04469513832930432, "grad_norm": 0.06650852411985397, "kl": 0.004472255706787109, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.683994186497132e-07, "loss": 0.0081, "reward": -0.4986319802701473, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4986319802701473, "reward_after_std": 0.3598313629627228, "reward_before_mean": -0.3700214847922325, "reward_before_std": 0.3577865567058325, "reward_change_max": 0.0015247538685798645, "reward_change_mean": -0.12861049128696322, "reward_change_min": -0.24244680628180504, "reward_change_std": 0.09806907596066594, "reward_std": 0.3598313666880131, "rewards/cosine_scaled_reward": -0.2162607442587614, "rewards/format_reward": 0.06250000186264515, "step": 104 }, { "advantage_max": 0.5287297740578651, "advantage_mean": 2.4214387439602802e-08, "advantage_min": -0.4122527725994587, "advantage_std": 0.34001942723989487, "completion_length": 3441.0625, "epoch": 0.045124899274778404, "grad_norm": 0.0553034283220768, "kl": 0.002765178680419922, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.672327345550543e-07, "loss": 0.0118, "reward": -0.5237542241811752, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5237542241811752, "reward_after_std": 0.34001941978931427, "reward_before_mean": -0.3971514627337456, "reward_before_std": 0.36936452239751816, "reward_change_max": 0.0009296536445617676, "reward_change_mean": -0.12660275120288134, "reward_change_min": -0.256743473932147, "reward_change_std": 0.1111523644067347, "reward_std": 0.34001943096518517, "rewards/cosine_scaled_reward": -0.24024239741265774, "rewards/format_reward": 0.0833333358168602, "step": 105 }, { "advantage_max": 1.6845214031636715, "advantage_mean": 8.071462387349015e-09, "advantage_min": -0.7446894571185112, "advantage_std": 0.9231962747871876, "completion_length": 3026.875030517578, "epoch": 0.04555466022025249, "grad_norm": 0.1255454421043396, "kl": 0.0034062862396240234, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.66045715125541e-07, "loss": 0.0136, "reward": -0.03562385495752096, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03562385495752096, "reward_after_std": 0.9231962747871876, "reward_before_mean": 0.18312770628836006, "reward_before_std": 0.9319545701146126, "reward_change_max": 0.0, "reward_change_mean": -0.21875156089663506, "reward_change_min": -0.4861839860677719, "reward_change_std": 0.18876921012997627, "reward_std": 0.9231963232159615, "rewards/cosine_scaled_reward": -0.06468615122139454, "rewards/format_reward": 0.3125000074505806, "step": 106 }, { "advantage_max": 1.681139212101698, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -0.8186116889119148, "advantage_std": 0.969216987490654, "completion_length": 3335.416717529297, "epoch": 0.04598442116572656, "grad_norm": 0.17100413143634796, "kl": 0.0035152435302734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.648384182148252e-07, "loss": 0.0176, "reward": -0.05131065659224987, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05131065659224987, "reward_after_std": 0.9692169986665249, "reward_before_mean": 0.16166264098137617, "reward_before_std": 1.0226215869188309, "reward_change_max": 0.001536741852760315, "reward_change_mean": -0.21297328313812613, "reward_change_min": -0.6393970511853695, "reward_change_std": 0.23807570291683078, "reward_std": 0.9692170508205891, "rewards/cosine_scaled_reward": -0.09625202789902687, "rewards/format_reward": 0.3541666753590107, "step": 107 }, { "advantage_max": 0.7562712617218494, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.4833696708083153, "advantage_std": 0.4540072977542877, "completion_length": 3532.8333740234375, "epoch": 0.046414182111200644, "grad_norm": 0.07033514976501465, "kl": 0.004111051559448242, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.636109026648554e-07, "loss": 0.0051, "reward": -0.4682664815336466, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4682664815336466, "reward_after_std": 0.45400729589164257, "reward_before_mean": -0.33747924616909586, "reward_before_std": 0.48293490149080753, "reward_change_max": 0.0006131604313850403, "reward_change_mean": -0.1307872380129993, "reward_change_min": -0.28976587392389774, "reward_change_std": 0.12618425115942955, "reward_std": 0.45400729589164257, "rewards/cosine_scaled_reward": -0.2729062885046005, "rewards/format_reward": 0.2083333358168602, "step": 108 }, { "advantage_max": 0.7139115296304226, "advantage_mean": -2.4835272727230517e-09, "advantage_min": -0.34086688607931137, "advantage_std": 0.3964364267885685, "completion_length": 3179.4583435058594, "epoch": 0.046843943056674726, "grad_norm": 0.07010386139154434, "kl": 0.003942131996154785, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.623632283030077e-07, "loss": 0.0188, "reward": -0.27352728694677353, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.27352728694677353, "reward_after_std": 0.3964364156126976, "reward_before_mean": -0.07098178938031197, "reward_before_std": 0.3605372793972492, "reward_change_max": 0.00039087235927581787, "reward_change_mean": -0.2025454994291067, "reward_change_min": -0.3517267294228077, "reward_change_std": 0.13771351100876927, "reward_std": 0.3964364305138588, "rewards/cosine_scaled_reward": -0.11882423050701618, "rewards/format_reward": 0.1666666679084301, "step": 109 }, { "advantage_max": 0.7425019666552544, "advantage_mean": 6.8296991950766994e-09, "advantage_min": -0.5076679065823555, "advantage_std": 0.4658440090715885, "completion_length": 3398.5208435058594, "epoch": 0.0472737040021488, "grad_norm": 0.08720850199460983, "kl": 0.004151105880737305, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.610954559391704e-07, "loss": 0.0029, "reward": -0.42759570851922035, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.42759570851922035, "reward_after_std": 0.4658440090715885, "reward_before_mean": -0.2830139175057411, "reward_before_std": 0.5084549561142921, "reward_change_max": 0.0024454742670059204, "reward_change_mean": -0.14458178775385022, "reward_change_min": -0.33707537688314915, "reward_change_std": 0.1417062757536769, "reward_std": 0.4658440165221691, "rewards/cosine_scaled_reward": -0.19359029084444046, "rewards/format_reward": 0.1041666716337204, "step": 110 }, { "advantage_max": 0.5570516511797905, "advantage_mean": 1.1175871506008406e-08, "advantage_min": -0.3576982170343399, "advantage_std": 0.34777399711310863, "completion_length": 3519.6666870117188, "epoch": 0.047703464947622884, "grad_norm": 0.05584542453289032, "kl": 0.00586700439453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.598076473627796e-07, "loss": 0.0117, "reward": -0.5794062092900276, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5794062092900276, "reward_after_std": 0.34777399711310863, "reward_before_mean": -0.4753792751580477, "reward_before_std": 0.37977814860641956, "reward_change_max": 0.002154141664505005, "reward_change_mean": -0.10402693739160895, "reward_change_min": -0.2599708456546068, "reward_change_std": 0.10554625792428851, "reward_std": 0.34777400828897953, "rewards/cosine_scaled_reward": -0.25852297246456146, "rewards/format_reward": 0.0416666679084301, "step": 111 }, { "advantage_max": 1.1608381941914558, "advantage_mean": 1.707424734931351e-08, "advantage_min": -0.8266163691878319, "advantage_std": 0.7434123866260052, "completion_length": 2993.187515258789, "epoch": 0.048133225893096966, "grad_norm": 0.11706309020519257, "kl": 0.003997325897216797, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.58499865339809e-07, "loss": 0.0408, "reward": 0.02905360236763954, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.02905360236763954, "reward_after_std": 0.7434123791754246, "reward_before_mean": 0.3078249581158161, "reward_before_std": 0.8115455023944378, "reward_change_max": 0.0008453875780105591, "reward_change_mean": -0.2787713329307735, "reward_change_min": -0.558758407831192, "reward_change_std": 0.24680751468986273, "reward_std": 0.7434124238789082, "rewards/cosine_scaled_reward": -0.03358752280473709, "rewards/format_reward": 0.37500000558793545, "step": 112 }, { "advantage_max": 0.7879568450152874, "advantage_mean": 1.3659397835041887e-08, "advantage_min": -0.42077869921922684, "advantage_std": 0.45822883769869804, "completion_length": 3576.3125, "epoch": 0.04856298683857104, "grad_norm": 0.08284755051136017, "kl": 0.0051116943359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.571721736097088e-07, "loss": 0.0061, "reward": -0.5484014991670847, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5484014991670847, "reward_after_std": 0.4582288358360529, "reward_before_mean": -0.4486488178372383, "reward_before_std": 0.48739137686789036, "reward_change_max": 0.0011829137802124023, "reward_change_mean": -0.09975266479887068, "reward_change_min": -0.27241048216819763, "reward_change_std": 0.11045058607123792, "reward_std": 0.4582288395613432, "rewards/cosine_scaled_reward": -0.24515775218605995, "rewards/format_reward": 0.0416666679084301, "step": 113 }, { "advantage_max": 1.1926232390105724, "advantage_mean": 2.0489097529718947e-08, "advantage_min": -0.6087073385715485, "advantage_std": 0.6994397342205048, "completion_length": 3331.5208740234375, "epoch": 0.048992747784045124, "grad_norm": 0.13421542942523956, "kl": 0.0058441162109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.55824636882301e-07, "loss": 0.0741, "reward": -0.34899051301181316, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34899051301181316, "reward_after_std": 0.6994397193193436, "reward_before_mean": -0.2078576609492302, "reward_before_std": 0.7498466111719608, "reward_change_max": 0.0017466917634010315, "reward_change_mean": -0.14113285159692168, "reward_change_min": -0.4232480600476265, "reward_change_std": 0.16835328051820397, "reward_std": 0.6994397602975368, "rewards/cosine_scaled_reward": -0.19767883932217956, "rewards/format_reward": 0.18750000186264515, "step": 114 }, { "advantage_max": 0.5760264210402966, "advantage_mean": 1.8626452158443385e-08, "advantage_min": -0.3722720444202423, "advantage_std": 0.3423323854804039, "completion_length": 3516.4375, "epoch": 0.049422508729519206, "grad_norm": 0.07857788354158401, "kl": 0.010396957397460938, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.54457320834625e-07, "loss": 0.0185, "reward": -0.5877884589135647, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5877884589135647, "reward_after_std": 0.3423323854804039, "reward_before_mean": -0.4880765378475189, "reward_before_std": 0.362007450312376, "reward_change_max": 0.0032718032598495483, "reward_change_mean": -0.0997119212988764, "reward_change_min": -0.20852785743772984, "reward_change_std": 0.09262400353327394, "reward_std": 0.3423323966562748, "rewards/cosine_scaled_reward": -0.26487160101532936, "rewards/format_reward": 0.0416666679084301, "step": 115 }, { "advantage_max": 1.0772427767515182, "advantage_mean": -2.4835271617007493e-09, "advantage_min": -0.4915907420217991, "advantage_std": 0.6079626251012087, "completion_length": 3536.937530517578, "epoch": 0.04985226967499329, "grad_norm": 0.12491448223590851, "kl": 0.004974365234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.530702921077358e-07, "loss": 0.0309, "reward": -0.3958939127624035, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3958939127624035, "reward_after_std": 0.607962628826499, "reward_before_mean": -0.2631869101896882, "reward_before_std": 0.6300467811524868, "reward_change_max": 0.0004841610789299011, "reward_change_mean": -0.1327070277184248, "reward_change_min": -0.3273257948458195, "reward_change_std": 0.12791649345308542, "reward_std": 0.6079626530408859, "rewards/cosine_scaled_reward": -0.1628434481099248, "rewards/format_reward": 0.06250000186264515, "step": 116 }, { "advantage_max": 0.904010146856308, "advantage_mean": 8.071462942460528e-09, "advantage_min": -0.6787244156002998, "advantage_std": 0.5634792819619179, "completion_length": 3314.8333587646484, "epoch": 0.05028203062046736, "grad_norm": 0.08462508767843246, "kl": 0.007979393005371094, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.516636183034564e-07, "loss": -0.0056, "reward": -0.16134057566523552, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.16134057566523552, "reward_after_std": 0.5634793136268854, "reward_before_mean": 0.068339753895998, "reward_before_std": 0.5997231397777796, "reward_change_max": 0.0007396265864372253, "reward_change_mean": -0.22968033514916897, "reward_change_min": -0.4412536099553108, "reward_change_std": 0.18997908476740122, "reward_std": 0.5634793471544981, "rewards/cosine_scaled_reward": -0.09083012491464615, "rewards/format_reward": 0.2500000037252903, "step": 117 }, { "advantage_max": 0.7542007863521576, "advantage_mean": 3.352761335229104e-08, "advantage_min": -0.36563776060938835, "advantage_std": 0.42353212647140026, "completion_length": 3584.0, "epoch": 0.050711791565941446, "grad_norm": 0.08291848003864288, "kl": 0.0029273033142089844, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.502373679810839e-07, "loss": 0.0001, "reward": -0.6020128503441811, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6020128503441811, "reward_after_std": 0.42353213019669056, "reward_before_mean": -0.5203612633049488, "reward_before_std": 0.4415900707244873, "reward_change_max": 0.0014768540859222412, "reward_change_mean": -0.08165157260373235, "reward_change_min": -0.19523423723876476, "reward_change_std": 0.08442434202879667, "reward_std": 0.42353213392198086, "rewards/cosine_scaled_reward": -0.2705973032861948, "rewards/format_reward": 0.02083333395421505, "step": 118 }, { "advantage_max": 0.7401220798492432, "advantage_mean": 2.2972623636707823e-08, "advantage_min": -0.39647942781448364, "advantage_std": 0.4288946036249399, "completion_length": 3570.9375, "epoch": 0.05114155251141553, "grad_norm": 0.07952865958213806, "kl": 0.004719734191894531, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.487916106540465e-07, "loss": 0.0105, "reward": -0.4964634031057358, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4964634031057358, "reward_after_std": 0.4288946036249399, "reward_before_mean": -0.37385331094264984, "reward_before_std": 0.44360541366040707, "reward_change_max": 0.0003344416618347168, "reward_change_mean": -0.12261007982306182, "reward_change_min": -0.2328458409756422, "reward_change_std": 0.098459537839517, "reward_std": 0.4288946185261011, "rewards/cosine_scaled_reward": -0.19734332617372274, "rewards/format_reward": 0.02083333395421505, "step": 119 }, { "advantage_max": 0.7654808275401592, "advantage_mean": 1.8626452158443385e-08, "advantage_min": -0.5552992895245552, "advantage_std": 0.5056708641350269, "completion_length": 3466.6666870117188, "epoch": 0.0515713134568896, "grad_norm": 0.10676053166389465, "kl": 0.004441499710083008, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.473264167865171e-07, "loss": 0.0217, "reward": -0.36113474145531654, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.36113474145531654, "reward_after_std": 0.5056708566844463, "reward_before_mean": -0.19331073248758912, "reward_before_std": 0.56310223788023, "reward_change_max": 0.0, "reward_change_mean": -0.1678239991888404, "reward_change_min": -0.3927532397210598, "reward_change_std": 0.16534169483929873, "reward_std": 0.5056708827614784, "rewards/cosine_scaled_reward": -0.16957203298807144, "rewards/format_reward": 0.14583333767950535, "step": 120 }, { "advantage_max": 1.0314167812466621, "advantage_mean": 1.8626451714354175e-08, "advantage_min": -0.6070163212716579, "advantage_std": 0.622599646449089, "completion_length": 3510.1875610351562, "epoch": 0.052001074402363685, "grad_norm": 0.12464013695716858, "kl": 0.007086753845214844, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.458418577899774e-07, "loss": 0.0262, "reward": -0.34601517114788294, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34601517114788294, "reward_after_std": 0.6225996501743793, "reward_before_mean": -0.19206706061959267, "reward_before_std": 0.671099990606308, "reward_change_max": 0.0004187077283859253, "reward_change_mean": -0.1539481021463871, "reward_change_min": -0.3674945142120123, "reward_change_std": 0.16222773492336273, "reward_std": 0.6225996650755405, "rewards/cosine_scaled_reward": -0.15853353100828826, "rewards/format_reward": 0.12500000186264515, "step": 121 }, { "advantage_max": 1.151826523244381, "advantage_mean": 1.6142925496342997e-08, "advantage_min": -0.5494605600833893, "advantage_std": 0.6305528096854687, "completion_length": 3259.937515258789, "epoch": 0.05243083534783777, "grad_norm": 0.125951886177063, "kl": 0.009899139404296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.443380060197385e-07, "loss": 0.0191, "reward": -0.27450993936508894, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.27450993936508894, "reward_after_std": 0.6305528208613396, "reward_before_mean": -0.1050114706158638, "reward_before_std": 0.619666650891304, "reward_change_max": 0.0005802363157272339, "reward_change_mean": -0.16949846129864454, "reward_change_min": -0.3220092337578535, "reward_change_std": 0.13353237276896834, "reward_std": 0.630552850663662, "rewards/cosine_scaled_reward": -0.16708907624706626, "rewards/format_reward": 0.2291666679084301, "step": 122 }, { "advantage_max": 0.735649973154068, "advantage_mean": 9.313226023710541e-09, "advantage_min": -0.49806712940335274, "advantage_std": 0.46031451784074306, "completion_length": 3515.0625, "epoch": 0.05286059629331184, "grad_norm": 0.0785604789853096, "kl": 0.005253314971923828, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.428149347714143e-07, "loss": 0.0053, "reward": -0.29238981008529663, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.29238981008529663, "reward_after_std": 0.4603145271539688, "reward_before_mean": -0.09660915285348892, "reward_before_std": 0.4840660709887743, "reward_change_max": 1.9252300262451172e-05, "reward_change_mean": -0.1957806684076786, "reward_change_min": -0.407364409416914, "reward_change_std": 0.15845954045653343, "reward_std": 0.4603145383298397, "rewards/cosine_scaled_reward": -0.13163790293037891, "rewards/format_reward": 0.1666666716337204, "step": 123 }, { "advantage_max": 0.789119053632021, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.47232208028435707, "advantage_std": 0.4631243348121643, "completion_length": 3300.5833740234375, "epoch": 0.053290357238785925, "grad_norm": 0.0882117822766304, "kl": 0.006946563720703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.412727182773486e-07, "loss": 0.043, "reward": -0.44438148057088256, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.44438148057088256, "reward_after_std": 0.4631243534386158, "reward_before_mean": -0.30867274291813374, "reward_before_std": 0.48385387286543846, "reward_change_max": 0.0020363032817840576, "reward_change_mean": -0.13570873625576496, "reward_change_min": -0.309432078152895, "reward_change_std": 0.12523407395929098, "reward_std": 0.463124368339777, "rewards/cosine_scaled_reward": -0.22725303377956152, "rewards/format_reward": 0.14583333767950535, "step": 124 }, { "advantage_max": 0.7443223558366299, "advantage_mean": 6.829699139565548e-09, "advantage_min": -0.54876459389925, "advantage_std": 0.49329181388020515, "completion_length": 3422.2083740234375, "epoch": 0.05372011818426001, "grad_norm": 0.11542148888111115, "kl": 0.007617950439453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.397114317029974e-07, "loss": 0.0171, "reward": -0.39412534050643444, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.39412534050643444, "reward_after_std": 0.49329181388020515, "reward_before_mean": -0.23680651932954788, "reward_before_std": 0.554126650094986, "reward_change_max": 0.0014179721474647522, "reward_change_mean": -0.1573188304901123, "reward_change_min": -0.36567760817706585, "reward_change_std": 0.1610144879668951, "reward_std": 0.49329183623194695, "rewards/cosine_scaled_reward": -0.2017365973442793, "rewards/format_reward": 0.1666666716337204, "step": 125 }, { "advantage_max": 0.7967126257717609, "advantage_mean": 3.16649688691939e-08, "advantage_min": -0.4650183841586113, "advantage_std": 0.48376845195889473, "completion_length": 3577.8541870117188, "epoch": 0.05414987912973408, "grad_norm": 0.08212237805128098, "kl": 0.004633426666259766, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.381311511432658e-07, "loss": 0.0018, "reward": -0.5015386715531349, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5015386715531349, "reward_after_std": 0.48376845195889473, "reward_before_mean": -0.38559823483228683, "reward_before_std": 0.527119591832161, "reward_change_max": 0.0008854493498802185, "reward_change_mean": -0.11594041809439659, "reward_change_min": -0.3294540550559759, "reward_change_std": 0.13405688805505633, "reward_std": 0.4837684780359268, "rewards/cosine_scaled_reward": -0.21363245695829391, "rewards/format_reward": 0.0416666679084301, "step": 126 }, { "advantage_max": 0.9745269455015659, "advantage_mean": 1.8626451714354175e-08, "advantage_min": -0.6112959086894989, "advantage_std": 0.578279010951519, "completion_length": 3468.8333435058594, "epoch": 0.054579640075208165, "grad_norm": 0.10569223761558533, "kl": 0.005480766296386719, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.36531953618799e-07, "loss": 0.0336, "reward": -0.32267098128795624, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32267098128795624, "reward_after_std": 0.5782790258526802, "reward_before_mean": -0.155442101880908, "reward_before_std": 0.6110133826732635, "reward_change_max": 0.0008152350783348083, "reward_change_mean": -0.16722885647322983, "reward_change_min": -0.32547407411038876, "reward_change_std": 0.14718749257735908, "reward_std": 0.5782790295779705, "rewards/cosine_scaled_reward": -0.14022105187177658, "rewards/format_reward": 0.1250000037252903, "step": 127 }, { "advantage_max": 0.9878061637282372, "advantage_mean": 1.4280279514444771e-08, "advantage_min": -0.4725644178688526, "advantage_std": 0.5343535616993904, "completion_length": 3226.166717529297, "epoch": 0.05500940102068225, "grad_norm": 0.10022153705358505, "kl": 0.011690139770507812, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.34913917072228e-07, "loss": 0.0329, "reward": -0.3361808843910694, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3361808843910694, "reward_after_std": 0.5343535616993904, "reward_before_mean": -0.17592088831588626, "reward_before_std": 0.5183319598436356, "reward_change_max": 0.0006352365016937256, "reward_change_mean": -0.16025998815894127, "reward_change_min": -0.30836330726742744, "reward_change_std": 0.12084709480404854, "reward_std": 0.534353569149971, "rewards/cosine_scaled_reward": -0.2233771225437522, "rewards/format_reward": 0.2708333432674408, "step": 128 }, { "advantage_max": 1.5856390669941902, "advantage_mean": 6.208820679809435e-10, "advantage_min": -0.8445003405213356, "advantage_std": 0.9279644563794136, "completion_length": 3135.6458740234375, "epoch": 0.05543916196615632, "grad_norm": 0.1721089780330658, "kl": 0.003987789154052734, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.332771203643714e-07, "loss": 0.0368, "reward": 0.21938915830105543, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21938915830105543, "reward_after_std": 0.9279644563794136, "reward_before_mean": 0.5369358099997044, "reward_before_std": 0.9452951550483704, "reward_change_max": 0.00033280253410339355, "reward_change_mean": -0.31754665449261665, "reward_change_min": -0.6524023413658142, "reward_change_std": 0.26818833593279123, "reward_std": 0.9279644936323166, "rewards/cosine_scaled_reward": 0.0705512291751802, "rewards/format_reward": 0.39583333767950535, "step": 129 }, { "advantage_max": 1.0561823174357414, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.567284919321537, "advantage_std": 0.6243378110229969, "completion_length": 3110.1458587646484, "epoch": 0.055868922911630405, "grad_norm": 0.14065316319465637, "kl": 0.006766319274902344, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.316216432703916e-07, "loss": 0.0248, "reward": -0.29209016263484955, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29209016263484955, "reward_after_std": 0.6243378184735775, "reward_before_mean": -0.12104359269142151, "reward_before_std": 0.6587507016956806, "reward_change_max": 0.00026642531156539917, "reward_change_mean": -0.1710465680807829, "reward_change_min": -0.42957304418087006, "reward_change_std": 0.1691564223729074, "reward_std": 0.6243378333747387, "rewards/cosine_scaled_reward": -0.17510513216257095, "rewards/format_reward": 0.2291666679084301, "step": 130 }, { "advantage_max": 0.943283062428236, "advantage_mean": -4.346171922353648e-09, "advantage_min": -0.4902738630771637, "advantage_std": 0.536124880425632, "completion_length": 3032.7083587646484, "epoch": 0.05629868385710449, "grad_norm": 0.11126387864351273, "kl": 0.0062541961669921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.299475664759068e-07, "loss": 0.0249, "reward": -0.2559091495350003, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2559091495350003, "reward_after_std": 0.536124880425632, "reward_before_mean": -0.06494930014014244, "reward_before_std": 0.5351277207955718, "reward_change_max": 0.0011912956833839417, "reward_change_mean": -0.19095990469213575, "reward_change_min": -0.363766860216856, "reward_change_std": 0.14107682660687715, "reward_std": 0.5361248878762126, "rewards/cosine_scaled_reward": -0.16789131180848926, "rewards/format_reward": 0.2708333358168602, "step": 131 }, { "advantage_max": 0.8564937897026539, "advantage_mean": 1.7384688799637615e-08, "advantage_min": -0.5475966855883598, "advantage_std": 0.5532482955604792, "completion_length": 3324.8125610351562, "epoch": 0.05672844480257857, "grad_norm": 0.13447709381580353, "kl": 0.00652313232421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.282549715730579e-07, "loss": 0.0308, "reward": -0.30903333658352494, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.30903333658352494, "reward_after_std": 0.5532482992857695, "reward_before_mean": -0.1291732260142453, "reward_before_std": 0.6109858956187963, "reward_change_max": 0.0, "reward_change_mean": -0.1798601138871163, "reward_change_min": -0.42405965924263, "reward_change_std": 0.17724589677527547, "reward_std": 0.5532483067363501, "rewards/cosine_scaled_reward": -0.16875328682363033, "rewards/format_reward": 0.2083333358168602, "step": 132 }, { "advantage_max": 0.8539167270064354, "advantage_mean": 1.738468857759301e-08, "advantage_min": -0.39376113191246986, "advantage_std": 0.46043526753783226, "completion_length": 3562.2708435058594, "epoch": 0.057158205748052644, "grad_norm": 0.09384007006883621, "kl": 0.007994890213012695, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.265439410565328e-07, "loss": 0.0122, "reward": -0.45954382652416825, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.45954382652416825, "reward_after_std": 0.46043526381254196, "reward_before_mean": -0.33373603224754333, "reward_before_std": 0.4504126403480768, "reward_change_max": 0.00026697665452957153, "reward_change_mean": -0.1258077872917056, "reward_change_min": -0.23594384267926216, "reward_change_std": 0.0932056475430727, "reward_std": 0.46043527498841286, "rewards/cosine_scaled_reward": -0.1877013493794948, "rewards/format_reward": 0.0416666679084301, "step": 133 }, { "advantage_max": 1.0910263918340206, "advantage_mean": -7.450580596923828e-09, "advantage_min": -0.7521674148738384, "advantage_std": 0.6830116137862206, "completion_length": 3061.2083740234375, "epoch": 0.057587966693526726, "grad_norm": 0.1252169907093048, "kl": 0.006773948669433594, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.248145583195447e-07, "loss": 0.0771, "reward": -0.13788206363096833, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13788206363096833, "reward_after_std": 0.6830115970224142, "reward_before_mean": 0.0856271330267191, "reward_before_std": 0.7404406350106001, "reward_change_max": 0.0012248307466506958, "reward_change_mean": -0.2235091980546713, "reward_change_min": -0.5032646283507347, "reward_change_std": 0.21202109567821026, "reward_std": 0.6830116230994463, "rewards/cosine_scaled_reward": -0.11343643721193075, "rewards/format_reward": 0.31250000931322575, "step": 134 }, { "advantage_max": 0.6830912232398987, "advantage_mean": 2.856055936195645e-08, "advantage_min": -0.3565994128584862, "advantage_std": 0.3917448855936527, "completion_length": 3489.875030517578, "epoch": 0.05801772763900081, "grad_norm": 0.08401557058095932, "kl": 0.0074138641357421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.230669076497687e-07, "loss": 0.0171, "reward": -0.551384000107646, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.551384000107646, "reward_after_std": 0.3917448855936527, "reward_before_mean": -0.4456505725393072, "reward_before_std": 0.40348074957728386, "reward_change_max": 0.0010098814964294434, "reward_change_mean": -0.1057334067299962, "reward_change_min": -0.2413748186081648, "reward_change_std": 0.09610975976102054, "reward_std": 0.3917448967695236, "rewards/cosine_scaled_reward": -0.26449195481836796, "rewards/format_reward": 0.0833333358168602, "step": 135 }, { "advantage_max": 0.7705427967011929, "advantage_mean": 1.5522043039783995e-08, "advantage_min": -0.4253736585378647, "advantage_std": 0.44707794301211834, "completion_length": 3572.7291870117188, "epoch": 0.058447488584474884, "grad_norm": 0.09392247349023819, "kl": 0.0090484619140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.213010742252327e-07, "loss": 0.0091, "reward": -0.5351011259481311, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5351011259481311, "reward_after_std": 0.4470779225230217, "reward_before_mean": -0.4307723045349121, "reward_before_std": 0.47015033662319183, "reward_change_max": 0.0008254498243331909, "reward_change_mean": -0.10432883026078343, "reward_change_min": -0.23195877484977245, "reward_change_std": 0.1017058165743947, "reward_std": 0.4470779336988926, "rewards/cosine_scaled_reward": -0.22580281645059586, "rewards/format_reward": 0.02083333395421505, "step": 136 }, { "advantage_max": 0.7908326201140881, "advantage_mean": 2.6387473484046353e-08, "advantage_min": -0.5027914196252823, "advantage_std": 0.4989481698721647, "completion_length": 3464.041717529297, "epoch": 0.058877249529948966, "grad_norm": 0.09184622764587402, "kl": 0.008295059204101562, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.195171441101668e-07, "loss": 0.0174, "reward": -0.3948749341070652, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3948749341070652, "reward_after_std": 0.4989481791853905, "reward_before_mean": -0.24209203384816647, "reward_before_std": 0.5437109218910336, "reward_change_max": 0.0007116198539733887, "reward_change_mean": -0.15278288070112467, "reward_change_min": -0.3856469355523586, "reward_change_std": 0.15741440933197737, "reward_std": 0.49894818663597107, "rewards/cosine_scaled_reward": -0.18354602064937353, "rewards/format_reward": 0.12500000186264515, "step": 137 }, { "advantage_max": 0.9980085231363773, "advantage_mean": 2.110997865401032e-08, "advantage_min": -0.47156353294849396, "advantage_std": 0.5548330582678318, "completion_length": 3508.9583740234375, "epoch": 0.05930701047542305, "grad_norm": 0.10555094480514526, "kl": 0.0058956146240234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.177152042508077e-07, "loss": 0.0161, "reward": -0.4773680828511715, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4773680828511715, "reward_after_std": 0.554833073168993, "reward_before_mean": -0.3678165841847658, "reward_before_std": 0.576345831155777, "reward_change_max": 0.0032513365149497986, "reward_change_mean": -0.10955148469656706, "reward_change_min": -0.25921760126948357, "reward_change_std": 0.10841279197484255, "reward_std": 0.554833110421896, "rewards/cosine_scaled_reward": -0.22557496652007103, "rewards/format_reward": 0.0833333358168602, "step": 138 }, { "advantage_max": 0.8846727833151817, "advantage_mean": 6.208822345143972e-10, "advantage_min": -0.4887484833598137, "advantage_std": 0.5054308921098709, "completion_length": 3169.770866394043, "epoch": 0.059736771420897124, "grad_norm": 0.09724171459674835, "kl": 0.009481430053710938, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.158953424711624e-07, "loss": 0.0158, "reward": -0.0455196276307106, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0455196276307106, "reward_after_std": 0.505430880934, "reward_before_mean": 0.22518206760287285, "reward_before_std": 0.4743648041039705, "reward_change_max": 0.0006197988986968994, "reward_change_mean": -0.2707017119973898, "reward_change_min": -0.4492959901690483, "reward_change_std": 0.18013359606266022, "reward_std": 0.5054308883845806, "rewards/cosine_scaled_reward": -0.022825635969638824, "rewards/format_reward": 0.27083333767950535, "step": 139 }, { "advantage_max": 0.7937037386000156, "advantage_mean": 8.692344621863413e-09, "advantage_min": -0.6644148379564285, "advantage_std": 0.5163745675235987, "completion_length": 3463.187530517578, "epoch": 0.060166532366371206, "grad_norm": 0.10959067195653915, "kl": 0.007296562194824219, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.140576474687263e-07, "loss": 0.0033, "reward": -0.2127493917942047, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2127493917942047, "reward_after_std": 0.5163745619356632, "reward_before_mean": 0.006966546177864075, "reward_before_std": 0.5606335122138262, "reward_change_max": 0.00017823278903961182, "reward_change_mean": -0.21971595473587513, "reward_change_min": -0.4284201953560114, "reward_change_std": 0.18153624143451452, "reward_std": 0.5163745786994696, "rewards/cosine_scaled_reward": -0.09026672691106796, "rewards/format_reward": 0.18750000558793545, "step": 140 }, { "advantage_max": 0.8155123218894005, "advantage_mean": 4.221995919539978e-08, "advantage_min": -0.5219373106956482, "advantage_std": 0.4815290756523609, "completion_length": 3336.937530517578, "epoch": 0.06059629331184529, "grad_norm": 0.09818120300769806, "kl": 0.0070056915283203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.122022088101613e-07, "loss": 0.0415, "reward": -0.26256594620645046, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.26256594620645046, "reward_after_std": 0.48152909614145756, "reward_before_mean": -0.06304997205734253, "reward_before_std": 0.49045356176793575, "reward_change_max": 0.0008678510785102844, "reward_change_mean": -0.19951593689620495, "reward_change_min": -0.36662409268319607, "reward_change_std": 0.1493612783960998, "reward_std": 0.48152910359203815, "rewards/cosine_scaled_reward": -0.13569165766239166, "rewards/format_reward": 0.2083333358168602, "step": 141 }, { "advantage_max": 1.077614277601242, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.6663354933261871, "advantage_std": 0.6389063708484173, "completion_length": 3313.7708435058594, "epoch": 0.06102605425731936, "grad_norm": 0.10751213133335114, "kl": 0.004710197448730469, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.103291169269299e-07, "loss": 0.0678, "reward": -0.1166822649538517, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1166822649538517, "reward_after_std": 0.6389063727110624, "reward_before_mean": 0.11681712977588177, "reward_before_std": 0.6656335387378931, "reward_change_max": 0.0012138411402702332, "reward_change_mean": -0.23349939729087055, "reward_change_min": -0.43707483634352684, "reward_change_std": 0.18207782600075006, "reward_std": 0.6389063913375139, "rewards/cosine_scaled_reward": -0.045758103020489216, "rewards/format_reward": 0.20833334140479565, "step": 142 }, { "advantage_max": 0.7166655734181404, "advantage_mean": 3.1044083970144243e-09, "advantage_min": -0.41165250539779663, "advantage_std": 0.41752462834119797, "completion_length": 3527.6666870117188, "epoch": 0.061455815202793446, "grad_norm": 0.07803385704755783, "kl": 0.0061283111572265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.084384631108882e-07, "loss": 0.0133, "reward": -0.5343353822827339, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5343353822827339, "reward_after_std": 0.41752463206648827, "reward_before_mean": -0.42513055400922894, "reward_before_std": 0.4351458474993706, "reward_change_max": 0.001574590802192688, "reward_change_mean": -0.10920483712106943, "reward_change_min": -0.24320121854543686, "reward_change_std": 0.10182142560370266, "reward_std": 0.41752463951706886, "rewards/cosine_scaled_reward": -0.24381526932120323, "rewards/format_reward": 0.06250000186264515, "step": 143 }, { "advantage_max": 0.6513160765171051, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.471440926194191, "advantage_std": 0.41844250448048115, "completion_length": 3505.3541870117188, "epoch": 0.06188557614826753, "grad_norm": 0.07864533364772797, "kl": 0.009830474853515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.065303395098358e-07, "loss": 0.0169, "reward": -0.4054124131798744, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4054124131798744, "reward_after_std": 0.41844250820577145, "reward_before_mean": -0.2455281727015972, "reward_before_std": 0.4554564878344536, "reward_change_max": 1.8320977687835693e-05, "reward_change_mean": -0.15988424280658364, "reward_change_min": -0.3188432827591896, "reward_change_std": 0.138969705440104, "reward_std": 0.4184425137937069, "rewards/cosine_scaled_reward": -0.1644307542592287, "rewards/format_reward": 0.0833333358168602, "step": 144 }, { "advantage_max": 0.5654457807540894, "advantage_mean": 1.5522043539384356e-08, "advantage_min": -0.3258429169654846, "advantage_std": 0.340566910803318, "completion_length": 3584.0, "epoch": 0.0623153370937416, "grad_norm": 0.0643945187330246, "kl": 0.009260177612304688, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.046048391230247e-07, "loss": 0.0004, "reward": -0.6387216970324516, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6387216970324516, "reward_after_std": 0.340566910803318, "reward_before_mean": -0.555585291236639, "reward_before_std": 0.36758775264024734, "reward_change_max": 0.001831553876399994, "reward_change_mean": -0.08313640649430454, "reward_change_min": -0.22427873127162457, "reward_change_std": 0.09069517534226179, "reward_std": 0.3405669257044792, "rewards/cosine_scaled_reward": -0.2777926456183195, "rewards/format_reward": 0.0, "step": 145 }, { "advantage_max": 0.5860477425158024, "advantage_mean": 1.2417634864814175e-08, "advantage_min": -0.3244834691286087, "advantage_std": 0.3431299217045307, "completion_length": 3381.0833435058594, "epoch": 0.06274509803921569, "grad_norm": 0.06283292919397354, "kl": 0.007564544677734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.026620557966279e-07, "loss": 0.0055, "reward": -0.5465646963566542, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5465646963566542, "reward_after_std": 0.34312992356717587, "reward_before_mean": -0.4318493753671646, "reward_before_std": 0.350953770801425, "reward_change_max": 0.0011817589402198792, "reward_change_mean": -0.11471533216536045, "reward_change_min": -0.24000519886612892, "reward_change_std": 0.09958406956866384, "reward_std": 0.3431299403309822, "rewards/cosine_scaled_reward": -0.2680080235004425, "rewards/format_reward": 0.1041666716337204, "step": 146 }, { "advantage_max": 0.7750108353793621, "advantage_mean": 2.5456151409031236e-08, "advantage_min": -0.4393719211220741, "advantage_std": 0.44452954083681107, "completion_length": 3329.75, "epoch": 0.06317485898468976, "grad_norm": 0.07701614499092102, "kl": 0.008581161499023438, "lambda_div_used": 0.7000000000000001, "learning_rate": 9.007020842191634e-07, "loss": 0.0309, "reward": -0.4562487304210663, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4562487304210663, "reward_after_std": 0.4445295426994562, "reward_before_mean": -0.3234791634604335, "reward_before_std": 0.45524729043245316, "reward_change_max": 0.0010843724012374878, "reward_change_mean": -0.1327695520594716, "reward_change_min": -0.27794723212718964, "reward_change_std": 0.11879570363089442, "reward_std": 0.44452954828739166, "rewards/cosine_scaled_reward": -0.23465626267716289, "rewards/format_reward": 0.1458333395421505, "step": 147 }, { "advantage_max": 0.7337110564112663, "advantage_mean": 9.31322685637781e-10, "advantage_min": -0.5884901806712151, "advantage_std": 0.47536591067910194, "completion_length": 3423.8958740234375, "epoch": 0.06360461993016385, "grad_norm": 0.08940654247999191, "kl": 0.009160041809082031, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.987250199168808e-07, "loss": 0.0097, "reward": -0.32169317081570625, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.32169317081570625, "reward_after_std": 0.47536590695381165, "reward_before_mean": -0.13848424702882767, "reward_before_std": 0.5184689350426197, "reward_change_max": 0.0009155571460723877, "reward_change_mean": -0.18320895172655582, "reward_change_min": -0.375043623149395, "reward_change_std": 0.158781535923481, "reward_std": 0.47536591812968254, "rewards/cosine_scaled_reward": -0.16299211606383324, "rewards/format_reward": 0.1875000074505806, "step": 148 }, { "advantage_max": 0.6368842646479607, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.3957781568169594, "advantage_std": 0.3983878679573536, "completion_length": 3371.0625, "epoch": 0.06403438087563793, "grad_norm": 0.067354217171669, "kl": 0.012439727783203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.967309592491052e-07, "loss": 0.0089, "reward": -0.4384483899921179, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4384483899921179, "reward_after_std": 0.3983878716826439, "reward_before_mean": -0.2895002190489322, "reward_before_std": 0.4239782355725765, "reward_change_max": 0.0, "reward_change_mean": -0.1489481646567583, "reward_change_min": -0.3306741565465927, "reward_change_std": 0.1276971958577633, "reward_std": 0.3983878828585148, "rewards/cosine_scaled_reward": -0.2385001126676798, "rewards/format_reward": 0.1875, "step": 149 }, { "advantage_max": 0.6825664229691029, "advantage_mean": 1.6142925329809543e-08, "advantage_min": -0.41044866666197777, "advantage_std": 0.40097850933671, "completion_length": 3435.7083435058594, "epoch": 0.064464141821112, "grad_norm": 0.08548089861869812, "kl": 0.009796142578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.9471999940354e-07, "loss": 0.0122, "reward": -0.43983936216682196, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.43983936216682196, "reward_after_std": 0.4009785130620003, "reward_before_mean": -0.29293827805668116, "reward_before_std": 0.40895108599215746, "reward_change_max": 0.00043232738971710205, "reward_change_mean": -0.1469010771252215, "reward_change_min": -0.2973312698304653, "reward_change_std": 0.12275380454957485, "reward_std": 0.40097852051258087, "rewards/cosine_scaled_reward": -0.22980248369276524, "rewards/format_reward": 0.16666666977107525, "step": 150 }, { "advantage_max": 0.9776737838983536, "advantage_mean": 1.241763458725842e-08, "advantage_min": -0.6835503503680229, "advantage_std": 0.587976835668087, "completion_length": 3370.9166870117188, "epoch": 0.06489390276658609, "grad_norm": 0.09998104721307755, "kl": 0.007067680358886719, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.926922383915315e-07, "loss": 0.0325, "reward": -0.26297924295067787, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.26297924295067787, "reward_after_std": 0.587976835668087, "reward_before_mean": -0.07567789405584335, "reward_before_std": 0.6214282065629959, "reward_change_max": 0.0021340474486351013, "reward_change_mean": -0.1873013344593346, "reward_change_min": -0.40594692528247833, "reward_change_std": 0.16816405579447746, "reward_std": 0.5879768617451191, "rewards/cosine_scaled_reward": -0.14200561679899693, "rewards/format_reward": 0.20833334140479565, "step": 151 }, { "advantage_max": 0.993885587900877, "advantage_mean": 1.5522049201521781e-09, "advantage_min": -0.6162971556186676, "advantage_std": 0.5987227149307728, "completion_length": 3014.479202270508, "epoch": 0.06532366371206016, "grad_norm": 0.15035343170166016, "kl": 0.009599685668945312, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.906477750432903e-07, "loss": 0.0358, "reward": -0.15518004074692726, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15518004074692726, "reward_after_std": 0.5987227186560631, "reward_before_mean": 0.06819121632725, "reward_before_std": 0.6265989430248737, "reward_change_max": 0.00034496188163757324, "reward_change_mean": -0.22337123844772577, "reward_change_min": -0.43576304987072945, "reward_change_std": 0.17303787264972925, "reward_std": 0.5987227223813534, "rewards/cosine_scaled_reward": -0.11173774022608995, "rewards/format_reward": 0.291666679084301, "step": 152 }, { "advantage_max": 1.2759642340242863, "advantage_mean": 2.9181441374426242e-08, "advantage_min": -0.47084981948137283, "advantage_std": 0.6580290794372559, "completion_length": 3550.8125, "epoch": 0.06575342465753424, "grad_norm": 0.11920297145843506, "kl": 0.01499176025390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.88586709003076e-07, "loss": 0.0076, "reward": -0.4457554267719388, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4457554267719388, "reward_after_std": 0.6580290645360947, "reward_before_mean": -0.3455572035163641, "reward_before_std": 0.6478042341768742, "reward_change_max": 0.0010235980153083801, "reward_change_mean": -0.1001982158049941, "reward_change_min": -0.21494369953870773, "reward_change_std": 0.0837489515542984, "reward_std": 0.6580290794372559, "rewards/cosine_scaled_reward": -0.20402860699687153, "rewards/format_reward": 0.06250000186264515, "step": 153 }, { "advantage_max": 0.7151554822921753, "advantage_mean": 1.7384688744126464e-08, "advantage_min": -0.3820602521300316, "advantage_std": 0.4081663489341736, "completion_length": 3136.4166870117188, "epoch": 0.06618318560300833, "grad_norm": 0.06933719664812088, "kl": 0.009355545043945312, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.865091407243394e-07, "loss": 0.0003, "reward": -0.2944636382162571, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2944636382162571, "reward_after_std": 0.4081663638353348, "reward_before_mean": -0.10011844336986542, "reward_before_std": 0.3866820354014635, "reward_change_max": 0.0012456625699996948, "reward_change_mean": -0.19434518041089177, "reward_change_min": -0.35621180944144726, "reward_change_std": 0.13901403080672026, "reward_std": 0.4081663750112057, "rewards/cosine_scaled_reward": -0.14380922354757786, "rewards/format_reward": 0.1875, "step": 154 }, { "advantage_max": 0.759428109973669, "advantage_mean": 2.4214387606136256e-08, "advantage_min": -0.48282021284103394, "advantage_std": 0.44041892513632774, "completion_length": 3424.875, "epoch": 0.0666129465484824, "grad_norm": 0.08082197606563568, "kl": 0.0102996826171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.844151714648274e-07, "loss": 0.0034, "reward": -0.41388820111751556, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.41388820111751556, "reward_after_std": 0.4404189307242632, "reward_before_mean": -0.2649937607347965, "reward_before_std": 0.452032458037138, "reward_change_max": 0.0015329420566558838, "reward_change_mean": -0.1488944012671709, "reward_change_min": -0.2921294569969177, "reward_change_std": 0.1213954221457243, "reward_std": 0.440418953076005, "rewards/cosine_scaled_reward": -0.19499689526855946, "rewards/format_reward": 0.12500000558793545, "step": 155 }, { "advantage_max": 1.1606851816177368, "advantage_mean": 1.5522043095295146e-09, "advantage_min": -0.588886346668005, "advantage_std": 0.6579682305455208, "completion_length": 3266.2709045410156, "epoch": 0.06704270749395648, "grad_norm": 0.12433308362960815, "kl": 0.01091766357421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.823049032816478e-07, "loss": 0.0264, "reward": -0.2891694214195013, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2891694214195013, "reward_after_std": 0.6579682528972626, "reward_before_mean": -0.12456191354431212, "reward_before_std": 0.6774619128555059, "reward_change_max": 0.0016023889183998108, "reward_change_mean": -0.16460750438272953, "reward_change_min": -0.3966493047773838, "reward_change_std": 0.16087091714143753, "reward_std": 0.657968282699585, "rewards/cosine_scaled_reward": -0.20811429433524609, "rewards/format_reward": 0.2916666753590107, "step": 156 }, { "advantage_max": 0.8886239901185036, "advantage_mean": 8.071462720415923e-09, "advantage_min": -0.5048163309693336, "advantage_std": 0.5214715264737606, "completion_length": 3102.520835876465, "epoch": 0.06747246843943057, "grad_norm": 0.0977158397436142, "kl": 0.010652542114257812, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.801784390262943e-07, "loss": 0.0121, "reward": -0.07417371869087219, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07417371869087219, "reward_after_std": 0.5214715301990509, "reward_before_mean": 0.18692484870553017, "reward_before_std": 0.5063307154923677, "reward_change_max": 0.0, "reward_change_mean": -0.2610985357314348, "reward_change_min": -0.4932491388171911, "reward_change_std": 0.1905378568917513, "reward_std": 0.5214715376496315, "rewards/cosine_scaled_reward": -0.04195425100624561, "rewards/format_reward": 0.27083333767950535, "step": 157 }, { "advantage_max": 1.6110506132245064, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.8083760850131512, "advantage_std": 0.8910581842064857, "completion_length": 3297.2291870117188, "epoch": 0.06790222938490464, "grad_norm": 0.17574356496334076, "kl": 0.012479782104492188, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.780358823396352e-07, "loss": 0.0765, "reward": -0.12624962395057082, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.12624962395057082, "reward_after_std": 0.8910581842064857, "reward_before_mean": 0.06499922648072243, "reward_before_std": 0.9125124774873257, "reward_change_max": 0.0010049715638160706, "reward_change_mean": -0.19124885648488998, "reward_change_min": -0.45409322157502174, "reward_change_std": 0.18645085487514734, "reward_std": 0.8910581860691309, "rewards/cosine_scaled_reward": -0.0925003852462396, "rewards/format_reward": 0.2500000037252903, "step": 158 }, { "advantage_max": 0.9454930946230888, "advantage_mean": -9.934107314535368e-09, "advantage_min": -0.48142295703291893, "advantage_std": 0.5349352955818176, "completion_length": 2933.9583587646484, "epoch": 0.06833199033037873, "grad_norm": 0.09202487766742706, "kl": 0.0146636962890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.758773376468604e-07, "loss": 0.0143, "reward": -0.31772942608222365, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.31772942608222365, "reward_after_std": 0.5349352918565273, "reward_before_mean": -0.14811554923653603, "reward_before_std": 0.5379427410662174, "reward_change_max": 0.0005845949053764343, "reward_change_mean": -0.1696139150299132, "reward_change_min": -0.3783189356327057, "reward_change_std": 0.1433570822700858, "reward_std": 0.5349353179335594, "rewards/cosine_scaled_reward": -0.2303077606484294, "rewards/format_reward": 0.31250000186264515, "step": 159 }, { "advantage_max": 0.7223776578903198, "advantage_mean": 1.738468852208186e-08, "advantage_min": -0.4351479932665825, "advantage_std": 0.43599105812609196, "completion_length": 3564.5416870117188, "epoch": 0.06876175127585281, "grad_norm": 0.08207883685827255, "kl": 0.012833595275878906, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.737029101523929e-07, "loss": 0.0128, "reward": -0.48927134834229946, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.48927134834229946, "reward_after_std": 0.43599105440080166, "reward_before_mean": -0.3638004083186388, "reward_before_std": 0.46499946154654026, "reward_change_max": 0.0005485415458679199, "reward_change_mean": -0.12547094374895096, "reward_change_min": -0.26667334139347076, "reward_change_std": 0.11599742714315653, "reward_std": 0.43599108047783375, "rewards/cosine_scaled_reward": -0.19231686927378178, "rewards/format_reward": 0.02083333395421505, "step": 160 }, { "advantage_max": 0.9133197255432606, "advantage_mean": 1.1175870895385742e-08, "advantage_min": -0.45166582614183426, "advantage_std": 0.4987768605351448, "completion_length": 3470.4791870117188, "epoch": 0.06919151222132688, "grad_norm": 0.11344372481107712, "kl": 0.010959625244140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.715127058347614e-07, "loss": -0.003, "reward": -0.48816221579909325, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.48816221579909325, "reward_after_std": 0.4987768717110157, "reward_before_mean": -0.377079501748085, "reward_before_std": 0.5025115385651588, "reward_change_max": 0.002038531005382538, "reward_change_mean": -0.11108272464480251, "reward_change_min": -0.24394665658473969, "reward_change_std": 0.10106323496438563, "reward_std": 0.4987768828868866, "rewards/cosine_scaled_reward": -0.24062308733118698, "rewards/format_reward": 0.1041666679084301, "step": 161 }, { "advantage_max": 0.6994108408689499, "advantage_mean": -6.829698528942885e-09, "advantage_min": -0.4849977046251297, "advantage_std": 0.43067341670393944, "completion_length": 3350.750030517578, "epoch": 0.06962127316680097, "grad_norm": 0.08091778308153152, "kl": 0.012584686279296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.693068314414344e-07, "loss": 0.0003, "reward": -0.36579396575689316, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.36579396575689316, "reward_after_std": 0.43067340925335884, "reward_before_mean": -0.19506661966443062, "reward_before_std": 0.45358215272426605, "reward_change_max": 0.0009861290454864502, "reward_change_mean": -0.1707273656502366, "reward_change_min": -0.3447840493172407, "reward_change_std": 0.13888960145413876, "reward_std": 0.43067343160510063, "rewards/cosine_scaled_reward": -0.170449985191226, "rewards/format_reward": 0.14583333395421505, "step": 162 }, { "advantage_max": 0.858251329511404, "advantage_mean": 1.490116174895917e-08, "advantage_min": -0.4266858883202076, "advantage_std": 0.48167357221245766, "completion_length": 3568.5416870117188, "epoch": 0.07005103411227505, "grad_norm": 0.08470692485570908, "kl": 0.0076351165771484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.670853944836176e-07, "loss": 0.0101, "reward": -0.5125717390328646, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5125717390328646, "reward_after_std": 0.48167360574007034, "reward_before_mean": -0.4051659945398569, "reward_before_std": 0.49607279896736145, "reward_change_max": 0.0007114261388778687, "reward_change_mean": -0.10740576032549143, "reward_change_min": -0.25392854399979115, "reward_change_std": 0.10439469665288925, "reward_std": 0.48167362436652184, "rewards/cosine_scaled_reward": -0.22341632470488548, "rewards/format_reward": 0.0416666679084301, "step": 163 }, { "advantage_max": 1.2096691243350506, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.5405268520116806, "advantage_std": 0.6684646289795637, "completion_length": 3177.0834045410156, "epoch": 0.07048079505774912, "grad_norm": 0.12623926997184753, "kl": 0.012638092041015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.648485032310144e-07, "loss": 0.0515, "reward": -0.26291514933109283, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.26291514933109283, "reward_after_std": 0.6684646289795637, "reward_before_mean": -0.0927696768194437, "reward_before_std": 0.6693109106272459, "reward_change_max": 0.00022067874670028687, "reward_change_mean": -0.17014547809958458, "reward_change_min": -0.3799786511808634, "reward_change_std": 0.14870858285576105, "reward_std": 0.6684646625071764, "rewards/cosine_scaled_reward": -0.18180151097476482, "rewards/format_reward": 0.27083334140479565, "step": 164 }, { "advantage_max": 1.4417489916086197, "advantage_mean": 6.208817238118058e-09, "advantage_min": -0.8711266815662384, "advantage_std": 0.8601990677416325, "completion_length": 3151.291702270508, "epoch": 0.07091055600322321, "grad_norm": 0.16582193970680237, "kl": 0.011958122253417969, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.625962667065487e-07, "loss": 0.014, "reward": 0.06220081262290478, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06220081262290478, "reward_after_std": 0.8601990453898907, "reward_before_mean": 0.3324306955037173, "reward_before_std": 0.9068262577056885, "reward_change_max": 0.0020727813243865967, "reward_change_mean": -0.2702298825606704, "reward_change_min": -0.582868367433548, "reward_change_std": 0.24314747657626867, "reward_std": 0.8601990826427937, "rewards/cosine_scaled_reward": -0.0004513245075941086, "rewards/format_reward": 0.33333334140479565, "step": 165 }, { "advantage_max": 0.8153008930385113, "advantage_mean": -8.071462498371318e-09, "advantage_min": -0.36161188036203384, "advantage_std": 0.4328983686864376, "completion_length": 3182.125030517578, "epoch": 0.07134031694869729, "grad_norm": 0.06606867909431458, "kl": 0.008549690246582031, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.603287946810513e-07, "loss": 0.0075, "reward": -0.04676421731710434, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.04676421731710434, "reward_after_std": 0.4328983649611473, "reward_before_mean": 0.2303354497998953, "reward_before_std": 0.354783670976758, "reward_change_max": 0.0006325840950012207, "reward_change_mean": -0.27709967899136245, "reward_change_min": -0.4251525029540062, "reward_change_std": 0.16204779176041484, "reward_std": 0.4328983686864376, "rewards/cosine_scaled_reward": -0.030665608122944832, "rewards/format_reward": 0.2916666679084301, "step": 166 }, { "advantage_max": 1.2702331133186817, "advantage_mean": -1.3659398279131096e-08, "advantage_min": -0.8416054770350456, "advantage_std": 0.7631019800901413, "completion_length": 2835.937530517578, "epoch": 0.07177007789417136, "grad_norm": 0.1930035650730133, "kl": 0.00884246826171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.580461976679099e-07, "loss": 0.0716, "reward": -0.02532830648124218, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.02532830648124218, "reward_after_std": 0.7631019726395607, "reward_before_mean": 0.22559620370157063, "reward_before_std": 0.8085959181189537, "reward_change_max": 0.0007076486945152283, "reward_change_mean": -0.25092452485114336, "reward_change_min": -0.4704540781676769, "reward_change_std": 0.20873789209872484, "reward_std": 0.7631019800901413, "rewards/cosine_scaled_reward": -0.04345189966261387, "rewards/format_reward": 0.3125000074505806, "step": 167 }, { "advantage_max": 1.1199279725551605, "advantage_mean": -2.4835269951672956e-09, "advantage_min": -0.5489422678947449, "advantage_std": 0.6467698775231838, "completion_length": 3491.812530517578, "epoch": 0.07219983883964545, "grad_norm": 0.1348467618227005, "kl": 0.01166534423828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.557485869176825e-07, "loss": 0.0423, "reward": -0.3847584221512079, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3847584221512079, "reward_after_std": 0.6467698700726032, "reward_before_mean": -0.2513616408687085, "reward_before_std": 0.6858641244471073, "reward_change_max": 0.001642569899559021, "reward_change_mean": -0.13339679222553968, "reward_change_min": -0.37166671827435493, "reward_change_std": 0.15345313027501106, "reward_std": 0.646769892424345, "rewards/cosine_scaled_reward": -0.17776415729895234, "rewards/format_reward": 0.10416666977107525, "step": 168 }, { "advantage_max": 0.989535316824913, "advantage_mean": 4.346172199909404e-09, "advantage_min": -0.6400273889303207, "advantage_std": 0.6340872179716825, "completion_length": 3243.375030517578, "epoch": 0.07262959978511953, "grad_norm": 0.11802002042531967, "kl": 0.0119476318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.534360744126753e-07, "loss": 0.018, "reward": -0.22826672531664371, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.22826672531664371, "reward_after_std": 0.634087236598134, "reward_before_mean": -0.029737839475274086, "reward_before_std": 0.6964433901011944, "reward_change_max": 0.0008659511804580688, "reward_change_mean": -0.19852887140586972, "reward_change_min": -0.47646634839475155, "reward_change_std": 0.19656233675777912, "reward_std": 0.6340872850269079, "rewards/cosine_scaled_reward": -0.15028559789061546, "rewards/format_reward": 0.27083333395421505, "step": 169 }, { "advantage_max": 1.6323509365320206, "advantage_mean": -1.8626452602532595e-09, "advantage_min": -0.8243374153971672, "advantage_std": 0.9356481861323118, "completion_length": 3323.3334045410156, "epoch": 0.0730593607305936, "grad_norm": 0.16599643230438232, "kl": 0.005237579345703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.511087728614862e-07, "loss": 0.0651, "reward": -0.06035856995731592, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06035856995731592, "reward_after_std": 0.9356481861323118, "reward_before_mean": 0.1534761432558298, "reward_before_std": 0.9853157699108124, "reward_change_max": 0.0007859021425247192, "reward_change_mean": -0.2138347232248634, "reward_change_min": -0.5527817774564028, "reward_change_std": 0.22042589006014168, "reward_std": 0.9356481917202473, "rewards/cosine_scaled_reward": -0.05867859348654747, "rewards/format_reward": 0.27083334140479565, "step": 170 }, { "advantage_max": 1.0350098237395287, "advantage_mean": 1.490116130486996e-08, "advantage_min": -0.5473735257983208, "advantage_std": 0.598974671214819, "completion_length": 3127.770835876465, "epoch": 0.07348912167606769, "grad_norm": 0.10873287916183472, "kl": 0.01145172119140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.487667956935087e-07, "loss": 0.0409, "reward": -0.29191356152296066, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.29191356152296066, "reward_after_std": 0.598974671214819, "reward_before_mean": -0.11775195598602295, "reward_before_std": 0.61875119805336, "reward_change_max": 0.00010274350643157959, "reward_change_mean": -0.17416160460561514, "reward_change_min": -0.3973977956920862, "reward_change_std": 0.1573415184393525, "reward_std": 0.5989746749401093, "rewards/cosine_scaled_reward": -0.16304264590144157, "rewards/format_reward": 0.2083333358168602, "step": 171 }, { "advantage_max": 1.0359412021934986, "advantage_mean": 1.8626452102932234e-08, "advantage_min": -0.6736522912979126, "advantage_std": 0.6378682851791382, "completion_length": 3360.6666870117188, "epoch": 0.07391888262154177, "grad_norm": 0.10557280480861664, "kl": 0.010257720947265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.464102570534061e-07, "loss": 0.008, "reward": 0.012850821018218994, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.012850821018218994, "reward_after_std": 0.6378682851791382, "reward_before_mean": 0.2944837249815464, "reward_before_std": 0.6508672572672367, "reward_change_max": 0.0007623210549354553, "reward_change_mean": -0.28163284342736006, "reward_change_min": -0.553506001830101, "reward_change_std": 0.22230198234319687, "reward_std": 0.6378683038055897, "rewards/cosine_scaled_reward": 0.011825168505311012, "rewards/format_reward": 0.2708333395421505, "step": 172 }, { "advantage_max": 0.48780016228556633, "advantage_mean": 1.4280279680978225e-08, "advantage_min": -0.33234717324376106, "advantage_std": 0.2977281454950571, "completion_length": 3570.0, "epoch": 0.07434864356701584, "grad_norm": 0.050988636910915375, "kl": 0.011743545532226562, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.440392717955475e-07, "loss": 0.0039, "reward": -0.5822819881141186, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5822819881141186, "reward_after_std": 0.29772814363241196, "reward_before_mean": -0.4728614240884781, "reward_before_std": 0.31291849352419376, "reward_change_max": 0.00019318610429763794, "reward_change_mean": -0.10942056961357594, "reward_change_min": -0.21841607056558132, "reward_change_std": 0.09234144864603877, "reward_std": 0.29772815480828285, "rewards/cosine_scaled_reward": -0.246847378090024, "rewards/format_reward": 0.02083333395421505, "step": 173 }, { "advantage_max": 0.9358778446912766, "advantage_mean": 1.1175871339474952e-08, "advantage_min": -0.6883617341518402, "advantage_std": 0.588120773434639, "completion_length": 2973.0833587646484, "epoch": 0.07477840451248993, "grad_norm": 0.10092277824878693, "kl": 0.014841079711914062, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.416539554784089e-07, "loss": 0.0023, "reward": -0.04580974578857422, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04580974578857422, "reward_after_std": 0.5881207697093487, "reward_before_mean": 0.22163715958595276, "reward_before_std": 0.6191687509417534, "reward_change_max": 0.0005386844277381897, "reward_change_mean": -0.26744689233601093, "reward_change_min": -0.5027681104838848, "reward_change_std": 0.20541273383423686, "reward_std": 0.5881207957863808, "rewards/cosine_scaled_reward": -0.06626475416123867, "rewards/format_reward": 0.35416667722165585, "step": 174 }, { "advantage_max": 1.193859301507473, "advantage_mean": -1.1102230246251565e-16, "advantage_min": -0.6274926550686359, "advantage_std": 0.6849896423518658, "completion_length": 3280.250030517578, "epoch": 0.07520816545796401, "grad_norm": 0.12918001413345337, "kl": 0.014101982116699219, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.392544243589427e-07, "loss": 0.0456, "reward": -0.22920513665303588, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22920513665303588, "reward_after_std": 0.6849896349012852, "reward_before_mean": -0.04509166069328785, "reward_before_std": 0.7091170400381088, "reward_change_max": 0.0010616108775138855, "reward_change_mean": -0.1841134876012802, "reward_change_min": -0.40069411508738995, "reward_change_std": 0.16558128874748945, "reward_std": 0.6849896684288979, "rewards/cosine_scaled_reward": -0.1579625024460256, "rewards/format_reward": 0.27083333767950535, "step": 175 }, { "advantage_max": 0.7348998486995697, "advantage_mean": 3.1044086745701804e-08, "advantage_min": -0.3947395123541355, "advantage_std": 0.41977166198194027, "completion_length": 3382.7916717529297, "epoch": 0.07563792640343808, "grad_norm": 0.07057182490825653, "kl": 0.010601043701171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.368407953869103e-07, "loss": 0.0176, "reward": -0.45237624645233154, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.45237624645233154, "reward_after_std": 0.41977166570723057, "reward_before_mean": -0.3144308291375637, "reward_before_std": 0.4237563405185938, "reward_change_max": 0.0014314055442810059, "reward_change_mean": -0.13794539403170347, "reward_change_min": -0.28073785454034805, "reward_change_std": 0.1144007327966392, "reward_std": 0.4197716787457466, "rewards/cosine_scaled_reward": -0.20929875038564205, "rewards/format_reward": 0.1041666716337204, "step": 176 }, { "advantage_max": 1.2051672860980034, "advantage_mean": 7.45058115203534e-09, "advantage_min": -0.6239040419459343, "advantage_std": 0.6959295272827148, "completion_length": 3077.7708740234375, "epoch": 0.07606768734891217, "grad_norm": 0.156153604388237, "kl": 0.010128974914550781, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.344131861991828e-07, "loss": 0.0221, "reward": 0.0260214414447546, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0260214414447546, "reward_after_std": 0.6959295459091663, "reward_before_mean": 0.3021874763071537, "reward_before_std": 0.6823255415074527, "reward_change_max": 0.0006004571914672852, "reward_change_mean": -0.27616604696959257, "reward_change_min": -0.571710079908371, "reward_change_std": 0.21783905010670424, "reward_std": 0.6959295533597469, "rewards/cosine_scaled_reward": -0.03640626324340701, "rewards/format_reward": 0.3750000037252903, "step": 177 }, { "advantage_max": 1.2873021438717842, "advantage_mean": 4.346172421954009e-09, "advantage_min": -0.6903954669833183, "advantage_std": 0.7695224657654762, "completion_length": 3335.9583435058594, "epoch": 0.07649744829438625, "grad_norm": 0.16415968537330627, "kl": 0.013065338134765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.319717151140072e-07, "loss": 0.0374, "reward": 0.021780354902148247, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.021780354902148247, "reward_after_std": 0.7695224583148956, "reward_before_mean": 0.28958842996507883, "reward_before_std": 0.7912663780152798, "reward_change_max": 0.0017235055565834045, "reward_change_mean": -0.26780809834599495, "reward_change_min": -0.5760630331933498, "reward_change_std": 0.23950838297605515, "reward_std": 0.7695224769413471, "rewards/cosine_scaled_reward": -0.0010391036048531532, "rewards/format_reward": 0.2916666716337204, "step": 178 }, { "advantage_max": 1.1962079554796219, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.7053538411855698, "advantage_std": 0.7477802988141775, "completion_length": 3028.9375610351562, "epoch": 0.07692720923986032, "grad_norm": 0.21084211766719818, "kl": 0.0161895751953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.295165011252396e-07, "loss": 0.0831, "reward": -0.05178416799753904, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05178416799753904, "reward_after_std": 0.7477803211659193, "reward_before_mean": 0.19438596442341805, "reward_before_std": 0.8112748600542545, "reward_change_max": 0.0007152482867240906, "reward_change_mean": -0.24617013707756996, "reward_change_min": -0.5908875241875648, "reward_change_std": 0.23455293662846088, "reward_std": 0.7477803360670805, "rewards/cosine_scaled_reward": -0.06947368441615254, "rewards/format_reward": 0.3333333432674408, "step": 179 }, { "advantage_max": 1.1847373992204666, "advantage_mean": 1.055498977109437e-08, "advantage_min": -0.7390188351273537, "advantage_std": 0.7084804400801659, "completion_length": 3437.9791870117188, "epoch": 0.07735697018533441, "grad_norm": 0.1446566879749298, "kl": 0.018016815185546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.270476638965461e-07, "loss": 0.0302, "reward": -0.21776014706119895, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21776014706119895, "reward_after_std": 0.7084804400801659, "reward_before_mean": -0.02913693431764841, "reward_before_std": 0.7591647729277611, "reward_change_max": 0.00021192431449890137, "reward_change_mean": -0.18862321227788925, "reward_change_min": -0.428165839985013, "reward_change_std": 0.1826903447508812, "reward_std": 0.7084804475307465, "rewards/cosine_scaled_reward": -0.09790180483832955, "rewards/format_reward": 0.16666667349636555, "step": 180 }, { "advantage_max": 0.7088275849819183, "advantage_mean": 9.313225801665936e-09, "advantage_min": -0.3866943493485451, "advantage_std": 0.4092123806476593, "completion_length": 3513.2916870117188, "epoch": 0.07778673113080849, "grad_norm": 0.07776975631713867, "kl": 0.01340484619140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.245653237555705e-07, "loss": 0.0164, "reward": -0.5235188025981188, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5235188025981188, "reward_after_std": 0.4092123731970787, "reward_before_mean": -0.4094222137937322, "reward_before_std": 0.4213923439383507, "reward_change_max": 0.0006309151649475098, "reward_change_mean": -0.11409659218043089, "reward_change_min": -0.2683709114789963, "reward_change_std": 0.10639214981347322, "reward_std": 0.409212376922369, "rewards/cosine_scaled_reward": -0.25679444521665573, "rewards/format_reward": 0.1041666679084301, "step": 181 }, { "advantage_max": 0.8070258349180222, "advantage_mean": 3.476937704416372e-08, "advantage_min": -0.4562007486820221, "advantage_std": 0.4696733020246029, "completion_length": 3533.375030517578, "epoch": 0.07821649207628256, "grad_norm": 0.08872150629758835, "kl": 0.011148452758789062, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.220696016880687e-07, "loss": 0.0037, "reward": -0.4364183498546481, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4364183498546481, "reward_after_std": 0.4696733057498932, "reward_before_mean": -0.2984547335654497, "reward_before_std": 0.48629674687981606, "reward_change_max": 0.00034938007593154907, "reward_change_mean": -0.1379636018536985, "reward_change_min": -0.28605207800865173, "reward_change_std": 0.11937537975609303, "reward_std": 0.4696733094751835, "rewards/cosine_scaled_reward": -0.19089403236284852, "rewards/format_reward": 0.08333333395421505, "step": 182 }, { "advantage_max": 1.155573084950447, "advantage_mean": 3.3306690738754696e-16, "advantage_min": -0.6348418109118938, "advantage_std": 0.6799715030938387, "completion_length": 2938.3333740234375, "epoch": 0.07864625302175665, "grad_norm": 0.11650112271308899, "kl": 0.00815582275390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.195606193320136e-07, "loss": 0.0549, "reward": -0.004433078691363335, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.004433078691363335, "reward_after_std": 0.679971500299871, "reward_before_mean": 0.2628408339805901, "reward_before_std": 0.6786674512550235, "reward_change_max": 0.0005815327167510986, "reward_change_mean": -0.26727388240396976, "reward_change_min": -0.5546955578029156, "reward_change_std": 0.2210416877642274, "reward_std": 0.6799715263769031, "rewards/cosine_scaled_reward": -0.035246262326836586, "rewards/format_reward": 0.33333333767950535, "step": 183 }, { "advantage_max": 1.0457139946520329, "advantage_mean": 1.831601090240831e-08, "advantage_min": -0.5230764821171761, "advantage_std": 0.5822414234280586, "completion_length": 3421.1041870117188, "epoch": 0.07907601396723073, "grad_norm": 0.11961822211742401, "kl": 0.0168609619140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.170384989716657e-07, "loss": 0.044, "reward": -0.32291028276085854, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32291028276085854, "reward_after_std": 0.5822414308786392, "reward_before_mean": -0.1608234893064946, "reward_before_std": 0.5879745110869408, "reward_change_max": 0.0006099268794059753, "reward_change_mean": -0.16208678274415433, "reward_change_min": -0.32361212745308876, "reward_change_std": 0.1342193973250687, "reward_std": 0.5822414383292198, "rewards/cosine_scaled_reward": -0.17416174430400133, "rewards/format_reward": 0.1875000037252903, "step": 184 }, { "advantage_max": 1.3376928567886353, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.7337286621332169, "advantage_std": 0.7520164921879768, "completion_length": 3306.041717529297, "epoch": 0.0795057749127048, "grad_norm": 0.12118222564458847, "kl": 0.010723114013671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.145033635316128e-07, "loss": 0.019, "reward": -0.11233397759497166, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.11233397759497166, "reward_after_std": 0.7520164921879768, "reward_before_mean": 0.10431447811424732, "reward_before_std": 0.7659662291407585, "reward_change_max": 5.8069825172424316e-05, "reward_change_mean": -0.21664845943450928, "reward_change_min": -0.4292383808642626, "reward_change_std": 0.17078289669007063, "reward_std": 0.7520164959132671, "rewards/cosine_scaled_reward": -0.10409277770668268, "rewards/format_reward": 0.31250000558793545, "step": 185 }, { "advantage_max": 1.1260808184742928, "advantage_mean": 1.1486312401221e-08, "advantage_min": -0.5967557281255722, "advantage_std": 0.6546710543334484, "completion_length": 3436.854217529297, "epoch": 0.07993553585817889, "grad_norm": 0.12773968279361725, "kl": 0.018035888671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.119553365707802e-07, "loss": 0.0436, "reward": -0.2607352174818516, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2607352174818516, "reward_after_std": 0.654671061784029, "reward_before_mean": -0.08288030326366425, "reward_before_std": 0.6853405237197876, "reward_change_max": 0.0010144487023353577, "reward_change_mean": -0.1778549230657518, "reward_change_min": -0.44735531136393547, "reward_change_std": 0.17245432687923312, "reward_std": 0.6546710804104805, "rewards/cosine_scaled_reward": -0.15602348558604717, "rewards/format_reward": 0.22916666977107525, "step": 186 }, { "advantage_max": 0.8280354104936123, "advantage_mean": 2.4214387883692012e-08, "advantage_min": -0.5358212403953075, "advantage_std": 0.49056626483798027, "completion_length": 3322.7083435058594, "epoch": 0.08036529680365297, "grad_norm": 0.07801033556461334, "kl": 0.01351165771484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.093945422764069e-07, "loss": 0.0202, "reward": -0.41039771866053343, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.41039771866053343, "reward_after_std": 0.49056626856327057, "reward_before_mean": -0.26508296839892864, "reward_before_std": 0.515035055577755, "reward_change_max": 0.0014386698603630066, "reward_change_mean": -0.1453147353604436, "reward_change_min": -0.29577529057860374, "reward_change_std": 0.127681078389287, "reward_std": 0.49056627973914146, "rewards/cosine_scaled_reward": -0.2367081604897976, "rewards/format_reward": 0.2083333395421505, "step": 187 }, { "advantage_max": 1.0535172298550606, "advantage_mean": 5.58793583627093e-09, "advantage_min": -0.6584957391023636, "advantage_std": 0.6703969947993755, "completion_length": 3186.791702270508, "epoch": 0.08079505774912704, "grad_norm": 0.14253924787044525, "kl": 0.013067245483398438, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.068211054579943e-07, "loss": 0.0502, "reward": -0.19751077517867088, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19751077517867088, "reward_after_std": 0.670397013425827, "reward_before_mean": 0.007553219795227051, "reward_before_std": 0.7357200235128403, "reward_change_max": 0.0008845403790473938, "reward_change_mean": -0.2050639889203012, "reward_change_min": -0.5131743811070919, "reward_change_std": 0.20778892328962684, "reward_std": 0.6703970432281494, "rewards/cosine_scaled_reward": -0.12122339941561222, "rewards/format_reward": 0.2500000074505806, "step": 188 }, { "advantage_max": 0.8318752348423004, "advantage_mean": 2.545615107596433e-08, "advantage_min": -0.5120810121297836, "advantage_std": 0.5004717111587524, "completion_length": 3200.2708587646484, "epoch": 0.08122481869460113, "grad_norm": 0.0782642662525177, "kl": 0.01650238037109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.04235151541222e-07, "loss": 0.0062, "reward": -0.3036547750234604, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3036547750234604, "reward_after_std": 0.5004717037081718, "reward_before_mean": -0.12060155533254147, "reward_before_std": 0.524594958871603, "reward_change_max": 0.0004901140928268433, "reward_change_mean": -0.18305321456864476, "reward_change_min": -0.3688211031258106, "reward_change_std": 0.15304205939173698, "reward_std": 0.5004717167466879, "rewards/cosine_scaled_reward": -0.1644674427807331, "rewards/format_reward": 0.2083333358168602, "step": 189 }, { "advantage_max": 0.8493131250143051, "advantage_mean": 1.0554989382516311e-08, "advantage_min": -0.4793839789927006, "advantage_std": 0.5028671827167273, "completion_length": 3173.187515258789, "epoch": 0.0816545796400752, "grad_norm": 0.07996918261051178, "kl": 0.011198043823242188, "lambda_div_used": 0.7000000000000001, "learning_rate": 8.01636806561836e-07, "loss": 0.0093, "reward": -0.13453809544444084, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13453809544444084, "reward_after_std": 0.5028671771287918, "reward_before_mean": 0.10901417583227158, "reward_before_std": 0.48567812517285347, "reward_change_max": 0.0002104341983795166, "reward_change_mean": -0.24355223681777716, "reward_change_min": -0.45941703766584396, "reward_change_std": 0.17714389227330685, "reward_std": 0.5028671976178885, "rewards/cosine_scaled_reward": -0.08090959675610065, "rewards/format_reward": 0.27083333395421505, "step": 190 }, { "advantage_max": 0.7615587934851646, "advantage_mean": 1.5522043539384356e-08, "advantage_min": -0.3858828917145729, "advantage_std": 0.41652533411979675, "completion_length": 3582.375, "epoch": 0.08208434058554928, "grad_norm": 0.08212199807167053, "kl": 0.0157470703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.990261971595048e-07, "loss": 0.0013, "reward": -0.554683580994606, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.554683580994606, "reward_after_std": 0.41652533039450645, "reward_before_mean": -0.4555174186825752, "reward_before_std": 0.41887341812253, "reward_change_max": 0.0010845884680747986, "reward_change_mean": -0.09916617348790169, "reward_change_min": -0.22013112902641296, "reward_change_std": 0.08749024663120508, "reward_std": 0.41652534157037735, "rewards/cosine_scaled_reward": -0.24859204329550266, "rewards/format_reward": 0.0416666679084301, "step": 191 }, { "advantage_max": 0.9800485111773014, "advantage_mean": 1.80055704790405e-08, "advantage_min": -0.6958292573690414, "advantage_std": 0.6137600149959326, "completion_length": 3487.8333740234375, "epoch": 0.08251410153102337, "grad_norm": 0.11996909976005554, "kl": 0.017200469970703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.964034505716476e-07, "loss": 0.0289, "reward": -0.17976875603199005, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17976875603199005, "reward_after_std": 0.6137600224465132, "reward_before_mean": 0.037780795246362686, "reward_before_std": 0.6606552116572857, "reward_change_max": 0.001429736614227295, "reward_change_mean": -0.2175495270639658, "reward_change_min": -0.43880798295140266, "reward_change_std": 0.1941533787176013, "reward_std": 0.6137600485235453, "rewards/cosine_scaled_reward": -0.07485961355268955, "rewards/format_reward": 0.18750000558793545, "step": 192 }, { "advantage_max": 0.9953562207520008, "advantage_mean": 1.800557009046244e-08, "advantage_min": -0.4999169632792473, "advantage_std": 0.5396412769332528, "completion_length": 3584.0, "epoch": 0.08294386247649745, "grad_norm": 0.12422219663858414, "kl": 0.0122222900390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.93768694627233e-07, "loss": 0.0005, "reward": -0.45785919670015574, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.45785919670015574, "reward_after_std": 0.5396412769332528, "reward_before_mean": -0.3399127656593919, "reward_before_std": 0.5416818736121058, "reward_change_max": 0.000789545476436615, "reward_change_mean": -0.11794641090091318, "reward_change_min": -0.23652780055999756, "reward_change_std": 0.10019729030318558, "reward_std": 0.5396413076668978, "rewards/cosine_scaled_reward": -0.20120639353990555, "rewards/format_reward": 0.06250000186264515, "step": 193 }, { "advantage_max": 1.010417964309454, "advantage_mean": 2.4214387217558198e-08, "advantage_min": -0.5382858663797379, "advantage_std": 0.5854761768132448, "completion_length": 3532.6875, "epoch": 0.08337362342197153, "grad_norm": 0.10754309594631195, "kl": 0.01897430419921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.911220577405484e-07, "loss": 0.0182, "reward": -0.40681685507297516, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.40681685507297516, "reward_after_std": 0.5854761730879545, "reward_before_mean": -0.2731348928064108, "reward_before_std": 0.6199114341288805, "reward_change_max": 0.0005270689725875854, "reward_change_mean": -0.13368195202201605, "reward_change_min": -0.35587823763489723, "reward_change_std": 0.14496254827827215, "reward_std": 0.5854762140661478, "rewards/cosine_scaled_reward": -0.19906744733452797, "rewards/format_reward": 0.1250000037252903, "step": 194 }, { "advantage_max": 0.7860080115497112, "advantage_mean": -6.208818459363386e-10, "advantage_min": -0.5577155165374279, "advantage_std": 0.472563786432147, "completion_length": 3135.125030517578, "epoch": 0.08380338436744561, "grad_norm": 0.09463950246572495, "kl": 0.015666961669921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.884636689049422e-07, "loss": 0.0321, "reward": -0.29653753712773323, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29653753712773323, "reward_after_std": 0.4725637901574373, "reward_before_mean": -0.10748099163174629, "reward_before_std": 0.48971202224493027, "reward_change_max": 0.0012012496590614319, "reward_change_mean": -0.1890565464273095, "reward_change_min": -0.3732186555862427, "reward_change_std": 0.15037773409858346, "reward_std": 0.4725638087838888, "rewards/cosine_scaled_reward": -0.18915717117488384, "rewards/format_reward": 0.27083333767950535, "step": 195 }, { "advantage_max": 1.029255710542202, "advantage_mean": -1.8626448716752009e-09, "advantage_min": -0.4481686241924763, "advantage_std": 0.5629460588097572, "completion_length": 3521.3958435058594, "epoch": 0.08423314531291969, "grad_norm": 0.10178162902593613, "kl": 0.0207977294921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.857936576865356e-07, "loss": 0.0139, "reward": -0.37483600026462227, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.37483600026462227, "reward_after_std": 0.5629460625350475, "reward_before_mean": -0.23160822317004204, "reward_before_std": 0.5618504639714956, "reward_change_max": 0.0006427690386772156, "reward_change_mean": -0.14322778396308422, "reward_change_min": -0.32204058207571507, "reward_change_std": 0.12022016104310751, "reward_std": 0.5629460662603378, "rewards/cosine_scaled_reward": -0.14705411344766617, "rewards/format_reward": 0.06250000186264515, "step": 196 }, { "advantage_max": 0.6598530858755112, "advantage_mean": 2.2972624191819335e-08, "advantage_min": -0.4701697528362274, "advantage_std": 0.3981821835041046, "completion_length": 3191.1666870117188, "epoch": 0.08466290625839377, "grad_norm": 0.07793503254652023, "kl": 0.02243804931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.831121542179086e-07, "loss": 0.0317, "reward": -0.49536735005676746, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.49536735005676746, "reward_after_std": 0.3981821835041046, "reward_before_mean": -0.36842376086860895, "reward_before_std": 0.42280471697449684, "reward_change_max": 0.0026790574193000793, "reward_change_mean": -0.12694358173757792, "reward_change_min": -0.2553009930998087, "reward_change_std": 0.11298575263936073, "reward_std": 0.3981821835041046, "rewards/cosine_scaled_reward": -0.2779618799686432, "rewards/format_reward": 0.18750000558793545, "step": 197 }, { "advantage_max": 0.876005370169878, "advantage_mean": -3.601113912621301e-08, "advantage_min": -0.7180985324084759, "advantage_std": 0.5528365727514029, "completion_length": 3135.7708740234375, "epoch": 0.08509266720386785, "grad_norm": 0.09642280638217926, "kl": 0.0087738037109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.804192891917571e-07, "loss": 0.0242, "reward": 0.03133315593004227, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.03133315593004227, "reward_after_std": 0.5528365802019835, "reward_before_mean": 0.3315879050642252, "reward_before_std": 0.568988960236311, "reward_change_max": 0.0009246021509170532, "reward_change_mean": -0.3002547863870859, "reward_change_min": -0.5340749658644199, "reward_change_std": 0.21413663309067488, "reward_std": 0.5528366155922413, "rewards/cosine_scaled_reward": -0.03212272375822067, "rewards/format_reward": 0.3958333395421505, "step": 198 }, { "advantage_max": 0.7122074216604233, "advantage_mean": 2.2972623747730125e-08, "advantage_min": -0.4363497868180275, "advantage_std": 0.4238877035677433, "completion_length": 3401.312530517578, "epoch": 0.08552242814934193, "grad_norm": 0.07028008997440338, "kl": 0.0142822265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.777151938545235e-07, "loss": 0.0242, "reward": -0.493546555750072, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.493546555750072, "reward_after_std": 0.4238877072930336, "reward_before_mean": -0.369287371635437, "reward_before_std": 0.44754020869731903, "reward_change_max": 0.0011799708008766174, "reward_change_mean": -0.12425918504595757, "reward_change_min": -0.30985550582408905, "reward_change_std": 0.11880704201757908, "reward_std": 0.4238877221941948, "rewards/cosine_scaled_reward": -0.2888103537261486, "rewards/format_reward": 0.2083333395421505, "step": 199 }, { "advantage_max": 0.6079084314405918, "advantage_mean": -9.934107647602275e-09, "advantage_min": -0.38914382830262184, "advantage_std": 0.37304335460066795, "completion_length": 3357.6458435058594, "epoch": 0.08595218909481601, "grad_norm": 0.07836779206991196, "kl": 0.0166778564453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.75e-07, "loss": -0.0075, "reward": -0.4145989157259464, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4145989157259464, "reward_after_std": 0.3730433527380228, "reward_before_mean": -0.2549329958856106, "reward_before_std": 0.38267185539007187, "reward_change_max": 0.0007638335227966309, "reward_change_mean": -0.15966594475321472, "reward_change_min": -0.31076955795288086, "reward_change_std": 0.12437027436681092, "reward_std": 0.3730433639138937, "rewards/cosine_scaled_reward": -0.189966494217515, "rewards/format_reward": 0.125, "step": 200 }, { "advantage_max": 1.5218465253710747, "advantage_mean": 1.4901160749758446e-08, "advantage_min": -0.7611628621816635, "advantage_std": 0.8992416812106967, "completion_length": 3249.7083740234375, "epoch": 0.08638195004029009, "grad_norm": 0.1851598471403122, "kl": 0.01325225830078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.72273839962904e-07, "loss": 0.0937, "reward": -0.2030074130743742, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2030074130743742, "reward_after_std": 0.8992416812106967, "reward_before_mean": -0.03356902580708265, "reward_before_std": 0.9745304333046079, "reward_change_max": 0.002616606652736664, "reward_change_mean": -0.1694384040310979, "reward_change_min": -0.5086619816720486, "reward_change_std": 0.20800044480711222, "reward_std": 0.8992416961118579, "rewards/cosine_scaled_reward": -0.12095117103308439, "rewards/format_reward": 0.20833333767950535, "step": 201 }, { "advantage_max": 0.7933013215661049, "advantage_mean": 2.0489096974607435e-08, "advantage_min": -0.4925626441836357, "advantage_std": 0.46817703545093536, "completion_length": 3471.7291870117188, "epoch": 0.08681171098576417, "grad_norm": 0.08827139437198639, "kl": 0.014617919921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.695368466124296e-07, "loss": 0.002, "reward": -0.4055037163197994, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4055037163197994, "reward_after_std": 0.46817703172564507, "reward_before_mean": -0.25583774596452713, "reward_before_std": 0.4868803024291992, "reward_change_max": 0.0019458457827568054, "reward_change_mean": -0.14966594707220793, "reward_change_min": -0.31421347334980965, "reward_change_std": 0.1301927911117673, "reward_std": 0.46817703545093536, "rewards/cosine_scaled_reward": -0.2008355405414477, "rewards/format_reward": 0.1458333395421505, "step": 202 }, { "advantage_max": 1.3730687573552132, "advantage_mean": 2.669791410170319e-08, "advantage_min": -0.696600504219532, "advantage_std": 0.7600562870502472, "completion_length": 3327.9166870117188, "epoch": 0.08724147193123825, "grad_norm": 0.22163335978984833, "kl": 0.019535064697265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.667891533457718e-07, "loss": 0.0468, "reward": -0.18882405757904053, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18882405757904053, "reward_after_std": 0.7600562926381826, "reward_before_mean": -0.0028842128813266754, "reward_before_std": 0.7773140594363213, "reward_change_max": 0.0014945417642593384, "reward_change_mean": -0.18593982560560107, "reward_change_min": -0.42244331166148186, "reward_change_std": 0.16642792336642742, "reward_std": 0.7600563056766987, "rewards/cosine_scaled_reward": -0.11602545157074928, "rewards/format_reward": 0.22916667349636555, "step": 203 }, { "advantage_max": 0.7738309167325497, "advantage_mean": -8.071462331837864e-09, "advantage_min": -0.41631995141506195, "advantage_std": 0.4440557360649109, "completion_length": 3409.7916870117188, "epoch": 0.08767123287671233, "grad_norm": 0.08583240211009979, "kl": 0.0116424560546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.640308940816239e-07, "loss": 0.0273, "reward": -0.39117778837680817, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.39117778837680817, "reward_after_std": 0.44405573420226574, "reward_before_mean": -0.23571807518601418, "reward_before_std": 0.44661546126008034, "reward_change_max": 0.0, "reward_change_mean": -0.1554597271606326, "reward_change_min": -0.3242712374776602, "reward_change_std": 0.12359009683132172, "reward_std": 0.44405573792755604, "rewards/cosine_scaled_reward": -0.16994237527251244, "rewards/format_reward": 0.10416666977107525, "step": 204 }, { "advantage_max": 1.282199677079916, "advantage_mean": -1.024454904241523e-08, "advantage_min": -0.5806190297007561, "advantage_std": 0.7030315864831209, "completion_length": 3156.416717529297, "epoch": 0.0881009938221864, "grad_norm": 0.19791720807552338, "kl": 0.022251129150390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.612622032536507e-07, "loss": 0.0518, "reward": -0.006253892555832863, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.006253892555832863, "reward_after_std": 0.7030315846204758, "reward_before_mean": 0.25111038400791585, "reward_before_std": 0.6790586207062006, "reward_change_max": 0.0002579689025878906, "reward_change_mean": -0.25736428890377283, "reward_change_min": -0.49052800983190536, "reward_change_std": 0.18845067685469985, "reward_std": 0.7030315846204758, "rewards/cosine_scaled_reward": -0.07236147893127054, "rewards/format_reward": 0.39583333395421505, "step": 205 }, { "advantage_max": 0.8176142610609531, "advantage_mean": 2.2351743178550265e-08, "advantage_min": -0.6091083064675331, "advantage_std": 0.517665546387434, "completion_length": 3089.0208587646484, "epoch": 0.0885307547676605, "grad_norm": 0.10419939458370209, "kl": 0.011758804321289062, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.584832158039378e-07, "loss": 0.0038, "reward": 0.00862065702676773, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.00862065702676773, "reward_after_std": 0.5176655445247889, "reward_before_mean": 0.3063107542693615, "reward_before_std": 0.5235197730362415, "reward_change_max": 0.00038120895624160767, "reward_change_mean": -0.29769006557762623, "reward_change_min": -0.5016247779130936, "reward_change_std": 0.20865567424334586, "reward_std": 0.5176655519753695, "rewards/cosine_scaled_reward": -0.055177973583340645, "rewards/format_reward": 0.416666679084301, "step": 206 }, { "advantage_max": 1.1454123258590698, "advantage_mean": -6.829699306099002e-09, "advantage_min": -0.8360222205519676, "advantage_std": 0.7314620912075043, "completion_length": 3340.1041870117188, "epoch": 0.08896051571313457, "grad_norm": 0.14790798723697662, "kl": 0.02350616455078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.556940671764124e-07, "loss": 0.0464, "reward": 0.03707157075405121, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.03707157075405121, "reward_after_std": 0.731462087482214, "reward_before_mean": 0.3192422413267195, "reward_before_std": 0.7934761084616184, "reward_change_max": 0.0006707608699798584, "reward_change_mean": -0.2821706747636199, "reward_change_min": -0.5780200883746147, "reward_change_std": 0.24251736793667078, "reward_std": 0.7314621135592461, "rewards/cosine_scaled_reward": 0.013787778094410896, "rewards/format_reward": 0.2916666753590107, "step": 207 }, { "advantage_max": 0.88877734541893, "advantage_mean": -1.3038516710750514e-08, "advantage_min": -0.6325425133109093, "advantage_std": 0.5520250871777534, "completion_length": 3101.562530517578, "epoch": 0.08939027665860864, "grad_norm": 0.08140086382627487, "kl": 0.0145111083984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.528948933102438e-07, "loss": 0.0084, "reward": 0.06840807944536209, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06840807944536209, "reward_after_std": 0.5520251039415598, "reward_before_mean": 0.38129839673638344, "reward_before_std": 0.5491707995533943, "reward_change_max": 0.0007384270429611206, "reward_change_mean": -0.31289032008498907, "reward_change_min": -0.5408381074666977, "reward_change_std": 0.22034268453717232, "reward_std": 0.5520251207053661, "rewards/cosine_scaled_reward": 0.023982543498277664, "rewards/format_reward": 0.3333333358168602, "step": 208 }, { "advantage_max": 0.8235974162817001, "advantage_mean": 7.450580874479584e-09, "advantage_min": -0.515595905482769, "advantage_std": 0.4932776354253292, "completion_length": 3383.187530517578, "epoch": 0.08982003760408273, "grad_norm": 0.10428653657436371, "kl": 0.02545928955078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.500858306332172e-07, "loss": 0.0123, "reward": -0.4063703082501888, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4063703082501888, "reward_after_std": 0.4932776354253292, "reward_before_mean": -0.25898505188524723, "reward_before_std": 0.5211737491190434, "reward_change_max": 0.0004497915506362915, "reward_change_mean": -0.1473852675408125, "reward_change_min": -0.34285284020006657, "reward_change_std": 0.13839057832956314, "reward_std": 0.4932776540517807, "rewards/cosine_scaled_reward": -0.2024091947823763, "rewards/format_reward": 0.1458333395421505, "step": 209 }, { "advantage_max": 0.9850319549441338, "advantage_mean": 3.725290242950763e-09, "advantage_min": -0.45449571311473846, "advantage_std": 0.5447403118014336, "completion_length": 3568.1458435058594, "epoch": 0.09024979854955681, "grad_norm": 0.10483227670192719, "kl": 0.01971435546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.472670160550848e-07, "loss": 0.0086, "reward": -0.43232007697224617, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.43232007697224617, "reward_after_std": 0.5447403155267239, "reward_before_mean": -0.30519334971904755, "reward_before_std": 0.5559474863111973, "reward_change_max": 5.335360765457153e-05, "reward_change_mean": -0.12712673656642437, "reward_change_min": -0.26148882135748863, "reward_change_std": 0.10676967911422253, "reward_std": 0.5447403155267239, "rewards/cosine_scaled_reward": -0.18384667672216892, "rewards/format_reward": 0.06250000186264515, "step": 210 }, { "advantage_max": 1.0020484663546085, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.5697542242705822, "advantage_std": 0.5751647986471653, "completion_length": 3183.562530517578, "epoch": 0.09067955949503088, "grad_norm": 0.12769721448421478, "kl": 0.016811370849609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.444385869608921e-07, "loss": 0.0374, "reward": -0.3363814535550773, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3363814535550773, "reward_after_std": 0.5751647986471653, "reward_before_mean": -0.1786195458844304, "reward_before_std": 0.5928840860724449, "reward_change_max": 0.0006612539291381836, "reward_change_mean": -0.1577619006857276, "reward_change_min": -0.319089163094759, "reward_change_std": 0.13607777375727892, "reward_std": 0.5751648060977459, "rewards/cosine_scaled_reward": -0.19347644201479852, "rewards/format_reward": 0.2083333395421505, "step": 211 }, { "advantage_max": 1.0563874766230583, "advantage_mean": 1.2107193636534674e-08, "advantage_min": -0.7405802682042122, "advantage_std": 0.6750030741095543, "completion_length": 3159.5416717529297, "epoch": 0.09110932044050497, "grad_norm": 0.13277776539325714, "kl": 0.022613525390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.416006812042827e-07, "loss": 0.0128, "reward": -0.014016177505254745, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.014016177505254745, "reward_after_std": 0.675003070384264, "reward_before_mean": 0.2551490105688572, "reward_before_std": 0.7230903543531895, "reward_change_max": 0.0005873590707778931, "reward_change_mean": -0.2691651936620474, "reward_change_min": -0.5248813033103943, "reward_change_std": 0.2242625905200839, "reward_std": 0.675003070384264, "rewards/cosine_scaled_reward": -0.0703421663492918, "rewards/format_reward": 0.3958333395421505, "step": 212 }, { "advantage_max": 0.9491157867014408, "advantage_mean": -1.6763807231257033e-08, "advantage_min": -0.6327035278081894, "advantage_std": 0.5847394056618214, "completion_length": 3337.6458740234375, "epoch": 0.09153908138597905, "grad_norm": 0.10074589401483536, "kl": 0.019466400146484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.387534371007797e-07, "loss": 0.0116, "reward": -0.011992666870355606, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.011992666870355606, "reward_after_std": 0.584739413112402, "reward_before_mean": 0.2683165445923805, "reward_before_std": 0.5948167145252228, "reward_change_max": 0.00023627281188964844, "reward_change_mean": -0.2803092375397682, "reward_change_min": -0.5357660707086325, "reward_change_std": 0.21163467038422823, "reward_std": 0.5847394280135632, "rewards/cosine_scaled_reward": -0.04292505793273449, "rewards/format_reward": 0.3541666716337204, "step": 213 }, { "advantage_max": 0.7135799862444401, "advantage_mean": 8.69234362266269e-09, "advantage_min": -0.42481984943151474, "advantage_std": 0.42982153594493866, "completion_length": 3514.4583435058594, "epoch": 0.09196884233145312, "grad_norm": 0.07945537567138672, "kl": 0.016887664794921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.358969934210438e-07, "loss": 0.0136, "reward": -0.4478939725086093, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4478939725086093, "reward_after_std": 0.4298215415328741, "reward_before_mean": -0.3073042184114456, "reward_before_std": 0.4552846737205982, "reward_change_max": 0.0012416914105415344, "reward_change_mean": -0.1405897568911314, "reward_change_min": -0.31337226554751396, "reward_change_std": 0.12826434336602688, "reward_std": 0.42982156574726105, "rewards/cosine_scaled_reward": -0.22656877432018518, "rewards/format_reward": 0.14583333395421505, "step": 214 }, { "advantage_max": 1.2070150300860405, "advantage_mean": 1.9868215850316062e-08, "advantage_min": -0.5258344039320946, "advantage_std": 0.6745498906821012, "completion_length": 3496.291717529297, "epoch": 0.09239860327692721, "grad_norm": 0.18869316577911377, "kl": 0.02115631103515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.330314893841101e-07, "loss": 0.0381, "reward": -0.2983117289841175, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2983117289841175, "reward_after_std": 0.674549849703908, "reward_before_mean": -0.14005974680185318, "reward_before_std": 0.6909940931946039, "reward_change_max": 0.0002567395567893982, "reward_change_mean": -0.15825198404490948, "reward_change_min": -0.42601517029106617, "reward_change_std": 0.15992903290316463, "reward_std": 0.674549849703908, "rewards/cosine_scaled_reward": -0.13252987526357174, "rewards/format_reward": 0.1250000037252903, "step": 215 }, { "advantage_max": 0.5845132246613503, "advantage_mean": 2.235174201281609e-08, "advantage_min": -0.34066785499453545, "advantage_std": 0.35198395885527134, "completion_length": 3287.8125, "epoch": 0.09282836422240129, "grad_norm": 0.06322920322418213, "kl": 0.0185546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.301570646506027e-07, "loss": 0.0033, "reward": -0.4990241825580597, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4990241825580597, "reward_after_std": 0.35198396258056164, "reward_before_mean": -0.36651815474033356, "reward_before_std": 0.3653986267745495, "reward_change_max": 0.0007480978965759277, "reward_change_mean": -0.1325060222297907, "reward_change_min": -0.2885711845010519, "reward_change_std": 0.11338116228580475, "reward_std": 0.35198397003114223, "rewards/cosine_scaled_reward": -0.2665924094617367, "rewards/format_reward": 0.1666666679084301, "step": 216 }, { "advantage_max": 1.7420502603054047, "advantage_mean": 4.967054101356894e-09, "advantage_min": -0.8590058088302612, "advantage_std": 1.0074434839189053, "completion_length": 3354.5833740234375, "epoch": 0.09325812516787536, "grad_norm": 0.26181402802467346, "kl": 0.0229339599609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.27273859315928e-07, "loss": 0.0851, "reward": 0.016483189072459936, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.016483189072459936, "reward_after_std": 1.0074434839189053, "reward_before_mean": 0.24929228238761425, "reward_before_std": 1.059232659637928, "reward_change_max": 0.0008864700794219971, "reward_change_mean": -0.23280909238383174, "reward_change_min": -0.5508107729256153, "reward_change_std": 0.22329208068549633, "reward_std": 1.0074435211718082, "rewards/cosine_scaled_reward": -0.0003538588061928749, "rewards/format_reward": 0.25000000558793545, "step": 217 }, { "advantage_max": 1.07870240136981, "advantage_mean": 1.9868215905827213e-08, "advantage_min": -0.57809928804636, "advantage_std": 0.6290264874696732, "completion_length": 3275.5208740234375, "epoch": 0.09368788611334945, "grad_norm": 0.10962985455989838, "kl": 0.0167236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.243820139034464e-07, "loss": 0.034, "reward": -0.09900899603962898, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09900899603962898, "reward_after_std": 0.6290264911949635, "reward_before_mean": 0.14103195304051042, "reward_before_std": 0.6300968825817108, "reward_change_max": 0.0004593506455421448, "reward_change_mean": -0.24004095140844584, "reward_change_min": -0.47393492236733437, "reward_change_std": 0.19036198034882545, "reward_std": 0.629026509821415, "rewards/cosine_scaled_reward": -0.04406733252108097, "rewards/format_reward": 0.2291666679084301, "step": 218 }, { "advantage_max": 1.3853733092546463, "advantage_mean": 6.208817071584605e-09, "advantage_min": -0.6258366405963898, "advantage_std": 0.7506828010082245, "completion_length": 3424.229217529297, "epoch": 0.09411764705882353, "grad_norm": 0.17209453880786896, "kl": 0.02288818359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.214816693576234e-07, "loss": 0.0051, "reward": -0.29816704941913486, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29816704941913486, "reward_after_std": 0.7506828047335148, "reward_before_mean": -0.15249420593318064, "reward_before_std": 0.763845793902874, "reward_change_max": 0.0008522719144821167, "reward_change_mean": -0.14567285007797182, "reward_change_min": -0.2772129587829113, "reward_change_std": 0.12387938005849719, "reward_std": 0.7506828345358372, "rewards/cosine_scaled_reward": -0.1908304337412119, "rewards/format_reward": 0.2291666753590107, "step": 219 }, { "advantage_max": 0.7651851959526539, "advantage_mean": 3.228584954939606e-08, "advantage_min": -0.4697493873536587, "advantage_std": 0.45775818079710007, "completion_length": 3344.9166870117188, "epoch": 0.0945474080042976, "grad_norm": 0.14247778058052063, "kl": 0.02466583251953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.185729670371604e-07, "loss": 0.0565, "reward": -0.4743209043517709, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4743209043517709, "reward_after_std": 0.45775817334651947, "reward_before_mean": -0.3484514355659485, "reward_before_std": 0.48876144737005234, "reward_change_max": 0.0011586323380470276, "reward_change_mean": -0.12586946273222566, "reward_change_min": -0.28271521255373955, "reward_change_std": 0.1265034442767501, "reward_std": 0.45775818079710007, "rewards/cosine_scaled_reward": -0.21589237824082375, "rewards/format_reward": 0.08333333395421505, "step": 220 }, { "advantage_max": 0.8930580765008926, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.48208484798669815, "advantage_std": 0.5019366480410099, "completion_length": 3488.229217529297, "epoch": 0.09497716894977169, "grad_norm": 0.1225610300898552, "kl": 0.02278900146484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.156560487081051e-07, "loss": 0.0331, "reward": -0.4843184631317854, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4843184631317854, "reward_after_std": 0.5019366592168808, "reward_before_mean": -0.37003418058156967, "reward_before_std": 0.5170651637017727, "reward_change_max": 0.0013438165187835693, "reward_change_mean": -0.11428429279476404, "reward_change_min": -0.24808008037507534, "reward_change_std": 0.10892942035570741, "reward_std": 0.5019366703927517, "rewards/cosine_scaled_reward": -0.2371004200540483, "rewards/format_reward": 0.1041666679084301, "step": 221 }, { "advantage_max": 1.0204047225415707, "advantage_mean": 1.8626449826975033e-09, "advantage_min": -0.577682126313448, "advantage_std": 0.6098344177007675, "completion_length": 3561.312530517578, "epoch": 0.09540692989524577, "grad_norm": 0.12180877476930618, "kl": 0.023075103759765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.127310565369415e-07, "loss": 0.0169, "reward": -0.3621921017765999, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3621921017765999, "reward_after_std": 0.6098343934863806, "reward_before_mean": -0.21308495476841927, "reward_before_std": 0.657078143209219, "reward_change_max": 0.00037401169538497925, "reward_change_mean": -0.14910716097801924, "reward_change_min": -0.4146813377737999, "reward_change_std": 0.1632744804956019, "reward_std": 0.6098344102501869, "rewards/cosine_scaled_reward": -0.14820914342999458, "rewards/format_reward": 0.0833333358168602, "step": 222 }, { "advantage_max": 1.6874186918139458, "advantage_mean": 1.0554989382516311e-08, "advantage_min": -0.927936140447855, "advantage_std": 1.0001515448093414, "completion_length": 3191.5625610351562, "epoch": 0.09583669084071984, "grad_norm": 0.26278793811798096, "kl": 0.019866943359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.097981330836616e-07, "loss": 0.0971, "reward": 0.06346365553326905, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06346365553326905, "reward_after_std": 1.0001515299081802, "reward_before_mean": 0.3169806282967329, "reward_before_std": 1.0602897331118584, "reward_change_max": 0.0004867836833000183, "reward_change_mean": -0.25351699627935886, "reward_change_min": -0.6025106068700552, "reward_change_std": 0.2533076498657465, "reward_std": 1.0001515448093414, "rewards/cosine_scaled_reward": -0.03942635050043464, "rewards/format_reward": 0.39583334140479565, "step": 223 }, { "advantage_max": 0.8657924458384514, "advantage_mean": -6.208818459363386e-10, "advantage_min": -0.6863936334848404, "advantage_std": 0.547605387866497, "completion_length": 3340.7291870117188, "epoch": 0.09626645178619393, "grad_norm": 0.12327700853347778, "kl": 0.02918243408203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.068574212948169e-07, "loss": 0.0236, "reward": -0.22030561207793653, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22030561207793653, "reward_after_std": 0.5476053915917873, "reward_before_mean": -0.009733063168823719, "reward_before_std": 0.5908951237797737, "reward_change_max": 0.00024409592151641846, "reward_change_mean": -0.21057254984043539, "reward_change_min": -0.39694950729608536, "reward_change_std": 0.17333252588286996, "reward_std": 0.5476054102182388, "rewards/cosine_scaled_reward": -0.14028320647776127, "rewards/format_reward": 0.27083334140479565, "step": 224 }, { "advantage_max": 0.6236376762390137, "advantage_mean": 2.0489097418696645e-08, "advantage_min": -0.45377515628933907, "advantage_std": 0.39049747586250305, "completion_length": 3512.3541870117188, "epoch": 0.09669621273166801, "grad_norm": 0.07287710905075073, "kl": 0.0273590087890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.039090644965509e-07, "loss": 0.0175, "reward": -0.4616866447031498, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4616866447031498, "reward_after_std": 0.39049748331308365, "reward_before_mean": -0.321016991045326, "reward_before_std": 0.41784200072288513, "reward_change_max": 0.0025215446949005127, "reward_change_mean": -0.1406696573831141, "reward_change_min": -0.3116382397711277, "reward_change_std": 0.12292253132909536, "reward_std": 0.39049748331308365, "rewards/cosine_scaled_reward": -0.2125918362289667, "rewards/format_reward": 0.1041666679084301, "step": 225 }, { "advantage_max": 1.1128062456846237, "advantage_mean": 1.862645238048799e-08, "advantage_min": -0.4835724011063576, "advantage_std": 0.611604705452919, "completion_length": 3219.625, "epoch": 0.09712597367714208, "grad_norm": 0.10916239768266678, "kl": 0.02201080322265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 7.009532063876148e-07, "loss": 0.0186, "reward": -0.36419799737632275, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.36419799737632275, "reward_after_std": 0.6116046942770481, "reward_before_mean": -0.22363119525834918, "reward_before_std": 0.6213466972112656, "reward_change_max": 0.0017142817378044128, "reward_change_mean": -0.14056679140776396, "reward_change_min": -0.3256027586758137, "reward_change_std": 0.13032553531229496, "reward_std": 0.6116047091782093, "rewards/cosine_scaled_reward": -0.20556560438126326, "rewards/format_reward": 0.18750000186264515, "step": 226 }, { "advantage_max": 0.9768772162497044, "advantage_mean": 8.07146216530441e-09, "advantage_min": -0.39358216896653175, "advantage_std": 0.516303513199091, "completion_length": 3155.437530517578, "epoch": 0.09755573462261617, "grad_norm": 0.10977691411972046, "kl": 0.030792236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.979899910323624e-07, "loss": 0.0108, "reward": -0.19619187153875828, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19619187153875828, "reward_after_std": 0.5163035094738007, "reward_before_mean": 0.015162771567702293, "reward_before_std": 0.4752299636602402, "reward_change_max": 0.001144401729106903, "reward_change_mean": -0.2113546619657427, "reward_change_min": -0.3417402245104313, "reward_change_std": 0.13465667539276183, "reward_std": 0.5163035094738007, "rewards/cosine_scaled_reward": -0.12783528864383698, "rewards/format_reward": 0.27083333767950535, "step": 227 }, { "advantage_max": 1.1608103774487972, "advantage_mean": 4.967053990334591e-09, "advantage_min": -0.5979930870234966, "advantage_std": 0.6399441361427307, "completion_length": 2881.416732788086, "epoch": 0.09798549556809025, "grad_norm": 0.11533227562904358, "kl": 0.0301361083984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.950195628537299e-07, "loss": 0.0484, "reward": 0.21317938016727567, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21317938016727567, "reward_after_std": 0.6399441324174404, "reward_before_mean": 0.558028407394886, "reward_before_std": 0.5800115168094635, "reward_change_max": 0.0, "reward_change_mean": -0.3448490248993039, "reward_change_min": -0.5557027906179428, "reward_change_std": 0.21160451881587505, "reward_std": 0.6399441547691822, "rewards/cosine_scaled_reward": -0.0022357935085892677, "rewards/format_reward": 0.562500013038516, "step": 228 }, { "advantage_max": 0.835795097053051, "advantage_mean": 7.450580763457282e-09, "advantage_min": -0.4440697208046913, "advantage_std": 0.4681421685963869, "completion_length": 3472.7083435058594, "epoch": 0.09841525651356434, "grad_norm": 0.08272621035575867, "kl": 0.0215911865234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.920420666261961e-07, "loss": 0.0218, "reward": -0.4052432104945183, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4052432104945183, "reward_after_std": 0.4681421648710966, "reward_before_mean": -0.2585753204766661, "reward_before_std": 0.46608917228877544, "reward_change_max": 0.00020221620798110962, "reward_change_mean": -0.14666790794581175, "reward_change_min": -0.3080200608819723, "reward_change_std": 0.11951669957488775, "reward_std": 0.46814218163490295, "rewards/cosine_scaled_reward": -0.21262099221348763, "rewards/format_reward": 0.16666666977107525, "step": 229 }, { "advantage_max": 0.8816927671432495, "advantage_mean": 7.761021741936602e-09, "advantage_min": -0.4877760037779808, "advantage_std": 0.519805122166872, "completion_length": 3047.020866394043, "epoch": 0.09884501745903841, "grad_norm": 0.11017224192619324, "kl": 0.035308837890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.890576474687263e-07, "loss": 0.0263, "reward": -0.38295852253213525, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.38295852253213525, "reward_after_std": 0.519805122166872, "reward_before_mean": -0.230764617677778, "reward_before_std": 0.5416655987501144, "reward_change_max": 0.0005422681570053101, "reward_change_mean": -0.152193920686841, "reward_change_min": -0.33529213443398476, "reward_change_std": 0.14187764190137386, "reward_std": 0.5198051333427429, "rewards/cosine_scaled_reward": -0.2924656420946121, "rewards/format_reward": 0.35416666977107525, "step": 230 }, { "advantage_max": 1.0319055020809174, "advantage_mean": 2.1109978542988017e-08, "advantage_min": -0.5115607306361198, "advantage_std": 0.5795635022222996, "completion_length": 3321.9583740234375, "epoch": 0.09927477840451249, "grad_norm": 0.14820247888565063, "kl": 0.04012298583984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.860664508377001e-07, "loss": 0.0355, "reward": -0.2541456453036517, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2541456453036517, "reward_after_std": 0.5795634984970093, "reward_before_mean": -0.06812699791043997, "reward_before_std": 0.5791305676102638, "reward_change_max": 0.0, "reward_change_mean": -0.18601864110678434, "reward_change_min": -0.3768342472612858, "reward_change_std": 0.14215826522558928, "reward_std": 0.5795635133981705, "rewards/cosine_scaled_reward": -0.14864683119230904, "rewards/format_reward": 0.22916667349636555, "step": 231 }, { "advantage_max": 0.8226564973592758, "advantage_mean": 8.692344455329959e-09, "advantage_min": -0.42317265644669533, "advantage_std": 0.4795389547944069, "completion_length": 3468.062530517578, "epoch": 0.09970453934998658, "grad_norm": 0.0985027328133583, "kl": 0.02783203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.83068622519821e-07, "loss": 0.0004, "reward": -0.39773488929495215, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.39773488929495215, "reward_after_std": 0.479538943618536, "reward_before_mean": -0.24681192450225353, "reward_before_std": 0.4914475455880165, "reward_change_max": 0.0007119402289390564, "reward_change_mean": -0.1509229768998921, "reward_change_min": -0.35299584455788136, "reward_change_std": 0.13467535329982638, "reward_std": 0.47953896410763264, "rewards/cosine_scaled_reward": -0.22757263109087944, "rewards/format_reward": 0.2083333358168602, "step": 232 }, { "advantage_max": 0.5870580486953259, "advantage_mean": 2.359350581571107e-08, "advantage_min": -0.3766268938779831, "advantage_std": 0.3662502057850361, "completion_length": 3567.6666870117188, "epoch": 0.10013430029546065, "grad_norm": 0.07991530001163483, "kl": 0.02536773681640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.800643086250121e-07, "loss": 0.0026, "reward": -0.5786222293972969, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5786222293972969, "reward_after_std": 0.3662502057850361, "reward_before_mean": -0.4765310361981392, "reward_before_std": 0.39891451969742775, "reward_change_max": 0.0009004846215248108, "reward_change_mean": -0.10209119319915771, "reward_change_min": -0.25280255638062954, "reward_change_std": 0.10519091784954071, "reward_std": 0.3662502132356167, "rewards/cosine_scaled_reward": -0.25909885205328465, "rewards/format_reward": 0.0416666679084301, "step": 233 }, { "advantage_max": 1.0256999842822552, "advantage_mean": 1.1796753351944744e-08, "advantage_min": -0.604721438139677, "advantage_std": 0.5928989462554455, "completion_length": 3365.291717529297, "epoch": 0.10056406124093473, "grad_norm": 0.11636658757925034, "kl": 0.0220184326171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.770536555792944e-07, "loss": 0.031, "reward": -0.14309864467941225, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14309864467941225, "reward_after_std": 0.5928989499807358, "reward_before_mean": 0.08333871932700276, "reward_before_std": 0.5910174176096916, "reward_change_max": 0.001217089593410492, "reward_change_mean": -0.22643735352903605, "reward_change_min": -0.3947868049144745, "reward_change_std": 0.16098644211888313, "reward_std": 0.592898964881897, "rewards/cosine_scaled_reward": -0.09374730847775936, "rewards/format_reward": 0.27083334513008595, "step": 234 }, { "advantage_max": 0.899391446262598, "advantage_mean": 1.459072043741294e-08, "advantage_min": -0.4354909062385559, "advantage_std": 0.493799414485693, "completion_length": 3374.8333740234375, "epoch": 0.10099382218640882, "grad_norm": 0.11165796965360641, "kl": 0.027923583984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.740368101176495e-07, "loss": 0.031, "reward": -0.4470532997511327, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4470532997511327, "reward_after_std": 0.4937994182109833, "reward_before_mean": -0.3197474591434002, "reward_before_std": 0.4953095503151417, "reward_change_max": 0.002029925584793091, "reward_change_mean": -0.12730582559015602, "reward_change_min": -0.25931455194950104, "reward_change_std": 0.10504573932848871, "reward_std": 0.4937994219362736, "rewards/cosine_scaled_reward": -0.3057070691138506, "rewards/format_reward": 0.29166667349636555, "step": 235 }, { "advantage_max": 1.6976716071367264, "advantage_mean": 8.692344621863413e-09, "advantage_min": -0.9229797199368477, "advantage_std": 0.9732730016112328, "completion_length": 3314.479248046875, "epoch": 0.10142358313188289, "grad_norm": 0.2387702465057373, "kl": 0.02822113037109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.710139192768694e-07, "loss": 0.0603, "reward": 0.054537381045520306, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.054537381045520306, "reward_after_std": 0.9732729867100716, "reward_before_mean": 0.3042441215366125, "reward_before_std": 1.011559996753931, "reward_change_max": 0.0019267499446868896, "reward_change_mean": -0.24970672093331814, "reward_change_min": -0.5888886488974094, "reward_change_std": 0.23488033190369606, "reward_std": 0.973273016512394, "rewards/cosine_scaled_reward": 0.006288717035204172, "rewards/format_reward": 0.29166667349636555, "step": 236 }, { "advantage_max": 1.2711116299033165, "advantage_mean": -1.8626450382086546e-09, "advantage_min": -0.7570880353450775, "advantage_std": 0.7522380761802197, "completion_length": 3257.500030517578, "epoch": 0.10185334407735697, "grad_norm": 0.20740225911140442, "kl": 0.02561187744140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.679851303883891e-07, "loss": 0.0635, "reward": -0.08662595227360725, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08662595227360725, "reward_after_std": 0.7522380985319614, "reward_before_mean": 0.14280837279511616, "reward_before_std": 0.789225971326232, "reward_change_max": 0.00035265088081359863, "reward_change_mean": -0.22943431604653597, "reward_change_min": -0.47185077518224716, "reward_change_std": 0.20079512195661664, "reward_std": 0.7522381581366062, "rewards/cosine_scaled_reward": -0.06401250294129568, "rewards/format_reward": 0.27083334140479565, "step": 237 }, { "advantage_max": 1.3122153878211975, "advantage_mean": 2.1109978542988017e-08, "advantage_min": -0.7105024233460426, "advantage_std": 0.7777662724256516, "completion_length": 3017.979232788086, "epoch": 0.10228310502283106, "grad_norm": 0.15131577849388123, "kl": 0.02734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.649505910711058e-07, "loss": 0.0327, "reward": -0.13657146692276, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13657146692276, "reward_after_std": 0.7777662575244904, "reward_before_mean": 0.07333112019114196, "reward_before_std": 0.8176084533333778, "reward_change_max": 0.0010031163692474365, "reward_change_mean": -0.20990259014070034, "reward_change_min": -0.43529413267970085, "reward_change_std": 0.1900767358019948, "reward_std": 0.777766264975071, "rewards/cosine_scaled_reward": -0.13000109884887934, "rewards/format_reward": 0.33333334140479565, "step": 238 }, { "advantage_max": 0.8061726428568363, "advantage_mean": 2.1109978765032622e-08, "advantage_min": -0.49055274575948715, "advantage_std": 0.4774854853749275, "completion_length": 3402.125030517578, "epoch": 0.10271286596830513, "grad_norm": 0.08651488274335861, "kl": 0.039459228515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.619104492241847e-07, "loss": 0.0218, "reward": -0.4060012139379978, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4060012139379978, "reward_after_std": 0.4774854965507984, "reward_before_mean": -0.25723179802298546, "reward_before_std": 0.501599095761776, "reward_change_max": 0.002423785626888275, "reward_change_mean": -0.14876941754482687, "reward_change_min": -0.3185141682624817, "reward_change_std": 0.13387555023655295, "reward_std": 0.477485504001379, "rewards/cosine_scaled_reward": -0.21194923110306263, "rewards/format_reward": 0.16666667349636555, "step": 239 }, { "advantage_max": 1.0455578304827213, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.6111446768045425, "advantage_std": 0.6205459181219339, "completion_length": 3337.250030517578, "epoch": 0.1031426269137792, "grad_norm": 0.15056943893432617, "kl": 0.045257568359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.588648530198504e-07, "loss": 0.0273, "reward": -0.21065146010369062, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21065146010369062, "reward_after_std": 0.6205459106713533, "reward_before_mean": -0.009093445958569646, "reward_before_std": 0.6459440533071756, "reward_change_max": 0.0001850724220275879, "reward_change_mean": -0.20155798830091953, "reward_change_min": -0.4139671139419079, "reward_change_std": 0.17134159756824374, "reward_std": 0.6205459292978048, "rewards/cosine_scaled_reward": -0.1399633940309286, "rewards/format_reward": 0.27083334140479565, "step": 240 }, { "advantage_max": 1.4675465263426304, "advantage_mean": 1.8626451714354175e-08, "advantage_min": -0.7891621738672256, "advantage_std": 0.8612227067351341, "completion_length": 3372.9584045410156, "epoch": 0.1035723878592533, "grad_norm": 0.21431508660316467, "kl": 0.029327392578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.558139508961654e-07, "loss": 0.0765, "reward": -0.1117607094347477, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1117607094347477, "reward_after_std": 0.8612226992845535, "reward_before_mean": 0.0947289140895009, "reward_before_std": 0.9119522273540497, "reward_change_max": 0.0014066100120544434, "reward_change_mean": -0.20648960955440998, "reward_change_min": -0.4569370932877064, "reward_change_std": 0.20738830603659153, "reward_std": 0.8612227030098438, "rewards/cosine_scaled_reward": -0.06721888110041618, "rewards/format_reward": 0.2291666716337204, "step": 241 }, { "advantage_max": 1.2041833065450191, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.6850482374429703, "advantage_std": 0.7250676937401295, "completion_length": 3472.5625610351562, "epoch": 0.10400214880472737, "grad_norm": 0.17057225108146667, "kl": 0.037994384765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.527578915497951e-07, "loss": 0.0449, "reward": -0.2780924765393138, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2780924765393138, "reward_after_std": 0.7250676937401295, "reward_before_mean": -0.11180292814970016, "reward_before_std": 0.7867466770112514, "reward_change_max": 0.0028170421719551086, "reward_change_mean": -0.16628957632929087, "reward_change_min": -0.4612081293016672, "reward_change_std": 0.19475694792345166, "reward_std": 0.7250677458941936, "rewards/cosine_scaled_reward": -0.14965146468603052, "rewards/format_reward": 0.18750000186264515, "step": 242 }, { "advantage_max": 0.6809981465339661, "advantage_mean": 7.450580874479584e-09, "advantage_min": -0.3686401881277561, "advantage_std": 0.39564523473381996, "completion_length": 3247.2083587646484, "epoch": 0.10443190975020145, "grad_norm": 0.07800758630037308, "kl": 0.03490447998046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.496968239287603e-07, "loss": 0.0192, "reward": -0.4923173300921917, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4923173300921917, "reward_after_std": 0.39564523473381996, "reward_before_mean": -0.3660241886973381, "reward_before_std": 0.4041763097047806, "reward_change_max": 0.0009647160768508911, "reward_change_mean": -0.12629316630773246, "reward_change_min": -0.2655727006494999, "reward_change_std": 0.10487937112338841, "reward_std": 0.39564524218440056, "rewards/cosine_scaled_reward": -0.2455120924860239, "rewards/format_reward": 0.12500000558793545, "step": 243 }, { "advantage_max": 1.1055401898920536, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.6362109929323196, "advantage_std": 0.6543268524110317, "completion_length": 3220.5208587646484, "epoch": 0.10486167069567554, "grad_norm": 0.13554978370666504, "kl": 0.0389556884765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.466308972251785e-07, "loss": 0.0212, "reward": -0.04168367385864258, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04168367385864258, "reward_after_std": 0.6543268635869026, "reward_before_mean": 0.2169942446053028, "reward_before_std": 0.6621673814952374, "reward_change_max": 0.000314958393573761, "reward_change_mean": -0.25867789424955845, "reward_change_min": -0.47669431008398533, "reward_change_std": 0.20274804718792439, "reward_std": 0.6543269008398056, "rewards/cosine_scaled_reward": -0.02691955305635929, "rewards/format_reward": 0.27083333767950535, "step": 244 }, { "advantage_max": 1.5510660260915756, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.9958107471466064, "advantage_std": 0.9386055581271648, "completion_length": 3165.291717529297, "epoch": 0.10529143164114961, "grad_norm": 0.23057624697685242, "kl": 0.038818359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.435602608679916e-07, "loss": 0.0302, "reward": 0.15572326444089413, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15572326444089413, "reward_after_std": 0.9386055581271648, "reward_before_mean": 0.4526360686868429, "reward_before_std": 1.0019046440720558, "reward_change_max": 0.0005668401718139648, "reward_change_mean": -0.2969127707183361, "reward_change_min": -0.6529092453420162, "reward_change_std": 0.26417900901287794, "reward_std": 0.9386055842041969, "rewards/cosine_scaled_reward": -0.023681987076997757, "rewards/format_reward": 0.5000000167638063, "step": 245 }, { "advantage_max": 1.335495226085186, "advantage_mean": 9.934107647602275e-09, "advantage_min": -0.5846002772450447, "advantage_std": 0.7299374155700207, "completion_length": 3469.2708740234375, "epoch": 0.10572119258662369, "grad_norm": 0.1586172729730606, "kl": 0.040679931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.404850645156841e-07, "loss": 0.0224, "reward": -0.23311333637684584, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23311333637684584, "reward_after_std": 0.7299373969435692, "reward_before_mean": -0.059911044634645805, "reward_before_std": 0.7383395880460739, "reward_change_max": 0.0015191882848739624, "reward_change_mean": -0.1732023013755679, "reward_change_min": -0.37870606407523155, "reward_change_std": 0.15086271613836288, "reward_std": 0.729937419295311, "rewards/cosine_scaled_reward": -0.13412219565361738, "rewards/format_reward": 0.2083333358168602, "step": 246 }, { "advantage_max": 1.2047169506549835, "advantage_mean": -1.8626449826975033e-09, "advantage_min": -0.7674169056117535, "advantage_std": 0.7345201186835766, "completion_length": 3376.6041870117188, "epoch": 0.10615095353209777, "grad_norm": 0.16350483894348145, "kl": 0.0385284423828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.374054580489873e-07, "loss": 0.058, "reward": -0.17826672829687595, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17826672829687595, "reward_after_std": 0.7345201335847378, "reward_before_mean": 0.023289592936635017, "reward_before_std": 0.7913740389049053, "reward_change_max": 0.0012575387954711914, "reward_change_mean": -0.2015563417226076, "reward_change_min": -0.433609914034605, "reward_change_std": 0.1951103936880827, "reward_std": 0.7345201373100281, "rewards/cosine_scaled_reward": -0.09252186771482229, "rewards/format_reward": 0.2083333395421505, "step": 247 }, { "advantage_max": 0.7294521480798721, "advantage_mean": -1.8626452047421083e-09, "advantage_min": -0.5909470319747925, "advantage_std": 0.4933103546500206, "completion_length": 3413.8958740234375, "epoch": 0.10658071447757185, "grad_norm": 0.12745484709739685, "kl": 0.044464111328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.343215915635761e-07, "loss": 0.0384, "reward": -0.28973592445254326, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.28973592445254326, "reward_after_std": 0.49331035278737545, "reward_before_mean": -0.09432091331109405, "reward_before_std": 0.5499195419251919, "reward_change_max": 0.0, "reward_change_mean": -0.1954150227829814, "reward_change_min": -0.39218697138130665, "reward_change_std": 0.1715530240908265, "reward_std": 0.4933103621006012, "rewards/cosine_scaled_reward": -0.14091046899557114, "rewards/format_reward": 0.1875000037252903, "step": 248 }, { "advantage_max": 1.651109267026186, "advantage_mean": -8.07146260939362e-09, "advantage_min": -0.847358874976635, "advantage_std": 0.9446089640259743, "completion_length": 3102.416732788086, "epoch": 0.10701047542304593, "grad_norm": 0.27707958221435547, "kl": 0.045654296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.31233615362752e-07, "loss": 0.0771, "reward": 0.011346116662025452, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.011346116662025452, "reward_after_std": 0.9446089528501034, "reward_before_mean": 0.24785595387220383, "reward_before_std": 0.9800823628902435, "reward_change_max": 0.0016763284802436829, "reward_change_mean": -0.2365098618902266, "reward_change_min": -0.4549998752772808, "reward_change_std": 0.19937642803415656, "reward_std": 0.9446089714765549, "rewards/cosine_scaled_reward": -0.04273868910968304, "rewards/format_reward": 0.3333333395421505, "step": 249 }, { "advantage_max": 0.974465224891901, "advantage_mean": -1.5522043372850902e-08, "advantage_min": -0.5081846415996552, "advantage_std": 0.5717221722006798, "completion_length": 3228.0208740234375, "epoch": 0.10744023636852001, "grad_norm": 0.1444813311100006, "kl": 0.0516510009765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.281416799501187e-07, "loss": 0.0191, "reward": -0.21887372434139252, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21887372434139252, "reward_after_std": 0.5717221908271313, "reward_before_mean": -0.015060501173138618, "reward_before_std": 0.5806613564491272, "reward_change_max": 0.0012713894248008728, "reward_change_mean": -0.20381323155015707, "reward_change_min": -0.4591298345476389, "reward_change_std": 0.16842297604307532, "reward_std": 0.5717222224920988, "rewards/cosine_scaled_reward": -0.13253026641905308, "rewards/format_reward": 0.25000000186264515, "step": 250 }, { "advantage_max": 1.5213196873664856, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.6204018890857697, "advantage_std": 0.7923304624855518, "completion_length": 3105.2083740234375, "epoch": 0.10786999731399409, "grad_norm": 0.267963171005249, "kl": 0.046844482421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.25045936022246e-07, "loss": 0.0755, "reward": 0.011436259374022484, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.011436259374022484, "reward_after_std": 0.7923304699361324, "reward_before_mean": 0.2571127056144178, "reward_before_std": 0.734659343957901, "reward_change_max": 0.0010038390755653381, "reward_change_mean": -0.24567646719515324, "reward_change_min": -0.40520622581243515, "reward_change_std": 0.16547434963285923, "reward_std": 0.7923305034637451, "rewards/cosine_scaled_reward": -0.02769365394487977, "rewards/format_reward": 0.31250000558793545, "step": 251 }, { "advantage_max": 1.185879610478878, "advantage_mean": 4.346172310931706e-09, "advantage_min": -0.5297804288566113, "advantage_std": 0.6570289172232151, "completion_length": 3524.4375, "epoch": 0.10829975825946817, "grad_norm": 0.1562061607837677, "kl": 0.06494140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.219465344613258e-07, "loss": 0.0248, "reward": -0.39559151884168386, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.39559151884168386, "reward_after_std": 0.6570289209485054, "reward_before_mean": -0.2694550082087517, "reward_before_std": 0.6793965063989162, "reward_change_max": 0.0008504092693328857, "reward_change_mean": -0.12613652553409338, "reward_change_min": -0.3504908252507448, "reward_change_std": 0.14007866382598877, "reward_std": 0.657028928399086, "rewards/cosine_scaled_reward": -0.1972275055013597, "rewards/format_reward": 0.12500000186264515, "step": 252 }, { "advantage_max": 1.3102785162627697, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.7418931350111961, "advantage_std": 0.7778161764144897, "completion_length": 2987.562545776367, "epoch": 0.10872951920494225, "grad_norm": 0.17246370017528534, "kl": 0.0495758056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.188436263278172e-07, "loss": 0.0481, "reward": -0.08836724422872066, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08836724422872066, "reward_after_std": 0.7778161764144897, "reward_before_mean": 0.13819970563054085, "reward_before_std": 0.8179987631738186, "reward_change_max": 0.0004450678825378418, "reward_change_mean": -0.2265669647604227, "reward_change_min": -0.5621887221932411, "reward_change_std": 0.21555717568844557, "reward_std": 0.77781618013978, "rewards/cosine_scaled_reward": -0.11840014421613887, "rewards/format_reward": 0.3750000074505806, "step": 253 }, { "advantage_max": 0.7734759636223316, "advantage_mean": 7.450580874479584e-09, "advantage_min": -0.45850635319948196, "advantage_std": 0.447357963770628, "completion_length": 3396.9166870117188, "epoch": 0.10915928015041633, "grad_norm": 0.1155315414071083, "kl": 0.05487060546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.157373628530852e-07, "loss": -0.0018, "reward": -0.5023153249640018, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5023153249640018, "reward_after_std": 0.4473579600453377, "reward_before_mean": -0.38481870852410793, "reward_before_std": 0.4657217524945736, "reward_change_max": 0.0020345523953437805, "reward_change_mean": -0.11749662645161152, "reward_change_min": -0.2538427785038948, "reward_change_std": 0.10926568135619164, "reward_std": 0.4473579674959183, "rewards/cosine_scaled_reward": -0.23407601937651634, "rewards/format_reward": 0.08333333395421505, "step": 254 }, { "advantage_max": 1.2042768821120262, "advantage_mean": -4.96705393482344e-09, "advantage_min": -0.6446366533637047, "advantage_std": 0.6961430795490742, "completion_length": 3408.1458740234375, "epoch": 0.1095890410958904, "grad_norm": 0.2277013510465622, "kl": 0.0638427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.126278954320294e-07, "loss": 0.0518, "reward": -0.14978221245110035, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14978221245110035, "reward_after_std": 0.6961430758237839, "reward_before_mean": 0.063008246361278, "reward_before_std": 0.7142836451530457, "reward_change_max": 0.00011585652828216553, "reward_change_mean": -0.21279048640280962, "reward_change_min": -0.5113924369215965, "reward_change_std": 0.18861977569758892, "reward_std": 0.6961430907249451, "rewards/cosine_scaled_reward": -0.10391253884881735, "rewards/format_reward": 0.27083333767950535, "step": 255 }, { "advantage_max": 1.280396819114685, "advantage_mean": 3.72529057601767e-09, "advantage_min": -0.5476054660975933, "advantage_std": 0.677925031632185, "completion_length": 3237.1041870117188, "epoch": 0.1100188020413645, "grad_norm": 0.15157827734947205, "kl": 0.065765380859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.095153756157051e-07, "loss": 0.023, "reward": -0.348019156139344, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.348019156139344, "reward_after_std": 0.6779250353574753, "reward_before_mean": -0.21312856022268534, "reward_before_std": 0.6692674476653337, "reward_change_max": 0.001871459186077118, "reward_change_mean": -0.13489059172570705, "reward_change_min": -0.27364651672542095, "reward_change_std": 0.11408885754644871, "reward_std": 0.6779250651597977, "rewards/cosine_scaled_reward": -0.18989761848933995, "rewards/format_reward": 0.16666667349636555, "step": 256 }, { "advantage_max": 0.7276310995221138, "advantage_mean": 1.8626451603331873e-08, "advantage_min": -0.3663204461336136, "advantage_std": 0.41161247715353966, "completion_length": 3584.0, "epoch": 0.11044856298683857, "grad_norm": 0.1254163682460785, "kl": 0.07086181640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.06399955103937e-07, "loss": 0.0028, "reward": -0.5596191585063934, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5596191585063934, "reward_after_std": 0.41161247342824936, "reward_before_mean": -0.4610707429237664, "reward_before_std": 0.42291854321956635, "reward_change_max": 0.0004875063896179199, "reward_change_mean": -0.09854840021580458, "reward_change_min": -0.22953752614557743, "reward_change_std": 0.0925815966911614, "reward_std": 0.41161247715353966, "rewards/cosine_scaled_reward": -0.24095204006880522, "rewards/format_reward": 0.02083333395421505, "step": 257 }, { "advantage_max": 1.5063483640551567, "advantage_mean": -2.4835265510780857e-09, "advantage_min": -0.8705936968326569, "advantage_std": 0.9076240211725235, "completion_length": 3400.1875610351562, "epoch": 0.11087832393231264, "grad_norm": 0.2888564467430115, "kl": 0.08056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.032817857379256e-07, "loss": 0.0363, "reward": -0.007699865847826004, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.007699865847826004, "reward_after_std": 0.9076240435242653, "reward_before_mean": 0.234684057533741, "reward_before_std": 0.9710745289921761, "reward_change_max": 0.0007583200931549072, "reward_change_mean": -0.2423839345574379, "reward_change_min": -0.5997596010565758, "reward_change_std": 0.24962764140218496, "reward_std": 0.907624077051878, "rewards/cosine_scaled_reward": -0.04932463448494673, "rewards/format_reward": 0.33333333767950535, "step": 258 }, { "advantage_max": 0.8615490980446339, "advantage_mean": 9.313226190243995e-09, "advantage_min": -0.5725163109600544, "advantage_std": 0.5286407377570868, "completion_length": 3370.0833740234375, "epoch": 0.11130808487778673, "grad_norm": 0.15495982766151428, "kl": 0.080657958984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 6.001610194928464e-07, "loss": 0.0319, "reward": -0.23666931316256523, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23666931316256523, "reward_after_std": 0.5286407358944416, "reward_before_mean": -0.030563104897737503, "reward_before_std": 0.557973925024271, "reward_change_max": 0.0010328665375709534, "reward_change_mean": -0.20610620267689228, "reward_change_min": -0.443426676094532, "reward_change_std": 0.1736170956864953, "reward_std": 0.528640741482377, "rewards/cosine_scaled_reward": -0.1298648876399966, "rewards/format_reward": 0.2291666753590107, "step": 259 }, { "advantage_max": 0.6324207037687302, "advantage_mean": 2.2817403577679585e-08, "advantage_min": -0.38971780985593796, "advantage_std": 0.375492338091135, "completion_length": 3252.062515258789, "epoch": 0.11173784582326081, "grad_norm": 0.1424371600151062, "kl": 0.0789794921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.97037808470444e-07, "loss": 0.0058, "reward": -0.44783515235758387, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.44783515235758387, "reward_after_std": 0.3754923418164253, "reward_before_mean": -0.30112627148628235, "reward_before_std": 0.37953848764300346, "reward_change_max": 0.0006958469748497009, "reward_change_mean": -0.14670888055115938, "reward_change_min": -0.28926150873303413, "reward_change_std": 0.11772471945732832, "reward_std": 0.3754923604428768, "rewards/cosine_scaled_reward": -0.254729812964797, "rewards/format_reward": 0.2083333358168602, "step": 260 }, { "advantage_max": 1.1513529978692532, "advantage_mean": 1.986821579480491e-08, "advantage_min": -0.5313710384070873, "advantage_std": 0.6236794646829367, "completion_length": 3172.5416717529297, "epoch": 0.11216760676873488, "grad_norm": 0.24021708965301514, "kl": 0.07867431640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.939123048916173e-07, "loss": 0.0195, "reward": -0.4031802290119231, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4031802290119231, "reward_after_std": 0.6236794786527753, "reward_before_mean": -0.27939662616699934, "reward_before_std": 0.6319431299343705, "reward_change_max": 0.0009956881403923035, "reward_change_mean": -0.12378359865397215, "reward_change_min": -0.28081204928457737, "reward_change_std": 0.11511482088826597, "reward_std": 0.6236794898286462, "rewards/cosine_scaled_reward": -0.21261498203966767, "rewards/format_reward": 0.14583333767950535, "step": 261 }, { "advantage_max": 0.9349787309765816, "advantage_mean": 6.208817571184966e-09, "advantage_min": -0.49400635063648224, "advantage_std": 0.5301149450242519, "completion_length": 3384.7708740234375, "epoch": 0.11259736771420897, "grad_norm": 0.1940862089395523, "kl": 0.066680908203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.907846610890011e-07, "loss": 0.0166, "reward": -0.3641768489032984, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3641768489032984, "reward_after_std": 0.5301149562001228, "reward_before_mean": -0.2095111459493637, "reward_before_std": 0.5438346192240715, "reward_change_max": 0.0012353211641311646, "reward_change_mean": -0.15466570295393467, "reward_change_min": -0.33453699573874474, "reward_change_std": 0.13391570001840591, "reward_std": 0.5301149860024452, "rewards/cosine_scaled_reward": -0.2297555822879076, "rewards/format_reward": 0.25000000186264515, "step": 262 }, { "advantage_max": 0.8801203519105911, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.5485141798853874, "advantage_std": 0.5248471237719059, "completion_length": 3365.7083435058594, "epoch": 0.11302712865968305, "grad_norm": 0.15608467161655426, "kl": 0.08056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.87655029499542e-07, "loss": 0.0149, "reward": -0.34681250900030136, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.34681250900030136, "reward_after_std": 0.5248471237719059, "reward_before_mean": -0.1810881271958351, "reward_before_std": 0.5492185801267624, "reward_change_max": 0.0007206946611404419, "reward_change_mean": -0.1657243873924017, "reward_change_min": -0.34865728579461575, "reward_change_std": 0.14806428365409374, "reward_std": 0.5248471312224865, "rewards/cosine_scaled_reward": -0.184294069185853, "rewards/format_reward": 0.1875000074505806, "step": 263 }, { "advantage_max": 0.8341222666203976, "advantage_mean": 2.359350592673337e-08, "advantage_min": -0.4395537041127682, "advantage_std": 0.4735940657556057, "completion_length": 2822.8333740234375, "epoch": 0.11345688960515714, "grad_norm": 0.12201210111379623, "kl": 0.080322265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.845235626570683e-07, "loss": 0.0148, "reward": -0.26717107463628054, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.26717107463628054, "reward_after_std": 0.4735940806567669, "reward_before_mean": -0.07079494325444102, "reward_before_std": 0.4540337696671486, "reward_change_max": 0.0017086640000343323, "reward_change_mean": -0.1963761169463396, "reward_change_min": -0.35727898590266705, "reward_change_std": 0.1432845564559102, "reward_std": 0.4735940843820572, "rewards/cosine_scaled_reward": -0.13956413883715868, "rewards/format_reward": 0.2083333358168602, "step": 264 }, { "advantage_max": 1.0825435891747475, "advantage_mean": 9.313225857177088e-09, "advantage_min": -0.6393377184867859, "advantage_std": 0.6500541474670172, "completion_length": 3262.2916870117188, "epoch": 0.11388665055063121, "grad_norm": 0.21023572981357574, "kl": 0.090362548828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.813904131848564e-07, "loss": 0.0146, "reward": -0.31325029814615846, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.31325029814615846, "reward_after_std": 0.6500541474670172, "reward_before_mean": -0.15127715282142162, "reward_before_std": 0.6967314276844263, "reward_change_max": 0.0016005635261535645, "reward_change_mean": -0.16197312995791435, "reward_change_min": -0.35035456344485283, "reward_change_std": 0.16158457566052675, "reward_std": 0.6500541903078556, "rewards/cosine_scaled_reward": -0.1798052377998829, "rewards/format_reward": 0.2083333395421505, "step": 265 }, { "advantage_max": 0.8958307467401028, "advantage_mean": 5.587935336670569e-09, "advantage_min": -0.3991657458245754, "advantage_std": 0.490163529291749, "completion_length": 3259.2291870117188, "epoch": 0.11431641149610529, "grad_norm": 0.14117099344730377, "kl": 0.0877685546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.78255733788191e-07, "loss": -0.0011, "reward": -0.4136801166459918, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4136801166459918, "reward_after_std": 0.490163529291749, "reward_before_mean": -0.2746377162402496, "reward_before_std": 0.48176294565200806, "reward_change_max": 0.0012708678841590881, "reward_change_mean": -0.13904240354895592, "reward_change_min": -0.26934339478611946, "reward_change_std": 0.105195471085608, "reward_std": 0.4901635367423296, "rewards/cosine_scaled_reward": -0.19981886632740498, "rewards/format_reward": 0.12500000186264515, "step": 266 }, { "advantage_max": 1.008179321885109, "advantage_mean": 8.071462387349015e-09, "advantage_min": -0.6115937903523445, "advantage_std": 0.5878358818590641, "completion_length": 2887.291679382324, "epoch": 0.11474617244157938, "grad_norm": 0.1871396154165268, "kl": 0.084320068359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.751196772469237e-07, "loss": 0.0077, "reward": -0.08586978353559971, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08586978353559971, "reward_after_std": 0.5878358818590641, "reward_before_mean": 0.16259978711605072, "reward_before_std": 0.5845226123929024, "reward_change_max": 0.00018671154975891113, "reward_change_mean": -0.24846955947577953, "reward_change_min": -0.4579412881284952, "reward_change_std": 0.18582632020115852, "reward_std": 0.5878358893096447, "rewards/cosine_scaled_reward": -0.08536677993834019, "rewards/format_reward": 0.33333333395421505, "step": 267 }, { "advantage_max": 1.2193130403757095, "advantage_mean": 8.071462054282108e-09, "advantage_min": -0.5299973636865616, "advantage_std": 0.640822745859623, "completion_length": 3225.3959045410156, "epoch": 0.11517593338705345, "grad_norm": 0.18753713369369507, "kl": 0.09259033203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.71982396408026e-07, "loss": 0.0105, "reward": -0.13292261492460966, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13292261492460966, "reward_after_std": 0.6408227346837521, "reward_before_mean": 0.08263992867432535, "reward_before_std": 0.5878083910793066, "reward_change_max": 0.0008655786514282227, "reward_change_mean": -0.21556250751018524, "reward_change_min": -0.3521035984158516, "reward_change_std": 0.15044401213526726, "reward_std": 0.6408227682113647, "rewards/cosine_scaled_reward": -0.06284672487527132, "rewards/format_reward": 0.2083333358168602, "step": 268 }, { "advantage_max": 0.7910887412726879, "advantage_mean": -9.313226190243995e-09, "advantage_min": -0.4820495806634426, "advantage_std": 0.4623129963874817, "completion_length": 3065.0208740234375, "epoch": 0.11560569433252753, "grad_norm": 0.16396020352840424, "kl": 0.0833587646484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.688440441781398e-07, "loss": 0.0172, "reward": -0.1965200938284397, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1965200938284397, "reward_after_std": 0.462313000112772, "reward_before_mean": 0.02676655352115631, "reward_before_std": 0.45091476291418076, "reward_change_max": 0.0010702461004257202, "reward_change_mean": -0.2232866669073701, "reward_change_min": -0.39697558246552944, "reward_change_std": 0.16079184063710272, "reward_std": 0.4623130038380623, "rewards/cosine_scaled_reward": -0.08036671578884125, "rewards/format_reward": 0.18750000186264515, "step": 269 }, { "advantage_max": 0.5319292098283768, "advantage_mean": 9.313225746154785e-09, "advantage_min": -0.3846990317106247, "advantage_std": 0.3349338732659817, "completion_length": 3409.7291870117188, "epoch": 0.11603545527800162, "grad_norm": 0.15463104844093323, "kl": 0.104888916015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.657047735161255e-07, "loss": 0.0122, "reward": -0.508622182533145, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.508622182533145, "reward_after_std": 0.3349338732659817, "reward_before_mean": -0.3771735057234764, "reward_before_std": 0.3576428275555372, "reward_change_max": 0.0011251121759414673, "reward_change_mean": -0.13144868426024914, "reward_change_min": -0.2761802449822426, "reward_change_std": 0.10976577969267964, "reward_std": 0.33493388071656227, "rewards/cosine_scaled_reward": -0.230253417044878, "rewards/format_reward": 0.08333333395421505, "step": 270 }, { "advantage_max": 1.085092693567276, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.4794849678874016, "advantage_std": 0.5914131253957748, "completion_length": 3163.2084197998047, "epoch": 0.11646521622347569, "grad_norm": 0.2552463114261627, "kl": 0.0860595703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.625647374256061e-07, "loss": 0.0347, "reward": -0.34840062633156776, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.34840062633156776, "reward_after_std": 0.5914131328463554, "reward_before_mean": -0.2003322271630168, "reward_before_std": 0.5850070789456367, "reward_change_max": 0.0030118227005004883, "reward_change_mean": -0.14806839358061552, "reward_change_min": -0.32006631419062614, "reward_change_std": 0.1269065304659307, "reward_std": 0.5914131477475166, "rewards/cosine_scaled_reward": -0.20433278614655137, "rewards/format_reward": 0.2083333358168602, "step": 271 }, { "advantage_max": 1.2362486720085144, "advantage_mean": 8.071462831438225e-09, "advantage_min": -0.5775024555623531, "advantage_std": 0.693760834634304, "completion_length": 2675.7708740234375, "epoch": 0.11689497716894977, "grad_norm": 0.31447526812553406, "kl": 0.09442138671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.594240889475106e-07, "loss": 0.0592, "reward": 0.07979025691747665, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07979025691747665, "reward_after_std": 0.6937608495354652, "reward_before_mean": 0.3705106731504202, "reward_before_std": 0.6536267679184675, "reward_change_max": 0.0008590444922447205, "reward_change_mean": -0.2907204180955887, "reward_change_min": -0.5733746476471424, "reward_change_std": 0.2186190839856863, "reward_std": 0.6937608644366264, "rewards/cosine_scaled_reward": -0.00224466435611248, "rewards/format_reward": 0.3750000037252903, "step": 272 }, { "advantage_max": 0.8440443426370621, "advantage_mean": 8.692344455329959e-09, "advantage_min": -0.6072041057050228, "advantage_std": 0.5392494574189186, "completion_length": 3310.687530517578, "epoch": 0.11732473811442386, "grad_norm": 0.2169463336467743, "kl": 0.092437744140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.562829811526154e-07, "loss": 0.033, "reward": -0.21284152194857597, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21284152194857597, "reward_after_std": 0.5392494536936283, "reward_before_mean": 9.439606219530106e-05, "reward_before_std": 0.5773180536925793, "reward_change_max": 0.002281591296195984, "reward_change_mean": -0.2129358984529972, "reward_change_min": -0.42184010706841946, "reward_change_std": 0.17968396749347448, "reward_std": 0.5392494574189186, "rewards/cosine_scaled_reward": -0.18745281174778938, "rewards/format_reward": 0.37500000558793545, "step": 273 }, { "advantage_max": 1.507428616285324, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.9935997650027275, "advantage_std": 0.9137443788349628, "completion_length": 3092.9166870117188, "epoch": 0.11775449905989793, "grad_norm": 0.29289039969444275, "kl": 0.09503173828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.531415671340826e-07, "loss": 0.0252, "reward": 0.1291647171601653, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1291647171601653, "reward_after_std": 0.9137443788349628, "reward_before_mean": 0.41935762763023376, "reward_before_std": 0.9744222909212112, "reward_change_max": 0.0012353137135505676, "reward_change_mean": -0.2901929020881653, "reward_change_min": -0.6539932079613209, "reward_change_std": 0.268905040808022, "reward_std": 0.9137443974614143, "rewards/cosine_scaled_reward": -0.04032119736075401, "rewards/format_reward": 0.5000000111758709, "step": 274 }, { "advantage_max": 1.0380645357072353, "advantage_mean": 2.3593506037755674e-08, "advantage_min": -0.5266988985240459, "advantage_std": 0.5786697715520859, "completion_length": 3017.250015258789, "epoch": 0.11818426000537201, "grad_norm": 0.4233959913253784, "kl": 0.1070556640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.5e-07, "loss": 0.0578, "reward": -0.34382849745452404, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34382849745452404, "reward_after_std": 0.5786697827279568, "reward_before_mean": -0.19083243112254422, "reward_before_std": 0.5899570472538471, "reward_change_max": 0.0, "reward_change_mean": -0.152996058575809, "reward_change_min": -0.33991224505007267, "reward_change_std": 0.1359951812773943, "reward_std": 0.5786698050796986, "rewards/cosine_scaled_reward": -0.1683328878134489, "rewards/format_reward": 0.1458333395421505, "step": 275 }, { "advantage_max": 1.2778971940279007, "advantage_mean": -6.208820124697922e-10, "advantage_min": -0.7827732115983963, "advantage_std": 0.7522504199296236, "completion_length": 3097.1458740234375, "epoch": 0.1186140209508461, "grad_norm": 0.32514703273773193, "kl": 0.10076904296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.468584328659172e-07, "loss": 0.0364, "reward": -0.02349297795444727, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.02349297795444727, "reward_after_std": 0.7522504199296236, "reward_before_mean": 0.22807370871305466, "reward_before_std": 0.7849975526332855, "reward_change_max": 0.0012751221656799316, "reward_change_mean": -0.25156672461889684, "reward_change_min": -0.516126124188304, "reward_change_std": 0.20954430056735873, "reward_std": 0.7522504311054945, "rewards/cosine_scaled_reward": -0.04221314191818237, "rewards/format_reward": 0.31250000558793545, "step": 276 }, { "advantage_max": 1.3622451648116112, "advantage_mean": 1.9247334059890875e-08, "advantage_min": -0.7121556326746941, "advantage_std": 0.7644608467817307, "completion_length": 3284.0208740234375, "epoch": 0.11904378189632017, "grad_norm": 0.23637831211090088, "kl": 0.10858154296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.437170188473847e-07, "loss": 0.0391, "reward": -0.22789290419314057, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22789290419314057, "reward_after_std": 0.7644608467817307, "reward_before_mean": -0.056170093826949596, "reward_before_std": 0.7885326780378819, "reward_change_max": 0.0019710958003997803, "reward_change_mean": -0.1717228088527918, "reward_change_min": -0.4007701724767685, "reward_change_std": 0.16489384975284338, "reward_std": 0.7644608691334724, "rewards/cosine_scaled_reward": -0.1322517158696428, "rewards/format_reward": 0.2083333358168602, "step": 277 }, { "advantage_max": 1.145736400038004, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.7405370604246855, "advantage_std": 0.6818780563771725, "completion_length": 2933.854217529297, "epoch": 0.11947354284179425, "grad_norm": 0.21350227296352386, "kl": 0.10638427734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.405759110524894e-07, "loss": 0.0204, "reward": 0.050614255014806986, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.050614255014806986, "reward_after_std": 0.6818780712783337, "reward_before_mean": 0.3394548427313566, "reward_before_std": 0.6915983557701111, "reward_change_max": 0.0, "reward_change_mean": -0.28884059749543667, "reward_change_min": -0.4914071448147297, "reward_change_std": 0.2076748562976718, "reward_std": 0.681878101080656, "rewards/cosine_scaled_reward": -0.05943924654275179, "rewards/format_reward": 0.4583333395421505, "step": 278 }, { "advantage_max": 1.1358279809355736, "advantage_mean": 1.0554989271494009e-08, "advantage_min": -0.5852786153554916, "advantage_std": 0.6545769646763802, "completion_length": 3337.9791870117188, "epoch": 0.11990330378726834, "grad_norm": 0.3101421296596527, "kl": 0.1519775390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.37435262574394e-07, "loss": 0.006, "reward": -0.29368816688656807, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29368816688656807, "reward_after_std": 0.6545769795775414, "reward_before_mean": -0.12941771373152733, "reward_before_std": 0.6829807087779045, "reward_change_max": 0.001104399561882019, "reward_change_mean": -0.16427046852186322, "reward_change_min": -0.3638614732772112, "reward_change_std": 0.15205396991223097, "reward_std": 0.6545769944787025, "rewards/cosine_scaled_reward": -0.1480421926244162, "rewards/format_reward": 0.16666667349636555, "step": 279 }, { "advantage_max": 0.7546833716332912, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.48685088753700256, "advantage_std": 0.4466598145663738, "completion_length": 3140.0833740234375, "epoch": 0.12033306473274241, "grad_norm": 0.3025180697441101, "kl": 0.141265869140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.342952264838747e-07, "loss": 0.0072, "reward": -0.4372501354664564, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4372501354664564, "reward_after_std": 0.4466598182916641, "reward_before_mean": -0.29626163886860013, "reward_before_std": 0.46768759563565254, "reward_change_max": 0.0014112740755081177, "reward_change_mean": -0.14098850265145302, "reward_change_min": -0.3089320007711649, "reward_change_std": 0.1250120042823255, "reward_std": 0.4466598406434059, "rewards/cosine_scaled_reward": -0.20021415129303932, "rewards/format_reward": 0.10416666977107525, "step": 280 }, { "advantage_max": 1.1959688775241375, "advantage_mean": 4.346172144398253e-09, "advantage_min": -0.6171353831887245, "advantage_std": 0.6672304458916187, "completion_length": 2811.500030517578, "epoch": 0.12076282567821649, "grad_norm": 0.3476429283618927, "kl": 0.12994384765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.311559558218603e-07, "loss": 0.0447, "reward": -0.21905191242694855, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21905191242694855, "reward_after_std": 0.667230449616909, "reward_before_mean": -0.03119399247225374, "reward_before_std": 0.667837630957365, "reward_change_max": 0.0008665546774864197, "reward_change_mean": -0.18785792775452137, "reward_change_min": -0.38418367877602577, "reward_change_std": 0.1545093827880919, "reward_std": 0.6672304570674896, "rewards/cosine_scaled_reward": -0.1718469960615039, "rewards/format_reward": 0.3125000037252903, "step": 281 }, { "advantage_max": 1.2260249555110931, "advantage_mean": 6.208814573582799e-10, "advantage_min": -0.8227491453289986, "advantage_std": 0.758858572691679, "completion_length": 3273.6666870117188, "epoch": 0.12119258662369058, "grad_norm": 0.387556254863739, "kl": 0.11932373046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.28017603591974e-07, "loss": 0.0452, "reward": 0.19334421679377556, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19334421679377556, "reward_after_std": 0.7588585764169693, "reward_before_mean": 0.5266936728730798, "reward_before_std": 0.7771921046078205, "reward_change_max": 0.0, "reward_change_mean": -0.3333494057878852, "reward_change_min": -0.6056358814239502, "reward_change_std": 0.26108126156032085, "reward_std": 0.7588585764169693, "rewards/cosine_scaled_reward": 0.04459681920707226, "rewards/format_reward": 0.43750000931322575, "step": 282 }, { "advantage_max": 0.7315772138535976, "advantage_mean": 9.623666669122954e-09, "advantage_min": -0.5726720169186592, "advantage_std": 0.4609476812183857, "completion_length": 3130.916717529297, "epoch": 0.12162234756916465, "grad_norm": 0.13027717173099518, "kl": 0.122467041015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.248803227530763e-07, "loss": 0.02, "reward": -0.2621837866026908, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2621837866026908, "reward_after_std": 0.460947684943676, "reward_before_mean": -0.056230805814266205, "reward_before_std": 0.4849946014583111, "reward_change_max": 0.00018583238124847412, "reward_change_mean": -0.20595297683030367, "reward_change_min": -0.3778469115495682, "reward_change_std": 0.15768434666097164, "reward_std": 0.4609477035701275, "rewards/cosine_scaled_reward": -0.1947820857167244, "rewards/format_reward": 0.33333334140479565, "step": 283 }, { "advantage_max": 0.8172631971538067, "advantage_mean": -6.208817904251873e-10, "advantage_min": -0.4573686681687832, "advantage_std": 0.4856023285537958, "completion_length": 3315.2708435058594, "epoch": 0.12205210851463873, "grad_norm": 0.1633276790380478, "kl": 0.12286376953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.21744266211809e-07, "loss": 0.0207, "reward": -0.32998483069241047, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32998483069241047, "reward_after_std": 0.48560232296586037, "reward_before_mean": -0.1546590938232839, "reward_before_std": 0.499278049916029, "reward_change_max": 0.0008084401488304138, "reward_change_mean": -0.17532574757933617, "reward_change_min": -0.3571357876062393, "reward_change_std": 0.1462052697315812, "reward_std": 0.4856023285537958, "rewards/cosine_scaled_reward": -0.1294128829613328, "rewards/format_reward": 0.1041666679084301, "step": 284 }, { "advantage_max": 0.8041092157363892, "advantage_mean": 1.428027990302283e-08, "advantage_min": -0.5186256021261215, "advantage_std": 0.49103478714823723, "completion_length": 3182.437545776367, "epoch": 0.12248186946011282, "grad_norm": 0.2136518508195877, "kl": 0.14678955078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.186095868151436e-07, "loss": 0.0148, "reward": -0.44087208434939384, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.44087208434939384, "reward_after_std": 0.4910347815603018, "reward_before_mean": -0.3052906086668372, "reward_before_std": 0.5300016570836306, "reward_change_max": 0.0015909969806671143, "reward_change_mean": -0.13558145891875029, "reward_change_min": -0.33961706794798374, "reward_change_std": 0.1388704008422792, "reward_std": 0.4910348001867533, "rewards/cosine_scaled_reward": -0.2151453085243702, "rewards/format_reward": 0.125, "step": 285 }, { "advantage_max": 0.7011575847864151, "advantage_mean": 1.303851610012785e-08, "advantage_min": -0.3656802587211132, "advantage_std": 0.4015003554522991, "completion_length": 3223.979248046875, "epoch": 0.12291163040558689, "grad_norm": 0.21834701299667358, "kl": 0.144775390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.154764373429315e-07, "loss": 0.0162, "reward": -0.21185201895423234, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21185201895423234, "reward_after_std": 0.4015003517270088, "reward_before_mean": 0.013366896659135818, "reward_before_std": 0.36685472913086414, "reward_change_max": 0.0010210052132606506, "reward_change_mean": -0.22521890397183597, "reward_change_min": -0.3932218924164772, "reward_change_std": 0.15119481342844665, "reward_std": 0.4015003517270088, "rewards/cosine_scaled_reward": -0.15998322889208794, "rewards/format_reward": 0.33333334140479565, "step": 286 }, { "advantage_max": 0.9320897832512856, "advantage_mean": 1.3659397946064189e-08, "advantage_min": -0.4795554168522358, "advantage_std": 0.539739865809679, "completion_length": 3442.7291870117188, "epoch": 0.12334139135106097, "grad_norm": 0.21218940615653992, "kl": 0.1693115234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.123449705004581e-07, "loss": 0.0134, "reward": -0.4729801067151129, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4729801067151129, "reward_after_std": 0.539739865809679, "reward_before_mean": -0.35686985962092876, "reward_before_std": 0.5719426199793816, "reward_change_max": 0.0002767890691757202, "reward_change_mean": -0.11611024243757129, "reward_change_min": -0.31945983693003654, "reward_change_std": 0.13026831671595573, "reward_std": 0.539739865809679, "rewards/cosine_scaled_reward": -0.20968493004329503, "rewards/format_reward": 0.06250000186264515, "step": 287 }, { "advantage_max": 1.1425553224980831, "advantage_mean": -1.862645149230957e-09, "advantage_min": -0.7705506198108196, "advantage_std": 0.684613399207592, "completion_length": 3018.687530517578, "epoch": 0.12377115229653506, "grad_norm": 0.5515904426574707, "kl": 0.137664794921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.09215338910999e-07, "loss": 0.055, "reward": 0.0028756074607372284, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0028756074607372284, "reward_after_std": 0.6846134252846241, "reward_before_mean": 0.2719299551099539, "reward_before_std": 0.7088113687932491, "reward_change_max": 0.001004830002784729, "reward_change_mean": -0.2690543457865715, "reward_change_min": -0.48163413628935814, "reward_change_std": 0.20358690153807402, "reward_std": 0.6846134327352047, "rewards/cosine_scaled_reward": -0.00986835453659296, "rewards/format_reward": 0.2916666716337204, "step": 288 }, { "advantage_max": 1.3431474901735783, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -0.6358992829918861, "advantage_std": 0.7296426184475422, "completion_length": 3236.729217529297, "epoch": 0.12420091324200913, "grad_norm": 0.38007545471191406, "kl": 0.16259765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.060876951083828e-07, "loss": 0.0604, "reward": -0.12801504810340703, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12801504810340703, "reward_after_std": 0.7296426221728325, "reward_before_mean": 0.08196483521169284, "reward_before_std": 0.7192053915932775, "reward_change_max": 0.0013238191604614258, "reward_change_mean": -0.20997985545545816, "reward_change_min": -0.3979705609381199, "reward_change_std": 0.15810929238796234, "reward_std": 0.7296426370739937, "rewards/cosine_scaled_reward": -0.05276759644038975, "rewards/format_reward": 0.1875000074505806, "step": 289 }, { "advantage_max": 0.8264056518673897, "advantage_mean": 4.035730971629903e-09, "advantage_min": -0.5370477102696896, "advantage_std": 0.49496686458587646, "completion_length": 2990.5833892822266, "epoch": 0.1246306741874832, "grad_norm": 0.22771453857421875, "kl": 0.156890869140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 5.02962191529556e-07, "loss": 0.0317, "reward": -0.2288378532975912, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2288378532975912, "reward_after_std": 0.49496686831116676, "reward_before_mean": -0.019048910588026047, "reward_before_std": 0.5029870085418224, "reward_change_max": 0.0031675174832344055, "reward_change_mean": -0.2097889599390328, "reward_change_min": -0.36991651728749275, "reward_change_std": 0.15698903566226363, "reward_std": 0.49496688321232796, "rewards/cosine_scaled_reward": -0.13452445529401302, "rewards/format_reward": 0.2500000037252903, "step": 290 }, { "advantage_max": 1.0561319924890995, "advantage_mean": 1.7384688688615313e-08, "advantage_min": -0.4601830542087555, "advantage_std": 0.5659034736454487, "completion_length": 3517.3958740234375, "epoch": 0.12506043513295728, "grad_norm": 0.23331016302108765, "kl": 0.1968994140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.998389805071536e-07, "loss": 0.0179, "reward": -0.45905274827964604, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.45905274827964604, "reward_after_std": 0.5659034922719002, "reward_before_mean": -0.34678794955834746, "reward_before_std": 0.5680414754897356, "reward_change_max": 0.0005789622664451599, "reward_change_mean": -0.1122647994197905, "reward_change_min": -0.24433544650673866, "reward_change_std": 0.10274240979924798, "reward_std": 0.5659035071730614, "rewards/cosine_scaled_reward": -0.24631063733249903, "rewards/format_reward": 0.14583333767950535, "step": 291 }, { "advantage_max": 0.7819907106459141, "advantage_mean": 1.1796752907855534e-08, "advantage_min": -0.447596050798893, "advantage_std": 0.45008964464068413, "completion_length": 2885.416702270508, "epoch": 0.12549019607843137, "grad_norm": 0.20868058502674103, "kl": 0.14910888671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.967182142620745e-07, "loss": 0.0073, "reward": -0.18975193705409765, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.18975193705409765, "reward_after_std": 0.45008964091539383, "reward_before_mean": 0.03863196074962616, "reward_before_std": 0.4304855801165104, "reward_change_max": 0.0005499497056007385, "reward_change_mean": -0.22838388802483678, "reward_change_min": -0.3916623555123806, "reward_change_std": 0.1553486417979002, "reward_std": 0.4500896669924259, "rewards/cosine_scaled_reward": -0.17860069125890732, "rewards/format_reward": 0.39583333395421505, "step": 292 }, { "advantage_max": 0.5089023411273956, "advantage_mean": 1.800557003495129e-08, "advantage_min": -0.3330877050757408, "advantage_std": 0.31463501416146755, "completion_length": 3544.75, "epoch": 0.12591995702390546, "grad_norm": 0.30345186591148376, "kl": 0.19580078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.93600044896063e-07, "loss": 0.0152, "reward": -0.6130343154072762, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6130343154072762, "reward_after_std": 0.31463501416146755, "reward_before_mean": -0.5180140174925327, "reward_before_std": 0.33928588312119246, "reward_change_max": 0.0022564008831977844, "reward_change_mean": -0.09502031141892076, "reward_change_min": -0.21076207235455513, "reward_change_std": 0.09134689811617136, "reward_std": 0.3146350188180804, "rewards/cosine_scaled_reward": -0.2694236673414707, "rewards/format_reward": 0.02083333395421505, "step": 293 }, { "advantage_max": 1.4441608414053917, "advantage_mean": 2.5456150853919723e-08, "advantage_min": -0.5894607827067375, "advantage_std": 0.7765249088406563, "completion_length": 3450.8958740234375, "epoch": 0.12634971796937952, "grad_norm": 0.44929933547973633, "kl": 0.16973876953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.904846243842949e-07, "loss": 0.054, "reward": -0.3050708007067442, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3050708007067442, "reward_after_std": 0.7765249088406563, "reward_before_mean": -0.16588336415588856, "reward_before_std": 0.7844679802656174, "reward_change_max": 0.001365557312965393, "reward_change_mean": -0.1391874222899787, "reward_change_min": -0.3301129713654518, "reward_change_std": 0.13535843265708536, "reward_std": 0.7765249460935593, "rewards/cosine_scaled_reward": -0.17669168394058943, "rewards/format_reward": 0.1875000037252903, "step": 294 }, { "advantage_max": 0.9763939492404461, "advantage_mean": 4.2219957974154454e-08, "advantage_min": -0.4980233833193779, "advantage_std": 0.5610970687121153, "completion_length": 3474.312530517578, "epoch": 0.1267794789148536, "grad_norm": 0.3198108375072479, "kl": 0.1956787109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.873721045679706e-07, "loss": 0.0476, "reward": -0.4607831072062254, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4607831072062254, "reward_after_std": 0.5610970705747604, "reward_before_mean": -0.3435654938220978, "reward_before_std": 0.592672111466527, "reward_change_max": 0.002817682921886444, "reward_change_mean": -0.1172175770625472, "reward_change_min": -0.28041235357522964, "reward_change_std": 0.12638260750100017, "reward_std": 0.5610970854759216, "rewards/cosine_scaled_reward": -0.2238660864531994, "rewards/format_reward": 0.10416666977107525, "step": 295 }, { "advantage_max": 1.1216935366392136, "advantage_mean": 3.104408841103634e-09, "advantage_min": -0.6479685008525848, "advantage_std": 0.6531911492347717, "completion_length": 3355.416717529297, "epoch": 0.1272092398603277, "grad_norm": 0.28531572222709656, "kl": 0.20306396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.842626371469149e-07, "loss": 0.0555, "reward": -0.3106984067708254, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3106984067708254, "reward_after_std": 0.6531911678612232, "reward_before_mean": -0.15182944014668465, "reward_before_std": 0.6871637478470802, "reward_change_max": 0.0007486343383789062, "reward_change_mean": -0.15886896662414074, "reward_change_min": -0.3761896900832653, "reward_change_std": 0.15635326504707336, "reward_std": 0.6531911827623844, "rewards/cosine_scaled_reward": -0.13841471821069717, "rewards/format_reward": 0.12500000186264515, "step": 296 }, { "advantage_max": 1.4579402282834053, "advantage_mean": -6.829699139565548e-09, "advantage_min": -0.7225583121180534, "advantage_std": 0.7991862371563911, "completion_length": 3136.7917098999023, "epoch": 0.12763900080580176, "grad_norm": 0.2663061320781708, "kl": 0.2049560546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.811563736721829e-07, "loss": 0.031, "reward": -0.09228997118771076, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09228997118771076, "reward_after_std": 0.7991862073540688, "reward_before_mean": 0.12189705856144428, "reward_before_std": 0.7998332735151052, "reward_change_max": 0.0016183778643608093, "reward_change_mean": -0.214187016710639, "reward_change_min": -0.44562828540802, "reward_change_std": 0.1732492196606472, "reward_std": 0.7991862371563911, "rewards/cosine_scaled_reward": -0.1161348121240735, "rewards/format_reward": 0.3541666753590107, "step": 297 }, { "advantage_max": 1.0584010854363441, "advantage_mean": 6.829699028543246e-09, "advantage_min": -0.4953242391347885, "advantage_std": 0.572895335033536, "completion_length": 3253.625030517578, "epoch": 0.12806876175127585, "grad_norm": 0.1782265454530716, "kl": 0.161346435546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.780534655386743e-07, "loss": 0.0417, "reward": -0.3407241627573967, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3407241627573967, "reward_after_std": 0.5728953443467617, "reward_before_mean": -0.18652180209755898, "reward_before_std": 0.5623484589159489, "reward_change_max": 0.00028168410062789917, "reward_change_mean": -0.15420235786587, "reward_change_min": -0.3289311062544584, "reward_change_std": 0.12524724192917347, "reward_std": 0.5728953778743744, "rewards/cosine_scaled_reward": -0.21826090663671494, "rewards/format_reward": 0.2500000037252903, "step": 298 }, { "advantage_max": 0.8785120993852615, "advantage_mean": -1.8626452158443385e-08, "advantage_min": -0.5166600793600082, "advantage_std": 0.5006861351430416, "completion_length": 3132.8333740234375, "epoch": 0.12849852269674994, "grad_norm": 0.2209920585155487, "kl": 0.20880126953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.749540639777539e-07, "loss": 0.0416, "reward": -0.23161973431706429, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23161973431706429, "reward_after_std": 0.5006861425936222, "reward_before_mean": -0.025696586817502975, "reward_before_std": 0.4925981406122446, "reward_change_max": 0.0004803091287612915, "reward_change_mean": -0.2059231624007225, "reward_change_min": -0.3664503414183855, "reward_change_std": 0.14509868621826172, "reward_std": 0.5006861425936222, "rewards/cosine_scaled_reward": -0.16909829899668694, "rewards/format_reward": 0.31250000186264515, "step": 299 }, { "advantage_max": 1.1155625022947788, "advantage_mean": -1.7384688244526103e-08, "advantage_min": -0.684398926794529, "advantage_std": 0.6856273524463177, "completion_length": 3521.125030517578, "epoch": 0.128928283642224, "grad_norm": 0.41423046588897705, "kl": 0.21051025390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.7185832004988133e-07, "loss": 0.0356, "reward": -0.275171484798193, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.275171484798193, "reward_after_std": 0.6856273412704468, "reward_before_mean": -0.10107248276472092, "reward_before_std": 0.7486118748784065, "reward_change_max": 0.0003411322832107544, "reward_change_mean": -0.17409902391955256, "reward_change_min": -0.4591302238404751, "reward_change_std": 0.1908327336423099, "reward_std": 0.6856273617595434, "rewards/cosine_scaled_reward": -0.15470290556550026, "rewards/format_reward": 0.2083333395421505, "step": 300 }, { "advantage_max": 0.5909117199480534, "advantage_mean": 8.69234451084111e-09, "advantage_min": -0.3319068029522896, "advantage_std": 0.3519499022513628, "completion_length": 3582.1458435058594, "epoch": 0.1293580445876981, "grad_norm": 0.33114540576934814, "kl": 0.248291015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.68766384637248e-07, "loss": 0.0104, "reward": -0.6012953966856003, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6012953966856003, "reward_after_std": 0.35194990783929825, "reward_before_mean": -0.5069984756410122, "reward_before_std": 0.3748570717871189, "reward_change_max": 0.0005580708384513855, "reward_change_mean": -0.09429693291895092, "reward_change_min": -0.2428525909781456, "reward_change_std": 0.0962411742657423, "reward_std": 0.35194992274045944, "rewards/cosine_scaled_reward": -0.2743325661867857, "rewards/format_reward": 0.0416666679084301, "step": 301 }, { "advantage_max": 0.8204503990709782, "advantage_mean": 1.4280280069556284e-08, "advantage_min": -0.4119828939437866, "advantage_std": 0.46756322495639324, "completion_length": 3010.9584045410156, "epoch": 0.12978780553317218, "grad_norm": 0.47053688764572144, "kl": 0.21630859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.656784084364238e-07, "loss": 0.0041, "reward": -0.2512365598231554, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2512365598231554, "reward_after_std": 0.46756322868168354, "reward_before_mean": -0.04876471310853958, "reward_before_std": 0.45047610625624657, "reward_change_max": 0.00029747188091278076, "reward_change_mean": -0.2024718406610191, "reward_change_min": -0.3658544756472111, "reward_change_std": 0.13882464566268027, "reward_std": 0.4675632454454899, "rewards/cosine_scaled_reward": -0.20146569213829935, "rewards/format_reward": 0.35416666977107525, "step": 302 }, { "advantage_max": 1.0027489960193634, "advantage_mean": -1.3659398501175701e-08, "advantage_min": -0.6079968810081482, "advantage_std": 0.6013409867882729, "completion_length": 3294.0208435058594, "epoch": 0.13021756647864624, "grad_norm": 0.2946097254753113, "kl": 0.20513916015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.6259454195101267e-07, "loss": 0.0272, "reward": -0.037076723761856556, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.037076723761856556, "reward_after_std": 0.6013409793376923, "reward_before_mean": 0.22968878597021103, "reward_before_std": 0.6106087286025286, "reward_change_max": 0.0007095932960510254, "reward_change_mean": -0.2667655418626964, "reward_change_min": -0.5118739418685436, "reward_change_std": 0.20511623448692262, "reward_std": 0.601341001689434, "rewards/cosine_scaled_reward": -0.020572278066538274, "rewards/format_reward": 0.2708333395421505, "step": 303 }, { "advantage_max": 1.2136338874697685, "advantage_mean": 1.30385160446167e-08, "advantage_min": -0.536174613982439, "advantage_std": 0.6652023270726204, "completion_length": 3135.3333435058594, "epoch": 0.13064732742412033, "grad_norm": 0.909904956817627, "kl": 0.21875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.59514935484316e-07, "loss": 0.0954, "reward": -0.3976028058677912, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3976028058677912, "reward_after_std": 0.6652023233473301, "reward_before_mean": -0.27619913406670094, "reward_before_std": 0.6835384741425514, "reward_change_max": 0.0007878690958023071, "reward_change_mean": -0.12140368483960629, "reward_change_min": -0.3066728226840496, "reward_change_std": 0.12346395384520292, "reward_std": 0.6652023270726204, "rewards/cosine_scaled_reward": -0.20059956423938274, "rewards/format_reward": 0.12500000186264515, "step": 304 }, { "advantage_max": 0.828279159963131, "advantage_mean": 2.0489096919096284e-08, "advantage_min": -0.4640212059020996, "advantage_std": 0.4604710415005684, "completion_length": 3353.312530517578, "epoch": 0.13107708836959442, "grad_norm": 0.2224566787481308, "kl": 0.2042236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.5643973913200837e-07, "loss": 0.0131, "reward": -0.4555053375661373, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4555053375661373, "reward_after_std": 0.4604710415005684, "reward_before_mean": -0.3266245014965534, "reward_before_std": 0.46067017689347267, "reward_change_max": 0.0012560561299324036, "reward_change_mean": -0.12888082768768072, "reward_change_min": -0.2733853794634342, "reward_change_std": 0.1072563249617815, "reward_std": 0.460471048951149, "rewards/cosine_scaled_reward": -0.23622892145067453, "rewards/format_reward": 0.14583333767950535, "step": 305 }, { "advantage_max": 0.9919533617794514, "advantage_mean": -1.3038515933594397e-08, "advantage_min": -0.6583028584718704, "advantage_std": 0.5860892832279205, "completion_length": 3181.6459045410156, "epoch": 0.13150684931506848, "grad_norm": 0.2910352051258087, "kl": 0.20068359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.5336910277482155e-07, "loss": 0.0436, "reward": -0.16564980894327164, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16564980894327164, "reward_after_std": 0.5860892869532108, "reward_before_mean": 0.05445084348320961, "reward_before_std": 0.6001198813319206, "reward_change_max": 0.0008018389344215393, "reward_change_mean": -0.22010071342810988, "reward_change_min": -0.43430665135383606, "reward_change_std": 0.1725041400641203, "reward_std": 0.5860893167555332, "rewards/cosine_scaled_reward": -0.11860790103673935, "rewards/format_reward": 0.29166667349636555, "step": 306 }, { "advantage_max": 1.087684117257595, "advantage_mean": 9.313226467799751e-09, "advantage_min": -0.7205011770129204, "advantage_std": 0.6583491340279579, "completion_length": 3365.2916870117188, "epoch": 0.13193661026054257, "grad_norm": 0.267095148563385, "kl": 0.192108154296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.503031760712397e-07, "loss": 0.0315, "reward": -0.24927114881575108, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24927114881575108, "reward_after_std": 0.6583491340279579, "reward_before_mean": -0.06470988690853119, "reward_before_std": 0.7104393504559994, "reward_change_max": 0.0005958974361419678, "reward_change_mean": -0.1845612870529294, "reward_change_min": -0.394296370446682, "reward_change_std": 0.17448607087135315, "reward_std": 0.6583491563796997, "rewards/cosine_scaled_reward": -0.1261049285531044, "rewards/format_reward": 0.1875000074505806, "step": 307 }, { "advantage_max": 1.0975428111851215, "advantage_mean": 1.7384688688615313e-08, "advantage_min": -0.5416125729680061, "advantage_std": 0.6091160476207733, "completion_length": 3471.5416870117188, "epoch": 0.13236637120601666, "grad_norm": 0.31679731607437134, "kl": 0.2174072265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.4724210845020494e-07, "loss": 0.049, "reward": -0.38449055328965187, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.38449055328965187, "reward_after_std": 0.6091160513460636, "reward_before_mean": -0.24975207168608904, "reward_before_std": 0.6226618140935898, "reward_change_max": 0.0009910240769386292, "reward_change_mean": -0.13473847368732095, "reward_change_min": -0.3096616677939892, "reward_change_std": 0.12519349483773112, "reward_std": 0.6091160699725151, "rewards/cosine_scaled_reward": -0.18737603351473808, "rewards/format_reward": 0.12500000186264515, "step": 308 }, { "advantage_max": 0.6175320520997047, "advantage_mean": 1.862645243599914e-08, "advantage_min": -0.34809035062789917, "advantage_std": 0.36943780444562435, "completion_length": 3569.3541870117188, "epoch": 0.13279613215149072, "grad_norm": 0.2638093829154968, "kl": 0.2203369140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.441860491038345e-07, "loss": 0.0143, "reward": -0.5737578943371773, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5737578943371773, "reward_after_std": 0.3694378063082695, "reward_before_mean": -0.46981882490217686, "reward_before_std": 0.3899908382445574, "reward_change_max": 0.00044862180948257446, "reward_change_mean": -0.10393907222896814, "reward_change_min": -0.23785406351089478, "reward_change_std": 0.10086669120937586, "reward_std": 0.3694378100335598, "rewards/cosine_scaled_reward": -0.266159413382411, "rewards/format_reward": 0.06250000186264515, "step": 309 }, { "advantage_max": 0.817685928195715, "advantage_mean": 1.3659398223619945e-08, "advantage_min": -0.3955564573407173, "advantage_std": 0.4571103174239397, "completion_length": 3462.7916870117188, "epoch": 0.1332258930969648, "grad_norm": 0.22951604425907135, "kl": 0.2353515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.4113514698014953e-07, "loss": 0.028, "reward": -0.5360782295465469, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5360782295465469, "reward_after_std": 0.4571103136986494, "reward_before_mean": -0.4353763135150075, "reward_before_std": 0.4729432687163353, "reward_change_max": 0.00160951167345047, "reward_change_mean": -0.1007019174285233, "reward_change_min": -0.21457276679575443, "reward_change_std": 0.0927984886802733, "reward_std": 0.4571103136986494, "rewards/cosine_scaled_reward": -0.22810482699424028, "rewards/format_reward": 0.02083333395421505, "step": 310 }, { "advantage_max": 1.941018432378769, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.9756518229842186, "advantage_std": 1.104165904223919, "completion_length": 3024.041717529297, "epoch": 0.1336556540424389, "grad_norm": 0.38119131326675415, "kl": 0.1837158203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3808955077581546e-07, "loss": 0.038, "reward": 0.3841081727296114, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3841081727296114, "reward_after_std": 1.1041659265756607, "reward_before_mean": 0.7348857917822897, "reward_before_std": 1.1151660047471523, "reward_change_max": 0.00021977722644805908, "reward_change_mean": -0.350777605548501, "reward_change_min": -0.7593125626444817, "reward_change_std": 0.29093188885599375, "reward_std": 1.104165956377983, "rewards/cosine_scaled_reward": 0.07577620819211006, "rewards/format_reward": 0.583333345130086, "step": 311 }, { "advantage_max": 1.1331679672002792, "advantage_mean": 7.450580874479584e-09, "advantage_min": -0.5661154612898827, "advantage_std": 0.6239109337329865, "completion_length": 3242.6458740234375, "epoch": 0.13408541498791296, "grad_norm": 0.47699397802352905, "kl": 0.19873046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.350494089288943e-07, "loss": 0.0595, "reward": -0.27003084775060415, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.27003084775060415, "reward_after_std": 0.6239109076559544, "reward_before_mean": -0.09597002156078815, "reward_before_std": 0.6241473704576492, "reward_change_max": 0.00012999027967453003, "reward_change_mean": -0.17406082898378372, "reward_change_min": -0.3532703071832657, "reward_change_std": 0.1409221258945763, "reward_std": 0.6239109262824059, "rewards/cosine_scaled_reward": -0.1729850135743618, "rewards/format_reward": 0.2500000074505806, "step": 312 }, { "advantage_max": 0.8522760942578316, "advantage_mean": 1.614292516327609e-08, "advantage_min": -0.5112400576472282, "advantage_std": 0.5129505135118961, "completion_length": 3525.1875, "epoch": 0.13451517593338705, "grad_norm": 0.3101408779621124, "kl": 0.23974609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.3201486961161093e-07, "loss": 0.0298, "reward": -0.4367400035262108, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4367400035262108, "reward_after_std": 0.5129505209624767, "reward_before_mean": -0.3022693954408169, "reward_before_std": 0.5507849641144276, "reward_change_max": 0.0027219951152801514, "reward_change_mean": -0.134470593649894, "reward_change_min": -0.318566033616662, "reward_change_std": 0.1392009335104376, "reward_std": 0.5129505544900894, "rewards/cosine_scaled_reward": -0.192801371216774, "rewards/format_reward": 0.08333333395421505, "step": 313 }, { "advantage_max": 1.277863722294569, "advantage_mean": -3.414849625293925e-08, "advantage_min": -0.6820605844259262, "advantage_std": 0.7343168128281832, "completion_length": 3019.5208892822266, "epoch": 0.13494493687886114, "grad_norm": 0.32630807161331177, "kl": 0.18817138671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2898608072313045e-07, "loss": 0.032, "reward": 0.1771506890654564, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1771506890654564, "reward_after_std": 0.7343167997896671, "reward_before_mean": 0.5005612391978502, "reward_before_std": 0.7140043303370476, "reward_change_max": 0.0004641786217689514, "reward_change_mean": -0.32341056829318404, "reward_change_min": -0.6036920920014381, "reward_change_std": 0.235004156595096, "reward_std": 0.7343168370425701, "rewards/cosine_scaled_reward": 0.02111393678933382, "rewards/format_reward": 0.45833333767950535, "step": 314 }, { "advantage_max": 1.2209500595927238, "advantage_mean": 6.208817404651512e-09, "advantage_min": -0.626538373529911, "advantage_std": 0.6995561234652996, "completion_length": 3144.3125610351562, "epoch": 0.13537469782433523, "grad_norm": 0.4137888252735138, "kl": 0.2066650390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2596318988235037e-07, "loss": 0.0598, "reward": -0.11921213869936764, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11921213869936764, "reward_after_std": 0.699556116014719, "reward_before_mean": 0.10206769104115665, "reward_before_std": 0.7095492631196976, "reward_change_max": 0.0006157979369163513, "reward_change_mean": -0.22127982415258884, "reward_change_min": -0.4458838626742363, "reward_change_std": 0.17664472851902246, "reward_std": 0.699556116014719, "rewards/cosine_scaled_reward": -0.1468828348442912, "rewards/format_reward": 0.39583334513008595, "step": 315 }, { "advantage_max": 1.1820750199258327, "advantage_mean": 1.1796753407455896e-08, "advantage_min": -0.6927676498889923, "advantage_std": 0.6832530628889799, "completion_length": 3352.7708435058594, "epoch": 0.1358044587698093, "grad_norm": 0.4590507447719574, "kl": 0.2908935546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.2294634442070553e-07, "loss": 0.0236, "reward": -0.1680474071763456, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1680474071763456, "reward_after_std": 0.6832530628889799, "reward_before_mean": 0.03907960094511509, "reward_before_std": 0.702156150713563, "reward_change_max": 0.0009274706244468689, "reward_change_mean": -0.20712700299918652, "reward_change_min": -0.40073617175221443, "reward_change_std": 0.16930399276316166, "reward_std": 0.6832530684769154, "rewards/cosine_scaled_reward": -0.09504354221280664, "rewards/format_reward": 0.22916667722165585, "step": 316 }, { "advantage_max": 0.6547868959605694, "advantage_mean": 3.104408841103634e-09, "advantage_min": -0.4412279613316059, "advantage_std": 0.3971537575125694, "completion_length": 3449.7291870117188, "epoch": 0.13623421971528338, "grad_norm": 0.20841732621192932, "kl": 0.2603759765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1993569137498776e-07, "loss": 0.0348, "reward": -0.5068665631115437, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5068665631115437, "reward_after_std": 0.3971537612378597, "reward_before_mean": -0.38367069512605667, "reward_before_std": 0.42491958104074, "reward_change_max": 0.0007017478346824646, "reward_change_mean": -0.12319588242098689, "reward_change_min": -0.24818914011120796, "reward_change_std": 0.10988725908100605, "reward_std": 0.3971537798643112, "rewards/cosine_scaled_reward": -0.20225201547145844, "rewards/format_reward": 0.02083333395421505, "step": 317 }, { "advantage_max": 0.5610633417963982, "advantage_mean": 1.6763806454100916e-08, "advantage_min": -0.4056569039821625, "advantage_std": 0.3434066530317068, "completion_length": 3370.5833435058594, "epoch": 0.13666398066075747, "grad_norm": 0.4364916682243347, "kl": 0.2579345703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1693137748017915e-07, "loss": 0.0041, "reward": -0.5771885542199016, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5771885542199016, "reward_after_std": 0.3434066567569971, "reward_before_mean": -0.4728893060237169, "reward_before_std": 0.36756037920713425, "reward_change_max": 0.0025258511304855347, "reward_change_mean": -0.10429924679920077, "reward_change_min": -0.2390405237674713, "reward_change_std": 0.0996963488869369, "reward_std": 0.3434066642075777, "rewards/cosine_scaled_reward": -0.28852798976004124, "rewards/format_reward": 0.10416666977107525, "step": 318 }, { "advantage_max": 0.8978993855416775, "advantage_mean": 4.346171977864799e-09, "advantage_min": -0.4651683270931244, "advantage_std": 0.5090115256607533, "completion_length": 3092.4583435058594, "epoch": 0.13709374160623153, "grad_norm": 0.2740548253059387, "kl": 0.2391357421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1393354916230005e-07, "loss": 0.029, "reward": -0.360146077349782, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.360146077349782, "reward_after_std": 0.5090115405619144, "reward_before_mean": -0.20190691202878952, "reward_before_std": 0.508797250688076, "reward_change_max": 0.0010357275605201721, "reward_change_mean": -0.15823917277157307, "reward_change_min": -0.313229214400053, "reward_change_std": 0.12622191943228245, "reward_std": 0.5090115517377853, "rewards/cosine_scaled_reward": -0.2467867974191904, "rewards/format_reward": 0.29166667349636555, "step": 319 }, { "advantage_max": 1.0785776749253273, "advantage_mean": 3.3306690738754696e-16, "advantage_min": -0.6852370798587799, "advantage_std": 0.6292976140975952, "completion_length": 3482.5833740234375, "epoch": 0.13752350255170562, "grad_norm": 0.2715871334075928, "kl": 0.235107421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.1094235253127374e-07, "loss": 0.0339, "reward": -0.24707840383052826, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24707840383052826, "reward_after_std": 0.6292976178228855, "reward_before_mean": -0.06078509567305446, "reward_before_std": 0.6530494913458824, "reward_change_max": 0.001035161316394806, "reward_change_mean": -0.18629332026466727, "reward_change_min": -0.35968566313385963, "reward_change_std": 0.15348220150917768, "reward_std": 0.6292976513504982, "rewards/cosine_scaled_reward": -0.13455921597778797, "rewards/format_reward": 0.20833334140479565, "step": 320 }, { "advantage_max": 0.8649835288524628, "advantage_mean": 8.692343789196144e-09, "advantage_min": -0.4761129803955555, "advantage_std": 0.5017462074756622, "completion_length": 3371.604217529297, "epoch": 0.1379532634971797, "grad_norm": 0.20877622067928314, "kl": 0.2081298828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.079579333738039e-07, "loss": 0.0328, "reward": -0.3520048074424267, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3520048074424267, "reward_after_std": 0.5017462112009525, "reward_before_mean": -0.1873673014342785, "reward_before_std": 0.5133707597851753, "reward_change_max": 0.00033995509147644043, "reward_change_mean": -0.1646375055424869, "reward_change_min": -0.33179262839257717, "reward_change_std": 0.13372111832723022, "reward_std": 0.5017462335526943, "rewards/cosine_scaled_reward": -0.18743366189301014, "rewards/format_reward": 0.1875000074505806, "step": 321 }, { "advantage_max": 1.695661038160324, "advantage_mean": 3.725290742551124e-09, "advantage_min": -0.9484876394271851, "advantage_std": 0.9707401916384697, "completion_length": 3176.9584045410156, "epoch": 0.13838302444265377, "grad_norm": 0.39954689145088196, "kl": 0.18072509765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.0498043714627006e-07, "loss": 0.0147, "reward": 0.2530910149216652, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2530910149216652, "reward_after_std": 0.9707401990890503, "reward_before_mean": 0.5735904686152935, "reward_before_std": 0.9938974939286709, "reward_change_max": 0.0004786550998687744, "reward_change_mean": -0.32049945648759604, "reward_change_min": -0.6748972125351429, "reward_change_std": 0.261513514444232, "reward_std": 0.9707402586936951, "rewards/cosine_scaled_reward": 0.0680452270898968, "rewards/format_reward": 0.43750000186264515, "step": 322 }, { "advantage_max": 0.6920017264783382, "advantage_mean": 1.5522043150806297e-08, "advantage_min": -0.35473257303237915, "advantage_std": 0.3933586720377207, "completion_length": 3374.8958740234375, "epoch": 0.13881278538812786, "grad_norm": 0.297222375869751, "kl": 0.263916015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 4.020100089676376e-07, "loss": 0.0215, "reward": -0.5647532492876053, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5647532492876053, "reward_after_std": 0.3933586683124304, "reward_before_mean": -0.465068805962801, "reward_before_std": 0.4043907020241022, "reward_change_max": 0.0002632811665534973, "reward_change_mean": -0.09968444425612688, "reward_change_min": -0.2134816087782383, "reward_change_std": 0.09020489361137152, "reward_std": 0.3933586794883013, "rewards/cosine_scaled_reward": -0.2846177350729704, "rewards/format_reward": 0.1041666679084301, "step": 323 }, { "advantage_max": 1.182119145989418, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -0.589189887046814, "advantage_std": 0.6736166179180145, "completion_length": 3174.1458740234375, "epoch": 0.13924254633360195, "grad_norm": 0.27667075395584106, "kl": 0.20458984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.9904679361238526e-07, "loss": 0.0336, "reward": -0.2722993418574333, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2722993418574333, "reward_after_std": 0.6736166253685951, "reward_before_mean": -0.10293327644467354, "reward_before_std": 0.6979727037250996, "reward_change_max": 0.0011743083596229553, "reward_change_mean": -0.16936606261879206, "reward_change_min": -0.4443657621741295, "reward_change_std": 0.16784037183970213, "reward_std": 0.6736166514456272, "rewards/cosine_scaled_reward": -0.15563330706208944, "rewards/format_reward": 0.2083333358168602, "step": 324 }, { "advantage_max": 0.7161058038473129, "advantage_mean": -1.924733378233512e-08, "advantage_min": -0.42096275091171265, "advantage_std": 0.42308012396097183, "completion_length": 3457.9583740234375, "epoch": 0.139672307279076, "grad_norm": 0.3043292760848999, "kl": 0.2640380859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.9609093550344907e-07, "loss": 0.0248, "reward": -0.49917715275660157, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.49917715275660157, "reward_after_std": 0.42308012396097183, "reward_before_mean": -0.37698634527623653, "reward_before_std": 0.44260069727897644, "reward_change_max": 0.000109843909740448, "reward_change_mean": -0.12219085113611072, "reward_change_min": -0.249165752902627, "reward_change_std": 0.10831605177372694, "reward_std": 0.42308013141155243, "rewards/cosine_scaled_reward": -0.219743175432086, "rewards/format_reward": 0.06250000186264515, "step": 325 }, { "advantage_max": 0.873748991638422, "advantage_mean": 1.1796752907855534e-08, "advantage_min": -0.4862126335501671, "advantage_std": 0.4957350492477417, "completion_length": 3568.1041870117188, "epoch": 0.1401020682245501, "grad_norm": 0.30774810910224915, "kl": 0.2177734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.931425787051832e-07, "loss": 0.0098, "reward": -0.43956192396581173, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.43956192396581173, "reward_after_std": 0.495735052973032, "reward_before_mean": -0.3076224606484175, "reward_before_std": 0.5075330026447773, "reward_change_max": 0.002222321927547455, "reward_change_mean": -0.1319394651800394, "reward_change_min": -0.262392645701766, "reward_change_std": 0.11392830964177847, "reward_std": 0.4957350678741932, "rewards/cosine_scaled_reward": -0.19547789730131626, "rewards/format_reward": 0.0833333358168602, "step": 326 }, { "advantage_max": 0.9745964594185352, "advantage_mean": 1.8626452602532595e-09, "advantage_min": -0.5230999998748302, "advantage_std": 0.5807579308748245, "completion_length": 3320.104217529297, "epoch": 0.1405318291700242, "grad_norm": 0.24493198096752167, "kl": 0.21142578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.902018669163384e-07, "loss": 0.0319, "reward": -0.329512644559145, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.329512644559145, "reward_after_std": 0.5807579346001148, "reward_before_mean": -0.16519994661211967, "reward_before_std": 0.6164507642388344, "reward_change_max": 0.00039371103048324585, "reward_change_mean": -0.1643127277493477, "reward_change_min": -0.3965680245310068, "reward_change_std": 0.16003134800121188, "reward_std": 0.5807579457759857, "rewards/cosine_scaled_reward": -0.2284332998096943, "rewards/format_reward": 0.2916666716337204, "step": 327 }, { "advantage_max": 0.9292752891778946, "advantage_mean": -3.725290464995368e-09, "advantage_min": -0.6286787204444408, "advantage_std": 0.559509139508009, "completion_length": 3411.6666870117188, "epoch": 0.14096159011549825, "grad_norm": 0.25871866941452026, "kl": 0.20391845703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.872689434630585e-07, "loss": 0.0353, "reward": -0.24246928468346596, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24246928468346596, "reward_after_std": 0.5595091283321381, "reward_before_mean": -0.04504328779876232, "reward_before_std": 0.5903872884809971, "reward_change_max": 0.000963360071182251, "reward_change_mean": -0.19742596801370382, "reward_change_min": -0.3887498751282692, "reward_change_std": 0.16155768744647503, "reward_std": 0.5595091357827187, "rewards/cosine_scaled_reward": -0.1058549890294671, "rewards/format_reward": 0.16666667349636555, "step": 328 }, { "advantage_max": 1.440411750227213, "advantage_mean": 2.421438777266971e-08, "advantage_min": -0.7262358516454697, "advantage_std": 0.8411347977817059, "completion_length": 3427.375030517578, "epoch": 0.14139135106097234, "grad_norm": 0.6285274624824524, "kl": 0.2213134765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.843439512918949e-07, "loss": 0.0674, "reward": -0.22303597256541252, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.22303597256541252, "reward_after_std": 0.8411347977817059, "reward_before_mean": -0.05335083790123463, "reward_before_std": 0.903580904006958, "reward_change_max": 0.0007652938365936279, "reward_change_mean": -0.16968512441962957, "reward_change_min": -0.4875176288187504, "reward_change_std": 0.19847944751381874, "reward_std": 0.8411348517984152, "rewards/cosine_scaled_reward": -0.14125875011086464, "rewards/format_reward": 0.2291666753590107, "step": 329 }, { "advantage_max": 1.123570017516613, "advantage_mean": -1.3038516488705909e-08, "advantage_min": -0.5770345367491245, "advantage_std": 0.6387611366808414, "completion_length": 3027.8958435058594, "epoch": 0.14182111200644643, "grad_norm": 0.47540807723999023, "kl": 0.1708984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.8142703296283953e-07, "loss": 0.0523, "reward": -0.19906144216656685, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19906144216656685, "reward_after_std": 0.6387611404061317, "reward_before_mean": 0.0005499956496350933, "reward_before_std": 0.6400634087622166, "reward_change_max": 0.0007862299680709839, "reward_change_mean": -0.19961145147681236, "reward_change_min": -0.40712923742830753, "reward_change_std": 0.16520654130727053, "reward_std": 0.6387611590325832, "rewards/cosine_scaled_reward": -0.15597500931471586, "rewards/format_reward": 0.31250000186264515, "step": 330 }, { "advantage_max": 1.708313513547182, "advantage_mean": -6.208817904251873e-10, "advantage_min": -0.7142710834741592, "advantage_std": 0.9557903781533241, "completion_length": 3075.0209350585938, "epoch": 0.1422508729519205, "grad_norm": 0.5600759387016296, "kl": 0.206298828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.785183306423767e-07, "loss": 0.0549, "reward": -0.09575383830815554, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.09575383830815554, "reward_after_std": 0.9557903818786144, "reward_before_mean": 0.09911664738319814, "reward_before_std": 0.988950040191412, "reward_change_max": 0.00021012872457504272, "reward_change_mean": -0.19487048499286175, "reward_change_min": -0.5461224801838398, "reward_change_std": 0.20158475451171398, "reward_std": 0.9557904042303562, "rewards/cosine_scaled_reward": -0.11710834898985922, "rewards/format_reward": 0.3333333395421505, "step": 331 }, { "advantage_max": 1.0761032812297344, "advantage_mean": 8.692344399818808e-09, "advantage_min": -0.6102985218167305, "advantage_std": 0.6388733554631472, "completion_length": 3123.729217529297, "epoch": 0.14268063389739458, "grad_norm": 0.6462621092796326, "kl": 0.2242431640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.7561798609655373e-07, "loss": 0.0762, "reward": -0.3358348747715354, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3358348747715354, "reward_after_std": 0.6388733480125666, "reward_before_mean": -0.18209782801568508, "reward_before_std": 0.6838607098907232, "reward_change_max": 0.0011688843369483948, "reward_change_mean": -0.15373704163357615, "reward_change_min": -0.39926641806960106, "reward_change_std": 0.16479186434298754, "reward_std": 0.6388733722269535, "rewards/cosine_scaled_reward": -0.23688225261867046, "rewards/format_reward": 0.2916666753590107, "step": 332 }, { "advantage_max": 1.2885508500039577, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.6179642155766487, "advantage_std": 0.7300197221338749, "completion_length": 3356.6875915527344, "epoch": 0.14311039484286867, "grad_norm": 0.7881283164024353, "kl": 0.25048828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.72726140684072e-07, "loss": 0.0781, "reward": -0.2873462284915149, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2873462284915149, "reward_after_std": 0.7300197333097458, "reward_before_mean": -0.13114534609485418, "reward_before_std": 0.7628993988037109, "reward_change_max": 0.0, "reward_change_mean": -0.15620088949799538, "reward_change_min": -0.3791564330458641, "reward_change_std": 0.1514206761494279, "reward_std": 0.7300197593867779, "rewards/cosine_scaled_reward": -0.18015600554645061, "rewards/format_reward": 0.22916666977107525, "step": 333 }, { "advantage_max": 1.157546617090702, "advantage_mean": 1.8626447051417472e-09, "advantage_min": -0.6865742281079292, "advantage_std": 0.7083022836595774, "completion_length": 3158.229202270508, "epoch": 0.14354015578834273, "grad_norm": 0.32352304458618164, "kl": 0.2451171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6984293534939737e-07, "loss": 0.0179, "reward": -0.20468927174806595, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20468927174806595, "reward_after_std": 0.7083022687584162, "reward_before_mean": -0.010039042681455612, "reward_before_std": 0.7680972199887037, "reward_change_max": 0.0006060376763343811, "reward_change_mean": -0.19465022161602974, "reward_change_min": -0.5314572900533676, "reward_change_std": 0.20605905167758465, "reward_std": 0.7083022873848677, "rewards/cosine_scaled_reward": -0.0987695250660181, "rewards/format_reward": 0.1875000074505806, "step": 334 }, { "advantage_max": 1.1048667840659618, "advantage_mean": 2.235174251241645e-08, "advantage_min": -0.5316229537129402, "advantage_std": 0.6343257799744606, "completion_length": 3044.750030517578, "epoch": 0.14396991673381682, "grad_norm": 0.317387193441391, "kl": 0.23883056640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.6696851061588994e-07, "loss": 0.0406, "reward": -0.3395836129784584, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3395836129784584, "reward_after_std": 0.634325809776783, "reward_before_mean": -0.18944216519594193, "reward_before_std": 0.6617873013019562, "reward_change_max": 0.002580210566520691, "reward_change_mean": -0.15014143660664558, "reward_change_min": -0.3786518294364214, "reward_change_std": 0.1525773424655199, "reward_std": 0.6343258321285248, "rewards/cosine_scaled_reward": -0.17805442307144403, "rewards/format_reward": 0.1666666679084301, "step": 335 }, { "advantage_max": 1.2700268849730492, "advantage_mean": 6.208815683805824e-10, "advantage_min": -0.5814338214695454, "advantage_std": 0.7086201570928097, "completion_length": 3437.0208740234375, "epoch": 0.1443996776792909, "grad_norm": 0.42034420371055603, "kl": 0.35107421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.641030065789562e-07, "loss": 0.0417, "reward": -0.3261487316340208, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3261487316340208, "reward_after_std": 0.7086201608181, "reward_before_mean": -0.18288051569834352, "reward_before_std": 0.7308860048651695, "reward_change_max": 0.0015492364764213562, "reward_change_mean": -0.14326823549345136, "reward_change_min": -0.3825642690062523, "reward_change_std": 0.1492543537169695, "reward_std": 0.7086201906204224, "rewards/cosine_scaled_reward": -0.16435692831873894, "rewards/format_reward": 0.14583333767950535, "step": 336 }, { "advantage_max": 1.6234952360391617, "advantage_mean": 1.8005569812906685e-08, "advantage_min": -0.6116728410124779, "advantage_std": 0.8681371882557869, "completion_length": 3439.5625610351562, "epoch": 0.14482943862476497, "grad_norm": 1.0478427410125732, "kl": 0.3363037109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.612465628992203e-07, "loss": 0.0638, "reward": -0.3082355409860611, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3082355409860611, "reward_after_std": 0.8681371808052063, "reward_before_mean": -0.1821025451645255, "reward_before_std": 0.8823285885155201, "reward_change_max": 0.0008411183953285217, "reward_change_mean": -0.12613298930227757, "reward_change_min": -0.360259223729372, "reward_change_std": 0.14401454385370016, "reward_std": 0.8681371957063675, "rewards/cosine_scaled_reward": -0.16396794421598315, "rewards/format_reward": 0.14583333767950535, "step": 337 }, { "advantage_max": 1.4676710031926632, "advantage_mean": -1.2417635253392234e-09, "advantage_min": -0.7588036432862282, "advantage_std": 0.8164703287184238, "completion_length": 3014.541717529297, "epoch": 0.14525919957023906, "grad_norm": 0.4125087559223175, "kl": 0.2408447265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.5839931879571725e-07, "loss": 0.0231, "reward": -0.08051511645317078, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08051511645317078, "reward_after_std": 0.816470343619585, "reward_before_mean": 0.1366025833413005, "reward_before_std": 0.8272438943386078, "reward_change_max": 0.0005482733249664307, "reward_change_mean": -0.21711770072579384, "reward_change_min": -0.4673200659453869, "reward_change_std": 0.189130206592381, "reward_std": 0.8164703510701656, "rewards/cosine_scaled_reward": -0.18169871205464005, "rewards/format_reward": 0.5000000093132257, "step": 338 }, { "advantage_max": 1.1786171831190586, "advantage_mean": 5.587935614226325e-09, "advantage_min": -0.5647024661302567, "advantage_std": 0.6768219210207462, "completion_length": 3355.291717529297, "epoch": 0.14568896051571315, "grad_norm": 0.33755725622177124, "kl": 0.324462890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.555614130391079e-07, "loss": 0.0285, "reward": -0.28566701896488667, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.28566701896488667, "reward_after_std": 0.6768219247460365, "reward_before_mean": -0.11962558422237635, "reward_before_std": 0.7088327780365944, "reward_change_max": 0.00016217678785324097, "reward_change_mean": -0.16604144219309092, "reward_change_min": -0.4165051244199276, "reward_change_std": 0.16324879508465528, "reward_std": 0.6768219619989395, "rewards/cosine_scaled_reward": -0.18481279164552689, "rewards/format_reward": 0.25000000186264515, "step": 339 }, { "advantage_max": 1.5522412955760956, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -1.2172414809465408, "advantage_std": 1.0155989415943623, "completion_length": 2962.9584350585938, "epoch": 0.1461187214611872, "grad_norm": 1.7811740636825562, "kl": 0.258544921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.5273298394491515e-07, "loss": 0.1304, "reward": 0.32437431439757347, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32437431439757347, "reward_after_std": 1.0155989415943623, "reward_before_mean": 0.6807701885700226, "reward_before_std": 1.1278506498783827, "reward_change_max": 0.0007163882255554199, "reward_change_mean": -0.356395878829062, "reward_change_min": -0.7661301642656326, "reward_change_std": 0.3370817294344306, "reward_std": 1.0155989900231361, "rewards/cosine_scaled_reward": 0.1008017435669899, "rewards/format_reward": 0.4791666828095913, "step": 340 }, { "advantage_max": 1.4978766441345215, "advantage_mean": 2.3593506148777976e-08, "advantage_min": -0.7607886791229248, "advantage_std": 0.8346401229500771, "completion_length": 2930.8750610351562, "epoch": 0.1465484824066613, "grad_norm": 0.7398524880409241, "kl": 0.29986572265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4991416936678276e-07, "loss": 0.0558, "reward": 0.012990274000912905, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.012990274000912905, "reward_after_std": 0.8346401173621416, "reward_before_mean": 0.26139856642112136, "reward_before_std": 0.8377609476447105, "reward_change_max": 0.0009343475103378296, "reward_change_mean": -0.24840826354920864, "reward_change_min": -0.5077629946172237, "reward_change_std": 0.20527341170236468, "reward_std": 0.8346401266753674, "rewards/cosine_scaled_reward": -0.08805072586983442, "rewards/format_reward": 0.43750000558793545, "step": 341 }, { "advantage_max": 0.7179121859371662, "advantage_mean": 1.8626452047421083e-08, "advantage_min": -0.42212627083063126, "advantage_std": 0.4260858818888664, "completion_length": 3430.6250610351562, "epoch": 0.14697824335213538, "grad_norm": 0.46214380860328674, "kl": 0.3692626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.471051066897562e-07, "loss": 0.0319, "reward": -0.5253956690430641, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5253956690430641, "reward_after_std": 0.4260858856141567, "reward_before_mean": -0.41314850747585297, "reward_before_std": 0.45367948710918427, "reward_change_max": 0.0005648583173751831, "reward_change_mean": -0.11224715691059828, "reward_change_min": -0.25735435634851456, "reward_change_std": 0.11029793787747622, "reward_std": 0.4260859005153179, "rewards/cosine_scaled_reward": -0.22740759141743183, "rewards/format_reward": 0.0416666679084301, "step": 342 }, { "advantage_max": 0.9076493047177792, "advantage_mean": 1.6142924996742636e-08, "advantage_min": -0.5641684904694557, "advantage_std": 0.5049604885280132, "completion_length": 3396.291717529297, "epoch": 0.14740800429760945, "grad_norm": 0.33302438259124756, "kl": 0.39404296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4430593282358777e-07, "loss": 0.0417, "reward": -0.2601945735514164, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2601945735514164, "reward_after_std": 0.5049604959785938, "reward_before_mean": -0.06683588307350874, "reward_before_std": 0.4911368750035763, "reward_change_max": 0.0002960190176963806, "reward_change_mean": -0.1933586746454239, "reward_change_min": -0.3342648409307003, "reward_change_std": 0.13726553786545992, "reward_std": 0.504960510879755, "rewards/cosine_scaled_reward": -0.1792512801475823, "rewards/format_reward": 0.2916666753590107, "step": 343 }, { "advantage_max": 0.7030718214809895, "advantage_mean": 4.5324366704235786e-08, "advantage_min": -0.5397257395088673, "advantage_std": 0.4334312826395035, "completion_length": 3231.6458587646484, "epoch": 0.14783776524308354, "grad_norm": 0.42617008090019226, "kl": 0.36602783203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.4151678419606233e-07, "loss": 0.0329, "reward": -0.31361502408981323, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.31361502408981323, "reward_after_std": 0.43343128357082605, "reward_before_mean": -0.12356434762477875, "reward_before_std": 0.45542024821043015, "reward_change_max": 0.0009629204869270325, "reward_change_mean": -0.19005064698285423, "reward_change_min": -0.3282003402709961, "reward_change_std": 0.1461672187433578, "reward_std": 0.4334313040599227, "rewards/cosine_scaled_reward": -0.1555321691557765, "rewards/format_reward": 0.1875, "step": 344 }, { "advantage_max": 0.9652685113251209, "advantage_mean": -5.587935447692871e-09, "advantage_min": -0.4865330271422863, "advantage_std": 0.5394256673753262, "completion_length": 3438.0416870117188, "epoch": 0.14826752618855762, "grad_norm": 0.36651167273521423, "kl": 0.3768310546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.387377967463493e-07, "loss": 0.0258, "reward": -0.4063768535852432, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4063768535852432, "reward_after_std": 0.539425652474165, "reward_before_mean": -0.268603740260005, "reward_before_std": 0.5458202846348286, "reward_change_max": 0.0008480250835418701, "reward_change_mean": -0.13777313893660903, "reward_change_min": -0.28674106672406197, "reward_change_std": 0.12192887486889958, "reward_std": 0.5394256673753262, "rewards/cosine_scaled_reward": -0.2593018701300025, "rewards/format_reward": 0.2500000074505806, "step": 345 }, { "advantage_max": 0.9139313846826553, "advantage_mean": 1.7384688466570708e-08, "advantage_min": -0.6771651431918144, "advantage_std": 0.5761717595160007, "completion_length": 3191.0625610351562, "epoch": 0.14869728713403169, "grad_norm": 0.4109465181827545, "kl": 0.4078369140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.359691059183761e-07, "loss": 0.0364, "reward": -0.08440115302801132, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08440115302801132, "reward_after_std": 0.5761717595160007, "reward_before_mean": 0.17032834887504578, "reward_before_std": 0.6083875820040703, "reward_change_max": 0.0, "reward_change_mean": -0.2547294972464442, "reward_change_min": -0.4637962970882654, "reward_change_std": 0.19586024433374405, "reward_std": 0.5761717818677425, "rewards/cosine_scaled_reward": -0.03983583673834801, "rewards/format_reward": 0.25000000931322575, "step": 346 }, { "advantage_max": 0.8628620095551014, "advantage_mean": 1.8005570145973593e-08, "advantage_min": -0.43197477981448174, "advantage_std": 0.4868442080914974, "completion_length": 3535.0833740234375, "epoch": 0.14912704807950578, "grad_norm": 0.45861735939979553, "kl": 0.442138671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.3321084665422803e-07, "loss": 0.0279, "reward": -0.42930918000638485, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.42930918000638485, "reward_after_std": 0.4868442118167877, "reward_before_mean": -0.2917594891041517, "reward_before_std": 0.49203010462224483, "reward_change_max": 0.0004329681396484375, "reward_change_mean": -0.1375496843829751, "reward_change_min": -0.3300237450748682, "reward_change_std": 0.12435463396832347, "reward_std": 0.486844215542078, "rewards/cosine_scaled_reward": -0.19796307710930705, "rewards/format_reward": 0.10416666977107525, "step": 347 }, { "advantage_max": 1.1666476875543594, "advantage_mean": 3.725290464995368e-09, "advantage_min": -0.6327951774001122, "advantage_std": 0.6702994406223297, "completion_length": 3131.125030517578, "epoch": 0.14955680902497986, "grad_norm": 0.6294884085655212, "kl": 0.318115234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.3046315338757026e-07, "loss": 0.0689, "reward": -0.03958517068531364, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03958517068531364, "reward_after_std": 0.6702994368970394, "reward_before_mean": 0.21501663699746132, "reward_before_std": 0.6668683513998985, "reward_change_max": 0.0002495124936103821, "reward_change_mean": -0.254601813852787, "reward_change_min": -0.5318710785359144, "reward_change_std": 0.20335440803319216, "reward_std": 0.6702994666993618, "rewards/cosine_scaled_reward": -0.05915834230836481, "rewards/format_reward": 0.33333334513008595, "step": 348 }, { "advantage_max": 1.042497094720602, "advantage_mean": 1.738468896617107e-08, "advantage_min": -0.739569865167141, "advantage_std": 0.6344134546816349, "completion_length": 3203.791717529297, "epoch": 0.14998656997045393, "grad_norm": 0.41810399293899536, "kl": 0.385986328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.2772616003709616e-07, "loss": 0.065, "reward": -0.0451164785772562, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0451164785772562, "reward_after_std": 0.6344134621322155, "reward_before_mean": 0.21574708353728056, "reward_before_std": 0.6618472505360842, "reward_change_max": 0.0012898370623588562, "reward_change_mean": -0.26086354767903686, "reward_change_min": -0.473096389323473, "reward_change_std": 0.2028391407802701, "reward_std": 0.6344134658575058, "rewards/cosine_scaled_reward": -0.07962645590305328, "rewards/format_reward": 0.37500000558793545, "step": 349 }, { "advantage_max": 0.8934106975793839, "advantage_mean": -2.4835269951672956e-09, "advantage_min": -0.6546983867883682, "advantage_std": 0.5871218405663967, "completion_length": 3367.750030517578, "epoch": 0.15041633091592801, "grad_norm": 0.4494400918483734, "kl": 0.343505859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.250000000000001e-07, "loss": 0.0592, "reward": -0.24265778064727783, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.24265778064727783, "reward_after_std": 0.5871218591928482, "reward_before_mean": -0.041677091270685196, "reward_before_std": 0.6517411395907402, "reward_change_max": 0.0003565177321434021, "reward_change_mean": -0.20098068937659264, "reward_change_min": -0.4406622815877199, "reward_change_std": 0.19253108091652393, "reward_std": 0.5871218666434288, "rewards/cosine_scaled_reward": -0.11458855122327805, "rewards/format_reward": 0.18750000558793545, "step": 350 }, { "advantage_max": 0.6652895249426365, "advantage_mean": 2.1109979042588378e-08, "advantage_min": -0.3828222528100014, "advantage_std": 0.379185164347291, "completion_length": 3445.875030517578, "epoch": 0.1508460918614021, "grad_norm": 0.4565449655056, "kl": 0.38671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.222848061454764e-07, "loss": 0.0392, "reward": -0.4533465914428234, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4533465914428234, "reward_after_std": 0.37918516620993614, "reward_before_mean": -0.3116770936176181, "reward_before_std": 0.3782077766954899, "reward_change_max": 0.0011613592505455017, "reward_change_mean": -0.1416694913059473, "reward_change_min": -0.26786714047193527, "reward_change_std": 0.10319932200945914, "reward_std": 0.3791851755231619, "rewards/cosine_scaled_reward": -0.19750521145761013, "rewards/format_reward": 0.0833333358168602, "step": 351 }, { "advantage_max": 0.9029742926359177, "advantage_mean": -6.208822345143972e-10, "advantage_min": -0.6764661595225334, "advantage_std": 0.5552022065967321, "completion_length": 3248.875030517578, "epoch": 0.15127585280687617, "grad_norm": 0.5519656538963318, "kl": 0.3743896484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.195807108082429e-07, "loss": 0.0697, "reward": -0.23321722075343132, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23321722075343132, "reward_after_std": 0.5552022028714418, "reward_before_mean": -0.029210317879915237, "reward_before_std": 0.5901325158774853, "reward_change_max": 0.0007890388369560242, "reward_change_mean": -0.20400691265240312, "reward_change_min": -0.4115216601639986, "reward_change_std": 0.17074357951059937, "reward_std": 0.5552022289484739, "rewards/cosine_scaled_reward": -0.09793849987909198, "rewards/format_reward": 0.1666666716337204, "step": 352 }, { "advantage_max": 1.3923504278063774, "advantage_mean": -1.6142924108564216e-08, "advantage_min": -0.6513471007347107, "advantage_std": 0.793393399566412, "completion_length": 2987.5625610351562, "epoch": 0.15170561375235025, "grad_norm": 0.42798274755477905, "kl": 0.318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.168878457820915e-07, "loss": 0.0229, "reward": 0.059631578624248505, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.059631578624248505, "reward_after_std": 0.7933933809399605, "reward_before_mean": 0.33282197889639065, "reward_before_std": 0.786486130207777, "reward_change_max": 0.0013612061738967896, "reward_change_mean": -0.2731904350221157, "reward_change_min": -0.5540026407688856, "reward_change_std": 0.21697786636650562, "reward_std": 0.793393399566412, "rewards/cosine_scaled_reward": -0.021089009009301662, "rewards/format_reward": 0.37500000186264515, "step": 353 }, { "advantage_max": 1.0524289980530739, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.6576583422720432, "advantage_std": 0.6650989204645157, "completion_length": 3226.7083740234375, "epoch": 0.15213537469782434, "grad_norm": 0.6022195816040039, "kl": 0.3345947265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.142063423134644e-07, "loss": 0.0591, "reward": -0.09448637929745018, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09448637929745018, "reward_after_std": 0.6650989279150963, "reward_before_mean": 0.1486101746559143, "reward_before_std": 0.7158238813281059, "reward_change_max": 0.0018325969576835632, "reward_change_mean": -0.24309653975069523, "reward_change_min": -0.5281756948679686, "reward_change_std": 0.2197531731799245, "reward_std": 0.6650989428162575, "rewards/cosine_scaled_reward": -0.1340282540768385, "rewards/format_reward": 0.4166666753590107, "step": 354 }, { "advantage_max": 0.9374836757779121, "advantage_mean": 1.676380662063437e-08, "advantage_min": -0.6024764440953732, "advantage_std": 0.5678879953920841, "completion_length": 2907.5833435058594, "epoch": 0.1525651356432984, "grad_norm": 0.4607475996017456, "kl": 0.3255615234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.115363310950578e-07, "loss": 0.0218, "reward": -0.17420263343956321, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17420263343956321, "reward_after_std": 0.5678880102932453, "reward_before_mean": 0.048328730277717113, "reward_before_std": 0.5894381925463676, "reward_change_max": 0.0019035637378692627, "reward_change_mean": -0.2225313587114215, "reward_change_min": -0.42627808824181557, "reward_change_std": 0.17642337922006845, "reward_std": 0.5678880140185356, "rewards/cosine_scaled_reward": -0.15291896648705006, "rewards/format_reward": 0.3541666679084301, "step": 355 }, { "advantage_max": 0.9826746061444283, "advantage_mean": 1.8005570201484744e-08, "advantage_min": -0.5339677557349205, "advantage_std": 0.5786398090422153, "completion_length": 3067.7083587646484, "epoch": 0.1529948965887725, "grad_norm": 0.42515474557876587, "kl": 0.38238525390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0887794225945143e-07, "loss": 0.0307, "reward": -0.27631525695323944, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.27631525695323944, "reward_after_std": 0.578639805316925, "reward_before_mean": -0.09397636726498604, "reward_before_std": 0.6004910208284855, "reward_change_max": 0.0009521842002868652, "reward_change_mean": -0.18233888130635023, "reward_change_min": -0.36994978226721287, "reward_change_std": 0.153377465903759, "reward_std": 0.5786398462951183, "rewards/cosine_scaled_reward": -0.15115485712885857, "rewards/format_reward": 0.20833333395421505, "step": 356 }, { "advantage_max": 1.1516551412642002, "advantage_mean": 8.692343955729598e-09, "advantage_min": -0.5390980392694473, "advantage_std": 0.6242373548448086, "completion_length": 3292.312530517578, "epoch": 0.15342465753424658, "grad_norm": 0.47047317028045654, "kl": 0.413818359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.062313053727671e-07, "loss": 0.0507, "reward": -0.3354032195638865, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3354032195638865, "reward_after_std": 0.6242373660206795, "reward_before_mean": -0.18608021596446633, "reward_before_std": 0.6216853391379118, "reward_change_max": 0.00011768192052841187, "reward_change_mean": -0.149323008954525, "reward_change_min": -0.32390549033880234, "reward_change_std": 0.12388110812753439, "reward_std": 0.6242373809218407, "rewards/cosine_scaled_reward": -0.17637343879323453, "rewards/format_reward": 0.1666666716337204, "step": 357 }, { "advantage_max": 1.0283693373203278, "advantage_mean": 7.450580596923828e-09, "advantage_min": -0.6539464369416237, "advantage_std": 0.6141700930893421, "completion_length": 3133.6459350585938, "epoch": 0.15385441847972064, "grad_norm": 0.3991037607192993, "kl": 0.3453369140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0359654942835247e-07, "loss": 0.0478, "reward": -0.24329330446198583, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24329330446198583, "reward_after_std": 0.6141700819134712, "reward_before_mean": -0.05385550274513662, "reward_before_std": 0.6464029587805271, "reward_change_max": 0.0013827979564666748, "reward_change_mean": -0.18943780940026045, "reward_change_min": -0.4270742852240801, "reward_change_std": 0.17507890798151493, "reward_std": 0.6141700856387615, "rewards/cosine_scaled_reward": -0.2040110882371664, "rewards/format_reward": 0.35416667722165585, "step": 358 }, { "advantage_max": 1.3082921542227268, "advantage_mean": 1.3659397946064189e-08, "advantage_min": -0.6339577846229076, "advantage_std": 0.7471686601638794, "completion_length": 3191.479202270508, "epoch": 0.15428417942519473, "grad_norm": 0.36053282022476196, "kl": 0.3935546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 3.0097380284049523e-07, "loss": 0.0427, "reward": -0.32041662000119686, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.32041662000119686, "reward_after_std": 0.7471686601638794, "reward_before_mean": -0.17606553621590137, "reward_before_std": 0.7912971898913383, "reward_change_max": 0.0005232319235801697, "reward_change_mean": -0.14435108727775514, "reward_change_min": -0.4181915335357189, "reward_change_std": 0.1638546665199101, "reward_std": 0.7471686825156212, "rewards/cosine_scaled_reward": -0.1505327643826604, "rewards/format_reward": 0.1250000037252903, "step": 359 }, { "advantage_max": 0.6500449478626251, "advantage_mean": 6.208818459363386e-10, "advantage_min": -0.3576705567538738, "advantage_std": 0.3774940147995949, "completion_length": 3118.875015258789, "epoch": 0.15471394037066882, "grad_norm": 0.6774700284004211, "kl": 0.3837890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9836319343816397e-07, "loss": 0.0127, "reward": -0.45817547757178545, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.45817547757178545, "reward_after_std": 0.3774940110743046, "reward_before_mean": -0.3170413710176945, "reward_before_std": 0.3816569484770298, "reward_change_max": 0.0004105493426322937, "reward_change_mean": -0.14113411121070385, "reward_change_min": -0.28253750316798687, "reward_change_std": 0.11212658975273371, "reward_std": 0.3774940110743046, "rewards/cosine_scaled_reward": -0.23143735527992249, "rewards/format_reward": 0.14583333395421505, "step": 360 }, { "advantage_max": 1.7807577922940254, "advantage_mean": -8.692344732885715e-09, "advantage_min": -0.7218186259269714, "advantage_std": 0.9359456598758698, "completion_length": 2621.916732788086, "epoch": 0.15514370131614288, "grad_norm": 0.9420484900474548, "kl": 0.28668212890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9576484845877793e-07, "loss": 0.0943, "reward": 0.3694741961080581, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3694741961080581, "reward_after_std": 0.9359456673264503, "reward_before_mean": 0.7248419672250748, "reward_before_std": 0.8512788899242878, "reward_change_max": 0.001120045781135559, "reward_change_mean": -0.355367767624557, "reward_change_min": -0.6061654984951019, "reward_change_std": 0.23884753789752722, "reward_std": 0.935945674777031, "rewards/cosine_scaled_reward": 0.08117098233196884, "rewards/format_reward": 0.5625000093132257, "step": 361 }, { "advantage_max": 1.203673604875803, "advantage_mean": 3.1044087300813317e-09, "advantage_min": -0.6685433685779572, "advantage_std": 0.7163474448025227, "completion_length": 3009.812545776367, "epoch": 0.15557346226161697, "grad_norm": 0.5713723301887512, "kl": 0.3385009765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.931788945420058e-07, "loss": 0.0127, "reward": -0.13031230121850967, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13031230121850967, "reward_after_std": 0.7163474261760712, "reward_before_mean": 0.08839916810393333, "reward_before_std": 0.7513237930834293, "reward_change_max": 0.0005206987261772156, "reward_change_mean": -0.2187114479020238, "reward_change_min": -0.49749576300382614, "reward_change_std": 0.19939822610467672, "reward_std": 0.7163474299013615, "rewards/cosine_scaled_reward": -0.10163376480340958, "rewards/format_reward": 0.2916666716337204, "step": 362 }, { "advantage_max": 0.8532003164291382, "advantage_mean": 2.6077033254967574e-08, "advantage_min": -0.45291025936603546, "advantage_std": 0.5015895813703537, "completion_length": 3278.5833587646484, "epoch": 0.15600322320709106, "grad_norm": 0.8850196599960327, "kl": 0.3995361328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.9060545772359305e-07, "loss": -0.0043, "reward": -0.22814308106899261, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.22814308106899261, "reward_after_std": 0.5015895674005151, "reward_before_mean": -0.018825657665729523, "reward_before_std": 0.5012480625882745, "reward_change_max": 0.001236841082572937, "reward_change_mean": -0.209317404194735, "reward_change_min": -0.4122386146336794, "reward_change_std": 0.16402861417736858, "reward_std": 0.5015895860269666, "rewards/cosine_scaled_reward": -0.11357950931414962, "rewards/format_reward": 0.20833333395421505, "step": 363 }, { "advantage_max": 0.6200065053999424, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.37952279672026634, "advantage_std": 0.37497300654649734, "completion_length": 3459.0208740234375, "epoch": 0.15643298415256512, "grad_norm": 0.6001635789871216, "kl": 0.3914794921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8804466342921987e-07, "loss": 0.0203, "reward": -0.5288160406053066, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5288160406053066, "reward_after_std": 0.37497300282120705, "reward_before_mean": -0.41059558209963143, "reward_before_std": 0.39630461297929287, "reward_change_max": 0.0016367584466934204, "reward_change_mean": -0.11822046525776386, "reward_change_min": -0.26810767129063606, "reward_change_std": 0.1110933618620038, "reward_std": 0.37497301399707794, "rewards/cosine_scaled_reward": -0.2365477867424488, "rewards/format_reward": 0.0625, "step": 364 }, { "advantage_max": 1.5177637785673141, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.8321307301521301, "advantage_std": 0.8813756592571735, "completion_length": 3044.7083740234375, "epoch": 0.1568627450980392, "grad_norm": 0.4625545144081116, "kl": 0.300537109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.854966364683872e-07, "loss": 0.0359, "reward": 0.0251250802539289, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0251250802539289, "reward_after_std": 0.8813756518065929, "reward_before_mean": 0.2766309681646817, "reward_before_std": 0.922408040612936, "reward_change_max": 0.0008354038000106812, "reward_change_mean": -0.2515058871358633, "reward_change_min": -0.5757402963936329, "reward_change_std": 0.2260899180546403, "reward_std": 0.8813756741583347, "rewards/cosine_scaled_reward": -0.0908511895686388, "rewards/format_reward": 0.45833333767950535, "step": 365 }, { "advantage_max": 1.3559943288564682, "advantage_mean": 2.2972624136308184e-08, "advantage_min": -0.7671582400798798, "advantage_std": 0.782998837530613, "completion_length": 3002.166717529297, "epoch": 0.1572925060435133, "grad_norm": 0.6409279704093933, "kl": 0.2744140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.829615010283344e-07, "loss": 0.0551, "reward": -0.12488361168652773, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12488361168652773, "reward_after_std": 0.7829988300800323, "reward_before_mean": 0.08467844570986927, "reward_before_std": 0.8160797134041786, "reward_change_max": 0.0009141638875007629, "reward_change_mean": -0.20956205297261477, "reward_change_min": -0.4426113925874233, "reward_change_std": 0.1889957133680582, "reward_std": 0.7829988487064838, "rewards/cosine_scaled_reward": -0.14516077749431133, "rewards/format_reward": 0.3750000074505806, "step": 366 }, { "advantage_max": 1.2142094373703003, "advantage_mean": 1.800557003495129e-08, "advantage_min": -0.7967988327145576, "advantage_std": 0.7586616910994053, "completion_length": 3091.5208740234375, "epoch": 0.15772226698898736, "grad_norm": 0.7676061391830444, "kl": 0.3155517578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.8043938066798645e-07, "loss": 0.0606, "reward": -0.007531823590397835, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.007531823590397835, "reward_after_std": 0.7586616910994053, "reward_before_mean": 0.25509078428149223, "reward_before_std": 0.8134810142219067, "reward_change_max": 0.0004536062479019165, "reward_change_mean": -0.2626225920394063, "reward_change_min": -0.5931111760437489, "reward_change_std": 0.23841848969459534, "reward_std": 0.7586617022752762, "rewards/cosine_scaled_reward": -0.0807879469357431, "rewards/format_reward": 0.4166666753590107, "step": 367 }, { "advantage_max": 0.9865973927080631, "advantage_mean": -2.4835269951672956e-09, "advantage_min": -0.7806943729519844, "advantage_std": 0.6175127159804106, "completion_length": 2927.395896911621, "epoch": 0.15815202793446145, "grad_norm": 0.2891014516353607, "kl": 0.2928466796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7793039831193133e-07, "loss": 0.022, "reward": 0.1399083212018013, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1399083212018013, "reward_after_std": 0.6175127159804106, "reward_before_mean": 0.47076944541186094, "reward_before_std": 0.6360297724604607, "reward_change_max": 0.0011920034885406494, "reward_change_mean": -0.33086110558360815, "reward_change_min": -0.5970012731850147, "reward_change_std": 0.2383087445050478, "reward_std": 0.6175127476453781, "rewards/cosine_scaled_reward": 0.06871803291141987, "rewards/format_reward": 0.33333334513008595, "step": 368 }, { "advantage_max": 1.597794346511364, "advantage_mean": -8.071462276326713e-09, "advantage_min": -0.7856799587607384, "advantage_std": 0.8948786146938801, "completion_length": 3424.1459045410156, "epoch": 0.15858178887993554, "grad_norm": 0.4553248882293701, "kl": 0.3157958984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7543467624442956e-07, "loss": 0.0481, "reward": -0.12305483594536781, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12305483594536781, "reward_after_std": 0.8948786184191704, "reward_before_mean": 0.06922929733991623, "reward_before_std": 0.9247158244252205, "reward_change_max": 0.001806609332561493, "reward_change_mean": -0.19228412443771958, "reward_change_min": -0.4231702350080013, "reward_change_std": 0.18406124226748943, "reward_std": 0.894878663122654, "rewards/cosine_scaled_reward": -0.06955202564131469, "rewards/format_reward": 0.20833333767950535, "step": 369 }, { "advantage_max": 0.9189092107117176, "advantage_mean": -1.8626452602532595e-09, "advantage_min": -0.5557832270860672, "advantage_std": 0.5555372778326273, "completion_length": 3299.479217529297, "epoch": 0.1590115498254096, "grad_norm": 0.47279489040374756, "kl": 0.3394775390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.729523361034538e-07, "loss": 0.0544, "reward": -0.29450376331806183, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.29450376331806183, "reward_after_std": 0.5555372759699821, "reward_before_mean": -0.11311948858201504, "reward_before_std": 0.5889889039099216, "reward_change_max": 0.0013103187084197998, "reward_change_mean": -0.18138427240774035, "reward_change_min": -0.43477962911129, "reward_change_std": 0.16560602746903896, "reward_std": 0.5555372778326273, "rewards/cosine_scaled_reward": -0.1607264094054699, "rewards/format_reward": 0.2083333432674408, "step": 370 }, { "advantage_max": 1.2061527147889137, "advantage_mean": 1.4901161526914564e-08, "advantage_min": -0.6430985778570175, "advantage_std": 0.6763911582529545, "completion_length": 3375.604248046875, "epoch": 0.1594413107708837, "grad_norm": 0.36862221360206604, "kl": 0.341552734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.7048349887476037e-07, "loss": 0.0518, "reward": -0.30412859097123146, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.30412859097123146, "reward_after_std": 0.6763911619782448, "reward_before_mean": -0.14841757342219353, "reward_before_std": 0.6949256807565689, "reward_change_max": 0.00106830894947052, "reward_change_mean": -0.15571101009845734, "reward_change_min": -0.3709173910319805, "reward_change_std": 0.1518853446468711, "reward_std": 0.6763911619782448, "rewards/cosine_scaled_reward": -0.16795879136770964, "rewards/format_reward": 0.1875000037252903, "step": 371 }, { "advantage_max": 1.2328257113695145, "advantage_mean": -1.4280280846712401e-08, "advantage_min": -0.8807566314935684, "advantage_std": 0.7773857545107603, "completion_length": 3049.604217529297, "epoch": 0.15987107171635778, "grad_norm": 0.4701407849788666, "kl": 0.3099365234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6802828488599294e-07, "loss": 0.0467, "reward": 0.06277298461645842, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06277298461645842, "reward_after_std": 0.77738575078547, "reward_before_mean": 0.348312322050333, "reward_before_std": 0.8400245327502489, "reward_change_max": 1.607835292816162e-05, "reward_change_mean": -0.2855393411591649, "reward_change_min": -0.5820327438414097, "reward_change_std": 0.24761223187670112, "reward_std": 0.7773857731372118, "rewards/cosine_scaled_reward": -0.03417719097342342, "rewards/format_reward": 0.416666679084301, "step": 372 }, { "advantage_max": 0.8513304516673088, "advantage_mean": -3.725290464995368e-09, "advantage_min": -0.7837282046675682, "advantage_std": 0.5991636887192726, "completion_length": 3192.041702270508, "epoch": 0.16030083266183184, "grad_norm": 0.3445509672164917, "kl": 0.278076171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.655868138008171e-07, "loss": 0.0268, "reward": -0.13842065632343292, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13842065632343292, "reward_after_std": 0.5991636887192726, "reward_before_mean": 0.10207586362957954, "reward_before_std": 0.6822909116744995, "reward_change_max": 0.0008417144417762756, "reward_change_mean": -0.24049653392285109, "reward_change_min": -0.49204727075994015, "reward_change_std": 0.21670057531446218, "reward_std": 0.5991636961698532, "rewards/cosine_scaled_reward": -0.05312872491776943, "rewards/format_reward": 0.2083333432674408, "step": 373 }, { "advantage_max": 1.3242492005228996, "advantage_mean": -9.313225801665936e-09, "advantage_min": -0.7362012714147568, "advantage_std": 0.8000509403645992, "completion_length": 3357.9584045410156, "epoch": 0.16073059360730593, "grad_norm": 1.1216022968292236, "kl": 0.343017578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.631592046130896e-07, "loss": 0.0857, "reward": -0.15046915039420128, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15046915039420128, "reward_after_std": 0.8000509217381477, "reward_before_mean": 0.052945492789149284, "reward_before_std": 0.8635219391435385, "reward_change_max": 2.4460256099700928e-05, "reward_change_mean": -0.20341465948149562, "reward_change_min": -0.49167686700820923, "reward_change_std": 0.20924733043648303, "reward_std": 0.8000509403645992, "rewards/cosine_scaled_reward": -0.12977725639939308, "rewards/format_reward": 0.31250000558793545, "step": 374 }, { "advantage_max": 1.4105933234095573, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -0.6906870678067207, "advantage_std": 0.7830157801508904, "completion_length": 2569.9167098999023, "epoch": 0.16116035455278002, "grad_norm": 0.41892313957214355, "kl": 0.221923828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.6074557564105724e-07, "loss": 0.0315, "reward": 0.19060913566499949, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19060913566499949, "reward_after_std": 0.7830157801508904, "reward_before_mean": 0.5079487208276987, "reward_before_std": 0.7434209361672401, "reward_change_max": 0.000989288091659546, "reward_change_mean": -0.3173395823687315, "reward_change_min": -0.6324963476508856, "reward_change_std": 0.23367712646722794, "reward_std": 0.7830158099532127, "rewards/cosine_scaled_reward": 0.024807682260870934, "rewards/format_reward": 0.45833334140479565, "step": 375 }, { "advantage_max": 1.4357290789484978, "advantage_mean": 1.1796752963366686e-08, "advantage_min": -0.6067278571426868, "advantage_std": 0.7571667619049549, "completion_length": 3461.625030517578, "epoch": 0.16159011549825408, "grad_norm": 0.4707706868648529, "kl": 0.392822265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.583460445215911e-07, "loss": 0.0566, "reward": -0.24857788905501366, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24857788905501366, "reward_after_std": 0.7571667656302452, "reward_before_mean": -0.0881968978792429, "reward_before_std": 0.7407194040715694, "reward_change_max": 0.0003581419587135315, "reward_change_mean": -0.16038098884746432, "reward_change_min": -0.3215008042752743, "reward_change_std": 0.13000522693619132, "reward_std": 0.7571667730808258, "rewards/cosine_scaled_reward": -0.14826511964201927, "rewards/format_reward": 0.2083333358168602, "step": 376 }, { "advantage_max": 1.518710795789957, "advantage_mean": 1.8626452047421083e-08, "advantage_min": -0.6844476610422134, "advantage_std": 0.8172686100006104, "completion_length": 3310.0625610351562, "epoch": 0.16201987644372817, "grad_norm": 0.5886012315750122, "kl": 0.360595703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.5596072820445254e-07, "loss": 0.0583, "reward": -0.22981440764851868, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22981440764851868, "reward_after_std": 0.8172686100006104, "reward_before_mean": -0.070173523388803, "reward_before_std": 0.8218402788043022, "reward_change_max": 0.0018361285328865051, "reward_change_mean": -0.15964087517932057, "reward_change_min": -0.35415197536349297, "reward_change_std": 0.1495197582989931, "reward_std": 0.8172686398029327, "rewards/cosine_scaled_reward": -0.18092009611427784, "rewards/format_reward": 0.29166667349636555, "step": 377 }, { "advantage_max": 1.0424085967242718, "advantage_mean": 1.924733339375706e-08, "advantage_min": -0.68810173869133, "advantage_std": 0.6244521290063858, "completion_length": 2909.041778564453, "epoch": 0.16244963738920226, "grad_norm": 0.8702689409255981, "kl": 0.322509765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.5358974294659373e-07, "loss": -0.0021, "reward": -0.20458247605711222, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.20458247605711222, "reward_after_std": 0.6244521401822567, "reward_before_mean": -0.0014384333044290543, "reward_before_std": 0.6580058261752129, "reward_change_max": 0.0, "reward_change_mean": -0.2031440418213606, "reward_change_min": -0.463660828769207, "reward_change_std": 0.17941322969272733, "reward_std": 0.6244521625339985, "rewards/cosine_scaled_reward": -0.1882192064076662, "rewards/format_reward": 0.37500000931322575, "step": 378 }, { "advantage_max": 1.506170116364956, "advantage_mean": 5.587935669737476e-09, "advantage_min": -0.8330711424350739, "advantage_std": 0.8799382485449314, "completion_length": 3186.166717529297, "epoch": 0.16287939833467632, "grad_norm": 0.8381545543670654, "kl": 0.339111328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.512332043064913e-07, "loss": 0.0547, "reward": -0.019568707328289747, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.019568707328289747, "reward_after_std": 0.8799382485449314, "reward_before_mean": 0.21720337169244885, "reward_before_std": 0.9253427572548389, "reward_change_max": 0.0020304247736930847, "reward_change_mean": -0.23677205992862582, "reward_change_min": -0.5809053536504507, "reward_change_std": 0.2297684159129858, "reward_std": 0.8799382522702217, "rewards/cosine_scaled_reward": -0.08931497763842344, "rewards/format_reward": 0.3958333469927311, "step": 379 }, { "advantage_max": 1.2497223988175392, "advantage_mean": -6.2088170160734535e-09, "advantage_min": -0.7278815880417824, "advantage_std": 0.7585644647479057, "completion_length": 2968.312530517578, "epoch": 0.1633091592801504, "grad_norm": 0.8016571402549744, "kl": 0.3392333984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.488912271385139e-07, "loss": 0.0641, "reward": -0.09182755276560783, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09182755276560783, "reward_after_std": 0.7585644647479057, "reward_before_mean": 0.13689248263835907, "reward_before_std": 0.8068147860467434, "reward_change_max": 0.000734470784664154, "reward_change_mean": -0.22872004471719265, "reward_change_min": -0.5504883974790573, "reward_change_std": 0.2177856587804854, "reward_std": 0.7585644870996475, "rewards/cosine_scaled_reward": -0.11905376426875591, "rewards/format_reward": 0.37500000558793545, "step": 380 }, { "advantage_max": 0.7846017330884933, "advantage_mean": 2.359350592673337e-08, "advantage_min": -0.4272094964981079, "advantage_std": 0.45592255890369415, "completion_length": 3433.3958435058594, "epoch": 0.1637389202256245, "grad_norm": 0.3217938244342804, "kl": 0.407958984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.465639255873246e-07, "loss": 0.0463, "reward": -0.5629216097295284, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5629216097295284, "reward_after_std": 0.45592256262898445, "reward_before_mean": -0.4685330092906952, "reward_before_std": 0.4835641272366047, "reward_change_max": 0.0021490901708602905, "reward_change_mean": -0.09438859485089779, "reward_change_min": -0.2520258463919163, "reward_change_std": 0.10614891909062862, "reward_std": 0.45592257753014565, "rewards/cosine_scaled_reward": -0.2655165046453476, "rewards/format_reward": 0.06250000186264515, "step": 381 }, { "advantage_max": 0.6658443249762058, "advantage_mean": 7.450580929990736e-09, "advantage_min": -0.4371752478182316, "advantage_std": 0.3922428097575903, "completion_length": 3266.5416870117188, "epoch": 0.16416868117109856, "grad_norm": 0.8975198268890381, "kl": 0.4254150390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4425141308231765e-07, "loss": 0.01, "reward": -0.37477924302220345, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.37477924302220345, "reward_after_std": 0.39224281162023544, "reward_before_mean": -0.20502624567598104, "reward_before_std": 0.39399662613868713, "reward_change_max": 0.001018911600112915, "reward_change_mean": -0.1697530006058514, "reward_change_min": -0.3196086287498474, "reward_change_std": 0.12551445607095957, "reward_std": 0.3922428283840418, "rewards/cosine_scaled_reward": -0.17542979680001736, "rewards/format_reward": 0.14583333395421505, "step": 382 }, { "advantage_max": 0.7521005980670452, "advantage_mean": 2.1730860833013566e-08, "advantage_min": -0.42990664206445217, "advantage_std": 0.42815791442990303, "completion_length": 3362.8333740234375, "epoch": 0.16459844211657265, "grad_norm": 0.5806494355201721, "kl": 0.4093017578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.4195380233209006e-07, "loss": 0.0292, "reward": -0.4113504383713007, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4113504383713007, "reward_after_std": 0.42815791815519333, "reward_before_mean": -0.2601887481287122, "reward_before_std": 0.4226635470986366, "reward_change_max": 8.226931095123291e-05, "reward_change_mean": -0.1511616762727499, "reward_change_min": -0.2709957640618086, "reward_change_std": 0.11551726702600718, "reward_std": 0.4281579293310642, "rewards/cosine_scaled_reward": -0.22384438663721085, "rewards/format_reward": 0.18750000558793545, "step": 383 }, { "advantage_max": 1.2567856460809708, "advantage_mean": -8.692344344307656e-09, "advantage_min": -0.6382150501012802, "advantage_std": 0.7298181466758251, "completion_length": 3163.8958740234375, "epoch": 0.16502820306204674, "grad_norm": 0.6464402675628662, "kl": 0.36236572265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3967120531894857e-07, "loss": 0.0088, "reward": -0.2007374595850706, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2007374595850706, "reward_after_std": 0.7298181429505348, "reward_before_mean": -0.01088327681645751, "reward_before_std": 0.7643853314220905, "reward_change_max": 0.0001830458641052246, "reward_change_mean": -0.18985420325770974, "reward_change_min": -0.4629392884671688, "reward_change_std": 0.18239823263138533, "reward_std": 0.729818157851696, "rewards/cosine_scaled_reward": -0.14085829723626375, "rewards/format_reward": 0.27083333767950535, "step": 384 }, { "advantage_max": 1.009941577911377, "advantage_mean": -7.140139812733537e-09, "advantage_min": -0.5099415555596352, "advantage_std": 0.5658974517136812, "completion_length": 3098.5208740234375, "epoch": 0.16545796400752083, "grad_norm": 0.44427549839019775, "kl": 0.321533203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.374037332934512e-07, "loss": 0.0171, "reward": -0.13919230923056602, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13919230923056602, "reward_after_std": 0.56589743681252, "reward_before_mean": 0.08963096211664379, "reward_before_std": 0.5477831847965717, "reward_change_max": 0.0009142383933067322, "reward_change_mean": -0.22882326692342758, "reward_change_min": -0.4367269016802311, "reward_change_std": 0.16989680007100105, "reward_std": 0.5658974535763264, "rewards/cosine_scaled_reward": -0.06976786314044148, "rewards/format_reward": 0.22916666977107525, "step": 385 }, { "advantage_max": 1.2086443230509758, "advantage_mean": -1.3038515933594397e-08, "advantage_min": -0.5034029893577099, "advantage_std": 0.6398791931569576, "completion_length": 3239.0416870117188, "epoch": 0.1658877249529949, "grad_norm": 0.3589456379413605, "kl": 0.35406494140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3515149676898552e-07, "loss": 0.0307, "reward": -0.10225249454379082, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10225249454379082, "reward_after_std": 0.6398792117834091, "reward_before_mean": 0.12576019996777177, "reward_before_std": 0.5992455370724201, "reward_change_max": 0.0008250102400779724, "reward_change_mean": -0.22801271500065923, "reward_change_min": -0.3760439604520798, "reward_change_std": 0.1500204592011869, "reward_std": 0.6398792192339897, "rewards/cosine_scaled_reward": -0.11420323827769607, "rewards/format_reward": 0.35416666977107525, "step": 386 }, { "advantage_max": 1.2408776134252548, "advantage_mean": -9.313226023710541e-09, "advantage_min": -0.7202555164694786, "advantage_std": 0.7246770560741425, "completion_length": 3055.479232788086, "epoch": 0.16631748589846898, "grad_norm": 0.5273728370666504, "kl": 0.3414306640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.3291460551638237e-07, "loss": 0.0262, "reward": -0.025316673330962658, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.025316673330962658, "reward_after_std": 0.724677037447691, "reward_before_mean": 0.22579327668063343, "reward_before_std": 0.7429092451930046, "reward_change_max": 0.0008508190512657166, "reward_change_mean": -0.25110994558781385, "reward_change_min": -0.47625257819890976, "reward_change_std": 0.1999763185158372, "reward_std": 0.724677074700594, "rewards/cosine_scaled_reward": -0.053770036436617374, "rewards/format_reward": 0.3333333395421505, "step": 387 }, { "advantage_max": 1.0400738045573235, "advantage_mean": 2.1109979320144134e-08, "advantage_min": -0.8311022520065308, "advantage_std": 0.6580462232232094, "completion_length": 3276.5833740234375, "epoch": 0.16674724684394307, "grad_norm": 0.8636550307273865, "kl": 0.3292236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.306931685585657e-07, "loss": 0.0025, "reward": -0.08846690505743027, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08846690505743027, "reward_after_std": 0.6580462157726288, "reward_before_mean": 0.15689044259488583, "reward_before_std": 0.7135017402470112, "reward_change_max": 0.0003162696957588196, "reward_change_mean": -0.2453573625534773, "reward_change_min": -0.49488671123981476, "reward_change_std": 0.2102921586483717, "reward_std": 0.6580462418496609, "rewards/cosine_scaled_reward": -0.05697142332792282, "rewards/format_reward": 0.27083334513008595, "step": 388 }, { "advantage_max": 1.204227589070797, "advantage_mean": -3.72529057601767e-09, "advantage_min": -0.6426680758595467, "advantage_std": 0.7008140534162521, "completion_length": 3394.1458435058594, "epoch": 0.16717700778941713, "grad_norm": 0.3513981103897095, "kl": 0.3175048828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2848729416523859e-07, "loss": 0.0278, "reward": -0.22621393762528896, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22621393762528896, "reward_after_std": 0.7008140534162521, "reward_before_mean": -0.0418687891215086, "reward_before_std": 0.7373157255351543, "reward_change_max": 0.0005122050642967224, "reward_change_mean": -0.18434515874832869, "reward_change_min": -0.3606463298201561, "reward_change_std": 0.15919563500210643, "reward_std": 0.7008140757679939, "rewards/cosine_scaled_reward": -0.13551772851496935, "rewards/format_reward": 0.2291666753590107, "step": 389 }, { "advantage_max": 1.4547392949461937, "advantage_mean": 8.071462720415923e-09, "advantage_min": -0.5981867611408234, "advantage_std": 0.7892596200108528, "completion_length": 3330.3541870117188, "epoch": 0.16760676873489122, "grad_norm": 0.8333050012588501, "kl": 0.3167724609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2629708984760706e-07, "loss": 0.0769, "reward": -0.2920600244542584, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2920600244542584, "reward_after_std": 0.7892595939338207, "reward_before_mean": -0.14914833195507526, "reward_before_std": 0.8043610099703074, "reward_change_max": 0.0007552430033683777, "reward_change_mean": -0.14291170379146934, "reward_change_min": -0.3743924852460623, "reward_change_std": 0.14588559162802994, "reward_std": 0.789259634912014, "rewards/cosine_scaled_reward": -0.17874083374044858, "rewards/format_reward": 0.20833333767950535, "step": 390 }, { "advantage_max": 1.2921189591288567, "advantage_mean": 6.829699139565548e-09, "advantage_min": -0.6094074621796608, "advantage_std": 0.7078515402972698, "completion_length": 3456.041717529297, "epoch": 0.1680365296803653, "grad_norm": 0.3390703797340393, "kl": 0.330078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2412266235313973e-07, "loss": 0.0123, "reward": -0.34280408546328545, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.34280408546328545, "reward_after_std": 0.7078515440225601, "reward_before_mean": -0.20629326323978603, "reward_before_std": 0.7232176773250103, "reward_change_max": 0.0007589608430862427, "reward_change_mean": -0.1365108210593462, "reward_change_min": -0.3297334350645542, "reward_change_std": 0.13814121764153242, "reward_std": 0.7078515626490116, "rewards/cosine_scaled_reward": -0.19689663354074582, "rewards/format_reward": 0.18750000186264515, "step": 391 }, { "advantage_max": 1.251602664589882, "advantage_mean": 1.2107194219401762e-08, "advantage_min": -0.6215802617371082, "advantage_std": 0.7246743068099022, "completion_length": 3137.7709045410156, "epoch": 0.16846629062583937, "grad_norm": 0.44249120354652405, "kl": 0.3118896484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.2196411766036487e-07, "loss": 0.0272, "reward": -0.17402904573827982, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.17402904573827982, "reward_after_std": 0.7246743105351925, "reward_before_mean": 0.02446424402296543, "reward_before_std": 0.7477903589606285, "reward_change_max": 0.0008437782526016235, "reward_change_mean": -0.19849328324198723, "reward_change_min": -0.4558328837156296, "reward_change_std": 0.18858129903674126, "reward_std": 0.724674329161644, "rewards/cosine_scaled_reward": -0.11276788963004947, "rewards/format_reward": 0.25000000558793545, "step": 392 }, { "advantage_max": 1.363963931798935, "advantage_mean": -6.20881729362921e-09, "advantage_min": -0.629760380834341, "advantage_std": 0.7519446350634098, "completion_length": 3069.3959045410156, "epoch": 0.16889605157131346, "grad_norm": 0.7502540946006775, "kl": 0.2872314453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1982156097370557e-07, "loss": 0.043, "reward": -0.2926993817090988, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2926993817090988, "reward_after_std": 0.7519446536898613, "reward_before_mean": -0.14441546890884638, "reward_before_std": 0.7730610929429531, "reward_change_max": 0.0018625706434249878, "reward_change_mean": -0.14828391652554274, "reward_change_min": -0.3649334441870451, "reward_change_std": 0.1511009531095624, "reward_std": 0.7519446741789579, "rewards/cosine_scaled_reward": -0.1867910698056221, "rewards/format_reward": 0.2291666716337204, "step": 393 }, { "advantage_max": 1.0463023260235786, "advantage_mean": 4.3461718668424965e-09, "advantage_min": -0.7668169774115086, "advantage_std": 0.6571172922849655, "completion_length": 3088.1458587646484, "epoch": 0.16932581251678755, "grad_norm": 0.5270704627037048, "kl": 0.242828369140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1769509671835223e-07, "loss": 0.0419, "reward": -0.07874491391703486, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07874491391703486, "reward_after_std": 0.6571172922849655, "reward_before_mean": 0.1706651938147843, "reward_before_std": 0.7013993561267853, "reward_change_max": 0.0004040077328681946, "reward_change_mean": -0.24941013287752867, "reward_change_min": -0.5160678084939718, "reward_change_std": 0.2134703379124403, "reward_std": 0.6571172997355461, "rewards/cosine_scaled_reward": -0.1334173996001482, "rewards/format_reward": 0.43750000931322575, "step": 394 }, { "advantage_max": 1.1905727162957191, "advantage_mean": 1.9247333948868572e-08, "advantage_min": -0.6826298795640469, "advantage_std": 0.6732536237686872, "completion_length": 3222.729248046875, "epoch": 0.1697555734622616, "grad_norm": 0.9873429536819458, "kl": 0.2982177734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1558482853517253e-07, "loss": 0.0746, "reward": -0.08977969735860825, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08977969735860825, "reward_after_std": 0.6732536237686872, "reward_before_mean": 0.14401146955788136, "reward_before_std": 0.6719720307737589, "reward_change_max": 0.0, "reward_change_mean": -0.23379114735871553, "reward_change_min": -0.38438038155436516, "reward_change_std": 0.16333725582808256, "reward_std": 0.6732536256313324, "rewards/cosine_scaled_reward": -0.021744257770478725, "rewards/format_reward": 0.1875000074505806, "step": 395 }, { "advantage_max": 1.379203423857689, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.6869696751236916, "advantage_std": 0.7804112918674946, "completion_length": 3369.437530517578, "epoch": 0.1701853344077357, "grad_norm": 0.5937776565551758, "kl": 0.331787109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.134908592756607e-07, "loss": 0.0597, "reward": -0.23921585641801357, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23921585641801357, "reward_after_std": 0.7804112918674946, "reward_before_mean": -0.07209537550806999, "reward_before_std": 0.8133157454431057, "reward_change_max": 0.0006091371178627014, "reward_change_mean": -0.16712048929184675, "reward_change_min": -0.44778870791196823, "reward_change_std": 0.17600109428167343, "reward_std": 0.7804113104939461, "rewards/cosine_scaled_reward": -0.12979769054800272, "rewards/format_reward": 0.1875000037252903, "step": 396 }, { "advantage_max": 1.3004822693765163, "advantage_mean": 1.5522043095295146e-08, "advantage_min": -0.7036722302436829, "advantage_std": 0.7343753259629011, "completion_length": 2939.3750610351562, "epoch": 0.1706150953532098, "grad_norm": 1.6454477310180664, "kl": 0.3079833984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.1141329099692406e-07, "loss": 0.1095, "reward": -0.11076481360942125, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11076481360942125, "reward_after_std": 0.7343752644956112, "reward_before_mean": 0.1072701474186033, "reward_before_std": 0.7380689550191164, "reward_change_max": 0.0004348158836364746, "reward_change_mean": -0.21803498081862926, "reward_change_min": -0.4325283318758011, "reward_change_std": 0.17945316340774298, "reward_std": 0.7343752905726433, "rewards/cosine_scaled_reward": -0.09219825750915334, "rewards/format_reward": 0.29166667349636555, "step": 397 }, { "advantage_max": 1.2014844827353954, "advantage_mean": 1.0554989438027462e-08, "advantage_min": -0.90420301258564, "advantage_std": 0.7711508795619011, "completion_length": 3126.8541717529297, "epoch": 0.17104485629868385, "grad_norm": 0.4581427276134491, "kl": 0.32421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0935222495670968e-07, "loss": 0.0344, "reward": 0.026221081614494324, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.026221081614494324, "reward_after_std": 0.7711508795619011, "reward_before_mean": 0.3010478839278221, "reward_before_std": 0.8465890176594257, "reward_change_max": 0.0004696398973464966, "reward_change_mean": -0.27482680790126324, "reward_change_min": -0.5476282946765423, "reward_change_std": 0.2450251814443618, "reward_std": 0.7711509056389332, "rewards/cosine_scaled_reward": -0.026559388265013695, "rewards/format_reward": 0.35416667349636555, "step": 398 }, { "advantage_max": 0.674967810511589, "advantage_mean": 1.2417634809303024e-08, "advantage_min": -0.36018291488289833, "advantage_std": 0.3928775414824486, "completion_length": 3474.354217529297, "epoch": 0.17147461724415794, "grad_norm": 0.6505287885665894, "kl": 0.408203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0730776160846853e-07, "loss": 0.0185, "reward": -0.5831986404955387, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5831986404955387, "reward_after_std": 0.3928775303065777, "reward_before_mean": -0.48842116445302963, "reward_before_std": 0.41468488425016403, "reward_change_max": 0.0, "reward_change_mean": -0.09477749792858958, "reward_change_min": -0.24031966552138329, "reward_change_std": 0.09762066463008523, "reward_std": 0.3928775414824486, "rewards/cosine_scaled_reward": -0.2754605747759342, "rewards/format_reward": 0.06250000186264515, "step": 399 }, { "advantage_max": 0.9325446747243404, "advantage_mean": 1.862645188088763e-08, "advantage_min": -0.5802384093403816, "advantage_std": 0.5354755632579327, "completion_length": 2819.1250610351562, "epoch": 0.17190437818963203, "grad_norm": 0.4000764787197113, "kl": 0.3126220703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0528000059645995e-07, "loss": 0.0569, "reward": -0.17311914265155792, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17311914265155792, "reward_after_std": 0.5354755632579327, "reward_before_mean": 0.05000012833625078, "reward_before_std": 0.5293943881988525, "reward_change_max": 0.0008205249905586243, "reward_change_mean": -0.22311925515532494, "reward_change_min": -0.3971652314066887, "reward_change_std": 0.15885510109364986, "reward_std": 0.5354755856096745, "rewards/cosine_scaled_reward": -0.1520832795649767, "rewards/format_reward": 0.35416667349636555, "step": 400 }, { "advantage_max": 1.39947210252285, "advantage_mean": -1.5522043372850902e-08, "advantage_min": -0.8646727055311203, "advantage_std": 0.8108454793691635, "completion_length": 2587.062530517578, "epoch": 0.1723341391351061, "grad_norm": 0.5335204005241394, "kl": 0.255615234375, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.032690407508949e-07, "loss": 0.0508, "reward": 0.2592086410149932, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2592086410149932, "reward_after_std": 0.810845497995615, "reward_before_mean": 0.6031267067883164, "reward_before_std": 0.8129908181726933, "reward_change_max": 0.0, "reward_change_mean": -0.34391810093075037, "reward_change_min": -0.6264822296798229, "reward_change_std": 0.24040825013071299, "reward_std": 0.8108455277979374, "rewards/cosine_scaled_reward": 0.03073002799646929, "rewards/format_reward": 0.541666679084301, "step": 401 }, { "advantage_max": 0.9142828658223152, "advantage_mean": 1.5522043872451263e-08, "advantage_min": -0.6070458739995956, "advantage_std": 0.5652897506952286, "completion_length": 3120.625030517578, "epoch": 0.17276390008058018, "grad_norm": 0.46507272124290466, "kl": 0.3720703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 2.0127498008311922e-07, "loss": 0.0465, "reward": -0.04347945377230644, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04347945377230644, "reward_after_std": 0.5652897581458092, "reward_before_mean": 0.2283969409763813, "reward_before_std": 0.5786354951560497, "reward_change_max": 0.001015208661556244, "reward_change_mean": -0.27187635097652674, "reward_change_min": -0.5027057453989983, "reward_change_std": 0.2088111499324441, "reward_std": 0.5652897618710995, "rewards/cosine_scaled_reward": -0.0628848671913147, "rewards/format_reward": 0.3541666716337204, "step": 402 }, { "advantage_max": 1.0485853515565395, "advantage_mean": 6.829699084054397e-09, "advantage_min": -0.6589442566037178, "advantage_std": 0.6309670433402061, "completion_length": 3107.604217529297, "epoch": 0.17319366102605427, "grad_norm": 0.6441790461540222, "kl": 0.3045654296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9929791578083655e-07, "loss": 0.058, "reward": -0.2681356613757089, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2681356613757089, "reward_after_std": 0.6309670656919479, "reward_before_mean": -0.08778932597488165, "reward_before_std": 0.6704566776752472, "reward_change_max": 2.7477741241455078e-05, "reward_change_mean": -0.18034633249044418, "reward_change_min": -0.3825173173099756, "reward_change_std": 0.17140215821564198, "reward_std": 0.6309670843183994, "rewards/cosine_scaled_reward": -0.24181134067475796, "rewards/format_reward": 0.3958333469927311, "step": 403 }, { "advantage_max": 0.6523730270564556, "advantage_mean": 1.4901161138336505e-08, "advantage_min": -0.4079050309956074, "advantage_std": 0.39261235669255257, "completion_length": 3439.9583435058594, "epoch": 0.17362342197152833, "grad_norm": 0.8330584168434143, "kl": 0.410888671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9733794420337213e-07, "loss": 0.0208, "reward": -0.4417586326599121, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4417586326599121, "reward_after_std": 0.39261235669255257, "reward_before_mean": -0.29399993643164635, "reward_before_std": 0.4073510989546776, "reward_change_max": 0.0007183179259300232, "reward_change_mean": -0.1477586943656206, "reward_change_min": -0.28734432719647884, "reward_change_std": 0.11925210524350405, "reward_std": 0.39261238276958466, "rewards/cosine_scaled_reward": -0.21991663612425327, "rewards/format_reward": 0.14583333395421505, "step": 404 }, { "advantage_max": 0.9651088081300259, "advantage_mean": 2.0489097363185493e-08, "advantage_min": -0.6014821976423264, "advantage_std": 0.5710999183356762, "completion_length": 3113.7916870117188, "epoch": 0.17405318291700242, "grad_norm": 0.4654783606529236, "kl": 0.373779296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.9539516087697517e-07, "loss": 0.0432, "reward": -0.2541431449353695, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2541431449353695, "reward_after_std": 0.5710999220609665, "reward_before_mean": -0.06387923285365105, "reward_before_std": 0.594166949391365, "reward_change_max": 0.0004720166325569153, "reward_change_mean": -0.1902638846077025, "reward_change_min": -0.40094830468297005, "reward_change_std": 0.16179871885105968, "reward_std": 0.5710999332368374, "rewards/cosine_scaled_reward": -0.16735628992319107, "rewards/format_reward": 0.27083334140479565, "step": 405 }, { "advantage_max": 1.2024127915501595, "advantage_mean": -1.8626452047421083e-09, "advantage_min": -0.7163936905562878, "advantage_std": 0.7201002053916454, "completion_length": 3340.104217529297, "epoch": 0.1744829438624765, "grad_norm": 0.3642021417617798, "kl": 0.423583984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.934696604901642e-07, "loss": 0.0456, "reward": -0.1823072754777968, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1823072754777968, "reward_after_std": 0.7201002016663551, "reward_before_mean": 0.01833081990480423, "reward_before_std": 0.7659327164292336, "reward_change_max": 0.0009614527225494385, "reward_change_mean": -0.20063810888677835, "reward_change_min": -0.42538352869451046, "reward_change_std": 0.1848788522183895, "reward_std": 0.7201002165675163, "rewards/cosine_scaled_reward": -0.16791792353615165, "rewards/format_reward": 0.35416667722165585, "step": 406 }, { "advantage_max": 0.9568363949656487, "advantage_mean": 1.3659398057086491e-08, "advantage_min": -0.5269550830125809, "advantage_std": 0.547583844512701, "completion_length": 2981.166717529297, "epoch": 0.17491270480795057, "grad_norm": 0.7327854037284851, "kl": 0.3778076171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.915615368891117e-07, "loss": 0.0149, "reward": -0.22692469786852598, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22692469786852598, "reward_after_std": 0.547583844512701, "reward_before_mean": -0.02427714318037033, "reward_before_std": 0.5445889607071877, "reward_change_max": 0.000745236873626709, "reward_change_mean": -0.20264754630625248, "reward_change_min": -0.410320695489645, "reward_change_std": 0.15683315601199865, "reward_std": 0.5475838482379913, "rewards/cosine_scaled_reward": -0.1579719092696905, "rewards/format_reward": 0.29166666977107525, "step": 407 }, { "advantage_max": 1.0792118534445763, "advantage_mean": 3.104408619059029e-09, "advantage_min": -0.6029591634869576, "advantage_std": 0.6411285698413849, "completion_length": 3058.9583740234375, "epoch": 0.17534246575342466, "grad_norm": 0.3817065358161926, "kl": 0.33154296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8967088307307e-07, "loss": 0.031, "reward": -0.27623447217047215, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.27623447217047215, "reward_after_std": 0.6411285735666752, "reward_before_mean": -0.10106014832854271, "reward_before_std": 0.6789880953729153, "reward_change_max": 0.0005949214100837708, "reward_change_mean": -0.17517433501780033, "reward_change_min": -0.4431847259402275, "reward_change_std": 0.17573662381619215, "reward_std": 0.6411285996437073, "rewards/cosine_scaled_reward": -0.19636341478326358, "rewards/format_reward": 0.2916666753590107, "step": 408 }, { "advantage_max": 1.374371774494648, "advantage_mean": -8.071462498371318e-09, "advantage_min": -0.9059640467166901, "advantage_std": 0.8620566688477993, "completion_length": 2821.3334045410156, "epoch": 0.17577222669889875, "grad_norm": 0.5286200642585754, "kl": 0.275421142578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8779779118983867e-07, "loss": 0.0288, "reward": 0.2700458976905793, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2700458976905793, "reward_after_std": 0.8620566725730896, "reward_before_mean": 0.6197148598730564, "reward_before_std": 0.9137398786842823, "reward_change_max": 0.0, "reward_change_mean": -0.3496689684689045, "reward_change_min": -0.7045222297310829, "reward_change_std": 0.28851046320050955, "reward_std": 0.862056702375412, "rewards/cosine_scaled_reward": 0.049440762028098106, "rewards/format_reward": 0.5208333376795053, "step": 409 }, { "advantage_max": 0.7105138078331947, "advantage_mean": 2.110997865401032e-08, "advantage_min": -0.4317469783127308, "advantage_std": 0.42244425415992737, "completion_length": 3379.6458740234375, "epoch": 0.1762019876443728, "grad_norm": 0.595300018787384, "kl": 0.450927734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8594235253127372e-07, "loss": 0.0464, "reward": -0.5105401165783405, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5105401165783405, "reward_after_std": 0.42244425788521767, "reward_before_mean": -0.39202951174229383, "reward_before_std": 0.44522079080343246, "reward_change_max": 0.0007661283016204834, "reward_change_mean": -0.1185105899348855, "reward_change_min": -0.27850930020213127, "reward_change_std": 0.11453505605459213, "reward_std": 0.42244426906108856, "rewards/cosine_scaled_reward": -0.24809808656573296, "rewards/format_reward": 0.10416666977107525, "step": 410 }, { "advantage_max": 1.0548147782683372, "advantage_mean": 9.313226190243995e-09, "advantage_min": -0.6514202803373337, "advantage_std": 0.6515691131353378, "completion_length": 2966.479217529297, "epoch": 0.1766317485898469, "grad_norm": 0.5911142230033875, "kl": 0.427093505859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.8410465752883758e-07, "loss": 0.0407, "reward": -0.24301138380542397, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.24301138380542397, "reward_after_std": 0.6515691243112087, "reward_before_mean": -0.055737750604748726, "reward_before_std": 0.709802620112896, "reward_change_max": 0.0065581053495407104, "reward_change_mean": -0.1872736206278205, "reward_change_min": -0.4615279510617256, "reward_change_std": 0.1935176346451044, "reward_std": 0.6515691354870796, "rewards/cosine_scaled_reward": -0.1320355455391109, "rewards/format_reward": 0.20833334140479565, "step": 411 }, { "advantage_max": 0.743270818144083, "advantage_mean": 1.0554989660072067e-08, "advantage_min": -0.5206784754991531, "advantage_std": 0.46872048638761044, "completion_length": 3160.7708892822266, "epoch": 0.177061509535321, "grad_norm": 0.5234943628311157, "kl": 0.38629150390625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.822847957491922e-07, "loss": 0.0259, "reward": -0.3631281591951847, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3631281591951847, "reward_after_std": 0.46872048638761044, "reward_before_mean": -0.19482686650007963, "reward_before_std": 0.5060403365641832, "reward_change_max": 0.0008045360445976257, "reward_change_mean": -0.16830127476714551, "reward_change_min": -0.33425163477659225, "reward_change_std": 0.1458892181981355, "reward_std": 0.4687205161899328, "rewards/cosine_scaled_reward": -0.1807467769831419, "rewards/format_reward": 0.1666666716337204, "step": 412 }, { "advantage_max": 1.441023726016283, "advantage_mean": -4.346172088887101e-09, "advantage_min": -0.6074897721409798, "advantage_std": 0.7565805613994598, "completion_length": 3020.3125610351562, "epoch": 0.17749127048079505, "grad_norm": 0.59324711561203, "kl": 0.35546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.804828558898332e-07, "loss": 0.057, "reward": -0.10025971196591854, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10025971196591854, "reward_after_std": 0.7565805744379759, "reward_before_mean": 0.1122362120077014, "reward_before_std": 0.7192585822194815, "reward_change_max": 0.0015117600560188293, "reward_change_mean": -0.21249592071399093, "reward_change_min": -0.38194140419363976, "reward_change_std": 0.14794394560158253, "reward_std": 0.7565805874764919, "rewards/cosine_scaled_reward": -0.18346523342188448, "rewards/format_reward": 0.47916667722165585, "step": 413 }, { "advantage_max": 1.6939236000180244, "advantage_mean": -9.934107814135729e-09, "advantage_min": -0.7496881783008575, "advantage_std": 0.9060630239546299, "completion_length": 2985.8125610351562, "epoch": 0.17792103142626914, "grad_norm": 0.5688344836235046, "kl": 0.335205078125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7869892577476722e-07, "loss": 0.0226, "reward": -0.005151113495230675, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.005151113495230675, "reward_after_std": 0.9060630239546299, "reward_before_mean": 0.22277012560516596, "reward_before_std": 0.8882392421364784, "reward_change_max": 0.0, "reward_change_mean": -0.2279212661087513, "reward_change_min": -0.46065774746239185, "reward_change_std": 0.18277086317539215, "reward_std": 0.9060630276799202, "rewards/cosine_scaled_reward": -0.10736493766307831, "rewards/format_reward": 0.4375000074505806, "step": 414 }, { "advantage_max": 1.0349603779613972, "advantage_mean": -8.071462886949377e-09, "advantage_min": -0.4716205894947052, "advantage_std": 0.5763940662145615, "completion_length": 2955.0625534057617, "epoch": 0.17835079237174323, "grad_norm": 0.7609078884124756, "kl": 0.35491943359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7693309235023127e-07, "loss": 0.0095, "reward": -0.30648043006658554, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.30648043006658554, "reward_after_std": 0.5763940922915936, "reward_before_mean": -0.1391908535733819, "reward_before_std": 0.5784778315573931, "reward_change_max": 0.0016353428363800049, "reward_change_mean": -0.1672896002419293, "reward_change_min": -0.38898801803588867, "reward_change_std": 0.14776768069714308, "reward_std": 0.5763940997421741, "rewards/cosine_scaled_reward": -0.215428764000535, "rewards/format_reward": 0.2916666679084301, "step": 415 }, { "advantage_max": 1.3438450396060944, "advantage_mean": 3.4148494587604716e-09, "advantage_min": -0.7342542633414268, "advantage_std": 0.7859014496207237, "completion_length": 3153.125030517578, "epoch": 0.1787805533172173, "grad_norm": 1.0777666568756104, "kl": 0.3922119140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7518544168045524e-07, "loss": 0.0722, "reward": -0.15606826078146696, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15606826078146696, "reward_after_std": 0.7859014496207237, "reward_before_mean": 0.04203657899051905, "reward_before_std": 0.8306329026818275, "reward_change_max": 0.0011892691254615784, "reward_change_mean": -0.19810484908521175, "reward_change_min": -0.49114035069942474, "reward_change_std": 0.19481848552823067, "reward_std": 0.7859014645218849, "rewards/cosine_scaled_reward": -0.13523171516135335, "rewards/format_reward": 0.31250000558793545, "step": 416 }, { "advantage_max": 1.0427075736224651, "advantage_mean": -1.4280279736489376e-08, "advantage_min": -0.5181296765804291, "advantage_std": 0.5624011866748333, "completion_length": 3333.250030517578, "epoch": 0.17921031426269138, "grad_norm": 0.4928218126296997, "kl": 0.43408203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7345605894346726e-07, "loss": 0.0771, "reward": -0.43828903668327257, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.43828903668327257, "reward_after_std": 0.5624011848121881, "reward_before_mean": -0.31879311986267567, "reward_before_std": 0.5637495666742325, "reward_change_max": 0.0007164850831031799, "reward_change_mean": -0.1194959469139576, "reward_change_min": -0.26691415533423424, "reward_change_std": 0.10959426732733846, "reward_std": 0.5624011848121881, "rewards/cosine_scaled_reward": -0.2218965559732169, "rewards/format_reward": 0.1250000037252903, "step": 417 }, { "advantage_max": 1.0997261106967926, "advantage_mean": 9.93410786964688e-09, "advantage_min": -0.5772667042911053, "advantage_std": 0.6249068602919579, "completion_length": 3160.9584045410156, "epoch": 0.17964007520816547, "grad_norm": 0.6578710675239563, "kl": 0.341705322265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7174502842694212e-07, "loss": 0.0544, "reward": -0.15084208454936743, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15084208454936743, "reward_after_std": 0.6249068528413773, "reward_before_mean": 0.06811442319303751, "reward_before_std": 0.6207924596965313, "reward_change_max": 0.0, "reward_change_mean": -0.21895649749785662, "reward_change_min": -0.435267997905612, "reward_change_std": 0.1700435420498252, "reward_std": 0.624906875193119, "rewards/cosine_scaled_reward": -0.1534427972510457, "rewards/format_reward": 0.3750000074505806, "step": 418 }, { "advantage_max": 0.9672140553593636, "advantage_mean": 1.676380662063437e-08, "advantage_min": -0.5048319771885872, "advantage_std": 0.5469458568841219, "completion_length": 3304.625, "epoch": 0.18006983615363953, "grad_norm": 0.5045256614685059, "kl": 0.4156494140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.7005243352409333e-07, "loss": 0.0416, "reward": -0.4129097582772374, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4129097582772374, "reward_after_std": 0.546945858746767, "reward_before_mean": -0.2776251509785652, "reward_before_std": 0.5643527638167143, "reward_change_max": 0.0009662508964538574, "reward_change_mean": -0.1352846254594624, "reward_change_min": -0.3068793471902609, "reward_change_std": 0.1251770919188857, "reward_std": 0.5469458606094122, "rewards/cosine_scaled_reward": -0.18047924479469657, "rewards/format_reward": 0.0833333358168602, "step": 419 }, { "advantage_max": 0.6953948847949505, "advantage_mean": 1.6453366058488683e-08, "advantage_min": -0.4570082910358906, "advantage_std": 0.40402534045279026, "completion_length": 3258.6666870117188, "epoch": 0.18049959709911362, "grad_norm": 0.6379658579826355, "kl": 0.41162109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6837835672960831e-07, "loss": 0.0302, "reward": -0.2964170614723116, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2964170614723116, "reward_after_std": 0.4040253460407257, "reward_before_mean": -0.10100882593542337, "reward_before_std": 0.3982083983719349, "reward_change_max": 2.8505921363830566e-05, "reward_change_mean": -0.19540822692215443, "reward_change_min": -0.33818120136857033, "reward_change_std": 0.13202541088685393, "reward_std": 0.4040253460407257, "rewards/cosine_scaled_reward": -0.1963377515785396, "rewards/format_reward": 0.29166667722165585, "step": 420 }, { "advantage_max": 1.2738246694207191, "advantage_mean": -6.829699139565548e-09, "advantage_min": -0.7089805342257023, "advantage_std": 0.7481806986033916, "completion_length": 2826.479248046875, "epoch": 0.1809293580445877, "grad_norm": 0.4447559714317322, "kl": 0.384033203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6672287963562852e-07, "loss": 0.0348, "reward": 0.05640237871557474, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05640237871557474, "reward_after_std": 0.7481806762516499, "reward_before_mean": 0.3371896645985544, "reward_before_std": 0.7482710368931293, "reward_change_max": 0.0, "reward_change_mean": -0.28078727424144745, "reward_change_min": -0.5555907655507326, "reward_change_std": 0.2220691442489624, "reward_std": 0.7481806799769402, "rewards/cosine_scaled_reward": -0.0918218526057899, "rewards/format_reward": 0.520833345130086, "step": 421 }, { "advantage_max": 1.4212798550724983, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.5734515115618706, "advantage_std": 0.7425553165376186, "completion_length": 3471.6250610351562, "epoch": 0.18135911899006177, "grad_norm": 0.45426347851753235, "kl": 0.422607421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6508608292777203e-07, "loss": 0.0525, "reward": -0.2799212224781513, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2799212224781513, "reward_after_std": 0.7425552941858768, "reward_before_mean": -0.12998057529330254, "reward_before_std": 0.7228503525257111, "reward_change_max": 0.0, "reward_change_mean": -0.14994066581130028, "reward_change_min": -0.33009601570665836, "reward_change_std": 0.12617502082139254, "reward_std": 0.7425553239881992, "rewards/cosine_scaled_reward": -0.1587402825243771, "rewards/format_reward": 0.18750000558793545, "step": 422 }, { "advantage_max": 1.3074338138103485, "advantage_mean": 1.8626450382086546e-09, "advantage_min": -0.7338702082633972, "advantage_std": 0.7715246826410294, "completion_length": 3272.8959045410156, "epoch": 0.18178887993553586, "grad_norm": 0.7873498201370239, "kl": 0.431884765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6346804638120098e-07, "loss": 0.0655, "reward": -0.05061357223894447, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05061357223894447, "reward_after_std": 0.7715247124433517, "reward_before_mean": 0.18861189484596252, "reward_before_std": 0.8060728684067726, "reward_change_max": 0.0007305890321731567, "reward_change_mean": -0.23922546580433846, "reward_change_min": -0.5926108732819557, "reward_change_std": 0.22063307091593742, "reward_std": 0.7715247422456741, "rewards/cosine_scaled_reward": -0.10361073072999716, "rewards/format_reward": 0.3958333395421505, "step": 423 }, { "advantage_max": 1.075336754322052, "advantage_mean": -9.934108424758392e-09, "advantage_min": -0.4614795967936516, "advantage_std": 0.5771752707660198, "completion_length": 3392.5208740234375, "epoch": 0.18221864088100995, "grad_norm": 0.5222776532173157, "kl": 0.4495849609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6186884885673413e-07, "loss": 0.0647, "reward": -0.21659628674387932, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21659628674387932, "reward_after_std": 0.5771752782166004, "reward_before_mean": -0.019143784418702126, "reward_before_std": 0.5455721896141768, "reward_change_max": 0.0010890811681747437, "reward_change_mean": -0.197452531196177, "reward_change_min": -0.3400824014097452, "reward_change_std": 0.1409585541114211, "reward_std": 0.5771752819418907, "rewards/cosine_scaled_reward": -0.11373856291174889, "rewards/format_reward": 0.2083333358168602, "step": 424 }, { "advantage_max": 1.2228608839213848, "advantage_mean": -8.692343955729598e-09, "advantage_min": -0.6169806905090809, "advantage_std": 0.6881819777190685, "completion_length": 2793.041702270508, "epoch": 0.182648401826484, "grad_norm": 0.4874855875968933, "kl": 0.34295654296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.6028856829700258e-07, "loss": 0.0234, "reward": -0.022728698328137398, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.022728698328137398, "reward_after_std": 0.6881819851696491, "reward_before_mean": 0.23221459239721298, "reward_before_std": 0.6737983804196119, "reward_change_max": 0.0005160272121429443, "reward_change_mean": -0.2549433051608503, "reward_change_min": -0.48996769823133945, "reward_change_std": 0.18834537407383323, "reward_std": 0.6881820075213909, "rewards/cosine_scaled_reward": -0.10264270869083703, "rewards/format_reward": 0.4375000037252903, "step": 425 }, { "advantage_max": 0.8565645590424538, "advantage_mean": 3.104408707876871e-08, "advantage_min": -0.6323705688118935, "advantage_std": 0.5653311796486378, "completion_length": 3358.354217529297, "epoch": 0.1830781627719581, "grad_norm": 0.5510448813438416, "kl": 0.442626953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5872728172265146e-07, "loss": 0.0311, "reward": -0.31297726184129715, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.31297726184129715, "reward_after_std": 0.5653311647474766, "reward_before_mean": -0.13411634787917137, "reward_before_std": 0.6339066475629807, "reward_change_max": 0.0010028481483459473, "reward_change_mean": -0.17886086460202932, "reward_change_min": -0.3896334022283554, "reward_change_std": 0.17762285750359297, "reward_std": 0.5653311908245087, "rewards/cosine_scaled_reward": -0.16080818697810173, "rewards/format_reward": 0.18750000558793545, "step": 426 }, { "advantage_max": 0.9311670996248722, "advantage_mean": 2.1730860444435507e-08, "advantage_min": -0.5055506005883217, "advantage_std": 0.5323159042745829, "completion_length": 2938.479202270508, "epoch": 0.1835079237174322, "grad_norm": 0.41469454765319824, "kl": 0.334228515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5718506522858572e-07, "loss": 0.0281, "reward": -0.3125184550881386, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3125184550881386, "reward_after_std": 0.5323159005492926, "reward_before_mean": -0.14093271642923355, "reward_before_std": 0.5376874450594187, "reward_change_max": 0.0, "reward_change_mean": -0.1715857177041471, "reward_change_min": -0.3674417547881603, "reward_change_std": 0.14554436784237623, "reward_std": 0.5323159210383892, "rewards/cosine_scaled_reward": -0.2162996963597834, "rewards/format_reward": 0.29166666977107525, "step": 427 }, { "advantage_max": 0.9278038889169693, "advantage_mean": 1.0865430249973329e-08, "advantage_min": -0.6444680877029896, "advantage_std": 0.5638421066105366, "completion_length": 3157.0625610351562, "epoch": 0.18393768466290625, "grad_norm": 0.41730329394340515, "kl": 0.36376953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5566199398026147e-07, "loss": 0.0302, "reward": -0.13957075029611588, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13957075029611588, "reward_after_std": 0.5638421066105366, "reward_before_mean": 0.09544008725788444, "reward_before_std": 0.5817429684102535, "reward_change_max": 0.0, "reward_change_mean": -0.23501083813607693, "reward_change_min": -0.44477928802371025, "reward_change_std": 0.1791014475747943, "reward_std": 0.5638421140611172, "rewards/cosine_scaled_reward": -0.13977995794266462, "rewards/format_reward": 0.3750000111758709, "step": 428 }, { "advantage_max": 1.0207031406462193, "advantage_mean": 1.3038516155639002e-08, "advantage_min": -0.5473864823579788, "advantage_std": 0.5747033916413784, "completion_length": 2941.8750610351562, "epoch": 0.18436744560838034, "grad_norm": 1.0725964307785034, "kl": 0.3460693359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5415814221002265e-07, "loss": -0.0052, "reward": -0.35181875713169575, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.35181875713169575, "reward_after_std": 0.574703399091959, "reward_before_mean": -0.2010254431515932, "reward_before_std": 0.5856017433106899, "reward_change_max": 0.0013648346066474915, "reward_change_mean": -0.1507933083921671, "reward_change_min": -0.34978898987174034, "reward_change_std": 0.13985463231801987, "reward_std": 0.5747034195810556, "rewards/cosine_scaled_reward": -0.21509606298059225, "rewards/format_reward": 0.2291666716337204, "step": 429 }, { "advantage_max": 1.0959882959723473, "advantage_mean": 1.30385160446167e-08, "advantage_min": -0.4858676940202713, "advantage_std": 0.5903975255787373, "completion_length": 3528.6875, "epoch": 0.18479720655385443, "grad_norm": 0.46479299664497375, "kl": 0.42138671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5267358321348285e-07, "loss": 0.0498, "reward": -0.484538646414876, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.484538646414876, "reward_after_std": 0.590397521853447, "reward_before_mean": -0.3843041993677616, "reward_before_std": 0.5999570935964584, "reward_change_max": 0.001091204583644867, "reward_change_mean": -0.1002344498410821, "reward_change_min": -0.24216662533581257, "reward_change_std": 0.10349710285663605, "reward_std": 0.5903975628316402, "rewards/cosine_scaled_reward": -0.23381876666098833, "rewards/format_reward": 0.0833333358168602, "step": 430 }, { "advantage_max": 0.8240176737308502, "advantage_mean": 1.4280279569955923e-08, "advantage_min": -0.5961046144366264, "advantage_std": 0.4965447038412094, "completion_length": 3102.6875610351562, "epoch": 0.1852269674993285, "grad_norm": 0.8354600071907043, "kl": 0.3426513671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.5120838934595337e-07, "loss": 0.005, "reward": -0.32524115964770317, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.32524115964770317, "reward_after_std": 0.4965446926653385, "reward_before_mean": -0.14975540433079004, "reward_before_std": 0.5226058810949326, "reward_change_max": 0.0018779411911964417, "reward_change_mean": -0.17548574693500996, "reward_change_min": -0.3610811196267605, "reward_change_std": 0.152143232524395, "reward_std": 0.4965446963906288, "rewards/cosine_scaled_reward": -0.23112770915031433, "rewards/format_reward": 0.31250001303851604, "step": 431 }, { "advantage_max": 1.3202426508069038, "advantage_mean": 3.352761324126874e-08, "advantage_min": -0.5803426653146744, "advantage_std": 0.7454185895621777, "completion_length": 3375.4375610351562, "epoch": 0.18565672844480258, "grad_norm": 0.7566119432449341, "kl": 0.36053466796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4976263201891613e-07, "loss": -0.0023, "reward": -0.32706610951572657, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32706610951572657, "reward_after_std": 0.7454186156392097, "reward_before_mean": -0.18657900765538216, "reward_before_std": 0.7812497541308403, "reward_change_max": 0.0014080479741096497, "reward_change_mean": -0.14048708509653807, "reward_change_min": -0.4211413413286209, "reward_change_std": 0.1629410684108734, "reward_std": 0.7454186342656612, "rewards/cosine_scaled_reward": -0.18703951477073133, "rewards/format_reward": 0.18750000558793545, "step": 432 }, { "advantage_max": 1.6170933693647385, "advantage_mean": 1.7384688244526103e-08, "advantage_min": -0.6471758484840393, "advantage_std": 0.8433545455336571, "completion_length": 2941.4791870117188, "epoch": 0.18608648939027667, "grad_norm": 0.46476736664772034, "kl": 0.31292724609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.483363816965435e-07, "loss": 0.0177, "reward": -0.23713862150907516, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23713862150907516, "reward_after_std": 0.8433545753359795, "reward_before_mean": -0.0876062992028892, "reward_before_std": 0.8318687453866005, "reward_change_max": 0.0028158575296401978, "reward_change_mean": -0.1495323139242828, "reward_change_min": -0.3322202228009701, "reward_change_std": 0.13445302145555615, "reward_std": 0.8433545827865601, "rewards/cosine_scaled_reward": -0.1792198196053505, "rewards/format_reward": 0.27083333767950535, "step": 433 }, { "advantage_max": 0.9958814196288586, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.6574972346425056, "advantage_std": 0.6107678227126598, "completion_length": 3221.604217529297, "epoch": 0.18651625033575073, "grad_norm": 0.4797721803188324, "kl": 0.3419189453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.469297078922642e-07, "loss": 0.0272, "reward": -0.12642667442560196, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12642667442560196, "reward_after_std": 0.610767837613821, "reward_before_mean": 0.10777171328663826, "reward_before_std": 0.6437437795102596, "reward_change_max": 0.00038917362689971924, "reward_change_mean": -0.23419836908578873, "reward_change_min": -0.4724628385156393, "reward_change_std": 0.19195820484310389, "reward_std": 0.6107678562402725, "rewards/cosine_scaled_reward": -0.06069747731089592, "rewards/format_reward": 0.2291666753590107, "step": 434 }, { "advantage_max": 0.8714477568864822, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.5213457383215427, "advantage_std": 0.5204565636813641, "completion_length": 3285.5000915527344, "epoch": 0.18694601128122482, "grad_norm": 0.48351359367370605, "kl": 0.338134765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4554267916537495e-07, "loss": 0.0196, "reward": -0.4319613091647625, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4319613091647625, "reward_after_std": 0.5204565599560738, "reward_before_mean": -0.29812956042587757, "reward_before_std": 0.5565923266112804, "reward_change_max": 0.0010014921426773071, "reward_change_mean": -0.1338317529298365, "reward_change_min": -0.3429897166788578, "reward_change_std": 0.13940041977912188, "reward_std": 0.520456574857235, "rewards/cosine_scaled_reward": -0.2428147830069065, "rewards/format_reward": 0.18750000186264515, "step": 435 }, { "advantage_max": 1.3295316435396671, "advantage_mean": -8.692344455329959e-09, "advantage_min": -0.7708868980407715, "advantage_std": 0.7838545609265566, "completion_length": 3152.041778564453, "epoch": 0.1873757722266989, "grad_norm": 1.491934061050415, "kl": 0.3226318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4417536311769885e-07, "loss": 0.0838, "reward": -0.09442456439137459, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.09442456439137459, "reward_after_std": 0.7838545609265566, "reward_before_mean": 0.1266978019848466, "reward_before_std": 0.8256864584982395, "reward_change_max": 0.000685669481754303, "reward_change_mean": -0.22112239431589842, "reward_change_min": -0.49476948007941246, "reward_change_std": 0.2046093288809061, "reward_std": 0.783854590728879, "rewards/cosine_scaled_reward": -0.10331775434315205, "rewards/format_reward": 0.33333334140479565, "step": 436 }, { "advantage_max": 0.8692563436925411, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.5588644780218601, "advantage_std": 0.5266159772872925, "completion_length": 2934.416717529297, "epoch": 0.18780553317217297, "grad_norm": 0.30450907349586487, "kl": 0.2861328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4282782639029128e-07, "loss": 0.0502, "reward": -0.1960180550813675, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1960180550813675, "reward_after_std": 0.5266159847378731, "reward_before_mean": 0.02128699515014887, "reward_before_std": 0.5444532912224531, "reward_change_max": 0.0005632713437080383, "reward_change_mean": -0.21730506233870983, "reward_change_min": -0.40743764117360115, "reward_change_std": 0.1679140143096447, "reward_std": 0.5266159996390343, "rewards/cosine_scaled_reward": -0.12477316707372665, "rewards/format_reward": 0.27083333767950535, "step": 437 }, { "advantage_max": 1.3721318319439888, "advantage_mean": -1.1796752907855534e-08, "advantage_min": -0.6450934112071991, "advantage_std": 0.7483322843909264, "completion_length": 3038.729217529297, "epoch": 0.18823529411764706, "grad_norm": 0.390404611825943, "kl": 0.26702880859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4150013466019114e-07, "loss": 0.0316, "reward": 0.21853264793753624, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21853264793753624, "reward_after_std": 0.7483322769403458, "reward_before_mean": 0.5511247105896473, "reward_before_std": 0.6861387528479099, "reward_change_max": 0.0, "reward_change_mean": -0.3325920421630144, "reward_change_min": -0.5462813153862953, "reward_change_std": 0.21747148130089045, "reward_std": 0.7483322955667973, "rewards/cosine_scaled_reward": 0.004729010164737701, "rewards/format_reward": 0.5416666679084301, "step": 438 }, { "advantage_max": 1.400515541434288, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.7002792097628117, "advantage_std": 0.7960162907838821, "completion_length": 3420.0416870117188, "epoch": 0.18866505506312115, "grad_norm": 0.6881502866744995, "kl": 0.33984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.4019235263722034e-07, "loss": 0.0668, "reward": -0.13043580204248428, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13043580204248428, "reward_after_std": 0.796016301959753, "reward_before_mean": 0.07467849552631378, "reward_before_std": 0.8167938552796841, "reward_change_max": 0.001444108784198761, "reward_change_mean": -0.2051142998971045, "reward_change_min": -0.46712362952530384, "reward_change_std": 0.19286625739187002, "reward_std": 0.7960163205862045, "rewards/cosine_scaled_reward": -0.09807742200791836, "rewards/format_reward": 0.2708333395421505, "step": 439 }, { "advantage_max": 1.1045918576419353, "advantage_mean": 1.3348957300651776e-08, "advantage_min": -0.6887406334280968, "advantage_std": 0.6677297912538052, "completion_length": 3287.2291870117188, "epoch": 0.1890948160085952, "grad_norm": 0.4915427267551422, "kl": 0.325927734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3890454406082956e-07, "loss": 0.0539, "reward": -0.26758655719459057, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.26758655719459057, "reward_after_std": 0.6677297949790955, "reward_before_mean": -0.09060697071254253, "reward_before_std": 0.7170297503471375, "reward_change_max": 0.0009088218212127686, "reward_change_mean": -0.17697958461940289, "reward_change_min": -0.39751362055540085, "reward_change_std": 0.17468777671456337, "reward_std": 0.6677298061549664, "rewards/cosine_scaled_reward": -0.1598868235014379, "rewards/format_reward": 0.2291666716337204, "step": 440 }, { "advantage_max": 1.2540669590234756, "advantage_mean": 1.0554989438027462e-08, "advantage_min": -0.5631530582904816, "advantage_std": 0.7040696013718843, "completion_length": 3168.666702270508, "epoch": 0.1895245769540693, "grad_norm": 0.4210474193096161, "kl": 0.31170654296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3763677169699217e-07, "loss": 0.0168, "reward": -0.2718427209183574, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2718427209183574, "reward_after_std": 0.7040696162730455, "reward_before_mean": -0.1082594282925129, "reward_before_std": 0.7272697072476149, "reward_change_max": 0.0010016337037086487, "reward_change_mean": -0.16358328703790903, "reward_change_min": -0.393042478710413, "reward_change_std": 0.16052498761564493, "reward_std": 0.7040696162730455, "rewards/cosine_scaled_reward": -0.14787971694022417, "rewards/format_reward": 0.18750000186264515, "step": 441 }, { "advantage_max": 0.9970009699463844, "advantage_mean": 1.6142924885720333e-08, "advantage_min": -0.5600589029490948, "advantage_std": 0.5921139009296894, "completion_length": 3217.5625915527344, "epoch": 0.18995433789954339, "grad_norm": 0.24283969402313232, "kl": 0.31976318359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3638909733514452e-07, "loss": 0.0395, "reward": -0.2663369467481971, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2663369467481971, "reward_after_std": 0.5921139158308506, "reward_before_mean": -0.08192895725369453, "reward_before_std": 0.620398472994566, "reward_change_max": 0.0004730224609375, "reward_change_mean": -0.1844079946167767, "reward_change_min": -0.4309392645955086, "reward_change_std": 0.1669258214533329, "reward_std": 0.5921139195561409, "rewards/cosine_scaled_reward": -0.18679781630635262, "rewards/format_reward": 0.29166666977107525, "step": 442 }, { "advantage_max": 1.474471092224121, "advantage_mean": 1.1175871339474952e-08, "advantage_min": -0.761995442211628, "advantage_std": 0.830416327342391, "completion_length": 3204.041717529297, "epoch": 0.19038409884501745, "grad_norm": 0.5632076263427734, "kl": 0.3399658203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.351615817851748e-07, "loss": 0.061, "reward": 0.005875192582607269, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.005875192582607269, "reward_after_std": 0.830416327342391, "reward_before_mean": 0.25281623285263777, "reward_before_std": 0.8449702057987452, "reward_change_max": 0.0, "reward_change_mean": -0.24694101698696613, "reward_change_min": -0.5259372256696224, "reward_change_std": 0.20223100390285254, "reward_std": 0.8304163590073586, "rewards/cosine_scaled_reward": -0.009008564986288548, "rewards/format_reward": 0.2708333432674408, "step": 443 }, { "advantage_max": 1.3837426453828812, "advantage_mean": -4.967053657267684e-09, "advantage_min": -0.5867674425244331, "advantage_std": 0.7314891945570707, "completion_length": 3141.7083740234375, "epoch": 0.19081385979049154, "grad_norm": 0.5449833869934082, "kl": 0.328155517578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3395428487445914e-07, "loss": 0.0427, "reward": -0.1554530646535568, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1554530646535568, "reward_after_std": 0.7314891759306192, "reward_before_mean": 0.04253510572016239, "reward_before_std": 0.7000963781028986, "reward_change_max": 0.0006845742464065552, "reward_change_mean": -0.19798818766139448, "reward_change_min": -0.3838120140135288, "reward_change_std": 0.15072523429989815, "reward_std": 0.7314892075955868, "rewards/cosine_scaled_reward": -0.1766491187736392, "rewards/format_reward": 0.39583334140479565, "step": 444 }, { "advantage_max": 0.9433692023158073, "advantage_mean": 1.8626452602532595e-09, "advantage_min": -0.47257934510707855, "advantage_std": 0.5158413518220186, "completion_length": 3312.9375610351562, "epoch": 0.19124362073596562, "grad_norm": 0.4852328598499298, "kl": 0.352294921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3276726544494571e-07, "loss": 0.048, "reward": -0.3933799620717764, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3933799620717764, "reward_after_std": 0.5158413350582123, "reward_before_mean": -0.24976772931404412, "reward_before_std": 0.51588299497962, "reward_change_max": 0.0005793720483779907, "reward_change_mean": -0.1436122483573854, "reward_change_min": -0.26960366033017635, "reward_change_std": 0.10835136892274022, "reward_std": 0.5158413574099541, "rewards/cosine_scaled_reward": -0.2394672017544508, "rewards/format_reward": 0.2291666679084301, "step": 445 }, { "advantage_max": 1.179099179804325, "advantage_mean": 4.967053546245381e-09, "advantage_min": -0.8058816492557526, "advantage_std": 0.7069848962128162, "completion_length": 3290.729217529297, "epoch": 0.1916733816814397, "grad_norm": 0.7529411315917969, "kl": 0.33441162109375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.316005813502869e-07, "loss": 0.0685, "reward": -0.13800800405442715, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13800800405442715, "reward_after_std": 0.7069849260151386, "reward_before_mean": 0.08011463284492493, "reward_before_std": 0.749067448079586, "reward_change_max": 0.0005923062562942505, "reward_change_mean": -0.21812264621257782, "reward_change_min": -0.4721273221075535, "reward_change_std": 0.1992535339668393, "reward_std": 0.7069849334657192, "rewards/cosine_scaled_reward": -0.08494268637150526, "rewards/format_reward": 0.25000000931322575, "step": 446 }, { "advantage_max": 1.0695558488368988, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.5367366038262844, "advantage_std": 0.5971044786274433, "completion_length": 3385.0834045410156, "epoch": 0.19210314262691378, "grad_norm": 0.31246575713157654, "kl": 0.3231201171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.3045428945301953e-07, "loss": 0.0321, "reward": -0.32995523139834404, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.32995523139834404, "reward_after_std": 0.5971044637262821, "reward_before_mean": -0.17260274291038513, "reward_before_std": 0.6066189482808113, "reward_change_max": 0.00038471072912216187, "reward_change_mean": -0.15735248569399118, "reward_change_min": -0.33648228645324707, "reward_change_std": 0.13704345375299454, "reward_std": 0.5971044637262821, "rewards/cosine_scaled_reward": -0.24255137983709574, "rewards/format_reward": 0.3125000037252903, "step": 447 }, { "advantage_max": 1.1362785771489143, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.548996239900589, "advantage_std": 0.6378029212355614, "completion_length": 3495.0416870117188, "epoch": 0.19253290357238786, "grad_norm": 0.3686494529247284, "kl": 0.373046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2932844562179352e-07, "loss": 0.0391, "reward": -0.41287659108638763, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.41287659108638763, "reward_after_std": 0.637802928686142, "reward_before_mean": -0.2901984741911292, "reward_before_std": 0.6655464693903923, "reward_change_max": 0.00022982805967330933, "reward_change_mean": -0.12267811968922615, "reward_change_min": -0.3108152374625206, "reward_change_std": 0.13244542572647333, "reward_std": 0.6378029584884644, "rewards/cosine_scaled_reward": -0.1971825771033764, "rewards/format_reward": 0.1041666679084301, "step": 448 }, { "advantage_max": 0.8169592283666134, "advantage_mean": 7.450581041013038e-09, "advantage_min": -0.5978700220584869, "advantage_std": 0.5064218789339066, "completion_length": 3269.062530517578, "epoch": 0.19296266451786193, "grad_norm": 0.3622429370880127, "kl": 0.337158203125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2822310472864885e-07, "loss": 0.0209, "reward": -0.30930935218930244, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.30930935218930244, "reward_after_std": 0.5064218807965517, "reward_before_mean": -0.12645667418837547, "reward_before_std": 0.543878061696887, "reward_change_max": 0.0005560517311096191, "reward_change_mean": -0.18285270570777357, "reward_change_min": -0.38796953670680523, "reward_change_std": 0.15741661563515663, "reward_std": 0.5064219161868095, "rewards/cosine_scaled_reward": -0.15697832591831684, "rewards/format_reward": 0.1875000074505806, "step": 449 }, { "advantage_max": 1.7001039795577526, "advantage_mean": -6.829698639965187e-09, "advantage_min": -0.7766020074486732, "advantage_std": 0.9646117985248566, "completion_length": 3410.1250915527344, "epoch": 0.19339242546333602, "grad_norm": 1.4053958654403687, "kl": 0.3594970703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2713832064634125e-07, "loss": 0.0786, "reward": -0.14230191428214312, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14230191428214312, "reward_after_std": 0.9646118134260178, "reward_before_mean": 0.03708207234740257, "reward_before_std": 1.01846032589674, "reward_change_max": 0.001162216067314148, "reward_change_mean": -0.1793839894235134, "reward_change_min": -0.4469225388020277, "reward_change_std": 0.19370606821030378, "reward_std": 0.9646118134260178, "rewards/cosine_scaled_reward": -0.09604230150580406, "rewards/format_reward": 0.22916667349636555, "step": 450 }, { "advantage_max": 0.7259072177112103, "advantage_mean": 2.421438782818086e-08, "advantage_min": -0.39796046167612076, "advantage_std": 0.4082306120544672, "completion_length": 3471.750030517578, "epoch": 0.1938221864088101, "grad_norm": 0.5835897922515869, "kl": 0.39013671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.260741462457165e-07, "loss": 0.0321, "reward": -0.5147962849587202, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5147962849587202, "reward_after_std": 0.40823061391711235, "reward_before_mean": -0.3989803958684206, "reward_before_std": 0.4127159435302019, "reward_change_max": 0.0005483776330947876, "reward_change_mean": -0.11581587977707386, "reward_change_min": -0.24908852204680443, "reward_change_std": 0.09751816699281335, "reward_std": 0.4082306195050478, "rewards/cosine_scaled_reward": -0.2411568621173501, "rewards/format_reward": 0.0833333358168602, "step": 451 }, { "advantage_max": 0.7229549959301949, "advantage_mean": -1.6142924552653426e-08, "advantage_min": -0.3796108849346638, "advantage_std": 0.39128516614437103, "completion_length": 2773.916732788086, "epoch": 0.19425194735428417, "grad_norm": 0.672677755355835, "kl": 0.30126953125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2503063339313356e-07, "loss": 0.0035, "reward": -0.1724774376489222, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1724774376489222, "reward_after_std": 0.39128517732024193, "reward_before_mean": 0.06450562737882137, "reward_before_std": 0.33875428326427937, "reward_change_max": 0.0009053722023963928, "reward_change_mean": -0.23698307992890477, "reward_change_min": -0.36804749444127083, "reward_change_std": 0.14096720097586513, "reward_std": 0.3912851847708225, "rewards/cosine_scaled_reward": -0.11358053237199783, "rewards/format_reward": 0.29166667349636555, "step": 452 }, { "advantage_max": 1.0738292783498764, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -0.5574468336999416, "advantage_std": 0.6114741154015064, "completion_length": 2789.604202270508, "epoch": 0.19468170829975825, "grad_norm": 0.903685986995697, "kl": 0.293212890625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2400783294793668e-07, "loss": 0.0459, "reward": -0.23720011487603188, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23720011487603188, "reward_after_std": 0.6114741079509258, "reward_before_mean": -0.050279058050364256, "reward_before_std": 0.6262397021055222, "reward_change_max": 0.0009617358446121216, "reward_change_mean": -0.18692103726789355, "reward_change_min": -0.4182846173644066, "reward_change_std": 0.15933939325623214, "reward_std": 0.611474122852087, "rewards/cosine_scaled_reward": -0.18138953763991594, "rewards/format_reward": 0.3125000037252903, "step": 453 }, { "advantage_max": 1.0807162299752235, "advantage_mean": 2.359350637082258e-08, "advantage_min": -0.631520189344883, "advantage_std": 0.6266374513506889, "completion_length": 3319.9166870117188, "epoch": 0.19511146924523234, "grad_norm": 0.5043768882751465, "kl": 0.3760986328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2300579475997657e-07, "loss": 0.0593, "reward": -0.2834633266320452, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2834633266320452, "reward_after_std": 0.6266374662518501, "reward_before_mean": -0.11107560526579618, "reward_before_std": 0.6505269221961498, "reward_change_max": 0.00040149688720703125, "reward_change_mean": -0.17238769866526127, "reward_change_min": -0.3394498638808727, "reward_change_std": 0.14803719543851912, "reward_std": 0.6266374662518501, "rewards/cosine_scaled_reward": -0.1805378075223416, "rewards/format_reward": 0.25000000186264515, "step": 454 }, { "advantage_max": 1.0385187566280365, "advantage_mean": 1.8626452047421083e-09, "advantage_min": -0.592309482395649, "advantage_std": 0.581235371530056, "completion_length": 3341.729217529297, "epoch": 0.19554123019070643, "grad_norm": 0.4420550465583801, "kl": 0.4063720703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.220245676671809e-07, "loss": 0.0328, "reward": -0.2760108858346939, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2760108858346939, "reward_after_std": 0.5812353678047657, "reward_before_mean": -0.09772268682718277, "reward_before_std": 0.5856316909193993, "reward_change_max": 0.0013298913836479187, "reward_change_mean": -0.17828821297734976, "reward_change_min": -0.3385009169578552, "reward_change_std": 0.14134181011468172, "reward_std": 0.5812353789806366, "rewards/cosine_scaled_reward": -0.1842780103906989, "rewards/format_reward": 0.2708333432674408, "step": 455 }, { "advantage_max": 0.6896219030022621, "advantage_mean": 2.110997870952147e-08, "advantage_min": -0.4432862587273121, "advantage_std": 0.41593967005610466, "completion_length": 3296.3125610351562, "epoch": 0.1959709911361805, "grad_norm": 0.43318450450897217, "kl": 0.3643798828125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2106419949317388e-07, "loss": 0.0304, "reward": -0.4911864921450615, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4911864921450615, "reward_after_std": 0.41593966260552406, "reward_before_mean": -0.3655815124511719, "reward_before_std": 0.44093791767954826, "reward_change_max": 0.0015463531017303467, "reward_change_mean": -0.12560497387312353, "reward_change_min": -0.2699763234704733, "reward_change_std": 0.11570897931233048, "reward_std": 0.41593966260552406, "rewards/cosine_scaled_reward": -0.2765407580882311, "rewards/format_reward": 0.18750000558793545, "step": 456 }, { "advantage_max": 0.8859930969774723, "advantage_mean": 3.725290387279756e-08, "advantage_min": -0.578439686447382, "advantage_std": 0.5553487539291382, "completion_length": 3214.8750610351562, "epoch": 0.19640075208165458, "grad_norm": 0.46279218792915344, "kl": 0.360595703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.2012473704494537e-07, "loss": 0.0298, "reward": -0.3486741867964156, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3486741867964156, "reward_after_std": 0.5553487557917833, "reward_before_mean": -0.18556277081370354, "reward_before_std": 0.6037445273250341, "reward_change_max": 0.0004561692476272583, "reward_change_mean": -0.16311139333993196, "reward_change_min": -0.37286195158958435, "reward_change_std": 0.16159101203083992, "reward_std": 0.5553487706929445, "rewards/cosine_scaled_reward": -0.20736471563577652, "rewards/format_reward": 0.22916667349636555, "step": 457 }, { "advantage_max": 1.7193792760372162, "advantage_mean": 5.277494580235853e-09, "advantage_min": -0.6731682606041431, "advantage_std": 0.8904592394828796, "completion_length": 3334.7501220703125, "epoch": 0.19683051302712867, "grad_norm": 0.6523022651672363, "kl": 0.417236328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1920622611056974e-07, "loss": 0.0518, "reward": -0.1715361649694387, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1715361649694387, "reward_after_std": 0.8904592432081699, "reward_before_mean": -0.004130126908421516, "reward_before_std": 0.8646661564707756, "reward_change_max": 0.0, "reward_change_mean": -0.1674060458317399, "reward_change_min": -0.3487138841301203, "reward_change_std": 0.1347489869222045, "reward_std": 0.8904592432081699, "rewards/cosine_scaled_reward": -0.14789839833974838, "rewards/format_reward": 0.29166667349636555, "step": 458 }, { "advantage_max": 0.7916051372885704, "advantage_mean": 1.4280280014045132e-08, "advantage_min": -0.42358628660440445, "advantage_std": 0.4588632099330425, "completion_length": 3317.6875610351562, "epoch": 0.19726027397260273, "grad_norm": 0.6304188966751099, "kl": 0.3729248046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1830871145697412e-07, "loss": 0.0222, "reward": -0.4716331127565354, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4716331127565354, "reward_after_std": 0.4588632099330425, "reward_before_mean": -0.34677324816584587, "reward_before_std": 0.47800807282328606, "reward_change_max": 0.0014858618378639221, "reward_change_mean": -0.12485986016690731, "reward_change_min": -0.2616387754678726, "reward_change_std": 0.11408987734466791, "reward_std": 0.4588632248342037, "rewards/cosine_scaled_reward": -0.28796995617449284, "rewards/format_reward": 0.22916667349636555, "step": 459 }, { "advantage_max": 1.635408192873001, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.7635039314627647, "advantage_std": 0.9181083738803864, "completion_length": 2963.479217529297, "epoch": 0.19769003491807682, "grad_norm": 0.5301191210746765, "kl": 0.35986328125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1743223682775649e-07, "loss": 0.0592, "reward": -0.05027494765818119, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05027494765818119, "reward_after_std": 0.9181083738803864, "reward_before_mean": 0.16546978591941297, "reward_before_std": 0.9426909312605858, "reward_change_max": 0.00035962462425231934, "reward_change_mean": -0.21574474405497313, "reward_change_min": -0.4561479352414608, "reward_change_std": 0.19359177444130182, "reward_std": 0.9181084148585796, "rewards/cosine_scaled_reward": -0.06309843761846423, "rewards/format_reward": 0.2916666753590107, "step": 460 }, { "advantage_max": 1.1959885135293007, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -0.559939831495285, "advantage_std": 0.6656781025230885, "completion_length": 3261.0833740234375, "epoch": 0.1981197958635509, "grad_norm": 1.0901010036468506, "kl": 0.33624267578125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1657684494105386e-07, "loss": 0.0807, "reward": -0.11272954382002354, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11272954382002354, "reward_after_std": 0.6656780950725079, "reward_before_mean": 0.11362938955426216, "reward_before_std": 0.6507318653166294, "reward_change_max": 0.001958519220352173, "reward_change_mean": -0.22635892452672124, "reward_change_min": -0.4752196650952101, "reward_change_std": 0.18431826774030924, "reward_std": 0.6656781248748302, "rewards/cosine_scaled_reward": -0.0786019703373313, "rewards/format_reward": 0.27083333395421505, "step": 461 }, { "advantage_max": 0.7798829600214958, "advantage_mean": 9.313225912688239e-09, "advantage_min": -0.43340061977505684, "advantage_std": 0.4435693845152855, "completion_length": 2713.8750534057617, "epoch": 0.19854955680902497, "grad_norm": 0.47222283482551575, "kl": 0.344207763671875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1574257748745986e-07, "loss": 0.0249, "reward": -0.1574770286679268, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1574770286679268, "reward_after_std": 0.4435693882405758, "reward_before_mean": 0.07944672554731369, "reward_before_std": 0.40474721044301987, "reward_change_max": 0.0007408931851387024, "reward_change_mean": -0.23692374117672443, "reward_change_min": -0.40386301279067993, "reward_change_std": 0.1592177450656891, "reward_std": 0.4435694105923176, "rewards/cosine_scaled_reward": -0.14777665212750435, "rewards/format_reward": 0.37500000186264515, "step": 462 }, { "advantage_max": 1.084140419960022, "advantage_mean": 1.5522045870852708e-09, "advantage_min": -0.6314124464988708, "advantage_std": 0.6430176310241222, "completion_length": 3384.7500610351562, "epoch": 0.19897931775449906, "grad_norm": 0.4507286548614502, "kl": 0.371826171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1492947512799328e-07, "loss": 0.0339, "reward": -0.157953767105937, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.157953767105937, "reward_after_std": 0.6430176310241222, "reward_before_mean": 0.06043356750160456, "reward_before_std": 0.6663023754954338, "reward_change_max": 0.0008150041103363037, "reward_change_mean": -0.21838734578341246, "reward_change_min": -0.4551526941359043, "reward_change_std": 0.18607959616929293, "reward_std": 0.6430176347494125, "rewards/cosine_scaled_reward": -0.14686655439436436, "rewards/format_reward": 0.3541666716337204, "step": 463 }, { "advantage_max": 0.8689987920224667, "advantage_mean": 6.208817793229571e-09, "advantage_min": -0.46696673333644867, "advantage_std": 0.5108197052031755, "completion_length": 3339.5834045410156, "epoch": 0.19940907869997315, "grad_norm": 1.0421850681304932, "kl": 0.435546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1413757749211602e-07, "loss": 0.0049, "reward": -0.38541407138109207, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.38541407138109207, "reward_after_std": 0.5108197275549173, "reward_before_mean": -0.23528496362268925, "reward_before_std": 0.5358093492686749, "reward_change_max": 0.005812451243400574, "reward_change_mean": -0.15012911055237055, "reward_change_min": -0.35624982230365276, "reward_change_std": 0.142950851470232, "reward_std": 0.5108197499066591, "rewards/cosine_scaled_reward": -0.16972581017762423, "rewards/format_reward": 0.10416666977107525, "step": 464 }, { "advantage_max": 1.3575346767902374, "advantage_mean": 8.071462553882469e-09, "advantage_min": -0.6706593334674835, "advantage_std": 0.7512159049510956, "completion_length": 3047.1458892822266, "epoch": 0.1998388396454472, "grad_norm": 0.6496729254722595, "kl": 0.368621826171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1336692317580158e-07, "loss": 0.0605, "reward": -0.12026693765074015, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.12026693765074015, "reward_after_std": 0.7512159198522568, "reward_before_mean": 0.09115784522145987, "reward_before_std": 0.7580849938094616, "reward_change_max": 0.0006444007158279419, "reward_change_mean": -0.2114247651770711, "reward_change_min": -0.43965012952685356, "reward_change_std": 0.1734008239582181, "reward_std": 0.7512159384787083, "rewards/cosine_scaled_reward": -0.10025441402103752, "rewards/format_reward": 0.2916666716337204, "step": 465 }, { "advantage_max": 1.009197637438774, "advantage_mean": 4.346172310931706e-09, "advantage_min": -0.5232692584395409, "advantage_std": 0.5689041335135698, "completion_length": 3409.3750915527344, "epoch": 0.2002686005909213, "grad_norm": 0.3268691599369049, "kl": 0.451904296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1261754973965422e-07, "loss": 0.0554, "reward": -0.4200416002422571, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4200416002422571, "reward_after_std": 0.5689041372388601, "reward_before_mean": -0.28998097963631153, "reward_before_std": 0.5877886563539505, "reward_change_max": 0.0015011727809906006, "reward_change_mean": -0.13006062991917133, "reward_change_min": -0.3295734953135252, "reward_change_std": 0.13454102771356702, "reward_std": 0.5689041465520859, "rewards/cosine_scaled_reward": -0.24915715772658587, "rewards/format_reward": 0.20833333767950535, "step": 466 }, { "advantage_max": 1.2522402964532375, "advantage_mean": 1.490116180447032e-08, "advantage_min": -0.5748643130064011, "advantage_std": 0.7136582657694817, "completion_length": 2788.9792289733887, "epoch": 0.2006983615363954, "grad_norm": 0.40178239345550537, "kl": 0.33197021484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1188949370707787e-07, "loss": 0.0313, "reward": -0.24831391125917435, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24831391125917435, "reward_after_std": 0.7136582732200623, "reward_before_mean": -0.07540171593427658, "reward_before_std": 0.7381091378629208, "reward_change_max": 0.00022836774587631226, "reward_change_mean": -0.17291217856109142, "reward_change_min": -0.40662967786192894, "reward_change_std": 0.16540365107357502, "reward_std": 0.7136583104729652, "rewards/cosine_scaled_reward": -0.1835342012345791, "rewards/format_reward": 0.29166666977107525, "step": 467 }, { "advantage_max": 1.1479545906186104, "advantage_mean": 4.967053768289986e-09, "advantage_min": -0.6627045907080173, "advantage_std": 0.6412419304251671, "completion_length": 3002.187545776367, "epoch": 0.20112812248186945, "grad_norm": 0.6472480893135071, "kl": 0.25946044921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1118279056249653e-07, "loss": 0.0593, "reward": -0.1514514461159706, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1514514461159706, "reward_after_std": 0.6412419304251671, "reward_before_mean": 0.06383924372494221, "reward_before_std": 0.6384194567799568, "reward_change_max": 0.0012133866548538208, "reward_change_mean": -0.21529066446237266, "reward_change_min": -0.3886506203562021, "reward_change_std": 0.15845524217002094, "reward_std": 0.6412419490516186, "rewards/cosine_scaled_reward": -0.10349705582484603, "rewards/format_reward": 0.2708333432674408, "step": 468 }, { "advantage_max": 1.2847687304019928, "advantage_mean": 1.4280279792000528e-08, "advantage_min": -0.6391138508915901, "advantage_std": 0.736264294013381, "completion_length": 2710.6458740234375, "epoch": 0.20155788342734354, "grad_norm": 0.44027215242385864, "kl": 0.257171630859375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.1049747474962444e-07, "loss": 0.0028, "reward": 0.1416614931076765, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1416614931076765, "reward_after_std": 0.7362643014639616, "reward_before_mean": 0.4512300183996558, "reward_before_std": 0.723043953999877, "reward_change_max": 0.00013172626495361328, "reward_change_mean": -0.30956850852817297, "reward_change_min": -0.5748772211372852, "reward_change_std": 0.2275826008990407, "reward_std": 0.7362643200904131, "rewards/cosine_scaled_reward": -0.0035516652278602123, "rewards/format_reward": 0.4583333432674408, "step": 469 }, { "advantage_max": 1.4804739877581596, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.6414166390895844, "advantage_std": 0.8009080067276955, "completion_length": 3231.8750610351562, "epoch": 0.20198764437281763, "grad_norm": 1.397734522819519, "kl": 0.3841552734375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0983357966978745e-07, "loss": 0.0792, "reward": -0.1685739466920495, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1685739466920495, "reward_after_std": 0.8009080179035664, "reward_before_mean": 0.016780972306150943, "reward_before_std": 0.8000213950872421, "reward_change_max": 0.0006921812891960144, "reward_change_mean": -0.18535490659996867, "reward_change_min": -0.4408244863152504, "reward_change_std": 0.1686264043673873, "reward_std": 0.8009080477058887, "rewards/cosine_scaled_reward": -0.11660952214151621, "rewards/format_reward": 0.2500000037252903, "step": 470 }, { "advantage_max": 1.2243851348757744, "advantage_mean": 2.173086105505817e-08, "advantage_min": -0.8073594272136688, "advantage_std": 0.7328378856182098, "completion_length": 2841.416702270508, "epoch": 0.2024174053182917, "grad_norm": 0.714418351650238, "kl": 0.28045654296875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0919113768029517e-07, "loss": 0.0532, "reward": 0.09823329607024789, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09823329607024789, "reward_after_std": 0.7328378781676292, "reward_before_mean": 0.3965921439230442, "reward_before_std": 0.7369103953242302, "reward_change_max": 0.0024262890219688416, "reward_change_mean": -0.29835881921462715, "reward_change_min": -0.5579108372330666, "reward_change_std": 0.23703121952712536, "reward_std": 0.7328378893435001, "rewards/cosine_scaled_reward": -0.020453942008316517, "rewards/format_reward": 0.4375000074505806, "step": 471 }, { "advantage_max": 1.0749364122748375, "advantage_mean": 1.2728075482471013e-08, "advantage_min": -0.4944019615650177, "advantage_std": 0.56617821007967, "completion_length": 3154.104263305664, "epoch": 0.20284716626376578, "grad_norm": 0.4662166237831116, "kl": 0.31304931640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0857018009286381e-07, "loss": 0.0041, "reward": -0.34365259297192097, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34365259297192097, "reward_after_std": 0.56617821007967, "reward_before_mean": -0.19235705584287643, "reward_before_std": 0.5425015948712826, "reward_change_max": 0.00048436224460601807, "reward_change_mean": -0.1512955455109477, "reward_change_min": -0.3007952943444252, "reward_change_std": 0.11558177135884762, "reward_std": 0.5661782175302505, "rewards/cosine_scaled_reward": -0.24201186187565327, "rewards/format_reward": 0.2916666716337204, "step": 472 }, { "advantage_max": 0.9885262697935104, "advantage_mean": 1.1175871339474952e-08, "advantage_min": -0.46361351385712624, "advantage_std": 0.5351900905370712, "completion_length": 3074.7708435058594, "epoch": 0.20327692720923987, "grad_norm": 0.7448961138725281, "kl": 0.38916015625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0797073717209013e-07, "loss": 0.0189, "reward": -0.290002278983593, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.290002278983593, "reward_after_std": 0.53519007563591, "reward_before_mean": -0.11414494924247265, "reward_before_std": 0.5161207653582096, "reward_change_max": 0.0007808655500411987, "reward_change_mean": -0.1758573199622333, "reward_change_min": -0.31598240323364735, "reward_change_std": 0.12244301568716764, "reward_std": 0.5351901073008776, "rewards/cosine_scaled_reward": -0.22373914625495672, "rewards/format_reward": 0.33333334140479565, "step": 473 }, { "advantage_max": 1.1998543813824654, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.6848982945084572, "advantage_std": 0.7290734760463238, "completion_length": 3240.9791870117188, "epoch": 0.20370668815471393, "grad_norm": 0.9557588696479797, "kl": 0.39453125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0739283813397639e-07, "loss": 0.0764, "reward": -0.18736706039635465, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.18736706039635465, "reward_after_std": 0.7290734611451626, "reward_before_mean": 0.01134549081325531, "reward_before_std": 0.7818126603960991, "reward_change_max": 0.0008042678236961365, "reward_change_mean": -0.19871254917234182, "reward_change_min": -0.5066852234303951, "reward_change_std": 0.20744303707033396, "reward_std": 0.7290734834969044, "rewards/cosine_scaled_reward": -0.16099392715841532, "rewards/format_reward": 0.33333333767950535, "step": 474 }, { "advantage_max": 0.9820737019181252, "advantage_mean": 6.829698806498641e-09, "advantage_min": -0.47309938818216324, "advantage_std": 0.530810508877039, "completion_length": 3242.604217529297, "epoch": 0.20413644910018802, "grad_norm": 0.562795877456665, "kl": 0.402099609375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.068365111445064e-07, "loss": 0.0353, "reward": -0.3822549059987068, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3822549059987068, "reward_after_std": 0.5308105200529099, "reward_before_mean": -0.23837255872786045, "reward_before_std": 0.5171588286757469, "reward_change_max": 0.0023872777819633484, "reward_change_mean": -0.14388233702629805, "reward_change_min": -0.28487095795571804, "reward_change_std": 0.11479413975030184, "reward_std": 0.530810534954071, "rewards/cosine_scaled_reward": -0.21293628448620439, "rewards/format_reward": 0.1875000074505806, "step": 475 }, { "advantage_max": 0.748881459236145, "advantage_mean": 1.73846881335038e-08, "advantage_min": -0.4368247501552105, "advantage_std": 0.4429945610463619, "completion_length": 3354.625030517578, "epoch": 0.2045662100456621, "grad_norm": 0.5131328701972961, "kl": 0.34320068359375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.063017833182728e-07, "loss": 0.023, "reward": -0.39842486660927534, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.39842486660927534, "reward_after_std": 0.44299454987049103, "reward_before_mean": -0.242546072229743, "reward_before_std": 0.4568288177251816, "reward_change_max": 9.784102439880371e-05, "reward_change_mean": -0.15587879065424204, "reward_change_min": -0.3393108695745468, "reward_change_std": 0.13274936471134424, "reward_std": 0.4429945647716522, "rewards/cosine_scaled_reward": -0.23585637472569942, "rewards/format_reward": 0.2291666716337204, "step": 476 }, { "advantage_max": 1.2588340118527412, "advantage_mean": 9.313226190243995e-09, "advantage_min": -0.6148485392332077, "advantage_std": 0.7043963074684143, "completion_length": 2625.416732788086, "epoch": 0.20499597099113617, "grad_norm": 0.503267228603363, "kl": 0.2635498046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0578868071715544e-07, "loss": 0.0304, "reward": 0.23759766545845196, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23759766545845196, "reward_after_std": 0.7043963149189949, "reward_before_mean": 0.5831128628924489, "reward_before_std": 0.6504729427397251, "reward_change_max": 0.0006654113531112671, "reward_change_mean": -0.34551518596708775, "reward_change_min": -0.6003990229219198, "reward_change_std": 0.2363085960969329, "reward_std": 0.704396340996027, "rewards/cosine_scaled_reward": 0.051973097026348114, "rewards/format_reward": 0.47916666977107525, "step": 477 }, { "advantage_max": 1.5631098672747612, "advantage_mean": 4.967053990334591e-09, "advantage_min": -1.018498219549656, "advantage_std": 0.951477337628603, "completion_length": 2469.5833892822266, "epoch": 0.20542573193661026, "grad_norm": 0.5975154638290405, "kl": 0.241668701171875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0529722834905125e-07, "loss": 0.0141, "reward": 0.2632585233077407, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2632585233077407, "reward_after_std": 0.9514773264527321, "reward_before_mean": 0.5955849029123783, "reward_before_std": 1.0048289373517036, "reward_change_max": 0.0026333406567573547, "reward_change_mean": -0.33232639357447624, "reward_change_min": -0.7211836762726307, "reward_change_std": 0.28996519837528467, "reward_std": 0.9514773897826672, "rewards/cosine_scaled_reward": -0.004290889948606491, "rewards/format_reward": 0.6041666883975267, "step": 478 }, { "advantage_max": 0.721226267516613, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.48217107355594635, "advantage_std": 0.4332780037075281, "completion_length": 3283.875030517578, "epoch": 0.20585549288208435, "grad_norm": 0.3673146963119507, "kl": 0.3736572265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0482745016665526e-07, "loss": 0.0317, "reward": -0.39989104121923447, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.39989104121923447, "reward_after_std": 0.4332779888063669, "reward_before_mean": -0.24340405594557524, "reward_before_std": 0.4522307850420475, "reward_change_max": 0.0013126060366630554, "reward_change_mean": -0.15648699458688498, "reward_change_min": -0.31309567391872406, "reward_change_std": 0.13332579471170902, "reward_std": 0.4332779962569475, "rewards/cosine_scaled_reward": -0.26753536937758327, "rewards/format_reward": 0.29166667722165585, "step": 479 }, { "advantage_max": 0.8936086073517799, "advantage_mean": 7.140140256822747e-09, "advantage_min": -0.5996589288115501, "advantage_std": 0.52460852637887, "completion_length": 3235.5208740234375, "epoch": 0.2062852538275584, "grad_norm": 0.3530065715312958, "kl": 0.350341796875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0437936906629334e-07, "loss": 0.0327, "reward": -0.22914913203567266, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.22914913203567266, "reward_after_std": 0.5246085338294506, "reward_before_mean": -0.02255989983677864, "reward_before_std": 0.5322840847074986, "reward_change_max": 0.0009247064590454102, "reward_change_mean": -0.20658923359587789, "reward_change_min": -0.37002528831362724, "reward_change_std": 0.15850120782852173, "reward_std": 0.5246085450053215, "rewards/cosine_scaled_reward": -0.16752996295690536, "rewards/format_reward": 0.3125000074505806, "step": 480 }, { "advantage_max": 1.109962061047554, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.5118862017989159, "advantage_std": 0.5976303070783615, "completion_length": 3288.937530517578, "epoch": 0.2067150147730325, "grad_norm": 0.5865932703018188, "kl": 0.352294921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0395300688680625e-07, "loss": 0.0603, "reward": -0.3251997996121645, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.3251997996121645, "reward_after_std": 0.5976303033530712, "reward_before_mean": -0.169459022115916, "reward_before_std": 0.5862900763750076, "reward_change_max": 0.0006274804472923279, "reward_change_mean": -0.15574077144265175, "reward_change_min": -0.31581558659672737, "reward_change_std": 0.12340012099593878, "reward_std": 0.5976303108036518, "rewards/cosine_scaled_reward": -0.18889617454260588, "rewards/format_reward": 0.2083333395421505, "step": 481 }, { "advantage_max": 0.9843368865549564, "advantage_mean": 2.0489097363185493e-08, "advantage_min": -0.465573713183403, "advantage_std": 0.5335717387497425, "completion_length": 3393.979217529297, "epoch": 0.2071447757185066, "grad_norm": 0.41169318556785583, "kl": 0.396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0354838440848501e-07, "loss": 0.0544, "reward": -0.5233206264674664, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5233206264674664, "reward_after_std": 0.5335717387497425, "reward_before_mean": -0.43069106061011553, "reward_before_std": 0.5436425469815731, "reward_change_max": 0.0020020529627799988, "reward_change_mean": -0.0926295688841492, "reward_change_min": -0.2169207688421011, "reward_change_std": 0.09836359811015427, "reward_std": 0.533571757376194, "rewards/cosine_scaled_reward": -0.2778455279767513, "rewards/format_reward": 0.1250000037252903, "step": 482 }, { "advantage_max": 1.1604081094264984, "advantage_mean": 6.829699084054397e-09, "advantage_min": -0.7130227386951447, "advantage_std": 0.6892171520739794, "completion_length": 3185.187530517578, "epoch": 0.20757453666398065, "grad_norm": 0.4431546628475189, "kl": 0.34002685546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0316552135205837e-07, "loss": 0.0391, "reward": -0.08649748424068093, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08649748424068093, "reward_after_std": 0.6892171408981085, "reward_before_mean": 0.15104814060032368, "reward_before_std": 0.7150641363114119, "reward_change_max": 0.000995643436908722, "reward_change_mean": -0.23754562065005302, "reward_change_min": -0.49529997259378433, "reward_change_std": 0.19879611115902662, "reward_std": 0.6892171539366245, "rewards/cosine_scaled_reward": -0.14322593668475747, "rewards/format_reward": 0.43750000931322575, "step": 483 }, { "advantage_max": 1.5369227305054665, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.6903833150863647, "advantage_std": 0.825469933450222, "completion_length": 3202.3750610351562, "epoch": 0.20800429760945474, "grad_norm": 0.6212185621261597, "kl": 0.31463623046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0280443637773163e-07, "loss": 0.0358, "reward": 0.0815081661567092, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0815081661567092, "reward_after_std": 0.8254699409008026, "reward_before_mean": 0.3530906802043319, "reward_before_std": 0.7870620340108871, "reward_change_max": 0.0002982392907142639, "reward_change_mean": -0.2715825098566711, "reward_change_min": -0.5183478463441133, "reward_change_std": 0.2002053977921605, "reward_std": 0.8254699669778347, "rewards/cosine_scaled_reward": -0.06303800269961357, "rewards/format_reward": 0.4791666753590107, "step": 484 }, { "advantage_max": 1.441197782754898, "advantage_mean": -1.241763691872677e-09, "advantage_min": -0.7776016741991043, "advantage_std": 0.7889208234846592, "completion_length": 2897.6042289733887, "epoch": 0.20843405855492883, "grad_norm": 0.31614747643470764, "kl": 0.30377197265625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0246514708427701e-07, "loss": 0.0299, "reward": 0.340908071026206, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.340908071026206, "reward_after_std": 0.7889208234846592, "reward_before_mean": 0.7105672666803002, "reward_before_std": 0.7328723557293415, "reward_change_max": 0.0, "reward_change_mean": -0.36965921707451344, "reward_change_min": -0.6026812419295311, "reward_change_std": 0.2384345536120236, "reward_std": 0.7889208309352398, "rewards/cosine_scaled_reward": 0.08445029752328992, "rewards/format_reward": 0.5416666697710752, "step": 485 }, { "advantage_max": 1.2633581347763538, "advantage_mean": 9.313226301266297e-10, "advantage_min": -0.8129956796765327, "advantage_std": 0.7463502120226622, "completion_length": 2618.1250610351562, "epoch": 0.2088638195004029, "grad_norm": 0.4495880603790283, "kl": 0.2987060546875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0214767000817596e-07, "loss": 0.0459, "reward": 0.34600337594747543, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.34600337594747543, "reward_after_std": 0.7463502045720816, "reward_before_mean": 0.7305690310895443, "reward_before_std": 0.7359522432088852, "reward_change_max": 7.127225399017334e-05, "reward_change_mean": -0.38456564908847213, "reward_change_min": -0.6757028754800558, "reward_change_std": 0.2623256416991353, "reward_std": 0.7463502418249846, "rewards/cosine_scaled_reward": 0.1048678457736969, "rewards/format_reward": 0.5208333376795053, "step": 486 }, { "advantage_max": 1.023912027478218, "advantage_mean": -1.0554988993938252e-08, "advantage_min": -0.7477976828813553, "advantage_std": 0.6359601058065891, "completion_length": 3212.8959045410156, "epoch": 0.20929358044587698, "grad_norm": 0.4299567937850952, "kl": 0.3765869140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0185202062281336e-07, "loss": 0.0319, "reward": -0.03948194161057472, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03948194161057472, "reward_after_std": 0.6359601095318794, "reward_before_mean": 0.2244144342839718, "reward_before_std": 0.6748822554945946, "reward_change_max": 0.0007820650935173035, "reward_change_mean": -0.26389640010893345, "reward_change_min": -0.51700559258461, "reward_change_std": 0.21411369647830725, "reward_std": 0.6359601095318794, "rewards/cosine_scaled_reward": -0.06487612053751945, "rewards/format_reward": 0.3541666753590107, "step": 487 }, { "advantage_max": 1.2995302230119705, "advantage_mean": -2.0489097474207796e-08, "advantage_min": -0.8146852180361748, "advantage_std": 0.7694093398749828, "completion_length": 2746.416748046875, "epoch": 0.20972334139135107, "grad_norm": 0.7231144309043884, "kl": 0.26531982421875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0157821333772304e-07, "loss": 0.0286, "reward": 0.17192705534398556, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.17192705534398556, "reward_after_std": 0.7694093249738216, "reward_before_mean": 0.49201055988669395, "reward_before_std": 0.7808945998549461, "reward_change_max": 0.00043126195669174194, "reward_change_mean": -0.3200835301540792, "reward_change_min": -0.6217946484684944, "reward_change_std": 0.24068319704383612, "reward_std": 0.7694093659520149, "rewards/cosine_scaled_reward": -0.035244714468717575, "rewards/format_reward": 0.562500013038516, "step": 488 }, { "advantage_max": 0.9947935417294502, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.6799236573278904, "advantage_std": 0.6062453892081976, "completion_length": 3199.7084045410156, "epoch": 0.21015310233682513, "grad_norm": 0.30218178033828735, "kl": 0.37579345703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.013262614978859e-07, "loss": 0.0234, "reward": -0.1261199340224266, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1261199340224266, "reward_after_std": 0.6062454041093588, "reward_before_mean": 0.10902630572672933, "reward_before_std": 0.6334007764235139, "reward_change_max": 0.0009804964065551758, "reward_change_mean": -0.2351462438236922, "reward_change_min": -0.4782833680510521, "reward_change_std": 0.19698128616437316, "reward_std": 0.6062454059720039, "rewards/cosine_scaled_reward": -0.1017368477769196, "rewards/format_reward": 0.31250000931322575, "step": 489 }, { "advantage_max": 1.3380321152508259, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.6827748566865921, "advantage_std": 0.7887098044157028, "completion_length": 3090.250045776367, "epoch": 0.21058286328229922, "grad_norm": 0.6663458943367004, "kl": 0.33843994140625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0109617738307911e-07, "loss": 0.0301, "reward": -0.24926690943539143, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.24926690943539143, "reward_after_std": 0.7887098081409931, "reward_before_mean": -0.08283695578575134, "reward_before_std": 0.8454214483499527, "reward_change_max": 0.0017175599932670593, "reward_change_mean": -0.16642996110022068, "reward_change_min": -0.4667244888842106, "reward_change_std": 0.195183583535254, "reward_std": 0.7887098230421543, "rewards/cosine_scaled_reward": -0.13516847789287567, "rewards/format_reward": 0.1875000037252903, "step": 490 }, { "advantage_max": 1.3428443260490894, "advantage_mean": 1.30385160446167e-08, "advantage_min": -0.5237845256924629, "advantage_std": 0.6904821675270796, "completion_length": 3182.6458587646484, "epoch": 0.2110126242277733, "grad_norm": 0.4857102036476135, "kl": 0.405548095703125, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0088797220727779e-07, "loss": 0.0542, "reward": -0.2774577341042459, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2774577341042459, "reward_after_std": 0.6904821693897247, "reward_before_mean": -0.12141535896807909, "reward_before_std": 0.6529953014105558, "reward_change_max": 0.00031735002994537354, "reward_change_mean": -0.15604236256331205, "reward_change_min": -0.2952235322445631, "reward_change_std": 0.11132631497457623, "reward_std": 0.690482173115015, "rewards/cosine_scaled_reward": -0.1648743434343487, "rewards/format_reward": 0.2083333358168602, "step": 491 }, { "advantage_max": 1.123649675399065, "advantage_mean": 3.725290298461914e-09, "advantage_min": -0.8044852539896965, "advantage_std": 0.6812125369906425, "completion_length": 2851.791717529297, "epoch": 0.21144238517324737, "grad_norm": 0.3895203769207001, "kl": 0.34765625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0070165611810855e-07, "loss": 0.0343, "reward": -0.011792315170168877, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.011792315170168877, "reward_after_std": 0.6812125220894814, "reward_before_mean": 0.25286890123970807, "reward_before_std": 0.7108705416321754, "reward_change_max": 0.001825094223022461, "reward_change_mean": -0.2646612133830786, "reward_change_min": -0.504352979362011, "reward_change_std": 0.20900819636881351, "reward_std": 0.6812125258147717, "rewards/cosine_scaled_reward": -0.08189889788627625, "rewards/format_reward": 0.4166666716337204, "step": 492 }, { "advantage_max": 1.1224517971277237, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.6017221696674824, "advantage_std": 0.6208785828202963, "completion_length": 3209.2709350585938, "epoch": 0.21187214611872146, "grad_norm": 0.7126463055610657, "kl": 0.3756103515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.005372381963547e-07, "loss": 0.0463, "reward": -0.19977465644478798, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19977465644478798, "reward_after_std": 0.6208785865455866, "reward_before_mean": 4.852190613746643e-06, "reward_before_std": 0.6119182854890823, "reward_change_max": 0.0, "reward_change_mean": -0.19977950677275658, "reward_change_min": -0.3761254772543907, "reward_change_std": 0.1533208778128028, "reward_std": 0.6208786014467478, "rewards/cosine_scaled_reward": -0.15624758042395115, "rewards/format_reward": 0.31250000186264515, "step": 493 }, { "advantage_max": 1.5478151440620422, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.7828881144523621, "advantage_std": 0.8624896481633186, "completion_length": 3098.9375610351562, "epoch": 0.21230190706419555, "grad_norm": 0.8009844422340393, "kl": 0.34930419921875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0039472645551372e-07, "loss": 0.0336, "reward": 0.09293487668037415, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09293487668037415, "reward_after_std": 0.8624896630644798, "reward_before_mean": 0.3670063656754792, "reward_before_std": 0.86739182472229, "reward_change_max": 0.00038533657789230347, "reward_change_mean": -0.27407151972875, "reward_change_min": -0.589913472533226, "reward_change_std": 0.2239205539226532, "reward_std": 0.8624896816909313, "rewards/cosine_scaled_reward": -0.035246819257736206, "rewards/format_reward": 0.43750000931322575, "step": 494 }, { "advantage_max": 1.0109502747654915, "advantage_mean": 1.4280279792000528e-08, "advantage_min": -0.6496784165501595, "advantage_std": 0.6111225187778473, "completion_length": 3075.7500610351562, "epoch": 0.2127316680096696, "grad_norm": 0.6675384640693665, "kl": 0.3572998046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.002741278414069e-07, "loss": 0.0224, "reward": -0.18597674526972696, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18597674526972696, "reward_after_std": 0.6111225076019764, "reward_before_mean": 0.026072880253195763, "reward_before_std": 0.6398382596671581, "reward_change_max": 0.001205146312713623, "reward_change_mean": -0.21204961277544498, "reward_change_min": -0.46527033671736717, "reward_change_std": 0.1842224821448326, "reward_std": 0.6111225225031376, "rewards/cosine_scaled_reward": -0.19529690593481064, "rewards/format_reward": 0.4166666828095913, "step": 495 }, { "advantage_max": 1.0198055058717728, "advantage_mean": 1.6142924996742636e-08, "advantage_min": -0.5322484746575356, "advantage_std": 0.5591434575617313, "completion_length": 3194.0833587646484, "epoch": 0.2131614289551437, "grad_norm": 0.644253134727478, "kl": 0.42767333984375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0017544823184055e-07, "loss": 0.0245, "reward": -0.38865906931459904, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.38865906931459904, "reward_after_std": 0.5591434501111507, "reward_before_mean": -0.24959524790756404, "reward_before_std": 0.5603953637182713, "reward_change_max": 0.001073397696018219, "reward_change_mean": -0.13906382536515594, "reward_change_min": -0.30526651442050934, "reward_change_std": 0.12588651850819588, "reward_std": 0.559143453836441, "rewards/cosine_scaled_reward": -0.27063095942139626, "rewards/format_reward": 0.2916666753590107, "step": 496 }, { "advantage_max": 1.0906573310494423, "advantage_mean": -1.8626452602532595e-09, "advantage_min": -0.5231519304215908, "advantage_std": 0.5996862910687923, "completion_length": 3114.625030517578, "epoch": 0.2135911899006178, "grad_norm": 0.5536237359046936, "kl": 0.38885498046875, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0009869243631952e-07, "loss": 0.0285, "reward": -0.16377000976353884, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16377000976353884, "reward_after_std": 0.5996862985193729, "reward_before_mean": 0.05263152252882719, "reward_before_std": 0.5768584869801998, "reward_change_max": 0.0004113316535949707, "reward_change_mean": -0.21640154253691435, "reward_change_min": -0.42747030407190323, "reward_change_std": 0.15621319506317377, "reward_std": 0.5996863096952438, "rewards/cosine_scaled_reward": -0.1924342392012477, "rewards/format_reward": 0.43750000558793545, "step": 497 }, { "advantage_max": 1.258760441094637, "advantage_mean": 6.208815683805824e-10, "advantage_min": -0.8018830344080925, "advantage_std": 0.7378112562000751, "completion_length": 3329.666732788086, "epoch": 0.21402095084609185, "grad_norm": 0.6849310994148254, "kl": 0.40289306640625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.000438641958131e-07, "loss": 0.0124, "reward": -0.021134817972779274, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.021134817972779274, "reward_after_std": 0.7378112636506557, "reward_before_mean": 0.23240254074335098, "reward_before_std": 0.763609116896987, "reward_change_max": 0.0006149262189865112, "reward_change_mean": -0.2535373370628804, "reward_change_min": -0.48724211007356644, "reward_change_std": 0.2091447040438652, "reward_std": 0.7378112711012363, "rewards/cosine_scaled_reward": -0.08171541430056095, "rewards/format_reward": 0.39583334140479565, "step": 498 }, { "advantage_max": 1.3974319845438004, "advantage_mean": -3.104408619059029e-08, "advantage_min": -0.7871885746717453, "advantage_std": 0.7993848323822021, "completion_length": 2751.2291946411133, "epoch": 0.21445071179156594, "grad_norm": 0.5386936068534851, "kl": 0.31396484375, "lambda_div_used": 0.7000000000000001, "learning_rate": 1.0001096618257236e-07, "loss": 0.0179, "reward": 0.3062382088974118, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3062382088974118, "reward_after_std": 0.7993848472833633, "reward_before_mean": 0.6677423343062401, "reward_before_std": 0.7825437784194946, "reward_change_max": 0.00024021416902542114, "reward_change_mean": -0.36150417546741664, "reward_change_min": -0.6484993621706963, "reward_change_std": 0.2500389099586755, "reward_std": 0.799384880810976, "rewards/cosine_scaled_reward": 0.06303784251213074, "rewards/format_reward": 0.5416666772216558, "step": 499 }, { "advantage_max": 1.1045014262199402, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.5962215512990952, "advantage_std": 0.6318068094551563, "completion_length": 3178.041717529297, "epoch": 0.21488047273704003, "grad_norm": 0.5711367130279541, "kl": 0.40478515625, "lambda_div_used": 0.7000000000000001, "learning_rate": 1e-07, "loss": 0.011, "reward": -0.31559908762574196, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.31559908762574196, "reward_after_std": 0.6318068169057369, "reward_before_mean": -0.15613664221018553, "reward_before_std": 0.6527485027909279, "reward_change_max": 0.0011594295501708984, "reward_change_mean": -0.15946244448423386, "reward_change_min": -0.3858543932437897, "reward_change_std": 0.1566927768290043, "reward_std": 0.6318068318068981, "rewards/cosine_scaled_reward": -0.21348498645238578, "rewards/format_reward": 0.2708333395421505, "step": 500 }, { "epoch": 0.21488047273704003, "step": 500, "total_flos": 0.0, "train_loss": 0.030813425727123103, "train_runtime": 83450.6868, "train_samples_per_second": 0.288, "train_steps_per_second": 0.006 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }